From 8d91c1e411f55d7ea91b1183a2e9f8088fb4d5be Mon Sep 17 00:00:00 2001 From: AndrĂ© Fabian Silva Delgado Date: Tue, 15 Dec 2015 14:52:16 -0300 Subject: Linux-libre 4.3.2-gnu --- drivers/staging/rdma/Kconfig | 33 + drivers/staging/rdma/Makefile | 5 + drivers/staging/rdma/amso1100/Kbuild | 6 + drivers/staging/rdma/amso1100/Kconfig | 15 + drivers/staging/rdma/amso1100/TODO | 4 + drivers/staging/rdma/amso1100/c2.c | 1241 +++ drivers/staging/rdma/amso1100/c2.h | 547 ++ drivers/staging/rdma/amso1100/c2_ae.c | 327 + drivers/staging/rdma/amso1100/c2_ae.h | 108 + drivers/staging/rdma/amso1100/c2_alloc.c | 142 + drivers/staging/rdma/amso1100/c2_cm.c | 461 + drivers/staging/rdma/amso1100/c2_cq.c | 440 + drivers/staging/rdma/amso1100/c2_intr.c | 219 + drivers/staging/rdma/amso1100/c2_mm.c | 377 + drivers/staging/rdma/amso1100/c2_mq.c | 174 + drivers/staging/rdma/amso1100/c2_mq.h | 106 + drivers/staging/rdma/amso1100/c2_pd.c | 90 + drivers/staging/rdma/amso1100/c2_provider.c | 912 ++ drivers/staging/rdma/amso1100/c2_provider.h | 182 + drivers/staging/rdma/amso1100/c2_qp.c | 1024 ++ drivers/staging/rdma/amso1100/c2_rnic.c | 655 ++ drivers/staging/rdma/amso1100/c2_status.h | 158 + drivers/staging/rdma/amso1100/c2_user.h | 82 + drivers/staging/rdma/amso1100/c2_vq.c | 260 + drivers/staging/rdma/amso1100/c2_vq.h | 63 + drivers/staging/rdma/amso1100/c2_wr.h | 1520 +++ drivers/staging/rdma/ehca/Kconfig | 10 + drivers/staging/rdma/ehca/Makefile | 16 + drivers/staging/rdma/ehca/TODO | 4 + drivers/staging/rdma/ehca/ehca_av.c | 277 + drivers/staging/rdma/ehca/ehca_classes.h | 482 + drivers/staging/rdma/ehca/ehca_classes_pSeries.h | 208 + drivers/staging/rdma/ehca/ehca_cq.c | 397 + drivers/staging/rdma/ehca/ehca_eq.c | 189 + drivers/staging/rdma/ehca/ehca_hca.c | 414 + drivers/staging/rdma/ehca/ehca_irq.c | 870 ++ drivers/staging/rdma/ehca/ehca_irq.h | 77 + drivers/staging/rdma/ehca/ehca_iverbs.h | 218 + drivers/staging/rdma/ehca/ehca_main.c | 1123 +++ drivers/staging/rdma/ehca/ehca_mcast.c | 131 + drivers/staging/rdma/ehca/ehca_mrmw.c | 2593 +++++ drivers/staging/rdma/ehca/ehca_mrmw.h | 132 + drivers/staging/rdma/ehca/ehca_pd.c | 124 + drivers/staging/rdma/ehca/ehca_qes.h | 260 + drivers/staging/rdma/ehca/ehca_qp.c | 2257 +++++ drivers/staging/rdma/ehca/ehca_reqs.c | 953 ++ drivers/staging/rdma/ehca/ehca_sqp.c | 245 + drivers/staging/rdma/ehca/ehca_tools.h | 155 + drivers/staging/rdma/ehca/ehca_uverbs.c | 309 + drivers/staging/rdma/ehca/hcp_if.c | 949 ++ drivers/staging/rdma/ehca/hcp_if.h | 265 + drivers/staging/rdma/ehca/hcp_phyp.c | 82 + drivers/staging/rdma/ehca/hcp_phyp.h | 90 + drivers/staging/rdma/ehca/hipz_fns.h | 68 + drivers/staging/rdma/ehca/hipz_fns_core.h | 100 + drivers/staging/rdma/ehca/hipz_hw.h | 414 + drivers/staging/rdma/ehca/ipz_pt_fn.c | 289 + drivers/staging/rdma/ehca/ipz_pt_fn.h | 289 + drivers/staging/rdma/hfi1/Kconfig | 37 + drivers/staging/rdma/hfi1/Makefile | 19 + drivers/staging/rdma/hfi1/TODO | 6 + drivers/staging/rdma/hfi1/chip.c | 10798 +++++++++++++++++++++ drivers/staging/rdma/hfi1/chip.h | 1035 ++ drivers/staging/rdma/hfi1/chip_registers.h | 1292 +++ drivers/staging/rdma/hfi1/common.h | 415 + drivers/staging/rdma/hfi1/cq.c | 558 ++ drivers/staging/rdma/hfi1/debugfs.c | 899 ++ drivers/staging/rdma/hfi1/debugfs.h | 78 + drivers/staging/rdma/hfi1/device.c | 184 + drivers/staging/rdma/hfi1/device.h | 62 + drivers/staging/rdma/hfi1/diag.c | 1873 ++++ drivers/staging/rdma/hfi1/dma.c | 186 + drivers/staging/rdma/hfi1/driver.c | 1241 +++ drivers/staging/rdma/hfi1/eprom.c | 475 + drivers/staging/rdma/hfi1/eprom.h | 55 + drivers/staging/rdma/hfi1/file_ops.c | 2144 ++++ drivers/staging/rdma/hfi1/firmware.c | 1620 ++++ drivers/staging/rdma/hfi1/hfi.h | 1821 ++++ drivers/staging/rdma/hfi1/init.c | 1722 ++++ drivers/staging/rdma/hfi1/intr.c | 207 + drivers/staging/rdma/hfi1/iowait.h | 186 + drivers/staging/rdma/hfi1/keys.c | 411 + drivers/staging/rdma/hfi1/mad.c | 4257 ++++++++ drivers/staging/rdma/hfi1/mad.h | 325 + drivers/staging/rdma/hfi1/mmap.c | 192 + drivers/staging/rdma/hfi1/mr.c | 551 ++ drivers/staging/rdma/hfi1/opa_compat.h | 129 + drivers/staging/rdma/hfi1/pcie.c | 1253 +++ drivers/staging/rdma/hfi1/pio.c | 1771 ++++ drivers/staging/rdma/hfi1/pio.h | 224 + drivers/staging/rdma/hfi1/pio_copy.c | 858 ++ drivers/staging/rdma/hfi1/platform_config.h | 286 + drivers/staging/rdma/hfi1/qp.c | 1687 ++++ drivers/staging/rdma/hfi1/qp.h | 235 + drivers/staging/rdma/hfi1/qsfp.c | 546 ++ drivers/staging/rdma/hfi1/qsfp.h | 222 + drivers/staging/rdma/hfi1/rc.c | 2426 +++++ drivers/staging/rdma/hfi1/ruc.c | 948 ++ drivers/staging/rdma/hfi1/sdma.c | 2962 ++++++ drivers/staging/rdma/hfi1/sdma.h | 1123 +++ drivers/staging/rdma/hfi1/srq.c | 397 + drivers/staging/rdma/hfi1/sysfs.c | 739 ++ drivers/staging/rdma/hfi1/trace.c | 221 + drivers/staging/rdma/hfi1/trace.h | 1409 +++ drivers/staging/rdma/hfi1/twsi.c | 518 + drivers/staging/rdma/hfi1/twsi.h | 68 + drivers/staging/rdma/hfi1/uc.c | 585 ++ drivers/staging/rdma/hfi1/ud.c | 885 ++ drivers/staging/rdma/hfi1/user_pages.c | 156 + drivers/staging/rdma/hfi1/user_sdma.c | 1444 +++ drivers/staging/rdma/hfi1/user_sdma.h | 89 + drivers/staging/rdma/hfi1/verbs.c | 2145 ++++ drivers/staging/rdma/hfi1/verbs.h | 1151 +++ drivers/staging/rdma/hfi1/verbs_mcast.c | 385 + drivers/staging/rdma/ipath/Kconfig | 16 + drivers/staging/rdma/ipath/Makefile | 37 + drivers/staging/rdma/ipath/TODO | 5 + drivers/staging/rdma/ipath/ipath_common.h | 851 ++ drivers/staging/rdma/ipath/ipath_cq.c | 483 + drivers/staging/rdma/ipath/ipath_debug.h | 99 + drivers/staging/rdma/ipath/ipath_diag.c | 551 ++ drivers/staging/rdma/ipath/ipath_dma.c | 179 + drivers/staging/rdma/ipath/ipath_driver.c | 2789 ++++++ drivers/staging/rdma/ipath/ipath_eeprom.c | 1183 +++ drivers/staging/rdma/ipath/ipath_file_ops.c | 2620 +++++ drivers/staging/rdma/ipath/ipath_fs.c | 422 + drivers/staging/rdma/ipath/ipath_iba6110.c | 1940 ++++ drivers/staging/rdma/ipath/ipath_init_chip.c | 1066 ++ drivers/staging/rdma/ipath/ipath_intr.c | 1273 +++ drivers/staging/rdma/ipath/ipath_kernel.h | 1373 +++ drivers/staging/rdma/ipath/ipath_keys.c | 270 + drivers/staging/rdma/ipath/ipath_mad.c | 1521 +++ drivers/staging/rdma/ipath/ipath_mmap.c | 174 + drivers/staging/rdma/ipath/ipath_mr.c | 425 + drivers/staging/rdma/ipath/ipath_qp.c | 1080 +++ drivers/staging/rdma/ipath/ipath_rc.c | 1969 ++++ drivers/staging/rdma/ipath/ipath_registers.h | 512 + drivers/staging/rdma/ipath/ipath_ruc.c | 734 ++ drivers/staging/rdma/ipath/ipath_sdma.c | 818 ++ drivers/staging/rdma/ipath/ipath_srq.c | 380 + drivers/staging/rdma/ipath/ipath_stats.c | 347 + drivers/staging/rdma/ipath/ipath_sysfs.c | 1238 +++ drivers/staging/rdma/ipath/ipath_uc.c | 547 ++ drivers/staging/rdma/ipath/ipath_ud.c | 580 ++ drivers/staging/rdma/ipath/ipath_user_pages.c | 229 + drivers/staging/rdma/ipath/ipath_user_sdma.c | 875 ++ drivers/staging/rdma/ipath/ipath_user_sdma.h | 52 + drivers/staging/rdma/ipath/ipath_verbs.c | 2365 +++++ drivers/staging/rdma/ipath/ipath_verbs.h | 939 ++ drivers/staging/rdma/ipath/ipath_verbs_mcast.c | 364 + drivers/staging/rdma/ipath/ipath_wc_ppc64.c | 49 + drivers/staging/rdma/ipath/ipath_wc_x86_64.c | 144 + 152 files changed, 111151 insertions(+) create mode 100644 drivers/staging/rdma/Kconfig create mode 100644 drivers/staging/rdma/Makefile create mode 100644 drivers/staging/rdma/amso1100/Kbuild create mode 100644 drivers/staging/rdma/amso1100/Kconfig create mode 100644 drivers/staging/rdma/amso1100/TODO create mode 100644 drivers/staging/rdma/amso1100/c2.c create mode 100644 drivers/staging/rdma/amso1100/c2.h create mode 100644 drivers/staging/rdma/amso1100/c2_ae.c create mode 100644 drivers/staging/rdma/amso1100/c2_ae.h create mode 100644 drivers/staging/rdma/amso1100/c2_alloc.c create mode 100644 drivers/staging/rdma/amso1100/c2_cm.c create mode 100644 drivers/staging/rdma/amso1100/c2_cq.c create mode 100644 drivers/staging/rdma/amso1100/c2_intr.c create mode 100644 drivers/staging/rdma/amso1100/c2_mm.c create mode 100644 drivers/staging/rdma/amso1100/c2_mq.c create mode 100644 drivers/staging/rdma/amso1100/c2_mq.h create mode 100644 drivers/staging/rdma/amso1100/c2_pd.c create mode 100644 drivers/staging/rdma/amso1100/c2_provider.c create mode 100644 drivers/staging/rdma/amso1100/c2_provider.h create mode 100644 drivers/staging/rdma/amso1100/c2_qp.c create mode 100644 drivers/staging/rdma/amso1100/c2_rnic.c create mode 100644 drivers/staging/rdma/amso1100/c2_status.h create mode 100644 drivers/staging/rdma/amso1100/c2_user.h create mode 100644 drivers/staging/rdma/amso1100/c2_vq.c create mode 100644 drivers/staging/rdma/amso1100/c2_vq.h create mode 100644 drivers/staging/rdma/amso1100/c2_wr.h create mode 100644 drivers/staging/rdma/ehca/Kconfig create mode 100644 drivers/staging/rdma/ehca/Makefile create mode 100644 drivers/staging/rdma/ehca/TODO create mode 100644 drivers/staging/rdma/ehca/ehca_av.c create mode 100644 drivers/staging/rdma/ehca/ehca_classes.h create mode 100644 drivers/staging/rdma/ehca/ehca_classes_pSeries.h create mode 100644 drivers/staging/rdma/ehca/ehca_cq.c create mode 100644 drivers/staging/rdma/ehca/ehca_eq.c create mode 100644 drivers/staging/rdma/ehca/ehca_hca.c create mode 100644 drivers/staging/rdma/ehca/ehca_irq.c create mode 100644 drivers/staging/rdma/ehca/ehca_irq.h create mode 100644 drivers/staging/rdma/ehca/ehca_iverbs.h create mode 100644 drivers/staging/rdma/ehca/ehca_main.c create mode 100644 drivers/staging/rdma/ehca/ehca_mcast.c create mode 100644 drivers/staging/rdma/ehca/ehca_mrmw.c create mode 100644 drivers/staging/rdma/ehca/ehca_mrmw.h create mode 100644 drivers/staging/rdma/ehca/ehca_pd.c create mode 100644 drivers/staging/rdma/ehca/ehca_qes.h create mode 100644 drivers/staging/rdma/ehca/ehca_qp.c create mode 100644 drivers/staging/rdma/ehca/ehca_reqs.c create mode 100644 drivers/staging/rdma/ehca/ehca_sqp.c create mode 100644 drivers/staging/rdma/ehca/ehca_tools.h create mode 100644 drivers/staging/rdma/ehca/ehca_uverbs.c create mode 100644 drivers/staging/rdma/ehca/hcp_if.c create mode 100644 drivers/staging/rdma/ehca/hcp_if.h create mode 100644 drivers/staging/rdma/ehca/hcp_phyp.c create mode 100644 drivers/staging/rdma/ehca/hcp_phyp.h create mode 100644 drivers/staging/rdma/ehca/hipz_fns.h create mode 100644 drivers/staging/rdma/ehca/hipz_fns_core.h create mode 100644 drivers/staging/rdma/ehca/hipz_hw.h create mode 100644 drivers/staging/rdma/ehca/ipz_pt_fn.c create mode 100644 drivers/staging/rdma/ehca/ipz_pt_fn.h create mode 100644 drivers/staging/rdma/hfi1/Kconfig create mode 100644 drivers/staging/rdma/hfi1/Makefile create mode 100644 drivers/staging/rdma/hfi1/TODO create mode 100644 drivers/staging/rdma/hfi1/chip.c create mode 100644 drivers/staging/rdma/hfi1/chip.h create mode 100644 drivers/staging/rdma/hfi1/chip_registers.h create mode 100644 drivers/staging/rdma/hfi1/common.h create mode 100644 drivers/staging/rdma/hfi1/cq.c create mode 100644 drivers/staging/rdma/hfi1/debugfs.c create mode 100644 drivers/staging/rdma/hfi1/debugfs.h create mode 100644 drivers/staging/rdma/hfi1/device.c create mode 100644 drivers/staging/rdma/hfi1/device.h create mode 100644 drivers/staging/rdma/hfi1/diag.c create mode 100644 drivers/staging/rdma/hfi1/dma.c create mode 100644 drivers/staging/rdma/hfi1/driver.c create mode 100644 drivers/staging/rdma/hfi1/eprom.c create mode 100644 drivers/staging/rdma/hfi1/eprom.h create mode 100644 drivers/staging/rdma/hfi1/file_ops.c create mode 100644 drivers/staging/rdma/hfi1/firmware.c create mode 100644 drivers/staging/rdma/hfi1/hfi.h create mode 100644 drivers/staging/rdma/hfi1/init.c create mode 100644 drivers/staging/rdma/hfi1/intr.c create mode 100644 drivers/staging/rdma/hfi1/iowait.h create mode 100644 drivers/staging/rdma/hfi1/keys.c create mode 100644 drivers/staging/rdma/hfi1/mad.c create mode 100644 drivers/staging/rdma/hfi1/mad.h create mode 100644 drivers/staging/rdma/hfi1/mmap.c create mode 100644 drivers/staging/rdma/hfi1/mr.c create mode 100644 drivers/staging/rdma/hfi1/opa_compat.h create mode 100644 drivers/staging/rdma/hfi1/pcie.c create mode 100644 drivers/staging/rdma/hfi1/pio.c create mode 100644 drivers/staging/rdma/hfi1/pio.h create mode 100644 drivers/staging/rdma/hfi1/pio_copy.c create mode 100644 drivers/staging/rdma/hfi1/platform_config.h create mode 100644 drivers/staging/rdma/hfi1/qp.c create mode 100644 drivers/staging/rdma/hfi1/qp.h create mode 100644 drivers/staging/rdma/hfi1/qsfp.c create mode 100644 drivers/staging/rdma/hfi1/qsfp.h create mode 100644 drivers/staging/rdma/hfi1/rc.c create mode 100644 drivers/staging/rdma/hfi1/ruc.c create mode 100644 drivers/staging/rdma/hfi1/sdma.c create mode 100644 drivers/staging/rdma/hfi1/sdma.h create mode 100644 drivers/staging/rdma/hfi1/srq.c create mode 100644 drivers/staging/rdma/hfi1/sysfs.c create mode 100644 drivers/staging/rdma/hfi1/trace.c create mode 100644 drivers/staging/rdma/hfi1/trace.h create mode 100644 drivers/staging/rdma/hfi1/twsi.c create mode 100644 drivers/staging/rdma/hfi1/twsi.h create mode 100644 drivers/staging/rdma/hfi1/uc.c create mode 100644 drivers/staging/rdma/hfi1/ud.c create mode 100644 drivers/staging/rdma/hfi1/user_pages.c create mode 100644 drivers/staging/rdma/hfi1/user_sdma.c create mode 100644 drivers/staging/rdma/hfi1/user_sdma.h create mode 100644 drivers/staging/rdma/hfi1/verbs.c create mode 100644 drivers/staging/rdma/hfi1/verbs.h create mode 100644 drivers/staging/rdma/hfi1/verbs_mcast.c create mode 100644 drivers/staging/rdma/ipath/Kconfig create mode 100644 drivers/staging/rdma/ipath/Makefile create mode 100644 drivers/staging/rdma/ipath/TODO create mode 100644 drivers/staging/rdma/ipath/ipath_common.h create mode 100644 drivers/staging/rdma/ipath/ipath_cq.c create mode 100644 drivers/staging/rdma/ipath/ipath_debug.h create mode 100644 drivers/staging/rdma/ipath/ipath_diag.c create mode 100644 drivers/staging/rdma/ipath/ipath_dma.c create mode 100644 drivers/staging/rdma/ipath/ipath_driver.c create mode 100644 drivers/staging/rdma/ipath/ipath_eeprom.c create mode 100644 drivers/staging/rdma/ipath/ipath_file_ops.c create mode 100644 drivers/staging/rdma/ipath/ipath_fs.c create mode 100644 drivers/staging/rdma/ipath/ipath_iba6110.c create mode 100644 drivers/staging/rdma/ipath/ipath_init_chip.c create mode 100644 drivers/staging/rdma/ipath/ipath_intr.c create mode 100644 drivers/staging/rdma/ipath/ipath_kernel.h create mode 100644 drivers/staging/rdma/ipath/ipath_keys.c create mode 100644 drivers/staging/rdma/ipath/ipath_mad.c create mode 100644 drivers/staging/rdma/ipath/ipath_mmap.c create mode 100644 drivers/staging/rdma/ipath/ipath_mr.c create mode 100644 drivers/staging/rdma/ipath/ipath_qp.c create mode 100644 drivers/staging/rdma/ipath/ipath_rc.c create mode 100644 drivers/staging/rdma/ipath/ipath_registers.h create mode 100644 drivers/staging/rdma/ipath/ipath_ruc.c create mode 100644 drivers/staging/rdma/ipath/ipath_sdma.c create mode 100644 drivers/staging/rdma/ipath/ipath_srq.c create mode 100644 drivers/staging/rdma/ipath/ipath_stats.c create mode 100644 drivers/staging/rdma/ipath/ipath_sysfs.c create mode 100644 drivers/staging/rdma/ipath/ipath_uc.c create mode 100644 drivers/staging/rdma/ipath/ipath_ud.c create mode 100644 drivers/staging/rdma/ipath/ipath_user_pages.c create mode 100644 drivers/staging/rdma/ipath/ipath_user_sdma.c create mode 100644 drivers/staging/rdma/ipath/ipath_user_sdma.h create mode 100644 drivers/staging/rdma/ipath/ipath_verbs.c create mode 100644 drivers/staging/rdma/ipath/ipath_verbs.h create mode 100644 drivers/staging/rdma/ipath/ipath_verbs_mcast.c create mode 100644 drivers/staging/rdma/ipath/ipath_wc_ppc64.c create mode 100644 drivers/staging/rdma/ipath/ipath_wc_x86_64.c (limited to 'drivers/staging/rdma') diff --git a/drivers/staging/rdma/Kconfig b/drivers/staging/rdma/Kconfig new file mode 100644 index 000000000..d7f62359d --- /dev/null +++ b/drivers/staging/rdma/Kconfig @@ -0,0 +1,33 @@ +menuconfig STAGING_RDMA + bool "RDMA staging drivers" + depends on INFINIBAND + depends on PCI || BROKEN + depends on HAS_IOMEM + depends on NET + depends on INET + default n + ---help--- + This option allows you to select a number of RDMA drivers that + fall into one of two categories: deprecated drivers being held + here before finally being removed or new drivers that still need + some work before being moved to the normal RDMA driver area. + + If you wish to work on these drivers, to help improve them, or + to report problems you have with them, please use the + linux-rdma@vger.kernel.org mailing list. + + If in doubt, say N here. + + +# Please keep entries in alphabetic order +if STAGING_RDMA + +source "drivers/staging/rdma/amso1100/Kconfig" + +source "drivers/staging/rdma/ehca/Kconfig" + +source "drivers/staging/rdma/hfi1/Kconfig" + +source "drivers/staging/rdma/ipath/Kconfig" + +endif diff --git a/drivers/staging/rdma/Makefile b/drivers/staging/rdma/Makefile new file mode 100644 index 000000000..139d78ef2 --- /dev/null +++ b/drivers/staging/rdma/Makefile @@ -0,0 +1,5 @@ +# Entries for RDMA_STAGING tree +obj-$(CONFIG_INFINIBAND_AMSO1100) += amso1100/ +obj-$(CONFIG_INFINIBAND_EHCA) += ehca/ +obj-$(CONFIG_INFINIBAND_HFI1) += hfi1/ +obj-$(CONFIG_INFINIBAND_IPATH) += ipath/ diff --git a/drivers/staging/rdma/amso1100/Kbuild b/drivers/staging/rdma/amso1100/Kbuild new file mode 100644 index 000000000..950dfabcd --- /dev/null +++ b/drivers/staging/rdma/amso1100/Kbuild @@ -0,0 +1,6 @@ +ccflags-$(CONFIG_INFINIBAND_AMSO1100_DEBUG) := -DDEBUG + +obj-$(CONFIG_INFINIBAND_AMSO1100) += iw_c2.o + +iw_c2-y := c2.o c2_provider.o c2_rnic.o c2_alloc.o c2_mq.o c2_ae.o c2_vq.o \ + c2_intr.o c2_cq.o c2_qp.o c2_cm.o c2_mm.o c2_pd.o diff --git a/drivers/staging/rdma/amso1100/Kconfig b/drivers/staging/rdma/amso1100/Kconfig new file mode 100644 index 000000000..e6ce5f209 --- /dev/null +++ b/drivers/staging/rdma/amso1100/Kconfig @@ -0,0 +1,15 @@ +config INFINIBAND_AMSO1100 + tristate "Ammasso 1100 HCA support" + depends on PCI && INET + ---help--- + This is a low-level driver for the Ammasso 1100 host + channel adapter (HCA). + +config INFINIBAND_AMSO1100_DEBUG + bool "Verbose debugging output" + depends on INFINIBAND_AMSO1100 + default n + ---help--- + This option causes the amso1100 driver to produce a bunch of + debug messages. Select this if you are developing the driver + or trying to diagnose a problem. diff --git a/drivers/staging/rdma/amso1100/TODO b/drivers/staging/rdma/amso1100/TODO new file mode 100644 index 000000000..18b00a5cb --- /dev/null +++ b/drivers/staging/rdma/amso1100/TODO @@ -0,0 +1,4 @@ +7/2015 + +The amso1100 driver has been deprecated and moved to drivers/staging. +It will be removed in the 4.6 merge window. diff --git a/drivers/staging/rdma/amso1100/c2.c b/drivers/staging/rdma/amso1100/c2.c new file mode 100644 index 000000000..766a71cce --- /dev/null +++ b/drivers/staging/rdma/amso1100/c2.c @@ -0,0 +1,1241 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include "c2.h" +#include "c2_provider.h" + +MODULE_AUTHOR("Tom Tucker "); +MODULE_DESCRIPTION("Ammasso AMSO1100 Low-level iWARP Driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK + | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN; + +static int debug = -1; /* defaults above */ +module_param(debug, int, 0); +MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)"); + +static int c2_up(struct net_device *netdev); +static int c2_down(struct net_device *netdev); +static int c2_xmit_frame(struct sk_buff *skb, struct net_device *netdev); +static void c2_tx_interrupt(struct net_device *netdev); +static void c2_rx_interrupt(struct net_device *netdev); +static irqreturn_t c2_interrupt(int irq, void *dev_id); +static void c2_tx_timeout(struct net_device *netdev); +static int c2_change_mtu(struct net_device *netdev, int new_mtu); +static void c2_reset(struct c2_port *c2_port); + +static struct pci_device_id c2_pci_table[] = { + { PCI_DEVICE(0x18b8, 0xb001) }, + { 0 } +}; + +MODULE_DEVICE_TABLE(pci, c2_pci_table); + +static void c2_print_macaddr(struct net_device *netdev) +{ + pr_debug("%s: MAC %pM, IRQ %u\n", netdev->name, netdev->dev_addr, netdev->irq); +} + +static void c2_set_rxbufsize(struct c2_port *c2_port) +{ + struct net_device *netdev = c2_port->netdev; + + if (netdev->mtu > RX_BUF_SIZE) + c2_port->rx_buf_size = + netdev->mtu + ETH_HLEN + sizeof(struct c2_rxp_hdr) + + NET_IP_ALIGN; + else + c2_port->rx_buf_size = sizeof(struct c2_rxp_hdr) + RX_BUF_SIZE; +} + +/* + * Allocate TX ring elements and chain them together. + * One-to-one association of adapter descriptors with ring elements. + */ +static int c2_tx_ring_alloc(struct c2_ring *tx_ring, void *vaddr, + dma_addr_t base, void __iomem * mmio_txp_ring) +{ + struct c2_tx_desc *tx_desc; + struct c2_txp_desc __iomem *txp_desc; + struct c2_element *elem; + int i; + + tx_ring->start = kmalloc(sizeof(*elem) * tx_ring->count, GFP_KERNEL); + if (!tx_ring->start) + return -ENOMEM; + + elem = tx_ring->start; + tx_desc = vaddr; + txp_desc = mmio_txp_ring; + for (i = 0; i < tx_ring->count; i++, elem++, tx_desc++, txp_desc++) { + tx_desc->len = 0; + tx_desc->status = 0; + + /* Set TXP_HTXD_UNINIT */ + __raw_writeq((__force u64) cpu_to_be64(0x1122334455667788ULL), + (void __iomem *) txp_desc + C2_TXP_ADDR); + __raw_writew(0, (void __iomem *) txp_desc + C2_TXP_LEN); + __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_UNINIT), + (void __iomem *) txp_desc + C2_TXP_FLAGS); + + elem->skb = NULL; + elem->ht_desc = tx_desc; + elem->hw_desc = txp_desc; + + if (i == tx_ring->count - 1) { + elem->next = tx_ring->start; + tx_desc->next_offset = base; + } else { + elem->next = elem + 1; + tx_desc->next_offset = + base + (i + 1) * sizeof(*tx_desc); + } + } + + tx_ring->to_use = tx_ring->to_clean = tx_ring->start; + + return 0; +} + +/* + * Allocate RX ring elements and chain them together. + * One-to-one association of adapter descriptors with ring elements. + */ +static int c2_rx_ring_alloc(struct c2_ring *rx_ring, void *vaddr, + dma_addr_t base, void __iomem * mmio_rxp_ring) +{ + struct c2_rx_desc *rx_desc; + struct c2_rxp_desc __iomem *rxp_desc; + struct c2_element *elem; + int i; + + rx_ring->start = kmalloc(sizeof(*elem) * rx_ring->count, GFP_KERNEL); + if (!rx_ring->start) + return -ENOMEM; + + elem = rx_ring->start; + rx_desc = vaddr; + rxp_desc = mmio_rxp_ring; + for (i = 0; i < rx_ring->count; i++, elem++, rx_desc++, rxp_desc++) { + rx_desc->len = 0; + rx_desc->status = 0; + + /* Set RXP_HRXD_UNINIT */ + __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_OK), + (void __iomem *) rxp_desc + C2_RXP_STATUS); + __raw_writew(0, (void __iomem *) rxp_desc + C2_RXP_COUNT); + __raw_writew(0, (void __iomem *) rxp_desc + C2_RXP_LEN); + __raw_writeq((__force u64) cpu_to_be64(0x99aabbccddeeffULL), + (void __iomem *) rxp_desc + C2_RXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_UNINIT), + (void __iomem *) rxp_desc + C2_RXP_FLAGS); + + elem->skb = NULL; + elem->ht_desc = rx_desc; + elem->hw_desc = rxp_desc; + + if (i == rx_ring->count - 1) { + elem->next = rx_ring->start; + rx_desc->next_offset = base; + } else { + elem->next = elem + 1; + rx_desc->next_offset = + base + (i + 1) * sizeof(*rx_desc); + } + } + + rx_ring->to_use = rx_ring->to_clean = rx_ring->start; + + return 0; +} + +/* Setup buffer for receiving */ +static inline int c2_rx_alloc(struct c2_port *c2_port, struct c2_element *elem) +{ + struct c2_dev *c2dev = c2_port->c2dev; + struct c2_rx_desc *rx_desc = elem->ht_desc; + struct sk_buff *skb; + dma_addr_t mapaddr; + u32 maplen; + struct c2_rxp_hdr *rxp_hdr; + + skb = dev_alloc_skb(c2_port->rx_buf_size); + if (unlikely(!skb)) { + pr_debug("%s: out of memory for receive\n", + c2_port->netdev->name); + return -ENOMEM; + } + + /* Zero out the rxp hdr in the sk_buff */ + memset(skb->data, 0, sizeof(*rxp_hdr)); + + skb->dev = c2_port->netdev; + + maplen = c2_port->rx_buf_size; + mapaddr = + pci_map_single(c2dev->pcidev, skb->data, maplen, + PCI_DMA_FROMDEVICE); + + /* Set the sk_buff RXP_header to RXP_HRXD_READY */ + rxp_hdr = (struct c2_rxp_hdr *) skb->data; + rxp_hdr->flags = RXP_HRXD_READY; + + __raw_writew(0, elem->hw_desc + C2_RXP_STATUS); + __raw_writew((__force u16) cpu_to_be16((u16) maplen - sizeof(*rxp_hdr)), + elem->hw_desc + C2_RXP_LEN); + __raw_writeq((__force u64) cpu_to_be64(mapaddr), elem->hw_desc + C2_RXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY), + elem->hw_desc + C2_RXP_FLAGS); + + elem->skb = skb; + elem->mapaddr = mapaddr; + elem->maplen = maplen; + rx_desc->len = maplen; + + return 0; +} + +/* + * Allocate buffers for the Rx ring + * For receive: rx_ring.to_clean is next received frame + */ +static int c2_rx_fill(struct c2_port *c2_port) +{ + struct c2_ring *rx_ring = &c2_port->rx_ring; + struct c2_element *elem; + int ret = 0; + + elem = rx_ring->start; + do { + if (c2_rx_alloc(c2_port, elem)) { + ret = 1; + break; + } + } while ((elem = elem->next) != rx_ring->start); + + rx_ring->to_clean = rx_ring->start; + return ret; +} + +/* Free all buffers in RX ring, assumes receiver stopped */ +static void c2_rx_clean(struct c2_port *c2_port) +{ + struct c2_dev *c2dev = c2_port->c2dev; + struct c2_ring *rx_ring = &c2_port->rx_ring; + struct c2_element *elem; + struct c2_rx_desc *rx_desc; + + elem = rx_ring->start; + do { + rx_desc = elem->ht_desc; + rx_desc->len = 0; + + __raw_writew(0, elem->hw_desc + C2_RXP_STATUS); + __raw_writew(0, elem->hw_desc + C2_RXP_COUNT); + __raw_writew(0, elem->hw_desc + C2_RXP_LEN); + __raw_writeq((__force u64) cpu_to_be64(0x99aabbccddeeffULL), + elem->hw_desc + C2_RXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_UNINIT), + elem->hw_desc + C2_RXP_FLAGS); + + if (elem->skb) { + pci_unmap_single(c2dev->pcidev, elem->mapaddr, + elem->maplen, PCI_DMA_FROMDEVICE); + dev_kfree_skb(elem->skb); + elem->skb = NULL; + } + } while ((elem = elem->next) != rx_ring->start); +} + +static inline int c2_tx_free(struct c2_dev *c2dev, struct c2_element *elem) +{ + struct c2_tx_desc *tx_desc = elem->ht_desc; + + tx_desc->len = 0; + + pci_unmap_single(c2dev->pcidev, elem->mapaddr, elem->maplen, + PCI_DMA_TODEVICE); + + if (elem->skb) { + dev_kfree_skb_any(elem->skb); + elem->skb = NULL; + } + + return 0; +} + +/* Free all buffers in TX ring, assumes transmitter stopped */ +static void c2_tx_clean(struct c2_port *c2_port) +{ + struct c2_ring *tx_ring = &c2_port->tx_ring; + struct c2_element *elem; + struct c2_txp_desc txp_htxd; + int retry; + unsigned long flags; + + spin_lock_irqsave(&c2_port->tx_lock, flags); + + elem = tx_ring->start; + + do { + retry = 0; + do { + txp_htxd.flags = + readw(elem->hw_desc + C2_TXP_FLAGS); + + if (txp_htxd.flags == TXP_HTXD_READY) { + retry = 1; + __raw_writew(0, + elem->hw_desc + C2_TXP_LEN); + __raw_writeq(0, + elem->hw_desc + C2_TXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_DONE), + elem->hw_desc + C2_TXP_FLAGS); + c2_port->netdev->stats.tx_dropped++; + break; + } else { + __raw_writew(0, + elem->hw_desc + C2_TXP_LEN); + __raw_writeq((__force u64) cpu_to_be64(0x1122334455667788ULL), + elem->hw_desc + C2_TXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_UNINIT), + elem->hw_desc + C2_TXP_FLAGS); + } + + c2_tx_free(c2_port->c2dev, elem); + + } while ((elem = elem->next) != tx_ring->start); + } while (retry); + + c2_port->tx_avail = c2_port->tx_ring.count - 1; + c2_port->c2dev->cur_tx = tx_ring->to_use - tx_ring->start; + + if (c2_port->tx_avail > MAX_SKB_FRAGS + 1) + netif_wake_queue(c2_port->netdev); + + spin_unlock_irqrestore(&c2_port->tx_lock, flags); +} + +/* + * Process transmit descriptors marked 'DONE' by the firmware, + * freeing up their unneeded sk_buffs. + */ +static void c2_tx_interrupt(struct net_device *netdev) +{ + struct c2_port *c2_port = netdev_priv(netdev); + struct c2_dev *c2dev = c2_port->c2dev; + struct c2_ring *tx_ring = &c2_port->tx_ring; + struct c2_element *elem; + struct c2_txp_desc txp_htxd; + + spin_lock(&c2_port->tx_lock); + + for (elem = tx_ring->to_clean; elem != tx_ring->to_use; + elem = elem->next) { + txp_htxd.flags = + be16_to_cpu((__force __be16) readw(elem->hw_desc + C2_TXP_FLAGS)); + + if (txp_htxd.flags != TXP_HTXD_DONE) + break; + + if (netif_msg_tx_done(c2_port)) { + /* PCI reads are expensive in fast path */ + txp_htxd.len = + be16_to_cpu((__force __be16) readw(elem->hw_desc + C2_TXP_LEN)); + pr_debug("%s: tx done slot %3Zu status 0x%x len " + "%5u bytes\n", + netdev->name, elem - tx_ring->start, + txp_htxd.flags, txp_htxd.len); + } + + c2_tx_free(c2dev, elem); + ++(c2_port->tx_avail); + } + + tx_ring->to_clean = elem; + + if (netif_queue_stopped(netdev) + && c2_port->tx_avail > MAX_SKB_FRAGS + 1) + netif_wake_queue(netdev); + + spin_unlock(&c2_port->tx_lock); +} + +static void c2_rx_error(struct c2_port *c2_port, struct c2_element *elem) +{ + struct c2_rx_desc *rx_desc = elem->ht_desc; + struct c2_rxp_hdr *rxp_hdr = (struct c2_rxp_hdr *) elem->skb->data; + + if (rxp_hdr->status != RXP_HRXD_OK || + rxp_hdr->len > (rx_desc->len - sizeof(*rxp_hdr))) { + pr_debug("BAD RXP_HRXD\n"); + pr_debug(" rx_desc : %p\n", rx_desc); + pr_debug(" index : %Zu\n", + elem - c2_port->rx_ring.start); + pr_debug(" len : %u\n", rx_desc->len); + pr_debug(" rxp_hdr : %p [PA %p]\n", rxp_hdr, + (void *) __pa((unsigned long) rxp_hdr)); + pr_debug(" flags : 0x%x\n", rxp_hdr->flags); + pr_debug(" status: 0x%x\n", rxp_hdr->status); + pr_debug(" len : %u\n", rxp_hdr->len); + pr_debug(" rsvd : 0x%x\n", rxp_hdr->rsvd); + } + + /* Setup the skb for reuse since we're dropping this pkt */ + elem->skb->data = elem->skb->head; + skb_reset_tail_pointer(elem->skb); + + /* Zero out the rxp hdr in the sk_buff */ + memset(elem->skb->data, 0, sizeof(*rxp_hdr)); + + /* Write the descriptor to the adapter's rx ring */ + __raw_writew(0, elem->hw_desc + C2_RXP_STATUS); + __raw_writew(0, elem->hw_desc + C2_RXP_COUNT); + __raw_writew((__force u16) cpu_to_be16((u16) elem->maplen - sizeof(*rxp_hdr)), + elem->hw_desc + C2_RXP_LEN); + __raw_writeq((__force u64) cpu_to_be64(elem->mapaddr), + elem->hw_desc + C2_RXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY), + elem->hw_desc + C2_RXP_FLAGS); + + pr_debug("packet dropped\n"); + c2_port->netdev->stats.rx_dropped++; +} + +static void c2_rx_interrupt(struct net_device *netdev) +{ + struct c2_port *c2_port = netdev_priv(netdev); + struct c2_dev *c2dev = c2_port->c2dev; + struct c2_ring *rx_ring = &c2_port->rx_ring; + struct c2_element *elem; + struct c2_rx_desc *rx_desc; + struct c2_rxp_hdr *rxp_hdr; + struct sk_buff *skb; + dma_addr_t mapaddr; + u32 maplen, buflen; + unsigned long flags; + + spin_lock_irqsave(&c2dev->lock, flags); + + /* Begin where we left off */ + rx_ring->to_clean = rx_ring->start + c2dev->cur_rx; + + for (elem = rx_ring->to_clean; elem->next != rx_ring->to_clean; + elem = elem->next) { + rx_desc = elem->ht_desc; + mapaddr = elem->mapaddr; + maplen = elem->maplen; + skb = elem->skb; + rxp_hdr = (struct c2_rxp_hdr *) skb->data; + + if (rxp_hdr->flags != RXP_HRXD_DONE) + break; + buflen = rxp_hdr->len; + + /* Sanity check the RXP header */ + if (rxp_hdr->status != RXP_HRXD_OK || + buflen > (rx_desc->len - sizeof(*rxp_hdr))) { + c2_rx_error(c2_port, elem); + continue; + } + + /* + * Allocate and map a new skb for replenishing the host + * RX desc + */ + if (c2_rx_alloc(c2_port, elem)) { + c2_rx_error(c2_port, elem); + continue; + } + + /* Unmap the old skb */ + pci_unmap_single(c2dev->pcidev, mapaddr, maplen, + PCI_DMA_FROMDEVICE); + + prefetch(skb->data); + + /* + * Skip past the leading 8 bytes comprising of the + * "struct c2_rxp_hdr", prepended by the adapter + * to the usual Ethernet header ("struct ethhdr"), + * to the start of the raw Ethernet packet. + * + * Fix up the various fields in the sk_buff before + * passing it up to netif_rx(). The transfer size + * (in bytes) specified by the adapter len field of + * the "struct rxp_hdr_t" does NOT include the + * "sizeof(struct c2_rxp_hdr)". + */ + skb->data += sizeof(*rxp_hdr); + skb_set_tail_pointer(skb, buflen); + skb->len = buflen; + skb->protocol = eth_type_trans(skb, netdev); + + netif_rx(skb); + + netdev->stats.rx_packets++; + netdev->stats.rx_bytes += buflen; + } + + /* Save where we left off */ + rx_ring->to_clean = elem; + c2dev->cur_rx = elem - rx_ring->start; + C2_SET_CUR_RX(c2dev, c2dev->cur_rx); + + spin_unlock_irqrestore(&c2dev->lock, flags); +} + +/* + * Handle netisr0 TX & RX interrupts. + */ +static irqreturn_t c2_interrupt(int irq, void *dev_id) +{ + unsigned int netisr0, dmaisr; + int handled = 0; + struct c2_dev *c2dev = (struct c2_dev *) dev_id; + + /* Process CCILNET interrupts */ + netisr0 = readl(c2dev->regs + C2_NISR0); + if (netisr0) { + + /* + * There is an issue with the firmware that always + * provides the status of RX for both TX & RX + * interrupts. So process both queues here. + */ + c2_rx_interrupt(c2dev->netdev); + c2_tx_interrupt(c2dev->netdev); + + /* Clear the interrupt */ + writel(netisr0, c2dev->regs + C2_NISR0); + handled++; + } + + /* Process RNIC interrupts */ + dmaisr = readl(c2dev->regs + C2_DISR); + if (dmaisr) { + writel(dmaisr, c2dev->regs + C2_DISR); + c2_rnic_interrupt(c2dev); + handled++; + } + + if (handled) { + return IRQ_HANDLED; + } else { + return IRQ_NONE; + } +} + +static int c2_up(struct net_device *netdev) +{ + struct c2_port *c2_port = netdev_priv(netdev); + struct c2_dev *c2dev = c2_port->c2dev; + struct c2_element *elem; + struct c2_rxp_hdr *rxp_hdr; + struct in_device *in_dev; + size_t rx_size, tx_size; + int ret, i; + unsigned int netimr0; + + if (netif_msg_ifup(c2_port)) + pr_debug("%s: enabling interface\n", netdev->name); + + /* Set the Rx buffer size based on MTU */ + c2_set_rxbufsize(c2_port); + + /* Allocate DMA'able memory for Tx/Rx host descriptor rings */ + rx_size = c2_port->rx_ring.count * sizeof(struct c2_rx_desc); + tx_size = c2_port->tx_ring.count * sizeof(struct c2_tx_desc); + + c2_port->mem_size = tx_size + rx_size; + c2_port->mem = pci_zalloc_consistent(c2dev->pcidev, c2_port->mem_size, + &c2_port->dma); + if (c2_port->mem == NULL) { + pr_debug("Unable to allocate memory for " + "host descriptor rings\n"); + return -ENOMEM; + } + + /* Create the Rx host descriptor ring */ + if ((ret = + c2_rx_ring_alloc(&c2_port->rx_ring, c2_port->mem, c2_port->dma, + c2dev->mmio_rxp_ring))) { + pr_debug("Unable to create RX ring\n"); + goto bail0; + } + + /* Allocate Rx buffers for the host descriptor ring */ + if (c2_rx_fill(c2_port)) { + pr_debug("Unable to fill RX ring\n"); + goto bail1; + } + + /* Create the Tx host descriptor ring */ + if ((ret = c2_tx_ring_alloc(&c2_port->tx_ring, c2_port->mem + rx_size, + c2_port->dma + rx_size, + c2dev->mmio_txp_ring))) { + pr_debug("Unable to create TX ring\n"); + goto bail1; + } + + /* Set the TX pointer to where we left off */ + c2_port->tx_avail = c2_port->tx_ring.count - 1; + c2_port->tx_ring.to_use = c2_port->tx_ring.to_clean = + c2_port->tx_ring.start + c2dev->cur_tx; + + /* missing: Initialize MAC */ + + BUG_ON(c2_port->tx_ring.to_use != c2_port->tx_ring.to_clean); + + /* Reset the adapter, ensures the driver is in sync with the RXP */ + c2_reset(c2_port); + + /* Reset the READY bit in the sk_buff RXP headers & adapter HRXDQ */ + for (i = 0, elem = c2_port->rx_ring.start; i < c2_port->rx_ring.count; + i++, elem++) { + rxp_hdr = (struct c2_rxp_hdr *) elem->skb->data; + rxp_hdr->flags = 0; + __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY), + elem->hw_desc + C2_RXP_FLAGS); + } + + /* Enable network packets */ + netif_start_queue(netdev); + + /* Enable IRQ */ + writel(0, c2dev->regs + C2_IDIS); + netimr0 = readl(c2dev->regs + C2_NIMR0); + netimr0 &= ~(C2_PCI_HTX_INT | C2_PCI_HRX_INT); + writel(netimr0, c2dev->regs + C2_NIMR0); + + /* Tell the stack to ignore arp requests for ipaddrs bound to + * other interfaces. This is needed to prevent the host stack + * from responding to arp requests to the ipaddr bound on the + * rdma interface. + */ + in_dev = in_dev_get(netdev); + IN_DEV_CONF_SET(in_dev, ARP_IGNORE, 1); + in_dev_put(in_dev); + + return 0; + + bail1: + c2_rx_clean(c2_port); + kfree(c2_port->rx_ring.start); + + bail0: + pci_free_consistent(c2dev->pcidev, c2_port->mem_size, c2_port->mem, + c2_port->dma); + + return ret; +} + +static int c2_down(struct net_device *netdev) +{ + struct c2_port *c2_port = netdev_priv(netdev); + struct c2_dev *c2dev = c2_port->c2dev; + + if (netif_msg_ifdown(c2_port)) + pr_debug("%s: disabling interface\n", + netdev->name); + + /* Wait for all the queued packets to get sent */ + c2_tx_interrupt(netdev); + + /* Disable network packets */ + netif_stop_queue(netdev); + + /* Disable IRQs by clearing the interrupt mask */ + writel(1, c2dev->regs + C2_IDIS); + writel(0, c2dev->regs + C2_NIMR0); + + /* missing: Stop transmitter */ + + /* missing: Stop receiver */ + + /* Reset the adapter, ensures the driver is in sync with the RXP */ + c2_reset(c2_port); + + /* missing: Turn off LEDs here */ + + /* Free all buffers in the host descriptor rings */ + c2_tx_clean(c2_port); + c2_rx_clean(c2_port); + + /* Free the host descriptor rings */ + kfree(c2_port->rx_ring.start); + kfree(c2_port->tx_ring.start); + pci_free_consistent(c2dev->pcidev, c2_port->mem_size, c2_port->mem, + c2_port->dma); + + return 0; +} + +static void c2_reset(struct c2_port *c2_port) +{ + struct c2_dev *c2dev = c2_port->c2dev; + unsigned int cur_rx = c2dev->cur_rx; + + /* Tell the hardware to quiesce */ + C2_SET_CUR_RX(c2dev, cur_rx | C2_PCI_HRX_QUI); + + /* + * The hardware will reset the C2_PCI_HRX_QUI bit once + * the RXP is quiesced. Wait 2 seconds for this. + */ + ssleep(2); + + cur_rx = C2_GET_CUR_RX(c2dev); + + if (cur_rx & C2_PCI_HRX_QUI) + pr_debug("c2_reset: failed to quiesce the hardware!\n"); + + cur_rx &= ~C2_PCI_HRX_QUI; + + c2dev->cur_rx = cur_rx; + + pr_debug("Current RX: %u\n", c2dev->cur_rx); +} + +static int c2_xmit_frame(struct sk_buff *skb, struct net_device *netdev) +{ + struct c2_port *c2_port = netdev_priv(netdev); + struct c2_dev *c2dev = c2_port->c2dev; + struct c2_ring *tx_ring = &c2_port->tx_ring; + struct c2_element *elem; + dma_addr_t mapaddr; + u32 maplen; + unsigned long flags; + unsigned int i; + + spin_lock_irqsave(&c2_port->tx_lock, flags); + + if (unlikely(c2_port->tx_avail < (skb_shinfo(skb)->nr_frags + 1))) { + netif_stop_queue(netdev); + spin_unlock_irqrestore(&c2_port->tx_lock, flags); + + pr_debug("%s: Tx ring full when queue awake!\n", + netdev->name); + return NETDEV_TX_BUSY; + } + + maplen = skb_headlen(skb); + mapaddr = + pci_map_single(c2dev->pcidev, skb->data, maplen, PCI_DMA_TODEVICE); + + elem = tx_ring->to_use; + elem->skb = skb; + elem->mapaddr = mapaddr; + elem->maplen = maplen; + + /* Tell HW to xmit */ + __raw_writeq((__force u64) cpu_to_be64(mapaddr), + elem->hw_desc + C2_TXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(maplen), + elem->hw_desc + C2_TXP_LEN); + __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_READY), + elem->hw_desc + C2_TXP_FLAGS); + + netdev->stats.tx_packets++; + netdev->stats.tx_bytes += maplen; + + /* Loop thru additional data fragments and queue them */ + if (skb_shinfo(skb)->nr_frags) { + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + maplen = skb_frag_size(frag); + mapaddr = skb_frag_dma_map(&c2dev->pcidev->dev, frag, + 0, maplen, DMA_TO_DEVICE); + elem = elem->next; + elem->skb = NULL; + elem->mapaddr = mapaddr; + elem->maplen = maplen; + + /* Tell HW to xmit */ + __raw_writeq((__force u64) cpu_to_be64(mapaddr), + elem->hw_desc + C2_TXP_ADDR); + __raw_writew((__force u16) cpu_to_be16(maplen), + elem->hw_desc + C2_TXP_LEN); + __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_READY), + elem->hw_desc + C2_TXP_FLAGS); + + netdev->stats.tx_packets++; + netdev->stats.tx_bytes += maplen; + } + } + + tx_ring->to_use = elem->next; + c2_port->tx_avail -= (skb_shinfo(skb)->nr_frags + 1); + + if (c2_port->tx_avail <= MAX_SKB_FRAGS + 1) { + netif_stop_queue(netdev); + if (netif_msg_tx_queued(c2_port)) + pr_debug("%s: transmit queue full\n", + netdev->name); + } + + spin_unlock_irqrestore(&c2_port->tx_lock, flags); + + netdev->trans_start = jiffies; + + return NETDEV_TX_OK; +} + +static void c2_tx_timeout(struct net_device *netdev) +{ + struct c2_port *c2_port = netdev_priv(netdev); + + if (netif_msg_timer(c2_port)) + pr_debug("%s: tx timeout\n", netdev->name); + + c2_tx_clean(c2_port); +} + +static int c2_change_mtu(struct net_device *netdev, int new_mtu) +{ + int ret = 0; + + if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU) + return -EINVAL; + + netdev->mtu = new_mtu; + + if (netif_running(netdev)) { + c2_down(netdev); + + c2_up(netdev); + } + + return ret; +} + +static const struct net_device_ops c2_netdev = { + .ndo_open = c2_up, + .ndo_stop = c2_down, + .ndo_start_xmit = c2_xmit_frame, + .ndo_tx_timeout = c2_tx_timeout, + .ndo_change_mtu = c2_change_mtu, + .ndo_set_mac_address = eth_mac_addr, + .ndo_validate_addr = eth_validate_addr, +}; + +/* Initialize network device */ +static struct net_device *c2_devinit(struct c2_dev *c2dev, + void __iomem * mmio_addr) +{ + struct c2_port *c2_port = NULL; + struct net_device *netdev = alloc_etherdev(sizeof(*c2_port)); + + if (!netdev) { + pr_debug("c2_port etherdev alloc failed"); + return NULL; + } + + SET_NETDEV_DEV(netdev, &c2dev->pcidev->dev); + + netdev->netdev_ops = &c2_netdev; + netdev->watchdog_timeo = C2_TX_TIMEOUT; + netdev->irq = c2dev->pcidev->irq; + + c2_port = netdev_priv(netdev); + c2_port->netdev = netdev; + c2_port->c2dev = c2dev; + c2_port->msg_enable = netif_msg_init(debug, default_msg); + c2_port->tx_ring.count = C2_NUM_TX_DESC; + c2_port->rx_ring.count = C2_NUM_RX_DESC; + + spin_lock_init(&c2_port->tx_lock); + + /* Copy our 48-bit ethernet hardware address */ + memcpy_fromio(netdev->dev_addr, mmio_addr + C2_REGS_ENADDR, 6); + + /* Validate the MAC address */ + if (!is_valid_ether_addr(netdev->dev_addr)) { + pr_debug("Invalid MAC Address\n"); + c2_print_macaddr(netdev); + free_netdev(netdev); + return NULL; + } + + c2dev->netdev = netdev; + + return netdev; +} + +static int c2_probe(struct pci_dev *pcidev, const struct pci_device_id *ent) +{ + int ret = 0, i; + unsigned long reg0_start, reg0_flags, reg0_len; + unsigned long reg2_start, reg2_flags, reg2_len; + unsigned long reg4_start, reg4_flags, reg4_len; + unsigned kva_map_size; + struct net_device *netdev = NULL; + struct c2_dev *c2dev = NULL; + void __iomem *mmio_regs = NULL; + + printk(KERN_INFO PFX "AMSO1100 Gigabit Ethernet driver v%s loaded\n", + DRV_VERSION); + + /* Enable PCI device */ + ret = pci_enable_device(pcidev); + if (ret) { + printk(KERN_ERR PFX "%s: Unable to enable PCI device\n", + pci_name(pcidev)); + goto bail0; + } + + reg0_start = pci_resource_start(pcidev, BAR_0); + reg0_len = pci_resource_len(pcidev, BAR_0); + reg0_flags = pci_resource_flags(pcidev, BAR_0); + + reg2_start = pci_resource_start(pcidev, BAR_2); + reg2_len = pci_resource_len(pcidev, BAR_2); + reg2_flags = pci_resource_flags(pcidev, BAR_2); + + reg4_start = pci_resource_start(pcidev, BAR_4); + reg4_len = pci_resource_len(pcidev, BAR_4); + reg4_flags = pci_resource_flags(pcidev, BAR_4); + + pr_debug("BAR0 size = 0x%lX bytes\n", reg0_len); + pr_debug("BAR2 size = 0x%lX bytes\n", reg2_len); + pr_debug("BAR4 size = 0x%lX bytes\n", reg4_len); + + /* Make sure PCI base addr are MMIO */ + if (!(reg0_flags & IORESOURCE_MEM) || + !(reg2_flags & IORESOURCE_MEM) || !(reg4_flags & IORESOURCE_MEM)) { + printk(KERN_ERR PFX "PCI regions not an MMIO resource\n"); + ret = -ENODEV; + goto bail1; + } + + /* Check for weird/broken PCI region reporting */ + if ((reg0_len < C2_REG0_SIZE) || + (reg2_len < C2_REG2_SIZE) || (reg4_len < C2_REG4_SIZE)) { + printk(KERN_ERR PFX "Invalid PCI region sizes\n"); + ret = -ENODEV; + goto bail1; + } + + /* Reserve PCI I/O and memory resources */ + ret = pci_request_regions(pcidev, DRV_NAME); + if (ret) { + printk(KERN_ERR PFX "%s: Unable to request regions\n", + pci_name(pcidev)); + goto bail1; + } + + if ((sizeof(dma_addr_t) > 4)) { + ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(64)); + if (ret < 0) { + printk(KERN_ERR PFX "64b DMA configuration failed\n"); + goto bail2; + } + } else { + ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32)); + if (ret < 0) { + printk(KERN_ERR PFX "32b DMA configuration failed\n"); + goto bail2; + } + } + + /* Enables bus-mastering on the device */ + pci_set_master(pcidev); + + /* Remap the adapter PCI registers in BAR4 */ + mmio_regs = ioremap_nocache(reg4_start + C2_PCI_REGS_OFFSET, + sizeof(struct c2_adapter_pci_regs)); + if (!mmio_regs) { + printk(KERN_ERR PFX + "Unable to remap adapter PCI registers in BAR4\n"); + ret = -EIO; + goto bail2; + } + + /* Validate PCI regs magic */ + for (i = 0; i < sizeof(c2_magic); i++) { + if (c2_magic[i] != readb(mmio_regs + C2_REGS_MAGIC + i)) { + printk(KERN_ERR PFX "Downlevel Firmware boot loader " + "[%d/%Zd: got 0x%x, exp 0x%x]. Use the cc_flash " + "utility to update your boot loader\n", + i + 1, sizeof(c2_magic), + readb(mmio_regs + C2_REGS_MAGIC + i), + c2_magic[i]); + printk(KERN_ERR PFX "Adapter not claimed\n"); + iounmap(mmio_regs); + ret = -EIO; + goto bail2; + } + } + + /* Validate the adapter version */ + if (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_VERS)) != C2_VERSION) { + printk(KERN_ERR PFX "Version mismatch " + "[fw=%u, c2=%u], Adapter not claimed\n", + be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_VERS)), + C2_VERSION); + ret = -EINVAL; + iounmap(mmio_regs); + goto bail2; + } + + /* Validate the adapter IVN */ + if (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_IVN)) != C2_IVN) { + printk(KERN_ERR PFX "Downlevel FIrmware level. You should be using " + "the OpenIB device support kit. " + "[fw=0x%x, c2=0x%x], Adapter not claimed\n", + be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_IVN)), + C2_IVN); + ret = -EINVAL; + iounmap(mmio_regs); + goto bail2; + } + + /* Allocate hardware structure */ + c2dev = (struct c2_dev *) ib_alloc_device(sizeof(*c2dev)); + if (!c2dev) { + printk(KERN_ERR PFX "%s: Unable to alloc hardware struct\n", + pci_name(pcidev)); + ret = -ENOMEM; + iounmap(mmio_regs); + goto bail2; + } + + memset(c2dev, 0, sizeof(*c2dev)); + spin_lock_init(&c2dev->lock); + c2dev->pcidev = pcidev; + c2dev->cur_tx = 0; + + /* Get the last RX index */ + c2dev->cur_rx = + (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_HRX_CUR)) - + 0xffffc000) / sizeof(struct c2_rxp_desc); + + /* Request an interrupt line for the driver */ + ret = request_irq(pcidev->irq, c2_interrupt, IRQF_SHARED, DRV_NAME, c2dev); + if (ret) { + printk(KERN_ERR PFX "%s: requested IRQ %u is busy\n", + pci_name(pcidev), pcidev->irq); + iounmap(mmio_regs); + goto bail3; + } + + /* Set driver specific data */ + pci_set_drvdata(pcidev, c2dev); + + /* Initialize network device */ + if ((netdev = c2_devinit(c2dev, mmio_regs)) == NULL) { + ret = -ENOMEM; + iounmap(mmio_regs); + goto bail4; + } + + /* Save off the actual size prior to unmapping mmio_regs */ + kva_map_size = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_PCI_WINSIZE)); + + /* Unmap the adapter PCI registers in BAR4 */ + iounmap(mmio_regs); + + /* Register network device */ + ret = register_netdev(netdev); + if (ret) { + printk(KERN_ERR PFX "Unable to register netdev, ret = %d\n", + ret); + goto bail5; + } + + /* Disable network packets */ + netif_stop_queue(netdev); + + /* Remap the adapter HRXDQ PA space to kernel VA space */ + c2dev->mmio_rxp_ring = ioremap_nocache(reg4_start + C2_RXP_HRXDQ_OFFSET, + C2_RXP_HRXDQ_SIZE); + if (!c2dev->mmio_rxp_ring) { + printk(KERN_ERR PFX "Unable to remap MMIO HRXDQ region\n"); + ret = -EIO; + goto bail6; + } + + /* Remap the adapter HTXDQ PA space to kernel VA space */ + c2dev->mmio_txp_ring = ioremap_nocache(reg4_start + C2_TXP_HTXDQ_OFFSET, + C2_TXP_HTXDQ_SIZE); + if (!c2dev->mmio_txp_ring) { + printk(KERN_ERR PFX "Unable to remap MMIO HTXDQ region\n"); + ret = -EIO; + goto bail7; + } + + /* Save off the current RX index in the last 4 bytes of the TXP Ring */ + C2_SET_CUR_RX(c2dev, c2dev->cur_rx); + + /* Remap the PCI registers in adapter BAR0 to kernel VA space */ + c2dev->regs = ioremap_nocache(reg0_start, reg0_len); + if (!c2dev->regs) { + printk(KERN_ERR PFX "Unable to remap BAR0\n"); + ret = -EIO; + goto bail8; + } + + /* Remap the PCI registers in adapter BAR4 to kernel VA space */ + c2dev->pa = reg4_start + C2_PCI_REGS_OFFSET; + c2dev->kva = ioremap_nocache(reg4_start + C2_PCI_REGS_OFFSET, + kva_map_size); + if (!c2dev->kva) { + printk(KERN_ERR PFX "Unable to remap BAR4\n"); + ret = -EIO; + goto bail9; + } + + /* Print out the MAC address */ + c2_print_macaddr(netdev); + + ret = c2_rnic_init(c2dev); + if (ret) { + printk(KERN_ERR PFX "c2_rnic_init failed: %d\n", ret); + goto bail10; + } + + ret = c2_register_device(c2dev); + if (ret) + goto bail10; + + return 0; + + bail10: + iounmap(c2dev->kva); + + bail9: + iounmap(c2dev->regs); + + bail8: + iounmap(c2dev->mmio_txp_ring); + + bail7: + iounmap(c2dev->mmio_rxp_ring); + + bail6: + unregister_netdev(netdev); + + bail5: + free_netdev(netdev); + + bail4: + free_irq(pcidev->irq, c2dev); + + bail3: + ib_dealloc_device(&c2dev->ibdev); + + bail2: + pci_release_regions(pcidev); + + bail1: + pci_disable_device(pcidev); + + bail0: + return ret; +} + +static void c2_remove(struct pci_dev *pcidev) +{ + struct c2_dev *c2dev = pci_get_drvdata(pcidev); + struct net_device *netdev = c2dev->netdev; + + /* Unregister with OpenIB */ + c2_unregister_device(c2dev); + + /* Clean up the RNIC resources */ + c2_rnic_term(c2dev); + + /* Remove network device from the kernel */ + unregister_netdev(netdev); + + /* Free network device */ + free_netdev(netdev); + + /* Free the interrupt line */ + free_irq(pcidev->irq, c2dev); + + /* missing: Turn LEDs off here */ + + /* Unmap adapter PA space */ + iounmap(c2dev->kva); + iounmap(c2dev->regs); + iounmap(c2dev->mmio_txp_ring); + iounmap(c2dev->mmio_rxp_ring); + + /* Free the hardware structure */ + ib_dealloc_device(&c2dev->ibdev); + + /* Release reserved PCI I/O and memory resources */ + pci_release_regions(pcidev); + + /* Disable PCI device */ + pci_disable_device(pcidev); + + /* Clear driver specific data */ + pci_set_drvdata(pcidev, NULL); +} + +static struct pci_driver c2_pci_driver = { + .name = DRV_NAME, + .id_table = c2_pci_table, + .probe = c2_probe, + .remove = c2_remove, +}; + +module_pci_driver(c2_pci_driver); diff --git a/drivers/staging/rdma/amso1100/c2.h b/drivers/staging/rdma/amso1100/c2.h new file mode 100644 index 000000000..d619d7358 --- /dev/null +++ b/drivers/staging/rdma/amso1100/c2.h @@ -0,0 +1,547 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __C2_H +#define __C2_H + +#include +#include +#include +#include +#include +#include + +#include "c2_provider.h" +#include "c2_mq.h" +#include "c2_status.h" + +#define DRV_NAME "c2" +#define DRV_VERSION "1.1" +#define PFX DRV_NAME ": " + +#define BAR_0 0 +#define BAR_2 2 +#define BAR_4 4 + +#define RX_BUF_SIZE (1536 + 8) +#define ETH_JUMBO_MTU 9000 +#define C2_MAGIC "CEPHEUS" +#define C2_VERSION 4 +#define C2_IVN (18 & 0x7fffffff) + +#define C2_REG0_SIZE (16 * 1024) +#define C2_REG2_SIZE (2 * 1024 * 1024) +#define C2_REG4_SIZE (256 * 1024 * 1024) +#define C2_NUM_TX_DESC 341 +#define C2_NUM_RX_DESC 256 +#define C2_PCI_REGS_OFFSET (0x10000) +#define C2_RXP_HRXDQ_OFFSET (((C2_REG4_SIZE)/2)) +#define C2_RXP_HRXDQ_SIZE (4096) +#define C2_TXP_HTXDQ_OFFSET (((C2_REG4_SIZE)/2) + C2_RXP_HRXDQ_SIZE) +#define C2_TXP_HTXDQ_SIZE (4096) +#define C2_TX_TIMEOUT (6*HZ) + +/* CEPHEUS */ +static const u8 c2_magic[] = { + 0x43, 0x45, 0x50, 0x48, 0x45, 0x55, 0x53 +}; + +enum adapter_pci_regs { + C2_REGS_MAGIC = 0x0000, + C2_REGS_VERS = 0x0008, + C2_REGS_IVN = 0x000C, + C2_REGS_PCI_WINSIZE = 0x0010, + C2_REGS_Q0_QSIZE = 0x0014, + C2_REGS_Q0_MSGSIZE = 0x0018, + C2_REGS_Q0_POOLSTART = 0x001C, + C2_REGS_Q0_SHARED = 0x0020, + C2_REGS_Q1_QSIZE = 0x0024, + C2_REGS_Q1_MSGSIZE = 0x0028, + C2_REGS_Q1_SHARED = 0x0030, + C2_REGS_Q2_QSIZE = 0x0034, + C2_REGS_Q2_MSGSIZE = 0x0038, + C2_REGS_Q2_SHARED = 0x0040, + C2_REGS_ENADDR = 0x004C, + C2_REGS_RDMA_ENADDR = 0x0054, + C2_REGS_HRX_CUR = 0x006C, +}; + +struct c2_adapter_pci_regs { + char reg_magic[8]; + u32 version; + u32 ivn; + u32 pci_window_size; + u32 q0_q_size; + u32 q0_msg_size; + u32 q0_pool_start; + u32 q0_shared; + u32 q1_q_size; + u32 q1_msg_size; + u32 q1_pool_start; + u32 q1_shared; + u32 q2_q_size; + u32 q2_msg_size; + u32 q2_pool_start; + u32 q2_shared; + u32 log_start; + u32 log_size; + u8 host_enaddr[8]; + u8 rdma_enaddr[8]; + u32 crash_entry; + u32 crash_ready[2]; + u32 fw_txd_cur; + u32 fw_hrxd_cur; + u32 fw_rxd_cur; +}; + +enum pci_regs { + C2_HISR = 0x0000, + C2_DISR = 0x0004, + C2_HIMR = 0x0008, + C2_DIMR = 0x000C, + C2_NISR0 = 0x0010, + C2_NISR1 = 0x0014, + C2_NIMR0 = 0x0018, + C2_NIMR1 = 0x001C, + C2_IDIS = 0x0020, +}; + +enum { + C2_PCI_HRX_INT = 1 << 8, + C2_PCI_HTX_INT = 1 << 17, + C2_PCI_HRX_QUI = 1 << 31, +}; + +/* + * Cepheus registers in BAR0. + */ +struct c2_pci_regs { + u32 hostisr; + u32 dmaisr; + u32 hostimr; + u32 dmaimr; + u32 netisr0; + u32 netisr1; + u32 netimr0; + u32 netimr1; + u32 int_disable; +}; + +/* TXP flags */ +enum c2_txp_flags { + TXP_HTXD_DONE = 0, + TXP_HTXD_READY = 1 << 0, + TXP_HTXD_UNINIT = 1 << 1, +}; + +/* RXP flags */ +enum c2_rxp_flags { + RXP_HRXD_UNINIT = 0, + RXP_HRXD_READY = 1 << 0, + RXP_HRXD_DONE = 1 << 1, +}; + +/* RXP status */ +enum c2_rxp_status { + RXP_HRXD_ZERO = 0, + RXP_HRXD_OK = 1 << 0, + RXP_HRXD_BUF_OV = 1 << 1, +}; + +/* TXP descriptor fields */ +enum txp_desc { + C2_TXP_FLAGS = 0x0000, + C2_TXP_LEN = 0x0002, + C2_TXP_ADDR = 0x0004, +}; + +/* RXP descriptor fields */ +enum rxp_desc { + C2_RXP_FLAGS = 0x0000, + C2_RXP_STATUS = 0x0002, + C2_RXP_COUNT = 0x0004, + C2_RXP_LEN = 0x0006, + C2_RXP_ADDR = 0x0008, +}; + +struct c2_txp_desc { + u16 flags; + u16 len; + u64 addr; +} __attribute__ ((packed)); + +struct c2_rxp_desc { + u16 flags; + u16 status; + u16 count; + u16 len; + u64 addr; +} __attribute__ ((packed)); + +struct c2_rxp_hdr { + u16 flags; + u16 status; + u16 len; + u16 rsvd; +} __attribute__ ((packed)); + +struct c2_tx_desc { + u32 len; + u32 status; + dma_addr_t next_offset; +}; + +struct c2_rx_desc { + u32 len; + u32 status; + dma_addr_t next_offset; +}; + +struct c2_alloc { + u32 last; + u32 max; + spinlock_t lock; + unsigned long *table; +}; + +struct c2_array { + struct { + void **page; + int used; + } *page_list; +}; + +/* + * The MQ shared pointer pool is organized as a linked list of + * chunks. Each chunk contains a linked list of free shared pointers + * that can be allocated to a given user mode client. + * + */ +struct sp_chunk { + struct sp_chunk *next; + dma_addr_t dma_addr; + DEFINE_DMA_UNMAP_ADDR(mapping); + u16 head; + u16 shared_ptr[0]; +}; + +struct c2_pd_table { + u32 last; + u32 max; + spinlock_t lock; + unsigned long *table; +}; + +struct c2_qp_table { + struct idr idr; + spinlock_t lock; +}; + +struct c2_element { + struct c2_element *next; + void *ht_desc; /* host descriptor */ + void __iomem *hw_desc; /* hardware descriptor */ + struct sk_buff *skb; + dma_addr_t mapaddr; + u32 maplen; +}; + +struct c2_ring { + struct c2_element *to_clean; + struct c2_element *to_use; + struct c2_element *start; + unsigned long count; +}; + +struct c2_dev { + struct ib_device ibdev; + void __iomem *regs; + void __iomem *mmio_txp_ring; /* remapped adapter memory for hw rings */ + void __iomem *mmio_rxp_ring; + spinlock_t lock; + struct pci_dev *pcidev; + struct net_device *netdev; + struct net_device *pseudo_netdev; + unsigned int cur_tx; + unsigned int cur_rx; + u32 adapter_handle; + int device_cap_flags; + void __iomem *kva; /* KVA device memory */ + unsigned long pa; /* PA device memory */ + void **qptr_array; + + struct kmem_cache *host_msg_cache; + + struct list_head cca_link; /* adapter list */ + struct list_head eh_wakeup_list; /* event wakeup list */ + wait_queue_head_t req_vq_wo; + + /* Cached RNIC properties */ + struct ib_device_attr props; + + struct c2_pd_table pd_table; + struct c2_qp_table qp_table; + int ports; /* num of GigE ports */ + int devnum; + spinlock_t vqlock; /* sync vbs req MQ */ + + /* Verbs Queues */ + struct c2_mq req_vq; /* Verbs Request MQ */ + struct c2_mq rep_vq; /* Verbs Reply MQ */ + struct c2_mq aeq; /* Async Events MQ */ + + /* Kernel client MQs */ + struct sp_chunk *kern_mqsp_pool; + + /* Device updates these values when posting messages to a host + * target queue */ + u16 req_vq_shared; + u16 rep_vq_shared; + u16 aeq_shared; + u16 irq_claimed; + + /* + * Shared host target pages for user-accessible MQs. + */ + int hthead; /* index of first free entry */ + void *htpages; /* kernel vaddr */ + int htlen; /* length of htpages memory */ + void *htuva; /* user mapped vaddr */ + spinlock_t htlock; /* serialize allocation */ + + u64 adapter_hint_uva; /* access to the activity FIFO */ + + // spinlock_t aeq_lock; + // spinlock_t rnic_lock; + + __be16 *hint_count; + dma_addr_t hint_count_dma; + u16 hints_read; + + int init; /* TRUE if it's ready */ + char ae_cache_name[16]; + char vq_cache_name[16]; +}; + +struct c2_port { + u32 msg_enable; + struct c2_dev *c2dev; + struct net_device *netdev; + + spinlock_t tx_lock; + u32 tx_avail; + struct c2_ring tx_ring; + struct c2_ring rx_ring; + + void *mem; /* PCI memory for host rings */ + dma_addr_t dma; + unsigned long mem_size; + + u32 rx_buf_size; +}; + +/* + * Activity FIFO registers in BAR0. + */ +#define PCI_BAR0_HOST_HINT 0x100 +#define PCI_BAR0_ADAPTER_HINT 0x2000 + +/* + * Ammasso PCI vendor id and Cepheus PCI device id. + */ +#define CQ_ARMED 0x01 +#define CQ_WAIT_FOR_DMA 0x80 + +/* + * The format of a hint is as follows: + * Lower 16 bits are the count of hints for the queue. + * Next 15 bits are the qp_index + * Upper most bit depends on who reads it: + * If read by producer, then it means Full (1) or Not-Full (0) + * If read by consumer, then it means Empty (1) or Not-Empty (0) + */ +#define C2_HINT_MAKE(q_index, hint_count) (((q_index) << 16) | hint_count) +#define C2_HINT_GET_INDEX(hint) (((hint) & 0x7FFF0000) >> 16) +#define C2_HINT_GET_COUNT(hint) ((hint) & 0x0000FFFF) + + +/* + * The following defines the offset in SDRAM for the c2_adapter_pci_regs_t + * struct. + */ +#define C2_ADAPTER_PCI_REGS_OFFSET 0x10000 + +#ifndef readq +static inline u64 readq(const void __iomem * addr) +{ + u64 ret = readl(addr + 4); + ret <<= 32; + ret |= readl(addr); + + return ret; +} +#endif + +#ifndef writeq +static inline void __raw_writeq(u64 val, void __iomem * addr) +{ + __raw_writel((u32) (val), addr); + __raw_writel((u32) (val >> 32), (addr + 4)); +} +#endif + +#define C2_SET_CUR_RX(c2dev, cur_rx) \ + __raw_writel((__force u32) cpu_to_be32(cur_rx), c2dev->mmio_txp_ring + 4092) + +#define C2_GET_CUR_RX(c2dev) \ + be32_to_cpu((__force __be32) readl(c2dev->mmio_txp_ring + 4092)) + +static inline struct c2_dev *to_c2dev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct c2_dev, ibdev); +} + +static inline int c2_errno(void *reply) +{ + switch (c2_wr_get_result(reply)) { + case C2_OK: + return 0; + case CCERR_NO_BUFS: + case CCERR_INSUFFICIENT_RESOURCES: + case CCERR_ZERO_RDMA_READ_RESOURCES: + return -ENOMEM; + case CCERR_MR_IN_USE: + case CCERR_QP_IN_USE: + return -EBUSY; + case CCERR_ADDR_IN_USE: + return -EADDRINUSE; + case CCERR_ADDR_NOT_AVAIL: + return -EADDRNOTAVAIL; + case CCERR_CONN_RESET: + return -ECONNRESET; + case CCERR_NOT_IMPLEMENTED: + case CCERR_INVALID_WQE: + return -ENOSYS; + case CCERR_QP_NOT_PRIVILEGED: + return -EPERM; + case CCERR_STACK_ERROR: + return -EPROTO; + case CCERR_ACCESS_VIOLATION: + case CCERR_BASE_AND_BOUNDS_VIOLATION: + return -EFAULT; + case CCERR_STAG_STATE_NOT_INVALID: + case CCERR_INVALID_ADDRESS: + case CCERR_INVALID_CQ: + case CCERR_INVALID_EP: + case CCERR_INVALID_MODIFIER: + case CCERR_INVALID_MTU: + case CCERR_INVALID_PD_ID: + case CCERR_INVALID_QP: + case CCERR_INVALID_RNIC: + case CCERR_INVALID_STAG: + return -EINVAL; + default: + return -EAGAIN; + } +} + +/* Device */ +extern int c2_register_device(struct c2_dev *c2dev); +extern void c2_unregister_device(struct c2_dev *c2dev); +extern int c2_rnic_init(struct c2_dev *c2dev); +extern void c2_rnic_term(struct c2_dev *c2dev); +extern void c2_rnic_interrupt(struct c2_dev *c2dev); +extern int c2_del_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask); +extern int c2_add_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask); + +/* QPs */ +extern int c2_alloc_qp(struct c2_dev *c2dev, struct c2_pd *pd, + struct ib_qp_init_attr *qp_attrs, struct c2_qp *qp); +extern void c2_free_qp(struct c2_dev *c2dev, struct c2_qp *qp); +extern struct ib_qp *c2_get_qp(struct ib_device *device, int qpn); +extern int c2_qp_modify(struct c2_dev *c2dev, struct c2_qp *qp, + struct ib_qp_attr *attr, int attr_mask); +extern int c2_qp_set_read_limits(struct c2_dev *c2dev, struct c2_qp *qp, + int ord, int ird); +extern int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, + struct ib_send_wr **bad_wr); +extern int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr, + struct ib_recv_wr **bad_wr); +extern void c2_init_qp_table(struct c2_dev *c2dev); +extern void c2_cleanup_qp_table(struct c2_dev *c2dev); +extern void c2_set_qp_state(struct c2_qp *, int); +extern struct c2_qp *c2_find_qpn(struct c2_dev *c2dev, int qpn); + +/* PDs */ +extern int c2_pd_alloc(struct c2_dev *c2dev, int privileged, struct c2_pd *pd); +extern void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd); +extern int c2_init_pd_table(struct c2_dev *c2dev); +extern void c2_cleanup_pd_table(struct c2_dev *c2dev); + +/* CQs */ +extern int c2_init_cq(struct c2_dev *c2dev, int entries, + struct c2_ucontext *ctx, struct c2_cq *cq); +extern void c2_free_cq(struct c2_dev *c2dev, struct c2_cq *cq); +extern void c2_cq_event(struct c2_dev *c2dev, u32 mq_index); +extern void c2_cq_clean(struct c2_dev *c2dev, struct c2_qp *qp, u32 mq_index); +extern int c2_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); +extern int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); + +/* CM */ +extern int c2_llp_connect(struct iw_cm_id *cm_id, + struct iw_cm_conn_param *iw_param); +extern int c2_llp_accept(struct iw_cm_id *cm_id, + struct iw_cm_conn_param *iw_param); +extern int c2_llp_reject(struct iw_cm_id *cm_id, const void *pdata, + u8 pdata_len); +extern int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog); +extern int c2_llp_service_destroy(struct iw_cm_id *cm_id); + +/* MM */ +extern int c2_nsmr_register_phys_kern(struct c2_dev *c2dev, u64 *addr_list, + int page_size, int pbl_depth, u32 length, + u32 off, u64 *va, enum c2_acf acf, + struct c2_mr *mr); +extern int c2_stag_dealloc(struct c2_dev *c2dev, u32 stag_index); + +/* AE */ +extern void c2_ae_event(struct c2_dev *c2dev, u32 mq_index); + +/* MQSP Allocator */ +extern int c2_init_mqsp_pool(struct c2_dev *c2dev, gfp_t gfp_mask, + struct sp_chunk **root); +extern void c2_free_mqsp_pool(struct c2_dev *c2dev, struct sp_chunk *root); +extern __be16 *c2_alloc_mqsp(struct c2_dev *c2dev, struct sp_chunk *head, + dma_addr_t *dma_addr, gfp_t gfp_mask); +extern void c2_free_mqsp(__be16* mqsp); +#endif diff --git a/drivers/staging/rdma/amso1100/c2_ae.c b/drivers/staging/rdma/amso1100/c2_ae.c new file mode 100644 index 000000000..cedda2523 --- /dev/null +++ b/drivers/staging/rdma/amso1100/c2_ae.c @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "c2.h" +#include +#include "c2_status.h" +#include "c2_ae.h" + +static int c2_convert_cm_status(u32 c2_status) +{ + switch (c2_status) { + case C2_CONN_STATUS_SUCCESS: + return 0; + case C2_CONN_STATUS_REJECTED: + return -ENETRESET; + case C2_CONN_STATUS_REFUSED: + return -ECONNREFUSED; + case C2_CONN_STATUS_TIMEDOUT: + return -ETIMEDOUT; + case C2_CONN_STATUS_NETUNREACH: + return -ENETUNREACH; + case C2_CONN_STATUS_HOSTUNREACH: + return -EHOSTUNREACH; + case C2_CONN_STATUS_INVALID_RNIC: + return -EINVAL; + case C2_CONN_STATUS_INVALID_QP: + return -EINVAL; + case C2_CONN_STATUS_INVALID_QP_STATE: + return -EINVAL; + case C2_CONN_STATUS_ADDR_NOT_AVAIL: + return -EADDRNOTAVAIL; + default: + printk(KERN_ERR PFX + "%s - Unable to convert CM status: %d\n", + __func__, c2_status); + return -EIO; + } +} + +static const char* to_event_str(int event) +{ + static const char* event_str[] = { + "CCAE_REMOTE_SHUTDOWN", + "CCAE_ACTIVE_CONNECT_RESULTS", + "CCAE_CONNECTION_REQUEST", + "CCAE_LLP_CLOSE_COMPLETE", + "CCAE_TERMINATE_MESSAGE_RECEIVED", + "CCAE_LLP_CONNECTION_RESET", + "CCAE_LLP_CONNECTION_LOST", + "CCAE_LLP_SEGMENT_SIZE_INVALID", + "CCAE_LLP_INVALID_CRC", + "CCAE_LLP_BAD_FPDU", + "CCAE_INVALID_DDP_VERSION", + "CCAE_INVALID_RDMA_VERSION", + "CCAE_UNEXPECTED_OPCODE", + "CCAE_INVALID_DDP_QUEUE_NUMBER", + "CCAE_RDMA_READ_NOT_ENABLED", + "CCAE_RDMA_WRITE_NOT_ENABLED", + "CCAE_RDMA_READ_TOO_SMALL", + "CCAE_NO_L_BIT", + "CCAE_TAGGED_INVALID_STAG", + "CCAE_TAGGED_BASE_BOUNDS_VIOLATION", + "CCAE_TAGGED_ACCESS_RIGHTS_VIOLATION", + "CCAE_TAGGED_INVALID_PD", + "CCAE_WRAP_ERROR", + "CCAE_BAD_CLOSE", + "CCAE_BAD_LLP_CLOSE", + "CCAE_INVALID_MSN_RANGE", + "CCAE_INVALID_MSN_GAP", + "CCAE_IRRQ_OVERFLOW", + "CCAE_IRRQ_MSN_GAP", + "CCAE_IRRQ_MSN_RANGE", + "CCAE_IRRQ_INVALID_STAG", + "CCAE_IRRQ_BASE_BOUNDS_VIOLATION", + "CCAE_IRRQ_ACCESS_RIGHTS_VIOLATION", + "CCAE_IRRQ_INVALID_PD", + "CCAE_IRRQ_WRAP_ERROR", + "CCAE_CQ_SQ_COMPLETION_OVERFLOW", + "CCAE_CQ_RQ_COMPLETION_ERROR", + "CCAE_QP_SRQ_WQE_ERROR", + "CCAE_QP_LOCAL_CATASTROPHIC_ERROR", + "CCAE_CQ_OVERFLOW", + "CCAE_CQ_OPERATION_ERROR", + "CCAE_SRQ_LIMIT_REACHED", + "CCAE_QP_RQ_LIMIT_REACHED", + "CCAE_SRQ_CATASTROPHIC_ERROR", + "CCAE_RNIC_CATASTROPHIC_ERROR" + }; + + if (event < CCAE_REMOTE_SHUTDOWN || + event > CCAE_RNIC_CATASTROPHIC_ERROR) + return ""; + + event -= CCAE_REMOTE_SHUTDOWN; + return event_str[event]; +} + +static const char *to_qp_state_str(int state) +{ + switch (state) { + case C2_QP_STATE_IDLE: + return "C2_QP_STATE_IDLE"; + case C2_QP_STATE_CONNECTING: + return "C2_QP_STATE_CONNECTING"; + case C2_QP_STATE_RTS: + return "C2_QP_STATE_RTS"; + case C2_QP_STATE_CLOSING: + return "C2_QP_STATE_CLOSING"; + case C2_QP_STATE_TERMINATE: + return "C2_QP_STATE_TERMINATE"; + case C2_QP_STATE_ERROR: + return "C2_QP_STATE_ERROR"; + default: + return ""; + } +} + +void c2_ae_event(struct c2_dev *c2dev, u32 mq_index) +{ + struct c2_mq *mq = c2dev->qptr_array[mq_index]; + union c2wr *wr; + void *resource_user_context; + struct iw_cm_event cm_event; + struct ib_event ib_event; + enum c2_resource_indicator resource_indicator; + enum c2_event_id event_id; + unsigned long flags; + int status; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_event.local_addr; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_event.remote_addr; + + /* + * retrieve the message + */ + wr = c2_mq_consume(mq); + if (!wr) + return; + + memset(&ib_event, 0, sizeof(ib_event)); + memset(&cm_event, 0, sizeof(cm_event)); + + event_id = c2_wr_get_id(wr); + resource_indicator = be32_to_cpu(wr->ae.ae_generic.resource_type); + resource_user_context = + (void *) (unsigned long) wr->ae.ae_generic.user_context; + + status = cm_event.status = c2_convert_cm_status(c2_wr_get_result(wr)); + + pr_debug("event received c2_dev=%p, event_id=%d, " + "resource_indicator=%d, user_context=%p, status = %d\n", + c2dev, event_id, resource_indicator, resource_user_context, + status); + + switch (resource_indicator) { + case C2_RES_IND_QP:{ + + struct c2_qp *qp = (struct c2_qp *)resource_user_context; + struct iw_cm_id *cm_id = qp->cm_id; + struct c2wr_ae_active_connect_results *res; + + if (!cm_id) { + pr_debug("event received, but cm_id is , qp=%p!\n", + qp); + goto ignore_it; + } + pr_debug("%s: event = %s, user_context=%llx, " + "resource_type=%x, " + "resource=%x, qp_state=%s\n", + __func__, + to_event_str(event_id), + (unsigned long long) wr->ae.ae_generic.user_context, + be32_to_cpu(wr->ae.ae_generic.resource_type), + be32_to_cpu(wr->ae.ae_generic.resource), + to_qp_state_str(be32_to_cpu(wr->ae.ae_generic.qp_state))); + + c2_set_qp_state(qp, be32_to_cpu(wr->ae.ae_generic.qp_state)); + + switch (event_id) { + case CCAE_ACTIVE_CONNECT_RESULTS: + res = &wr->ae.ae_active_connect_results; + cm_event.event = IW_CM_EVENT_CONNECT_REPLY; + laddr->sin_addr.s_addr = res->laddr; + raddr->sin_addr.s_addr = res->raddr; + laddr->sin_port = res->lport; + raddr->sin_port = res->rport; + if (status == 0) { + cm_event.private_data_len = + be32_to_cpu(res->private_data_length); + cm_event.private_data = res->private_data; + } else { + spin_lock_irqsave(&qp->lock, flags); + if (qp->cm_id) { + qp->cm_id->rem_ref(qp->cm_id); + qp->cm_id = NULL; + } + spin_unlock_irqrestore(&qp->lock, flags); + cm_event.private_data_len = 0; + cm_event.private_data = NULL; + } + if (cm_id->event_handler) + cm_id->event_handler(cm_id, &cm_event); + break; + case CCAE_TERMINATE_MESSAGE_RECEIVED: + case CCAE_CQ_SQ_COMPLETION_OVERFLOW: + ib_event.device = &c2dev->ibdev; + ib_event.element.qp = &qp->ibqp; + ib_event.event = IB_EVENT_QP_REQ_ERR; + + if (qp->ibqp.event_handler) + qp->ibqp.event_handler(&ib_event, + qp->ibqp. + qp_context); + break; + case CCAE_BAD_CLOSE: + case CCAE_LLP_CLOSE_COMPLETE: + case CCAE_LLP_CONNECTION_RESET: + case CCAE_LLP_CONNECTION_LOST: + BUG_ON(cm_id->event_handler==(void*)0x6b6b6b6b); + + spin_lock_irqsave(&qp->lock, flags); + if (qp->cm_id) { + qp->cm_id->rem_ref(qp->cm_id); + qp->cm_id = NULL; + } + spin_unlock_irqrestore(&qp->lock, flags); + cm_event.event = IW_CM_EVENT_CLOSE; + cm_event.status = 0; + if (cm_id->event_handler) + cm_id->event_handler(cm_id, &cm_event); + break; + default: + BUG_ON(1); + pr_debug("%s:%d Unexpected event_id=%d on QP=%p, " + "CM_ID=%p\n", + __func__, __LINE__, + event_id, qp, cm_id); + break; + } + break; + } + + case C2_RES_IND_EP:{ + + struct c2wr_ae_connection_request *req = + &wr->ae.ae_connection_request; + struct iw_cm_id *cm_id = + (struct iw_cm_id *)resource_user_context; + + pr_debug("C2_RES_IND_EP event_id=%d\n", event_id); + if (event_id != CCAE_CONNECTION_REQUEST) { + pr_debug("%s: Invalid event_id: %d\n", + __func__, event_id); + break; + } + cm_event.event = IW_CM_EVENT_CONNECT_REQUEST; + cm_event.provider_data = (void*)(unsigned long)req->cr_handle; + laddr->sin_addr.s_addr = req->laddr; + raddr->sin_addr.s_addr = req->raddr; + laddr->sin_port = req->lport; + raddr->sin_port = req->rport; + cm_event.private_data_len = + be32_to_cpu(req->private_data_length); + cm_event.private_data = req->private_data; + /* + * Until ird/ord negotiation via MPAv2 support is added, send + * max supported values + */ + cm_event.ird = cm_event.ord = 128; + + if (cm_id->event_handler) + cm_id->event_handler(cm_id, &cm_event); + break; + } + + case C2_RES_IND_CQ:{ + struct c2_cq *cq = + (struct c2_cq *) resource_user_context; + + pr_debug("IB_EVENT_CQ_ERR\n"); + ib_event.device = &c2dev->ibdev; + ib_event.element.cq = &cq->ibcq; + ib_event.event = IB_EVENT_CQ_ERR; + + if (cq->ibcq.event_handler) + cq->ibcq.event_handler(&ib_event, + cq->ibcq.cq_context); + break; + } + + default: + printk("Bad resource indicator = %d\n", + resource_indicator); + break; + } + + ignore_it: + c2_mq_free(mq); +} diff --git a/drivers/staging/rdma/amso1100/c2_ae.h b/drivers/staging/rdma/amso1100/c2_ae.h new file mode 100644 index 000000000..3a065c33b --- /dev/null +++ b/drivers/staging/rdma/amso1100/c2_ae.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _C2_AE_H_ +#define _C2_AE_H_ + +/* + * WARNING: If you change this file, also bump C2_IVN_BASE + * in common/include/clustercore/c2_ivn.h. + */ + +/* + * Asynchronous Event Identifiers + * + * These start at 0x80 only so it's obvious from inspection that + * they are not work-request statuses. This isn't critical. + * + * NOTE: these event id's must fit in eight bits. + */ +enum c2_event_id { + CCAE_REMOTE_SHUTDOWN = 0x80, + CCAE_ACTIVE_CONNECT_RESULTS, + CCAE_CONNECTION_REQUEST, + CCAE_LLP_CLOSE_COMPLETE, + CCAE_TERMINATE_MESSAGE_RECEIVED, + CCAE_LLP_CONNECTION_RESET, + CCAE_LLP_CONNECTION_LOST, + CCAE_LLP_SEGMENT_SIZE_INVALID, + CCAE_LLP_INVALID_CRC, + CCAE_LLP_BAD_FPDU, + CCAE_INVALID_DDP_VERSION, + CCAE_INVALID_RDMA_VERSION, + CCAE_UNEXPECTED_OPCODE, + CCAE_INVALID_DDP_QUEUE_NUMBER, + CCAE_RDMA_READ_NOT_ENABLED, + CCAE_RDMA_WRITE_NOT_ENABLED, + CCAE_RDMA_READ_TOO_SMALL, + CCAE_NO_L_BIT, + CCAE_TAGGED_INVALID_STAG, + CCAE_TAGGED_BASE_BOUNDS_VIOLATION, + CCAE_TAGGED_ACCESS_RIGHTS_VIOLATION, + CCAE_TAGGED_INVALID_PD, + CCAE_WRAP_ERROR, + CCAE_BAD_CLOSE, + CCAE_BAD_LLP_CLOSE, + CCAE_INVALID_MSN_RANGE, + CCAE_INVALID_MSN_GAP, + CCAE_IRRQ_OVERFLOW, + CCAE_IRRQ_MSN_GAP, + CCAE_IRRQ_MSN_RANGE, + CCAE_IRRQ_INVALID_STAG, + CCAE_IRRQ_BASE_BOUNDS_VIOLATION, + CCAE_IRRQ_ACCESS_RIGHTS_VIOLATION, + CCAE_IRRQ_INVALID_PD, + CCAE_IRRQ_WRAP_ERROR, + CCAE_CQ_SQ_COMPLETION_OVERFLOW, + CCAE_CQ_RQ_COMPLETION_ERROR, + CCAE_QP_SRQ_WQE_ERROR, + CCAE_QP_LOCAL_CATASTROPHIC_ERROR, + CCAE_CQ_OVERFLOW, + CCAE_CQ_OPERATION_ERROR, + CCAE_SRQ_LIMIT_REACHED, + CCAE_QP_RQ_LIMIT_REACHED, + CCAE_SRQ_CATASTROPHIC_ERROR, + CCAE_RNIC_CATASTROPHIC_ERROR +/* WARNING If you add more id's, make sure their values fit in eight bits. */ +}; + +/* + * Resource Indicators and Identifiers + */ +enum c2_resource_indicator { + C2_RES_IND_QP = 1, + C2_RES_IND_EP, + C2_RES_IND_CQ, + C2_RES_IND_SRQ, +}; + +#endif /* _C2_AE_H_ */ diff --git a/drivers/staging/rdma/amso1100/c2_alloc.c b/drivers/staging/rdma/amso1100/c2_alloc.c new file mode 100644 index 000000000..78d247ec6 --- /dev/null +++ b/drivers/staging/rdma/amso1100/c2_alloc.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "c2.h" + +static int c2_alloc_mqsp_chunk(struct c2_dev *c2dev, gfp_t gfp_mask, + struct sp_chunk **head) +{ + int i; + struct sp_chunk *new_head; + dma_addr_t dma_addr; + + new_head = dma_alloc_coherent(&c2dev->pcidev->dev, PAGE_SIZE, + &dma_addr, gfp_mask); + if (new_head == NULL) + return -ENOMEM; + + new_head->dma_addr = dma_addr; + dma_unmap_addr_set(new_head, mapping, new_head->dma_addr); + + new_head->next = NULL; + new_head->head = 0; + + /* build list where each index is the next free slot */ + for (i = 0; + i < (PAGE_SIZE - sizeof(struct sp_chunk) - + sizeof(u16)) / sizeof(u16) - 1; + i++) { + new_head->shared_ptr[i] = i + 1; + } + /* terminate list */ + new_head->shared_ptr[i] = 0xFFFF; + + *head = new_head; + return 0; +} + +int c2_init_mqsp_pool(struct c2_dev *c2dev, gfp_t gfp_mask, + struct sp_chunk **root) +{ + return c2_alloc_mqsp_chunk(c2dev, gfp_mask, root); +} + +void c2_free_mqsp_pool(struct c2_dev *c2dev, struct sp_chunk *root) +{ + struct sp_chunk *next; + + while (root) { + next = root->next; + dma_free_coherent(&c2dev->pcidev->dev, PAGE_SIZE, root, + dma_unmap_addr(root, mapping)); + root = next; + } +} + +__be16 *c2_alloc_mqsp(struct c2_dev *c2dev, struct sp_chunk *head, + dma_addr_t *dma_addr, gfp_t gfp_mask) +{ + u16 mqsp; + + while (head) { + mqsp = head->head; + if (mqsp != 0xFFFF) { + head->head = head->shared_ptr[mqsp]; + break; + } else if (head->next == NULL) { + if (c2_alloc_mqsp_chunk(c2dev, gfp_mask, &head->next) == + 0) { + head = head->next; + mqsp = head->head; + head->head = head->shared_ptr[mqsp]; + break; + } else + return NULL; + } else + head = head->next; + } + if (head) { + *dma_addr = head->dma_addr + + ((unsigned long) &(head->shared_ptr[mqsp]) - + (unsigned long) head); + pr_debug("%s addr %p dma_addr %llx\n", __func__, + &(head->shared_ptr[mqsp]), (unsigned long long) *dma_addr); + return (__force __be16 *) &(head->shared_ptr[mqsp]); + } + return NULL; +} + +void c2_free_mqsp(__be16 *mqsp) +{ + struct sp_chunk *head; + u16 idx; + + /* The chunk containing this ptr begins at the page boundary */ + head = (struct sp_chunk *) ((unsigned long) mqsp & PAGE_MASK); + + /* Link head to new mqsp */ + *mqsp = (__force __be16) head->head; + + /* Compute the shared_ptr index */ + idx = ((unsigned long) mqsp & ~PAGE_MASK) >> 1; + idx -= (unsigned long) &(((struct sp_chunk *) 0)->shared_ptr[0]) >> 1; + + /* Point this index at the head */ + head->shared_ptr[idx] = head->head; + + /* Point head at this index */ + head->head = idx; +} diff --git a/drivers/staging/rdma/amso1100/c2_cm.c b/drivers/staging/rdma/amso1100/c2_cm.c new file mode 100644 index 000000000..23bfa94fb --- /dev/null +++ b/drivers/staging/rdma/amso1100/c2_cm.c @@ -0,0 +1,461 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include + +#include "c2.h" +#include "c2_wr.h" +#include "c2_vq.h" +#include + +int c2_llp_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) +{ + struct c2_dev *c2dev = to_c2dev(cm_id->device); + struct ib_qp *ibqp; + struct c2_qp *qp; + struct c2wr_qp_connect_req *wr; /* variable size needs a malloc. */ + struct c2_vq_req *vq_req; + int err; + struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr; + + if (cm_id->remote_addr.ss_family != AF_INET) + return -ENOSYS; + + ibqp = c2_get_qp(cm_id->device, iw_param->qpn); + if (!ibqp) + return -EINVAL; + qp = to_c2qp(ibqp); + + /* Associate QP <--> CM_ID */ + cm_id->provider_data = qp; + cm_id->add_ref(cm_id); + qp->cm_id = cm_id; + + /* + * only support the max private_data length + */ + if (iw_param->private_data_len > C2_MAX_PRIVATE_DATA_SIZE) { + err = -EINVAL; + goto bail0; + } + /* + * Set the rdma read limits + */ + err = c2_qp_set_read_limits(c2dev, qp, iw_param->ord, iw_param->ird); + if (err) + goto bail0; + + /* + * Create and send a WR_QP_CONNECT... + */ + wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL); + if (!wr) { + err = -ENOMEM; + goto bail0; + } + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) { + err = -ENOMEM; + goto bail1; + } + + c2_wr_set_id(wr, CCWR_QP_CONNECT); + wr->hdr.context = 0; + wr->rnic_handle = c2dev->adapter_handle; + wr->qp_handle = qp->adapter_handle; + + wr->remote_addr = raddr->sin_addr.s_addr; + wr->remote_port = raddr->sin_port; + + /* + * Move any private data from the callers's buf into + * the WR. + */ + if (iw_param->private_data) { + wr->private_data_length = + cpu_to_be32(iw_param->private_data_len); + memcpy(&wr->private_data[0], iw_param->private_data, + iw_param->private_data_len); + } else + wr->private_data_length = 0; + + /* + * Send WR to adapter. NOTE: There is no synch reply from + * the adapter. + */ + err = vq_send_wr(c2dev, (union c2wr *) wr); + vq_req_free(c2dev, vq_req); + + bail1: + kfree(wr); + bail0: + if (err) { + /* + * If we fail, release reference on QP and + * disassociate QP from CM_ID + */ + cm_id->provider_data = NULL; + qp->cm_id = NULL; + cm_id->rem_ref(cm_id); + } + return err; +} + +int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog) +{ + struct c2_dev *c2dev; + struct c2wr_ep_listen_create_req wr; + struct c2wr_ep_listen_create_rep *reply; + struct c2_vq_req *vq_req; + int err; + struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr; + + if (cm_id->local_addr.ss_family != AF_INET) + return -ENOSYS; + + c2dev = to_c2dev(cm_id->device); + if (c2dev == NULL) + return -EINVAL; + + /* + * Allocate verbs request. + */ + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + /* + * Build the WR + */ + c2_wr_set_id(&wr, CCWR_EP_LISTEN_CREATE); + wr.hdr.context = (u64) (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.local_addr = laddr->sin_addr.s_addr; + wr.local_port = laddr->sin_port; + wr.backlog = cpu_to_be32(backlog); + wr.user_context = (u64) (unsigned long) cm_id; + + /* + * Reference the request struct. Dereferenced in the int handler. + */ + vq_req_get(c2dev, vq_req); + + /* + * Send WR to adapter + */ + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + /* + * Wait for reply from adapter + */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail0; + + /* + * Process reply + */ + reply = + (struct c2wr_ep_listen_create_rep *) (unsigned long) vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail1; + } + + if ((err = c2_errno(reply)) != 0) + goto bail1; + + /* + * Keep the adapter handle. Used in subsequent destroy + */ + cm_id->provider_data = (void*)(unsigned long) reply->ep_handle; + + /* + * free vq stuff + */ + vq_repbuf_free(c2dev, reply); + vq_req_free(c2dev, vq_req); + + return 0; + + bail1: + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + + +int c2_llp_service_destroy(struct iw_cm_id *cm_id) +{ + + struct c2_dev *c2dev; + struct c2wr_ep_listen_destroy_req wr; + struct c2wr_ep_listen_destroy_rep *reply; + struct c2_vq_req *vq_req; + int err; + + c2dev = to_c2dev(cm_id->device); + if (c2dev == NULL) + return -EINVAL; + + /* + * Allocate verbs request. + */ + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + /* + * Build the WR + */ + c2_wr_set_id(&wr, CCWR_EP_LISTEN_DESTROY); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.ep_handle = (u32)(unsigned long)cm_id->provider_data; + + /* + * reference the request struct. dereferenced in the int handler. + */ + vq_req_get(c2dev, vq_req); + + /* + * Send WR to adapter + */ + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + /* + * Wait for reply from adapter + */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail0; + + /* + * Process reply + */ + reply=(struct c2wr_ep_listen_destroy_rep *)(unsigned long)vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail0; + } + if ((err = c2_errno(reply)) != 0) + goto bail1; + + bail1: + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +int c2_llp_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) +{ + struct c2_dev *c2dev = to_c2dev(cm_id->device); + struct c2_qp *qp; + struct ib_qp *ibqp; + struct c2wr_cr_accept_req *wr; /* variable length WR */ + struct c2_vq_req *vq_req; + struct c2wr_cr_accept_rep *reply; /* VQ Reply msg ptr. */ + int err; + + ibqp = c2_get_qp(cm_id->device, iw_param->qpn); + if (!ibqp) + return -EINVAL; + qp = to_c2qp(ibqp); + + /* Set the RDMA read limits */ + err = c2_qp_set_read_limits(c2dev, qp, iw_param->ord, iw_param->ird); + if (err) + goto bail0; + + /* Allocate verbs request. */ + vq_req = vq_req_alloc(c2dev); + if (!vq_req) { + err = -ENOMEM; + goto bail0; + } + vq_req->qp = qp; + vq_req->cm_id = cm_id; + vq_req->event = IW_CM_EVENT_ESTABLISHED; + + wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL); + if (!wr) { + err = -ENOMEM; + goto bail1; + } + + /* Build the WR */ + c2_wr_set_id(wr, CCWR_CR_ACCEPT); + wr->hdr.context = (unsigned long) vq_req; + wr->rnic_handle = c2dev->adapter_handle; + wr->ep_handle = (u32) (unsigned long) cm_id->provider_data; + wr->qp_handle = qp->adapter_handle; + + /* Replace the cr_handle with the QP after accept */ + cm_id->provider_data = qp; + cm_id->add_ref(cm_id); + qp->cm_id = cm_id; + + cm_id->provider_data = qp; + + /* Validate private_data length */ + if (iw_param->private_data_len > C2_MAX_PRIVATE_DATA_SIZE) { + err = -EINVAL; + goto bail1; + } + + if (iw_param->private_data) { + wr->private_data_length = cpu_to_be32(iw_param->private_data_len); + memcpy(&wr->private_data[0], + iw_param->private_data, iw_param->private_data_len); + } else + wr->private_data_length = 0; + + /* Reference the request struct. Dereferenced in the int handler. */ + vq_req_get(c2dev, vq_req); + + /* Send WR to adapter */ + err = vq_send_wr(c2dev, (union c2wr *) wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail1; + } + + /* Wait for reply from adapter */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail1; + + /* Check that reply is present */ + reply = (struct c2wr_cr_accept_rep *) (unsigned long) vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail1; + } + + err = c2_errno(reply); + vq_repbuf_free(c2dev, reply); + + if (!err) + c2_set_qp_state(qp, C2_QP_STATE_RTS); + bail1: + kfree(wr); + vq_req_free(c2dev, vq_req); + bail0: + if (err) { + /* + * If we fail, release reference on QP and + * disassociate QP from CM_ID + */ + cm_id->provider_data = NULL; + qp->cm_id = NULL; + cm_id->rem_ref(cm_id); + } + return err; +} + +int c2_llp_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) +{ + struct c2_dev *c2dev; + struct c2wr_cr_reject_req wr; + struct c2_vq_req *vq_req; + struct c2wr_cr_reject_rep *reply; + int err; + + c2dev = to_c2dev(cm_id->device); + + /* + * Allocate verbs request. + */ + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + /* + * Build the WR + */ + c2_wr_set_id(&wr, CCWR_CR_REJECT); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.ep_handle = (u32) (unsigned long) cm_id->provider_data; + + /* + * reference the request struct. dereferenced in the int handler. + */ + vq_req_get(c2dev, vq_req); + + /* + * Send WR to adapter + */ + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + /* + * Wait for reply from adapter + */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail0; + + /* + * Process reply + */ + reply = (struct c2wr_cr_reject_rep *) (unsigned long) + vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail0; + } + err = c2_errno(reply); + /* + * free vq stuff + */ + vq_repbuf_free(c2dev, reply); + + bail0: + vq_req_free(c2dev, vq_req); + return err; +} diff --git a/drivers/staging/rdma/amso1100/c2_cq.c b/drivers/staging/rdma/amso1100/c2_cq.c new file mode 100644 index 000000000..1b63185b4 --- /dev/null +++ b/drivers/staging/rdma/amso1100/c2_cq.c @@ -0,0 +1,440 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2005 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include + +#include "c2.h" +#include "c2_vq.h" +#include "c2_status.h" + +#define C2_CQ_MSG_SIZE ((sizeof(struct c2wr_ce) + 32-1) & ~(32-1)) + +static struct c2_cq *c2_cq_get(struct c2_dev *c2dev, int cqn) +{ + struct c2_cq *cq; + unsigned long flags; + + spin_lock_irqsave(&c2dev->lock, flags); + cq = c2dev->qptr_array[cqn]; + if (!cq) { + spin_unlock_irqrestore(&c2dev->lock, flags); + return NULL; + } + atomic_inc(&cq->refcount); + spin_unlock_irqrestore(&c2dev->lock, flags); + return cq; +} + +static void c2_cq_put(struct c2_cq *cq) +{ + if (atomic_dec_and_test(&cq->refcount)) + wake_up(&cq->wait); +} + +void c2_cq_event(struct c2_dev *c2dev, u32 mq_index) +{ + struct c2_cq *cq; + + cq = c2_cq_get(c2dev, mq_index); + if (!cq) { + printk("discarding events on destroyed CQN=%d\n", mq_index); + return; + } + + (*cq->ibcq.comp_handler) (&cq->ibcq, cq->ibcq.cq_context); + c2_cq_put(cq); +} + +void c2_cq_clean(struct c2_dev *c2dev, struct c2_qp *qp, u32 mq_index) +{ + struct c2_cq *cq; + struct c2_mq *q; + + cq = c2_cq_get(c2dev, mq_index); + if (!cq) + return; + + spin_lock_irq(&cq->lock); + q = &cq->mq; + if (q && !c2_mq_empty(q)) { + u16 priv = q->priv; + struct c2wr_ce *msg; + + while (priv != be16_to_cpu(*q->shared)) { + msg = (struct c2wr_ce *) + (q->msg_pool.host + priv * q->msg_size); + if (msg->qp_user_context == (u64) (unsigned long) qp) { + msg->qp_user_context = (u64) 0; + } + priv = (priv + 1) % q->q_size; + } + } + spin_unlock_irq(&cq->lock); + c2_cq_put(cq); +} + +static inline enum ib_wc_status c2_cqe_status_to_openib(u8 status) +{ + switch (status) { + case C2_OK: + return IB_WC_SUCCESS; + case CCERR_FLUSHED: + return IB_WC_WR_FLUSH_ERR; + case CCERR_BASE_AND_BOUNDS_VIOLATION: + return IB_WC_LOC_PROT_ERR; + case CCERR_ACCESS_VIOLATION: + return IB_WC_LOC_ACCESS_ERR; + case CCERR_TOTAL_LENGTH_TOO_BIG: + return IB_WC_LOC_LEN_ERR; + case CCERR_INVALID_WINDOW: + return IB_WC_MW_BIND_ERR; + default: + return IB_WC_GENERAL_ERR; + } +} + + +static inline int c2_poll_one(struct c2_dev *c2dev, + struct c2_cq *cq, struct ib_wc *entry) +{ + struct c2wr_ce *ce; + struct c2_qp *qp; + int is_recv = 0; + + ce = c2_mq_consume(&cq->mq); + if (!ce) { + return -EAGAIN; + } + + /* + * if the qp returned is null then this qp has already + * been freed and we are unable process the completion. + * try pulling the next message + */ + while ((qp = + (struct c2_qp *) (unsigned long) ce->qp_user_context) == NULL) { + c2_mq_free(&cq->mq); + ce = c2_mq_consume(&cq->mq); + if (!ce) + return -EAGAIN; + } + + entry->status = c2_cqe_status_to_openib(c2_wr_get_result(ce)); + entry->wr_id = ce->hdr.context; + entry->qp = &qp->ibqp; + entry->wc_flags = 0; + entry->slid = 0; + entry->sl = 0; + entry->src_qp = 0; + entry->dlid_path_bits = 0; + entry->pkey_index = 0; + + switch (c2_wr_get_id(ce)) { + case C2_WR_TYPE_SEND: + entry->opcode = IB_WC_SEND; + break; + case C2_WR_TYPE_RDMA_WRITE: + entry->opcode = IB_WC_RDMA_WRITE; + break; + case C2_WR_TYPE_RDMA_READ: + entry->opcode = IB_WC_RDMA_READ; + break; + case C2_WR_TYPE_BIND_MW: + entry->opcode = IB_WC_BIND_MW; + break; + case C2_WR_TYPE_RECV: + entry->byte_len = be32_to_cpu(ce->bytes_rcvd); + entry->opcode = IB_WC_RECV; + is_recv = 1; + break; + default: + break; + } + + /* consume the WQEs */ + if (is_recv) + c2_mq_lconsume(&qp->rq_mq, 1); + else + c2_mq_lconsume(&qp->sq_mq, + be32_to_cpu(c2_wr_get_wqe_count(ce)) + 1); + + /* free the message */ + c2_mq_free(&cq->mq); + + return 0; +} + +int c2_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) +{ + struct c2_dev *c2dev = to_c2dev(ibcq->device); + struct c2_cq *cq = to_c2cq(ibcq); + unsigned long flags; + int npolled, err; + + spin_lock_irqsave(&cq->lock, flags); + + for (npolled = 0; npolled < num_entries; ++npolled) { + + err = c2_poll_one(c2dev, cq, entry + npolled); + if (err) + break; + } + + spin_unlock_irqrestore(&cq->lock, flags); + + return npolled; +} + +int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) +{ + struct c2_mq_shared __iomem *shared; + struct c2_cq *cq; + unsigned long flags; + int ret = 0; + + cq = to_c2cq(ibcq); + shared = cq->mq.peer; + + if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_NEXT_COMP) + writeb(C2_CQ_NOTIFICATION_TYPE_NEXT, &shared->notification_type); + else if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) + writeb(C2_CQ_NOTIFICATION_TYPE_NEXT_SE, &shared->notification_type); + else + return -EINVAL; + + writeb(CQ_WAIT_FOR_DMA | CQ_ARMED, &shared->armed); + + /* + * Now read back shared->armed to make the PCI + * write synchronous. This is necessary for + * correct cq notification semantics. + */ + readb(&shared->armed); + + if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) { + spin_lock_irqsave(&cq->lock, flags); + ret = !c2_mq_empty(&cq->mq); + spin_unlock_irqrestore(&cq->lock, flags); + } + + return ret; +} + +static void c2_free_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq) +{ + dma_free_coherent(&c2dev->pcidev->dev, mq->q_size * mq->msg_size, + mq->msg_pool.host, dma_unmap_addr(mq, mapping)); +} + +static int c2_alloc_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq, + size_t q_size, size_t msg_size) +{ + u8 *pool_start; + + if (q_size > SIZE_MAX / msg_size) + return -EINVAL; + + pool_start = dma_alloc_coherent(&c2dev->pcidev->dev, q_size * msg_size, + &mq->host_dma, GFP_KERNEL); + if (!pool_start) + return -ENOMEM; + + c2_mq_rep_init(mq, + 0, /* index (currently unknown) */ + q_size, + msg_size, + pool_start, + NULL, /* peer (currently unknown) */ + C2_MQ_HOST_TARGET); + + dma_unmap_addr_set(mq, mapping, mq->host_dma); + + return 0; +} + +int c2_init_cq(struct c2_dev *c2dev, int entries, + struct c2_ucontext *ctx, struct c2_cq *cq) +{ + struct c2wr_cq_create_req wr; + struct c2wr_cq_create_rep *reply; + unsigned long peer_pa; + struct c2_vq_req *vq_req; + int err; + + might_sleep(); + + cq->ibcq.cqe = entries - 1; + cq->is_kernel = !ctx; + + /* Allocate a shared pointer */ + cq->mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool, + &cq->mq.shared_dma, GFP_KERNEL); + if (!cq->mq.shared) + return -ENOMEM; + + /* Allocate pages for the message pool */ + err = c2_alloc_cq_buf(c2dev, &cq->mq, entries + 1, C2_CQ_MSG_SIZE); + if (err) + goto bail0; + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) { + err = -ENOMEM; + goto bail1; + } + + memset(&wr, 0, sizeof(wr)); + c2_wr_set_id(&wr, CCWR_CQ_CREATE); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.msg_size = cpu_to_be32(cq->mq.msg_size); + wr.depth = cpu_to_be32(cq->mq.q_size); + wr.shared_ht = cpu_to_be64(cq->mq.shared_dma); + wr.msg_pool = cpu_to_be64(cq->mq.host_dma); + wr.user_context = (u64) (unsigned long) (cq); + + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail2; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail2; + + reply = (struct c2wr_cq_create_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail2; + } + + if ((err = c2_errno(reply)) != 0) + goto bail3; + + cq->adapter_handle = reply->cq_handle; + cq->mq.index = be32_to_cpu(reply->mq_index); + + peer_pa = c2dev->pa + be32_to_cpu(reply->adapter_shared); + cq->mq.peer = ioremap_nocache(peer_pa, PAGE_SIZE); + if (!cq->mq.peer) { + err = -ENOMEM; + goto bail3; + } + + vq_repbuf_free(c2dev, reply); + vq_req_free(c2dev, vq_req); + + spin_lock_init(&cq->lock); + atomic_set(&cq->refcount, 1); + init_waitqueue_head(&cq->wait); + + /* + * Use the MQ index allocated by the adapter to + * store the CQ in the qptr_array + */ + cq->cqn = cq->mq.index; + c2dev->qptr_array[cq->cqn] = cq; + + return 0; + + bail3: + vq_repbuf_free(c2dev, reply); + bail2: + vq_req_free(c2dev, vq_req); + bail1: + c2_free_cq_buf(c2dev, &cq->mq); + bail0: + c2_free_mqsp(cq->mq.shared); + + return err; +} + +void c2_free_cq(struct c2_dev *c2dev, struct c2_cq *cq) +{ + int err; + struct c2_vq_req *vq_req; + struct c2wr_cq_destroy_req wr; + struct c2wr_cq_destroy_rep *reply; + + might_sleep(); + + /* Clear CQ from the qptr array */ + spin_lock_irq(&c2dev->lock); + c2dev->qptr_array[cq->mq.index] = NULL; + atomic_dec(&cq->refcount); + spin_unlock_irq(&c2dev->lock); + + wait_event(cq->wait, !atomic_read(&cq->refcount)); + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) { + goto bail0; + } + + memset(&wr, 0, sizeof(wr)); + c2_wr_set_id(&wr, CCWR_CQ_DESTROY); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.cq_handle = cq->adapter_handle; + + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail1; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail1; + + reply = (struct c2wr_cq_destroy_rep *) (unsigned long) (vq_req->reply_msg); + if (reply) + vq_repbuf_free(c2dev, reply); + bail1: + vq_req_free(c2dev, vq_req); + bail0: + if (cq->is_kernel) { + c2_free_cq_buf(c2dev, &cq->mq); + } + + return; +} diff --git a/drivers/staging/rdma/amso1100/c2_intr.c b/drivers/staging/rdma/amso1100/c2_intr.c new file mode 100644 index 000000000..3a17d9b36 --- /dev/null +++ b/drivers/staging/rdma/amso1100/c2_intr.c @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "c2.h" +#include +#include "c2_vq.h" + +static void handle_mq(struct c2_dev *c2dev, u32 index); +static void handle_vq(struct c2_dev *c2dev, u32 mq_index); + +/* + * Handle RNIC interrupts + */ +void c2_rnic_interrupt(struct c2_dev *c2dev) +{ + unsigned int mq_index; + + while (c2dev->hints_read != be16_to_cpu(*c2dev->hint_count)) { + mq_index = readl(c2dev->regs + PCI_BAR0_HOST_HINT); + if (mq_index & 0x80000000) { + break; + } + + c2dev->hints_read++; + handle_mq(c2dev, mq_index); + } + +} + +/* + * Top level MQ handler + */ +static void handle_mq(struct c2_dev *c2dev, u32 mq_index) +{ + if (c2dev->qptr_array[mq_index] == NULL) { + pr_debug("handle_mq: stray activity for mq_index=%d\n", + mq_index); + return; + } + + switch (mq_index) { + case (0): + /* + * An index of 0 in the activity queue + * indicates the req vq now has messages + * available... + * + * Wake up any waiters waiting on req VQ + * message availability. + */ + wake_up(&c2dev->req_vq_wo); + break; + case (1): + handle_vq(c2dev, mq_index); + break; + case (2): + /* We have to purge the VQ in case there are pending + * accept reply requests that would result in the + * generation of an ESTABLISHED event. If we don't + * generate these first, a CLOSE event could end up + * being delivered before the ESTABLISHED event. + */ + handle_vq(c2dev, 1); + + c2_ae_event(c2dev, mq_index); + break; + default: + /* There is no event synchronization between CQ events + * and AE or CM events. In fact, CQE could be + * delivered for all of the I/O up to and including the + * FLUSH for a peer disconenct prior to the ESTABLISHED + * event being delivered to the app. The reason for this + * is that CM events are delivered on a thread, while AE + * and CM events are delivered on interrupt context. + */ + c2_cq_event(c2dev, mq_index); + break; + } + + return; +} + +/* + * Handles verbs WR replies. + */ +static void handle_vq(struct c2_dev *c2dev, u32 mq_index) +{ + void *adapter_msg, *reply_msg; + struct c2wr_hdr *host_msg; + struct c2wr_hdr tmp; + struct c2_mq *reply_vq; + struct c2_vq_req *req; + struct iw_cm_event cm_event; + int err; + + reply_vq = (struct c2_mq *) c2dev->qptr_array[mq_index]; + + /* + * get next msg from mq_index into adapter_msg. + * don't free it yet. + */ + adapter_msg = c2_mq_consume(reply_vq); + if (adapter_msg == NULL) { + return; + } + + host_msg = vq_repbuf_alloc(c2dev); + + /* + * If we can't get a host buffer, then we'll still + * wakeup the waiter, we just won't give him the msg. + * It is assumed the waiter will deal with this... + */ + if (!host_msg) { + pr_debug("handle_vq: no repbufs!\n"); + + /* + * just copy the WR header into a local variable. + * this allows us to still demux on the context + */ + host_msg = &tmp; + memcpy(host_msg, adapter_msg, sizeof(tmp)); + reply_msg = NULL; + } else { + memcpy(host_msg, adapter_msg, reply_vq->msg_size); + reply_msg = host_msg; + } + + /* + * consume the msg from the MQ + */ + c2_mq_free(reply_vq); + + /* + * wakeup the waiter. + */ + req = (struct c2_vq_req *) (unsigned long) host_msg->context; + if (req == NULL) { + /* + * We should never get here, as the adapter should + * never send us a reply that we're not expecting. + */ + if (reply_msg != NULL) + vq_repbuf_free(c2dev, host_msg); + pr_debug("handle_vq: UNEXPECTEDLY got NULL req\n"); + return; + } + + if (reply_msg) + err = c2_errno(reply_msg); + else + err = -ENOMEM; + + if (!err) switch (req->event) { + case IW_CM_EVENT_ESTABLISHED: + c2_set_qp_state(req->qp, + C2_QP_STATE_RTS); + /* + * Until ird/ord negotiation via MPAv2 support is added, send + * max supported values + */ + cm_event.ird = cm_event.ord = 128; + case IW_CM_EVENT_CLOSE: + + /* + * Move the QP to RTS if this is + * the established event + */ + cm_event.event = req->event; + cm_event.status = 0; + cm_event.local_addr = req->cm_id->local_addr; + cm_event.remote_addr = req->cm_id->remote_addr; + cm_event.private_data = NULL; + cm_event.private_data_len = 0; + req->cm_id->event_handler(req->cm_id, &cm_event); + break; + default: + break; + } + + req->reply_msg = (u64) (unsigned long) (reply_msg); + atomic_set(&req->reply_ready, 1); + wake_up(&req->wait_object); + + /* + * If the request was cancelled, then this put will + * free the vq_req memory...and reply_msg!!! + */ + vq_req_put(c2dev, req); +} diff --git a/drivers/staging/rdma/amso1100/c2_mm.c b/drivers/staging/rdma/amso1100/c2_mm.c new file mode 100644 index 000000000..119c4f3d9 --- /dev/null +++ b/drivers/staging/rdma/amso1100/c2_mm.c @@ -0,0 +1,377 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include + +#include "c2.h" +#include "c2_vq.h" + +#define PBL_VIRT 1 +#define PBL_PHYS 2 + +/* + * Send all the PBL messages to convey the remainder of the PBL + * Wait for the adapter's reply on the last one. + * This is indicated by setting the MEM_PBL_COMPLETE in the flags. + * + * NOTE: vq_req is _not_ freed by this function. The VQ Host + * Reply buffer _is_ freed by this function. + */ +static int +send_pbl_messages(struct c2_dev *c2dev, __be32 stag_index, + unsigned long va, u32 pbl_depth, + struct c2_vq_req *vq_req, int pbl_type) +{ + u32 pbe_count; /* amt that fits in a PBL msg */ + u32 count; /* amt in this PBL MSG. */ + struct c2wr_nsmr_pbl_req *wr; /* PBL WR ptr */ + struct c2wr_nsmr_pbl_rep *reply; /* reply ptr */ + int err, pbl_virt, pbl_index, i; + + switch (pbl_type) { + case PBL_VIRT: + pbl_virt = 1; + break; + case PBL_PHYS: + pbl_virt = 0; + break; + default: + return -EINVAL; + break; + } + + pbe_count = (c2dev->req_vq.msg_size - + sizeof(struct c2wr_nsmr_pbl_req)) / sizeof(u64); + wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL); + if (!wr) { + return -ENOMEM; + } + c2_wr_set_id(wr, CCWR_NSMR_PBL); + + /* + * Only the last PBL message will generate a reply from the verbs, + * so we set the context to 0 indicating there is no kernel verbs + * handler blocked awaiting this reply. + */ + wr->hdr.context = 0; + wr->rnic_handle = c2dev->adapter_handle; + wr->stag_index = stag_index; /* already swapped */ + wr->flags = 0; + pbl_index = 0; + while (pbl_depth) { + count = min(pbe_count, pbl_depth); + wr->addrs_length = cpu_to_be32(count); + + /* + * If this is the last message, then reference the + * vq request struct cuz we're gonna wait for a reply. + * also make this PBL msg as the last one. + */ + if (count == pbl_depth) { + /* + * reference the request struct. dereferenced in the + * int handler. + */ + vq_req_get(c2dev, vq_req); + wr->flags = cpu_to_be32(MEM_PBL_COMPLETE); + + /* + * This is the last PBL message. + * Set the context to our VQ Request Object so we can + * wait for the reply. + */ + wr->hdr.context = (unsigned long) vq_req; + } + + /* + * If pbl_virt is set then va is a virtual address + * that describes a virtually contiguous memory + * allocation. The wr needs the start of each virtual page + * to be converted to the corresponding physical address + * of the page. If pbl_virt is not set then va is an array + * of physical addresses and there is no conversion to do. + * Just fill in the wr with what is in the array. + */ + for (i = 0; i < count; i++) { + if (pbl_virt) { + va += PAGE_SIZE; + } else { + wr->paddrs[i] = + cpu_to_be64(((u64 *)va)[pbl_index + i]); + } + } + + /* + * Send WR to adapter + */ + err = vq_send_wr(c2dev, (union c2wr *) wr); + if (err) { + if (count <= pbe_count) { + vq_req_put(c2dev, vq_req); + } + goto bail0; + } + pbl_depth -= count; + pbl_index += count; + } + + /* + * Now wait for the reply... + */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) { + goto bail0; + } + + /* + * Process reply + */ + reply = (struct c2wr_nsmr_pbl_rep *) (unsigned long) vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail0; + } + + err = c2_errno(reply); + + vq_repbuf_free(c2dev, reply); + bail0: + kfree(wr); + return err; +} + +#define C2_PBL_MAX_DEPTH 131072 +int +c2_nsmr_register_phys_kern(struct c2_dev *c2dev, u64 *addr_list, + int page_size, int pbl_depth, u32 length, + u32 offset, u64 *va, enum c2_acf acf, + struct c2_mr *mr) +{ + struct c2_vq_req *vq_req; + struct c2wr_nsmr_register_req *wr; + struct c2wr_nsmr_register_rep *reply; + u16 flags; + int i, pbe_count, count; + int err; + + if (!va || !length || !addr_list || !pbl_depth) + return -EINTR; + + /* + * Verify PBL depth is within rnic max + */ + if (pbl_depth > C2_PBL_MAX_DEPTH) { + return -EINTR; + } + + /* + * allocate verbs request object + */ + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL); + if (!wr) { + err = -ENOMEM; + goto bail0; + } + + /* + * build the WR + */ + c2_wr_set_id(wr, CCWR_NSMR_REGISTER); + wr->hdr.context = (unsigned long) vq_req; + wr->rnic_handle = c2dev->adapter_handle; + + flags = (acf | MEM_VA_BASED | MEM_REMOTE); + + /* + * compute how many pbes can fit in the message + */ + pbe_count = (c2dev->req_vq.msg_size - + sizeof(struct c2wr_nsmr_register_req)) / sizeof(u64); + + if (pbl_depth <= pbe_count) { + flags |= MEM_PBL_COMPLETE; + } + wr->flags = cpu_to_be16(flags); + wr->stag_key = 0; //stag_key; + wr->va = cpu_to_be64(*va); + wr->pd_id = mr->pd->pd_id; + wr->pbe_size = cpu_to_be32(page_size); + wr->length = cpu_to_be32(length); + wr->pbl_depth = cpu_to_be32(pbl_depth); + wr->fbo = cpu_to_be32(offset); + count = min(pbl_depth, pbe_count); + wr->addrs_length = cpu_to_be32(count); + + /* + * fill out the PBL for this message + */ + for (i = 0; i < count; i++) { + wr->paddrs[i] = cpu_to_be64(addr_list[i]); + } + + /* + * regerence the request struct + */ + vq_req_get(c2dev, vq_req); + + /* + * send the WR to the adapter + */ + err = vq_send_wr(c2dev, (union c2wr *) wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail1; + } + + /* + * wait for reply from adapter + */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) { + goto bail1; + } + + /* + * process reply + */ + reply = + (struct c2wr_nsmr_register_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail1; + } + if ((err = c2_errno(reply))) { + goto bail2; + } + //*p_pb_entries = be32_to_cpu(reply->pbl_depth); + mr->ibmr.lkey = mr->ibmr.rkey = be32_to_cpu(reply->stag_index); + vq_repbuf_free(c2dev, reply); + + /* + * if there are still more PBEs we need to send them to + * the adapter and wait for a reply on the final one. + * reuse vq_req for this purpose. + */ + pbl_depth -= count; + if (pbl_depth) { + + vq_req->reply_msg = (unsigned long) NULL; + atomic_set(&vq_req->reply_ready, 0); + err = send_pbl_messages(c2dev, + cpu_to_be32(mr->ibmr.lkey), + (unsigned long) &addr_list[i], + pbl_depth, vq_req, PBL_PHYS); + if (err) { + goto bail1; + } + } + + vq_req_free(c2dev, vq_req); + kfree(wr); + + return err; + + bail2: + vq_repbuf_free(c2dev, reply); + bail1: + kfree(wr); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +int c2_stag_dealloc(struct c2_dev *c2dev, u32 stag_index) +{ + struct c2_vq_req *vq_req; /* verbs request object */ + struct c2wr_stag_dealloc_req wr; /* work request */ + struct c2wr_stag_dealloc_rep *reply; /* WR reply */ + int err; + + + /* + * allocate verbs request object + */ + vq_req = vq_req_alloc(c2dev); + if (!vq_req) { + return -ENOMEM; + } + + /* + * Build the WR + */ + c2_wr_set_id(&wr, CCWR_STAG_DEALLOC); + wr.hdr.context = (u64) (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.stag_index = cpu_to_be32(stag_index); + + /* + * reference the request struct. dereferenced in the int handler. + */ + vq_req_get(c2dev, vq_req); + + /* + * Send WR to adapter + */ + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + /* + * Wait for reply from adapter + */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) { + goto bail0; + } + + /* + * Process reply + */ + reply = (struct c2wr_stag_dealloc_rep *) (unsigned long) vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail0; + } + + err = c2_errno(reply); + + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} diff --git a/drivers/staging/rdma/amso1100/c2_mq.c b/drivers/staging/rdma/amso1100/c2_mq.c new file mode 100644 index 000000000..0cddc49be --- /dev/null +++ b/drivers/staging/rdma/amso1100/c2_mq.c @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "c2.h" +#include "c2_mq.h" + +void *c2_mq_alloc(struct c2_mq *q) +{ + BUG_ON(q->magic != C2_MQ_MAGIC); + BUG_ON(q->type != C2_MQ_ADAPTER_TARGET); + + if (c2_mq_full(q)) { + return NULL; + } else { +#ifdef DEBUG + struct c2wr_hdr *m = + (struct c2wr_hdr *) (q->msg_pool.host + q->priv * q->msg_size); +#ifdef CCMSGMAGIC + BUG_ON(m->magic != be32_to_cpu(~CCWR_MAGIC)); + m->magic = cpu_to_be32(CCWR_MAGIC); +#endif + return m; +#else + return q->msg_pool.host + q->priv * q->msg_size; +#endif + } +} + +void c2_mq_produce(struct c2_mq *q) +{ + BUG_ON(q->magic != C2_MQ_MAGIC); + BUG_ON(q->type != C2_MQ_ADAPTER_TARGET); + + if (!c2_mq_full(q)) { + q->priv = (q->priv + 1) % q->q_size; + q->hint_count++; + /* Update peer's offset. */ + __raw_writew((__force u16) cpu_to_be16(q->priv), &q->peer->shared); + } +} + +void *c2_mq_consume(struct c2_mq *q) +{ + BUG_ON(q->magic != C2_MQ_MAGIC); + BUG_ON(q->type != C2_MQ_HOST_TARGET); + + if (c2_mq_empty(q)) { + return NULL; + } else { +#ifdef DEBUG + struct c2wr_hdr *m = (struct c2wr_hdr *) + (q->msg_pool.host + q->priv * q->msg_size); +#ifdef CCMSGMAGIC + BUG_ON(m->magic != be32_to_cpu(CCWR_MAGIC)); +#endif + return m; +#else + return q->msg_pool.host + q->priv * q->msg_size; +#endif + } +} + +void c2_mq_free(struct c2_mq *q) +{ + BUG_ON(q->magic != C2_MQ_MAGIC); + BUG_ON(q->type != C2_MQ_HOST_TARGET); + + if (!c2_mq_empty(q)) { + +#ifdef CCMSGMAGIC + { + struct c2wr_hdr __iomem *m = (struct c2wr_hdr __iomem *) + (q->msg_pool.adapter + q->priv * q->msg_size); + __raw_writel(cpu_to_be32(~CCWR_MAGIC), &m->magic); + } +#endif + q->priv = (q->priv + 1) % q->q_size; + /* Update peer's offset. */ + __raw_writew((__force u16) cpu_to_be16(q->priv), &q->peer->shared); + } +} + + +void c2_mq_lconsume(struct c2_mq *q, u32 wqe_count) +{ + BUG_ON(q->magic != C2_MQ_MAGIC); + BUG_ON(q->type != C2_MQ_ADAPTER_TARGET); + + while (wqe_count--) { + BUG_ON(c2_mq_empty(q)); + *q->shared = cpu_to_be16((be16_to_cpu(*q->shared)+1) % q->q_size); + } +} + +#if 0 +u32 c2_mq_count(struct c2_mq *q) +{ + s32 count; + + if (q->type == C2_MQ_HOST_TARGET) + count = be16_to_cpu(*q->shared) - q->priv; + else + count = q->priv - be16_to_cpu(*q->shared); + + if (count < 0) + count += q->q_size; + + return (u32) count; +} +#endif /* 0 */ + +void c2_mq_req_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size, + u8 __iomem *pool_start, u16 __iomem *peer, u32 type) +{ + BUG_ON(!q->shared); + + /* This code assumes the byte swapping has already been done! */ + q->index = index; + q->q_size = q_size; + q->msg_size = msg_size; + q->msg_pool.adapter = pool_start; + q->peer = (struct c2_mq_shared __iomem *) peer; + q->magic = C2_MQ_MAGIC; + q->type = type; + q->priv = 0; + q->hint_count = 0; + return; +} +void c2_mq_rep_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size, + u8 *pool_start, u16 __iomem *peer, u32 type) +{ + BUG_ON(!q->shared); + + /* This code assumes the byte swapping has already been done! */ + q->index = index; + q->q_size = q_size; + q->msg_size = msg_size; + q->msg_pool.host = pool_start; + q->peer = (struct c2_mq_shared __iomem *) peer; + q->magic = C2_MQ_MAGIC; + q->type = type; + q->priv = 0; + q->hint_count = 0; + return; +} diff --git a/drivers/staging/rdma/amso1100/c2_mq.h b/drivers/staging/rdma/amso1100/c2_mq.h new file mode 100644 index 000000000..fc1b9a7ce --- /dev/null +++ b/drivers/staging/rdma/amso1100/c2_mq.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _C2_MQ_H_ +#define _C2_MQ_H_ +#include +#include +#include "c2_wr.h" + +enum c2_shared_regs { + + C2_SHARED_ARMED = 0x10, + C2_SHARED_NOTIFY = 0x18, + C2_SHARED_SHARED = 0x40, +}; + +struct c2_mq_shared { + u16 unused1; + u8 armed; + u8 notification_type; + u32 unused2; + u16 shared; + /* Pad to 64 bytes. */ + u8 pad[64 - sizeof(u16) - 2 * sizeof(u8) - sizeof(u32) - sizeof(u16)]; +}; + +enum c2_mq_type { + C2_MQ_HOST_TARGET = 1, + C2_MQ_ADAPTER_TARGET = 2, +}; + +/* + * c2_mq_t is for kernel-mode MQs like the VQs Cand the AEQ. + * c2_user_mq_t (which is the same format) is for user-mode MQs... + */ +#define C2_MQ_MAGIC 0x4d512020 /* 'MQ ' */ +struct c2_mq { + u32 magic; + union { + u8 *host; + u8 __iomem *adapter; + } msg_pool; + dma_addr_t host_dma; + DEFINE_DMA_UNMAP_ADDR(mapping); + u16 hint_count; + u16 priv; + struct c2_mq_shared __iomem *peer; + __be16 *shared; + dma_addr_t shared_dma; + u32 q_size; + u32 msg_size; + u32 index; + enum c2_mq_type type; +}; + +static __inline__ int c2_mq_empty(struct c2_mq *q) +{ + return q->priv == be16_to_cpu(*q->shared); +} + +static __inline__ int c2_mq_full(struct c2_mq *q) +{ + return q->priv == (be16_to_cpu(*q->shared) + q->q_size - 1) % q->q_size; +} + +extern void c2_mq_lconsume(struct c2_mq *q, u32 wqe_count); +extern void *c2_mq_alloc(struct c2_mq *q); +extern void c2_mq_produce(struct c2_mq *q); +extern void *c2_mq_consume(struct c2_mq *q); +extern void c2_mq_free(struct c2_mq *q); +extern void c2_mq_req_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size, + u8 __iomem *pool_start, u16 __iomem *peer, u32 type); +extern void c2_mq_rep_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size, + u8 *pool_start, u16 __iomem *peer, u32 type); + +#endif /* _C2_MQ_H_ */ diff --git a/drivers/staging/rdma/amso1100/c2_pd.c b/drivers/staging/rdma/amso1100/c2_pd.c new file mode 100644 index 000000000..f3e81dc35 --- /dev/null +++ b/drivers/staging/rdma/amso1100/c2_pd.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "c2.h" +#include "c2_provider.h" + +int c2_pd_alloc(struct c2_dev *c2dev, int privileged, struct c2_pd *pd) +{ + u32 obj; + int ret = 0; + + spin_lock(&c2dev->pd_table.lock); + obj = find_next_zero_bit(c2dev->pd_table.table, c2dev->pd_table.max, + c2dev->pd_table.last); + if (obj >= c2dev->pd_table.max) + obj = find_first_zero_bit(c2dev->pd_table.table, + c2dev->pd_table.max); + if (obj < c2dev->pd_table.max) { + pd->pd_id = obj; + __set_bit(obj, c2dev->pd_table.table); + c2dev->pd_table.last = obj+1; + if (c2dev->pd_table.last >= c2dev->pd_table.max) + c2dev->pd_table.last = 0; + } else + ret = -ENOMEM; + spin_unlock(&c2dev->pd_table.lock); + return ret; +} + +void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd) +{ + spin_lock(&c2dev->pd_table.lock); + __clear_bit(pd->pd_id, c2dev->pd_table.table); + spin_unlock(&c2dev->pd_table.lock); +} + +int c2_init_pd_table(struct c2_dev *c2dev) +{ + + c2dev->pd_table.last = 0; + c2dev->pd_table.max = c2dev->props.max_pd; + spin_lock_init(&c2dev->pd_table.lock); + c2dev->pd_table.table = kmalloc(BITS_TO_LONGS(c2dev->props.max_pd) * + sizeof(long), GFP_KERNEL); + if (!c2dev->pd_table.table) + return -ENOMEM; + bitmap_zero(c2dev->pd_table.table, c2dev->props.max_pd); + return 0; +} + +void c2_cleanup_pd_table(struct c2_dev *c2dev) +{ + kfree(c2dev->pd_table.table); +} diff --git a/drivers/staging/rdma/amso1100/c2_provider.c b/drivers/staging/rdma/amso1100/c2_provider.c new file mode 100644 index 000000000..25c3f0085 --- /dev/null +++ b/drivers/staging/rdma/amso1100/c2_provider.c @@ -0,0 +1,912 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include "c2.h" +#include "c2_provider.h" +#include "c2_user.h" + +static int c2_query_device(struct ib_device *ibdev, struct ib_device_attr *props, + struct ib_udata *uhw) +{ + struct c2_dev *c2dev = to_c2dev(ibdev); + + pr_debug("%s:%u\n", __func__, __LINE__); + + if (uhw->inlen || uhw->outlen) + return -EINVAL; + + *props = c2dev->props; + return 0; +} + +static int c2_query_port(struct ib_device *ibdev, + u8 port, struct ib_port_attr *props) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + + props->max_mtu = IB_MTU_4096; + props->lid = 0; + props->lmc = 0; + props->sm_lid = 0; + props->sm_sl = 0; + props->state = IB_PORT_ACTIVE; + props->phys_state = 0; + props->port_cap_flags = + IB_PORT_CM_SUP | + IB_PORT_REINIT_SUP | + IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP; + props->gid_tbl_len = 1; + props->pkey_tbl_len = 1; + props->qkey_viol_cntr = 0; + props->active_width = 1; + props->active_speed = IB_SPEED_SDR; + + return 0; +} + +static int c2_query_pkey(struct ib_device *ibdev, + u8 port, u16 index, u16 * pkey) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + *pkey = 0; + return 0; +} + +static int c2_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct c2_dev *c2dev = to_c2dev(ibdev); + + pr_debug("%s:%u\n", __func__, __LINE__); + memset(&(gid->raw[0]), 0, sizeof(gid->raw)); + memcpy(&(gid->raw[0]), c2dev->pseudo_netdev->dev_addr, 6); + + return 0; +} + +/* Allocate the user context data structure. This keeps track + * of all objects associated with a particular user-mode client. + */ +static struct ib_ucontext *c2_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct c2_ucontext *context; + + pr_debug("%s:%u\n", __func__, __LINE__); + context = kmalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return ERR_PTR(-ENOMEM); + + return &context->ibucontext; +} + +static int c2_dealloc_ucontext(struct ib_ucontext *context) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + kfree(context); + return 0; +} + +static int c2_mmap_uar(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return -ENOSYS; +} + +static struct ib_pd *c2_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct c2_pd *pd; + int err; + + pr_debug("%s:%u\n", __func__, __LINE__); + + pd = kmalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + err = c2_pd_alloc(to_c2dev(ibdev), !context, pd); + if (err) { + kfree(pd); + return ERR_PTR(err); + } + + if (context) { + if (ib_copy_to_udata(udata, &pd->pd_id, sizeof(__u32))) { + c2_pd_free(to_c2dev(ibdev), pd); + kfree(pd); + return ERR_PTR(-EFAULT); + } + } + + return &pd->ibpd; +} + +static int c2_dealloc_pd(struct ib_pd *pd) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + c2_pd_free(to_c2dev(pd->device), to_c2pd(pd)); + kfree(pd); + + return 0; +} + +static struct ib_ah *c2_ah_create(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return ERR_PTR(-ENOSYS); +} + +static int c2_ah_destroy(struct ib_ah *ah) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return -ENOSYS; +} + +static void c2_add_ref(struct ib_qp *ibqp) +{ + struct c2_qp *qp; + BUG_ON(!ibqp); + qp = to_c2qp(ibqp); + atomic_inc(&qp->refcount); +} + +static void c2_rem_ref(struct ib_qp *ibqp) +{ + struct c2_qp *qp; + BUG_ON(!ibqp); + qp = to_c2qp(ibqp); + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +} + +struct ib_qp *c2_get_qp(struct ib_device *device, int qpn) +{ + struct c2_dev* c2dev = to_c2dev(device); + struct c2_qp *qp; + + qp = c2_find_qpn(c2dev, qpn); + pr_debug("%s Returning QP=%p for QPN=%d, device=%p, refcount=%d\n", + __func__, qp, qpn, device, + (qp?atomic_read(&qp->refcount):0)); + + return (qp?&qp->ibqp:NULL); +} + +static struct ib_qp *c2_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct c2_qp *qp; + int err; + + pr_debug("%s:%u\n", __func__, __LINE__); + + if (init_attr->create_flags) + return ERR_PTR(-EINVAL); + + switch (init_attr->qp_type) { + case IB_QPT_RC: + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) { + pr_debug("%s: Unable to allocate QP\n", __func__); + return ERR_PTR(-ENOMEM); + } + spin_lock_init(&qp->lock); + if (pd->uobject) { + /* userspace specific */ + } + + err = c2_alloc_qp(to_c2dev(pd->device), + to_c2pd(pd), init_attr, qp); + + if (err && pd->uobject) { + /* userspace specific */ + } + + break; + default: + pr_debug("%s: Invalid QP type: %d\n", __func__, + init_attr->qp_type); + return ERR_PTR(-EINVAL); + } + + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + return &qp->ibqp; +} + +static int c2_destroy_qp(struct ib_qp *ib_qp) +{ + struct c2_qp *qp = to_c2qp(ib_qp); + + pr_debug("%s:%u qp=%p,qp->state=%d\n", + __func__, __LINE__, ib_qp, qp->state); + c2_free_qp(to_c2dev(ib_qp->device), qp); + kfree(qp); + return 0; +} + +static struct ib_cq *c2_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + int entries = attr->cqe; + struct c2_cq *cq; + int err; + + if (attr->flags) + return ERR_PTR(-EINVAL); + + cq = kmalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) { + pr_debug("%s: Unable to allocate CQ\n", __func__); + return ERR_PTR(-ENOMEM); + } + + err = c2_init_cq(to_c2dev(ibdev), entries, NULL, cq); + if (err) { + pr_debug("%s: error initializing CQ\n", __func__); + kfree(cq); + return ERR_PTR(err); + } + + return &cq->ibcq; +} + +static int c2_destroy_cq(struct ib_cq *ib_cq) +{ + struct c2_cq *cq = to_c2cq(ib_cq); + + pr_debug("%s:%u\n", __func__, __LINE__); + + c2_free_cq(to_c2dev(ib_cq->device), cq); + kfree(cq); + + return 0; +} + +static inline u32 c2_convert_access(int acc) +{ + return (acc & IB_ACCESS_REMOTE_WRITE ? C2_ACF_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? C2_ACF_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? C2_ACF_LOCAL_WRITE : 0) | + C2_ACF_LOCAL_READ | C2_ACF_WINDOW_BIND; +} + +static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, int acc, u64 * iova_start) +{ + struct c2_mr *mr; + u64 *page_list; + u32 total_len; + int err, i, j, k, page_shift, pbl_depth; + + pbl_depth = 0; + total_len = 0; + + page_shift = PAGE_SHIFT; + /* + * If there is only 1 buffer we assume this could + * be a map of all phy mem...use a 32k page_shift. + */ + if (num_phys_buf == 1) + page_shift += 3; + + for (i = 0; i < num_phys_buf; i++) { + + if (buffer_list[i].addr & ~PAGE_MASK) { + pr_debug("Unaligned Memory Buffer: 0x%x\n", + (unsigned int) buffer_list[i].addr); + return ERR_PTR(-EINVAL); + } + + if (!buffer_list[i].size) { + pr_debug("Invalid Buffer Size\n"); + return ERR_PTR(-EINVAL); + } + + total_len += buffer_list[i].size; + pbl_depth += ALIGN(buffer_list[i].size, + (1 << page_shift)) >> page_shift; + } + + page_list = vmalloc(sizeof(u64) * pbl_depth); + if (!page_list) { + pr_debug("couldn't vmalloc page_list of size %zd\n", + (sizeof(u64) * pbl_depth)); + return ERR_PTR(-ENOMEM); + } + + for (i = 0, j = 0; i < num_phys_buf; i++) { + + int naddrs; + + naddrs = ALIGN(buffer_list[i].size, + (1 << page_shift)) >> page_shift; + for (k = 0; k < naddrs; k++) + page_list[j++] = (buffer_list[i].addr + + (k << page_shift)); + } + + mr = kmalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + vfree(page_list); + return ERR_PTR(-ENOMEM); + } + + mr->pd = to_c2pd(ib_pd); + mr->umem = NULL; + pr_debug("%s - page shift %d, pbl_depth %d, total_len %u, " + "*iova_start %llx, first pa %llx, last pa %llx\n", + __func__, page_shift, pbl_depth, total_len, + (unsigned long long) *iova_start, + (unsigned long long) page_list[0], + (unsigned long long) page_list[pbl_depth-1]); + err = c2_nsmr_register_phys_kern(to_c2dev(ib_pd->device), page_list, + (1 << page_shift), pbl_depth, + total_len, 0, iova_start, + c2_convert_access(acc), mr); + vfree(page_list); + if (err) { + kfree(mr); + return ERR_PTR(err); + } + + return &mr->ibmr; +} + +static struct ib_mr *c2_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct ib_phys_buf bl; + u64 kva = 0; + + pr_debug("%s:%u\n", __func__, __LINE__); + + /* AMSO1100 limit */ + bl.size = 0xffffffff; + bl.addr = 0; + return c2_reg_phys_mr(pd, &bl, 1, acc, &kva); +} + +static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int acc, struct ib_udata *udata) +{ + u64 *pages; + u64 kva = 0; + int shift, n, len; + int i, k, entry; + int err = 0; + struct scatterlist *sg; + struct c2_pd *c2pd = to_c2pd(pd); + struct c2_mr *c2mr; + + pr_debug("%s:%u\n", __func__, __LINE__); + + c2mr = kmalloc(sizeof(*c2mr), GFP_KERNEL); + if (!c2mr) + return ERR_PTR(-ENOMEM); + c2mr->pd = c2pd; + + c2mr->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0); + if (IS_ERR(c2mr->umem)) { + err = PTR_ERR(c2mr->umem); + kfree(c2mr); + return ERR_PTR(err); + } + + shift = ffs(c2mr->umem->page_size) - 1; + n = c2mr->umem->nmap; + + pages = kmalloc(n * sizeof(u64), GFP_KERNEL); + if (!pages) { + err = -ENOMEM; + goto err; + } + + i = 0; + for_each_sg(c2mr->umem->sg_head.sgl, sg, c2mr->umem->nmap, entry) { + len = sg_dma_len(sg) >> shift; + for (k = 0; k < len; ++k) { + pages[i++] = + sg_dma_address(sg) + + (c2mr->umem->page_size * k); + } + } + + kva = virt; + err = c2_nsmr_register_phys_kern(to_c2dev(pd->device), + pages, + c2mr->umem->page_size, + i, + length, + ib_umem_offset(c2mr->umem), + &kva, + c2_convert_access(acc), + c2mr); + kfree(pages); + if (err) + goto err; + return &c2mr->ibmr; + +err: + ib_umem_release(c2mr->umem); + kfree(c2mr); + return ERR_PTR(err); +} + +static int c2_dereg_mr(struct ib_mr *ib_mr) +{ + struct c2_mr *mr = to_c2mr(ib_mr); + int err; + + pr_debug("%s:%u\n", __func__, __LINE__); + + err = c2_stag_dealloc(to_c2dev(ib_mr->device), ib_mr->lkey); + if (err) + pr_debug("c2_stag_dealloc failed: %d\n", err); + else { + if (mr->umem) + ib_umem_release(mr->umem); + kfree(mr); + } + + return err; +} + +static ssize_t show_rev(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct c2_dev *c2dev = container_of(dev, struct c2_dev, ibdev.dev); + pr_debug("%s:%u\n", __func__, __LINE__); + return sprintf(buf, "%x\n", c2dev->props.hw_ver); +} + +static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct c2_dev *c2dev = container_of(dev, struct c2_dev, ibdev.dev); + pr_debug("%s:%u\n", __func__, __LINE__); + return sprintf(buf, "%x.%x.%x\n", + (int) (c2dev->props.fw_ver >> 32), + (int) (c2dev->props.fw_ver >> 16) & 0xffff, + (int) (c2dev->props.fw_ver & 0xffff)); +} + +static ssize_t show_hca(struct device *dev, struct device_attribute *attr, + char *buf) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return sprintf(buf, "AMSO1100\n"); +} + +static ssize_t show_board(struct device *dev, struct device_attribute *attr, + char *buf) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return sprintf(buf, "%.*s\n", 32, "AMSO1100 Board ID"); +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); + +static struct device_attribute *c2_dev_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_fw_ver, + &dev_attr_hca_type, + &dev_attr_board_id +}; + +static int c2_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + int err; + + err = + c2_qp_modify(to_c2dev(ibqp->device), to_c2qp(ibqp), attr, + attr_mask); + + return err; +} + +static int c2_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return -ENOSYS; +} + +static int c2_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return -ENOSYS; +} + +static int c2_process_mad(struct ib_device *ibdev, + int mad_flags, + u8 port_num, + const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad_hdr *in_mad, + size_t in_mad_size, + struct ib_mad_hdr *out_mad, + size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + return -ENOSYS; +} + +static int c2_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + + /* Request a connection */ + return c2_llp_connect(cm_id, iw_param); +} + +static int c2_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + + /* Accept the new connection */ + return c2_llp_accept(cm_id, iw_param); +} + +static int c2_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) +{ + int err; + + pr_debug("%s:%u\n", __func__, __LINE__); + + err = c2_llp_reject(cm_id, pdata, pdata_len); + return err; +} + +static int c2_service_create(struct iw_cm_id *cm_id, int backlog) +{ + int err; + + pr_debug("%s:%u\n", __func__, __LINE__); + err = c2_llp_service_create(cm_id, backlog); + pr_debug("%s:%u err=%d\n", + __func__, __LINE__, + err); + return err; +} + +static int c2_service_destroy(struct iw_cm_id *cm_id) +{ + int err; + pr_debug("%s:%u\n", __func__, __LINE__); + + err = c2_llp_service_destroy(cm_id); + + return err; +} + +static int c2_pseudo_up(struct net_device *netdev) +{ + struct in_device *ind; + struct c2_dev *c2dev = netdev->ml_priv; + + ind = in_dev_get(netdev); + if (!ind) + return 0; + + pr_debug("adding...\n"); + for_ifa(ind) { +#ifdef DEBUG + u8 *ip = (u8 *) & ifa->ifa_address; + + pr_debug("%s: %d.%d.%d.%d\n", + ifa->ifa_label, ip[0], ip[1], ip[2], ip[3]); +#endif + c2_add_addr(c2dev, ifa->ifa_address, ifa->ifa_mask); + } + endfor_ifa(ind); + in_dev_put(ind); + + return 0; +} + +static int c2_pseudo_down(struct net_device *netdev) +{ + struct in_device *ind; + struct c2_dev *c2dev = netdev->ml_priv; + + ind = in_dev_get(netdev); + if (!ind) + return 0; + + pr_debug("deleting...\n"); + for_ifa(ind) { +#ifdef DEBUG + u8 *ip = (u8 *) & ifa->ifa_address; + + pr_debug("%s: %d.%d.%d.%d\n", + ifa->ifa_label, ip[0], ip[1], ip[2], ip[3]); +#endif + c2_del_addr(c2dev, ifa->ifa_address, ifa->ifa_mask); + } + endfor_ifa(ind); + in_dev_put(ind); + + return 0; +} + +static int c2_pseudo_xmit_frame(struct sk_buff *skb, struct net_device *netdev) +{ + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static int c2_pseudo_change_mtu(struct net_device *netdev, int new_mtu) +{ + if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU) + return -EINVAL; + + netdev->mtu = new_mtu; + + /* TODO: Tell rnic about new rmda interface mtu */ + return 0; +} + +static const struct net_device_ops c2_pseudo_netdev_ops = { + .ndo_open = c2_pseudo_up, + .ndo_stop = c2_pseudo_down, + .ndo_start_xmit = c2_pseudo_xmit_frame, + .ndo_change_mtu = c2_pseudo_change_mtu, + .ndo_validate_addr = eth_validate_addr, +}; + +static void setup(struct net_device *netdev) +{ + netdev->netdev_ops = &c2_pseudo_netdev_ops; + + netdev->watchdog_timeo = 0; + netdev->type = ARPHRD_ETHER; + netdev->mtu = 1500; + netdev->hard_header_len = ETH_HLEN; + netdev->addr_len = ETH_ALEN; + netdev->tx_queue_len = 0; + netdev->flags |= IFF_NOARP; +} + +static struct net_device *c2_pseudo_netdev_init(struct c2_dev *c2dev) +{ + char name[IFNAMSIZ]; + struct net_device *netdev; + + /* change ethxxx to iwxxx */ + strcpy(name, "iw"); + strcat(name, &c2dev->netdev->name[3]); + netdev = alloc_netdev(0, name, NET_NAME_UNKNOWN, setup); + if (!netdev) { + printk(KERN_ERR PFX "%s - etherdev alloc failed", + __func__); + return NULL; + } + + netdev->ml_priv = c2dev; + + SET_NETDEV_DEV(netdev, &c2dev->pcidev->dev); + + memcpy_fromio(netdev->dev_addr, c2dev->kva + C2_REGS_RDMA_ENADDR, 6); + + /* Print out the MAC address */ + pr_debug("%s: MAC %pM\n", netdev->name, netdev->dev_addr); + +#if 0 + /* Disable network packets */ + netif_stop_queue(netdev); +#endif + return netdev; +} + +static int c2_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + err = c2_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; + + return 0; +} + +int c2_register_device(struct c2_dev *dev) +{ + int ret = -ENOMEM; + int i; + + /* Register pseudo network device */ + dev->pseudo_netdev = c2_pseudo_netdev_init(dev); + if (!dev->pseudo_netdev) + goto out; + + ret = register_netdev(dev->pseudo_netdev); + if (ret) + goto out_free_netdev; + + pr_debug("%s:%u\n", __func__, __LINE__); + strlcpy(dev->ibdev.name, "amso%d", IB_DEVICE_NAME_MAX); + dev->ibdev.owner = THIS_MODULE; + dev->ibdev.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV); + + dev->ibdev.node_type = RDMA_NODE_RNIC; + memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); + memcpy(&dev->ibdev.node_guid, dev->pseudo_netdev->dev_addr, 6); + dev->ibdev.phys_port_cnt = 1; + dev->ibdev.num_comp_vectors = 1; + dev->ibdev.dma_device = &dev->pcidev->dev; + dev->ibdev.query_device = c2_query_device; + dev->ibdev.query_port = c2_query_port; + dev->ibdev.query_pkey = c2_query_pkey; + dev->ibdev.query_gid = c2_query_gid; + dev->ibdev.alloc_ucontext = c2_alloc_ucontext; + dev->ibdev.dealloc_ucontext = c2_dealloc_ucontext; + dev->ibdev.mmap = c2_mmap_uar; + dev->ibdev.alloc_pd = c2_alloc_pd; + dev->ibdev.dealloc_pd = c2_dealloc_pd; + dev->ibdev.create_ah = c2_ah_create; + dev->ibdev.destroy_ah = c2_ah_destroy; + dev->ibdev.create_qp = c2_create_qp; + dev->ibdev.modify_qp = c2_modify_qp; + dev->ibdev.destroy_qp = c2_destroy_qp; + dev->ibdev.create_cq = c2_create_cq; + dev->ibdev.destroy_cq = c2_destroy_cq; + dev->ibdev.poll_cq = c2_poll_cq; + dev->ibdev.get_dma_mr = c2_get_dma_mr; + dev->ibdev.reg_phys_mr = c2_reg_phys_mr; + dev->ibdev.reg_user_mr = c2_reg_user_mr; + dev->ibdev.dereg_mr = c2_dereg_mr; + dev->ibdev.get_port_immutable = c2_port_immutable; + + dev->ibdev.alloc_fmr = NULL; + dev->ibdev.unmap_fmr = NULL; + dev->ibdev.dealloc_fmr = NULL; + dev->ibdev.map_phys_fmr = NULL; + + dev->ibdev.attach_mcast = c2_multicast_attach; + dev->ibdev.detach_mcast = c2_multicast_detach; + dev->ibdev.process_mad = c2_process_mad; + + dev->ibdev.req_notify_cq = c2_arm_cq; + dev->ibdev.post_send = c2_post_send; + dev->ibdev.post_recv = c2_post_receive; + + dev->ibdev.iwcm = kmalloc(sizeof(*dev->ibdev.iwcm), GFP_KERNEL); + if (dev->ibdev.iwcm == NULL) { + ret = -ENOMEM; + goto out_unregister_netdev; + } + dev->ibdev.iwcm->add_ref = c2_add_ref; + dev->ibdev.iwcm->rem_ref = c2_rem_ref; + dev->ibdev.iwcm->get_qp = c2_get_qp; + dev->ibdev.iwcm->connect = c2_connect; + dev->ibdev.iwcm->accept = c2_accept; + dev->ibdev.iwcm->reject = c2_reject; + dev->ibdev.iwcm->create_listen = c2_service_create; + dev->ibdev.iwcm->destroy_listen = c2_service_destroy; + + ret = ib_register_device(&dev->ibdev, NULL); + if (ret) + goto out_free_iwcm; + + for (i = 0; i < ARRAY_SIZE(c2_dev_attributes); ++i) { + ret = device_create_file(&dev->ibdev.dev, + c2_dev_attributes[i]); + if (ret) + goto out_unregister_ibdev; + } + goto out; + +out_unregister_ibdev: + ib_unregister_device(&dev->ibdev); +out_free_iwcm: + kfree(dev->ibdev.iwcm); +out_unregister_netdev: + unregister_netdev(dev->pseudo_netdev); +out_free_netdev: + free_netdev(dev->pseudo_netdev); +out: + pr_debug("%s:%u ret=%d\n", __func__, __LINE__, ret); + return ret; +} + +void c2_unregister_device(struct c2_dev *dev) +{ + pr_debug("%s:%u\n", __func__, __LINE__); + unregister_netdev(dev->pseudo_netdev); + free_netdev(dev->pseudo_netdev); + ib_unregister_device(&dev->ibdev); +} diff --git a/drivers/staging/rdma/amso1100/c2_provider.h b/drivers/staging/rdma/amso1100/c2_provider.h new file mode 100644 index 000000000..bf1899877 --- /dev/null +++ b/drivers/staging/rdma/amso1100/c2_provider.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef C2_PROVIDER_H +#define C2_PROVIDER_H +#include + +#include +#include + +#include "c2_mq.h" +#include + +#define C2_MPT_FLAG_ATOMIC (1 << 14) +#define C2_MPT_FLAG_REMOTE_WRITE (1 << 13) +#define C2_MPT_FLAG_REMOTE_READ (1 << 12) +#define C2_MPT_FLAG_LOCAL_WRITE (1 << 11) +#define C2_MPT_FLAG_LOCAL_READ (1 << 10) + +struct c2_buf_list { + void *buf; + DEFINE_DMA_UNMAP_ADDR(mapping); +}; + + +/* The user context keeps track of objects allocated for a + * particular user-mode client. */ +struct c2_ucontext { + struct ib_ucontext ibucontext; +}; + +struct c2_mtt; + +/* All objects associated with a PD are kept in the + * associated user context if present. + */ +struct c2_pd { + struct ib_pd ibpd; + u32 pd_id; +}; + +struct c2_mr { + struct ib_mr ibmr; + struct c2_pd *pd; + struct ib_umem *umem; +}; + +struct c2_av; + +enum c2_ah_type { + C2_AH_ON_HCA, + C2_AH_PCI_POOL, + C2_AH_KMALLOC +}; + +struct c2_ah { + struct ib_ah ibah; +}; + +struct c2_cq { + struct ib_cq ibcq; + spinlock_t lock; + atomic_t refcount; + int cqn; + int is_kernel; + wait_queue_head_t wait; + + u32 adapter_handle; + struct c2_mq mq; +}; + +struct c2_wq { + spinlock_t lock; +}; +struct iw_cm_id; +struct c2_qp { + struct ib_qp ibqp; + struct iw_cm_id *cm_id; + spinlock_t lock; + atomic_t refcount; + wait_queue_head_t wait; + int qpn; + + u32 adapter_handle; + u32 send_sgl_depth; + u32 recv_sgl_depth; + u32 rdma_write_sgl_depth; + u8 state; + + struct c2_mq sq_mq; + struct c2_mq rq_mq; +}; + +struct c2_cr_query_attrs { + u32 local_addr; + u32 remote_addr; + u16 local_port; + u16 remote_port; +}; + +static inline struct c2_pd *to_c2pd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct c2_pd, ibpd); +} + +static inline struct c2_ucontext *to_c2ucontext(struct ib_ucontext *ibucontext) +{ + return container_of(ibucontext, struct c2_ucontext, ibucontext); +} + +static inline struct c2_mr *to_c2mr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct c2_mr, ibmr); +} + + +static inline struct c2_ah *to_c2ah(struct ib_ah *ibah) +{ + return container_of(ibah, struct c2_ah, ibah); +} + +static inline struct c2_cq *to_c2cq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct c2_cq, ibcq); +} + +static inline struct c2_qp *to_c2qp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct c2_qp, ibqp); +} + +static inline int is_rnic_addr(struct net_device *netdev, u32 addr) +{ + struct in_device *ind; + int ret = 0; + + ind = in_dev_get(netdev); + if (!ind) + return 0; + + for_ifa(ind) { + if (ifa->ifa_address == addr) { + ret = 1; + break; + } + } + endfor_ifa(ind); + in_dev_put(ind); + return ret; +} +#endif /* C2_PROVIDER_H */ diff --git a/drivers/staging/rdma/amso1100/c2_qp.c b/drivers/staging/rdma/amso1100/c2_qp.c new file mode 100644 index 000000000..86708dee5 --- /dev/null +++ b/drivers/staging/rdma/amso1100/c2_qp.c @@ -0,0 +1,1024 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies. All rights reserved. + * Copyright (c) 2004 Voltaire, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include + +#include "c2.h" +#include "c2_vq.h" +#include "c2_status.h" + +#define C2_MAX_ORD_PER_QP 128 +#define C2_MAX_IRD_PER_QP 128 + +#define C2_HINT_MAKE(q_index, hint_count) (((q_index) << 16) | hint_count) +#define C2_HINT_GET_INDEX(hint) (((hint) & 0x7FFF0000) >> 16) +#define C2_HINT_GET_COUNT(hint) ((hint) & 0x0000FFFF) + +#define NO_SUPPORT -1 +static const u8 c2_opcode[] = { + [IB_WR_SEND] = C2_WR_TYPE_SEND, + [IB_WR_SEND_WITH_IMM] = NO_SUPPORT, + [IB_WR_RDMA_WRITE] = C2_WR_TYPE_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = NO_SUPPORT, + [IB_WR_RDMA_READ] = C2_WR_TYPE_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = NO_SUPPORT, + [IB_WR_ATOMIC_FETCH_AND_ADD] = NO_SUPPORT, +}; + +static int to_c2_state(enum ib_qp_state ib_state) +{ + switch (ib_state) { + case IB_QPS_RESET: + return C2_QP_STATE_IDLE; + case IB_QPS_RTS: + return C2_QP_STATE_RTS; + case IB_QPS_SQD: + return C2_QP_STATE_CLOSING; + case IB_QPS_SQE: + return C2_QP_STATE_CLOSING; + case IB_QPS_ERR: + return C2_QP_STATE_ERROR; + default: + return -1; + } +} + +static int to_ib_state(enum c2_qp_state c2_state) +{ + switch (c2_state) { + case C2_QP_STATE_IDLE: + return IB_QPS_RESET; + case C2_QP_STATE_CONNECTING: + return IB_QPS_RTR; + case C2_QP_STATE_RTS: + return IB_QPS_RTS; + case C2_QP_STATE_CLOSING: + return IB_QPS_SQD; + case C2_QP_STATE_ERROR: + return IB_QPS_ERR; + case C2_QP_STATE_TERMINATE: + return IB_QPS_SQE; + default: + return -1; + } +} + +static const char *to_ib_state_str(int ib_state) +{ + static const char *state_str[] = { + "IB_QPS_RESET", + "IB_QPS_INIT", + "IB_QPS_RTR", + "IB_QPS_RTS", + "IB_QPS_SQD", + "IB_QPS_SQE", + "IB_QPS_ERR" + }; + if (ib_state < IB_QPS_RESET || + ib_state > IB_QPS_ERR) + return ""; + + ib_state -= IB_QPS_RESET; + return state_str[ib_state]; +} + +void c2_set_qp_state(struct c2_qp *qp, int c2_state) +{ + int new_state = to_ib_state(c2_state); + + pr_debug("%s: qp[%p] state modify %s --> %s\n", + __func__, + qp, + to_ib_state_str(qp->state), + to_ib_state_str(new_state)); + qp->state = new_state; +} + +#define C2_QP_NO_ATTR_CHANGE 0xFFFFFFFF + +int c2_qp_modify(struct c2_dev *c2dev, struct c2_qp *qp, + struct ib_qp_attr *attr, int attr_mask) +{ + struct c2wr_qp_modify_req wr; + struct c2wr_qp_modify_rep *reply; + struct c2_vq_req *vq_req; + unsigned long flags; + u8 next_state; + int err; + + pr_debug("%s:%d qp=%p, %s --> %s\n", + __func__, __LINE__, + qp, + to_ib_state_str(qp->state), + to_ib_state_str(attr->qp_state)); + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + c2_wr_set_id(&wr, CCWR_QP_MODIFY); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.qp_handle = qp->adapter_handle; + wr.ord = cpu_to_be32(C2_QP_NO_ATTR_CHANGE); + wr.ird = cpu_to_be32(C2_QP_NO_ATTR_CHANGE); + wr.sq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE); + wr.rq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE); + + if (attr_mask & IB_QP_STATE) { + /* Ensure the state is valid */ + if (attr->qp_state < 0 || attr->qp_state > IB_QPS_ERR) { + err = -EINVAL; + goto bail0; + } + + wr.next_qp_state = cpu_to_be32(to_c2_state(attr->qp_state)); + + if (attr->qp_state == IB_QPS_ERR) { + spin_lock_irqsave(&qp->lock, flags); + if (qp->cm_id && qp->state == IB_QPS_RTS) { + pr_debug("Generating CLOSE event for QP-->ERR, " + "qp=%p, cm_id=%p\n",qp,qp->cm_id); + /* Generate an CLOSE event */ + vq_req->cm_id = qp->cm_id; + vq_req->event = IW_CM_EVENT_CLOSE; + } + spin_unlock_irqrestore(&qp->lock, flags); + } + next_state = attr->qp_state; + + } else if (attr_mask & IB_QP_CUR_STATE) { + + if (attr->cur_qp_state != IB_QPS_RTR && + attr->cur_qp_state != IB_QPS_RTS && + attr->cur_qp_state != IB_QPS_SQD && + attr->cur_qp_state != IB_QPS_SQE) { + err = -EINVAL; + goto bail0; + } else + wr.next_qp_state = + cpu_to_be32(to_c2_state(attr->cur_qp_state)); + + next_state = attr->cur_qp_state; + + } else { + err = 0; + goto bail0; + } + + /* reference the request struct */ + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail0; + + reply = (struct c2wr_qp_modify_rep *) (unsigned long) vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail0; + } + + err = c2_errno(reply); + if (!err) + qp->state = next_state; +#ifdef DEBUG + else + pr_debug("%s: c2_errno=%d\n", __func__, err); +#endif + /* + * If we're going to error and generating the event here, then + * we need to remove the reference because there will be no + * close event generated by the adapter + */ + spin_lock_irqsave(&qp->lock, flags); + if (vq_req->event==IW_CM_EVENT_CLOSE && qp->cm_id) { + qp->cm_id->rem_ref(qp->cm_id); + qp->cm_id = NULL; + } + spin_unlock_irqrestore(&qp->lock, flags); + + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + + pr_debug("%s:%d qp=%p, cur_state=%s\n", + __func__, __LINE__, + qp, + to_ib_state_str(qp->state)); + return err; +} + +int c2_qp_set_read_limits(struct c2_dev *c2dev, struct c2_qp *qp, + int ord, int ird) +{ + struct c2wr_qp_modify_req wr; + struct c2wr_qp_modify_rep *reply; + struct c2_vq_req *vq_req; + int err; + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + c2_wr_set_id(&wr, CCWR_QP_MODIFY); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.qp_handle = qp->adapter_handle; + wr.ord = cpu_to_be32(ord); + wr.ird = cpu_to_be32(ird); + wr.sq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE); + wr.rq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE); + wr.next_qp_state = cpu_to_be32(C2_QP_NO_ATTR_CHANGE); + + /* reference the request struct */ + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail0; + + reply = (struct c2wr_qp_modify_rep *) (unsigned long) + vq_req->reply_msg; + if (!reply) { + err = -ENOMEM; + goto bail0; + } + + err = c2_errno(reply); + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +static int destroy_qp(struct c2_dev *c2dev, struct c2_qp *qp) +{ + struct c2_vq_req *vq_req; + struct c2wr_qp_destroy_req wr; + struct c2wr_qp_destroy_rep *reply; + unsigned long flags; + int err; + + /* + * Allocate a verb request message + */ + vq_req = vq_req_alloc(c2dev); + if (!vq_req) { + return -ENOMEM; + } + + /* + * Initialize the WR + */ + c2_wr_set_id(&wr, CCWR_QP_DESTROY); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.qp_handle = qp->adapter_handle; + + /* + * reference the request struct. dereferenced in the int handler. + */ + vq_req_get(c2dev, vq_req); + + spin_lock_irqsave(&qp->lock, flags); + if (qp->cm_id && qp->state == IB_QPS_RTS) { + pr_debug("destroy_qp: generating CLOSE event for QP-->ERR, " + "qp=%p, cm_id=%p\n",qp,qp->cm_id); + /* Generate an CLOSE event */ + vq_req->qp = qp; + vq_req->cm_id = qp->cm_id; + vq_req->event = IW_CM_EVENT_CLOSE; + } + spin_unlock_irqrestore(&qp->lock, flags); + + /* + * Send WR to adapter + */ + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + /* + * Wait for reply from adapter + */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) { + goto bail0; + } + + /* + * Process reply + */ + reply = (struct c2wr_qp_destroy_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail0; + } + + spin_lock_irqsave(&qp->lock, flags); + if (qp->cm_id) { + qp->cm_id->rem_ref(qp->cm_id); + qp->cm_id = NULL; + } + spin_unlock_irqrestore(&qp->lock, flags); + + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +static int c2_alloc_qpn(struct c2_dev *c2dev, struct c2_qp *qp) +{ + int ret; + + idr_preload(GFP_KERNEL); + spin_lock_irq(&c2dev->qp_table.lock); + + ret = idr_alloc_cyclic(&c2dev->qp_table.idr, qp, 0, 0, GFP_NOWAIT); + if (ret >= 0) + qp->qpn = ret; + + spin_unlock_irq(&c2dev->qp_table.lock); + idr_preload_end(); + return ret < 0 ? ret : 0; +} + +static void c2_free_qpn(struct c2_dev *c2dev, int qpn) +{ + spin_lock_irq(&c2dev->qp_table.lock); + idr_remove(&c2dev->qp_table.idr, qpn); + spin_unlock_irq(&c2dev->qp_table.lock); +} + +struct c2_qp *c2_find_qpn(struct c2_dev *c2dev, int qpn) +{ + unsigned long flags; + struct c2_qp *qp; + + spin_lock_irqsave(&c2dev->qp_table.lock, flags); + qp = idr_find(&c2dev->qp_table.idr, qpn); + spin_unlock_irqrestore(&c2dev->qp_table.lock, flags); + return qp; +} + +int c2_alloc_qp(struct c2_dev *c2dev, + struct c2_pd *pd, + struct ib_qp_init_attr *qp_attrs, struct c2_qp *qp) +{ + struct c2wr_qp_create_req wr; + struct c2wr_qp_create_rep *reply; + struct c2_vq_req *vq_req; + struct c2_cq *send_cq = to_c2cq(qp_attrs->send_cq); + struct c2_cq *recv_cq = to_c2cq(qp_attrs->recv_cq); + unsigned long peer_pa; + u32 q_size, msg_size, mmap_size; + void __iomem *mmap; + int err; + + err = c2_alloc_qpn(c2dev, qp); + if (err) + return err; + qp->ibqp.qp_num = qp->qpn; + qp->ibqp.qp_type = IB_QPT_RC; + + /* Allocate the SQ and RQ shared pointers */ + qp->sq_mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool, + &qp->sq_mq.shared_dma, GFP_KERNEL); + if (!qp->sq_mq.shared) { + err = -ENOMEM; + goto bail0; + } + + qp->rq_mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool, + &qp->rq_mq.shared_dma, GFP_KERNEL); + if (!qp->rq_mq.shared) { + err = -ENOMEM; + goto bail1; + } + + /* Allocate the verbs request */ + vq_req = vq_req_alloc(c2dev); + if (vq_req == NULL) { + err = -ENOMEM; + goto bail2; + } + + /* Initialize the work request */ + memset(&wr, 0, sizeof(wr)); + c2_wr_set_id(&wr, CCWR_QP_CREATE); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + wr.sq_cq_handle = send_cq->adapter_handle; + wr.rq_cq_handle = recv_cq->adapter_handle; + wr.sq_depth = cpu_to_be32(qp_attrs->cap.max_send_wr + 1); + wr.rq_depth = cpu_to_be32(qp_attrs->cap.max_recv_wr + 1); + wr.srq_handle = 0; + wr.flags = cpu_to_be32(QP_RDMA_READ | QP_RDMA_WRITE | QP_MW_BIND | + QP_ZERO_STAG | QP_RDMA_READ_RESPONSE); + wr.send_sgl_depth = cpu_to_be32(qp_attrs->cap.max_send_sge); + wr.recv_sgl_depth = cpu_to_be32(qp_attrs->cap.max_recv_sge); + wr.rdma_write_sgl_depth = cpu_to_be32(qp_attrs->cap.max_send_sge); + wr.shared_sq_ht = cpu_to_be64(qp->sq_mq.shared_dma); + wr.shared_rq_ht = cpu_to_be64(qp->rq_mq.shared_dma); + wr.ord = cpu_to_be32(C2_MAX_ORD_PER_QP); + wr.ird = cpu_to_be32(C2_MAX_IRD_PER_QP); + wr.pd_id = pd->pd_id; + wr.user_context = (unsigned long) qp; + + vq_req_get(c2dev, vq_req); + + /* Send the WR to the adapter */ + err = vq_send_wr(c2dev, (union c2wr *) & wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail3; + } + + /* Wait for the verb reply */ + err = vq_wait_for_reply(c2dev, vq_req); + if (err) { + goto bail3; + } + + /* Process the reply */ + reply = (struct c2wr_qp_create_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail3; + } + + if ((err = c2_wr_get_result(reply)) != 0) { + goto bail4; + } + + /* Fill in the kernel QP struct */ + atomic_set(&qp->refcount, 1); + qp->adapter_handle = reply->qp_handle; + qp->state = IB_QPS_RESET; + qp->send_sgl_depth = qp_attrs->cap.max_send_sge; + qp->rdma_write_sgl_depth = qp_attrs->cap.max_send_sge; + qp->recv_sgl_depth = qp_attrs->cap.max_recv_sge; + init_waitqueue_head(&qp->wait); + + /* Initialize the SQ MQ */ + q_size = be32_to_cpu(reply->sq_depth); + msg_size = be32_to_cpu(reply->sq_msg_size); + peer_pa = c2dev->pa + be32_to_cpu(reply->sq_mq_start); + mmap_size = PAGE_ALIGN(sizeof(struct c2_mq_shared) + msg_size * q_size); + mmap = ioremap_nocache(peer_pa, mmap_size); + if (!mmap) { + err = -ENOMEM; + goto bail5; + } + + c2_mq_req_init(&qp->sq_mq, + be32_to_cpu(reply->sq_mq_index), + q_size, + msg_size, + mmap + sizeof(struct c2_mq_shared), /* pool start */ + mmap, /* peer */ + C2_MQ_ADAPTER_TARGET); + + /* Initialize the RQ mq */ + q_size = be32_to_cpu(reply->rq_depth); + msg_size = be32_to_cpu(reply->rq_msg_size); + peer_pa = c2dev->pa + be32_to_cpu(reply->rq_mq_start); + mmap_size = PAGE_ALIGN(sizeof(struct c2_mq_shared) + msg_size * q_size); + mmap = ioremap_nocache(peer_pa, mmap_size); + if (!mmap) { + err = -ENOMEM; + goto bail6; + } + + c2_mq_req_init(&qp->rq_mq, + be32_to_cpu(reply->rq_mq_index), + q_size, + msg_size, + mmap + sizeof(struct c2_mq_shared), /* pool start */ + mmap, /* peer */ + C2_MQ_ADAPTER_TARGET); + + vq_repbuf_free(c2dev, reply); + vq_req_free(c2dev, vq_req); + + return 0; + + bail6: + iounmap(qp->sq_mq.peer); + bail5: + destroy_qp(c2dev, qp); + bail4: + vq_repbuf_free(c2dev, reply); + bail3: + vq_req_free(c2dev, vq_req); + bail2: + c2_free_mqsp(qp->rq_mq.shared); + bail1: + c2_free_mqsp(qp->sq_mq.shared); + bail0: + c2_free_qpn(c2dev, qp->qpn); + return err; +} + +static inline void c2_lock_cqs(struct c2_cq *send_cq, struct c2_cq *recv_cq) +{ + if (send_cq == recv_cq) + spin_lock_irq(&send_cq->lock); + else if (send_cq > recv_cq) { + spin_lock_irq(&send_cq->lock); + spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock_irq(&recv_cq->lock); + spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); + } +} + +static inline void c2_unlock_cqs(struct c2_cq *send_cq, struct c2_cq *recv_cq) +{ + if (send_cq == recv_cq) + spin_unlock_irq(&send_cq->lock); + else if (send_cq > recv_cq) { + spin_unlock(&recv_cq->lock); + spin_unlock_irq(&send_cq->lock); + } else { + spin_unlock(&send_cq->lock); + spin_unlock_irq(&recv_cq->lock); + } +} + +void c2_free_qp(struct c2_dev *c2dev, struct c2_qp *qp) +{ + struct c2_cq *send_cq; + struct c2_cq *recv_cq; + + send_cq = to_c2cq(qp->ibqp.send_cq); + recv_cq = to_c2cq(qp->ibqp.recv_cq); + + /* + * Lock CQs here, so that CQ polling code can do QP lookup + * without taking a lock. + */ + c2_lock_cqs(send_cq, recv_cq); + c2_free_qpn(c2dev, qp->qpn); + c2_unlock_cqs(send_cq, recv_cq); + + /* + * Destroy qp in the rnic... + */ + destroy_qp(c2dev, qp); + + /* + * Mark any unreaped CQEs as null and void. + */ + c2_cq_clean(c2dev, qp, send_cq->cqn); + if (send_cq != recv_cq) + c2_cq_clean(c2dev, qp, recv_cq->cqn); + /* + * Unmap the MQs and return the shared pointers + * to the message pool. + */ + iounmap(qp->sq_mq.peer); + iounmap(qp->rq_mq.peer); + c2_free_mqsp(qp->sq_mq.shared); + c2_free_mqsp(qp->rq_mq.shared); + + atomic_dec(&qp->refcount); + wait_event(qp->wait, !atomic_read(&qp->refcount)); +} + +/* + * Function: move_sgl + * + * Description: + * Move an SGL from the user's work request struct into a CCIL Work Request + * message, swapping to WR byte order and ensure the total length doesn't + * overflow. + * + * IN: + * dst - ptr to CCIL Work Request message SGL memory. + * src - ptr to the consumers SGL memory. + * + * OUT: none + * + * Return: + * CCIL status codes. + */ +static int +move_sgl(struct c2_data_addr * dst, struct ib_sge *src, int count, u32 * p_len, + u8 * actual_count) +{ + u32 tot = 0; /* running total */ + u8 acount = 0; /* running total non-0 len sge's */ + + while (count > 0) { + /* + * If the addition of this SGE causes the + * total SGL length to exceed 2^32-1, then + * fail-n-bail. + * + * If the current total plus the next element length + * wraps, then it will go negative and be less than the + * current total... + */ + if ((tot + src->length) < tot) { + return -EINVAL; + } + /* + * Bug: 1456 (as well as 1498 & 1643) + * Skip over any sge's supplied with len=0 + */ + if (src->length) { + tot += src->length; + dst->stag = cpu_to_be32(src->lkey); + dst->to = cpu_to_be64(src->addr); + dst->length = cpu_to_be32(src->length); + dst++; + acount++; + } + src++; + count--; + } + + if (acount == 0) { + /* + * Bug: 1476 (as well as 1498, 1456 and 1643) + * Setup the SGL in the WR to make it easier for the RNIC. + * This way, the FW doesn't have to deal with special cases. + * Setting length=0 should be sufficient. + */ + dst->stag = 0; + dst->to = 0; + dst->length = 0; + } + + *p_len = tot; + *actual_count = acount; + return 0; +} + +/* + * Function: c2_activity (private function) + * + * Description: + * Post an mq index to the host->adapter activity fifo. + * + * IN: + * c2dev - ptr to c2dev structure + * mq_index - mq index to post + * shared - value most recently written to shared + * + * OUT: + * + * Return: + * none + */ +static inline void c2_activity(struct c2_dev *c2dev, u32 mq_index, u16 shared) +{ + /* + * First read the register to see if the FIFO is full, and if so, + * spin until it's not. This isn't perfect -- there is no + * synchronization among the clients of the register, but in + * practice it prevents multiple CPU from hammering the bus + * with PCI RETRY. Note that when this does happen, the card + * cannot get on the bus and the card and system hang in a + * deadlock -- thus the need for this code. [TOT] + */ + while (readl(c2dev->regs + PCI_BAR0_ADAPTER_HINT) & 0x80000000) + udelay(10); + + __raw_writel(C2_HINT_MAKE(mq_index, shared), + c2dev->regs + PCI_BAR0_ADAPTER_HINT); +} + +/* + * Function: qp_wr_post + * + * Description: + * This in-line function allocates a MQ msg, then moves the host-copy of + * the completed WR into msg. Then it posts the message. + * + * IN: + * q - ptr to user MQ. + * wr - ptr to host-copy of the WR. + * qp - ptr to user qp + * size - Number of bytes to post. Assumed to be divisible by 4. + * + * OUT: none + * + * Return: + * CCIL status codes. + */ +static int qp_wr_post(struct c2_mq *q, union c2wr * wr, struct c2_qp *qp, u32 size) +{ + union c2wr *msg; + + msg = c2_mq_alloc(q); + if (msg == NULL) { + return -EINVAL; + } +#ifdef CCMSGMAGIC + ((c2wr_hdr_t *) wr)->magic = cpu_to_be32(CCWR_MAGIC); +#endif + + /* + * Since all header fields in the WR are the same as the + * CQE, set the following so the adapter need not. + */ + c2_wr_set_result(wr, CCERR_PENDING); + + /* + * Copy the wr down to the adapter + */ + memcpy((void *) msg, (void *) wr, size); + + c2_mq_produce(q); + return 0; +} + + +int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, + struct ib_send_wr **bad_wr) +{ + struct c2_dev *c2dev = to_c2dev(ibqp->device); + struct c2_qp *qp = to_c2qp(ibqp); + union c2wr wr; + unsigned long lock_flags; + int err = 0; + + u32 flags; + u32 tot_len; + u8 actual_sge_count; + u32 msg_size; + + if (qp->state > IB_QPS_RTS) { + err = -EINVAL; + goto out; + } + + while (ib_wr) { + + flags = 0; + wr.sqwr.sq_hdr.user_hdr.hdr.context = ib_wr->wr_id; + if (ib_wr->send_flags & IB_SEND_SIGNALED) { + flags |= SQ_SIGNALED; + } + + switch (ib_wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_INV: + if (ib_wr->opcode == IB_WR_SEND) { + if (ib_wr->send_flags & IB_SEND_SOLICITED) + c2_wr_set_id(&wr, C2_WR_TYPE_SEND_SE); + else + c2_wr_set_id(&wr, C2_WR_TYPE_SEND); + wr.sqwr.send.remote_stag = 0; + } else { + if (ib_wr->send_flags & IB_SEND_SOLICITED) + c2_wr_set_id(&wr, C2_WR_TYPE_SEND_SE_INV); + else + c2_wr_set_id(&wr, C2_WR_TYPE_SEND_INV); + wr.sqwr.send.remote_stag = + cpu_to_be32(ib_wr->ex.invalidate_rkey); + } + + msg_size = sizeof(struct c2wr_send_req) + + sizeof(struct c2_data_addr) * ib_wr->num_sge; + if (ib_wr->num_sge > qp->send_sgl_depth) { + err = -EINVAL; + break; + } + if (ib_wr->send_flags & IB_SEND_FENCE) { + flags |= SQ_READ_FENCE; + } + err = move_sgl((struct c2_data_addr *) & (wr.sqwr.send.data), + ib_wr->sg_list, + ib_wr->num_sge, + &tot_len, &actual_sge_count); + wr.sqwr.send.sge_len = cpu_to_be32(tot_len); + c2_wr_set_sge_count(&wr, actual_sge_count); + break; + case IB_WR_RDMA_WRITE: + c2_wr_set_id(&wr, C2_WR_TYPE_RDMA_WRITE); + msg_size = sizeof(struct c2wr_rdma_write_req) + + (sizeof(struct c2_data_addr) * ib_wr->num_sge); + if (ib_wr->num_sge > qp->rdma_write_sgl_depth) { + err = -EINVAL; + break; + } + if (ib_wr->send_flags & IB_SEND_FENCE) { + flags |= SQ_READ_FENCE; + } + wr.sqwr.rdma_write.remote_stag = + cpu_to_be32(ib_wr->wr.rdma.rkey); + wr.sqwr.rdma_write.remote_to = + cpu_to_be64(ib_wr->wr.rdma.remote_addr); + err = move_sgl((struct c2_data_addr *) + & (wr.sqwr.rdma_write.data), + ib_wr->sg_list, + ib_wr->num_sge, + &tot_len, &actual_sge_count); + wr.sqwr.rdma_write.sge_len = cpu_to_be32(tot_len); + c2_wr_set_sge_count(&wr, actual_sge_count); + break; + case IB_WR_RDMA_READ: + c2_wr_set_id(&wr, C2_WR_TYPE_RDMA_READ); + msg_size = sizeof(struct c2wr_rdma_read_req); + + /* IWarp only suppots 1 sge for RDMA reads */ + if (ib_wr->num_sge > 1) { + err = -EINVAL; + break; + } + + /* + * Move the local and remote stag/to/len into the WR. + */ + wr.sqwr.rdma_read.local_stag = + cpu_to_be32(ib_wr->sg_list->lkey); + wr.sqwr.rdma_read.local_to = + cpu_to_be64(ib_wr->sg_list->addr); + wr.sqwr.rdma_read.remote_stag = + cpu_to_be32(ib_wr->wr.rdma.rkey); + wr.sqwr.rdma_read.remote_to = + cpu_to_be64(ib_wr->wr.rdma.remote_addr); + wr.sqwr.rdma_read.length = + cpu_to_be32(ib_wr->sg_list->length); + break; + default: + /* error */ + msg_size = 0; + err = -EINVAL; + break; + } + + /* + * If we had an error on the last wr build, then + * break out. Possible errors include bogus WR + * type, and a bogus SGL length... + */ + if (err) { + break; + } + + /* + * Store flags + */ + c2_wr_set_flags(&wr, flags); + + /* + * Post the puppy! + */ + spin_lock_irqsave(&qp->lock, lock_flags); + err = qp_wr_post(&qp->sq_mq, &wr, qp, msg_size); + if (err) { + spin_unlock_irqrestore(&qp->lock, lock_flags); + break; + } + + /* + * Enqueue mq index to activity FIFO. + */ + c2_activity(c2dev, qp->sq_mq.index, qp->sq_mq.hint_count); + spin_unlock_irqrestore(&qp->lock, lock_flags); + + ib_wr = ib_wr->next; + } + +out: + if (err) + *bad_wr = ib_wr; + return err; +} + +int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr, + struct ib_recv_wr **bad_wr) +{ + struct c2_dev *c2dev = to_c2dev(ibqp->device); + struct c2_qp *qp = to_c2qp(ibqp); + union c2wr wr; + unsigned long lock_flags; + int err = 0; + + if (qp->state > IB_QPS_RTS) { + err = -EINVAL; + goto out; + } + + /* + * Try and post each work request + */ + while (ib_wr) { + u32 tot_len; + u8 actual_sge_count; + + if (ib_wr->num_sge > qp->recv_sgl_depth) { + err = -EINVAL; + break; + } + + /* + * Create local host-copy of the WR + */ + wr.rqwr.rq_hdr.user_hdr.hdr.context = ib_wr->wr_id; + c2_wr_set_id(&wr, CCWR_RECV); + c2_wr_set_flags(&wr, 0); + + /* sge_count is limited to eight bits. */ + BUG_ON(ib_wr->num_sge >= 256); + err = move_sgl((struct c2_data_addr *) & (wr.rqwr.data), + ib_wr->sg_list, + ib_wr->num_sge, &tot_len, &actual_sge_count); + c2_wr_set_sge_count(&wr, actual_sge_count); + + /* + * If we had an error on the last wr build, then + * break out. Possible errors include bogus WR + * type, and a bogus SGL length... + */ + if (err) { + break; + } + + spin_lock_irqsave(&qp->lock, lock_flags); + err = qp_wr_post(&qp->rq_mq, &wr, qp, qp->rq_mq.msg_size); + if (err) { + spin_unlock_irqrestore(&qp->lock, lock_flags); + break; + } + + /* + * Enqueue mq index to activity FIFO + */ + c2_activity(c2dev, qp->rq_mq.index, qp->rq_mq.hint_count); + spin_unlock_irqrestore(&qp->lock, lock_flags); + + ib_wr = ib_wr->next; + } + +out: + if (err) + *bad_wr = ib_wr; + return err; +} + +void c2_init_qp_table(struct c2_dev *c2dev) +{ + spin_lock_init(&c2dev->qp_table.lock); + idr_init(&c2dev->qp_table.idr); +} + +void c2_cleanup_qp_table(struct c2_dev *c2dev) +{ + idr_destroy(&c2dev->qp_table.idr); +} diff --git a/drivers/staging/rdma/amso1100/c2_rnic.c b/drivers/staging/rdma/amso1100/c2_rnic.c new file mode 100644 index 000000000..d2a6d9613 --- /dev/null +++ b/drivers/staging/rdma/amso1100/c2_rnic.c @@ -0,0 +1,655 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include "c2.h" +#include "c2_vq.h" + +/* Device capabilities */ +#define C2_MIN_PAGESIZE 1024 + +#define C2_MAX_MRS 32768 +#define C2_MAX_QPS 16000 +#define C2_MAX_WQE_SZ 256 +#define C2_MAX_QP_WR ((128*1024)/C2_MAX_WQE_SZ) +#define C2_MAX_SGES 4 +#define C2_MAX_SGE_RD 1 +#define C2_MAX_CQS 32768 +#define C2_MAX_CQES 4096 +#define C2_MAX_PDS 16384 + +/* + * Send the adapter INIT message to the amso1100 + */ +static int c2_adapter_init(struct c2_dev *c2dev) +{ + struct c2wr_init_req wr; + int err; + + memset(&wr, 0, sizeof(wr)); + c2_wr_set_id(&wr, CCWR_INIT); + wr.hdr.context = 0; + wr.hint_count = cpu_to_be64(c2dev->hint_count_dma); + wr.q0_host_shared = cpu_to_be64(c2dev->req_vq.shared_dma); + wr.q1_host_shared = cpu_to_be64(c2dev->rep_vq.shared_dma); + wr.q1_host_msg_pool = cpu_to_be64(c2dev->rep_vq.host_dma); + wr.q2_host_shared = cpu_to_be64(c2dev->aeq.shared_dma); + wr.q2_host_msg_pool = cpu_to_be64(c2dev->aeq.host_dma); + + /* Post the init message */ + err = vq_send_wr(c2dev, (union c2wr *) & wr); + + return err; +} + +/* + * Send the adapter TERM message to the amso1100 + */ +static void c2_adapter_term(struct c2_dev *c2dev) +{ + struct c2wr_init_req wr; + + memset(&wr, 0, sizeof(wr)); + c2_wr_set_id(&wr, CCWR_TERM); + wr.hdr.context = 0; + + /* Post the init message */ + vq_send_wr(c2dev, (union c2wr *) & wr); + c2dev->init = 0; + + return; +} + +/* + * Query the adapter + */ +static int c2_rnic_query(struct c2_dev *c2dev, struct ib_device_attr *props) +{ + struct c2_vq_req *vq_req; + struct c2wr_rnic_query_req wr; + struct c2wr_rnic_query_rep *reply; + int err; + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + c2_wr_set_id(&wr, CCWR_RNIC_QUERY); + wr.hdr.context = (unsigned long) vq_req; + wr.rnic_handle = c2dev->adapter_handle; + + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, (union c2wr *) &wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail1; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail1; + + reply = + (struct c2wr_rnic_query_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) + err = -ENOMEM; + else + err = c2_errno(reply); + if (err) + goto bail2; + + props->fw_ver = + ((u64)be32_to_cpu(reply->fw_ver_major) << 32) | + ((be32_to_cpu(reply->fw_ver_minor) & 0xFFFF) << 16) | + (be32_to_cpu(reply->fw_ver_patch) & 0xFFFF); + memcpy(&props->sys_image_guid, c2dev->netdev->dev_addr, 6); + props->max_mr_size = 0xFFFFFFFF; + props->page_size_cap = ~(C2_MIN_PAGESIZE-1); + props->vendor_id = be32_to_cpu(reply->vendor_id); + props->vendor_part_id = be32_to_cpu(reply->part_number); + props->hw_ver = be32_to_cpu(reply->hw_version); + props->max_qp = be32_to_cpu(reply->max_qps); + props->max_qp_wr = be32_to_cpu(reply->max_qp_depth); + props->device_cap_flags = c2dev->device_cap_flags; + props->max_sge = C2_MAX_SGES; + props->max_sge_rd = C2_MAX_SGE_RD; + props->max_cq = be32_to_cpu(reply->max_cqs); + props->max_cqe = be32_to_cpu(reply->max_cq_depth); + props->max_mr = be32_to_cpu(reply->max_mrs); + props->max_pd = be32_to_cpu(reply->max_pds); + props->max_qp_rd_atom = be32_to_cpu(reply->max_qp_ird); + props->max_ee_rd_atom = 0; + props->max_res_rd_atom = be32_to_cpu(reply->max_global_ird); + props->max_qp_init_rd_atom = be32_to_cpu(reply->max_qp_ord); + props->max_ee_init_rd_atom = 0; + props->atomic_cap = IB_ATOMIC_NONE; + props->max_ee = 0; + props->max_rdd = 0; + props->max_mw = be32_to_cpu(reply->max_mws); + props->max_raw_ipv6_qp = 0; + props->max_raw_ethy_qp = 0; + props->max_mcast_grp = 0; + props->max_mcast_qp_attach = 0; + props->max_total_mcast_qp_attach = 0; + props->max_ah = 0; + props->max_fmr = 0; + props->max_map_per_fmr = 0; + props->max_srq = 0; + props->max_srq_wr = 0; + props->max_srq_sge = 0; + props->max_pkeys = 0; + props->local_ca_ack_delay = 0; + + bail2: + vq_repbuf_free(c2dev, reply); + + bail1: + vq_req_free(c2dev, vq_req); + return err; +} + +/* + * Add an IP address to the RNIC interface + */ +int c2_add_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask) +{ + struct c2_vq_req *vq_req; + struct c2wr_rnic_setconfig_req *wr; + struct c2wr_rnic_setconfig_rep *reply; + struct c2_netaddr netaddr; + int err, len; + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + len = sizeof(struct c2_netaddr); + wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL); + if (!wr) { + err = -ENOMEM; + goto bail0; + } + + c2_wr_set_id(wr, CCWR_RNIC_SETCONFIG); + wr->hdr.context = (unsigned long) vq_req; + wr->rnic_handle = c2dev->adapter_handle; + wr->option = cpu_to_be32(C2_CFG_ADD_ADDR); + + netaddr.ip_addr = inaddr; + netaddr.netmask = inmask; + netaddr.mtu = 0; + + memcpy(wr->data, &netaddr, len); + + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, (union c2wr *) wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail1; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail1; + + reply = + (struct c2wr_rnic_setconfig_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail1; + } + + err = c2_errno(reply); + vq_repbuf_free(c2dev, reply); + + bail1: + kfree(wr); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +/* + * Delete an IP address from the RNIC interface + */ +int c2_del_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask) +{ + struct c2_vq_req *vq_req; + struct c2wr_rnic_setconfig_req *wr; + struct c2wr_rnic_setconfig_rep *reply; + struct c2_netaddr netaddr; + int err, len; + + vq_req = vq_req_alloc(c2dev); + if (!vq_req) + return -ENOMEM; + + len = sizeof(struct c2_netaddr); + wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL); + if (!wr) { + err = -ENOMEM; + goto bail0; + } + + c2_wr_set_id(wr, CCWR_RNIC_SETCONFIG); + wr->hdr.context = (unsigned long) vq_req; + wr->rnic_handle = c2dev->adapter_handle; + wr->option = cpu_to_be32(C2_CFG_DEL_ADDR); + + netaddr.ip_addr = inaddr; + netaddr.netmask = inmask; + netaddr.mtu = 0; + + memcpy(wr->data, &netaddr, len); + + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, (union c2wr *) wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail1; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) + goto bail1; + + reply = + (struct c2wr_rnic_setconfig_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail1; + } + + err = c2_errno(reply); + vq_repbuf_free(c2dev, reply); + + bail1: + kfree(wr); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +/* + * Open a single RNIC instance to use with all + * low level openib calls + */ +static int c2_rnic_open(struct c2_dev *c2dev) +{ + struct c2_vq_req *vq_req; + union c2wr wr; + struct c2wr_rnic_open_rep *reply; + int err; + + vq_req = vq_req_alloc(c2dev); + if (vq_req == NULL) { + return -ENOMEM; + } + + memset(&wr, 0, sizeof(wr)); + c2_wr_set_id(&wr, CCWR_RNIC_OPEN); + wr.rnic_open.req.hdr.context = (unsigned long) (vq_req); + wr.rnic_open.req.flags = cpu_to_be16(RNIC_PRIV_MODE); + wr.rnic_open.req.port_num = cpu_to_be16(0); + wr.rnic_open.req.user_context = (unsigned long) c2dev; + + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, &wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) { + goto bail0; + } + + reply = (struct c2wr_rnic_open_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail0; + } + + if ((err = c2_errno(reply)) != 0) { + goto bail1; + } + + c2dev->adapter_handle = reply->rnic_handle; + + bail1: + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +/* + * Close the RNIC instance + */ +static int c2_rnic_close(struct c2_dev *c2dev) +{ + struct c2_vq_req *vq_req; + union c2wr wr; + struct c2wr_rnic_close_rep *reply; + int err; + + vq_req = vq_req_alloc(c2dev); + if (vq_req == NULL) { + return -ENOMEM; + } + + memset(&wr, 0, sizeof(wr)); + c2_wr_set_id(&wr, CCWR_RNIC_CLOSE); + wr.rnic_close.req.hdr.context = (unsigned long) vq_req; + wr.rnic_close.req.rnic_handle = c2dev->adapter_handle; + + vq_req_get(c2dev, vq_req); + + err = vq_send_wr(c2dev, &wr); + if (err) { + vq_req_put(c2dev, vq_req); + goto bail0; + } + + err = vq_wait_for_reply(c2dev, vq_req); + if (err) { + goto bail0; + } + + reply = (struct c2wr_rnic_close_rep *) (unsigned long) (vq_req->reply_msg); + if (!reply) { + err = -ENOMEM; + goto bail0; + } + + if ((err = c2_errno(reply)) != 0) { + goto bail1; + } + + c2dev->adapter_handle = 0; + + bail1: + vq_repbuf_free(c2dev, reply); + bail0: + vq_req_free(c2dev, vq_req); + return err; +} + +/* + * Called by c2_probe to initialize the RNIC. This principally + * involves initializing the various limits and resource pools that + * comprise the RNIC instance. + */ +int c2_rnic_init(struct c2_dev *c2dev) +{ + int err; + u32 qsize, msgsize; + void *q1_pages; + void *q2_pages; + void __iomem *mmio_regs; + + /* Device capabilities */ + c2dev->device_cap_flags = + (IB_DEVICE_RESIZE_MAX_WR | + IB_DEVICE_CURR_QP_STATE_MOD | + IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_LOCAL_DMA_LKEY | + IB_DEVICE_MEM_WINDOW); + + /* Allocate the qptr_array */ + c2dev->qptr_array = vzalloc(C2_MAX_CQS * sizeof(void *)); + if (!c2dev->qptr_array) { + return -ENOMEM; + } + + /* Initialize the qptr_array */ + c2dev->qptr_array[0] = (void *) &c2dev->req_vq; + c2dev->qptr_array[1] = (void *) &c2dev->rep_vq; + c2dev->qptr_array[2] = (void *) &c2dev->aeq; + + /* Initialize data structures */ + init_waitqueue_head(&c2dev->req_vq_wo); + spin_lock_init(&c2dev->vqlock); + spin_lock_init(&c2dev->lock); + + /* Allocate MQ shared pointer pool for kernel clients. User + * mode client pools are hung off the user context + */ + err = c2_init_mqsp_pool(c2dev, GFP_KERNEL, &c2dev->kern_mqsp_pool); + if (err) { + goto bail0; + } + + /* Allocate shared pointers for Q0, Q1, and Q2 from + * the shared pointer pool. + */ + + c2dev->hint_count = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool, + &c2dev->hint_count_dma, + GFP_KERNEL); + c2dev->req_vq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool, + &c2dev->req_vq.shared_dma, + GFP_KERNEL); + c2dev->rep_vq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool, + &c2dev->rep_vq.shared_dma, + GFP_KERNEL); + c2dev->aeq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool, + &c2dev->aeq.shared_dma, GFP_KERNEL); + if (!c2dev->hint_count || !c2dev->req_vq.shared || + !c2dev->rep_vq.shared || !c2dev->aeq.shared) { + err = -ENOMEM; + goto bail1; + } + + mmio_regs = c2dev->kva; + /* Initialize the Verbs Request Queue */ + c2_mq_req_init(&c2dev->req_vq, 0, + be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_QSIZE)), + be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_MSGSIZE)), + mmio_regs + + be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_POOLSTART)), + mmio_regs + + be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_SHARED)), + C2_MQ_ADAPTER_TARGET); + + /* Initialize the Verbs Reply Queue */ + qsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_QSIZE)); + msgsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_MSGSIZE)); + q1_pages = dma_alloc_coherent(&c2dev->pcidev->dev, qsize * msgsize, + &c2dev->rep_vq.host_dma, GFP_KERNEL); + if (!q1_pages) { + err = -ENOMEM; + goto bail1; + } + dma_unmap_addr_set(&c2dev->rep_vq, mapping, c2dev->rep_vq.host_dma); + pr_debug("%s rep_vq va %p dma %llx\n", __func__, q1_pages, + (unsigned long long) c2dev->rep_vq.host_dma); + c2_mq_rep_init(&c2dev->rep_vq, + 1, + qsize, + msgsize, + q1_pages, + mmio_regs + + be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_SHARED)), + C2_MQ_HOST_TARGET); + + /* Initialize the Asynchronus Event Queue */ + qsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_QSIZE)); + msgsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_MSGSIZE)); + q2_pages = dma_alloc_coherent(&c2dev->pcidev->dev, qsize * msgsize, + &c2dev->aeq.host_dma, GFP_KERNEL); + if (!q2_pages) { + err = -ENOMEM; + goto bail2; + } + dma_unmap_addr_set(&c2dev->aeq, mapping, c2dev->aeq.host_dma); + pr_debug("%s aeq va %p dma %llx\n", __func__, q2_pages, + (unsigned long long) c2dev->aeq.host_dma); + c2_mq_rep_init(&c2dev->aeq, + 2, + qsize, + msgsize, + q2_pages, + mmio_regs + + be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_SHARED)), + C2_MQ_HOST_TARGET); + + /* Initialize the verbs request allocator */ + err = vq_init(c2dev); + if (err) + goto bail3; + + /* Enable interrupts on the adapter */ + writel(0, c2dev->regs + C2_IDIS); + + /* create the WR init message */ + err = c2_adapter_init(c2dev); + if (err) + goto bail4; + c2dev->init++; + + /* open an adapter instance */ + err = c2_rnic_open(c2dev); + if (err) + goto bail4; + + /* Initialize cached the adapter limits */ + err = c2_rnic_query(c2dev, &c2dev->props); + if (err) + goto bail5; + + /* Initialize the PD pool */ + err = c2_init_pd_table(c2dev); + if (err) + goto bail5; + + /* Initialize the QP pool */ + c2_init_qp_table(c2dev); + return 0; + + bail5: + c2_rnic_close(c2dev); + bail4: + vq_term(c2dev); + bail3: + dma_free_coherent(&c2dev->pcidev->dev, + c2dev->aeq.q_size * c2dev->aeq.msg_size, + q2_pages, dma_unmap_addr(&c2dev->aeq, mapping)); + bail2: + dma_free_coherent(&c2dev->pcidev->dev, + c2dev->rep_vq.q_size * c2dev->rep_vq.msg_size, + q1_pages, dma_unmap_addr(&c2dev->rep_vq, mapping)); + bail1: + c2_free_mqsp_pool(c2dev, c2dev->kern_mqsp_pool); + bail0: + vfree(c2dev->qptr_array); + + return err; +} + +/* + * Called by c2_remove to cleanup the RNIC resources. + */ +void c2_rnic_term(struct c2_dev *c2dev) +{ + + /* Close the open adapter instance */ + c2_rnic_close(c2dev); + + /* Send the TERM message to the adapter */ + c2_adapter_term(c2dev); + + /* Disable interrupts on the adapter */ + writel(1, c2dev->regs + C2_IDIS); + + /* Free the QP pool */ + c2_cleanup_qp_table(c2dev); + + /* Free the PD pool */ + c2_cleanup_pd_table(c2dev); + + /* Free the verbs request allocator */ + vq_term(c2dev); + + /* Free the asynchronus event queue */ + dma_free_coherent(&c2dev->pcidev->dev, + c2dev->aeq.q_size * c2dev->aeq.msg_size, + c2dev->aeq.msg_pool.host, + dma_unmap_addr(&c2dev->aeq, mapping)); + + /* Free the verbs reply queue */ + dma_free_coherent(&c2dev->pcidev->dev, + c2dev->rep_vq.q_size * c2dev->rep_vq.msg_size, + c2dev->rep_vq.msg_pool.host, + dma_unmap_addr(&c2dev->rep_vq, mapping)); + + /* Free the MQ shared pointer pool */ + c2_free_mqsp_pool(c2dev, c2dev->kern_mqsp_pool); + + /* Free the qptr_array */ + vfree(c2dev->qptr_array); + + return; +} diff --git a/drivers/staging/rdma/amso1100/c2_status.h b/drivers/staging/rdma/amso1100/c2_status.h new file mode 100644 index 000000000..6ee4aa92d --- /dev/null +++ b/drivers/staging/rdma/amso1100/c2_status.h @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _C2_STATUS_H_ +#define _C2_STATUS_H_ + +/* + * Verbs Status Codes + */ +enum c2_status { + C2_OK = 0, /* This must be zero */ + CCERR_INSUFFICIENT_RESOURCES = 1, + CCERR_INVALID_MODIFIER = 2, + CCERR_INVALID_MODE = 3, + CCERR_IN_USE = 4, + CCERR_INVALID_RNIC = 5, + CCERR_INTERRUPTED_OPERATION = 6, + CCERR_INVALID_EH = 7, + CCERR_INVALID_CQ = 8, + CCERR_CQ_EMPTY = 9, + CCERR_NOT_IMPLEMENTED = 10, + CCERR_CQ_DEPTH_TOO_SMALL = 11, + CCERR_PD_IN_USE = 12, + CCERR_INVALID_PD = 13, + CCERR_INVALID_SRQ = 14, + CCERR_INVALID_ADDRESS = 15, + CCERR_INVALID_NETMASK = 16, + CCERR_INVALID_QP = 17, + CCERR_INVALID_QP_STATE = 18, + CCERR_TOO_MANY_WRS_POSTED = 19, + CCERR_INVALID_WR_TYPE = 20, + CCERR_INVALID_SGL_LENGTH = 21, + CCERR_INVALID_SQ_DEPTH = 22, + CCERR_INVALID_RQ_DEPTH = 23, + CCERR_INVALID_ORD = 24, + CCERR_INVALID_IRD = 25, + CCERR_QP_ATTR_CANNOT_CHANGE = 26, + CCERR_INVALID_STAG = 27, + CCERR_QP_IN_USE = 28, + CCERR_OUTSTANDING_WRS = 29, + CCERR_STAG_IN_USE = 30, + CCERR_INVALID_STAG_INDEX = 31, + CCERR_INVALID_SGL_FORMAT = 32, + CCERR_ADAPTER_TIMEOUT = 33, + CCERR_INVALID_CQ_DEPTH = 34, + CCERR_INVALID_PRIVATE_DATA_LENGTH = 35, + CCERR_INVALID_EP = 36, + CCERR_MR_IN_USE = CCERR_STAG_IN_USE, + CCERR_FLUSHED = 38, + CCERR_INVALID_WQE = 39, + CCERR_LOCAL_QP_CATASTROPHIC_ERROR = 40, + CCERR_REMOTE_TERMINATION_ERROR = 41, + CCERR_BASE_AND_BOUNDS_VIOLATION = 42, + CCERR_ACCESS_VIOLATION = 43, + CCERR_INVALID_PD_ID = 44, + CCERR_WRAP_ERROR = 45, + CCERR_INV_STAG_ACCESS_ERROR = 46, + CCERR_ZERO_RDMA_READ_RESOURCES = 47, + CCERR_QP_NOT_PRIVILEGED = 48, + CCERR_STAG_STATE_NOT_INVALID = 49, + CCERR_INVALID_PAGE_SIZE = 50, + CCERR_INVALID_BUFFER_SIZE = 51, + CCERR_INVALID_PBE = 52, + CCERR_INVALID_FBO = 53, + CCERR_INVALID_LENGTH = 54, + CCERR_INVALID_ACCESS_RIGHTS = 55, + CCERR_PBL_TOO_BIG = 56, + CCERR_INVALID_VA = 57, + CCERR_INVALID_REGION = 58, + CCERR_INVALID_WINDOW = 59, + CCERR_TOTAL_LENGTH_TOO_BIG = 60, + CCERR_INVALID_QP_ID = 61, + CCERR_ADDR_IN_USE = 62, + CCERR_ADDR_NOT_AVAIL = 63, + CCERR_NET_DOWN = 64, + CCERR_NET_UNREACHABLE = 65, + CCERR_CONN_ABORTED = 66, + CCERR_CONN_RESET = 67, + CCERR_NO_BUFS = 68, + CCERR_CONN_TIMEDOUT = 69, + CCERR_CONN_REFUSED = 70, + CCERR_HOST_UNREACHABLE = 71, + CCERR_INVALID_SEND_SGL_DEPTH = 72, + CCERR_INVALID_RECV_SGL_DEPTH = 73, + CCERR_INVALID_RDMA_WRITE_SGL_DEPTH = 74, + CCERR_INSUFFICIENT_PRIVILEGES = 75, + CCERR_STACK_ERROR = 76, + CCERR_INVALID_VERSION = 77, + CCERR_INVALID_MTU = 78, + CCERR_INVALID_IMAGE = 79, + CCERR_PENDING = 98, /* not an error; user internally by adapter */ + CCERR_DEFER = 99, /* not an error; used internally by adapter */ + CCERR_FAILED_WRITE = 100, + CCERR_FAILED_ERASE = 101, + CCERR_FAILED_VERIFICATION = 102, + CCERR_NOT_FOUND = 103, + +}; + +/* + * CCAE_ACTIVE_CONNECT_RESULTS status result codes. + */ +enum c2_connect_status { + C2_CONN_STATUS_SUCCESS = C2_OK, + C2_CONN_STATUS_NO_MEM = CCERR_INSUFFICIENT_RESOURCES, + C2_CONN_STATUS_TIMEDOUT = CCERR_CONN_TIMEDOUT, + C2_CONN_STATUS_REFUSED = CCERR_CONN_REFUSED, + C2_CONN_STATUS_NETUNREACH = CCERR_NET_UNREACHABLE, + C2_CONN_STATUS_HOSTUNREACH = CCERR_HOST_UNREACHABLE, + C2_CONN_STATUS_INVALID_RNIC = CCERR_INVALID_RNIC, + C2_CONN_STATUS_INVALID_QP = CCERR_INVALID_QP, + C2_CONN_STATUS_INVALID_QP_STATE = CCERR_INVALID_QP_STATE, + C2_CONN_STATUS_REJECTED = CCERR_CONN_RESET, + C2_CONN_STATUS_ADDR_NOT_AVAIL = CCERR_ADDR_NOT_AVAIL, +}; + +/* + * Flash programming status codes. + */ +enum c2_flash_status { + C2_FLASH_STATUS_SUCCESS = 0x0000, + C2_FLASH_STATUS_VERIFY_ERR = 0x0002, + C2_FLASH_STATUS_IMAGE_ERR = 0x0004, + C2_FLASH_STATUS_ECLBS = 0x0400, + C2_FLASH_STATUS_PSLBS = 0x0800, + C2_FLASH_STATUS_VPENS = 0x1000, +}; + +#endif /* _C2_STATUS_H_ */ diff --git a/drivers/staging/rdma/amso1100/c2_user.h b/drivers/staging/rdma/amso1100/c2_user.h new file mode 100644 index 000000000..7e9e7ad65 --- /dev/null +++ b/drivers/staging/rdma/amso1100/c2_user.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Cisco Systems. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef C2_USER_H +#define C2_USER_H + +#include + +/* + * Make sure that all structs defined in this file remain laid out so + * that they pack the same way on 32-bit and 64-bit architectures (to + * avoid incompatibility between 32-bit userspace and 64-bit kernels). + * In particular do not use pointer types -- pass pointers in __u64 + * instead. + */ + +struct c2_alloc_ucontext_resp { + __u32 qp_tab_size; + __u32 uarc_size; +}; + +struct c2_alloc_pd_resp { + __u32 pdn; + __u32 reserved; +}; + +struct c2_create_cq { + __u32 lkey; + __u32 pdn; + __u64 arm_db_page; + __u64 set_db_page; + __u32 arm_db_index; + __u32 set_db_index; +}; + +struct c2_create_cq_resp { + __u32 cqn; + __u32 reserved; +}; + +struct c2_create_qp { + __u32 lkey; + __u32 reserved; + __u64 sq_db_page; + __u64 rq_db_page; + __u32 sq_db_index; + __u32 rq_db_index; +}; + +#endif /* C2_USER_H */ diff --git a/drivers/staging/rdma/amso1100/c2_vq.c b/drivers/staging/rdma/amso1100/c2_vq.c new file mode 100644 index 000000000..2ec716fb2 --- /dev/null +++ b/drivers/staging/rdma/amso1100/c2_vq.c @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include + +#include "c2_vq.h" +#include "c2_provider.h" + +/* + * Verbs Request Objects: + * + * VQ Request Objects are allocated by the kernel verbs handlers. + * They contain a wait object, a refcnt, an atomic bool indicating that the + * adapter has replied, and a copy of the verb reply work request. + * A pointer to the VQ Request Object is passed down in the context + * field of the work request message, and reflected back by the adapter + * in the verbs reply message. The function handle_vq() in the interrupt + * path will use this pointer to: + * 1) append a copy of the verbs reply message + * 2) mark that the reply is ready + * 3) wake up the kernel verbs handler blocked awaiting the reply. + * + * + * The kernel verbs handlers do a "get" to put a 2nd reference on the + * VQ Request object. If the kernel verbs handler exits before the adapter + * can respond, this extra reference will keep the VQ Request object around + * until the adapter's reply can be processed. The reason we need this is + * because a pointer to this object is stuffed into the context field of + * the verbs work request message, and reflected back in the reply message. + * It is used in the interrupt handler (handle_vq()) to wake up the appropriate + * kernel verb handler that is blocked awaiting the verb reply. + * So handle_vq() will do a "put" on the object when it's done accessing it. + * NOTE: If we guarantee that the kernel verb handler will never bail before + * getting the reply, then we don't need these refcnts. + * + * + * VQ Request objects are freed by the kernel verbs handlers only + * after the verb has been processed, or when the adapter fails and + * does not reply. + * + * + * Verbs Reply Buffers: + * + * VQ Reply bufs are local host memory copies of a + * outstanding Verb Request reply + * message. The are always allocated by the kernel verbs handlers, and _may_ be + * freed by either the kernel verbs handler -or- the interrupt handler. The + * kernel verbs handler _must_ free the repbuf, then free the vq request object + * in that order. + */ + +int vq_init(struct c2_dev *c2dev) +{ + sprintf(c2dev->vq_cache_name, "c2-vq:dev%c", + (char) ('0' + c2dev->devnum)); + c2dev->host_msg_cache = + kmem_cache_create(c2dev->vq_cache_name, c2dev->rep_vq.msg_size, 0, + SLAB_HWCACHE_ALIGN, NULL); + if (c2dev->host_msg_cache == NULL) { + return -ENOMEM; + } + return 0; +} + +void vq_term(struct c2_dev *c2dev) +{ + kmem_cache_destroy(c2dev->host_msg_cache); +} + +/* vq_req_alloc - allocate a VQ Request Object and initialize it. + * The refcnt is set to 1. + */ +struct c2_vq_req *vq_req_alloc(struct c2_dev *c2dev) +{ + struct c2_vq_req *r; + + r = kmalloc(sizeof(struct c2_vq_req), GFP_KERNEL); + if (r) { + init_waitqueue_head(&r->wait_object); + r->reply_msg = 0; + r->event = 0; + r->cm_id = NULL; + r->qp = NULL; + atomic_set(&r->refcnt, 1); + atomic_set(&r->reply_ready, 0); + } + return r; +} + + +/* vq_req_free - free the VQ Request Object. It is assumed the verbs handler + * has already free the VQ Reply Buffer if it existed. + */ +void vq_req_free(struct c2_dev *c2dev, struct c2_vq_req *r) +{ + r->reply_msg = 0; + if (atomic_dec_and_test(&r->refcnt)) { + kfree(r); + } +} + +/* vq_req_get - reference a VQ Request Object. Done + * only in the kernel verbs handlers. + */ +void vq_req_get(struct c2_dev *c2dev, struct c2_vq_req *r) +{ + atomic_inc(&r->refcnt); +} + + +/* vq_req_put - dereference and potentially free a VQ Request Object. + * + * This is only called by handle_vq() on the + * interrupt when it is done processing + * a verb reply message. If the associated + * kernel verbs handler has already bailed, + * then this put will actually free the VQ + * Request object _and_ the VQ Reply Buffer + * if it exists. + */ +void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *r) +{ + if (atomic_dec_and_test(&r->refcnt)) { + if (r->reply_msg != 0) + vq_repbuf_free(c2dev, + (void *) (unsigned long) r->reply_msg); + kfree(r); + } +} + + +/* + * vq_repbuf_alloc - allocate a VQ Reply Buffer. + */ +void *vq_repbuf_alloc(struct c2_dev *c2dev) +{ + return kmem_cache_alloc(c2dev->host_msg_cache, GFP_ATOMIC); +} + +/* + * vq_send_wr - post a verbs request message to the Verbs Request Queue. + * If a message is not available in the MQ, then block until one is available. + * NOTE: handle_mq() on the interrupt context will wake up threads blocked here. + * When the adapter drains the Verbs Request Queue, + * it inserts MQ index 0 in to the + * adapter->host activity fifo and interrupts the host. + */ +int vq_send_wr(struct c2_dev *c2dev, union c2wr *wr) +{ + void *msg; + wait_queue_t __wait; + + /* + * grab adapter vq lock + */ + spin_lock(&c2dev->vqlock); + + /* + * allocate msg + */ + msg = c2_mq_alloc(&c2dev->req_vq); + + /* + * If we cannot get a msg, then we'll wait + * When a messages are available, the int handler will wake_up() + * any waiters. + */ + while (msg == NULL) { + pr_debug("%s:%d no available msg in VQ, waiting...\n", + __func__, __LINE__); + init_waitqueue_entry(&__wait, current); + add_wait_queue(&c2dev->req_vq_wo, &__wait); + spin_unlock(&c2dev->vqlock); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (!c2_mq_full(&c2dev->req_vq)) { + break; + } + if (!signal_pending(current)) { + schedule_timeout(1 * HZ); /* 1 second... */ + continue; + } + set_current_state(TASK_RUNNING); + remove_wait_queue(&c2dev->req_vq_wo, &__wait); + return -EINTR; + } + set_current_state(TASK_RUNNING); + remove_wait_queue(&c2dev->req_vq_wo, &__wait); + spin_lock(&c2dev->vqlock); + msg = c2_mq_alloc(&c2dev->req_vq); + } + + /* + * copy wr into adapter msg + */ + memcpy(msg, wr, c2dev->req_vq.msg_size); + + /* + * post msg + */ + c2_mq_produce(&c2dev->req_vq); + + /* + * release adapter vq lock + */ + spin_unlock(&c2dev->vqlock); + return 0; +} + + +/* + * vq_wait_for_reply - block until the adapter posts a Verb Reply Message. + */ +int vq_wait_for_reply(struct c2_dev *c2dev, struct c2_vq_req *req) +{ + if (!wait_event_timeout(req->wait_object, + atomic_read(&req->reply_ready), + 60*HZ)) + return -ETIMEDOUT; + + return 0; +} + +/* + * vq_repbuf_free - Free a Verbs Reply Buffer. + */ +void vq_repbuf_free(struct c2_dev *c2dev, void *reply) +{ + kmem_cache_free(c2dev->host_msg_cache, reply); +} diff --git a/drivers/staging/rdma/amso1100/c2_vq.h b/drivers/staging/rdma/amso1100/c2_vq.h new file mode 100644 index 000000000..33805627a --- /dev/null +++ b/drivers/staging/rdma/amso1100/c2_vq.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _C2_VQ_H_ +#define _C2_VQ_H_ +#include +#include "c2.h" +#include "c2_wr.h" +#include "c2_provider.h" + +struct c2_vq_req { + u64 reply_msg; /* ptr to reply msg */ + wait_queue_head_t wait_object; /* wait object for vq reqs */ + atomic_t reply_ready; /* set when reply is ready */ + atomic_t refcnt; /* used to cancel WRs... */ + int event; + struct iw_cm_id *cm_id; + struct c2_qp *qp; +}; + +extern int vq_init(struct c2_dev *c2dev); +extern void vq_term(struct c2_dev *c2dev); + +extern struct c2_vq_req *vq_req_alloc(struct c2_dev *c2dev); +extern void vq_req_free(struct c2_dev *c2dev, struct c2_vq_req *req); +extern void vq_req_get(struct c2_dev *c2dev, struct c2_vq_req *req); +extern void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *req); +extern int vq_send_wr(struct c2_dev *c2dev, union c2wr * wr); + +extern void *vq_repbuf_alloc(struct c2_dev *c2dev); +extern void vq_repbuf_free(struct c2_dev *c2dev, void *reply); + +extern int vq_wait_for_reply(struct c2_dev *c2dev, struct c2_vq_req *req); +#endif /* _C2_VQ_H_ */ diff --git a/drivers/staging/rdma/amso1100/c2_wr.h b/drivers/staging/rdma/amso1100/c2_wr.h new file mode 100644 index 000000000..8d4b4ca46 --- /dev/null +++ b/drivers/staging/rdma/amso1100/c2_wr.h @@ -0,0 +1,1520 @@ +/* + * Copyright (c) 2005 Ammasso, Inc. All rights reserved. + * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef _C2_WR_H_ +#define _C2_WR_H_ + +#ifdef CCDEBUG +#define CCWR_MAGIC 0xb07700b0 +#endif + +#define C2_QP_NO_ATTR_CHANGE 0xFFFFFFFF + +/* Maximum allowed size in bytes of private_data exchange + * on connect. + */ +#define C2_MAX_PRIVATE_DATA_SIZE 200 + +/* + * These types are shared among the adapter, host, and CCIL consumer. + */ +enum c2_cq_notification_type { + C2_CQ_NOTIFICATION_TYPE_NONE = 1, + C2_CQ_NOTIFICATION_TYPE_NEXT, + C2_CQ_NOTIFICATION_TYPE_NEXT_SE +}; + +enum c2_setconfig_cmd { + C2_CFG_ADD_ADDR = 1, + C2_CFG_DEL_ADDR = 2, + C2_CFG_ADD_ROUTE = 3, + C2_CFG_DEL_ROUTE = 4 +}; + +enum c2_getconfig_cmd { + C2_GETCONFIG_ROUTES = 1, + C2_GETCONFIG_ADDRS +}; + +/* + * CCIL Work Request Identifiers + */ +enum c2wr_ids { + CCWR_RNIC_OPEN = 1, + CCWR_RNIC_QUERY, + CCWR_RNIC_SETCONFIG, + CCWR_RNIC_GETCONFIG, + CCWR_RNIC_CLOSE, + CCWR_CQ_CREATE, + CCWR_CQ_QUERY, + CCWR_CQ_MODIFY, + CCWR_CQ_DESTROY, + CCWR_QP_CONNECT, + CCWR_PD_ALLOC, + CCWR_PD_DEALLOC, + CCWR_SRQ_CREATE, + CCWR_SRQ_QUERY, + CCWR_SRQ_MODIFY, + CCWR_SRQ_DESTROY, + CCWR_QP_CREATE, + CCWR_QP_QUERY, + CCWR_QP_MODIFY, + CCWR_QP_DESTROY, + CCWR_NSMR_STAG_ALLOC, + CCWR_NSMR_REGISTER, + CCWR_NSMR_PBL, + CCWR_STAG_DEALLOC, + CCWR_NSMR_REREGISTER, + CCWR_SMR_REGISTER, + CCWR_MR_QUERY, + CCWR_MW_ALLOC, + CCWR_MW_QUERY, + CCWR_EP_CREATE, + CCWR_EP_GETOPT, + CCWR_EP_SETOPT, + CCWR_EP_DESTROY, + CCWR_EP_BIND, + CCWR_EP_CONNECT, + CCWR_EP_LISTEN, + CCWR_EP_SHUTDOWN, + CCWR_EP_LISTEN_CREATE, + CCWR_EP_LISTEN_DESTROY, + CCWR_EP_QUERY, + CCWR_CR_ACCEPT, + CCWR_CR_REJECT, + CCWR_CONSOLE, + CCWR_TERM, + CCWR_FLASH_INIT, + CCWR_FLASH, + CCWR_BUF_ALLOC, + CCWR_BUF_FREE, + CCWR_FLASH_WRITE, + CCWR_INIT, /* WARNING: Don't move this ever again! */ + + + + /* Add new IDs here */ + + + + /* + * WARNING: CCWR_LAST must always be the last verbs id defined! + * All the preceding IDs are fixed, and must not change. + * You can add new IDs, but must not remove or reorder + * any IDs. If you do, YOU will ruin any hope of + * compatibility between versions. + */ + CCWR_LAST, + + /* + * Start over at 1 so that arrays indexed by user wr id's + * begin at 1. This is OK since the verbs and user wr id's + * are always used on disjoint sets of queues. + */ + /* + * The order of the CCWR_SEND_XX verbs must + * match the order of the RDMA_OPs + */ + CCWR_SEND = 1, + CCWR_SEND_INV, + CCWR_SEND_SE, + CCWR_SEND_SE_INV, + CCWR_RDMA_WRITE, + CCWR_RDMA_READ, + CCWR_RDMA_READ_INV, + CCWR_MW_BIND, + CCWR_NSMR_FASTREG, + CCWR_STAG_INVALIDATE, + CCWR_RECV, + CCWR_NOP, + CCWR_UNIMPL, +/* WARNING: This must always be the last user wr id defined! */ +}; +#define RDMA_SEND_OPCODE_FROM_WR_ID(x) (x+2) + +/* + * SQ/RQ Work Request Types + */ +enum c2_wr_type { + C2_WR_TYPE_SEND = CCWR_SEND, + C2_WR_TYPE_SEND_SE = CCWR_SEND_SE, + C2_WR_TYPE_SEND_INV = CCWR_SEND_INV, + C2_WR_TYPE_SEND_SE_INV = CCWR_SEND_SE_INV, + C2_WR_TYPE_RDMA_WRITE = CCWR_RDMA_WRITE, + C2_WR_TYPE_RDMA_READ = CCWR_RDMA_READ, + C2_WR_TYPE_RDMA_READ_INV_STAG = CCWR_RDMA_READ_INV, + C2_WR_TYPE_BIND_MW = CCWR_MW_BIND, + C2_WR_TYPE_FASTREG_NSMR = CCWR_NSMR_FASTREG, + C2_WR_TYPE_INV_STAG = CCWR_STAG_INVALIDATE, + C2_WR_TYPE_RECV = CCWR_RECV, + C2_WR_TYPE_NOP = CCWR_NOP, +}; + +struct c2_netaddr { + __be32 ip_addr; + __be32 netmask; + u32 mtu; +}; + +struct c2_route { + u32 ip_addr; /* 0 indicates the default route */ + u32 netmask; /* netmask associated with dst */ + u32 flags; + union { + u32 ipaddr; /* address of the nexthop interface */ + u8 enaddr[6]; + } nexthop; +}; + +/* + * A Scatter Gather Entry. + */ +struct c2_data_addr { + __be32 stag; + __be32 length; + __be64 to; +}; + +/* + * MR and MW flags used by the consumer, RI, and RNIC. + */ +enum c2_mm_flags { + MEM_REMOTE = 0x0001, /* allow mw binds with remote access. */ + MEM_VA_BASED = 0x0002, /* Not Zero-based */ + MEM_PBL_COMPLETE = 0x0004, /* PBL array is complete in this msg */ + MEM_LOCAL_READ = 0x0008, /* allow local reads */ + MEM_LOCAL_WRITE = 0x0010, /* allow local writes */ + MEM_REMOTE_READ = 0x0020, /* allow remote reads */ + MEM_REMOTE_WRITE = 0x0040, /* allow remote writes */ + MEM_WINDOW_BIND = 0x0080, /* binds allowed */ + MEM_SHARED = 0x0100, /* set if MR is shared */ + MEM_STAG_VALID = 0x0200 /* set if STAG is in valid state */ +}; + +/* + * CCIL API ACF flags defined in terms of the low level mem flags. + * This minimizes translation needed in the user API + */ +enum c2_acf { + C2_ACF_LOCAL_READ = MEM_LOCAL_READ, + C2_ACF_LOCAL_WRITE = MEM_LOCAL_WRITE, + C2_ACF_REMOTE_READ = MEM_REMOTE_READ, + C2_ACF_REMOTE_WRITE = MEM_REMOTE_WRITE, + C2_ACF_WINDOW_BIND = MEM_WINDOW_BIND +}; + +/* + * Image types of objects written to flash + */ +#define C2_FLASH_IMG_BITFILE 1 +#define C2_FLASH_IMG_OPTION_ROM 2 +#define C2_FLASH_IMG_VPD 3 + +/* + * to fix bug 1815 we define the max size allowable of the + * terminate message (per the IETF spec).Refer to the IETF + * protocol specification, section 12.1.6, page 64) + * The message is prefixed by 20 types of DDP info. + * + * Then the message has 6 bytes for the terminate control + * and DDP segment length info plus a DDP header (either + * 14 or 18 byts) plus 28 bytes for the RDMA header. + * Thus the max size in: + * 20 + (6 + 18 + 28) = 72 + */ +#define C2_MAX_TERMINATE_MESSAGE_SIZE (72) + +/* + * Build String Length. It must be the same as C2_BUILD_STR_LEN in ccil_api.h + */ +#define WR_BUILD_STR_LEN 64 + +/* + * WARNING: All of these structs need to align any 64bit types on + * 64 bit boundaries! 64bit types include u64 and u64. + */ + +/* + * Clustercore Work Request Header. Be sensitive to field layout + * and alignment. + */ +struct c2wr_hdr { + /* wqe_count is part of the cqe. It is put here so the + * adapter can write to it while the wr is pending without + * clobbering part of the wr. This word need not be dma'd + * from the host to adapter by libccil, but we copy it anyway + * to make the memcpy to the adapter better aligned. + */ + __be32 wqe_count; + + /* Put these fields next so that later 32- and 64-bit + * quantities are naturally aligned. + */ + u8 id; + u8 result; /* adapter -> host */ + u8 sge_count; /* host -> adapter */ + u8 flags; /* host -> adapter */ + + u64 context; +#ifdef CCMSGMAGIC + u32 magic; + u32 pad; +#endif +} __attribute__((packed)); + +/* + *------------------------ RNIC ------------------------ + */ + +/* + * WR_RNIC_OPEN + */ + +/* + * Flags for the RNIC WRs + */ +enum c2_rnic_flags { + RNIC_IRD_STATIC = 0x0001, + RNIC_ORD_STATIC = 0x0002, + RNIC_QP_STATIC = 0x0004, + RNIC_SRQ_SUPPORTED = 0x0008, + RNIC_PBL_BLOCK_MODE = 0x0010, + RNIC_SRQ_MODEL_ARRIVAL = 0x0020, + RNIC_CQ_OVF_DETECTED = 0x0040, + RNIC_PRIV_MODE = 0x0080 +}; + +struct c2wr_rnic_open_req { + struct c2wr_hdr hdr; + u64 user_context; + __be16 flags; /* See enum c2_rnic_flags */ + __be16 port_num; +} __attribute__((packed)); + +struct c2wr_rnic_open_rep { + struct c2wr_hdr hdr; + u32 rnic_handle; +} __attribute__((packed)); + +union c2wr_rnic_open { + struct c2wr_rnic_open_req req; + struct c2wr_rnic_open_rep rep; +} __attribute__((packed)); + +struct c2wr_rnic_query_req { + struct c2wr_hdr hdr; + u32 rnic_handle; +} __attribute__((packed)); + +/* + * WR_RNIC_QUERY + */ +struct c2wr_rnic_query_rep { + struct c2wr_hdr hdr; + u64 user_context; + __be32 vendor_id; + __be32 part_number; + __be32 hw_version; + __be32 fw_ver_major; + __be32 fw_ver_minor; + __be32 fw_ver_patch; + char fw_ver_build_str[WR_BUILD_STR_LEN]; + __be32 max_qps; + __be32 max_qp_depth; + u32 max_srq_depth; + u32 max_send_sgl_depth; + u32 max_rdma_sgl_depth; + __be32 max_cqs; + __be32 max_cq_depth; + u32 max_cq_event_handlers; + __be32 max_mrs; + u32 max_pbl_depth; + __be32 max_pds; + __be32 max_global_ird; + u32 max_global_ord; + __be32 max_qp_ird; + __be32 max_qp_ord; + u32 flags; + __be32 max_mws; + u32 pbe_range_low; + u32 pbe_range_high; + u32 max_srqs; + u32 page_size; +} __attribute__((packed)); + +union c2wr_rnic_query { + struct c2wr_rnic_query_req req; + struct c2wr_rnic_query_rep rep; +} __attribute__((packed)); + +/* + * WR_RNIC_GETCONFIG + */ + +struct c2wr_rnic_getconfig_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 option; /* see c2_getconfig_cmd_t */ + u64 reply_buf; + u32 reply_buf_len; +} __attribute__((packed)) ; + +struct c2wr_rnic_getconfig_rep { + struct c2wr_hdr hdr; + u32 option; /* see c2_getconfig_cmd_t */ + u32 count_len; /* length of the number of addresses configured */ +} __attribute__((packed)) ; + +union c2wr_rnic_getconfig { + struct c2wr_rnic_getconfig_req req; + struct c2wr_rnic_getconfig_rep rep; +} __attribute__((packed)) ; + +/* + * WR_RNIC_SETCONFIG + */ +struct c2wr_rnic_setconfig_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + __be32 option; /* See c2_setconfig_cmd_t */ + /* variable data and pad. See c2_netaddr and c2_route */ + u8 data[0]; +} __attribute__((packed)) ; + +struct c2wr_rnic_setconfig_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_rnic_setconfig { + struct c2wr_rnic_setconfig_req req; + struct c2wr_rnic_setconfig_rep rep; +} __attribute__((packed)) ; + +/* + * WR_RNIC_CLOSE + */ +struct c2wr_rnic_close_req { + struct c2wr_hdr hdr; + u32 rnic_handle; +} __attribute__((packed)) ; + +struct c2wr_rnic_close_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_rnic_close { + struct c2wr_rnic_close_req req; + struct c2wr_rnic_close_rep rep; +} __attribute__((packed)) ; + +/* + *------------------------ CQ ------------------------ + */ +struct c2wr_cq_create_req { + struct c2wr_hdr hdr; + __be64 shared_ht; + u64 user_context; + __be64 msg_pool; + u32 rnic_handle; + __be32 msg_size; + __be32 depth; +} __attribute__((packed)) ; + +struct c2wr_cq_create_rep { + struct c2wr_hdr hdr; + __be32 mq_index; + __be32 adapter_shared; + u32 cq_handle; +} __attribute__((packed)) ; + +union c2wr_cq_create { + struct c2wr_cq_create_req req; + struct c2wr_cq_create_rep rep; +} __attribute__((packed)) ; + +struct c2wr_cq_modify_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 cq_handle; + u32 new_depth; + u64 new_msg_pool; +} __attribute__((packed)) ; + +struct c2wr_cq_modify_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_cq_modify { + struct c2wr_cq_modify_req req; + struct c2wr_cq_modify_rep rep; +} __attribute__((packed)) ; + +struct c2wr_cq_destroy_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 cq_handle; +} __attribute__((packed)) ; + +struct c2wr_cq_destroy_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_cq_destroy { + struct c2wr_cq_destroy_req req; + struct c2wr_cq_destroy_rep rep; +} __attribute__((packed)) ; + +/* + *------------------------ PD ------------------------ + */ +struct c2wr_pd_alloc_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 pd_id; +} __attribute__((packed)) ; + +struct c2wr_pd_alloc_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_pd_alloc { + struct c2wr_pd_alloc_req req; + struct c2wr_pd_alloc_rep rep; +} __attribute__((packed)) ; + +struct c2wr_pd_dealloc_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 pd_id; +} __attribute__((packed)) ; + +struct c2wr_pd_dealloc_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_pd_dealloc { + struct c2wr_pd_dealloc_req req; + struct c2wr_pd_dealloc_rep rep; +} __attribute__((packed)) ; + +/* + *------------------------ SRQ ------------------------ + */ +struct c2wr_srq_create_req { + struct c2wr_hdr hdr; + u64 shared_ht; + u64 user_context; + u32 rnic_handle; + u32 srq_depth; + u32 srq_limit; + u32 sgl_depth; + u32 pd_id; +} __attribute__((packed)) ; + +struct c2wr_srq_create_rep { + struct c2wr_hdr hdr; + u32 srq_depth; + u32 sgl_depth; + u32 msg_size; + u32 mq_index; + u32 mq_start; + u32 srq_handle; +} __attribute__((packed)) ; + +union c2wr_srq_create { + struct c2wr_srq_create_req req; + struct c2wr_srq_create_rep rep; +} __attribute__((packed)) ; + +struct c2wr_srq_destroy_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 srq_handle; +} __attribute__((packed)) ; + +struct c2wr_srq_destroy_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_srq_destroy { + struct c2wr_srq_destroy_req req; + struct c2wr_srq_destroy_rep rep; +} __attribute__((packed)) ; + +/* + *------------------------ QP ------------------------ + */ +enum c2wr_qp_flags { + QP_RDMA_READ = 0x00000001, /* RDMA read enabled? */ + QP_RDMA_WRITE = 0x00000002, /* RDMA write enabled? */ + QP_MW_BIND = 0x00000004, /* MWs enabled */ + QP_ZERO_STAG = 0x00000008, /* enabled? */ + QP_REMOTE_TERMINATION = 0x00000010, /* remote end terminated */ + QP_RDMA_READ_RESPONSE = 0x00000020 /* Remote RDMA read */ + /* enabled? */ +}; + +struct c2wr_qp_create_req { + struct c2wr_hdr hdr; + __be64 shared_sq_ht; + __be64 shared_rq_ht; + u64 user_context; + u32 rnic_handle; + u32 sq_cq_handle; + u32 rq_cq_handle; + __be32 sq_depth; + __be32 rq_depth; + u32 srq_handle; + u32 srq_limit; + __be32 flags; /* see enum c2wr_qp_flags */ + __be32 send_sgl_depth; + __be32 recv_sgl_depth; + __be32 rdma_write_sgl_depth; + __be32 ord; + __be32 ird; + u32 pd_id; +} __attribute__((packed)) ; + +struct c2wr_qp_create_rep { + struct c2wr_hdr hdr; + __be32 sq_depth; + __be32 rq_depth; + u32 send_sgl_depth; + u32 recv_sgl_depth; + u32 rdma_write_sgl_depth; + u32 ord; + u32 ird; + __be32 sq_msg_size; + __be32 sq_mq_index; + __be32 sq_mq_start; + __be32 rq_msg_size; + __be32 rq_mq_index; + __be32 rq_mq_start; + u32 qp_handle; +} __attribute__((packed)) ; + +union c2wr_qp_create { + struct c2wr_qp_create_req req; + struct c2wr_qp_create_rep rep; +} __attribute__((packed)) ; + +struct c2wr_qp_query_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 qp_handle; +} __attribute__((packed)) ; + +struct c2wr_qp_query_rep { + struct c2wr_hdr hdr; + u64 user_context; + u32 rnic_handle; + u32 sq_depth; + u32 rq_depth; + u32 send_sgl_depth; + u32 rdma_write_sgl_depth; + u32 recv_sgl_depth; + u32 ord; + u32 ird; + u16 qp_state; + u16 flags; /* see c2wr_qp_flags_t */ + u32 qp_id; + u32 local_addr; + u32 remote_addr; + u16 local_port; + u16 remote_port; + u32 terminate_msg_length; /* 0 if not present */ + u8 data[0]; + /* Terminate Message in-line here. */ +} __attribute__((packed)) ; + +union c2wr_qp_query { + struct c2wr_qp_query_req req; + struct c2wr_qp_query_rep rep; +} __attribute__((packed)) ; + +struct c2wr_qp_modify_req { + struct c2wr_hdr hdr; + u64 stream_msg; + u32 stream_msg_length; + u32 rnic_handle; + u32 qp_handle; + __be32 next_qp_state; + __be32 ord; + __be32 ird; + __be32 sq_depth; + __be32 rq_depth; + u32 llp_ep_handle; +} __attribute__((packed)) ; + +struct c2wr_qp_modify_rep { + struct c2wr_hdr hdr; + u32 ord; + u32 ird; + u32 sq_depth; + u32 rq_depth; + u32 sq_msg_size; + u32 sq_mq_index; + u32 sq_mq_start; + u32 rq_msg_size; + u32 rq_mq_index; + u32 rq_mq_start; +} __attribute__((packed)) ; + +union c2wr_qp_modify { + struct c2wr_qp_modify_req req; + struct c2wr_qp_modify_rep rep; +} __attribute__((packed)) ; + +struct c2wr_qp_destroy_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 qp_handle; +} __attribute__((packed)) ; + +struct c2wr_qp_destroy_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_qp_destroy { + struct c2wr_qp_destroy_req req; + struct c2wr_qp_destroy_rep rep; +} __attribute__((packed)) ; + +/* + * The CCWR_QP_CONNECT msg is posted on the verbs request queue. It can + * only be posted when a QP is in IDLE state. After the connect request is + * submitted to the LLP, the adapter moves the QP to CONNECT_PENDING state. + * No synchronous reply from adapter to this WR. The results of + * connection are passed back in an async event CCAE_ACTIVE_CONNECT_RESULTS + * See c2wr_ae_active_connect_results_t + */ +struct c2wr_qp_connect_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 qp_handle; + __be32 remote_addr; + __be16 remote_port; + u16 pad; + __be32 private_data_length; + u8 private_data[0]; /* Private data in-line. */ +} __attribute__((packed)) ; + +struct c2wr_qp_connect { + struct c2wr_qp_connect_req req; + /* no synchronous reply. */ +} __attribute__((packed)) ; + + +/* + *------------------------ MM ------------------------ + */ + +struct c2wr_nsmr_stag_alloc_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 pbl_depth; + u32 pd_id; + u32 flags; +} __attribute__((packed)) ; + +struct c2wr_nsmr_stag_alloc_rep { + struct c2wr_hdr hdr; + u32 pbl_depth; + u32 stag_index; +} __attribute__((packed)) ; + +union c2wr_nsmr_stag_alloc { + struct c2wr_nsmr_stag_alloc_req req; + struct c2wr_nsmr_stag_alloc_rep rep; +} __attribute__((packed)) ; + +struct c2wr_nsmr_register_req { + struct c2wr_hdr hdr; + __be64 va; + u32 rnic_handle; + __be16 flags; + u8 stag_key; + u8 pad; + u32 pd_id; + __be32 pbl_depth; + __be32 pbe_size; + __be32 fbo; + __be32 length; + __be32 addrs_length; + /* array of paddrs (must be aligned on a 64bit boundary) */ + __be64 paddrs[0]; +} __attribute__((packed)) ; + +struct c2wr_nsmr_register_rep { + struct c2wr_hdr hdr; + u32 pbl_depth; + __be32 stag_index; +} __attribute__((packed)) ; + +union c2wr_nsmr_register { + struct c2wr_nsmr_register_req req; + struct c2wr_nsmr_register_rep rep; +} __attribute__((packed)) ; + +struct c2wr_nsmr_pbl_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + __be32 flags; + __be32 stag_index; + __be32 addrs_length; + /* array of paddrs (must be aligned on a 64bit boundary) */ + __be64 paddrs[0]; +} __attribute__((packed)) ; + +struct c2wr_nsmr_pbl_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_nsmr_pbl { + struct c2wr_nsmr_pbl_req req; + struct c2wr_nsmr_pbl_rep rep; +} __attribute__((packed)) ; + +struct c2wr_mr_query_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 stag_index; +} __attribute__((packed)) ; + +struct c2wr_mr_query_rep { + struct c2wr_hdr hdr; + u8 stag_key; + u8 pad[3]; + u32 pd_id; + u32 flags; + u32 pbl_depth; +} __attribute__((packed)) ; + +union c2wr_mr_query { + struct c2wr_mr_query_req req; + struct c2wr_mr_query_rep rep; +} __attribute__((packed)) ; + +struct c2wr_mw_query_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 stag_index; +} __attribute__((packed)) ; + +struct c2wr_mw_query_rep { + struct c2wr_hdr hdr; + u8 stag_key; + u8 pad[3]; + u32 pd_id; + u32 flags; +} __attribute__((packed)) ; + +union c2wr_mw_query { + struct c2wr_mw_query_req req; + struct c2wr_mw_query_rep rep; +} __attribute__((packed)) ; + + +struct c2wr_stag_dealloc_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + __be32 stag_index; +} __attribute__((packed)) ; + +struct c2wr_stag_dealloc_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)) ; + +union c2wr_stag_dealloc { + struct c2wr_stag_dealloc_req req; + struct c2wr_stag_dealloc_rep rep; +} __attribute__((packed)) ; + +struct c2wr_nsmr_reregister_req { + struct c2wr_hdr hdr; + u64 va; + u32 rnic_handle; + u16 flags; + u8 stag_key; + u8 pad; + u32 stag_index; + u32 pd_id; + u32 pbl_depth; + u32 pbe_size; + u32 fbo; + u32 length; + u32 addrs_length; + u32 pad1; + /* array of paddrs (must be aligned on a 64bit boundary) */ + u64 paddrs[0]; +} __attribute__((packed)) ; + +struct c2wr_nsmr_reregister_rep { + struct c2wr_hdr hdr; + u32 pbl_depth; + u32 stag_index; +} __attribute__((packed)) ; + +union c2wr_nsmr_reregister { + struct c2wr_nsmr_reregister_req req; + struct c2wr_nsmr_reregister_rep rep; +} __attribute__((packed)) ; + +struct c2wr_smr_register_req { + struct c2wr_hdr hdr; + u64 va; + u32 rnic_handle; + u16 flags; + u8 stag_key; + u8 pad; + u32 stag_index; + u32 pd_id; +} __attribute__((packed)) ; + +struct c2wr_smr_register_rep { + struct c2wr_hdr hdr; + u32 stag_index; +} __attribute__((packed)) ; + +union c2wr_smr_register { + struct c2wr_smr_register_req req; + struct c2wr_smr_register_rep rep; +} __attribute__((packed)) ; + +struct c2wr_mw_alloc_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 pd_id; +} __attribute__((packed)) ; + +struct c2wr_mw_alloc_rep { + struct c2wr_hdr hdr; + u32 stag_index; +} __attribute__((packed)) ; + +union c2wr_mw_alloc { + struct c2wr_mw_alloc_req req; + struct c2wr_mw_alloc_rep rep; +} __attribute__((packed)) ; + +/* + *------------------------ WRs ----------------------- + */ + +struct c2wr_user_hdr { + struct c2wr_hdr hdr; /* Has status and WR Type */ +} __attribute__((packed)) ; + +enum c2_qp_state { + C2_QP_STATE_IDLE = 0x01, + C2_QP_STATE_CONNECTING = 0x02, + C2_QP_STATE_RTS = 0x04, + C2_QP_STATE_CLOSING = 0x08, + C2_QP_STATE_TERMINATE = 0x10, + C2_QP_STATE_ERROR = 0x20, +}; + +/* Completion queue entry. */ +struct c2wr_ce { + struct c2wr_hdr hdr; /* Has status and WR Type */ + u64 qp_user_context; /* c2_user_qp_t * */ + u32 qp_state; /* Current QP State */ + u32 handle; /* QPID or EP Handle */ + __be32 bytes_rcvd; /* valid for RECV WCs */ + u32 stag; +} __attribute__((packed)) ; + + +/* + * Flags used for all post-sq WRs. These must fit in the flags + * field of the struct c2wr_hdr (eight bits). + */ +enum { + SQ_SIGNALED = 0x01, + SQ_READ_FENCE = 0x02, + SQ_FENCE = 0x04, +}; + +/* + * Common fields for all post-sq WRs. Namely the standard header and a + * secondary header with fields common to all post-sq WRs. + */ +struct c2_sq_hdr { + struct c2wr_user_hdr user_hdr; +} __attribute__((packed)); + +/* + * Same as above but for post-rq WRs. + */ +struct c2_rq_hdr { + struct c2wr_user_hdr user_hdr; +} __attribute__((packed)); + +/* + * use the same struct for all sends. + */ +struct c2wr_send_req { + struct c2_sq_hdr sq_hdr; + __be32 sge_len; + __be32 remote_stag; + u8 data[0]; /* SGE array */ +} __attribute__((packed)); + +union c2wr_send { + struct c2wr_send_req req; + struct c2wr_ce rep; +} __attribute__((packed)); + +struct c2wr_rdma_write_req { + struct c2_sq_hdr sq_hdr; + __be64 remote_to; + __be32 remote_stag; + __be32 sge_len; + u8 data[0]; /* SGE array */ +} __attribute__((packed)); + +union c2wr_rdma_write { + struct c2wr_rdma_write_req req; + struct c2wr_ce rep; +} __attribute__((packed)); + +struct c2wr_rdma_read_req { + struct c2_sq_hdr sq_hdr; + __be64 local_to; + __be64 remote_to; + __be32 local_stag; + __be32 remote_stag; + __be32 length; +} __attribute__((packed)); + +union c2wr_rdma_read { + struct c2wr_rdma_read_req req; + struct c2wr_ce rep; +} __attribute__((packed)); + +struct c2wr_mw_bind_req { + struct c2_sq_hdr sq_hdr; + u64 va; + u8 stag_key; + u8 pad[3]; + u32 mw_stag_index; + u32 mr_stag_index; + u32 length; + u32 flags; +} __attribute__((packed)); + +union c2wr_mw_bind { + struct c2wr_mw_bind_req req; + struct c2wr_ce rep; +} __attribute__((packed)); + +struct c2wr_nsmr_fastreg_req { + struct c2_sq_hdr sq_hdr; + u64 va; + u8 stag_key; + u8 pad[3]; + u32 stag_index; + u32 pbe_size; + u32 fbo; + u32 length; + u32 addrs_length; + /* array of paddrs (must be aligned on a 64bit boundary) */ + u64 paddrs[0]; +} __attribute__((packed)); + +union c2wr_nsmr_fastreg { + struct c2wr_nsmr_fastreg_req req; + struct c2wr_ce rep; +} __attribute__((packed)); + +struct c2wr_stag_invalidate_req { + struct c2_sq_hdr sq_hdr; + u8 stag_key; + u8 pad[3]; + u32 stag_index; +} __attribute__((packed)); + +union c2wr_stag_invalidate { + struct c2wr_stag_invalidate_req req; + struct c2wr_ce rep; +} __attribute__((packed)); + +union c2wr_sqwr { + struct c2_sq_hdr sq_hdr; + struct c2wr_send_req send; + struct c2wr_send_req send_se; + struct c2wr_send_req send_inv; + struct c2wr_send_req send_se_inv; + struct c2wr_rdma_write_req rdma_write; + struct c2wr_rdma_read_req rdma_read; + struct c2wr_mw_bind_req mw_bind; + struct c2wr_nsmr_fastreg_req nsmr_fastreg; + struct c2wr_stag_invalidate_req stag_inv; +} __attribute__((packed)); + + +/* + * RQ WRs + */ +struct c2wr_rqwr { + struct c2_rq_hdr rq_hdr; + u8 data[0]; /* array of SGEs */ +} __attribute__((packed)); + +union c2wr_recv { + struct c2wr_rqwr req; + struct c2wr_ce rep; +} __attribute__((packed)); + +/* + * All AEs start with this header. Most AEs only need to convey the + * information in the header. Some, like LLP connection events, need + * more info. The union typdef c2wr_ae_t has all the possible AEs. + * + * hdr.context is the user_context from the rnic_open WR. NULL If this + * is not affiliated with an rnic + * + * hdr.id is the AE identifier (eg; CCAE_REMOTE_SHUTDOWN, + * CCAE_LLP_CLOSE_COMPLETE) + * + * resource_type is one of: C2_RES_IND_QP, C2_RES_IND_CQ, C2_RES_IND_SRQ + * + * user_context is the context passed down when the host created the resource. + */ +struct c2wr_ae_hdr { + struct c2wr_hdr hdr; + u64 user_context; /* user context for this res. */ + __be32 resource_type; /* see enum c2_resource_indicator */ + __be32 resource; /* handle for resource */ + __be32 qp_state; /* current QP State */ +} __attribute__((packed)); + +/* + * After submitting the CCAE_ACTIVE_CONNECT_RESULTS message on the AEQ, + * the adapter moves the QP into RTS state + */ +struct c2wr_ae_active_connect_results { + struct c2wr_ae_hdr ae_hdr; + __be32 laddr; + __be32 raddr; + __be16 lport; + __be16 rport; + __be32 private_data_length; + u8 private_data[0]; /* data is in-line in the msg. */ +} __attribute__((packed)); + +/* + * When connections are established by the stack (and the private data + * MPA frame is received), the adapter will generate an event to the host. + * The details of the connection, any private data, and the new connection + * request handle is passed up via the CCAE_CONNECTION_REQUEST msg on the + * AE queue: + */ +struct c2wr_ae_connection_request { + struct c2wr_ae_hdr ae_hdr; + u32 cr_handle; /* connreq handle (sock ptr) */ + __be32 laddr; + __be32 raddr; + __be16 lport; + __be16 rport; + __be32 private_data_length; + u8 private_data[0]; /* data is in-line in the msg. */ +} __attribute__((packed)); + +union c2wr_ae { + struct c2wr_ae_hdr ae_generic; + struct c2wr_ae_active_connect_results ae_active_connect_results; + struct c2wr_ae_connection_request ae_connection_request; +} __attribute__((packed)); + +struct c2wr_init_req { + struct c2wr_hdr hdr; + __be64 hint_count; + __be64 q0_host_shared; + __be64 q1_host_shared; + __be64 q1_host_msg_pool; + __be64 q2_host_shared; + __be64 q2_host_msg_pool; +} __attribute__((packed)); + +struct c2wr_init_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)); + +union c2wr_init { + struct c2wr_init_req req; + struct c2wr_init_rep rep; +} __attribute__((packed)); + +/* + * For upgrading flash. + */ + +struct c2wr_flash_init_req { + struct c2wr_hdr hdr; + u32 rnic_handle; +} __attribute__((packed)); + +struct c2wr_flash_init_rep { + struct c2wr_hdr hdr; + u32 adapter_flash_buf_offset; + u32 adapter_flash_len; +} __attribute__((packed)); + +union c2wr_flash_init { + struct c2wr_flash_init_req req; + struct c2wr_flash_init_rep rep; +} __attribute__((packed)); + +struct c2wr_flash_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 len; +} __attribute__((packed)); + +struct c2wr_flash_rep { + struct c2wr_hdr hdr; + u32 status; +} __attribute__((packed)); + +union c2wr_flash { + struct c2wr_flash_req req; + struct c2wr_flash_rep rep; +} __attribute__((packed)); + +struct c2wr_buf_alloc_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 size; +} __attribute__((packed)); + +struct c2wr_buf_alloc_rep { + struct c2wr_hdr hdr; + u32 offset; /* 0 if mem not available */ + u32 size; /* 0 if mem not available */ +} __attribute__((packed)); + +union c2wr_buf_alloc { + struct c2wr_buf_alloc_req req; + struct c2wr_buf_alloc_rep rep; +} __attribute__((packed)); + +struct c2wr_buf_free_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 offset; /* Must match value from alloc */ + u32 size; /* Must match value from alloc */ +} __attribute__((packed)); + +struct c2wr_buf_free_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)); + +union c2wr_buf_free { + struct c2wr_buf_free_req req; + struct c2wr_ce rep; +} __attribute__((packed)); + +struct c2wr_flash_write_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 offset; + u32 size; + u32 type; + u32 flags; +} __attribute__((packed)); + +struct c2wr_flash_write_rep { + struct c2wr_hdr hdr; + u32 status; +} __attribute__((packed)); + +union c2wr_flash_write { + struct c2wr_flash_write_req req; + struct c2wr_flash_write_rep rep; +} __attribute__((packed)); + +/* + * Messages for LLP connection setup. + */ + +/* + * Listen Request. This allocates a listening endpoint to allow passive + * connection setup. Newly established LLP connections are passed up + * via an AE. See c2wr_ae_connection_request_t + */ +struct c2wr_ep_listen_create_req { + struct c2wr_hdr hdr; + u64 user_context; /* returned in AEs. */ + u32 rnic_handle; + __be32 local_addr; /* local addr, or 0 */ + __be16 local_port; /* 0 means "pick one" */ + u16 pad; + __be32 backlog; /* tradional tcp listen bl */ +} __attribute__((packed)); + +struct c2wr_ep_listen_create_rep { + struct c2wr_hdr hdr; + u32 ep_handle; /* handle to new listening ep */ + u16 local_port; /* resulting port... */ + u16 pad; +} __attribute__((packed)); + +union c2wr_ep_listen_create { + struct c2wr_ep_listen_create_req req; + struct c2wr_ep_listen_create_rep rep; +} __attribute__((packed)); + +struct c2wr_ep_listen_destroy_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 ep_handle; +} __attribute__((packed)); + +struct c2wr_ep_listen_destroy_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)); + +union c2wr_ep_listen_destroy { + struct c2wr_ep_listen_destroy_req req; + struct c2wr_ep_listen_destroy_rep rep; +} __attribute__((packed)); + +struct c2wr_ep_query_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 ep_handle; +} __attribute__((packed)); + +struct c2wr_ep_query_rep { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 local_addr; + u32 remote_addr; + u16 local_port; + u16 remote_port; +} __attribute__((packed)); + +union c2wr_ep_query { + struct c2wr_ep_query_req req; + struct c2wr_ep_query_rep rep; +} __attribute__((packed)); + + +/* + * The host passes this down to indicate acceptance of a pending iWARP + * connection. The cr_handle was obtained from the CONNECTION_REQUEST + * AE passed up by the adapter. See c2wr_ae_connection_request_t. + */ +struct c2wr_cr_accept_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 qp_handle; /* QP to bind to this LLP conn */ + u32 ep_handle; /* LLP handle to accept */ + __be32 private_data_length; + u8 private_data[0]; /* data in-line in msg. */ +} __attribute__((packed)); + +/* + * adapter sends reply when private data is successfully submitted to + * the LLP. + */ +struct c2wr_cr_accept_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)); + +union c2wr_cr_accept { + struct c2wr_cr_accept_req req; + struct c2wr_cr_accept_rep rep; +} __attribute__((packed)); + +/* + * The host sends this down if a given iWARP connection request was + * rejected by the consumer. The cr_handle was obtained from a + * previous c2wr_ae_connection_request_t AE sent by the adapter. + */ +struct c2wr_cr_reject_req { + struct c2wr_hdr hdr; + u32 rnic_handle; + u32 ep_handle; /* LLP handle to reject */ +} __attribute__((packed)); + +/* + * Dunno if this is needed, but we'll add it for now. The adapter will + * send the reject_reply after the LLP endpoint has been destroyed. + */ +struct c2wr_cr_reject_rep { + struct c2wr_hdr hdr; +} __attribute__((packed)); + +union c2wr_cr_reject { + struct c2wr_cr_reject_req req; + struct c2wr_cr_reject_rep rep; +} __attribute__((packed)); + +/* + * console command. Used to implement a debug console over the verbs + * request and reply queues. + */ + +/* + * Console request message. It contains: + * - message hdr with id = CCWR_CONSOLE + * - the physaddr/len of host memory to be used for the reply. + * - the command string. eg: "netstat -s" or "zoneinfo" + */ +struct c2wr_console_req { + struct c2wr_hdr hdr; /* id = CCWR_CONSOLE */ + u64 reply_buf; /* pinned host buf for reply */ + u32 reply_buf_len; /* length of reply buffer */ + u8 command[0]; /* NUL terminated ascii string */ + /* containing the command req */ +} __attribute__((packed)); + +/* + * flags used in the console reply. + */ +enum c2_console_flags { + CONS_REPLY_TRUNCATED = 0x00000001 /* reply was truncated */ +} __attribute__((packed)); + +/* + * Console reply message. + * hdr.result contains the c2_status_t error if the reply was _not_ generated, + * or C2_OK if the reply was generated. + */ +struct c2wr_console_rep { + struct c2wr_hdr hdr; /* id = CCWR_CONSOLE */ + u32 flags; +} __attribute__((packed)); + +union c2wr_console { + struct c2wr_console_req req; + struct c2wr_console_rep rep; +} __attribute__((packed)); + + +/* + * Giant union with all WRs. Makes life easier... + */ +union c2wr { + struct c2wr_hdr hdr; + struct c2wr_user_hdr user_hdr; + union c2wr_rnic_open rnic_open; + union c2wr_rnic_query rnic_query; + union c2wr_rnic_getconfig rnic_getconfig; + union c2wr_rnic_setconfig rnic_setconfig; + union c2wr_rnic_close rnic_close; + union c2wr_cq_create cq_create; + union c2wr_cq_modify cq_modify; + union c2wr_cq_destroy cq_destroy; + union c2wr_pd_alloc pd_alloc; + union c2wr_pd_dealloc pd_dealloc; + union c2wr_srq_create srq_create; + union c2wr_srq_destroy srq_destroy; + union c2wr_qp_create qp_create; + union c2wr_qp_query qp_query; + union c2wr_qp_modify qp_modify; + union c2wr_qp_destroy qp_destroy; + struct c2wr_qp_connect qp_connect; + union c2wr_nsmr_stag_alloc nsmr_stag_alloc; + union c2wr_nsmr_register nsmr_register; + union c2wr_nsmr_pbl nsmr_pbl; + union c2wr_mr_query mr_query; + union c2wr_mw_query mw_query; + union c2wr_stag_dealloc stag_dealloc; + union c2wr_sqwr sqwr; + struct c2wr_rqwr rqwr; + struct c2wr_ce ce; + union c2wr_ae ae; + union c2wr_init init; + union c2wr_ep_listen_create ep_listen_create; + union c2wr_ep_listen_destroy ep_listen_destroy; + union c2wr_cr_accept cr_accept; + union c2wr_cr_reject cr_reject; + union c2wr_console console; + union c2wr_flash_init flash_init; + union c2wr_flash flash; + union c2wr_buf_alloc buf_alloc; + union c2wr_buf_free buf_free; + union c2wr_flash_write flash_write; +} __attribute__((packed)); + + +/* + * Accessors for the wr fields that are packed together tightly to + * reduce the wr message size. The wr arguments are void* so that + * either a struct c2wr*, a struct c2wr_hdr*, or a pointer to any of the types + * in the struct c2wr union can be passed in. + */ +static __inline__ u8 c2_wr_get_id(void *wr) +{ + return ((struct c2wr_hdr *) wr)->id; +} +static __inline__ void c2_wr_set_id(void *wr, u8 id) +{ + ((struct c2wr_hdr *) wr)->id = id; +} +static __inline__ u8 c2_wr_get_result(void *wr) +{ + return ((struct c2wr_hdr *) wr)->result; +} +static __inline__ void c2_wr_set_result(void *wr, u8 result) +{ + ((struct c2wr_hdr *) wr)->result = result; +} +static __inline__ u8 c2_wr_get_flags(void *wr) +{ + return ((struct c2wr_hdr *) wr)->flags; +} +static __inline__ void c2_wr_set_flags(void *wr, u8 flags) +{ + ((struct c2wr_hdr *) wr)->flags = flags; +} +static __inline__ u8 c2_wr_get_sge_count(void *wr) +{ + return ((struct c2wr_hdr *) wr)->sge_count; +} +static __inline__ void c2_wr_set_sge_count(void *wr, u8 sge_count) +{ + ((struct c2wr_hdr *) wr)->sge_count = sge_count; +} +static __inline__ __be32 c2_wr_get_wqe_count(void *wr) +{ + return ((struct c2wr_hdr *) wr)->wqe_count; +} +static __inline__ void c2_wr_set_wqe_count(void *wr, u32 wqe_count) +{ + ((struct c2wr_hdr *) wr)->wqe_count = wqe_count; +} + +#endif /* _C2_WR_H_ */ diff --git a/drivers/staging/rdma/ehca/Kconfig b/drivers/staging/rdma/ehca/Kconfig new file mode 100644 index 000000000..3fadd2ad6 --- /dev/null +++ b/drivers/staging/rdma/ehca/Kconfig @@ -0,0 +1,10 @@ +config INFINIBAND_EHCA + tristate "eHCA support" + depends on IBMEBUS + ---help--- + This driver supports the deprecated IBM pSeries eHCA InfiniBand + adapter. + + To compile the driver as a module, choose M here. The module + will be called ib_ehca. + diff --git a/drivers/staging/rdma/ehca/Makefile b/drivers/staging/rdma/ehca/Makefile new file mode 100644 index 000000000..74d284e46 --- /dev/null +++ b/drivers/staging/rdma/ehca/Makefile @@ -0,0 +1,16 @@ +# Authors: Heiko J Schick +# Christoph Raisch +# Joachim Fenkes +# +# Copyright (c) 2005 IBM Corporation +# +# All rights reserved. +# +# This source code is distributed under a dual license of GPL v2.0 and OpenIB BSD. + +obj-$(CONFIG_INFINIBAND_EHCA) += ib_ehca.o + +ib_ehca-objs = ehca_main.o ehca_hca.o ehca_mcast.o ehca_pd.o ehca_av.o ehca_eq.o \ + ehca_cq.o ehca_qp.o ehca_sqp.o ehca_mrmw.o ehca_reqs.o ehca_irq.o \ + ehca_uverbs.o ipz_pt_fn.o hcp_if.o hcp_phyp.o + diff --git a/drivers/staging/rdma/ehca/TODO b/drivers/staging/rdma/ehca/TODO new file mode 100644 index 000000000..199a4a600 --- /dev/null +++ b/drivers/staging/rdma/ehca/TODO @@ -0,0 +1,4 @@ +9/2015 + +The ehca driver has been deprecated and moved to drivers/staging/rdma. +It will be removed in the 4.6 merge window. diff --git a/drivers/staging/rdma/ehca/ehca_av.c b/drivers/staging/rdma/ehca/ehca_av.c new file mode 100644 index 000000000..465926319 --- /dev/null +++ b/drivers/staging/rdma/ehca/ehca_av.c @@ -0,0 +1,277 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * address vector functions + * + * Authors: Hoang-Nam Nguyen + * Khadija Souissi + * Reinhard Ernst + * Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "ehca_tools.h" +#include "ehca_iverbs.h" +#include "hcp_if.h" + +static struct kmem_cache *av_cache; + +int ehca_calc_ipd(struct ehca_shca *shca, int port, + enum ib_rate path_rate, u32 *ipd) +{ + int path = ib_rate_to_mult(path_rate); + int link, ret; + struct ib_port_attr pa; + + if (path_rate == IB_RATE_PORT_CURRENT) { + *ipd = 0; + return 0; + } + + if (unlikely(path < 0)) { + ehca_err(&shca->ib_device, "Invalid static rate! path_rate=%x", + path_rate); + return -EINVAL; + } + + ret = ehca_query_port(&shca->ib_device, port, &pa); + if (unlikely(ret < 0)) { + ehca_err(&shca->ib_device, "Failed to query port ret=%i", ret); + return ret; + } + + link = ib_width_enum_to_int(pa.active_width) * pa.active_speed; + + if (path >= link) + /* no need to throttle if path faster than link */ + *ipd = 0; + else + /* IPD = round((link / path) - 1) */ + *ipd = ((link + (path >> 1)) / path) - 1; + + return 0; +} + +struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) +{ + int ret; + struct ehca_av *av; + struct ehca_shca *shca = container_of(pd->device, struct ehca_shca, + ib_device); + + av = kmem_cache_alloc(av_cache, GFP_KERNEL); + if (!av) { + ehca_err(pd->device, "Out of memory pd=%p ah_attr=%p", + pd, ah_attr); + return ERR_PTR(-ENOMEM); + } + + av->av.sl = ah_attr->sl; + av->av.dlid = ah_attr->dlid; + av->av.slid_path_bits = ah_attr->src_path_bits; + + if (ehca_static_rate < 0) { + u32 ipd; + if (ehca_calc_ipd(shca, ah_attr->port_num, + ah_attr->static_rate, &ipd)) { + ret = -EINVAL; + goto create_ah_exit1; + } + av->av.ipd = ipd; + } else + av->av.ipd = ehca_static_rate; + + av->av.lnh = ah_attr->ah_flags; + av->av.grh.word_0 = EHCA_BMASK_SET(GRH_IPVERSION_MASK, 6); + av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_TCLASS_MASK, + ah_attr->grh.traffic_class); + av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_FLOWLABEL_MASK, + ah_attr->grh.flow_label); + av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_HOPLIMIT_MASK, + ah_attr->grh.hop_limit); + av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_NEXTHEADER_MASK, 0x1B); + /* set sgid in grh.word_1 */ + if (ah_attr->ah_flags & IB_AH_GRH) { + int rc; + struct ib_port_attr port_attr; + union ib_gid gid; + memset(&port_attr, 0, sizeof(port_attr)); + rc = ehca_query_port(pd->device, ah_attr->port_num, + &port_attr); + if (rc) { /* invalid port number */ + ret = -EINVAL; + ehca_err(pd->device, "Invalid port number " + "ehca_query_port() returned %x " + "pd=%p ah_attr=%p", rc, pd, ah_attr); + goto create_ah_exit1; + } + memset(&gid, 0, sizeof(gid)); + rc = ehca_query_gid(pd->device, + ah_attr->port_num, + ah_attr->grh.sgid_index, &gid); + if (rc) { + ret = -EINVAL; + ehca_err(pd->device, "Failed to retrieve sgid " + "ehca_query_gid() returned %x " + "pd=%p ah_attr=%p", rc, pd, ah_attr); + goto create_ah_exit1; + } + memcpy(&av->av.grh.word_1, &gid, sizeof(gid)); + } + av->av.pmtu = shca->max_mtu; + + /* dgid comes in grh.word_3 */ + memcpy(&av->av.grh.word_3, &ah_attr->grh.dgid, + sizeof(ah_attr->grh.dgid)); + + return &av->ib_ah; + +create_ah_exit1: + kmem_cache_free(av_cache, av); + + return ERR_PTR(ret); +} + +int ehca_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr) +{ + struct ehca_av *av; + struct ehca_ud_av new_ehca_av; + struct ehca_shca *shca = container_of(ah->pd->device, struct ehca_shca, + ib_device); + + memset(&new_ehca_av, 0, sizeof(new_ehca_av)); + new_ehca_av.sl = ah_attr->sl; + new_ehca_av.dlid = ah_attr->dlid; + new_ehca_av.slid_path_bits = ah_attr->src_path_bits; + new_ehca_av.ipd = ah_attr->static_rate; + new_ehca_av.lnh = EHCA_BMASK_SET(GRH_FLAG_MASK, + (ah_attr->ah_flags & IB_AH_GRH) > 0); + new_ehca_av.grh.word_0 = EHCA_BMASK_SET(GRH_TCLASS_MASK, + ah_attr->grh.traffic_class); + new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_FLOWLABEL_MASK, + ah_attr->grh.flow_label); + new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_HOPLIMIT_MASK, + ah_attr->grh.hop_limit); + new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_NEXTHEADER_MASK, 0x1b); + + /* set sgid in grh.word_1 */ + if (ah_attr->ah_flags & IB_AH_GRH) { + int rc; + struct ib_port_attr port_attr; + union ib_gid gid; + memset(&port_attr, 0, sizeof(port_attr)); + rc = ehca_query_port(ah->device, ah_attr->port_num, + &port_attr); + if (rc) { /* invalid port number */ + ehca_err(ah->device, "Invalid port number " + "ehca_query_port() returned %x " + "ah=%p ah_attr=%p port_num=%x", + rc, ah, ah_attr, ah_attr->port_num); + return -EINVAL; + } + memset(&gid, 0, sizeof(gid)); + rc = ehca_query_gid(ah->device, + ah_attr->port_num, + ah_attr->grh.sgid_index, &gid); + if (rc) { + ehca_err(ah->device, "Failed to retrieve sgid " + "ehca_query_gid() returned %x " + "ah=%p ah_attr=%p port_num=%x " + "sgid_index=%x", + rc, ah, ah_attr, ah_attr->port_num, + ah_attr->grh.sgid_index); + return -EINVAL; + } + memcpy(&new_ehca_av.grh.word_1, &gid, sizeof(gid)); + } + + new_ehca_av.pmtu = shca->max_mtu; + + memcpy(&new_ehca_av.grh.word_3, &ah_attr->grh.dgid, + sizeof(ah_attr->grh.dgid)); + + av = container_of(ah, struct ehca_av, ib_ah); + av->av = new_ehca_av; + + return 0; +} + +int ehca_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr) +{ + struct ehca_av *av = container_of(ah, struct ehca_av, ib_ah); + + memcpy(&ah_attr->grh.dgid, &av->av.grh.word_3, + sizeof(ah_attr->grh.dgid)); + ah_attr->sl = av->av.sl; + + ah_attr->dlid = av->av.dlid; + + ah_attr->src_path_bits = av->av.slid_path_bits; + ah_attr->static_rate = av->av.ipd; + ah_attr->ah_flags = EHCA_BMASK_GET(GRH_FLAG_MASK, av->av.lnh); + ah_attr->grh.traffic_class = EHCA_BMASK_GET(GRH_TCLASS_MASK, + av->av.grh.word_0); + ah_attr->grh.hop_limit = EHCA_BMASK_GET(GRH_HOPLIMIT_MASK, + av->av.grh.word_0); + ah_attr->grh.flow_label = EHCA_BMASK_GET(GRH_FLOWLABEL_MASK, + av->av.grh.word_0); + + return 0; +} + +int ehca_destroy_ah(struct ib_ah *ah) +{ + kmem_cache_free(av_cache, container_of(ah, struct ehca_av, ib_ah)); + + return 0; +} + +int ehca_init_av_cache(void) +{ + av_cache = kmem_cache_create("ehca_cache_av", + sizeof(struct ehca_av), 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!av_cache) + return -ENOMEM; + return 0; +} + +void ehca_cleanup_av_cache(void) +{ + if (av_cache) + kmem_cache_destroy(av_cache); +} diff --git a/drivers/staging/rdma/ehca/ehca_classes.h b/drivers/staging/rdma/ehca/ehca_classes.h new file mode 100644 index 000000000..bd45e0f39 --- /dev/null +++ b/drivers/staging/rdma/ehca/ehca_classes.h @@ -0,0 +1,482 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Struct definition for eHCA internal structures + * + * Authors: Heiko J Schick + * Christoph Raisch + * Joachim Fenkes + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __EHCA_CLASSES_H__ +#define __EHCA_CLASSES_H__ + +struct ehca_module; +struct ehca_qp; +struct ehca_cq; +struct ehca_eq; +struct ehca_mr; +struct ehca_mw; +struct ehca_pd; +struct ehca_av; + +#include +#include + +#include +#include + +#ifdef CONFIG_PPC64 +#include "ehca_classes_pSeries.h" +#endif +#include "ipz_pt_fn.h" +#include "ehca_qes.h" +#include "ehca_irq.h" + +#define EHCA_EQE_CACHE_SIZE 20 +#define EHCA_MAX_NUM_QUEUES 0xffff + +struct ehca_eqe_cache_entry { + struct ehca_eqe *eqe; + struct ehca_cq *cq; +}; + +struct ehca_eq { + u32 length; + struct ipz_queue ipz_queue; + struct ipz_eq_handle ipz_eq_handle; + struct work_struct work; + struct h_galpas galpas; + int is_initialized; + struct ehca_pfeq pf; + spinlock_t spinlock; + struct tasklet_struct interrupt_task; + u32 ist; + spinlock_t irq_spinlock; + struct ehca_eqe_cache_entry eqe_cache[EHCA_EQE_CACHE_SIZE]; +}; + +struct ehca_sma_attr { + u16 lid, lmc, sm_sl, sm_lid; + u16 pkey_tbl_len, pkeys[16]; +}; + +struct ehca_sport { + struct ib_cq *ibcq_aqp1; + struct ib_qp *ibqp_sqp[2]; + /* lock to serialze modify_qp() calls for sqp in normal + * and irq path (when event PORT_ACTIVE is received first time) + */ + spinlock_t mod_sqp_lock; + enum ib_port_state port_state; + struct ehca_sma_attr saved_attr; + u32 pma_qp_nr; +}; + +#define HCA_CAP_MR_PGSIZE_4K 0x80000000 +#define HCA_CAP_MR_PGSIZE_64K 0x40000000 +#define HCA_CAP_MR_PGSIZE_1M 0x20000000 +#define HCA_CAP_MR_PGSIZE_16M 0x10000000 + +struct ehca_shca { + struct ib_device ib_device; + struct platform_device *ofdev; + u8 num_ports; + int hw_level; + struct list_head shca_list; + struct ipz_adapter_handle ipz_hca_handle; + struct ehca_sport sport[2]; + struct ehca_eq eq; + struct ehca_eq neq; + struct ehca_mr *maxmr; + struct ehca_pd *pd; + struct h_galpas galpas; + struct mutex modify_mutex; + u64 hca_cap; + /* MR pgsize: bit 0-3 means 4K, 64K, 1M, 16M respectively */ + u32 hca_cap_mr_pgsize; + int max_mtu; + int max_num_qps; + int max_num_cqs; + atomic_t num_cqs; + atomic_t num_qps; +}; + +struct ehca_pd { + struct ib_pd ib_pd; + struct ipz_pd fw_pd; + /* small queue mgmt */ + struct mutex lock; + struct list_head free[2]; + struct list_head full[2]; +}; + +enum ehca_ext_qp_type { + EQPT_NORMAL = 0, + EQPT_LLQP = 1, + EQPT_SRQBASE = 2, + EQPT_SRQ = 3, +}; + +/* struct to cache modify_qp()'s parms for GSI/SMI qp */ +struct ehca_mod_qp_parm { + int mask; + struct ib_qp_attr attr; +}; + +#define EHCA_MOD_QP_PARM_MAX 4 + +#define QMAP_IDX_MASK 0xFFFFULL + +/* struct for tracking if cqes have been reported to the application */ +struct ehca_qmap_entry { + u16 app_wr_id; + u8 reported; + u8 cqe_req; +}; + +struct ehca_queue_map { + struct ehca_qmap_entry *map; + unsigned int entries; + unsigned int tail; + unsigned int left_to_poll; + unsigned int next_wqe_idx; /* Idx to first wqe to be flushed */ +}; + +/* function to calculate the next index for the qmap */ +static inline unsigned int next_index(unsigned int cur_index, unsigned int limit) +{ + unsigned int temp = cur_index + 1; + return (temp == limit) ? 0 : temp; +} + +struct ehca_qp { + union { + struct ib_qp ib_qp; + struct ib_srq ib_srq; + }; + u32 qp_type; + enum ehca_ext_qp_type ext_type; + enum ib_qp_state state; + struct ipz_queue ipz_squeue; + struct ehca_queue_map sq_map; + struct ipz_queue ipz_rqueue; + struct ehca_queue_map rq_map; + struct h_galpas galpas; + u32 qkey; + u32 real_qp_num; + u32 token; + spinlock_t spinlock_s; + spinlock_t spinlock_r; + u32 sq_max_inline_data_size; + struct ipz_qp_handle ipz_qp_handle; + struct ehca_pfqp pf; + struct ib_qp_init_attr init_attr; + struct ehca_cq *send_cq; + struct ehca_cq *recv_cq; + unsigned int sqerr_purgeflag; + struct hlist_node list_entries; + /* array to cache modify_qp()'s parms for GSI/SMI qp */ + struct ehca_mod_qp_parm *mod_qp_parm; + int mod_qp_parm_idx; + /* mmap counter for resources mapped into user space */ + u32 mm_count_squeue; + u32 mm_count_rqueue; + u32 mm_count_galpa; + /* unsolicited ack circumvention */ + int unsol_ack_circ; + int mtu_shift; + u32 message_count; + u32 packet_count; + atomic_t nr_events; /* events seen */ + wait_queue_head_t wait_completion; + int mig_armed; + struct list_head sq_err_node; + struct list_head rq_err_node; +}; + +#define IS_SRQ(qp) (qp->ext_type == EQPT_SRQ) +#define HAS_SQ(qp) (qp->ext_type != EQPT_SRQ) +#define HAS_RQ(qp) (qp->ext_type != EQPT_SRQBASE) + +/* must be power of 2 */ +#define QP_HASHTAB_LEN 8 + +struct ehca_cq { + struct ib_cq ib_cq; + struct ipz_queue ipz_queue; + struct h_galpas galpas; + spinlock_t spinlock; + u32 cq_number; + u32 token; + u32 nr_of_entries; + struct ipz_cq_handle ipz_cq_handle; + struct ehca_pfcq pf; + spinlock_t cb_lock; + struct hlist_head qp_hashtab[QP_HASHTAB_LEN]; + struct list_head entry; + u32 nr_callbacks; /* #events assigned to cpu by scaling code */ + atomic_t nr_events; /* #events seen */ + wait_queue_head_t wait_completion; + spinlock_t task_lock; + /* mmap counter for resources mapped into user space */ + u32 mm_count_queue; + u32 mm_count_galpa; + struct list_head sqp_err_list; + struct list_head rqp_err_list; +}; + +enum ehca_mr_flag { + EHCA_MR_FLAG_FMR = 0x80000000, /* FMR, created with ehca_alloc_fmr */ + EHCA_MR_FLAG_MAXMR = 0x40000000, /* max-MR */ +}; + +struct ehca_mr { + union { + struct ib_mr ib_mr; /* must always be first in ehca_mr */ + struct ib_fmr ib_fmr; /* must always be first in ehca_mr */ + } ib; + struct ib_umem *umem; + spinlock_t mrlock; + + enum ehca_mr_flag flags; + u32 num_kpages; /* number of kernel pages */ + u32 num_hwpages; /* number of hw pages to form MR */ + u64 hwpage_size; /* hw page size used for this MR */ + int acl; /* ACL (stored here for usage in reregister) */ + u64 *start; /* virtual start address (stored here for */ + /* usage in reregister) */ + u64 size; /* size (stored here for usage in reregister) */ + u32 fmr_page_size; /* page size for FMR */ + u32 fmr_max_pages; /* max pages for FMR */ + u32 fmr_max_maps; /* max outstanding maps for FMR */ + u32 fmr_map_cnt; /* map counter for FMR */ + /* fw specific data */ + struct ipz_mrmw_handle ipz_mr_handle; /* MR handle for h-calls */ + struct h_galpas galpas; +}; + +struct ehca_mw { + struct ib_mw ib_mw; /* gen2 mw, must always be first in ehca_mw */ + spinlock_t mwlock; + + u8 never_bound; /* indication MW was never bound */ + struct ipz_mrmw_handle ipz_mw_handle; /* MW handle for h-calls */ + struct h_galpas galpas; +}; + +enum ehca_mr_pgi_type { + EHCA_MR_PGI_PHYS = 1, /* type of ehca_reg_phys_mr, + * ehca_rereg_phys_mr, + * ehca_reg_internal_maxmr */ + EHCA_MR_PGI_USER = 2, /* type of ehca_reg_user_mr */ + EHCA_MR_PGI_FMR = 3 /* type of ehca_map_phys_fmr */ +}; + +struct ehca_mr_pginfo { + enum ehca_mr_pgi_type type; + u64 num_kpages; + u64 kpage_cnt; + u64 hwpage_size; /* hw page size used for this MR */ + u64 num_hwpages; /* number of hw pages */ + u64 hwpage_cnt; /* counter for hw pages */ + u64 next_hwpage; /* next hw page in buffer/chunk/listelem */ + + union { + struct { /* type EHCA_MR_PGI_PHYS section */ + int num_phys_buf; + struct ib_phys_buf *phys_buf_array; + u64 next_buf; + } phy; + struct { /* type EHCA_MR_PGI_USER section */ + struct ib_umem *region; + struct scatterlist *next_sg; + u64 next_nmap; + } usr; + struct { /* type EHCA_MR_PGI_FMR section */ + u64 fmr_pgsize; + u64 *page_list; + u64 next_listelem; + } fmr; + } u; +}; + +/* output parameters for MR/FMR hipz calls */ +struct ehca_mr_hipzout_parms { + struct ipz_mrmw_handle handle; + u32 lkey; + u32 rkey; + u64 len; + u64 vaddr; + u32 acl; +}; + +/* output parameters for MW hipz calls */ +struct ehca_mw_hipzout_parms { + struct ipz_mrmw_handle handle; + u32 rkey; +}; + +struct ehca_av { + struct ib_ah ib_ah; + struct ehca_ud_av av; +}; + +struct ehca_ucontext { + struct ib_ucontext ib_ucontext; +}; + +int ehca_init_pd_cache(void); +void ehca_cleanup_pd_cache(void); +int ehca_init_cq_cache(void); +void ehca_cleanup_cq_cache(void); +int ehca_init_qp_cache(void); +void ehca_cleanup_qp_cache(void); +int ehca_init_av_cache(void); +void ehca_cleanup_av_cache(void); +int ehca_init_mrmw_cache(void); +void ehca_cleanup_mrmw_cache(void); +int ehca_init_small_qp_cache(void); +void ehca_cleanup_small_qp_cache(void); + +extern rwlock_t ehca_qp_idr_lock; +extern rwlock_t ehca_cq_idr_lock; +extern struct idr ehca_qp_idr; +extern struct idr ehca_cq_idr; +extern spinlock_t shca_list_lock; + +extern int ehca_static_rate; +extern int ehca_port_act_time; +extern bool ehca_use_hp_mr; +extern bool ehca_scaling_code; +extern int ehca_lock_hcalls; +extern int ehca_nr_ports; +extern int ehca_max_cq; +extern int ehca_max_qp; + +struct ipzu_queue_resp { + u32 qe_size; /* queue entry size */ + u32 act_nr_of_sg; + u32 queue_length; /* queue length allocated in bytes */ + u32 pagesize; + u32 toggle_state; + u32 offset; /* save offset within a page for small_qp */ +}; + +struct ehca_create_cq_resp { + u32 cq_number; + u32 token; + struct ipzu_queue_resp ipz_queue; + u32 fw_handle_ofs; + u32 dummy; +}; + +struct ehca_create_qp_resp { + u32 qp_num; + u32 token; + u32 qp_type; + u32 ext_type; + u32 qkey; + /* qp_num assigned by ehca: sqp0/1 may have got different numbers */ + u32 real_qp_num; + u32 fw_handle_ofs; + u32 dummy; + struct ipzu_queue_resp ipz_squeue; + struct ipzu_queue_resp ipz_rqueue; +}; + +struct ehca_alloc_cq_parms { + u32 nr_cqe; + u32 act_nr_of_entries; + u32 act_pages; + struct ipz_eq_handle eq_handle; +}; + +enum ehca_service_type { + ST_RC = 0, + ST_UC = 1, + ST_RD = 2, + ST_UD = 3, +}; + +enum ehca_ll_comp_flags { + LLQP_SEND_COMP = 0x20, + LLQP_RECV_COMP = 0x40, + LLQP_COMP_MASK = 0x60, +}; + +struct ehca_alloc_queue_parms { + /* input parameters */ + int max_wr; + int max_sge; + int page_size; + int is_small; + + /* output parameters */ + u16 act_nr_wqes; + u8 act_nr_sges; + u32 queue_size; /* bytes for small queues, pages otherwise */ +}; + +struct ehca_alloc_qp_parms { + struct ehca_alloc_queue_parms squeue; + struct ehca_alloc_queue_parms rqueue; + + /* input parameters */ + enum ehca_service_type servicetype; + int qp_storage; + int sigtype; + enum ehca_ext_qp_type ext_type; + enum ehca_ll_comp_flags ll_comp_flags; + int ud_av_l_key_ctl; + + u32 token; + struct ipz_eq_handle eq_handle; + struct ipz_pd pd; + struct ipz_cq_handle send_cq_handle, recv_cq_handle; + + u32 srq_qpn, srq_token, srq_limit; + + /* output parameters */ + u32 real_qp_num; + struct ipz_qp_handle qp_handle; + struct h_galpas galpas; +}; + +int ehca_cq_assign_qp(struct ehca_cq *cq, struct ehca_qp *qp); +int ehca_cq_unassign_qp(struct ehca_cq *cq, unsigned int qp_num); +struct ehca_qp *ehca_cq_get_qp(struct ehca_cq *cq, int qp_num); + +#endif diff --git a/drivers/staging/rdma/ehca/ehca_classes_pSeries.h b/drivers/staging/rdma/ehca/ehca_classes_pSeries.h new file mode 100644 index 000000000..689c35786 --- /dev/null +++ b/drivers/staging/rdma/ehca/ehca_classes_pSeries.h @@ -0,0 +1,208 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * pSeries interface definitions + * + * Authors: Waleri Fomin + * Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __EHCA_CLASSES_PSERIES_H__ +#define __EHCA_CLASSES_PSERIES_H__ + +#include "hcp_phyp.h" +#include "ipz_pt_fn.h" + + +struct ehca_pfqp { + struct ipz_qpt sqpt; + struct ipz_qpt rqpt; +}; + +struct ehca_pfcq { + struct ipz_qpt qpt; + u32 cqnr; +}; + +struct ehca_pfeq { + struct ipz_qpt qpt; + struct h_galpa galpa; + u32 eqnr; +}; + +struct ipz_adapter_handle { + u64 handle; +}; + +struct ipz_cq_handle { + u64 handle; +}; + +struct ipz_eq_handle { + u64 handle; +}; + +struct ipz_qp_handle { + u64 handle; +}; +struct ipz_mrmw_handle { + u64 handle; +}; + +struct ipz_pd { + u32 value; +}; + +struct hcp_modify_qp_control_block { + u32 qkey; /* 00 */ + u32 rdd; /* reliable datagram domain */ + u32 send_psn; /* 02 */ + u32 receive_psn; /* 03 */ + u32 prim_phys_port; /* 04 */ + u32 alt_phys_port; /* 05 */ + u32 prim_p_key_idx; /* 06 */ + u32 alt_p_key_idx; /* 07 */ + u32 rdma_atomic_ctrl; /* 08 */ + u32 qp_state; /* 09 */ + u32 reserved_10; /* 10 */ + u32 rdma_nr_atomic_resp_res; /* 11 */ + u32 path_migration_state; /* 12 */ + u32 rdma_atomic_outst_dest_qp; /* 13 */ + u32 dest_qp_nr; /* 14 */ + u32 min_rnr_nak_timer_field; /* 15 */ + u32 service_level; /* 16 */ + u32 send_grh_flag; /* 17 */ + u32 retry_count; /* 18 */ + u32 timeout; /* 19 */ + u32 path_mtu; /* 20 */ + u32 max_static_rate; /* 21 */ + u32 dlid; /* 22 */ + u32 rnr_retry_count; /* 23 */ + u32 source_path_bits; /* 24 */ + u32 traffic_class; /* 25 */ + u32 hop_limit; /* 26 */ + u32 source_gid_idx; /* 27 */ + u32 flow_label; /* 28 */ + u32 reserved_29; /* 29 */ + union { /* 30 */ + u64 dw[2]; + u8 byte[16]; + } dest_gid; + u32 service_level_al; /* 34 */ + u32 send_grh_flag_al; /* 35 */ + u32 retry_count_al; /* 36 */ + u32 timeout_al; /* 37 */ + u32 max_static_rate_al; /* 38 */ + u32 dlid_al; /* 39 */ + u32 rnr_retry_count_al; /* 40 */ + u32 source_path_bits_al; /* 41 */ + u32 traffic_class_al; /* 42 */ + u32 hop_limit_al; /* 43 */ + u32 source_gid_idx_al; /* 44 */ + u32 flow_label_al; /* 45 */ + u32 reserved_46; /* 46 */ + u32 reserved_47; /* 47 */ + union { /* 48 */ + u64 dw[2]; + u8 byte[16]; + } dest_gid_al; + u32 max_nr_outst_send_wr; /* 52 */ + u32 max_nr_outst_recv_wr; /* 53 */ + u32 disable_ete_credit_check; /* 54 */ + u32 qp_number; /* 55 */ + u64 send_queue_handle; /* 56 */ + u64 recv_queue_handle; /* 58 */ + u32 actual_nr_sges_in_sq_wqe; /* 60 */ + u32 actual_nr_sges_in_rq_wqe; /* 61 */ + u32 qp_enable; /* 62 */ + u32 curr_srq_limit; /* 63 */ + u64 qp_aff_asyn_ev_log_reg; /* 64 */ + u64 shared_rq_hndl; /* 66 */ + u64 trigg_doorbell_qp_hndl; /* 68 */ + u32 reserved_70_127[58]; /* 70 */ +}; + +#define MQPCB_MASK_QKEY EHCA_BMASK_IBM( 0, 0) +#define MQPCB_MASK_SEND_PSN EHCA_BMASK_IBM( 2, 2) +#define MQPCB_MASK_RECEIVE_PSN EHCA_BMASK_IBM( 3, 3) +#define MQPCB_MASK_PRIM_PHYS_PORT EHCA_BMASK_IBM( 4, 4) +#define MQPCB_PRIM_PHYS_PORT EHCA_BMASK_IBM(24, 31) +#define MQPCB_MASK_ALT_PHYS_PORT EHCA_BMASK_IBM( 5, 5) +#define MQPCB_MASK_PRIM_P_KEY_IDX EHCA_BMASK_IBM( 6, 6) +#define MQPCB_PRIM_P_KEY_IDX EHCA_BMASK_IBM(24, 31) +#define MQPCB_MASK_ALT_P_KEY_IDX EHCA_BMASK_IBM( 7, 7) +#define MQPCB_MASK_RDMA_ATOMIC_CTRL EHCA_BMASK_IBM( 8, 8) +#define MQPCB_MASK_QP_STATE EHCA_BMASK_IBM( 9, 9) +#define MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES EHCA_BMASK_IBM(11, 11) +#define MQPCB_MASK_PATH_MIGRATION_STATE EHCA_BMASK_IBM(12, 12) +#define MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP EHCA_BMASK_IBM(13, 13) +#define MQPCB_MASK_DEST_QP_NR EHCA_BMASK_IBM(14, 14) +#define MQPCB_MASK_MIN_RNR_NAK_TIMER_FIELD EHCA_BMASK_IBM(15, 15) +#define MQPCB_MASK_SERVICE_LEVEL EHCA_BMASK_IBM(16, 16) +#define MQPCB_MASK_SEND_GRH_FLAG EHCA_BMASK_IBM(17, 17) +#define MQPCB_MASK_RETRY_COUNT EHCA_BMASK_IBM(18, 18) +#define MQPCB_MASK_TIMEOUT EHCA_BMASK_IBM(19, 19) +#define MQPCB_MASK_PATH_MTU EHCA_BMASK_IBM(20, 20) +#define MQPCB_MASK_MAX_STATIC_RATE EHCA_BMASK_IBM(21, 21) +#define MQPCB_MASK_DLID EHCA_BMASK_IBM(22, 22) +#define MQPCB_MASK_RNR_RETRY_COUNT EHCA_BMASK_IBM(23, 23) +#define MQPCB_MASK_SOURCE_PATH_BITS EHCA_BMASK_IBM(24, 24) +#define MQPCB_MASK_TRAFFIC_CLASS EHCA_BMASK_IBM(25, 25) +#define MQPCB_MASK_HOP_LIMIT EHCA_BMASK_IBM(26, 26) +#define MQPCB_MASK_SOURCE_GID_IDX EHCA_BMASK_IBM(27, 27) +#define MQPCB_MASK_FLOW_LABEL EHCA_BMASK_IBM(28, 28) +#define MQPCB_MASK_DEST_GID EHCA_BMASK_IBM(30, 30) +#define MQPCB_MASK_SERVICE_LEVEL_AL EHCA_BMASK_IBM(31, 31) +#define MQPCB_MASK_SEND_GRH_FLAG_AL EHCA_BMASK_IBM(32, 32) +#define MQPCB_MASK_RETRY_COUNT_AL EHCA_BMASK_IBM(33, 33) +#define MQPCB_MASK_TIMEOUT_AL EHCA_BMASK_IBM(34, 34) +#define MQPCB_MASK_MAX_STATIC_RATE_AL EHCA_BMASK_IBM(35, 35) +#define MQPCB_MASK_DLID_AL EHCA_BMASK_IBM(36, 36) +#define MQPCB_MASK_RNR_RETRY_COUNT_AL EHCA_BMASK_IBM(37, 37) +#define MQPCB_MASK_SOURCE_PATH_BITS_AL EHCA_BMASK_IBM(38, 38) +#define MQPCB_MASK_TRAFFIC_CLASS_AL EHCA_BMASK_IBM(39, 39) +#define MQPCB_MASK_HOP_LIMIT_AL EHCA_BMASK_IBM(40, 40) +#define MQPCB_MASK_SOURCE_GID_IDX_AL EHCA_BMASK_IBM(41, 41) +#define MQPCB_MASK_FLOW_LABEL_AL EHCA_BMASK_IBM(42, 42) +#define MQPCB_MASK_DEST_GID_AL EHCA_BMASK_IBM(44, 44) +#define MQPCB_MASK_MAX_NR_OUTST_SEND_WR EHCA_BMASK_IBM(45, 45) +#define MQPCB_MASK_MAX_NR_OUTST_RECV_WR EHCA_BMASK_IBM(46, 46) +#define MQPCB_MASK_DISABLE_ETE_CREDIT_CHECK EHCA_BMASK_IBM(47, 47) +#define MQPCB_MASK_QP_ENABLE EHCA_BMASK_IBM(48, 48) +#define MQPCB_MASK_CURR_SRQ_LIMIT EHCA_BMASK_IBM(49, 49) +#define MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG EHCA_BMASK_IBM(50, 50) +#define MQPCB_MASK_SHARED_RQ_HNDL EHCA_BMASK_IBM(51, 51) + +#endif /* __EHCA_CLASSES_PSERIES_H__ */ diff --git a/drivers/staging/rdma/ehca/ehca_cq.c b/drivers/staging/rdma/ehca/ehca_cq.c new file mode 100644 index 000000000..9b68b1750 --- /dev/null +++ b/drivers/staging/rdma/ehca/ehca_cq.c @@ -0,0 +1,397 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Completion queue handling + * + * Authors: Waleri Fomin + * Khadija Souissi + * Reinhard Ernst + * Heiko J Schick + * Hoang-Nam Nguyen + * + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "ehca_iverbs.h" +#include "ehca_classes.h" +#include "ehca_irq.h" +#include "hcp_if.h" + +static struct kmem_cache *cq_cache; + +int ehca_cq_assign_qp(struct ehca_cq *cq, struct ehca_qp *qp) +{ + unsigned int qp_num = qp->real_qp_num; + unsigned int key = qp_num & (QP_HASHTAB_LEN-1); + unsigned long flags; + + spin_lock_irqsave(&cq->spinlock, flags); + hlist_add_head(&qp->list_entries, &cq->qp_hashtab[key]); + spin_unlock_irqrestore(&cq->spinlock, flags); + + ehca_dbg(cq->ib_cq.device, "cq_num=%x real_qp_num=%x", + cq->cq_number, qp_num); + + return 0; +} + +int ehca_cq_unassign_qp(struct ehca_cq *cq, unsigned int real_qp_num) +{ + int ret = -EINVAL; + unsigned int key = real_qp_num & (QP_HASHTAB_LEN-1); + struct hlist_node *iter; + struct ehca_qp *qp; + unsigned long flags; + + spin_lock_irqsave(&cq->spinlock, flags); + hlist_for_each(iter, &cq->qp_hashtab[key]) { + qp = hlist_entry(iter, struct ehca_qp, list_entries); + if (qp->real_qp_num == real_qp_num) { + hlist_del(iter); + ehca_dbg(cq->ib_cq.device, + "removed qp from cq .cq_num=%x real_qp_num=%x", + cq->cq_number, real_qp_num); + ret = 0; + break; + } + } + spin_unlock_irqrestore(&cq->spinlock, flags); + if (ret) + ehca_err(cq->ib_cq.device, + "qp not found cq_num=%x real_qp_num=%x", + cq->cq_number, real_qp_num); + + return ret; +} + +struct ehca_qp *ehca_cq_get_qp(struct ehca_cq *cq, int real_qp_num) +{ + struct ehca_qp *ret = NULL; + unsigned int key = real_qp_num & (QP_HASHTAB_LEN-1); + struct hlist_node *iter; + struct ehca_qp *qp; + hlist_for_each(iter, &cq->qp_hashtab[key]) { + qp = hlist_entry(iter, struct ehca_qp, list_entries); + if (qp->real_qp_num == real_qp_num) { + ret = qp; + break; + } + } + return ret; +} + +struct ib_cq *ehca_create_cq(struct ib_device *device, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + int cqe = attr->cqe; + static const u32 additional_cqe = 20; + struct ib_cq *cq; + struct ehca_cq *my_cq; + struct ehca_shca *shca = + container_of(device, struct ehca_shca, ib_device); + struct ipz_adapter_handle adapter_handle; + struct ehca_alloc_cq_parms param; /* h_call's out parameters */ + struct h_galpa gal; + void *vpage; + u32 counter; + u64 rpage, cqx_fec, h_ret; + int ipz_rc, i; + unsigned long flags; + + if (attr->flags) + return ERR_PTR(-EINVAL); + + if (cqe >= 0xFFFFFFFF - 64 - additional_cqe) + return ERR_PTR(-EINVAL); + + if (!atomic_add_unless(&shca->num_cqs, 1, shca->max_num_cqs)) { + ehca_err(device, "Unable to create CQ, max number of %i " + "CQs reached.", shca->max_num_cqs); + ehca_err(device, "To increase the maximum number of CQs " + "use the number_of_cqs module parameter.\n"); + return ERR_PTR(-ENOSPC); + } + + my_cq = kmem_cache_zalloc(cq_cache, GFP_KERNEL); + if (!my_cq) { + ehca_err(device, "Out of memory for ehca_cq struct device=%p", + device); + atomic_dec(&shca->num_cqs); + return ERR_PTR(-ENOMEM); + } + + memset(¶m, 0, sizeof(struct ehca_alloc_cq_parms)); + + spin_lock_init(&my_cq->spinlock); + spin_lock_init(&my_cq->cb_lock); + spin_lock_init(&my_cq->task_lock); + atomic_set(&my_cq->nr_events, 0); + init_waitqueue_head(&my_cq->wait_completion); + + cq = &my_cq->ib_cq; + + adapter_handle = shca->ipz_hca_handle; + param.eq_handle = shca->eq.ipz_eq_handle; + + idr_preload(GFP_KERNEL); + write_lock_irqsave(&ehca_cq_idr_lock, flags); + my_cq->token = idr_alloc(&ehca_cq_idr, my_cq, 0, 0x2000000, GFP_NOWAIT); + write_unlock_irqrestore(&ehca_cq_idr_lock, flags); + idr_preload_end(); + + if (my_cq->token < 0) { + cq = ERR_PTR(-ENOMEM); + ehca_err(device, "Can't allocate new idr entry. device=%p", + device); + goto create_cq_exit1; + } + + /* + * CQs maximum depth is 4GB-64, but we need additional 20 as buffer + * for receiving errors CQEs. + */ + param.nr_cqe = cqe + additional_cqe; + h_ret = hipz_h_alloc_resource_cq(adapter_handle, my_cq, ¶m); + + if (h_ret != H_SUCCESS) { + ehca_err(device, "hipz_h_alloc_resource_cq() failed " + "h_ret=%lli device=%p", h_ret, device); + cq = ERR_PTR(ehca2ib_return_code(h_ret)); + goto create_cq_exit2; + } + + ipz_rc = ipz_queue_ctor(NULL, &my_cq->ipz_queue, param.act_pages, + EHCA_PAGESIZE, sizeof(struct ehca_cqe), 0, 0); + if (!ipz_rc) { + ehca_err(device, "ipz_queue_ctor() failed ipz_rc=%i device=%p", + ipz_rc, device); + cq = ERR_PTR(-EINVAL); + goto create_cq_exit3; + } + + for (counter = 0; counter < param.act_pages; counter++) { + vpage = ipz_qpageit_get_inc(&my_cq->ipz_queue); + if (!vpage) { + ehca_err(device, "ipz_qpageit_get_inc() " + "returns NULL device=%p", device); + cq = ERR_PTR(-EAGAIN); + goto create_cq_exit4; + } + rpage = __pa(vpage); + + h_ret = hipz_h_register_rpage_cq(adapter_handle, + my_cq->ipz_cq_handle, + &my_cq->pf, + 0, + 0, + rpage, + 1, + my_cq->galpas. + kernel); + + if (h_ret < H_SUCCESS) { + ehca_err(device, "hipz_h_register_rpage_cq() failed " + "ehca_cq=%p cq_num=%x h_ret=%lli counter=%i " + "act_pages=%i", my_cq, my_cq->cq_number, + h_ret, counter, param.act_pages); + cq = ERR_PTR(-EINVAL); + goto create_cq_exit4; + } + + if (counter == (param.act_pages - 1)) { + vpage = ipz_qpageit_get_inc(&my_cq->ipz_queue); + if ((h_ret != H_SUCCESS) || vpage) { + ehca_err(device, "Registration of pages not " + "complete ehca_cq=%p cq_num=%x " + "h_ret=%lli", my_cq, my_cq->cq_number, + h_ret); + cq = ERR_PTR(-EAGAIN); + goto create_cq_exit4; + } + } else { + if (h_ret != H_PAGE_REGISTERED) { + ehca_err(device, "Registration of page failed " + "ehca_cq=%p cq_num=%x h_ret=%lli " + "counter=%i act_pages=%i", + my_cq, my_cq->cq_number, + h_ret, counter, param.act_pages); + cq = ERR_PTR(-ENOMEM); + goto create_cq_exit4; + } + } + } + + ipz_qeit_reset(&my_cq->ipz_queue); + + gal = my_cq->galpas.kernel; + cqx_fec = hipz_galpa_load(gal, CQTEMM_OFFSET(cqx_fec)); + ehca_dbg(device, "ehca_cq=%p cq_num=%x CQX_FEC=%llx", + my_cq, my_cq->cq_number, cqx_fec); + + my_cq->ib_cq.cqe = my_cq->nr_of_entries = + param.act_nr_of_entries - additional_cqe; + my_cq->cq_number = (my_cq->ipz_cq_handle.handle) & 0xffff; + + for (i = 0; i < QP_HASHTAB_LEN; i++) + INIT_HLIST_HEAD(&my_cq->qp_hashtab[i]); + + INIT_LIST_HEAD(&my_cq->sqp_err_list); + INIT_LIST_HEAD(&my_cq->rqp_err_list); + + if (context) { + struct ipz_queue *ipz_queue = &my_cq->ipz_queue; + struct ehca_create_cq_resp resp; + memset(&resp, 0, sizeof(resp)); + resp.cq_number = my_cq->cq_number; + resp.token = my_cq->token; + resp.ipz_queue.qe_size = ipz_queue->qe_size; + resp.ipz_queue.act_nr_of_sg = ipz_queue->act_nr_of_sg; + resp.ipz_queue.queue_length = ipz_queue->queue_length; + resp.ipz_queue.pagesize = ipz_queue->pagesize; + resp.ipz_queue.toggle_state = ipz_queue->toggle_state; + resp.fw_handle_ofs = (u32) + (my_cq->galpas.user.fw_handle & (PAGE_SIZE - 1)); + if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { + ehca_err(device, "Copy to udata failed."); + cq = ERR_PTR(-EFAULT); + goto create_cq_exit4; + } + } + + return cq; + +create_cq_exit4: + ipz_queue_dtor(NULL, &my_cq->ipz_queue); + +create_cq_exit3: + h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1); + if (h_ret != H_SUCCESS) + ehca_err(device, "hipz_h_destroy_cq() failed ehca_cq=%p " + "cq_num=%x h_ret=%lli", my_cq, my_cq->cq_number, h_ret); + +create_cq_exit2: + write_lock_irqsave(&ehca_cq_idr_lock, flags); + idr_remove(&ehca_cq_idr, my_cq->token); + write_unlock_irqrestore(&ehca_cq_idr_lock, flags); + +create_cq_exit1: + kmem_cache_free(cq_cache, my_cq); + + atomic_dec(&shca->num_cqs); + return cq; +} + +int ehca_destroy_cq(struct ib_cq *cq) +{ + u64 h_ret; + struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq); + int cq_num = my_cq->cq_number; + struct ib_device *device = cq->device; + struct ehca_shca *shca = container_of(device, struct ehca_shca, + ib_device); + struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle; + unsigned long flags; + + if (cq->uobject) { + if (my_cq->mm_count_galpa || my_cq->mm_count_queue) { + ehca_err(device, "Resources still referenced in " + "user space cq_num=%x", my_cq->cq_number); + return -EINVAL; + } + } + + /* + * remove the CQ from the idr first to make sure + * no more interrupt tasklets will touch this CQ + */ + write_lock_irqsave(&ehca_cq_idr_lock, flags); + idr_remove(&ehca_cq_idr, my_cq->token); + write_unlock_irqrestore(&ehca_cq_idr_lock, flags); + + /* now wait until all pending events have completed */ + wait_event(my_cq->wait_completion, !atomic_read(&my_cq->nr_events)); + + /* nobody's using our CQ any longer -- we can destroy it */ + h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 0); + if (h_ret == H_R_STATE) { + /* cq in err: read err data and destroy it forcibly */ + ehca_dbg(device, "ehca_cq=%p cq_num=%x resource=%llx in err " + "state. Try to delete it forcibly.", + my_cq, cq_num, my_cq->ipz_cq_handle.handle); + ehca_error_data(shca, my_cq, my_cq->ipz_cq_handle.handle); + h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1); + if (h_ret == H_SUCCESS) + ehca_dbg(device, "cq_num=%x deleted successfully.", + cq_num); + } + if (h_ret != H_SUCCESS) { + ehca_err(device, "hipz_h_destroy_cq() failed h_ret=%lli " + "ehca_cq=%p cq_num=%x", h_ret, my_cq, cq_num); + return ehca2ib_return_code(h_ret); + } + ipz_queue_dtor(NULL, &my_cq->ipz_queue); + kmem_cache_free(cq_cache, my_cq); + + atomic_dec(&shca->num_cqs); + return 0; +} + +int ehca_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata) +{ + /* TODO: proper resize needs to be done */ + ehca_err(cq->device, "not implemented yet"); + + return -EFAULT; +} + +int ehca_init_cq_cache(void) +{ + cq_cache = kmem_cache_create("ehca_cache_cq", + sizeof(struct ehca_cq), 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!cq_cache) + return -ENOMEM; + return 0; +} + +void ehca_cleanup_cq_cache(void) +{ + if (cq_cache) + kmem_cache_destroy(cq_cache); +} diff --git a/drivers/staging/rdma/ehca/ehca_eq.c b/drivers/staging/rdma/ehca/ehca_eq.c new file mode 100644 index 000000000..90da6747d --- /dev/null +++ b/drivers/staging/rdma/ehca/ehca_eq.c @@ -0,0 +1,189 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Event queue handling + * + * Authors: Waleri Fomin + * Khadija Souissi + * Reinhard Ernst + * Heiko J Schick + * Hoang-Nam Nguyen + * + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "ehca_classes.h" +#include "ehca_irq.h" +#include "ehca_iverbs.h" +#include "ehca_qes.h" +#include "hcp_if.h" +#include "ipz_pt_fn.h" + +int ehca_create_eq(struct ehca_shca *shca, + struct ehca_eq *eq, + const enum ehca_eq_type type, const u32 length) +{ + int ret; + u64 h_ret; + u32 nr_pages; + u32 i; + void *vpage; + struct ib_device *ib_dev = &shca->ib_device; + + spin_lock_init(&eq->spinlock); + spin_lock_init(&eq->irq_spinlock); + eq->is_initialized = 0; + + if (type != EHCA_EQ && type != EHCA_NEQ) { + ehca_err(ib_dev, "Invalid EQ type %x. eq=%p", type, eq); + return -EINVAL; + } + if (!length) { + ehca_err(ib_dev, "EQ length must not be zero. eq=%p", eq); + return -EINVAL; + } + + h_ret = hipz_h_alloc_resource_eq(shca->ipz_hca_handle, + &eq->pf, + type, + length, + &eq->ipz_eq_handle, + &eq->length, + &nr_pages, &eq->ist); + + if (h_ret != H_SUCCESS) { + ehca_err(ib_dev, "Can't allocate EQ/NEQ. eq=%p", eq); + return -EINVAL; + } + + ret = ipz_queue_ctor(NULL, &eq->ipz_queue, nr_pages, + EHCA_PAGESIZE, sizeof(struct ehca_eqe), 0, 0); + if (!ret) { + ehca_err(ib_dev, "Can't allocate EQ pages eq=%p", eq); + goto create_eq_exit1; + } + + for (i = 0; i < nr_pages; i++) { + u64 rpage; + + vpage = ipz_qpageit_get_inc(&eq->ipz_queue); + if (!vpage) + goto create_eq_exit2; + + rpage = __pa(vpage); + h_ret = hipz_h_register_rpage_eq(shca->ipz_hca_handle, + eq->ipz_eq_handle, + &eq->pf, + 0, 0, rpage, 1); + + if (i == (nr_pages - 1)) { + /* last page */ + vpage = ipz_qpageit_get_inc(&eq->ipz_queue); + if (h_ret != H_SUCCESS || vpage) + goto create_eq_exit2; + } else { + if (h_ret != H_PAGE_REGISTERED) + goto create_eq_exit2; + } + } + + ipz_qeit_reset(&eq->ipz_queue); + + /* register interrupt handlers and initialize work queues */ + if (type == EHCA_EQ) { + tasklet_init(&eq->interrupt_task, ehca_tasklet_eq, (long)shca); + + ret = ibmebus_request_irq(eq->ist, ehca_interrupt_eq, + 0, "ehca_eq", + (void *)shca); + if (ret < 0) + ehca_err(ib_dev, "Can't map interrupt handler."); + } else if (type == EHCA_NEQ) { + tasklet_init(&eq->interrupt_task, ehca_tasklet_neq, (long)shca); + + ret = ibmebus_request_irq(eq->ist, ehca_interrupt_neq, + 0, "ehca_neq", + (void *)shca); + if (ret < 0) + ehca_err(ib_dev, "Can't map interrupt handler."); + } + + eq->is_initialized = 1; + + return 0; + +create_eq_exit2: + ipz_queue_dtor(NULL, &eq->ipz_queue); + +create_eq_exit1: + hipz_h_destroy_eq(shca->ipz_hca_handle, eq); + + return -EINVAL; +} + +void *ehca_poll_eq(struct ehca_shca *shca, struct ehca_eq *eq) +{ + unsigned long flags; + void *eqe; + + spin_lock_irqsave(&eq->spinlock, flags); + eqe = ipz_eqit_eq_get_inc_valid(&eq->ipz_queue); + spin_unlock_irqrestore(&eq->spinlock, flags); + + return eqe; +} + +int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq) +{ + unsigned long flags; + u64 h_ret; + + ibmebus_free_irq(eq->ist, (void *)shca); + + spin_lock_irqsave(&shca_list_lock, flags); + eq->is_initialized = 0; + spin_unlock_irqrestore(&shca_list_lock, flags); + + tasklet_kill(&eq->interrupt_task); + + h_ret = hipz_h_destroy_eq(shca->ipz_hca_handle, eq); + + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't free EQ resources."); + return -EINVAL; + } + ipz_queue_dtor(NULL, &eq->ipz_queue); + + return 0; +} diff --git a/drivers/staging/rdma/ehca/ehca_hca.c b/drivers/staging/rdma/ehca/ehca_hca.c new file mode 100644 index 000000000..e8b1bb657 --- /dev/null +++ b/drivers/staging/rdma/ehca/ehca_hca.c @@ -0,0 +1,414 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * HCA query functions + * + * Authors: Heiko J Schick + * Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "ehca_tools.h" +#include "ehca_iverbs.h" +#include "hcp_if.h" + +static unsigned int limit_uint(unsigned int value) +{ + return min_t(unsigned int, value, INT_MAX); +} + +int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props, + struct ib_udata *uhw) +{ + int i, ret = 0; + struct ehca_shca *shca = container_of(ibdev, struct ehca_shca, + ib_device); + struct hipz_query_hca *rblock; + + static const u32 cap_mapping[] = { + IB_DEVICE_RESIZE_MAX_WR, HCA_CAP_WQE_RESIZE, + IB_DEVICE_BAD_PKEY_CNTR, HCA_CAP_BAD_P_KEY_CTR, + IB_DEVICE_BAD_QKEY_CNTR, HCA_CAP_Q_KEY_VIOL_CTR, + IB_DEVICE_RAW_MULTI, HCA_CAP_RAW_PACKET_MCAST, + IB_DEVICE_AUTO_PATH_MIG, HCA_CAP_AUTO_PATH_MIG, + IB_DEVICE_CHANGE_PHY_PORT, HCA_CAP_SQD_RTS_PORT_CHANGE, + IB_DEVICE_UD_AV_PORT_ENFORCE, HCA_CAP_AH_PORT_NR_CHECK, + IB_DEVICE_CURR_QP_STATE_MOD, HCA_CAP_CUR_QP_STATE_MOD, + IB_DEVICE_SHUTDOWN_PORT, HCA_CAP_SHUTDOWN_PORT, + IB_DEVICE_INIT_TYPE, HCA_CAP_INIT_TYPE, + IB_DEVICE_PORT_ACTIVE_EVENT, HCA_CAP_PORT_ACTIVE_EVENT, + }; + + if (uhw->inlen || uhw->outlen) + return -EINVAL; + + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!rblock) { + ehca_err(&shca->ib_device, "Can't allocate rblock memory."); + return -ENOMEM; + } + + if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't query device properties"); + ret = -EINVAL; + goto query_device1; + } + + memset(props, 0, sizeof(struct ib_device_attr)); + props->page_size_cap = shca->hca_cap_mr_pgsize; + props->fw_ver = rblock->hw_ver; + props->max_mr_size = rblock->max_mr_size; + props->vendor_id = rblock->vendor_id >> 8; + props->vendor_part_id = rblock->vendor_part_id >> 16; + props->hw_ver = rblock->hw_ver; + props->max_qp = limit_uint(rblock->max_qp); + props->max_qp_wr = limit_uint(rblock->max_wqes_wq); + props->max_sge = limit_uint(rblock->max_sge); + props->max_sge_rd = limit_uint(rblock->max_sge_rd); + props->max_cq = limit_uint(rblock->max_cq); + props->max_cqe = limit_uint(rblock->max_cqe); + props->max_mr = limit_uint(rblock->max_mr); + props->max_mw = limit_uint(rblock->max_mw); + props->max_pd = limit_uint(rblock->max_pd); + props->max_ah = limit_uint(rblock->max_ah); + props->max_ee = limit_uint(rblock->max_rd_ee_context); + props->max_rdd = limit_uint(rblock->max_rd_domain); + props->max_fmr = limit_uint(rblock->max_mr); + props->max_qp_rd_atom = limit_uint(rblock->max_rr_qp); + props->max_ee_rd_atom = limit_uint(rblock->max_rr_ee_context); + props->max_res_rd_atom = limit_uint(rblock->max_rr_hca); + props->max_qp_init_rd_atom = limit_uint(rblock->max_act_wqs_qp); + props->max_ee_init_rd_atom = limit_uint(rblock->max_act_wqs_ee_context); + + if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) { + props->max_srq = limit_uint(props->max_qp); + props->max_srq_wr = limit_uint(props->max_qp_wr); + props->max_srq_sge = 3; + } + + props->max_pkeys = 16; + /* Some FW versions say 0 here; insert sensible value in that case */ + props->local_ca_ack_delay = rblock->local_ca_ack_delay ? + min_t(u8, rblock->local_ca_ack_delay, 255) : 12; + props->max_raw_ipv6_qp = limit_uint(rblock->max_raw_ipv6_qp); + props->max_raw_ethy_qp = limit_uint(rblock->max_raw_ethy_qp); + props->max_mcast_grp = limit_uint(rblock->max_mcast_grp); + props->max_mcast_qp_attach = limit_uint(rblock->max_mcast_qp_attach); + props->max_total_mcast_qp_attach + = limit_uint(rblock->max_total_mcast_qp_attach); + + /* translate device capabilities */ + props->device_cap_flags = IB_DEVICE_SYS_IMAGE_GUID | + IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_N_NOTIFY_CQ; + for (i = 0; i < ARRAY_SIZE(cap_mapping); i += 2) + if (rblock->hca_cap_indicators & cap_mapping[i + 1]) + props->device_cap_flags |= cap_mapping[i]; + +query_device1: + ehca_free_fw_ctrlblock(rblock); + + return ret; +} + +static enum ib_mtu map_mtu(struct ehca_shca *shca, u32 fw_mtu) +{ + switch (fw_mtu) { + case 0x1: + return IB_MTU_256; + case 0x2: + return IB_MTU_512; + case 0x3: + return IB_MTU_1024; + case 0x4: + return IB_MTU_2048; + case 0x5: + return IB_MTU_4096; + default: + ehca_err(&shca->ib_device, "Unknown MTU size: %x.", + fw_mtu); + return 0; + } +} + +static u8 map_number_of_vls(struct ehca_shca *shca, u32 vl_cap) +{ + switch (vl_cap) { + case 0x1: + return 1; + case 0x2: + return 2; + case 0x3: + return 4; + case 0x4: + return 8; + case 0x5: + return 15; + default: + ehca_err(&shca->ib_device, "invalid Vl Capability: %x.", + vl_cap); + return 0; + } +} + +int ehca_query_port(struct ib_device *ibdev, + u8 port, struct ib_port_attr *props) +{ + int ret = 0; + u64 h_ret; + struct ehca_shca *shca = container_of(ibdev, struct ehca_shca, + ib_device); + struct hipz_query_port *rblock; + + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!rblock) { + ehca_err(&shca->ib_device, "Can't allocate rblock memory."); + return -ENOMEM; + } + + h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't query port properties"); + ret = -EINVAL; + goto query_port1; + } + + memset(props, 0, sizeof(struct ib_port_attr)); + + props->active_mtu = props->max_mtu = map_mtu(shca, rblock->max_mtu); + props->port_cap_flags = rblock->capability_mask; + props->gid_tbl_len = rblock->gid_tbl_len; + if (rblock->max_msg_sz) + props->max_msg_sz = rblock->max_msg_sz; + else + props->max_msg_sz = 0x1 << 31; + props->bad_pkey_cntr = rblock->bad_pkey_cntr; + props->qkey_viol_cntr = rblock->qkey_viol_cntr; + props->pkey_tbl_len = rblock->pkey_tbl_len; + props->lid = rblock->lid; + props->sm_lid = rblock->sm_lid; + props->lmc = rblock->lmc; + props->sm_sl = rblock->sm_sl; + props->subnet_timeout = rblock->subnet_timeout; + props->init_type_reply = rblock->init_type_reply; + props->max_vl_num = map_number_of_vls(shca, rblock->vl_cap); + + if (rblock->state && rblock->phys_width) { + props->phys_state = rblock->phys_pstate; + props->state = rblock->phys_state; + props->active_width = rblock->phys_width; + props->active_speed = rblock->phys_speed; + } else { + /* old firmware releases don't report physical + * port info, so use default values + */ + props->phys_state = 5; + props->state = rblock->state; + props->active_width = IB_WIDTH_12X; + props->active_speed = IB_SPEED_SDR; + } + +query_port1: + ehca_free_fw_ctrlblock(rblock); + + return ret; +} + +int ehca_query_sma_attr(struct ehca_shca *shca, + u8 port, struct ehca_sma_attr *attr) +{ + int ret = 0; + u64 h_ret; + struct hipz_query_port *rblock; + + rblock = ehca_alloc_fw_ctrlblock(GFP_ATOMIC); + if (!rblock) { + ehca_err(&shca->ib_device, "Can't allocate rblock memory."); + return -ENOMEM; + } + + h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't query port properties"); + ret = -EINVAL; + goto query_sma_attr1; + } + + memset(attr, 0, sizeof(struct ehca_sma_attr)); + + attr->lid = rblock->lid; + attr->lmc = rblock->lmc; + attr->sm_sl = rblock->sm_sl; + attr->sm_lid = rblock->sm_lid; + + attr->pkey_tbl_len = rblock->pkey_tbl_len; + memcpy(attr->pkeys, rblock->pkey_entries, sizeof(attr->pkeys)); + +query_sma_attr1: + ehca_free_fw_ctrlblock(rblock); + + return ret; +} + +int ehca_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) +{ + int ret = 0; + u64 h_ret; + struct ehca_shca *shca; + struct hipz_query_port *rblock; + + shca = container_of(ibdev, struct ehca_shca, ib_device); + if (index > 16) { + ehca_err(&shca->ib_device, "Invalid index: %x.", index); + return -EINVAL; + } + + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!rblock) { + ehca_err(&shca->ib_device, "Can't allocate rblock memory."); + return -ENOMEM; + } + + h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't query port properties"); + ret = -EINVAL; + goto query_pkey1; + } + + memcpy(pkey, &rblock->pkey_entries + index, sizeof(u16)); + +query_pkey1: + ehca_free_fw_ctrlblock(rblock); + + return ret; +} + +int ehca_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + int ret = 0; + u64 h_ret; + struct ehca_shca *shca = container_of(ibdev, struct ehca_shca, + ib_device); + struct hipz_query_port *rblock; + + if (index < 0 || index > 255) { + ehca_err(&shca->ib_device, "Invalid index: %x.", index); + return -EINVAL; + } + + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!rblock) { + ehca_err(&shca->ib_device, "Can't allocate rblock memory."); + return -ENOMEM; + } + + h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't query port properties"); + ret = -EINVAL; + goto query_gid1; + } + + memcpy(&gid->raw[0], &rblock->gid_prefix, sizeof(u64)); + memcpy(&gid->raw[8], &rblock->guid_entries[index], sizeof(u64)); + +query_gid1: + ehca_free_fw_ctrlblock(rblock); + + return ret; +} + +static const u32 allowed_port_caps = ( + IB_PORT_SM | IB_PORT_LED_INFO_SUP | IB_PORT_CM_SUP | + IB_PORT_SNMP_TUNNEL_SUP | IB_PORT_DEVICE_MGMT_SUP | + IB_PORT_VENDOR_CLASS_SUP); + +int ehca_modify_port(struct ib_device *ibdev, + u8 port, int port_modify_mask, + struct ib_port_modify *props) +{ + int ret = 0; + struct ehca_shca *shca; + struct hipz_query_port *rblock; + u32 cap; + u64 hret; + + shca = container_of(ibdev, struct ehca_shca, ib_device); + if ((props->set_port_cap_mask | props->clr_port_cap_mask) + & ~allowed_port_caps) { + ehca_err(&shca->ib_device, "Non-changeable bits set in masks " + "set=%x clr=%x allowed=%x", props->set_port_cap_mask, + props->clr_port_cap_mask, allowed_port_caps); + return -EINVAL; + } + + if (mutex_lock_interruptible(&shca->modify_mutex)) + return -ERESTARTSYS; + + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!rblock) { + ehca_err(&shca->ib_device, "Can't allocate rblock memory."); + ret = -ENOMEM; + goto modify_port1; + } + + hret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock); + if (hret != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't query port properties"); + ret = -EINVAL; + goto modify_port2; + } + + cap = (rblock->capability_mask | props->set_port_cap_mask) + & ~props->clr_port_cap_mask; + + hret = hipz_h_modify_port(shca->ipz_hca_handle, port, + cap, props->init_type, port_modify_mask); + if (hret != H_SUCCESS) { + ehca_err(&shca->ib_device, "Modify port failed h_ret=%lli", + hret); + ret = -EINVAL; + } + +modify_port2: + ehca_free_fw_ctrlblock(rblock); + +modify_port1: + mutex_unlock(&shca->modify_mutex); + + return ret; +} diff --git a/drivers/staging/rdma/ehca/ehca_irq.c b/drivers/staging/rdma/ehca/ehca_irq.c new file mode 100644 index 000000000..8615d7cf7 --- /dev/null +++ b/drivers/staging/rdma/ehca/ehca_irq.c @@ -0,0 +1,870 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Functions for EQs, NEQs and interrupts + * + * Authors: Heiko J Schick + * Khadija Souissi + * Hoang-Nam Nguyen + * Joachim Fenkes + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include + +#include "ehca_classes.h" +#include "ehca_irq.h" +#include "ehca_iverbs.h" +#include "ehca_tools.h" +#include "hcp_if.h" +#include "hipz_fns.h" +#include "ipz_pt_fn.h" + +#define EQE_COMPLETION_EVENT EHCA_BMASK_IBM( 1, 1) +#define EQE_CQ_QP_NUMBER EHCA_BMASK_IBM( 8, 31) +#define EQE_EE_IDENTIFIER EHCA_BMASK_IBM( 2, 7) +#define EQE_CQ_NUMBER EHCA_BMASK_IBM( 8, 31) +#define EQE_QP_NUMBER EHCA_BMASK_IBM( 8, 31) +#define EQE_QP_TOKEN EHCA_BMASK_IBM(32, 63) +#define EQE_CQ_TOKEN EHCA_BMASK_IBM(32, 63) + +#define NEQE_COMPLETION_EVENT EHCA_BMASK_IBM( 1, 1) +#define NEQE_EVENT_CODE EHCA_BMASK_IBM( 2, 7) +#define NEQE_PORT_NUMBER EHCA_BMASK_IBM( 8, 15) +#define NEQE_PORT_AVAILABILITY EHCA_BMASK_IBM(16, 16) +#define NEQE_DISRUPTIVE EHCA_BMASK_IBM(16, 16) +#define NEQE_SPECIFIC_EVENT EHCA_BMASK_IBM(16, 23) + +#define ERROR_DATA_LENGTH EHCA_BMASK_IBM(52, 63) +#define ERROR_DATA_TYPE EHCA_BMASK_IBM( 0, 7) + +static void queue_comp_task(struct ehca_cq *__cq); + +static struct ehca_comp_pool *pool; + +static inline void comp_event_callback(struct ehca_cq *cq) +{ + if (!cq->ib_cq.comp_handler) + return; + + spin_lock(&cq->cb_lock); + cq->ib_cq.comp_handler(&cq->ib_cq, cq->ib_cq.cq_context); + spin_unlock(&cq->cb_lock); + + return; +} + +static void print_error_data(struct ehca_shca *shca, void *data, + u64 *rblock, int length) +{ + u64 type = EHCA_BMASK_GET(ERROR_DATA_TYPE, rblock[2]); + u64 resource = rblock[1]; + + switch (type) { + case 0x1: /* Queue Pair */ + { + struct ehca_qp *qp = (struct ehca_qp *)data; + + /* only print error data if AER is set */ + if (rblock[6] == 0) + return; + + ehca_err(&shca->ib_device, + "QP 0x%x (resource=%llx) has errors.", + qp->ib_qp.qp_num, resource); + break; + } + case 0x4: /* Completion Queue */ + { + struct ehca_cq *cq = (struct ehca_cq *)data; + + ehca_err(&shca->ib_device, + "CQ 0x%x (resource=%llx) has errors.", + cq->cq_number, resource); + break; + } + default: + ehca_err(&shca->ib_device, + "Unknown error type: %llx on %s.", + type, shca->ib_device.name); + break; + } + + ehca_err(&shca->ib_device, "Error data is available: %llx.", resource); + ehca_err(&shca->ib_device, "EHCA ----- error data begin " + "---------------------------------------------------"); + ehca_dmp(rblock, length, "resource=%llx", resource); + ehca_err(&shca->ib_device, "EHCA ----- error data end " + "----------------------------------------------------"); + + return; +} + +int ehca_error_data(struct ehca_shca *shca, void *data, + u64 resource) +{ + + unsigned long ret; + u64 *rblock; + unsigned long block_count; + + rblock = ehca_alloc_fw_ctrlblock(GFP_ATOMIC); + if (!rblock) { + ehca_err(&shca->ib_device, "Cannot allocate rblock memory."); + ret = -ENOMEM; + goto error_data1; + } + + /* rblock must be 4K aligned and should be 4K large */ + ret = hipz_h_error_data(shca->ipz_hca_handle, + resource, + rblock, + &block_count); + + if (ret == H_R_STATE) + ehca_err(&shca->ib_device, + "No error data is available: %llx.", resource); + else if (ret == H_SUCCESS) { + int length; + + length = EHCA_BMASK_GET(ERROR_DATA_LENGTH, rblock[0]); + + if (length > EHCA_PAGESIZE) + length = EHCA_PAGESIZE; + + print_error_data(shca, data, rblock, length); + } else + ehca_err(&shca->ib_device, + "Error data could not be fetched: %llx", resource); + + ehca_free_fw_ctrlblock(rblock); + +error_data1: + return ret; + +} + +static void dispatch_qp_event(struct ehca_shca *shca, struct ehca_qp *qp, + enum ib_event_type event_type) +{ + struct ib_event event; + + /* PATH_MIG without the QP ever having been armed is false alarm */ + if (event_type == IB_EVENT_PATH_MIG && !qp->mig_armed) + return; + + event.device = &shca->ib_device; + event.event = event_type; + + if (qp->ext_type == EQPT_SRQ) { + if (!qp->ib_srq.event_handler) + return; + + event.element.srq = &qp->ib_srq; + qp->ib_srq.event_handler(&event, qp->ib_srq.srq_context); + } else { + if (!qp->ib_qp.event_handler) + return; + + event.element.qp = &qp->ib_qp; + qp->ib_qp.event_handler(&event, qp->ib_qp.qp_context); + } +} + +static void qp_event_callback(struct ehca_shca *shca, u64 eqe, + enum ib_event_type event_type, int fatal) +{ + struct ehca_qp *qp; + u32 token = EHCA_BMASK_GET(EQE_QP_TOKEN, eqe); + + read_lock(&ehca_qp_idr_lock); + qp = idr_find(&ehca_qp_idr, token); + if (qp) + atomic_inc(&qp->nr_events); + read_unlock(&ehca_qp_idr_lock); + + if (!qp) + return; + + if (fatal) + ehca_error_data(shca, qp, qp->ipz_qp_handle.handle); + + dispatch_qp_event(shca, qp, fatal && qp->ext_type == EQPT_SRQ ? + IB_EVENT_SRQ_ERR : event_type); + + /* + * eHCA only processes one WQE at a time for SRQ base QPs, + * so the last WQE has been processed as soon as the QP enters + * error state. + */ + if (fatal && qp->ext_type == EQPT_SRQBASE) + dispatch_qp_event(shca, qp, IB_EVENT_QP_LAST_WQE_REACHED); + + if (atomic_dec_and_test(&qp->nr_events)) + wake_up(&qp->wait_completion); + return; +} + +static void cq_event_callback(struct ehca_shca *shca, + u64 eqe) +{ + struct ehca_cq *cq; + u32 token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe); + + read_lock(&ehca_cq_idr_lock); + cq = idr_find(&ehca_cq_idr, token); + if (cq) + atomic_inc(&cq->nr_events); + read_unlock(&ehca_cq_idr_lock); + + if (!cq) + return; + + ehca_error_data(shca, cq, cq->ipz_cq_handle.handle); + + if (atomic_dec_and_test(&cq->nr_events)) + wake_up(&cq->wait_completion); + + return; +} + +static void parse_identifier(struct ehca_shca *shca, u64 eqe) +{ + u8 identifier = EHCA_BMASK_GET(EQE_EE_IDENTIFIER, eqe); + + switch (identifier) { + case 0x02: /* path migrated */ + qp_event_callback(shca, eqe, IB_EVENT_PATH_MIG, 0); + break; + case 0x03: /* communication established */ + qp_event_callback(shca, eqe, IB_EVENT_COMM_EST, 0); + break; + case 0x04: /* send queue drained */ + qp_event_callback(shca, eqe, IB_EVENT_SQ_DRAINED, 0); + break; + case 0x05: /* QP error */ + case 0x06: /* QP error */ + qp_event_callback(shca, eqe, IB_EVENT_QP_FATAL, 1); + break; + case 0x07: /* CQ error */ + case 0x08: /* CQ error */ + cq_event_callback(shca, eqe); + break; + case 0x09: /* MRMWPTE error */ + ehca_err(&shca->ib_device, "MRMWPTE error."); + break; + case 0x0A: /* port event */ + ehca_err(&shca->ib_device, "Port event."); + break; + case 0x0B: /* MR access error */ + ehca_err(&shca->ib_device, "MR access error."); + break; + case 0x0C: /* EQ error */ + ehca_err(&shca->ib_device, "EQ error."); + break; + case 0x0D: /* P/Q_Key mismatch */ + ehca_err(&shca->ib_device, "P/Q_Key mismatch."); + break; + case 0x10: /* sampling complete */ + ehca_err(&shca->ib_device, "Sampling complete."); + break; + case 0x11: /* unaffiliated access error */ + ehca_err(&shca->ib_device, "Unaffiliated access error."); + break; + case 0x12: /* path migrating */ + ehca_err(&shca->ib_device, "Path migrating."); + break; + case 0x13: /* interface trace stopped */ + ehca_err(&shca->ib_device, "Interface trace stopped."); + break; + case 0x14: /* first error capture info available */ + ehca_info(&shca->ib_device, "First error capture available"); + break; + case 0x15: /* SRQ limit reached */ + qp_event_callback(shca, eqe, IB_EVENT_SRQ_LIMIT_REACHED, 0); + break; + default: + ehca_err(&shca->ib_device, "Unknown identifier: %x on %s.", + identifier, shca->ib_device.name); + break; + } + + return; +} + +static void dispatch_port_event(struct ehca_shca *shca, int port_num, + enum ib_event_type type, const char *msg) +{ + struct ib_event event; + + ehca_info(&shca->ib_device, "port %d %s.", port_num, msg); + event.device = &shca->ib_device; + event.event = type; + event.element.port_num = port_num; + ib_dispatch_event(&event); +} + +static void notify_port_conf_change(struct ehca_shca *shca, int port_num) +{ + struct ehca_sma_attr new_attr; + struct ehca_sma_attr *old_attr = &shca->sport[port_num - 1].saved_attr; + + ehca_query_sma_attr(shca, port_num, &new_attr); + + if (new_attr.sm_sl != old_attr->sm_sl || + new_attr.sm_lid != old_attr->sm_lid) + dispatch_port_event(shca, port_num, IB_EVENT_SM_CHANGE, + "SM changed"); + + if (new_attr.lid != old_attr->lid || + new_attr.lmc != old_attr->lmc) + dispatch_port_event(shca, port_num, IB_EVENT_LID_CHANGE, + "LID changed"); + + if (new_attr.pkey_tbl_len != old_attr->pkey_tbl_len || + memcmp(new_attr.pkeys, old_attr->pkeys, + sizeof(u16) * new_attr.pkey_tbl_len)) + dispatch_port_event(shca, port_num, IB_EVENT_PKEY_CHANGE, + "P_Key changed"); + + *old_attr = new_attr; +} + +/* replay modify_qp for sqps -- return 0 if all is well, 1 if AQP1 destroyed */ +static int replay_modify_qp(struct ehca_sport *sport) +{ + int aqp1_destroyed; + unsigned long flags; + + spin_lock_irqsave(&sport->mod_sqp_lock, flags); + + aqp1_destroyed = !sport->ibqp_sqp[IB_QPT_GSI]; + + if (sport->ibqp_sqp[IB_QPT_SMI]) + ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_SMI]); + if (!aqp1_destroyed) + ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_GSI]); + + spin_unlock_irqrestore(&sport->mod_sqp_lock, flags); + + return aqp1_destroyed; +} + +static void parse_ec(struct ehca_shca *shca, u64 eqe) +{ + u8 ec = EHCA_BMASK_GET(NEQE_EVENT_CODE, eqe); + u8 port = EHCA_BMASK_GET(NEQE_PORT_NUMBER, eqe); + u8 spec_event; + struct ehca_sport *sport = &shca->sport[port - 1]; + + switch (ec) { + case 0x30: /* port availability change */ + if (EHCA_BMASK_GET(NEQE_PORT_AVAILABILITY, eqe)) { + /* only replay modify_qp calls in autodetect mode; + * if AQP1 was destroyed, the port is already down + * again and we can drop the event. + */ + if (ehca_nr_ports < 0) + if (replay_modify_qp(sport)) + break; + + sport->port_state = IB_PORT_ACTIVE; + dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE, + "is active"); + ehca_query_sma_attr(shca, port, &sport->saved_attr); + } else { + sport->port_state = IB_PORT_DOWN; + dispatch_port_event(shca, port, IB_EVENT_PORT_ERR, + "is inactive"); + } + break; + case 0x31: + /* port configuration change + * disruptive change is caused by + * LID, PKEY or SM change + */ + if (EHCA_BMASK_GET(NEQE_DISRUPTIVE, eqe)) { + ehca_warn(&shca->ib_device, "disruptive port " + "%d configuration change", port); + + sport->port_state = IB_PORT_DOWN; + dispatch_port_event(shca, port, IB_EVENT_PORT_ERR, + "is inactive"); + + sport->port_state = IB_PORT_ACTIVE; + dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE, + "is active"); + ehca_query_sma_attr(shca, port, + &sport->saved_attr); + } else + notify_port_conf_change(shca, port); + break; + case 0x32: /* adapter malfunction */ + ehca_err(&shca->ib_device, "Adapter malfunction."); + break; + case 0x33: /* trace stopped */ + ehca_err(&shca->ib_device, "Traced stopped."); + break; + case 0x34: /* util async event */ + spec_event = EHCA_BMASK_GET(NEQE_SPECIFIC_EVENT, eqe); + if (spec_event == 0x80) /* client reregister required */ + dispatch_port_event(shca, port, + IB_EVENT_CLIENT_REREGISTER, + "client reregister req."); + else + ehca_warn(&shca->ib_device, "Unknown util async " + "event %x on port %x", spec_event, port); + break; + default: + ehca_err(&shca->ib_device, "Unknown event code: %x on %s.", + ec, shca->ib_device.name); + break; + } + + return; +} + +static inline void reset_eq_pending(struct ehca_cq *cq) +{ + u64 CQx_EP; + struct h_galpa gal = cq->galpas.kernel; + + hipz_galpa_store_cq(gal, cqx_ep, 0x0); + CQx_EP = hipz_galpa_load(gal, CQTEMM_OFFSET(cqx_ep)); + + return; +} + +irqreturn_t ehca_interrupt_neq(int irq, void *dev_id) +{ + struct ehca_shca *shca = (struct ehca_shca*)dev_id; + + tasklet_hi_schedule(&shca->neq.interrupt_task); + + return IRQ_HANDLED; +} + +void ehca_tasklet_neq(unsigned long data) +{ + struct ehca_shca *shca = (struct ehca_shca*)data; + struct ehca_eqe *eqe; + u64 ret; + + eqe = ehca_poll_eq(shca, &shca->neq); + + while (eqe) { + if (!EHCA_BMASK_GET(NEQE_COMPLETION_EVENT, eqe->entry)) + parse_ec(shca, eqe->entry); + + eqe = ehca_poll_eq(shca, &shca->neq); + } + + ret = hipz_h_reset_event(shca->ipz_hca_handle, + shca->neq.ipz_eq_handle, 0xFFFFFFFFFFFFFFFFL); + + if (ret != H_SUCCESS) + ehca_err(&shca->ib_device, "Can't clear notification events."); + + return; +} + +irqreturn_t ehca_interrupt_eq(int irq, void *dev_id) +{ + struct ehca_shca *shca = (struct ehca_shca*)dev_id; + + tasklet_hi_schedule(&shca->eq.interrupt_task); + + return IRQ_HANDLED; +} + + +static inline void process_eqe(struct ehca_shca *shca, struct ehca_eqe *eqe) +{ + u64 eqe_value; + u32 token; + struct ehca_cq *cq; + + eqe_value = eqe->entry; + ehca_dbg(&shca->ib_device, "eqe_value=%llx", eqe_value); + if (EHCA_BMASK_GET(EQE_COMPLETION_EVENT, eqe_value)) { + ehca_dbg(&shca->ib_device, "Got completion event"); + token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe_value); + read_lock(&ehca_cq_idr_lock); + cq = idr_find(&ehca_cq_idr, token); + if (cq) + atomic_inc(&cq->nr_events); + read_unlock(&ehca_cq_idr_lock); + if (cq == NULL) { + ehca_err(&shca->ib_device, + "Invalid eqe for non-existing cq token=%x", + token); + return; + } + reset_eq_pending(cq); + if (ehca_scaling_code) + queue_comp_task(cq); + else { + comp_event_callback(cq); + if (atomic_dec_and_test(&cq->nr_events)) + wake_up(&cq->wait_completion); + } + } else { + ehca_dbg(&shca->ib_device, "Got non completion event"); + parse_identifier(shca, eqe_value); + } +} + +void ehca_process_eq(struct ehca_shca *shca, int is_irq) +{ + struct ehca_eq *eq = &shca->eq; + struct ehca_eqe_cache_entry *eqe_cache = eq->eqe_cache; + u64 eqe_value, ret; + int eqe_cnt, i; + int eq_empty = 0; + + spin_lock(&eq->irq_spinlock); + if (is_irq) { + const int max_query_cnt = 100; + int query_cnt = 0; + int int_state = 1; + do { + int_state = hipz_h_query_int_state( + shca->ipz_hca_handle, eq->ist); + query_cnt++; + iosync(); + } while (int_state && query_cnt < max_query_cnt); + if (unlikely((query_cnt == max_query_cnt))) + ehca_dbg(&shca->ib_device, "int_state=%x query_cnt=%x", + int_state, query_cnt); + } + + /* read out all eqes */ + eqe_cnt = 0; + do { + u32 token; + eqe_cache[eqe_cnt].eqe = ehca_poll_eq(shca, eq); + if (!eqe_cache[eqe_cnt].eqe) + break; + eqe_value = eqe_cache[eqe_cnt].eqe->entry; + if (EHCA_BMASK_GET(EQE_COMPLETION_EVENT, eqe_value)) { + token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe_value); + read_lock(&ehca_cq_idr_lock); + eqe_cache[eqe_cnt].cq = idr_find(&ehca_cq_idr, token); + if (eqe_cache[eqe_cnt].cq) + atomic_inc(&eqe_cache[eqe_cnt].cq->nr_events); + read_unlock(&ehca_cq_idr_lock); + if (!eqe_cache[eqe_cnt].cq) { + ehca_err(&shca->ib_device, + "Invalid eqe for non-existing cq " + "token=%x", token); + continue; + } + } else + eqe_cache[eqe_cnt].cq = NULL; + eqe_cnt++; + } while (eqe_cnt < EHCA_EQE_CACHE_SIZE); + if (!eqe_cnt) { + if (is_irq) + ehca_dbg(&shca->ib_device, + "No eqe found for irq event"); + goto unlock_irq_spinlock; + } else if (!is_irq) { + ret = hipz_h_eoi(eq->ist); + if (ret != H_SUCCESS) + ehca_err(&shca->ib_device, + "bad return code EOI -rc = %lld\n", ret); + ehca_dbg(&shca->ib_device, "deadman found %x eqe", eqe_cnt); + } + if (unlikely(eqe_cnt == EHCA_EQE_CACHE_SIZE)) + ehca_dbg(&shca->ib_device, "too many eqes for one irq event"); + /* enable irq for new packets */ + for (i = 0; i < eqe_cnt; i++) { + if (eq->eqe_cache[i].cq) + reset_eq_pending(eq->eqe_cache[i].cq); + } + /* check eq */ + spin_lock(&eq->spinlock); + eq_empty = (!ipz_eqit_eq_peek_valid(&shca->eq.ipz_queue)); + spin_unlock(&eq->spinlock); + /* call completion handler for cached eqes */ + for (i = 0; i < eqe_cnt; i++) + if (eq->eqe_cache[i].cq) { + if (ehca_scaling_code) + queue_comp_task(eq->eqe_cache[i].cq); + else { + struct ehca_cq *cq = eq->eqe_cache[i].cq; + comp_event_callback(cq); + if (atomic_dec_and_test(&cq->nr_events)) + wake_up(&cq->wait_completion); + } + } else { + ehca_dbg(&shca->ib_device, "Got non completion event"); + parse_identifier(shca, eq->eqe_cache[i].eqe->entry); + } + /* poll eq if not empty */ + if (eq_empty) + goto unlock_irq_spinlock; + do { + struct ehca_eqe *eqe; + eqe = ehca_poll_eq(shca, &shca->eq); + if (!eqe) + break; + process_eqe(shca, eqe); + } while (1); + +unlock_irq_spinlock: + spin_unlock(&eq->irq_spinlock); +} + +void ehca_tasklet_eq(unsigned long data) +{ + ehca_process_eq((struct ehca_shca*)data, 1); +} + +static int find_next_online_cpu(struct ehca_comp_pool *pool) +{ + int cpu; + unsigned long flags; + + WARN_ON_ONCE(!in_interrupt()); + if (ehca_debug_level >= 3) + ehca_dmp(cpu_online_mask, cpumask_size(), ""); + + spin_lock_irqsave(&pool->last_cpu_lock, flags); + do { + cpu = cpumask_next(pool->last_cpu, cpu_online_mask); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(cpu_online_mask); + pool->last_cpu = cpu; + } while (!per_cpu_ptr(pool->cpu_comp_tasks, cpu)->active); + spin_unlock_irqrestore(&pool->last_cpu_lock, flags); + + return cpu; +} + +static void __queue_comp_task(struct ehca_cq *__cq, + struct ehca_cpu_comp_task *cct, + struct task_struct *thread) +{ + unsigned long flags; + + spin_lock_irqsave(&cct->task_lock, flags); + spin_lock(&__cq->task_lock); + + if (__cq->nr_callbacks == 0) { + __cq->nr_callbacks++; + list_add_tail(&__cq->entry, &cct->cq_list); + cct->cq_jobs++; + wake_up_process(thread); + } else + __cq->nr_callbacks++; + + spin_unlock(&__cq->task_lock); + spin_unlock_irqrestore(&cct->task_lock, flags); +} + +static void queue_comp_task(struct ehca_cq *__cq) +{ + int cpu_id; + struct ehca_cpu_comp_task *cct; + struct task_struct *thread; + int cq_jobs; + unsigned long flags; + + cpu_id = find_next_online_cpu(pool); + BUG_ON(!cpu_online(cpu_id)); + + cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id); + thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id); + BUG_ON(!cct || !thread); + + spin_lock_irqsave(&cct->task_lock, flags); + cq_jobs = cct->cq_jobs; + spin_unlock_irqrestore(&cct->task_lock, flags); + if (cq_jobs > 0) { + cpu_id = find_next_online_cpu(pool); + cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id); + thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id); + BUG_ON(!cct || !thread); + } + __queue_comp_task(__cq, cct, thread); +} + +static void run_comp_task(struct ehca_cpu_comp_task *cct) +{ + struct ehca_cq *cq; + + while (!list_empty(&cct->cq_list)) { + cq = list_entry(cct->cq_list.next, struct ehca_cq, entry); + spin_unlock_irq(&cct->task_lock); + + comp_event_callback(cq); + if (atomic_dec_and_test(&cq->nr_events)) + wake_up(&cq->wait_completion); + + spin_lock_irq(&cct->task_lock); + spin_lock(&cq->task_lock); + cq->nr_callbacks--; + if (!cq->nr_callbacks) { + list_del_init(cct->cq_list.next); + cct->cq_jobs--; + } + spin_unlock(&cq->task_lock); + } +} + +static void comp_task_park(unsigned int cpu) +{ + struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); + struct ehca_cpu_comp_task *target; + struct task_struct *thread; + struct ehca_cq *cq, *tmp; + LIST_HEAD(list); + + spin_lock_irq(&cct->task_lock); + cct->cq_jobs = 0; + cct->active = 0; + list_splice_init(&cct->cq_list, &list); + spin_unlock_irq(&cct->task_lock); + + cpu = find_next_online_cpu(pool); + target = per_cpu_ptr(pool->cpu_comp_tasks, cpu); + thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu); + spin_lock_irq(&target->task_lock); + list_for_each_entry_safe(cq, tmp, &list, entry) { + list_del(&cq->entry); + __queue_comp_task(cq, target, thread); + } + spin_unlock_irq(&target->task_lock); +} + +static void comp_task_stop(unsigned int cpu, bool online) +{ + struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); + + spin_lock_irq(&cct->task_lock); + cct->cq_jobs = 0; + cct->active = 0; + WARN_ON(!list_empty(&cct->cq_list)); + spin_unlock_irq(&cct->task_lock); +} + +static int comp_task_should_run(unsigned int cpu) +{ + struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); + + return cct->cq_jobs; +} + +static void comp_task(unsigned int cpu) +{ + struct ehca_cpu_comp_task *cct = this_cpu_ptr(pool->cpu_comp_tasks); + int cql_empty; + + spin_lock_irq(&cct->task_lock); + cql_empty = list_empty(&cct->cq_list); + if (!cql_empty) { + __set_current_state(TASK_RUNNING); + run_comp_task(cct); + } + spin_unlock_irq(&cct->task_lock); +} + +static struct smp_hotplug_thread comp_pool_threads = { + .thread_should_run = comp_task_should_run, + .thread_fn = comp_task, + .thread_comm = "ehca_comp/%u", + .cleanup = comp_task_stop, + .park = comp_task_park, +}; + +int ehca_create_comp_pool(void) +{ + int cpu, ret = -ENOMEM; + + if (!ehca_scaling_code) + return 0; + + pool = kzalloc(sizeof(struct ehca_comp_pool), GFP_KERNEL); + if (pool == NULL) + return -ENOMEM; + + spin_lock_init(&pool->last_cpu_lock); + pool->last_cpu = cpumask_any(cpu_online_mask); + + pool->cpu_comp_tasks = alloc_percpu(struct ehca_cpu_comp_task); + if (!pool->cpu_comp_tasks) + goto out_pool; + + pool->cpu_comp_threads = alloc_percpu(struct task_struct *); + if (!pool->cpu_comp_threads) + goto out_tasks; + + for_each_present_cpu(cpu) { + struct ehca_cpu_comp_task *cct; + + cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu); + spin_lock_init(&cct->task_lock); + INIT_LIST_HEAD(&cct->cq_list); + } + + comp_pool_threads.store = pool->cpu_comp_threads; + ret = smpboot_register_percpu_thread(&comp_pool_threads); + if (ret) + goto out_threads; + + pr_info("eHCA scaling code enabled\n"); + return ret; + +out_threads: + free_percpu(pool->cpu_comp_threads); +out_tasks: + free_percpu(pool->cpu_comp_tasks); +out_pool: + kfree(pool); + return ret; +} + +void ehca_destroy_comp_pool(void) +{ + if (!ehca_scaling_code) + return; + + smpboot_unregister_percpu_thread(&comp_pool_threads); + + free_percpu(pool->cpu_comp_threads); + free_percpu(pool->cpu_comp_tasks); + kfree(pool); +} diff --git a/drivers/staging/rdma/ehca/ehca_irq.h b/drivers/staging/rdma/ehca/ehca_irq.h new file mode 100644 index 000000000..5370199f0 --- /dev/null +++ b/drivers/staging/rdma/ehca/ehca_irq.h @@ -0,0 +1,77 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Function definitions and structs for EQs, NEQs and interrupts + * + * Authors: Heiko J Schick + * Khadija Souissi + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __EHCA_IRQ_H +#define __EHCA_IRQ_H + + +struct ehca_shca; + +#include +#include + +int ehca_error_data(struct ehca_shca *shca, void *data, u64 resource); + +irqreturn_t ehca_interrupt_neq(int irq, void *dev_id); +void ehca_tasklet_neq(unsigned long data); + +irqreturn_t ehca_interrupt_eq(int irq, void *dev_id); +void ehca_tasklet_eq(unsigned long data); +void ehca_process_eq(struct ehca_shca *shca, int is_irq); + +struct ehca_cpu_comp_task { + struct list_head cq_list; + spinlock_t task_lock; + int cq_jobs; + int active; +}; + +struct ehca_comp_pool { + struct ehca_cpu_comp_task __percpu *cpu_comp_tasks; + struct task_struct * __percpu *cpu_comp_threads; + int last_cpu; + spinlock_t last_cpu_lock; +}; + +int ehca_create_comp_pool(void); +void ehca_destroy_comp_pool(void); + +#endif diff --git a/drivers/staging/rdma/ehca/ehca_iverbs.h b/drivers/staging/rdma/ehca/ehca_iverbs.h new file mode 100644 index 000000000..80e6a3d5d --- /dev/null +++ b/drivers/staging/rdma/ehca/ehca_iverbs.h @@ -0,0 +1,218 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Function definitions for internal functions + * + * Authors: Heiko J Schick + * Dietmar Decker + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __EHCA_IVERBS_H__ +#define __EHCA_IVERBS_H__ + +#include "ehca_classes.h" + +int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props, + struct ib_udata *uhw); + +int ehca_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); + +enum rdma_protocol_type +ehca_query_protocol(struct ib_device *device, u8 port_num); + +int ehca_query_sma_attr(struct ehca_shca *shca, u8 port, + struct ehca_sma_attr *attr); + +int ehca_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 * pkey); + +int ehca_query_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid); + +int ehca_modify_port(struct ib_device *ibdev, u8 port, int port_modify_mask, + struct ib_port_modify *props); + +struct ib_pd *ehca_alloc_pd(struct ib_device *device, + struct ib_ucontext *context, + struct ib_udata *udata); + +int ehca_dealloc_pd(struct ib_pd *pd); + +struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); + +int ehca_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr); + +int ehca_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr); + +int ehca_destroy_ah(struct ib_ah *ah); + +struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags); + +struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + int mr_access_flags, u64 *iova_start); + +struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int mr_access_flags, + struct ib_udata *udata); + +int ehca_rereg_phys_mr(struct ib_mr *mr, + int mr_rereg_mask, + struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, int mr_access_flags, u64 *iova_start); + +int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr); + +int ehca_dereg_mr(struct ib_mr *mr); + +struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); + +int ehca_bind_mw(struct ib_qp *qp, struct ib_mw *mw, + struct ib_mw_bind *mw_bind); + +int ehca_dealloc_mw(struct ib_mw *mw); + +struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd, + int mr_access_flags, + struct ib_fmr_attr *fmr_attr); + +int ehca_map_phys_fmr(struct ib_fmr *fmr, + u64 *page_list, int list_len, u64 iova); + +int ehca_unmap_fmr(struct list_head *fmr_list); + +int ehca_dealloc_fmr(struct ib_fmr *fmr); + +enum ehca_eq_type { + EHCA_EQ = 0, /* Event Queue */ + EHCA_NEQ /* Notification Event Queue */ +}; + +int ehca_create_eq(struct ehca_shca *shca, struct ehca_eq *eq, + enum ehca_eq_type type, const u32 length); + +int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq); + +void *ehca_poll_eq(struct ehca_shca *shca, struct ehca_eq *eq); + + +struct ib_cq *ehca_create_cq(struct ib_device *device, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata); + +int ehca_destroy_cq(struct ib_cq *cq); + +int ehca_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata); + +int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc); + +int ehca_peek_cq(struct ib_cq *cq, int wc_cnt); + +int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags notify_flags); + +struct ib_qp *ehca_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); + +int ehca_destroy_qp(struct ib_qp *qp); + +int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, + struct ib_udata *udata); + +int ehca_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); + +int ehca_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr, + struct ib_send_wr **bad_send_wr); + +int ehca_post_recv(struct ib_qp *qp, struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr); + +int ehca_post_srq_recv(struct ib_srq *srq, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr); + +struct ib_srq *ehca_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata); + +int ehca_modify_srq(struct ib_srq *srq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); + +int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); + +int ehca_destroy_srq(struct ib_srq *srq); + +u64 ehca_define_sqp(struct ehca_shca *shca, struct ehca_qp *ibqp, + struct ib_qp_init_attr *qp_init_attr); + +int ehca_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); + +int ehca_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); + +struct ib_ucontext *ehca_alloc_ucontext(struct ib_device *device, + struct ib_udata *udata); + +int ehca_dealloc_ucontext(struct ib_ucontext *context); + +int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); + +int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index); + +void ehca_poll_eqs(unsigned long data); + +int ehca_calc_ipd(struct ehca_shca *shca, int port, + enum ib_rate path_rate, u32 *ipd); + +void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq); + +#ifdef CONFIG_PPC_64K_PAGES +void *ehca_alloc_fw_ctrlblock(gfp_t flags); +void ehca_free_fw_ctrlblock(void *ptr); +#else +#define ehca_alloc_fw_ctrlblock(flags) ((void *)get_zeroed_page(flags)) +#define ehca_free_fw_ctrlblock(ptr) free_page((unsigned long)(ptr)) +#endif + +void ehca_recover_sqp(struct ib_qp *sqp); + +#endif diff --git a/drivers/staging/rdma/ehca/ehca_main.c b/drivers/staging/rdma/ehca/ehca_main.c new file mode 100644 index 000000000..8246418cd --- /dev/null +++ b/drivers/staging/rdma/ehca/ehca_main.c @@ -0,0 +1,1123 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * module start stop, hca detection + * + * Authors: Heiko J Schick + * Hoang-Nam Nguyen + * Joachim Fenkes + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef CONFIG_PPC_64K_PAGES +#include +#endif + +#include +#include +#include +#include "ehca_classes.h" +#include "ehca_iverbs.h" +#include "ehca_mrmw.h" +#include "ehca_tools.h" +#include "hcp_if.h" + +#define HCAD_VERSION "0029" + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Christoph Raisch "); +MODULE_DESCRIPTION("IBM eServer HCA InfiniBand Device Driver"); +MODULE_VERSION(HCAD_VERSION); + +static bool ehca_open_aqp1 = 0; +static int ehca_hw_level = 0; +static bool ehca_poll_all_eqs = 1; + +int ehca_debug_level = 0; +int ehca_nr_ports = -1; +bool ehca_use_hp_mr = 0; +int ehca_port_act_time = 30; +int ehca_static_rate = -1; +bool ehca_scaling_code = 0; +int ehca_lock_hcalls = -1; +int ehca_max_cq = -1; +int ehca_max_qp = -1; + +module_param_named(open_aqp1, ehca_open_aqp1, bool, S_IRUGO); +module_param_named(debug_level, ehca_debug_level, int, S_IRUGO); +module_param_named(hw_level, ehca_hw_level, int, S_IRUGO); +module_param_named(nr_ports, ehca_nr_ports, int, S_IRUGO); +module_param_named(use_hp_mr, ehca_use_hp_mr, bool, S_IRUGO); +module_param_named(port_act_time, ehca_port_act_time, int, S_IRUGO); +module_param_named(poll_all_eqs, ehca_poll_all_eqs, bool, S_IRUGO); +module_param_named(static_rate, ehca_static_rate, int, S_IRUGO); +module_param_named(scaling_code, ehca_scaling_code, bool, S_IRUGO); +module_param_named(lock_hcalls, ehca_lock_hcalls, bint, S_IRUGO); +module_param_named(number_of_cqs, ehca_max_cq, int, S_IRUGO); +module_param_named(number_of_qps, ehca_max_qp, int, S_IRUGO); + +MODULE_PARM_DESC(open_aqp1, + "Open AQP1 on startup (default: no)"); +MODULE_PARM_DESC(debug_level, + "Amount of debug output (0: none (default), 1: traces, " + "2: some dumps, 3: lots)"); +MODULE_PARM_DESC(hw_level, + "Hardware level (0: autosensing (default), " + "0x10..0x14: eHCA, 0x20..0x23: eHCA2)"); +MODULE_PARM_DESC(nr_ports, + "number of connected ports (-1: autodetect (default), " + "1: port one only, 2: two ports)"); +MODULE_PARM_DESC(use_hp_mr, + "Use high performance MRs (default: no)"); +MODULE_PARM_DESC(port_act_time, + "Time to wait for port activation (default: 30 sec)"); +MODULE_PARM_DESC(poll_all_eqs, + "Poll all event queues periodically (default: yes)"); +MODULE_PARM_DESC(static_rate, + "Set permanent static rate (default: no static rate)"); +MODULE_PARM_DESC(scaling_code, + "Enable scaling code (default: no)"); +MODULE_PARM_DESC(lock_hcalls, + "Serialize all hCalls made by the driver " + "(default: autodetect)"); +MODULE_PARM_DESC(number_of_cqs, + "Max number of CQs which can be allocated " + "(default: autodetect)"); +MODULE_PARM_DESC(number_of_qps, + "Max number of QPs which can be allocated " + "(default: autodetect)"); + +DEFINE_RWLOCK(ehca_qp_idr_lock); +DEFINE_RWLOCK(ehca_cq_idr_lock); +DEFINE_IDR(ehca_qp_idr); +DEFINE_IDR(ehca_cq_idr); + +static LIST_HEAD(shca_list); /* list of all registered ehcas */ +DEFINE_SPINLOCK(shca_list_lock); + +static struct timer_list poll_eqs_timer; + +#ifdef CONFIG_PPC_64K_PAGES +static struct kmem_cache *ctblk_cache; + +void *ehca_alloc_fw_ctrlblock(gfp_t flags) +{ + void *ret = kmem_cache_zalloc(ctblk_cache, flags); + if (!ret) + ehca_gen_err("Out of memory for ctblk"); + return ret; +} + +void ehca_free_fw_ctrlblock(void *ptr) +{ + if (ptr) + kmem_cache_free(ctblk_cache, ptr); + +} +#endif + +int ehca2ib_return_code(u64 ehca_rc) +{ + switch (ehca_rc) { + case H_SUCCESS: + return 0; + case H_RESOURCE: /* Resource in use */ + case H_BUSY: + return -EBUSY; + case H_NOT_ENOUGH_RESOURCES: /* insufficient resources */ + case H_CONSTRAINED: /* resource constraint */ + case H_NO_MEM: + return -ENOMEM; + default: + return -EINVAL; + } +} + +static int ehca_create_slab_caches(void) +{ + int ret; + + ret = ehca_init_pd_cache(); + if (ret) { + ehca_gen_err("Cannot create PD SLAB cache."); + return ret; + } + + ret = ehca_init_cq_cache(); + if (ret) { + ehca_gen_err("Cannot create CQ SLAB cache."); + goto create_slab_caches2; + } + + ret = ehca_init_qp_cache(); + if (ret) { + ehca_gen_err("Cannot create QP SLAB cache."); + goto create_slab_caches3; + } + + ret = ehca_init_av_cache(); + if (ret) { + ehca_gen_err("Cannot create AV SLAB cache."); + goto create_slab_caches4; + } + + ret = ehca_init_mrmw_cache(); + if (ret) { + ehca_gen_err("Cannot create MR&MW SLAB cache."); + goto create_slab_caches5; + } + + ret = ehca_init_small_qp_cache(); + if (ret) { + ehca_gen_err("Cannot create small queue SLAB cache."); + goto create_slab_caches6; + } + +#ifdef CONFIG_PPC_64K_PAGES + ctblk_cache = kmem_cache_create("ehca_cache_ctblk", + EHCA_PAGESIZE, H_CB_ALIGNMENT, + SLAB_HWCACHE_ALIGN, + NULL); + if (!ctblk_cache) { + ehca_gen_err("Cannot create ctblk SLAB cache."); + ehca_cleanup_small_qp_cache(); + ret = -ENOMEM; + goto create_slab_caches6; + } +#endif + return 0; + +create_slab_caches6: + ehca_cleanup_mrmw_cache(); + +create_slab_caches5: + ehca_cleanup_av_cache(); + +create_slab_caches4: + ehca_cleanup_qp_cache(); + +create_slab_caches3: + ehca_cleanup_cq_cache(); + +create_slab_caches2: + ehca_cleanup_pd_cache(); + + return ret; +} + +static void ehca_destroy_slab_caches(void) +{ + ehca_cleanup_small_qp_cache(); + ehca_cleanup_mrmw_cache(); + ehca_cleanup_av_cache(); + ehca_cleanup_qp_cache(); + ehca_cleanup_cq_cache(); + ehca_cleanup_pd_cache(); +#ifdef CONFIG_PPC_64K_PAGES + if (ctblk_cache) + kmem_cache_destroy(ctblk_cache); +#endif +} + +#define EHCA_HCAAVER EHCA_BMASK_IBM(32, 39) +#define EHCA_REVID EHCA_BMASK_IBM(40, 63) + +static struct cap_descr { + u64 mask; + char *descr; +} hca_cap_descr[] = { + { HCA_CAP_AH_PORT_NR_CHECK, "HCA_CAP_AH_PORT_NR_CHECK" }, + { HCA_CAP_ATOMIC, "HCA_CAP_ATOMIC" }, + { HCA_CAP_AUTO_PATH_MIG, "HCA_CAP_AUTO_PATH_MIG" }, + { HCA_CAP_BAD_P_KEY_CTR, "HCA_CAP_BAD_P_KEY_CTR" }, + { HCA_CAP_SQD_RTS_PORT_CHANGE, "HCA_CAP_SQD_RTS_PORT_CHANGE" }, + { HCA_CAP_CUR_QP_STATE_MOD, "HCA_CAP_CUR_QP_STATE_MOD" }, + { HCA_CAP_INIT_TYPE, "HCA_CAP_INIT_TYPE" }, + { HCA_CAP_PORT_ACTIVE_EVENT, "HCA_CAP_PORT_ACTIVE_EVENT" }, + { HCA_CAP_Q_KEY_VIOL_CTR, "HCA_CAP_Q_KEY_VIOL_CTR" }, + { HCA_CAP_WQE_RESIZE, "HCA_CAP_WQE_RESIZE" }, + { HCA_CAP_RAW_PACKET_MCAST, "HCA_CAP_RAW_PACKET_MCAST" }, + { HCA_CAP_SHUTDOWN_PORT, "HCA_CAP_SHUTDOWN_PORT" }, + { HCA_CAP_RC_LL_QP, "HCA_CAP_RC_LL_QP" }, + { HCA_CAP_SRQ, "HCA_CAP_SRQ" }, + { HCA_CAP_UD_LL_QP, "HCA_CAP_UD_LL_QP" }, + { HCA_CAP_RESIZE_MR, "HCA_CAP_RESIZE_MR" }, + { HCA_CAP_MINI_QP, "HCA_CAP_MINI_QP" }, + { HCA_CAP_H_ALLOC_RES_SYNC, "HCA_CAP_H_ALLOC_RES_SYNC" }, +}; + +static int ehca_sense_attributes(struct ehca_shca *shca) +{ + int i, ret = 0; + u64 h_ret; + struct hipz_query_hca *rblock; + struct hipz_query_port *port; + const char *loc_code; + + static const u32 pgsize_map[] = { + HCA_CAP_MR_PGSIZE_4K, 0x1000, + HCA_CAP_MR_PGSIZE_64K, 0x10000, + HCA_CAP_MR_PGSIZE_1M, 0x100000, + HCA_CAP_MR_PGSIZE_16M, 0x1000000, + }; + + ehca_gen_dbg("Probing adapter %s...", + shca->ofdev->dev.of_node->full_name); + loc_code = of_get_property(shca->ofdev->dev.of_node, "ibm,loc-code", + NULL); + if (loc_code) + ehca_gen_dbg(" ... location lode=%s", loc_code); + + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!rblock) { + ehca_gen_err("Cannot allocate rblock memory."); + return -ENOMEM; + } + + h_ret = hipz_h_query_hca(shca->ipz_hca_handle, rblock); + if (h_ret != H_SUCCESS) { + ehca_gen_err("Cannot query device properties. h_ret=%lli", + h_ret); + ret = -EPERM; + goto sense_attributes1; + } + + if (ehca_nr_ports == 1) + shca->num_ports = 1; + else + shca->num_ports = (u8)rblock->num_ports; + + ehca_gen_dbg(" ... found %x ports", rblock->num_ports); + + if (ehca_hw_level == 0) { + u32 hcaaver; + u32 revid; + + hcaaver = EHCA_BMASK_GET(EHCA_HCAAVER, rblock->hw_ver); + revid = EHCA_BMASK_GET(EHCA_REVID, rblock->hw_ver); + + ehca_gen_dbg(" ... hardware version=%x:%x", hcaaver, revid); + + if (hcaaver == 1) { + if (revid <= 3) + shca->hw_level = 0x10 | (revid + 1); + else + shca->hw_level = 0x14; + } else if (hcaaver == 2) { + if (revid == 0) + shca->hw_level = 0x21; + else if (revid == 0x10) + shca->hw_level = 0x22; + else if (revid == 0x20 || revid == 0x21) + shca->hw_level = 0x23; + } + + if (!shca->hw_level) { + ehca_gen_warn("unknown hardware version" + " - assuming default level"); + shca->hw_level = 0x22; + } + } else + shca->hw_level = ehca_hw_level; + ehca_gen_dbg(" ... hardware level=%x", shca->hw_level); + + shca->hca_cap = rblock->hca_cap_indicators; + ehca_gen_dbg(" ... HCA capabilities:"); + for (i = 0; i < ARRAY_SIZE(hca_cap_descr); i++) + if (EHCA_BMASK_GET(hca_cap_descr[i].mask, shca->hca_cap)) + ehca_gen_dbg(" %s", hca_cap_descr[i].descr); + + /* Autodetect hCall locking -- the "H_ALLOC_RESOURCE synced" flag is + * a firmware property, so it's valid across all adapters + */ + if (ehca_lock_hcalls == -1) + ehca_lock_hcalls = !EHCA_BMASK_GET(HCA_CAP_H_ALLOC_RES_SYNC, + shca->hca_cap); + + /* translate supported MR page sizes; always support 4K */ + shca->hca_cap_mr_pgsize = EHCA_PAGESIZE; + for (i = 0; i < ARRAY_SIZE(pgsize_map); i += 2) + if (rblock->memory_page_size_supported & pgsize_map[i]) + shca->hca_cap_mr_pgsize |= pgsize_map[i + 1]; + + /* Set maximum number of CQs and QPs to calculate EQ size */ + if (shca->max_num_qps == -1) + shca->max_num_qps = min_t(int, rblock->max_qp, + EHCA_MAX_NUM_QUEUES); + else if (shca->max_num_qps < 1 || shca->max_num_qps > rblock->max_qp) { + ehca_gen_warn("The requested number of QPs is out of range " + "(1 - %i) specified by HW. Value is set to %i", + rblock->max_qp, rblock->max_qp); + shca->max_num_qps = rblock->max_qp; + } + + if (shca->max_num_cqs == -1) + shca->max_num_cqs = min_t(int, rblock->max_cq, + EHCA_MAX_NUM_QUEUES); + else if (shca->max_num_cqs < 1 || shca->max_num_cqs > rblock->max_cq) { + ehca_gen_warn("The requested number of CQs is out of range " + "(1 - %i) specified by HW. Value is set to %i", + rblock->max_cq, rblock->max_cq); + } + + /* query max MTU from first port -- it's the same for all ports */ + port = (struct hipz_query_port *)rblock; + h_ret = hipz_h_query_port(shca->ipz_hca_handle, 1, port); + if (h_ret != H_SUCCESS) { + ehca_gen_err("Cannot query port properties. h_ret=%lli", + h_ret); + ret = -EPERM; + goto sense_attributes1; + } + + shca->max_mtu = port->max_mtu; + +sense_attributes1: + ehca_free_fw_ctrlblock(rblock); + return ret; +} + +static int init_node_guid(struct ehca_shca *shca) +{ + int ret = 0; + struct hipz_query_hca *rblock; + + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!rblock) { + ehca_err(&shca->ib_device, "Can't allocate rblock memory."); + return -ENOMEM; + } + + if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) { + ehca_err(&shca->ib_device, "Can't query device properties"); + ret = -EINVAL; + goto init_node_guid1; + } + + memcpy(&shca->ib_device.node_guid, &rblock->node_guid, sizeof(u64)); + +init_node_guid1: + ehca_free_fw_ctrlblock(rblock); + return ret; +} + +static int ehca_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + err = ehca_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB; + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + + return 0; +} + +static int ehca_init_device(struct ehca_shca *shca) +{ + int ret; + + ret = init_node_guid(shca); + if (ret) + return ret; + + strlcpy(shca->ib_device.name, "ehca%d", IB_DEVICE_NAME_MAX); + shca->ib_device.owner = THIS_MODULE; + + shca->ib_device.uverbs_abi_ver = 8; + shca->ib_device.uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST); + + shca->ib_device.node_type = RDMA_NODE_IB_CA; + shca->ib_device.phys_port_cnt = shca->num_ports; + shca->ib_device.num_comp_vectors = 1; + shca->ib_device.dma_device = &shca->ofdev->dev; + shca->ib_device.query_device = ehca_query_device; + shca->ib_device.query_port = ehca_query_port; + shca->ib_device.query_gid = ehca_query_gid; + shca->ib_device.query_pkey = ehca_query_pkey; + /* shca->in_device.modify_device = ehca_modify_device */ + shca->ib_device.modify_port = ehca_modify_port; + shca->ib_device.alloc_ucontext = ehca_alloc_ucontext; + shca->ib_device.dealloc_ucontext = ehca_dealloc_ucontext; + shca->ib_device.alloc_pd = ehca_alloc_pd; + shca->ib_device.dealloc_pd = ehca_dealloc_pd; + shca->ib_device.create_ah = ehca_create_ah; + /* shca->ib_device.modify_ah = ehca_modify_ah; */ + shca->ib_device.query_ah = ehca_query_ah; + shca->ib_device.destroy_ah = ehca_destroy_ah; + shca->ib_device.create_qp = ehca_create_qp; + shca->ib_device.modify_qp = ehca_modify_qp; + shca->ib_device.query_qp = ehca_query_qp; + shca->ib_device.destroy_qp = ehca_destroy_qp; + shca->ib_device.post_send = ehca_post_send; + shca->ib_device.post_recv = ehca_post_recv; + shca->ib_device.create_cq = ehca_create_cq; + shca->ib_device.destroy_cq = ehca_destroy_cq; + shca->ib_device.resize_cq = ehca_resize_cq; + shca->ib_device.poll_cq = ehca_poll_cq; + /* shca->ib_device.peek_cq = ehca_peek_cq; */ + shca->ib_device.req_notify_cq = ehca_req_notify_cq; + /* shca->ib_device.req_ncomp_notif = ehca_req_ncomp_notif; */ + shca->ib_device.get_dma_mr = ehca_get_dma_mr; + shca->ib_device.reg_phys_mr = ehca_reg_phys_mr; + shca->ib_device.reg_user_mr = ehca_reg_user_mr; + shca->ib_device.query_mr = ehca_query_mr; + shca->ib_device.dereg_mr = ehca_dereg_mr; + shca->ib_device.rereg_phys_mr = ehca_rereg_phys_mr; + shca->ib_device.alloc_mw = ehca_alloc_mw; + shca->ib_device.bind_mw = ehca_bind_mw; + shca->ib_device.dealloc_mw = ehca_dealloc_mw; + shca->ib_device.alloc_fmr = ehca_alloc_fmr; + shca->ib_device.map_phys_fmr = ehca_map_phys_fmr; + shca->ib_device.unmap_fmr = ehca_unmap_fmr; + shca->ib_device.dealloc_fmr = ehca_dealloc_fmr; + shca->ib_device.attach_mcast = ehca_attach_mcast; + shca->ib_device.detach_mcast = ehca_detach_mcast; + shca->ib_device.process_mad = ehca_process_mad; + shca->ib_device.mmap = ehca_mmap; + shca->ib_device.dma_ops = &ehca_dma_mapping_ops; + shca->ib_device.get_port_immutable = ehca_port_immutable; + + if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) { + shca->ib_device.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ); + + shca->ib_device.create_srq = ehca_create_srq; + shca->ib_device.modify_srq = ehca_modify_srq; + shca->ib_device.query_srq = ehca_query_srq; + shca->ib_device.destroy_srq = ehca_destroy_srq; + shca->ib_device.post_srq_recv = ehca_post_srq_recv; + } + + return ret; +} + +static int ehca_create_aqp1(struct ehca_shca *shca, u32 port) +{ + struct ehca_sport *sport = &shca->sport[port - 1]; + struct ib_cq *ibcq; + struct ib_qp *ibqp; + struct ib_qp_init_attr qp_init_attr; + struct ib_cq_init_attr cq_attr = {}; + int ret; + + if (sport->ibcq_aqp1) { + ehca_err(&shca->ib_device, "AQP1 CQ is already created."); + return -EPERM; + } + + cq_attr.cqe = 10; + ibcq = ib_create_cq(&shca->ib_device, NULL, NULL, (void *)(-1), + &cq_attr); + if (IS_ERR(ibcq)) { + ehca_err(&shca->ib_device, "Cannot create AQP1 CQ."); + return PTR_ERR(ibcq); + } + sport->ibcq_aqp1 = ibcq; + + if (sport->ibqp_sqp[IB_QPT_GSI]) { + ehca_err(&shca->ib_device, "AQP1 QP is already created."); + ret = -EPERM; + goto create_aqp1; + } + + memset(&qp_init_attr, 0, sizeof(struct ib_qp_init_attr)); + qp_init_attr.send_cq = ibcq; + qp_init_attr.recv_cq = ibcq; + qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.cap.max_send_wr = 100; + qp_init_attr.cap.max_recv_wr = 100; + qp_init_attr.cap.max_send_sge = 2; + qp_init_attr.cap.max_recv_sge = 1; + qp_init_attr.qp_type = IB_QPT_GSI; + qp_init_attr.port_num = port; + qp_init_attr.qp_context = NULL; + qp_init_attr.event_handler = NULL; + qp_init_attr.srq = NULL; + + ibqp = ib_create_qp(&shca->pd->ib_pd, &qp_init_attr); + if (IS_ERR(ibqp)) { + ehca_err(&shca->ib_device, "Cannot create AQP1 QP."); + ret = PTR_ERR(ibqp); + goto create_aqp1; + } + sport->ibqp_sqp[IB_QPT_GSI] = ibqp; + + return 0; + +create_aqp1: + ib_destroy_cq(sport->ibcq_aqp1); + return ret; +} + +static int ehca_destroy_aqp1(struct ehca_sport *sport) +{ + int ret; + + ret = ib_destroy_qp(sport->ibqp_sqp[IB_QPT_GSI]); + if (ret) { + ehca_gen_err("Cannot destroy AQP1 QP. ret=%i", ret); + return ret; + } + + ret = ib_destroy_cq(sport->ibcq_aqp1); + if (ret) + ehca_gen_err("Cannot destroy AQP1 CQ. ret=%i", ret); + + return ret; +} + +static ssize_t ehca_show_debug_level(struct device_driver *ddp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", ehca_debug_level); +} + +static ssize_t ehca_store_debug_level(struct device_driver *ddp, + const char *buf, size_t count) +{ + int value = (*buf) - '0'; + if (value >= 0 && value <= 9) + ehca_debug_level = value; + return 1; +} + +static DRIVER_ATTR(debug_level, S_IRUSR | S_IWUSR, + ehca_show_debug_level, ehca_store_debug_level); + +static struct attribute *ehca_drv_attrs[] = { + &driver_attr_debug_level.attr, + NULL +}; + +static struct attribute_group ehca_drv_attr_grp = { + .attrs = ehca_drv_attrs +}; + +static const struct attribute_group *ehca_drv_attr_groups[] = { + &ehca_drv_attr_grp, + NULL, +}; + +#define EHCA_RESOURCE_ATTR(name) \ +static ssize_t ehca_show_##name(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + struct ehca_shca *shca; \ + struct hipz_query_hca *rblock; \ + int data; \ + \ + shca = dev_get_drvdata(dev); \ + \ + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); \ + if (!rblock) { \ + dev_err(dev, "Can't allocate rblock memory.\n"); \ + return 0; \ + } \ + \ + if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) { \ + dev_err(dev, "Can't query device properties\n"); \ + ehca_free_fw_ctrlblock(rblock); \ + return 0; \ + } \ + \ + data = rblock->name; \ + ehca_free_fw_ctrlblock(rblock); \ + \ + if ((strcmp(#name, "num_ports") == 0) && (ehca_nr_ports == 1)) \ + return snprintf(buf, 256, "1\n"); \ + else \ + return snprintf(buf, 256, "%d\n", data); \ + \ +} \ +static DEVICE_ATTR(name, S_IRUGO, ehca_show_##name, NULL); + +EHCA_RESOURCE_ATTR(num_ports); +EHCA_RESOURCE_ATTR(hw_ver); +EHCA_RESOURCE_ATTR(max_eq); +EHCA_RESOURCE_ATTR(cur_eq); +EHCA_RESOURCE_ATTR(max_cq); +EHCA_RESOURCE_ATTR(cur_cq); +EHCA_RESOURCE_ATTR(max_qp); +EHCA_RESOURCE_ATTR(cur_qp); +EHCA_RESOURCE_ATTR(max_mr); +EHCA_RESOURCE_ATTR(cur_mr); +EHCA_RESOURCE_ATTR(max_mw); +EHCA_RESOURCE_ATTR(cur_mw); +EHCA_RESOURCE_ATTR(max_pd); +EHCA_RESOURCE_ATTR(max_ah); + +static ssize_t ehca_show_adapter_handle(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ehca_shca *shca = dev_get_drvdata(dev); + + return sprintf(buf, "%llx\n", shca->ipz_hca_handle.handle); + +} +static DEVICE_ATTR(adapter_handle, S_IRUGO, ehca_show_adapter_handle, NULL); + +static struct attribute *ehca_dev_attrs[] = { + &dev_attr_adapter_handle.attr, + &dev_attr_num_ports.attr, + &dev_attr_hw_ver.attr, + &dev_attr_max_eq.attr, + &dev_attr_cur_eq.attr, + &dev_attr_max_cq.attr, + &dev_attr_cur_cq.attr, + &dev_attr_max_qp.attr, + &dev_attr_cur_qp.attr, + &dev_attr_max_mr.attr, + &dev_attr_cur_mr.attr, + &dev_attr_max_mw.attr, + &dev_attr_cur_mw.attr, + &dev_attr_max_pd.attr, + &dev_attr_max_ah.attr, + NULL +}; + +static struct attribute_group ehca_dev_attr_grp = { + .attrs = ehca_dev_attrs +}; + +static int ehca_probe(struct platform_device *dev) +{ + struct ehca_shca *shca; + const u64 *handle; + struct ib_pd *ibpd; + int ret, i, eq_size; + unsigned long flags; + + handle = of_get_property(dev->dev.of_node, "ibm,hca-handle", NULL); + if (!handle) { + ehca_gen_err("Cannot get eHCA handle for adapter: %s.", + dev->dev.of_node->full_name); + return -ENODEV; + } + + if (!(*handle)) { + ehca_gen_err("Wrong eHCA handle for adapter: %s.", + dev->dev.of_node->full_name); + return -ENODEV; + } + + shca = (struct ehca_shca *)ib_alloc_device(sizeof(*shca)); + if (!shca) { + ehca_gen_err("Cannot allocate shca memory."); + return -ENOMEM; + } + + mutex_init(&shca->modify_mutex); + atomic_set(&shca->num_cqs, 0); + atomic_set(&shca->num_qps, 0); + shca->max_num_qps = ehca_max_qp; + shca->max_num_cqs = ehca_max_cq; + + for (i = 0; i < ARRAY_SIZE(shca->sport); i++) + spin_lock_init(&shca->sport[i].mod_sqp_lock); + + shca->ofdev = dev; + shca->ipz_hca_handle.handle = *handle; + dev_set_drvdata(&dev->dev, shca); + + ret = ehca_sense_attributes(shca); + if (ret < 0) { + ehca_gen_err("Cannot sense eHCA attributes."); + goto probe1; + } + + ret = ehca_init_device(shca); + if (ret) { + ehca_gen_err("Cannot init ehca device struct"); + goto probe1; + } + + eq_size = 2 * shca->max_num_cqs + 4 * shca->max_num_qps; + /* create event queues */ + ret = ehca_create_eq(shca, &shca->eq, EHCA_EQ, eq_size); + if (ret) { + ehca_err(&shca->ib_device, "Cannot create EQ."); + goto probe1; + } + + ret = ehca_create_eq(shca, &shca->neq, EHCA_NEQ, 513); + if (ret) { + ehca_err(&shca->ib_device, "Cannot create NEQ."); + goto probe3; + } + + /* create internal protection domain */ + ibpd = ehca_alloc_pd(&shca->ib_device, (void *)(-1), NULL); + if (IS_ERR(ibpd)) { + ehca_err(&shca->ib_device, "Cannot create internal PD."); + ret = PTR_ERR(ibpd); + goto probe4; + } + + shca->pd = container_of(ibpd, struct ehca_pd, ib_pd); + shca->pd->ib_pd.device = &shca->ib_device; + + /* create internal max MR */ + ret = ehca_reg_internal_maxmr(shca, shca->pd, &shca->maxmr); + + if (ret) { + ehca_err(&shca->ib_device, "Cannot create internal MR ret=%i", + ret); + goto probe5; + } + + ret = ib_register_device(&shca->ib_device, NULL); + if (ret) { + ehca_err(&shca->ib_device, + "ib_register_device() failed ret=%i", ret); + goto probe6; + } + + /* create AQP1 for port 1 */ + if (ehca_open_aqp1 == 1) { + shca->sport[0].port_state = IB_PORT_DOWN; + ret = ehca_create_aqp1(shca, 1); + if (ret) { + ehca_err(&shca->ib_device, + "Cannot create AQP1 for port 1."); + goto probe7; + } + } + + /* create AQP1 for port 2 */ + if ((ehca_open_aqp1 == 1) && (shca->num_ports == 2)) { + shca->sport[1].port_state = IB_PORT_DOWN; + ret = ehca_create_aqp1(shca, 2); + if (ret) { + ehca_err(&shca->ib_device, + "Cannot create AQP1 for port 2."); + goto probe8; + } + } + + ret = sysfs_create_group(&dev->dev.kobj, &ehca_dev_attr_grp); + if (ret) /* only complain; we can live without attributes */ + ehca_err(&shca->ib_device, + "Cannot create device attributes ret=%d", ret); + + spin_lock_irqsave(&shca_list_lock, flags); + list_add(&shca->shca_list, &shca_list); + spin_unlock_irqrestore(&shca_list_lock, flags); + + return 0; + +probe8: + ret = ehca_destroy_aqp1(&shca->sport[0]); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy AQP1 for port 1. ret=%i", ret); + +probe7: + ib_unregister_device(&shca->ib_device); + +probe6: + ret = ehca_dereg_internal_maxmr(shca); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy internal MR. ret=%x", ret); + +probe5: + ret = ehca_dealloc_pd(&shca->pd->ib_pd); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy internal PD. ret=%x", ret); + +probe4: + ret = ehca_destroy_eq(shca, &shca->neq); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy NEQ. ret=%x", ret); + +probe3: + ret = ehca_destroy_eq(shca, &shca->eq); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy EQ. ret=%x", ret); + +probe1: + ib_dealloc_device(&shca->ib_device); + + return -EINVAL; +} + +static int ehca_remove(struct platform_device *dev) +{ + struct ehca_shca *shca = dev_get_drvdata(&dev->dev); + unsigned long flags; + int ret; + + sysfs_remove_group(&dev->dev.kobj, &ehca_dev_attr_grp); + + if (ehca_open_aqp1 == 1) { + int i; + for (i = 0; i < shca->num_ports; i++) { + ret = ehca_destroy_aqp1(&shca->sport[i]); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy AQP1 for port %x " + "ret=%i", ret, i); + } + } + + ib_unregister_device(&shca->ib_device); + + ret = ehca_dereg_internal_maxmr(shca); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy internal MR. ret=%i", ret); + + ret = ehca_dealloc_pd(&shca->pd->ib_pd); + if (ret) + ehca_err(&shca->ib_device, + "Cannot destroy internal PD. ret=%i", ret); + + ret = ehca_destroy_eq(shca, &shca->eq); + if (ret) + ehca_err(&shca->ib_device, "Cannot destroy EQ. ret=%i", ret); + + ret = ehca_destroy_eq(shca, &shca->neq); + if (ret) + ehca_err(&shca->ib_device, "Canot destroy NEQ. ret=%i", ret); + + ib_dealloc_device(&shca->ib_device); + + spin_lock_irqsave(&shca_list_lock, flags); + list_del(&shca->shca_list); + spin_unlock_irqrestore(&shca_list_lock, flags); + + return ret; +} + +static struct of_device_id ehca_device_table[] = +{ + { + .name = "lhca", + .compatible = "IBM,lhca", + }, + {}, +}; +MODULE_DEVICE_TABLE(of, ehca_device_table); + +static struct platform_driver ehca_driver = { + .probe = ehca_probe, + .remove = ehca_remove, + .driver = { + .name = "ehca", + .owner = THIS_MODULE, + .groups = ehca_drv_attr_groups, + .of_match_table = ehca_device_table, + }, +}; + +void ehca_poll_eqs(unsigned long data) +{ + struct ehca_shca *shca; + + spin_lock(&shca_list_lock); + list_for_each_entry(shca, &shca_list, shca_list) { + if (shca->eq.is_initialized) { + /* call deadman proc only if eq ptr does not change */ + struct ehca_eq *eq = &shca->eq; + int max = 3; + volatile u64 q_ofs, q_ofs2; + unsigned long flags; + spin_lock_irqsave(&eq->spinlock, flags); + q_ofs = eq->ipz_queue.current_q_offset; + spin_unlock_irqrestore(&eq->spinlock, flags); + do { + spin_lock_irqsave(&eq->spinlock, flags); + q_ofs2 = eq->ipz_queue.current_q_offset; + spin_unlock_irqrestore(&eq->spinlock, flags); + max--; + } while (q_ofs == q_ofs2 && max > 0); + if (q_ofs == q_ofs2) + ehca_process_eq(shca, 0); + } + } + mod_timer(&poll_eqs_timer, round_jiffies(jiffies + HZ)); + spin_unlock(&shca_list_lock); +} + +static int ehca_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + static unsigned long ehca_dmem_warn_time; + unsigned long flags; + + switch (action) { + case MEM_CANCEL_OFFLINE: + case MEM_CANCEL_ONLINE: + case MEM_ONLINE: + case MEM_OFFLINE: + return NOTIFY_OK; + case MEM_GOING_ONLINE: + case MEM_GOING_OFFLINE: + /* only ok if no hca is attached to the lpar */ + spin_lock_irqsave(&shca_list_lock, flags); + if (list_empty(&shca_list)) { + spin_unlock_irqrestore(&shca_list_lock, flags); + return NOTIFY_OK; + } else { + spin_unlock_irqrestore(&shca_list_lock, flags); + if (printk_timed_ratelimit(&ehca_dmem_warn_time, + 30 * 1000)) + ehca_gen_err("DMEM operations are not allowed" + "in conjunction with eHCA"); + return NOTIFY_BAD; + } + } + return NOTIFY_OK; +} + +static struct notifier_block ehca_mem_nb = { + .notifier_call = ehca_mem_notifier, +}; + +static int __init ehca_module_init(void) +{ + int ret; + + printk(KERN_INFO "eHCA Infiniband Device Driver " + "(Version " HCAD_VERSION ")\n"); + + ret = ehca_create_comp_pool(); + if (ret) { + ehca_gen_err("Cannot create comp pool."); + return ret; + } + + ret = ehca_create_slab_caches(); + if (ret) { + ehca_gen_err("Cannot create SLAB caches"); + ret = -ENOMEM; + goto module_init1; + } + + ret = ehca_create_busmap(); + if (ret) { + ehca_gen_err("Cannot create busmap."); + goto module_init2; + } + + ret = ibmebus_register_driver(&ehca_driver); + if (ret) { + ehca_gen_err("Cannot register eHCA device driver"); + ret = -EINVAL; + goto module_init3; + } + + ret = register_memory_notifier(&ehca_mem_nb); + if (ret) { + ehca_gen_err("Failed registering memory add/remove notifier"); + goto module_init4; + } + + if (ehca_poll_all_eqs != 1) { + ehca_gen_err("WARNING!!!"); + ehca_gen_err("It is possible to lose interrupts."); + } else { + init_timer(&poll_eqs_timer); + poll_eqs_timer.function = ehca_poll_eqs; + poll_eqs_timer.expires = jiffies + HZ; + add_timer(&poll_eqs_timer); + } + + return 0; + +module_init4: + ibmebus_unregister_driver(&ehca_driver); + +module_init3: + ehca_destroy_busmap(); + +module_init2: + ehca_destroy_slab_caches(); + +module_init1: + ehca_destroy_comp_pool(); + return ret; +}; + +static void __exit ehca_module_exit(void) +{ + if (ehca_poll_all_eqs == 1) + del_timer_sync(&poll_eqs_timer); + + ibmebus_unregister_driver(&ehca_driver); + + unregister_memory_notifier(&ehca_mem_nb); + + ehca_destroy_busmap(); + + ehca_destroy_slab_caches(); + + ehca_destroy_comp_pool(); + + idr_destroy(&ehca_cq_idr); + idr_destroy(&ehca_qp_idr); +}; + +module_init(ehca_module_init); +module_exit(ehca_module_exit); diff --git a/drivers/staging/rdma/ehca/ehca_mcast.c b/drivers/staging/rdma/ehca/ehca_mcast.c new file mode 100644 index 000000000..cec181532 --- /dev/null +++ b/drivers/staging/rdma/ehca/ehca_mcast.c @@ -0,0 +1,131 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * mcast functions + * + * Authors: Khadija Souissi + * Waleri Fomin + * Reinhard Ernst + * Hoang-Nam Nguyen + * Heiko J Schick + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include "ehca_classes.h" +#include "ehca_tools.h" +#include "ehca_qes.h" +#include "ehca_iverbs.h" +#include "hcp_if.h" + +#define MAX_MC_LID 0xFFFE +#define MIN_MC_LID 0xC000 /* Multicast limits */ +#define EHCA_VALID_MULTICAST_GID(gid) ((gid)[0] == 0xFF) +#define EHCA_VALID_MULTICAST_LID(lid) \ + (((lid) >= MIN_MC_LID) && ((lid) <= MAX_MC_LID)) + +int ehca_attach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp); + struct ehca_shca *shca = container_of(ibqp->device, struct ehca_shca, + ib_device); + union ib_gid my_gid; + u64 subnet_prefix, interface_id, h_ret; + + if (ibqp->qp_type != IB_QPT_UD) { + ehca_err(ibqp->device, "invalid qp_type=%x", ibqp->qp_type); + return -EINVAL; + } + + if (!(EHCA_VALID_MULTICAST_GID(gid->raw))) { + ehca_err(ibqp->device, "invalid mulitcast gid"); + return -EINVAL; + } else if ((lid < MIN_MC_LID) || (lid > MAX_MC_LID)) { + ehca_err(ibqp->device, "invalid mulitcast lid=%x", lid); + return -EINVAL; + } + + memcpy(&my_gid, gid->raw, sizeof(union ib_gid)); + + subnet_prefix = be64_to_cpu(my_gid.global.subnet_prefix); + interface_id = be64_to_cpu(my_gid.global.interface_id); + h_ret = hipz_h_attach_mcqp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + my_qp->galpas.kernel, + lid, subnet_prefix, interface_id); + if (h_ret != H_SUCCESS) + ehca_err(ibqp->device, + "ehca_qp=%p qp_num=%x hipz_h_attach_mcqp() failed " + "h_ret=%lli", my_qp, ibqp->qp_num, h_ret); + + return ehca2ib_return_code(h_ret); +} + +int ehca_detach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp); + struct ehca_shca *shca = container_of(ibqp->pd->device, + struct ehca_shca, ib_device); + union ib_gid my_gid; + u64 subnet_prefix, interface_id, h_ret; + + if (ibqp->qp_type != IB_QPT_UD) { + ehca_err(ibqp->device, "invalid qp_type %x", ibqp->qp_type); + return -EINVAL; + } + + if (!(EHCA_VALID_MULTICAST_GID(gid->raw))) { + ehca_err(ibqp->device, "invalid mulitcast gid"); + return -EINVAL; + } else if ((lid < MIN_MC_LID) || (lid > MAX_MC_LID)) { + ehca_err(ibqp->device, "invalid mulitcast lid=%x", lid); + return -EINVAL; + } + + memcpy(&my_gid, gid->raw, sizeof(union ib_gid)); + + subnet_prefix = be64_to_cpu(my_gid.global.subnet_prefix); + interface_id = be64_to_cpu(my_gid.global.interface_id); + h_ret = hipz_h_detach_mcqp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + my_qp->galpas.kernel, + lid, subnet_prefix, interface_id); + if (h_ret != H_SUCCESS) + ehca_err(ibqp->device, + "ehca_qp=%p qp_num=%x hipz_h_detach_mcqp() failed " + "h_ret=%lli", my_qp, ibqp->qp_num, h_ret); + + return ehca2ib_return_code(h_ret); +} diff --git a/drivers/staging/rdma/ehca/ehca_mrmw.c b/drivers/staging/rdma/ehca/ehca_mrmw.c new file mode 100644 index 000000000..f914b3099 --- /dev/null +++ b/drivers/staging/rdma/ehca/ehca_mrmw.c @@ -0,0 +1,2593 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * MR/MW functions + * + * Authors: Dietmar Decker + * Christoph Raisch + * Hoang-Nam Nguyen + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include + +#include "ehca_iverbs.h" +#include "ehca_mrmw.h" +#include "hcp_if.h" +#include "hipz_hw.h" + +#define NUM_CHUNKS(length, chunk_size) \ + (((length) + (chunk_size - 1)) / (chunk_size)) + +/* max number of rpages (per hcall register_rpages) */ +#define MAX_RPAGES 512 + +/* DMEM toleration management */ +#define EHCA_SECTSHIFT SECTION_SIZE_BITS +#define EHCA_SECTSIZE (1UL << EHCA_SECTSHIFT) +#define EHCA_HUGEPAGESHIFT 34 +#define EHCA_HUGEPAGE_SIZE (1UL << EHCA_HUGEPAGESHIFT) +#define EHCA_HUGEPAGE_PFN_MASK ((EHCA_HUGEPAGE_SIZE - 1) >> PAGE_SHIFT) +#define EHCA_INVAL_ADDR 0xFFFFFFFFFFFFFFFFULL +#define EHCA_DIR_INDEX_SHIFT 13 /* 8k Entries in 64k block */ +#define EHCA_TOP_INDEX_SHIFT (EHCA_DIR_INDEX_SHIFT * 2) +#define EHCA_MAP_ENTRIES (1 << EHCA_DIR_INDEX_SHIFT) +#define EHCA_TOP_MAP_SIZE (0x10000) /* currently fixed map size */ +#define EHCA_DIR_MAP_SIZE (0x10000) +#define EHCA_ENT_MAP_SIZE (0x10000) +#define EHCA_INDEX_MASK (EHCA_MAP_ENTRIES - 1) + +static unsigned long ehca_mr_len; + +/* + * Memory map data structures + */ +struct ehca_dir_bmap { + u64 ent[EHCA_MAP_ENTRIES]; +}; +struct ehca_top_bmap { + struct ehca_dir_bmap *dir[EHCA_MAP_ENTRIES]; +}; +struct ehca_bmap { + struct ehca_top_bmap *top[EHCA_MAP_ENTRIES]; +}; + +static struct ehca_bmap *ehca_bmap; + +static struct kmem_cache *mr_cache; +static struct kmem_cache *mw_cache; + +enum ehca_mr_pgsize { + EHCA_MR_PGSIZE4K = 0x1000L, + EHCA_MR_PGSIZE64K = 0x10000L, + EHCA_MR_PGSIZE1M = 0x100000L, + EHCA_MR_PGSIZE16M = 0x1000000L +}; + +#define EHCA_MR_PGSHIFT4K 12 +#define EHCA_MR_PGSHIFT64K 16 +#define EHCA_MR_PGSHIFT1M 20 +#define EHCA_MR_PGSHIFT16M 24 + +static u64 ehca_map_vaddr(void *caddr); + +static u32 ehca_encode_hwpage_size(u32 pgsize) +{ + int log = ilog2(pgsize); + WARN_ON(log < 12 || log > 24 || log & 3); + return (log - 12) / 4; +} + +static u64 ehca_get_max_hwpage_size(struct ehca_shca *shca) +{ + return rounddown_pow_of_two(shca->hca_cap_mr_pgsize); +} + +static struct ehca_mr *ehca_mr_new(void) +{ + struct ehca_mr *me; + + me = kmem_cache_zalloc(mr_cache, GFP_KERNEL); + if (me) + spin_lock_init(&me->mrlock); + else + ehca_gen_err("alloc failed"); + + return me; +} + +static void ehca_mr_delete(struct ehca_mr *me) +{ + kmem_cache_free(mr_cache, me); +} + +static struct ehca_mw *ehca_mw_new(void) +{ + struct ehca_mw *me; + + me = kmem_cache_zalloc(mw_cache, GFP_KERNEL); + if (me) + spin_lock_init(&me->mwlock); + else + ehca_gen_err("alloc failed"); + + return me; +} + +static void ehca_mw_delete(struct ehca_mw *me) +{ + kmem_cache_free(mw_cache, me); +} + +/*----------------------------------------------------------------------*/ + +struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags) +{ + struct ib_mr *ib_mr; + int ret; + struct ehca_mr *e_maxmr; + struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd); + struct ehca_shca *shca = + container_of(pd->device, struct ehca_shca, ib_device); + + if (shca->maxmr) { + e_maxmr = ehca_mr_new(); + if (!e_maxmr) { + ehca_err(&shca->ib_device, "out of memory"); + ib_mr = ERR_PTR(-ENOMEM); + goto get_dma_mr_exit0; + } + + ret = ehca_reg_maxmr(shca, e_maxmr, + (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)), + mr_access_flags, e_pd, + &e_maxmr->ib.ib_mr.lkey, + &e_maxmr->ib.ib_mr.rkey); + if (ret) { + ehca_mr_delete(e_maxmr); + ib_mr = ERR_PTR(ret); + goto get_dma_mr_exit0; + } + ib_mr = &e_maxmr->ib.ib_mr; + } else { + ehca_err(&shca->ib_device, "no internal max-MR exist!"); + ib_mr = ERR_PTR(-EINVAL); + goto get_dma_mr_exit0; + } + +get_dma_mr_exit0: + if (IS_ERR(ib_mr)) + ehca_err(&shca->ib_device, "h_ret=%li pd=%p mr_access_flags=%x", + PTR_ERR(ib_mr), pd, mr_access_flags); + return ib_mr; +} /* end ehca_get_dma_mr() */ + +/*----------------------------------------------------------------------*/ + +struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + int mr_access_flags, + u64 *iova_start) +{ + struct ib_mr *ib_mr; + int ret; + struct ehca_mr *e_mr; + struct ehca_shca *shca = + container_of(pd->device, struct ehca_shca, ib_device); + struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd); + + u64 size; + + if ((num_phys_buf <= 0) || !phys_buf_array) { + ehca_err(pd->device, "bad input values: num_phys_buf=%x " + "phys_buf_array=%p", num_phys_buf, phys_buf_array); + ib_mr = ERR_PTR(-EINVAL); + goto reg_phys_mr_exit0; + } + if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) || + ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) { + /* + * Remote Write Access requires Local Write Access + * Remote Atomic Access requires Local Write Access + */ + ehca_err(pd->device, "bad input values: mr_access_flags=%x", + mr_access_flags); + ib_mr = ERR_PTR(-EINVAL); + goto reg_phys_mr_exit0; + } + + /* check physical buffer list and calculate size */ + ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array, num_phys_buf, + iova_start, &size); + if (ret) { + ib_mr = ERR_PTR(ret); + goto reg_phys_mr_exit0; + } + if ((size == 0) || + (((u64)iova_start + size) < (u64)iova_start)) { + ehca_err(pd->device, "bad input values: size=%llx iova_start=%p", + size, iova_start); + ib_mr = ERR_PTR(-EINVAL); + goto reg_phys_mr_exit0; + } + + e_mr = ehca_mr_new(); + if (!e_mr) { + ehca_err(pd->device, "out of memory"); + ib_mr = ERR_PTR(-ENOMEM); + goto reg_phys_mr_exit0; + } + + /* register MR on HCA */ + if (ehca_mr_is_maxmr(size, iova_start)) { + e_mr->flags |= EHCA_MR_FLAG_MAXMR; + ret = ehca_reg_maxmr(shca, e_mr, iova_start, mr_access_flags, + e_pd, &e_mr->ib.ib_mr.lkey, + &e_mr->ib.ib_mr.rkey); + if (ret) { + ib_mr = ERR_PTR(ret); + goto reg_phys_mr_exit1; + } + } else { + struct ehca_mr_pginfo pginfo; + u32 num_kpages; + u32 num_hwpages; + u64 hw_pgsize; + + num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size, + PAGE_SIZE); + /* for kernel space we try most possible pgsize */ + hw_pgsize = ehca_get_max_hwpage_size(shca); + num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size, + hw_pgsize); + memset(&pginfo, 0, sizeof(pginfo)); + pginfo.type = EHCA_MR_PGI_PHYS; + pginfo.num_kpages = num_kpages; + pginfo.hwpage_size = hw_pgsize; + pginfo.num_hwpages = num_hwpages; + pginfo.u.phy.num_phys_buf = num_phys_buf; + pginfo.u.phy.phys_buf_array = phys_buf_array; + pginfo.next_hwpage = + ((u64)iova_start & ~PAGE_MASK) / hw_pgsize; + + ret = ehca_reg_mr(shca, e_mr, iova_start, size, mr_access_flags, + e_pd, &pginfo, &e_mr->ib.ib_mr.lkey, + &e_mr->ib.ib_mr.rkey, EHCA_REG_MR); + if (ret) { + ib_mr = ERR_PTR(ret); + goto reg_phys_mr_exit1; + } + } + + /* successful registration of all pages */ + return &e_mr->ib.ib_mr; + +reg_phys_mr_exit1: + ehca_mr_delete(e_mr); +reg_phys_mr_exit0: + if (IS_ERR(ib_mr)) + ehca_err(pd->device, "h_ret=%li pd=%p phys_buf_array=%p " + "num_phys_buf=%x mr_access_flags=%x iova_start=%p", + PTR_ERR(ib_mr), pd, phys_buf_array, + num_phys_buf, mr_access_flags, iova_start); + return ib_mr; +} /* end ehca_reg_phys_mr() */ + +/*----------------------------------------------------------------------*/ + +struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt, int mr_access_flags, + struct ib_udata *udata) +{ + struct ib_mr *ib_mr; + struct ehca_mr *e_mr; + struct ehca_shca *shca = + container_of(pd->device, struct ehca_shca, ib_device); + struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd); + struct ehca_mr_pginfo pginfo; + int ret, page_shift; + u32 num_kpages; + u32 num_hwpages; + u64 hwpage_size; + + if (!pd) { + ehca_gen_err("bad pd=%p", pd); + return ERR_PTR(-EFAULT); + } + + if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) || + ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) { + /* + * Remote Write Access requires Local Write Access + * Remote Atomic Access requires Local Write Access + */ + ehca_err(pd->device, "bad input values: mr_access_flags=%x", + mr_access_flags); + ib_mr = ERR_PTR(-EINVAL); + goto reg_user_mr_exit0; + } + + if (length == 0 || virt + length < virt) { + ehca_err(pd->device, "bad input values: length=%llx " + "virt_base=%llx", length, virt); + ib_mr = ERR_PTR(-EINVAL); + goto reg_user_mr_exit0; + } + + e_mr = ehca_mr_new(); + if (!e_mr) { + ehca_err(pd->device, "out of memory"); + ib_mr = ERR_PTR(-ENOMEM); + goto reg_user_mr_exit0; + } + + e_mr->umem = ib_umem_get(pd->uobject->context, start, length, + mr_access_flags, 0); + if (IS_ERR(e_mr->umem)) { + ib_mr = (void *)e_mr->umem; + goto reg_user_mr_exit1; + } + + if (e_mr->umem->page_size != PAGE_SIZE) { + ehca_err(pd->device, "page size not supported, " + "e_mr->umem->page_size=%x", e_mr->umem->page_size); + ib_mr = ERR_PTR(-EINVAL); + goto reg_user_mr_exit2; + } + + /* determine number of MR pages */ + num_kpages = NUM_CHUNKS((virt % PAGE_SIZE) + length, PAGE_SIZE); + /* select proper hw_pgsize */ + page_shift = PAGE_SHIFT; + if (e_mr->umem->hugetlb) { + /* determine page_shift, clamp between 4K and 16M */ + page_shift = (fls64(length - 1) + 3) & ~3; + page_shift = min(max(page_shift, EHCA_MR_PGSHIFT4K), + EHCA_MR_PGSHIFT16M); + } + hwpage_size = 1UL << page_shift; + + /* now that we have the desired page size, shift until it's + * supported, too. 4K is always supported, so this terminates. + */ + while (!(hwpage_size & shca->hca_cap_mr_pgsize)) + hwpage_size >>= 4; + +reg_user_mr_fallback: + num_hwpages = NUM_CHUNKS((virt % hwpage_size) + length, hwpage_size); + /* register MR on HCA */ + memset(&pginfo, 0, sizeof(pginfo)); + pginfo.type = EHCA_MR_PGI_USER; + pginfo.hwpage_size = hwpage_size; + pginfo.num_kpages = num_kpages; + pginfo.num_hwpages = num_hwpages; + pginfo.u.usr.region = e_mr->umem; + pginfo.next_hwpage = ib_umem_offset(e_mr->umem) / hwpage_size; + pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl; + ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags, + e_pd, &pginfo, &e_mr->ib.ib_mr.lkey, + &e_mr->ib.ib_mr.rkey, EHCA_REG_MR); + if (ret == -EINVAL && pginfo.hwpage_size > PAGE_SIZE) { + ehca_warn(pd->device, "failed to register mr " + "with hwpage_size=%llx", hwpage_size); + ehca_info(pd->device, "try to register mr with " + "kpage_size=%lx", PAGE_SIZE); + /* + * this means kpages are not contiguous for a hw page + * try kernel page size as fallback solution + */ + hwpage_size = PAGE_SIZE; + goto reg_user_mr_fallback; + } + if (ret) { + ib_mr = ERR_PTR(ret); + goto reg_user_mr_exit2; + } + + /* successful registration of all pages */ + return &e_mr->ib.ib_mr; + +reg_user_mr_exit2: + ib_umem_release(e_mr->umem); +reg_user_mr_exit1: + ehca_mr_delete(e_mr); +reg_user_mr_exit0: + if (IS_ERR(ib_mr)) + ehca_err(pd->device, "rc=%li pd=%p mr_access_flags=%x udata=%p", + PTR_ERR(ib_mr), pd, mr_access_flags, udata); + return ib_mr; +} /* end ehca_reg_user_mr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_rereg_phys_mr(struct ib_mr *mr, + int mr_rereg_mask, + struct ib_pd *pd, + struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + int mr_access_flags, + u64 *iova_start) +{ + int ret; + + struct ehca_shca *shca = + container_of(mr->device, struct ehca_shca, ib_device); + struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr); + u64 new_size; + u64 *new_start; + u32 new_acl; + struct ehca_pd *new_pd; + u32 tmp_lkey, tmp_rkey; + unsigned long sl_flags; + u32 num_kpages = 0; + u32 num_hwpages = 0; + struct ehca_mr_pginfo pginfo; + + if (!(mr_rereg_mask & IB_MR_REREG_TRANS)) { + /* TODO not supported, because PHYP rereg hCall needs pages */ + ehca_err(mr->device, "rereg without IB_MR_REREG_TRANS not " + "supported yet, mr_rereg_mask=%x", mr_rereg_mask); + ret = -EINVAL; + goto rereg_phys_mr_exit0; + } + + if (mr_rereg_mask & IB_MR_REREG_PD) { + if (!pd) { + ehca_err(mr->device, "rereg with bad pd, pd=%p " + "mr_rereg_mask=%x", pd, mr_rereg_mask); + ret = -EINVAL; + goto rereg_phys_mr_exit0; + } + } + + if ((mr_rereg_mask & + ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) || + (mr_rereg_mask == 0)) { + ret = -EINVAL; + goto rereg_phys_mr_exit0; + } + + /* check other parameters */ + if (e_mr == shca->maxmr) { + /* should be impossible, however reject to be sure */ + ehca_err(mr->device, "rereg internal max-MR impossible, mr=%p " + "shca->maxmr=%p mr->lkey=%x", + mr, shca->maxmr, mr->lkey); + ret = -EINVAL; + goto rereg_phys_mr_exit0; + } + if (mr_rereg_mask & IB_MR_REREG_TRANS) { /* transl., i.e. addr/size */ + if (e_mr->flags & EHCA_MR_FLAG_FMR) { + ehca_err(mr->device, "not supported for FMR, mr=%p " + "flags=%x", mr, e_mr->flags); + ret = -EINVAL; + goto rereg_phys_mr_exit0; + } + if (!phys_buf_array || num_phys_buf <= 0) { + ehca_err(mr->device, "bad input values mr_rereg_mask=%x" + " phys_buf_array=%p num_phys_buf=%x", + mr_rereg_mask, phys_buf_array, num_phys_buf); + ret = -EINVAL; + goto rereg_phys_mr_exit0; + } + } + if ((mr_rereg_mask & IB_MR_REREG_ACCESS) && /* change ACL */ + (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) || + ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)))) { + /* + * Remote Write Access requires Local Write Access + * Remote Atomic Access requires Local Write Access + */ + ehca_err(mr->device, "bad input values: mr_rereg_mask=%x " + "mr_access_flags=%x", mr_rereg_mask, mr_access_flags); + ret = -EINVAL; + goto rereg_phys_mr_exit0; + } + + /* set requested values dependent on rereg request */ + spin_lock_irqsave(&e_mr->mrlock, sl_flags); + new_start = e_mr->start; + new_size = e_mr->size; + new_acl = e_mr->acl; + new_pd = container_of(mr->pd, struct ehca_pd, ib_pd); + + if (mr_rereg_mask & IB_MR_REREG_TRANS) { + u64 hw_pgsize = ehca_get_max_hwpage_size(shca); + + new_start = iova_start; /* change address */ + /* check physical buffer list and calculate size */ + ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array, + num_phys_buf, iova_start, + &new_size); + if (ret) + goto rereg_phys_mr_exit1; + if ((new_size == 0) || + (((u64)iova_start + new_size) < (u64)iova_start)) { + ehca_err(mr->device, "bad input values: new_size=%llx " + "iova_start=%p", new_size, iova_start); + ret = -EINVAL; + goto rereg_phys_mr_exit1; + } + num_kpages = NUM_CHUNKS(((u64)new_start % PAGE_SIZE) + + new_size, PAGE_SIZE); + num_hwpages = NUM_CHUNKS(((u64)new_start % hw_pgsize) + + new_size, hw_pgsize); + memset(&pginfo, 0, sizeof(pginfo)); + pginfo.type = EHCA_MR_PGI_PHYS; + pginfo.num_kpages = num_kpages; + pginfo.hwpage_size = hw_pgsize; + pginfo.num_hwpages = num_hwpages; + pginfo.u.phy.num_phys_buf = num_phys_buf; + pginfo.u.phy.phys_buf_array = phys_buf_array; + pginfo.next_hwpage = + ((u64)iova_start & ~PAGE_MASK) / hw_pgsize; + } + if (mr_rereg_mask & IB_MR_REREG_ACCESS) + new_acl = mr_access_flags; + if (mr_rereg_mask & IB_MR_REREG_PD) + new_pd = container_of(pd, struct ehca_pd, ib_pd); + + ret = ehca_rereg_mr(shca, e_mr, new_start, new_size, new_acl, + new_pd, &pginfo, &tmp_lkey, &tmp_rkey); + if (ret) + goto rereg_phys_mr_exit1; + + /* successful reregistration */ + if (mr_rereg_mask & IB_MR_REREG_PD) + mr->pd = pd; + mr->lkey = tmp_lkey; + mr->rkey = tmp_rkey; + +rereg_phys_mr_exit1: + spin_unlock_irqrestore(&e_mr->mrlock, sl_flags); +rereg_phys_mr_exit0: + if (ret) + ehca_err(mr->device, "ret=%i mr=%p mr_rereg_mask=%x pd=%p " + "phys_buf_array=%p num_phys_buf=%x mr_access_flags=%x " + "iova_start=%p", + ret, mr, mr_rereg_mask, pd, phys_buf_array, + num_phys_buf, mr_access_flags, iova_start); + return ret; +} /* end ehca_rereg_phys_mr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr) +{ + int ret = 0; + u64 h_ret; + struct ehca_shca *shca = + container_of(mr->device, struct ehca_shca, ib_device); + struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr); + unsigned long sl_flags; + struct ehca_mr_hipzout_parms hipzout; + + if ((e_mr->flags & EHCA_MR_FLAG_FMR)) { + ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p " + "e_mr->flags=%x", mr, e_mr, e_mr->flags); + ret = -EINVAL; + goto query_mr_exit0; + } + + memset(mr_attr, 0, sizeof(struct ib_mr_attr)); + spin_lock_irqsave(&e_mr->mrlock, sl_flags); + + h_ret = hipz_h_query_mr(shca->ipz_hca_handle, e_mr, &hipzout); + if (h_ret != H_SUCCESS) { + ehca_err(mr->device, "hipz_mr_query failed, h_ret=%lli mr=%p " + "hca_hndl=%llx mr_hndl=%llx lkey=%x", + h_ret, mr, shca->ipz_hca_handle.handle, + e_mr->ipz_mr_handle.handle, mr->lkey); + ret = ehca2ib_return_code(h_ret); + goto query_mr_exit1; + } + mr_attr->pd = mr->pd; + mr_attr->device_virt_addr = hipzout.vaddr; + mr_attr->size = hipzout.len; + mr_attr->lkey = hipzout.lkey; + mr_attr->rkey = hipzout.rkey; + ehca_mrmw_reverse_map_acl(&hipzout.acl, &mr_attr->mr_access_flags); + +query_mr_exit1: + spin_unlock_irqrestore(&e_mr->mrlock, sl_flags); +query_mr_exit0: + if (ret) + ehca_err(mr->device, "ret=%i mr=%p mr_attr=%p", + ret, mr, mr_attr); + return ret; +} /* end ehca_query_mr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_dereg_mr(struct ib_mr *mr) +{ + int ret = 0; + u64 h_ret; + struct ehca_shca *shca = + container_of(mr->device, struct ehca_shca, ib_device); + struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr); + + if ((e_mr->flags & EHCA_MR_FLAG_FMR)) { + ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p " + "e_mr->flags=%x", mr, e_mr, e_mr->flags); + ret = -EINVAL; + goto dereg_mr_exit0; + } else if (e_mr == shca->maxmr) { + /* should be impossible, however reject to be sure */ + ehca_err(mr->device, "dereg internal max-MR impossible, mr=%p " + "shca->maxmr=%p mr->lkey=%x", + mr, shca->maxmr, mr->lkey); + ret = -EINVAL; + goto dereg_mr_exit0; + } + + /* TODO: BUSY: MR still has bound window(s) */ + h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr); + if (h_ret != H_SUCCESS) { + ehca_err(mr->device, "hipz_free_mr failed, h_ret=%lli shca=%p " + "e_mr=%p hca_hndl=%llx mr_hndl=%llx mr->lkey=%x", + h_ret, shca, e_mr, shca->ipz_hca_handle.handle, + e_mr->ipz_mr_handle.handle, mr->lkey); + ret = ehca2ib_return_code(h_ret); + goto dereg_mr_exit0; + } + + if (e_mr->umem) + ib_umem_release(e_mr->umem); + + /* successful deregistration */ + ehca_mr_delete(e_mr); + +dereg_mr_exit0: + if (ret) + ehca_err(mr->device, "ret=%i mr=%p", ret, mr); + return ret; +} /* end ehca_dereg_mr() */ + +/*----------------------------------------------------------------------*/ + +struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) +{ + struct ib_mw *ib_mw; + u64 h_ret; + struct ehca_mw *e_mw; + struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd); + struct ehca_shca *shca = + container_of(pd->device, struct ehca_shca, ib_device); + struct ehca_mw_hipzout_parms hipzout; + + if (type != IB_MW_TYPE_1) + return ERR_PTR(-EINVAL); + + e_mw = ehca_mw_new(); + if (!e_mw) { + ib_mw = ERR_PTR(-ENOMEM); + goto alloc_mw_exit0; + } + + h_ret = hipz_h_alloc_resource_mw(shca->ipz_hca_handle, e_mw, + e_pd->fw_pd, &hipzout); + if (h_ret != H_SUCCESS) { + ehca_err(pd->device, "hipz_mw_allocate failed, h_ret=%lli " + "shca=%p hca_hndl=%llx mw=%p", + h_ret, shca, shca->ipz_hca_handle.handle, e_mw); + ib_mw = ERR_PTR(ehca2ib_return_code(h_ret)); + goto alloc_mw_exit1; + } + /* successful MW allocation */ + e_mw->ipz_mw_handle = hipzout.handle; + e_mw->ib_mw.rkey = hipzout.rkey; + return &e_mw->ib_mw; + +alloc_mw_exit1: + ehca_mw_delete(e_mw); +alloc_mw_exit0: + if (IS_ERR(ib_mw)) + ehca_err(pd->device, "h_ret=%li pd=%p", PTR_ERR(ib_mw), pd); + return ib_mw; +} /* end ehca_alloc_mw() */ + +/*----------------------------------------------------------------------*/ + +int ehca_bind_mw(struct ib_qp *qp, + struct ib_mw *mw, + struct ib_mw_bind *mw_bind) +{ + /* TODO: not supported up to now */ + ehca_gen_err("bind MW currently not supported by HCAD"); + + return -EPERM; +} /* end ehca_bind_mw() */ + +/*----------------------------------------------------------------------*/ + +int ehca_dealloc_mw(struct ib_mw *mw) +{ + u64 h_ret; + struct ehca_shca *shca = + container_of(mw->device, struct ehca_shca, ib_device); + struct ehca_mw *e_mw = container_of(mw, struct ehca_mw, ib_mw); + + h_ret = hipz_h_free_resource_mw(shca->ipz_hca_handle, e_mw); + if (h_ret != H_SUCCESS) { + ehca_err(mw->device, "hipz_free_mw failed, h_ret=%lli shca=%p " + "mw=%p rkey=%x hca_hndl=%llx mw_hndl=%llx", + h_ret, shca, mw, mw->rkey, shca->ipz_hca_handle.handle, + e_mw->ipz_mw_handle.handle); + return ehca2ib_return_code(h_ret); + } + /* successful deallocation */ + ehca_mw_delete(e_mw); + return 0; +} /* end ehca_dealloc_mw() */ + +/*----------------------------------------------------------------------*/ + +struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd, + int mr_access_flags, + struct ib_fmr_attr *fmr_attr) +{ + struct ib_fmr *ib_fmr; + struct ehca_shca *shca = + container_of(pd->device, struct ehca_shca, ib_device); + struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd); + struct ehca_mr *e_fmr; + int ret; + u32 tmp_lkey, tmp_rkey; + struct ehca_mr_pginfo pginfo; + u64 hw_pgsize; + + /* check other parameters */ + if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) || + ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) && + !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) { + /* + * Remote Write Access requires Local Write Access + * Remote Atomic Access requires Local Write Access + */ + ehca_err(pd->device, "bad input values: mr_access_flags=%x", + mr_access_flags); + ib_fmr = ERR_PTR(-EINVAL); + goto alloc_fmr_exit0; + } + if (mr_access_flags & IB_ACCESS_MW_BIND) { + ehca_err(pd->device, "bad input values: mr_access_flags=%x", + mr_access_flags); + ib_fmr = ERR_PTR(-EINVAL); + goto alloc_fmr_exit0; + } + if ((fmr_attr->max_pages == 0) || (fmr_attr->max_maps == 0)) { + ehca_err(pd->device, "bad input values: fmr_attr->max_pages=%x " + "fmr_attr->max_maps=%x fmr_attr->page_shift=%x", + fmr_attr->max_pages, fmr_attr->max_maps, + fmr_attr->page_shift); + ib_fmr = ERR_PTR(-EINVAL); + goto alloc_fmr_exit0; + } + + hw_pgsize = 1 << fmr_attr->page_shift; + if (!(hw_pgsize & shca->hca_cap_mr_pgsize)) { + ehca_err(pd->device, "unsupported fmr_attr->page_shift=%x", + fmr_attr->page_shift); + ib_fmr = ERR_PTR(-EINVAL); + goto alloc_fmr_exit0; + } + + e_fmr = ehca_mr_new(); + if (!e_fmr) { + ib_fmr = ERR_PTR(-ENOMEM); + goto alloc_fmr_exit0; + } + e_fmr->flags |= EHCA_MR_FLAG_FMR; + + /* register MR on HCA */ + memset(&pginfo, 0, sizeof(pginfo)); + pginfo.hwpage_size = hw_pgsize; + /* + * pginfo.num_hwpages==0, ie register_rpages() will not be called + * but deferred to map_phys_fmr() + */ + ret = ehca_reg_mr(shca, e_fmr, NULL, + fmr_attr->max_pages * (1 << fmr_attr->page_shift), + mr_access_flags, e_pd, &pginfo, + &tmp_lkey, &tmp_rkey, EHCA_REG_MR); + if (ret) { + ib_fmr = ERR_PTR(ret); + goto alloc_fmr_exit1; + } + + /* successful */ + e_fmr->hwpage_size = hw_pgsize; + e_fmr->fmr_page_size = 1 << fmr_attr->page_shift; + e_fmr->fmr_max_pages = fmr_attr->max_pages; + e_fmr->fmr_max_maps = fmr_attr->max_maps; + e_fmr->fmr_map_cnt = 0; + return &e_fmr->ib.ib_fmr; + +alloc_fmr_exit1: + ehca_mr_delete(e_fmr); +alloc_fmr_exit0: + return ib_fmr; +} /* end ehca_alloc_fmr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_map_phys_fmr(struct ib_fmr *fmr, + u64 *page_list, + int list_len, + u64 iova) +{ + int ret; + struct ehca_shca *shca = + container_of(fmr->device, struct ehca_shca, ib_device); + struct ehca_mr *e_fmr = container_of(fmr, struct ehca_mr, ib.ib_fmr); + struct ehca_pd *e_pd = container_of(fmr->pd, struct ehca_pd, ib_pd); + struct ehca_mr_pginfo pginfo; + u32 tmp_lkey, tmp_rkey; + + if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) { + ehca_err(fmr->device, "not a FMR, e_fmr=%p e_fmr->flags=%x", + e_fmr, e_fmr->flags); + ret = -EINVAL; + goto map_phys_fmr_exit0; + } + ret = ehca_fmr_check_page_list(e_fmr, page_list, list_len); + if (ret) + goto map_phys_fmr_exit0; + if (iova % e_fmr->fmr_page_size) { + /* only whole-numbered pages */ + ehca_err(fmr->device, "bad iova, iova=%llx fmr_page_size=%x", + iova, e_fmr->fmr_page_size); + ret = -EINVAL; + goto map_phys_fmr_exit0; + } + if (e_fmr->fmr_map_cnt >= e_fmr->fmr_max_maps) { + /* HCAD does not limit the maps, however trace this anyway */ + ehca_info(fmr->device, "map limit exceeded, fmr=%p " + "e_fmr->fmr_map_cnt=%x e_fmr->fmr_max_maps=%x", + fmr, e_fmr->fmr_map_cnt, e_fmr->fmr_max_maps); + } + + memset(&pginfo, 0, sizeof(pginfo)); + pginfo.type = EHCA_MR_PGI_FMR; + pginfo.num_kpages = list_len; + pginfo.hwpage_size = e_fmr->hwpage_size; + pginfo.num_hwpages = + list_len * e_fmr->fmr_page_size / pginfo.hwpage_size; + pginfo.u.fmr.page_list = page_list; + pginfo.next_hwpage = + (iova & (e_fmr->fmr_page_size-1)) / pginfo.hwpage_size; + pginfo.u.fmr.fmr_pgsize = e_fmr->fmr_page_size; + + ret = ehca_rereg_mr(shca, e_fmr, (u64 *)iova, + list_len * e_fmr->fmr_page_size, + e_fmr->acl, e_pd, &pginfo, &tmp_lkey, &tmp_rkey); + if (ret) + goto map_phys_fmr_exit0; + + /* successful reregistration */ + e_fmr->fmr_map_cnt++; + e_fmr->ib.ib_fmr.lkey = tmp_lkey; + e_fmr->ib.ib_fmr.rkey = tmp_rkey; + return 0; + +map_phys_fmr_exit0: + if (ret) + ehca_err(fmr->device, "ret=%i fmr=%p page_list=%p list_len=%x " + "iova=%llx", ret, fmr, page_list, list_len, iova); + return ret; +} /* end ehca_map_phys_fmr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_unmap_fmr(struct list_head *fmr_list) +{ + int ret = 0; + struct ib_fmr *ib_fmr; + struct ehca_shca *shca = NULL; + struct ehca_shca *prev_shca; + struct ehca_mr *e_fmr; + u32 num_fmr = 0; + u32 unmap_fmr_cnt = 0; + + /* check all FMR belong to same SHCA, and check internal flag */ + list_for_each_entry(ib_fmr, fmr_list, list) { + prev_shca = shca; + shca = container_of(ib_fmr->device, struct ehca_shca, + ib_device); + e_fmr = container_of(ib_fmr, struct ehca_mr, ib.ib_fmr); + if ((shca != prev_shca) && prev_shca) { + ehca_err(&shca->ib_device, "SHCA mismatch, shca=%p " + "prev_shca=%p e_fmr=%p", + shca, prev_shca, e_fmr); + ret = -EINVAL; + goto unmap_fmr_exit0; + } + if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) { + ehca_err(&shca->ib_device, "not a FMR, e_fmr=%p " + "e_fmr->flags=%x", e_fmr, e_fmr->flags); + ret = -EINVAL; + goto unmap_fmr_exit0; + } + num_fmr++; + } + + /* loop over all FMRs to unmap */ + list_for_each_entry(ib_fmr, fmr_list, list) { + unmap_fmr_cnt++; + e_fmr = container_of(ib_fmr, struct ehca_mr, ib.ib_fmr); + shca = container_of(ib_fmr->device, struct ehca_shca, + ib_device); + ret = ehca_unmap_one_fmr(shca, e_fmr); + if (ret) { + /* unmap failed, stop unmapping of rest of FMRs */ + ehca_err(&shca->ib_device, "unmap of one FMR failed, " + "stop rest, e_fmr=%p num_fmr=%x " + "unmap_fmr_cnt=%x lkey=%x", e_fmr, num_fmr, + unmap_fmr_cnt, e_fmr->ib.ib_fmr.lkey); + goto unmap_fmr_exit0; + } + } + +unmap_fmr_exit0: + if (ret) + ehca_gen_err("ret=%i fmr_list=%p num_fmr=%x unmap_fmr_cnt=%x", + ret, fmr_list, num_fmr, unmap_fmr_cnt); + return ret; +} /* end ehca_unmap_fmr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_dealloc_fmr(struct ib_fmr *fmr) +{ + int ret; + u64 h_ret; + struct ehca_shca *shca = + container_of(fmr->device, struct ehca_shca, ib_device); + struct ehca_mr *e_fmr = container_of(fmr, struct ehca_mr, ib.ib_fmr); + + if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) { + ehca_err(fmr->device, "not a FMR, e_fmr=%p e_fmr->flags=%x", + e_fmr, e_fmr->flags); + ret = -EINVAL; + goto free_fmr_exit0; + } + + h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr); + if (h_ret != H_SUCCESS) { + ehca_err(fmr->device, "hipz_free_mr failed, h_ret=%lli e_fmr=%p " + "hca_hndl=%llx fmr_hndl=%llx fmr->lkey=%x", + h_ret, e_fmr, shca->ipz_hca_handle.handle, + e_fmr->ipz_mr_handle.handle, fmr->lkey); + ret = ehca2ib_return_code(h_ret); + goto free_fmr_exit0; + } + /* successful deregistration */ + ehca_mr_delete(e_fmr); + return 0; + +free_fmr_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i fmr=%p", ret, fmr); + return ret; +} /* end ehca_dealloc_fmr() */ + +/*----------------------------------------------------------------------*/ + +static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca, + struct ehca_mr *e_mr, + struct ehca_mr_pginfo *pginfo); + +int ehca_reg_mr(struct ehca_shca *shca, + struct ehca_mr *e_mr, + u64 *iova_start, + u64 size, + int acl, + struct ehca_pd *e_pd, + struct ehca_mr_pginfo *pginfo, + u32 *lkey, /*OUT*/ + u32 *rkey, /*OUT*/ + enum ehca_reg_type reg_type) +{ + int ret; + u64 h_ret; + u32 hipz_acl; + struct ehca_mr_hipzout_parms hipzout; + + ehca_mrmw_map_acl(acl, &hipz_acl); + ehca_mrmw_set_pgsize_hipz_acl(pginfo->hwpage_size, &hipz_acl); + if (ehca_use_hp_mr == 1) + hipz_acl |= 0x00000001; + + h_ret = hipz_h_alloc_resource_mr(shca->ipz_hca_handle, e_mr, + (u64)iova_start, size, hipz_acl, + e_pd->fw_pd, &hipzout); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "hipz_alloc_mr failed, h_ret=%lli " + "hca_hndl=%llx", h_ret, shca->ipz_hca_handle.handle); + ret = ehca2ib_return_code(h_ret); + goto ehca_reg_mr_exit0; + } + + e_mr->ipz_mr_handle = hipzout.handle; + + if (reg_type == EHCA_REG_BUSMAP_MR) + ret = ehca_reg_bmap_mr_rpages(shca, e_mr, pginfo); + else if (reg_type == EHCA_REG_MR) + ret = ehca_reg_mr_rpages(shca, e_mr, pginfo); + else + ret = -EINVAL; + + if (ret) + goto ehca_reg_mr_exit1; + + /* successful registration */ + e_mr->num_kpages = pginfo->num_kpages; + e_mr->num_hwpages = pginfo->num_hwpages; + e_mr->hwpage_size = pginfo->hwpage_size; + e_mr->start = iova_start; + e_mr->size = size; + e_mr->acl = acl; + *lkey = hipzout.lkey; + *rkey = hipzout.rkey; + return 0; + +ehca_reg_mr_exit1: + h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "h_ret=%lli shca=%p e_mr=%p " + "iova_start=%p size=%llx acl=%x e_pd=%p lkey=%x " + "pginfo=%p num_kpages=%llx num_hwpages=%llx ret=%i", + h_ret, shca, e_mr, iova_start, size, acl, e_pd, + hipzout.lkey, pginfo, pginfo->num_kpages, + pginfo->num_hwpages, ret); + ehca_err(&shca->ib_device, "internal error in ehca_reg_mr, " + "not recoverable"); + } +ehca_reg_mr_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p " + "iova_start=%p size=%llx acl=%x e_pd=%p pginfo=%p " + "num_kpages=%llx num_hwpages=%llx", + ret, shca, e_mr, iova_start, size, acl, e_pd, pginfo, + pginfo->num_kpages, pginfo->num_hwpages); + return ret; +} /* end ehca_reg_mr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_reg_mr_rpages(struct ehca_shca *shca, + struct ehca_mr *e_mr, + struct ehca_mr_pginfo *pginfo) +{ + int ret = 0; + u64 h_ret; + u32 rnum; + u64 rpage; + u32 i; + u64 *kpage; + + if (!pginfo->num_hwpages) /* in case of fmr */ + return 0; + + kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!kpage) { + ehca_err(&shca->ib_device, "kpage alloc failed"); + ret = -ENOMEM; + goto ehca_reg_mr_rpages_exit0; + } + + /* max MAX_RPAGES ehca mr pages per register call */ + for (i = 0; i < NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES); i++) { + + if (i == NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES) - 1) { + rnum = pginfo->num_hwpages % MAX_RPAGES; /* last shot */ + if (rnum == 0) + rnum = MAX_RPAGES; /* last shot is full */ + } else + rnum = MAX_RPAGES; + + ret = ehca_set_pagebuf(pginfo, rnum, kpage); + if (ret) { + ehca_err(&shca->ib_device, "ehca_set_pagebuf " + "bad rc, ret=%i rnum=%x kpage=%p", + ret, rnum, kpage); + goto ehca_reg_mr_rpages_exit1; + } + + if (rnum > 1) { + rpage = __pa(kpage); + if (!rpage) { + ehca_err(&shca->ib_device, "kpage=%p i=%x", + kpage, i); + ret = -EFAULT; + goto ehca_reg_mr_rpages_exit1; + } + } else + rpage = *kpage; + + h_ret = hipz_h_register_rpage_mr( + shca->ipz_hca_handle, e_mr, + ehca_encode_hwpage_size(pginfo->hwpage_size), + 0, rpage, rnum); + + if (i == NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES) - 1) { + /* + * check for 'registration complete'==H_SUCCESS + * and for 'page registered'==H_PAGE_REGISTERED + */ + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "last " + "hipz_reg_rpage_mr failed, h_ret=%lli " + "e_mr=%p i=%x hca_hndl=%llx mr_hndl=%llx" + " lkey=%x", h_ret, e_mr, i, + shca->ipz_hca_handle.handle, + e_mr->ipz_mr_handle.handle, + e_mr->ib.ib_mr.lkey); + ret = ehca2ib_return_code(h_ret); + break; + } else + ret = 0; + } else if (h_ret != H_PAGE_REGISTERED) { + ehca_err(&shca->ib_device, "hipz_reg_rpage_mr failed, " + "h_ret=%lli e_mr=%p i=%x lkey=%x hca_hndl=%llx " + "mr_hndl=%llx", h_ret, e_mr, i, + e_mr->ib.ib_mr.lkey, + shca->ipz_hca_handle.handle, + e_mr->ipz_mr_handle.handle); + ret = ehca2ib_return_code(h_ret); + break; + } else + ret = 0; + } /* end for(i) */ + + +ehca_reg_mr_rpages_exit1: + ehca_free_fw_ctrlblock(kpage); +ehca_reg_mr_rpages_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p pginfo=%p " + "num_kpages=%llx num_hwpages=%llx", ret, shca, e_mr, + pginfo, pginfo->num_kpages, pginfo->num_hwpages); + return ret; +} /* end ehca_reg_mr_rpages() */ + +/*----------------------------------------------------------------------*/ + +inline int ehca_rereg_mr_rereg1(struct ehca_shca *shca, + struct ehca_mr *e_mr, + u64 *iova_start, + u64 size, + u32 acl, + struct ehca_pd *e_pd, + struct ehca_mr_pginfo *pginfo, + u32 *lkey, /*OUT*/ + u32 *rkey) /*OUT*/ +{ + int ret; + u64 h_ret; + u32 hipz_acl; + u64 *kpage; + u64 rpage; + struct ehca_mr_pginfo pginfo_save; + struct ehca_mr_hipzout_parms hipzout; + + ehca_mrmw_map_acl(acl, &hipz_acl); + ehca_mrmw_set_pgsize_hipz_acl(pginfo->hwpage_size, &hipz_acl); + + kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!kpage) { + ehca_err(&shca->ib_device, "kpage alloc failed"); + ret = -ENOMEM; + goto ehca_rereg_mr_rereg1_exit0; + } + + pginfo_save = *pginfo; + ret = ehca_set_pagebuf(pginfo, pginfo->num_hwpages, kpage); + if (ret) { + ehca_err(&shca->ib_device, "set pagebuf failed, e_mr=%p " + "pginfo=%p type=%x num_kpages=%llx num_hwpages=%llx " + "kpage=%p", e_mr, pginfo, pginfo->type, + pginfo->num_kpages, pginfo->num_hwpages, kpage); + goto ehca_rereg_mr_rereg1_exit1; + } + rpage = __pa(kpage); + if (!rpage) { + ehca_err(&shca->ib_device, "kpage=%p", kpage); + ret = -EFAULT; + goto ehca_rereg_mr_rereg1_exit1; + } + h_ret = hipz_h_reregister_pmr(shca->ipz_hca_handle, e_mr, + (u64)iova_start, size, hipz_acl, + e_pd->fw_pd, rpage, &hipzout); + if (h_ret != H_SUCCESS) { + /* + * reregistration unsuccessful, try it again with the 3 hCalls, + * e.g. this is required in case H_MR_CONDITION + * (MW bound or MR is shared) + */ + ehca_warn(&shca->ib_device, "hipz_h_reregister_pmr failed " + "(Rereg1), h_ret=%lli e_mr=%p", h_ret, e_mr); + *pginfo = pginfo_save; + ret = -EAGAIN; + } else if ((u64 *)hipzout.vaddr != iova_start) { + ehca_err(&shca->ib_device, "PHYP changed iova_start in " + "rereg_pmr, iova_start=%p iova_start_out=%llx e_mr=%p " + "mr_handle=%llx lkey=%x lkey_out=%x", iova_start, + hipzout.vaddr, e_mr, e_mr->ipz_mr_handle.handle, + e_mr->ib.ib_mr.lkey, hipzout.lkey); + ret = -EFAULT; + } else { + /* + * successful reregistration + * note: start and start_out are identical for eServer HCAs + */ + e_mr->num_kpages = pginfo->num_kpages; + e_mr->num_hwpages = pginfo->num_hwpages; + e_mr->hwpage_size = pginfo->hwpage_size; + e_mr->start = iova_start; + e_mr->size = size; + e_mr->acl = acl; + *lkey = hipzout.lkey; + *rkey = hipzout.rkey; + } + +ehca_rereg_mr_rereg1_exit1: + ehca_free_fw_ctrlblock(kpage); +ehca_rereg_mr_rereg1_exit0: + if ( ret && (ret != -EAGAIN) ) + ehca_err(&shca->ib_device, "ret=%i lkey=%x rkey=%x " + "pginfo=%p num_kpages=%llx num_hwpages=%llx", + ret, *lkey, *rkey, pginfo, pginfo->num_kpages, + pginfo->num_hwpages); + return ret; +} /* end ehca_rereg_mr_rereg1() */ + +/*----------------------------------------------------------------------*/ + +int ehca_rereg_mr(struct ehca_shca *shca, + struct ehca_mr *e_mr, + u64 *iova_start, + u64 size, + int acl, + struct ehca_pd *e_pd, + struct ehca_mr_pginfo *pginfo, + u32 *lkey, + u32 *rkey) +{ + int ret = 0; + u64 h_ret; + int rereg_1_hcall = 1; /* 1: use hipz_h_reregister_pmr directly */ + int rereg_3_hcall = 0; /* 1: use 3 hipz calls for reregistration */ + + /* first determine reregistration hCall(s) */ + if ((pginfo->num_hwpages > MAX_RPAGES) || + (e_mr->num_hwpages > MAX_RPAGES) || + (pginfo->num_hwpages > e_mr->num_hwpages)) { + ehca_dbg(&shca->ib_device, "Rereg3 case, " + "pginfo->num_hwpages=%llx e_mr->num_hwpages=%x", + pginfo->num_hwpages, e_mr->num_hwpages); + rereg_1_hcall = 0; + rereg_3_hcall = 1; + } + + if (e_mr->flags & EHCA_MR_FLAG_MAXMR) { /* check for max-MR */ + rereg_1_hcall = 0; + rereg_3_hcall = 1; + e_mr->flags &= ~EHCA_MR_FLAG_MAXMR; + ehca_err(&shca->ib_device, "Rereg MR for max-MR! e_mr=%p", + e_mr); + } + + if (rereg_1_hcall) { + ret = ehca_rereg_mr_rereg1(shca, e_mr, iova_start, size, + acl, e_pd, pginfo, lkey, rkey); + if (ret) { + if (ret == -EAGAIN) + rereg_3_hcall = 1; + else + goto ehca_rereg_mr_exit0; + } + } + + if (rereg_3_hcall) { + struct ehca_mr save_mr; + + /* first deregister old MR */ + h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "hipz_free_mr failed, " + "h_ret=%lli e_mr=%p hca_hndl=%llx mr_hndl=%llx " + "mr->lkey=%x", + h_ret, e_mr, shca->ipz_hca_handle.handle, + e_mr->ipz_mr_handle.handle, + e_mr->ib.ib_mr.lkey); + ret = ehca2ib_return_code(h_ret); + goto ehca_rereg_mr_exit0; + } + /* clean ehca_mr_t, without changing struct ib_mr and lock */ + save_mr = *e_mr; + ehca_mr_deletenew(e_mr); + + /* set some MR values */ + e_mr->flags = save_mr.flags; + e_mr->hwpage_size = save_mr.hwpage_size; + e_mr->fmr_page_size = save_mr.fmr_page_size; + e_mr->fmr_max_pages = save_mr.fmr_max_pages; + e_mr->fmr_max_maps = save_mr.fmr_max_maps; + e_mr->fmr_map_cnt = save_mr.fmr_map_cnt; + + ret = ehca_reg_mr(shca, e_mr, iova_start, size, acl, + e_pd, pginfo, lkey, rkey, EHCA_REG_MR); + if (ret) { + u32 offset = (u64)(&e_mr->flags) - (u64)e_mr; + memcpy(&e_mr->flags, &(save_mr.flags), + sizeof(struct ehca_mr) - offset); + goto ehca_rereg_mr_exit0; + } + } + +ehca_rereg_mr_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p " + "iova_start=%p size=%llx acl=%x e_pd=%p pginfo=%p " + "num_kpages=%llx lkey=%x rkey=%x rereg_1_hcall=%x " + "rereg_3_hcall=%x", ret, shca, e_mr, iova_start, size, + acl, e_pd, pginfo, pginfo->num_kpages, *lkey, *rkey, + rereg_1_hcall, rereg_3_hcall); + return ret; +} /* end ehca_rereg_mr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_unmap_one_fmr(struct ehca_shca *shca, + struct ehca_mr *e_fmr) +{ + int ret = 0; + u64 h_ret; + struct ehca_pd *e_pd = + container_of(e_fmr->ib.ib_fmr.pd, struct ehca_pd, ib_pd); + struct ehca_mr save_fmr; + u32 tmp_lkey, tmp_rkey; + struct ehca_mr_pginfo pginfo; + struct ehca_mr_hipzout_parms hipzout; + struct ehca_mr save_mr; + + if (e_fmr->fmr_max_pages <= MAX_RPAGES) { + /* + * note: after using rereg hcall with len=0, + * rereg hcall must be used again for registering pages + */ + h_ret = hipz_h_reregister_pmr(shca->ipz_hca_handle, e_fmr, 0, + 0, 0, e_pd->fw_pd, 0, &hipzout); + if (h_ret == H_SUCCESS) { + /* successful reregistration */ + e_fmr->start = NULL; + e_fmr->size = 0; + tmp_lkey = hipzout.lkey; + tmp_rkey = hipzout.rkey; + return 0; + } + /* + * should not happen, because length checked above, + * FMRs are not shared and no MW bound to FMRs + */ + ehca_err(&shca->ib_device, "hipz_reregister_pmr failed " + "(Rereg1), h_ret=%lli e_fmr=%p hca_hndl=%llx " + "mr_hndl=%llx lkey=%x lkey_out=%x", + h_ret, e_fmr, shca->ipz_hca_handle.handle, + e_fmr->ipz_mr_handle.handle, + e_fmr->ib.ib_fmr.lkey, hipzout.lkey); + /* try free and rereg */ + } + + /* first free old FMR */ + h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "hipz_free_mr failed, " + "h_ret=%lli e_fmr=%p hca_hndl=%llx mr_hndl=%llx " + "lkey=%x", + h_ret, e_fmr, shca->ipz_hca_handle.handle, + e_fmr->ipz_mr_handle.handle, + e_fmr->ib.ib_fmr.lkey); + ret = ehca2ib_return_code(h_ret); + goto ehca_unmap_one_fmr_exit0; + } + /* clean ehca_mr_t, without changing lock */ + save_fmr = *e_fmr; + ehca_mr_deletenew(e_fmr); + + /* set some MR values */ + e_fmr->flags = save_fmr.flags; + e_fmr->hwpage_size = save_fmr.hwpage_size; + e_fmr->fmr_page_size = save_fmr.fmr_page_size; + e_fmr->fmr_max_pages = save_fmr.fmr_max_pages; + e_fmr->fmr_max_maps = save_fmr.fmr_max_maps; + e_fmr->fmr_map_cnt = save_fmr.fmr_map_cnt; + e_fmr->acl = save_fmr.acl; + + memset(&pginfo, 0, sizeof(pginfo)); + pginfo.type = EHCA_MR_PGI_FMR; + ret = ehca_reg_mr(shca, e_fmr, NULL, + (e_fmr->fmr_max_pages * e_fmr->fmr_page_size), + e_fmr->acl, e_pd, &pginfo, &tmp_lkey, + &tmp_rkey, EHCA_REG_MR); + if (ret) { + u32 offset = (u64)(&e_fmr->flags) - (u64)e_fmr; + memcpy(&e_fmr->flags, &(save_mr.flags), + sizeof(struct ehca_mr) - offset); + } + +ehca_unmap_one_fmr_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i tmp_lkey=%x tmp_rkey=%x " + "fmr_max_pages=%x", + ret, tmp_lkey, tmp_rkey, e_fmr->fmr_max_pages); + return ret; +} /* end ehca_unmap_one_fmr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_reg_smr(struct ehca_shca *shca, + struct ehca_mr *e_origmr, + struct ehca_mr *e_newmr, + u64 *iova_start, + int acl, + struct ehca_pd *e_pd, + u32 *lkey, /*OUT*/ + u32 *rkey) /*OUT*/ +{ + int ret = 0; + u64 h_ret; + u32 hipz_acl; + struct ehca_mr_hipzout_parms hipzout; + + ehca_mrmw_map_acl(acl, &hipz_acl); + ehca_mrmw_set_pgsize_hipz_acl(e_origmr->hwpage_size, &hipz_acl); + + h_ret = hipz_h_register_smr(shca->ipz_hca_handle, e_newmr, e_origmr, + (u64)iova_start, hipz_acl, e_pd->fw_pd, + &hipzout); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lli " + "shca=%p e_origmr=%p e_newmr=%p iova_start=%p acl=%x " + "e_pd=%p hca_hndl=%llx mr_hndl=%llx lkey=%x", + h_ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd, + shca->ipz_hca_handle.handle, + e_origmr->ipz_mr_handle.handle, + e_origmr->ib.ib_mr.lkey); + ret = ehca2ib_return_code(h_ret); + goto ehca_reg_smr_exit0; + } + /* successful registration */ + e_newmr->num_kpages = e_origmr->num_kpages; + e_newmr->num_hwpages = e_origmr->num_hwpages; + e_newmr->hwpage_size = e_origmr->hwpage_size; + e_newmr->start = iova_start; + e_newmr->size = e_origmr->size; + e_newmr->acl = acl; + e_newmr->ipz_mr_handle = hipzout.handle; + *lkey = hipzout.lkey; + *rkey = hipzout.rkey; + return 0; + +ehca_reg_smr_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i shca=%p e_origmr=%p " + "e_newmr=%p iova_start=%p acl=%x e_pd=%p", + ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd); + return ret; +} /* end ehca_reg_smr() */ + +/*----------------------------------------------------------------------*/ +static inline void *ehca_calc_sectbase(int top, int dir, int idx) +{ + unsigned long ret = idx; + ret |= dir << EHCA_DIR_INDEX_SHIFT; + ret |= top << EHCA_TOP_INDEX_SHIFT; + return __va(ret << SECTION_SIZE_BITS); +} + +#define ehca_bmap_valid(entry) \ + ((u64)entry != (u64)EHCA_INVAL_ADDR) + +static u64 ehca_reg_mr_section(int top, int dir, int idx, u64 *kpage, + struct ehca_shca *shca, struct ehca_mr *mr, + struct ehca_mr_pginfo *pginfo) +{ + u64 h_ret = 0; + unsigned long page = 0; + u64 rpage = __pa(kpage); + int page_count; + + void *sectbase = ehca_calc_sectbase(top, dir, idx); + if ((unsigned long)sectbase & (pginfo->hwpage_size - 1)) { + ehca_err(&shca->ib_device, "reg_mr_section will probably fail:" + "hwpage_size does not fit to " + "section start address"); + } + page_count = EHCA_SECTSIZE / pginfo->hwpage_size; + + while (page < page_count) { + u64 rnum; + for (rnum = 0; (rnum < MAX_RPAGES) && (page < page_count); + rnum++) { + void *pg = sectbase + ((page++) * pginfo->hwpage_size); + kpage[rnum] = __pa(pg); + } + + h_ret = hipz_h_register_rpage_mr(shca->ipz_hca_handle, mr, + ehca_encode_hwpage_size(pginfo->hwpage_size), + 0, rpage, rnum); + + if ((h_ret != H_SUCCESS) && (h_ret != H_PAGE_REGISTERED)) { + ehca_err(&shca->ib_device, "register_rpage_mr failed"); + return h_ret; + } + } + return h_ret; +} + +static u64 ehca_reg_mr_sections(int top, int dir, u64 *kpage, + struct ehca_shca *shca, struct ehca_mr *mr, + struct ehca_mr_pginfo *pginfo) +{ + u64 hret = H_SUCCESS; + int idx; + + for (idx = 0; idx < EHCA_MAP_ENTRIES; idx++) { + if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]->ent[idx])) + continue; + + hret = ehca_reg_mr_section(top, dir, idx, kpage, shca, mr, + pginfo); + if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED)) + return hret; + } + return hret; +} + +static u64 ehca_reg_mr_dir_sections(int top, u64 *kpage, struct ehca_shca *shca, + struct ehca_mr *mr, + struct ehca_mr_pginfo *pginfo) +{ + u64 hret = H_SUCCESS; + int dir; + + for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) { + if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir])) + continue; + + hret = ehca_reg_mr_sections(top, dir, kpage, shca, mr, pginfo); + if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED)) + return hret; + } + return hret; +} + +/* register internal max-MR to internal SHCA */ +int ehca_reg_internal_maxmr( + struct ehca_shca *shca, + struct ehca_pd *e_pd, + struct ehca_mr **e_maxmr) /*OUT*/ +{ + int ret; + struct ehca_mr *e_mr; + u64 *iova_start; + u64 size_maxmr; + struct ehca_mr_pginfo pginfo; + struct ib_phys_buf ib_pbuf; + u32 num_kpages; + u32 num_hwpages; + u64 hw_pgsize; + + if (!ehca_bmap) { + ret = -EFAULT; + goto ehca_reg_internal_maxmr_exit0; + } + + e_mr = ehca_mr_new(); + if (!e_mr) { + ehca_err(&shca->ib_device, "out of memory"); + ret = -ENOMEM; + goto ehca_reg_internal_maxmr_exit0; + } + e_mr->flags |= EHCA_MR_FLAG_MAXMR; + + /* register internal max-MR on HCA */ + size_maxmr = ehca_mr_len; + iova_start = (u64 *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)); + ib_pbuf.addr = 0; + ib_pbuf.size = size_maxmr; + num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size_maxmr, + PAGE_SIZE); + hw_pgsize = ehca_get_max_hwpage_size(shca); + num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size_maxmr, + hw_pgsize); + + memset(&pginfo, 0, sizeof(pginfo)); + pginfo.type = EHCA_MR_PGI_PHYS; + pginfo.num_kpages = num_kpages; + pginfo.num_hwpages = num_hwpages; + pginfo.hwpage_size = hw_pgsize; + pginfo.u.phy.num_phys_buf = 1; + pginfo.u.phy.phys_buf_array = &ib_pbuf; + + ret = ehca_reg_mr(shca, e_mr, iova_start, size_maxmr, 0, e_pd, + &pginfo, &e_mr->ib.ib_mr.lkey, + &e_mr->ib.ib_mr.rkey, EHCA_REG_BUSMAP_MR); + if (ret) { + ehca_err(&shca->ib_device, "reg of internal max MR failed, " + "e_mr=%p iova_start=%p size_maxmr=%llx num_kpages=%x " + "num_hwpages=%x", e_mr, iova_start, size_maxmr, + num_kpages, num_hwpages); + goto ehca_reg_internal_maxmr_exit1; + } + + /* successful registration of all pages */ + e_mr->ib.ib_mr.device = e_pd->ib_pd.device; + e_mr->ib.ib_mr.pd = &e_pd->ib_pd; + e_mr->ib.ib_mr.uobject = NULL; + atomic_inc(&(e_pd->ib_pd.usecnt)); + atomic_set(&(e_mr->ib.ib_mr.usecnt), 0); + *e_maxmr = e_mr; + return 0; + +ehca_reg_internal_maxmr_exit1: + ehca_mr_delete(e_mr); +ehca_reg_internal_maxmr_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i shca=%p e_pd=%p e_maxmr=%p", + ret, shca, e_pd, e_maxmr); + return ret; +} /* end ehca_reg_internal_maxmr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_reg_maxmr(struct ehca_shca *shca, + struct ehca_mr *e_newmr, + u64 *iova_start, + int acl, + struct ehca_pd *e_pd, + u32 *lkey, + u32 *rkey) +{ + u64 h_ret; + struct ehca_mr *e_origmr = shca->maxmr; + u32 hipz_acl; + struct ehca_mr_hipzout_parms hipzout; + + ehca_mrmw_map_acl(acl, &hipz_acl); + ehca_mrmw_set_pgsize_hipz_acl(e_origmr->hwpage_size, &hipz_acl); + + h_ret = hipz_h_register_smr(shca->ipz_hca_handle, e_newmr, e_origmr, + (u64)iova_start, hipz_acl, e_pd->fw_pd, + &hipzout); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lli " + "e_origmr=%p hca_hndl=%llx mr_hndl=%llx lkey=%x", + h_ret, e_origmr, shca->ipz_hca_handle.handle, + e_origmr->ipz_mr_handle.handle, + e_origmr->ib.ib_mr.lkey); + return ehca2ib_return_code(h_ret); + } + /* successful registration */ + e_newmr->num_kpages = e_origmr->num_kpages; + e_newmr->num_hwpages = e_origmr->num_hwpages; + e_newmr->hwpage_size = e_origmr->hwpage_size; + e_newmr->start = iova_start; + e_newmr->size = e_origmr->size; + e_newmr->acl = acl; + e_newmr->ipz_mr_handle = hipzout.handle; + *lkey = hipzout.lkey; + *rkey = hipzout.rkey; + return 0; +} /* end ehca_reg_maxmr() */ + +/*----------------------------------------------------------------------*/ + +int ehca_dereg_internal_maxmr(struct ehca_shca *shca) +{ + int ret; + struct ehca_mr *e_maxmr; + struct ib_pd *ib_pd; + + if (!shca->maxmr) { + ehca_err(&shca->ib_device, "bad call, shca=%p", shca); + ret = -EINVAL; + goto ehca_dereg_internal_maxmr_exit0; + } + + e_maxmr = shca->maxmr; + ib_pd = e_maxmr->ib.ib_mr.pd; + shca->maxmr = NULL; /* remove internal max-MR indication from SHCA */ + + ret = ehca_dereg_mr(&e_maxmr->ib.ib_mr); + if (ret) { + ehca_err(&shca->ib_device, "dereg internal max-MR failed, " + "ret=%i e_maxmr=%p shca=%p lkey=%x", + ret, e_maxmr, shca, e_maxmr->ib.ib_mr.lkey); + shca->maxmr = e_maxmr; + goto ehca_dereg_internal_maxmr_exit0; + } + + atomic_dec(&ib_pd->usecnt); + +ehca_dereg_internal_maxmr_exit0: + if (ret) + ehca_err(&shca->ib_device, "ret=%i shca=%p shca->maxmr=%p", + ret, shca, shca->maxmr); + return ret; +} /* end ehca_dereg_internal_maxmr() */ + +/*----------------------------------------------------------------------*/ + +/* + * check physical buffer array of MR verbs for validness and + * calculates MR size + */ +int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + u64 *iova_start, + u64 *size) +{ + struct ib_phys_buf *pbuf = phys_buf_array; + u64 size_count = 0; + u32 i; + + if (num_phys_buf == 0) { + ehca_gen_err("bad phys buf array len, num_phys_buf=0"); + return -EINVAL; + } + /* check first buffer */ + if (((u64)iova_start & ~PAGE_MASK) != (pbuf->addr & ~PAGE_MASK)) { + ehca_gen_err("iova_start/addr mismatch, iova_start=%p " + "pbuf->addr=%llx pbuf->size=%llx", + iova_start, pbuf->addr, pbuf->size); + return -EINVAL; + } + if (((pbuf->addr + pbuf->size) % PAGE_SIZE) && + (num_phys_buf > 1)) { + ehca_gen_err("addr/size mismatch in 1st buf, pbuf->addr=%llx " + "pbuf->size=%llx", pbuf->addr, pbuf->size); + return -EINVAL; + } + + for (i = 0; i < num_phys_buf; i++) { + if ((i > 0) && (pbuf->addr % PAGE_SIZE)) { + ehca_gen_err("bad address, i=%x pbuf->addr=%llx " + "pbuf->size=%llx", + i, pbuf->addr, pbuf->size); + return -EINVAL; + } + if (((i > 0) && /* not 1st */ + (i < (num_phys_buf - 1)) && /* not last */ + (pbuf->size % PAGE_SIZE)) || (pbuf->size == 0)) { + ehca_gen_err("bad size, i=%x pbuf->size=%llx", + i, pbuf->size); + return -EINVAL; + } + size_count += pbuf->size; + pbuf++; + } + + *size = size_count; + return 0; +} /* end ehca_mr_chk_buf_and_calc_size() */ + +/*----------------------------------------------------------------------*/ + +/* check page list of map FMR verb for validness */ +int ehca_fmr_check_page_list(struct ehca_mr *e_fmr, + u64 *page_list, + int list_len) +{ + u32 i; + u64 *page; + + if ((list_len == 0) || (list_len > e_fmr->fmr_max_pages)) { + ehca_gen_err("bad list_len, list_len=%x " + "e_fmr->fmr_max_pages=%x fmr=%p", + list_len, e_fmr->fmr_max_pages, e_fmr); + return -EINVAL; + } + + /* each page must be aligned */ + page = page_list; + for (i = 0; i < list_len; i++) { + if (*page % e_fmr->fmr_page_size) { + ehca_gen_err("bad page, i=%x *page=%llx page=%p fmr=%p " + "fmr_page_size=%x", i, *page, page, e_fmr, + e_fmr->fmr_page_size); + return -EINVAL; + } + page++; + } + + return 0; +} /* end ehca_fmr_check_page_list() */ + +/*----------------------------------------------------------------------*/ + +/* PAGE_SIZE >= pginfo->hwpage_size */ +static int ehca_set_pagebuf_user1(struct ehca_mr_pginfo *pginfo, + u32 number, + u64 *kpage) +{ + int ret = 0; + u64 pgaddr; + u32 j = 0; + int hwpages_per_kpage = PAGE_SIZE / pginfo->hwpage_size; + struct scatterlist **sg = &pginfo->u.usr.next_sg; + + while (*sg != NULL) { + pgaddr = page_to_pfn(sg_page(*sg)) + << PAGE_SHIFT; + *kpage = pgaddr + (pginfo->next_hwpage * + pginfo->hwpage_size); + if (!(*kpage)) { + ehca_gen_err("pgaddr=%llx " + "sg_dma_address=%llx " + "entry=%llx next_hwpage=%llx", + pgaddr, (u64)sg_dma_address(*sg), + pginfo->u.usr.next_nmap, + pginfo->next_hwpage); + return -EFAULT; + } + (pginfo->hwpage_cnt)++; + (pginfo->next_hwpage)++; + kpage++; + if (pginfo->next_hwpage % hwpages_per_kpage == 0) { + (pginfo->kpage_cnt)++; + (pginfo->u.usr.next_nmap)++; + pginfo->next_hwpage = 0; + *sg = sg_next(*sg); + } + j++; + if (j >= number) + break; + } + + return ret; +} + +/* + * check given pages for contiguous layout + * last page addr is returned in prev_pgaddr for further check + */ +static int ehca_check_kpages_per_ate(struct scatterlist **sg, + int num_pages, + u64 *prev_pgaddr) +{ + for (; *sg && num_pages > 0; *sg = sg_next(*sg), num_pages--) { + u64 pgaddr = page_to_pfn(sg_page(*sg)) << PAGE_SHIFT; + if (ehca_debug_level >= 3) + ehca_gen_dbg("chunk_page=%llx value=%016llx", pgaddr, + *(u64 *)__va(pgaddr)); + if (pgaddr - PAGE_SIZE != *prev_pgaddr) { + ehca_gen_err("uncontiguous page found pgaddr=%llx " + "prev_pgaddr=%llx entries_left_in_hwpage=%x", + pgaddr, *prev_pgaddr, num_pages); + return -EINVAL; + } + *prev_pgaddr = pgaddr; + } + return 0; +} + +/* PAGE_SIZE < pginfo->hwpage_size */ +static int ehca_set_pagebuf_user2(struct ehca_mr_pginfo *pginfo, + u32 number, + u64 *kpage) +{ + int ret = 0; + u64 pgaddr, prev_pgaddr; + u32 j = 0; + int kpages_per_hwpage = pginfo->hwpage_size / PAGE_SIZE; + int nr_kpages = kpages_per_hwpage; + struct scatterlist **sg = &pginfo->u.usr.next_sg; + + while (*sg != NULL) { + + if (nr_kpages == kpages_per_hwpage) { + pgaddr = (page_to_pfn(sg_page(*sg)) + << PAGE_SHIFT); + *kpage = pgaddr; + if (!(*kpage)) { + ehca_gen_err("pgaddr=%llx entry=%llx", + pgaddr, pginfo->u.usr.next_nmap); + ret = -EFAULT; + return ret; + } + /* + * The first page in a hwpage must be aligned; + * the first MR page is exempt from this rule. + */ + if (pgaddr & (pginfo->hwpage_size - 1)) { + if (pginfo->hwpage_cnt) { + ehca_gen_err( + "invalid alignment " + "pgaddr=%llx entry=%llx " + "mr_pgsize=%llx", + pgaddr, pginfo->u.usr.next_nmap, + pginfo->hwpage_size); + ret = -EFAULT; + return ret; + } + /* first MR page */ + pginfo->kpage_cnt = + (pgaddr & + (pginfo->hwpage_size - 1)) >> + PAGE_SHIFT; + nr_kpages -= pginfo->kpage_cnt; + *kpage = pgaddr & + ~(pginfo->hwpage_size - 1); + } + if (ehca_debug_level >= 3) { + u64 val = *(u64 *)__va(pgaddr); + ehca_gen_dbg("kpage=%llx page=%llx " + "value=%016llx", + *kpage, pgaddr, val); + } + prev_pgaddr = pgaddr; + *sg = sg_next(*sg); + pginfo->kpage_cnt++; + pginfo->u.usr.next_nmap++; + nr_kpages--; + if (!nr_kpages) + goto next_kpage; + continue; + } + + ret = ehca_check_kpages_per_ate(sg, nr_kpages, + &prev_pgaddr); + if (ret) + return ret; + pginfo->kpage_cnt += nr_kpages; + pginfo->u.usr.next_nmap += nr_kpages; + +next_kpage: + nr_kpages = kpages_per_hwpage; + (pginfo->hwpage_cnt)++; + kpage++; + j++; + if (j >= number) + break; + } + + return ret; +} + +static int ehca_set_pagebuf_phys(struct ehca_mr_pginfo *pginfo, + u32 number, u64 *kpage) +{ + int ret = 0; + struct ib_phys_buf *pbuf; + u64 num_hw, offs_hw; + u32 i = 0; + + /* loop over desired phys_buf_array entries */ + while (i < number) { + pbuf = pginfo->u.phy.phys_buf_array + pginfo->u.phy.next_buf; + num_hw = NUM_CHUNKS((pbuf->addr % pginfo->hwpage_size) + + pbuf->size, pginfo->hwpage_size); + offs_hw = (pbuf->addr & ~(pginfo->hwpage_size - 1)) / + pginfo->hwpage_size; + while (pginfo->next_hwpage < offs_hw + num_hw) { + /* sanity check */ + if ((pginfo->kpage_cnt >= pginfo->num_kpages) || + (pginfo->hwpage_cnt >= pginfo->num_hwpages)) { + ehca_gen_err("kpage_cnt >= num_kpages, " + "kpage_cnt=%llx num_kpages=%llx " + "hwpage_cnt=%llx " + "num_hwpages=%llx i=%x", + pginfo->kpage_cnt, + pginfo->num_kpages, + pginfo->hwpage_cnt, + pginfo->num_hwpages, i); + return -EFAULT; + } + *kpage = (pbuf->addr & ~(pginfo->hwpage_size - 1)) + + (pginfo->next_hwpage * pginfo->hwpage_size); + if ( !(*kpage) && pbuf->addr ) { + ehca_gen_err("pbuf->addr=%llx pbuf->size=%llx " + "next_hwpage=%llx", pbuf->addr, + pbuf->size, pginfo->next_hwpage); + return -EFAULT; + } + (pginfo->hwpage_cnt)++; + (pginfo->next_hwpage)++; + if (PAGE_SIZE >= pginfo->hwpage_size) { + if (pginfo->next_hwpage % + (PAGE_SIZE / pginfo->hwpage_size) == 0) + (pginfo->kpage_cnt)++; + } else + pginfo->kpage_cnt += pginfo->hwpage_size / + PAGE_SIZE; + kpage++; + i++; + if (i >= number) break; + } + if (pginfo->next_hwpage >= offs_hw + num_hw) { + (pginfo->u.phy.next_buf)++; + pginfo->next_hwpage = 0; + } + } + return ret; +} + +static int ehca_set_pagebuf_fmr(struct ehca_mr_pginfo *pginfo, + u32 number, u64 *kpage) +{ + int ret = 0; + u64 *fmrlist; + u32 i; + + /* loop over desired page_list entries */ + fmrlist = pginfo->u.fmr.page_list + pginfo->u.fmr.next_listelem; + for (i = 0; i < number; i++) { + *kpage = (*fmrlist & ~(pginfo->hwpage_size - 1)) + + pginfo->next_hwpage * pginfo->hwpage_size; + if ( !(*kpage) ) { + ehca_gen_err("*fmrlist=%llx fmrlist=%p " + "next_listelem=%llx next_hwpage=%llx", + *fmrlist, fmrlist, + pginfo->u.fmr.next_listelem, + pginfo->next_hwpage); + return -EFAULT; + } + (pginfo->hwpage_cnt)++; + if (pginfo->u.fmr.fmr_pgsize >= pginfo->hwpage_size) { + if (pginfo->next_hwpage % + (pginfo->u.fmr.fmr_pgsize / + pginfo->hwpage_size) == 0) { + (pginfo->kpage_cnt)++; + (pginfo->u.fmr.next_listelem)++; + fmrlist++; + pginfo->next_hwpage = 0; + } else + (pginfo->next_hwpage)++; + } else { + unsigned int cnt_per_hwpage = pginfo->hwpage_size / + pginfo->u.fmr.fmr_pgsize; + unsigned int j; + u64 prev = *kpage; + /* check if adrs are contiguous */ + for (j = 1; j < cnt_per_hwpage; j++) { + u64 p = fmrlist[j] & ~(pginfo->hwpage_size - 1); + if (prev + pginfo->u.fmr.fmr_pgsize != p) { + ehca_gen_err("uncontiguous fmr pages " + "found prev=%llx p=%llx " + "idx=%x", prev, p, i + j); + return -EINVAL; + } + prev = p; + } + pginfo->kpage_cnt += cnt_per_hwpage; + pginfo->u.fmr.next_listelem += cnt_per_hwpage; + fmrlist += cnt_per_hwpage; + } + kpage++; + } + return ret; +} + +/* setup page buffer from page info */ +int ehca_set_pagebuf(struct ehca_mr_pginfo *pginfo, + u32 number, + u64 *kpage) +{ + int ret; + + switch (pginfo->type) { + case EHCA_MR_PGI_PHYS: + ret = ehca_set_pagebuf_phys(pginfo, number, kpage); + break; + case EHCA_MR_PGI_USER: + ret = PAGE_SIZE >= pginfo->hwpage_size ? + ehca_set_pagebuf_user1(pginfo, number, kpage) : + ehca_set_pagebuf_user2(pginfo, number, kpage); + break; + case EHCA_MR_PGI_FMR: + ret = ehca_set_pagebuf_fmr(pginfo, number, kpage); + break; + default: + ehca_gen_err("bad pginfo->type=%x", pginfo->type); + ret = -EFAULT; + break; + } + return ret; +} /* end ehca_set_pagebuf() */ + +/*----------------------------------------------------------------------*/ + +/* + * check MR if it is a max-MR, i.e. uses whole memory + * in case it's a max-MR 1 is returned, else 0 + */ +int ehca_mr_is_maxmr(u64 size, + u64 *iova_start) +{ + /* a MR is treated as max-MR only if it fits following: */ + if ((size == ehca_mr_len) && + (iova_start == (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)))) { + ehca_gen_dbg("this is a max-MR"); + return 1; + } else + return 0; +} /* end ehca_mr_is_maxmr() */ + +/*----------------------------------------------------------------------*/ + +/* map access control for MR/MW. This routine is used for MR and MW. */ +void ehca_mrmw_map_acl(int ib_acl, + u32 *hipz_acl) +{ + *hipz_acl = 0; + if (ib_acl & IB_ACCESS_REMOTE_READ) + *hipz_acl |= HIPZ_ACCESSCTRL_R_READ; + if (ib_acl & IB_ACCESS_REMOTE_WRITE) + *hipz_acl |= HIPZ_ACCESSCTRL_R_WRITE; + if (ib_acl & IB_ACCESS_REMOTE_ATOMIC) + *hipz_acl |= HIPZ_ACCESSCTRL_R_ATOMIC; + if (ib_acl & IB_ACCESS_LOCAL_WRITE) + *hipz_acl |= HIPZ_ACCESSCTRL_L_WRITE; + if (ib_acl & IB_ACCESS_MW_BIND) + *hipz_acl |= HIPZ_ACCESSCTRL_MW_BIND; +} /* end ehca_mrmw_map_acl() */ + +/*----------------------------------------------------------------------*/ + +/* sets page size in hipz access control for MR/MW. */ +void ehca_mrmw_set_pgsize_hipz_acl(u32 pgsize, u32 *hipz_acl) /*INOUT*/ +{ + *hipz_acl |= (ehca_encode_hwpage_size(pgsize) << 24); +} /* end ehca_mrmw_set_pgsize_hipz_acl() */ + +/*----------------------------------------------------------------------*/ + +/* + * reverse map access control for MR/MW. + * This routine is used for MR and MW. + */ +void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl, + int *ib_acl) /*OUT*/ +{ + *ib_acl = 0; + if (*hipz_acl & HIPZ_ACCESSCTRL_R_READ) + *ib_acl |= IB_ACCESS_REMOTE_READ; + if (*hipz_acl & HIPZ_ACCESSCTRL_R_WRITE) + *ib_acl |= IB_ACCESS_REMOTE_WRITE; + if (*hipz_acl & HIPZ_ACCESSCTRL_R_ATOMIC) + *ib_acl |= IB_ACCESS_REMOTE_ATOMIC; + if (*hipz_acl & HIPZ_ACCESSCTRL_L_WRITE) + *ib_acl |= IB_ACCESS_LOCAL_WRITE; + if (*hipz_acl & HIPZ_ACCESSCTRL_MW_BIND) + *ib_acl |= IB_ACCESS_MW_BIND; +} /* end ehca_mrmw_reverse_map_acl() */ + + +/*----------------------------------------------------------------------*/ + +/* + * MR destructor and constructor + * used in Reregister MR verb, sets all fields in ehca_mr_t to 0, + * except struct ib_mr and spinlock + */ +void ehca_mr_deletenew(struct ehca_mr *mr) +{ + mr->flags = 0; + mr->num_kpages = 0; + mr->num_hwpages = 0; + mr->acl = 0; + mr->start = NULL; + mr->fmr_page_size = 0; + mr->fmr_max_pages = 0; + mr->fmr_max_maps = 0; + mr->fmr_map_cnt = 0; + memset(&mr->ipz_mr_handle, 0, sizeof(mr->ipz_mr_handle)); + memset(&mr->galpas, 0, sizeof(mr->galpas)); +} /* end ehca_mr_deletenew() */ + +int ehca_init_mrmw_cache(void) +{ + mr_cache = kmem_cache_create("ehca_cache_mr", + sizeof(struct ehca_mr), 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!mr_cache) + return -ENOMEM; + mw_cache = kmem_cache_create("ehca_cache_mw", + sizeof(struct ehca_mw), 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!mw_cache) { + kmem_cache_destroy(mr_cache); + mr_cache = NULL; + return -ENOMEM; + } + return 0; +} + +void ehca_cleanup_mrmw_cache(void) +{ + if (mr_cache) + kmem_cache_destroy(mr_cache); + if (mw_cache) + kmem_cache_destroy(mw_cache); +} + +static inline int ehca_init_top_bmap(struct ehca_top_bmap *ehca_top_bmap, + int dir) +{ + if (!ehca_bmap_valid(ehca_top_bmap->dir[dir])) { + ehca_top_bmap->dir[dir] = + kmalloc(sizeof(struct ehca_dir_bmap), GFP_KERNEL); + if (!ehca_top_bmap->dir[dir]) + return -ENOMEM; + /* Set map block to 0xFF according to EHCA_INVAL_ADDR */ + memset(ehca_top_bmap->dir[dir], 0xFF, EHCA_ENT_MAP_SIZE); + } + return 0; +} + +static inline int ehca_init_bmap(struct ehca_bmap *ehca_bmap, int top, int dir) +{ + if (!ehca_bmap_valid(ehca_bmap->top[top])) { + ehca_bmap->top[top] = + kmalloc(sizeof(struct ehca_top_bmap), GFP_KERNEL); + if (!ehca_bmap->top[top]) + return -ENOMEM; + /* Set map block to 0xFF according to EHCA_INVAL_ADDR */ + memset(ehca_bmap->top[top], 0xFF, EHCA_DIR_MAP_SIZE); + } + return ehca_init_top_bmap(ehca_bmap->top[top], dir); +} + +static inline int ehca_calc_index(unsigned long i, unsigned long s) +{ + return (i >> s) & EHCA_INDEX_MASK; +} + +void ehca_destroy_busmap(void) +{ + int top, dir; + + if (!ehca_bmap) + return; + + for (top = 0; top < EHCA_MAP_ENTRIES; top++) { + if (!ehca_bmap_valid(ehca_bmap->top[top])) + continue; + for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) { + if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir])) + continue; + + kfree(ehca_bmap->top[top]->dir[dir]); + } + + kfree(ehca_bmap->top[top]); + } + + kfree(ehca_bmap); + ehca_bmap = NULL; +} + +static int ehca_update_busmap(unsigned long pfn, unsigned long nr_pages) +{ + unsigned long i, start_section, end_section; + int top, dir, idx; + + if (!nr_pages) + return 0; + + if (!ehca_bmap) { + ehca_bmap = kmalloc(sizeof(struct ehca_bmap), GFP_KERNEL); + if (!ehca_bmap) + return -ENOMEM; + /* Set map block to 0xFF according to EHCA_INVAL_ADDR */ + memset(ehca_bmap, 0xFF, EHCA_TOP_MAP_SIZE); + } + + start_section = (pfn * PAGE_SIZE) / EHCA_SECTSIZE; + end_section = ((pfn + nr_pages) * PAGE_SIZE) / EHCA_SECTSIZE; + for (i = start_section; i < end_section; i++) { + int ret; + top = ehca_calc_index(i, EHCA_TOP_INDEX_SHIFT); + dir = ehca_calc_index(i, EHCA_DIR_INDEX_SHIFT); + idx = i & EHCA_INDEX_MASK; + + ret = ehca_init_bmap(ehca_bmap, top, dir); + if (ret) { + ehca_destroy_busmap(); + return ret; + } + ehca_bmap->top[top]->dir[dir]->ent[idx] = ehca_mr_len; + ehca_mr_len += EHCA_SECTSIZE; + } + return 0; +} + +static int ehca_is_hugepage(unsigned long pfn) +{ + int page_order; + + if (pfn & EHCA_HUGEPAGE_PFN_MASK) + return 0; + + page_order = compound_order(pfn_to_page(pfn)); + if (page_order + PAGE_SHIFT != EHCA_HUGEPAGESHIFT) + return 0; + + return 1; +} + +static int ehca_create_busmap_callback(unsigned long initial_pfn, + unsigned long total_nr_pages, void *arg) +{ + int ret; + unsigned long pfn, start_pfn, end_pfn, nr_pages; + + if ((total_nr_pages * PAGE_SIZE) < EHCA_HUGEPAGE_SIZE) + return ehca_update_busmap(initial_pfn, total_nr_pages); + + /* Given chunk is >= 16GB -> check for hugepages */ + start_pfn = initial_pfn; + end_pfn = initial_pfn + total_nr_pages; + pfn = start_pfn; + + while (pfn < end_pfn) { + if (ehca_is_hugepage(pfn)) { + /* Add mem found in front of the hugepage */ + nr_pages = pfn - start_pfn; + ret = ehca_update_busmap(start_pfn, nr_pages); + if (ret) + return ret; + /* Skip the hugepage */ + pfn += (EHCA_HUGEPAGE_SIZE / PAGE_SIZE); + start_pfn = pfn; + } else + pfn += (EHCA_SECTSIZE / PAGE_SIZE); + } + + /* Add mem found behind the hugepage(s) */ + nr_pages = pfn - start_pfn; + return ehca_update_busmap(start_pfn, nr_pages); +} + +int ehca_create_busmap(void) +{ + int ret; + + ehca_mr_len = 0; + ret = walk_system_ram_range(0, 1ULL << MAX_PHYSMEM_BITS, NULL, + ehca_create_busmap_callback); + return ret; +} + +static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca, + struct ehca_mr *e_mr, + struct ehca_mr_pginfo *pginfo) +{ + int top; + u64 hret, *kpage; + + kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!kpage) { + ehca_err(&shca->ib_device, "kpage alloc failed"); + return -ENOMEM; + } + for (top = 0; top < EHCA_MAP_ENTRIES; top++) { + if (!ehca_bmap_valid(ehca_bmap->top[top])) + continue; + hret = ehca_reg_mr_dir_sections(top, kpage, shca, e_mr, pginfo); + if ((hret != H_PAGE_REGISTERED) && (hret != H_SUCCESS)) + break; + } + + ehca_free_fw_ctrlblock(kpage); + + if (hret == H_SUCCESS) + return 0; /* Everything is fine */ + else { + ehca_err(&shca->ib_device, "ehca_reg_bmap_mr_rpages failed, " + "h_ret=%lli e_mr=%p top=%x lkey=%x " + "hca_hndl=%llx mr_hndl=%llx", hret, e_mr, top, + e_mr->ib.ib_mr.lkey, + shca->ipz_hca_handle.handle, + e_mr->ipz_mr_handle.handle); + return ehca2ib_return_code(hret); + } +} + +static u64 ehca_map_vaddr(void *caddr) +{ + int top, dir, idx; + unsigned long abs_addr, offset; + u64 entry; + + if (!ehca_bmap) + return EHCA_INVAL_ADDR; + + abs_addr = __pa(caddr); + top = ehca_calc_index(abs_addr, EHCA_TOP_INDEX_SHIFT + EHCA_SECTSHIFT); + if (!ehca_bmap_valid(ehca_bmap->top[top])) + return EHCA_INVAL_ADDR; + + dir = ehca_calc_index(abs_addr, EHCA_DIR_INDEX_SHIFT + EHCA_SECTSHIFT); + if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir])) + return EHCA_INVAL_ADDR; + + idx = ehca_calc_index(abs_addr, EHCA_SECTSHIFT); + + entry = ehca_bmap->top[top]->dir[dir]->ent[idx]; + if (ehca_bmap_valid(entry)) { + offset = (unsigned long)caddr & (EHCA_SECTSIZE - 1); + return entry | offset; + } else + return EHCA_INVAL_ADDR; +} + +static int ehca_dma_mapping_error(struct ib_device *dev, u64 dma_addr) +{ + return dma_addr == EHCA_INVAL_ADDR; +} + +static u64 ehca_dma_map_single(struct ib_device *dev, void *cpu_addr, + size_t size, enum dma_data_direction direction) +{ + if (cpu_addr) + return ehca_map_vaddr(cpu_addr); + else + return EHCA_INVAL_ADDR; +} + +static void ehca_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size, + enum dma_data_direction direction) +{ + /* This is only a stub; nothing to be done here */ +} + +static u64 ehca_dma_map_page(struct ib_device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction) +{ + u64 addr; + + if (offset + size > PAGE_SIZE) + return EHCA_INVAL_ADDR; + + addr = ehca_map_vaddr(page_address(page)); + if (!ehca_dma_mapping_error(dev, addr)) + addr += offset; + + return addr; +} + +static void ehca_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size, + enum dma_data_direction direction) +{ + /* This is only a stub; nothing to be done here */ +} + +static int ehca_dma_map_sg(struct ib_device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction direction) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, nents, i) { + u64 addr; + addr = ehca_map_vaddr(sg_virt(sg)); + if (ehca_dma_mapping_error(dev, addr)) + return 0; + + sg->dma_address = addr; + sg->dma_length = sg->length; + } + return nents; +} + +static void ehca_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction direction) +{ + /* This is only a stub; nothing to be done here */ +} + +static void ehca_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr, + size_t size, + enum dma_data_direction dir) +{ + dma_sync_single_for_cpu(dev->dma_device, addr, size, dir); +} + +static void ehca_dma_sync_single_for_device(struct ib_device *dev, u64 addr, + size_t size, + enum dma_data_direction dir) +{ + dma_sync_single_for_device(dev->dma_device, addr, size, dir); +} + +static void *ehca_dma_alloc_coherent(struct ib_device *dev, size_t size, + u64 *dma_handle, gfp_t flag) +{ + struct page *p; + void *addr = NULL; + u64 dma_addr; + + p = alloc_pages(flag, get_order(size)); + if (p) { + addr = page_address(p); + dma_addr = ehca_map_vaddr(addr); + if (ehca_dma_mapping_error(dev, dma_addr)) { + free_pages((unsigned long)addr, get_order(size)); + return NULL; + } + if (dma_handle) + *dma_handle = dma_addr; + return addr; + } + return NULL; +} + +static void ehca_dma_free_coherent(struct ib_device *dev, size_t size, + void *cpu_addr, u64 dma_handle) +{ + if (cpu_addr && size) + free_pages((unsigned long)cpu_addr, get_order(size)); +} + + +struct ib_dma_mapping_ops ehca_dma_mapping_ops = { + .mapping_error = ehca_dma_mapping_error, + .map_single = ehca_dma_map_single, + .unmap_single = ehca_dma_unmap_single, + .map_page = ehca_dma_map_page, + .unmap_page = ehca_dma_unmap_page, + .map_sg = ehca_dma_map_sg, + .unmap_sg = ehca_dma_unmap_sg, + .sync_single_for_cpu = ehca_dma_sync_single_for_cpu, + .sync_single_for_device = ehca_dma_sync_single_for_device, + .alloc_coherent = ehca_dma_alloc_coherent, + .free_coherent = ehca_dma_free_coherent, +}; diff --git a/drivers/staging/rdma/ehca/ehca_mrmw.h b/drivers/staging/rdma/ehca/ehca_mrmw.h new file mode 100644 index 000000000..50d8b5130 --- /dev/null +++ b/drivers/staging/rdma/ehca/ehca_mrmw.h @@ -0,0 +1,132 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * MR/MW declarations and inline functions + * + * Authors: Dietmar Decker + * Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _EHCA_MRMW_H_ +#define _EHCA_MRMW_H_ + +enum ehca_reg_type { + EHCA_REG_MR, + EHCA_REG_BUSMAP_MR +}; + +int ehca_reg_mr(struct ehca_shca *shca, + struct ehca_mr *e_mr, + u64 *iova_start, + u64 size, + int acl, + struct ehca_pd *e_pd, + struct ehca_mr_pginfo *pginfo, + u32 *lkey, + u32 *rkey, + enum ehca_reg_type reg_type); + +int ehca_reg_mr_rpages(struct ehca_shca *shca, + struct ehca_mr *e_mr, + struct ehca_mr_pginfo *pginfo); + +int ehca_rereg_mr(struct ehca_shca *shca, + struct ehca_mr *e_mr, + u64 *iova_start, + u64 size, + int mr_access_flags, + struct ehca_pd *e_pd, + struct ehca_mr_pginfo *pginfo, + u32 *lkey, + u32 *rkey); + +int ehca_unmap_one_fmr(struct ehca_shca *shca, + struct ehca_mr *e_fmr); + +int ehca_reg_smr(struct ehca_shca *shca, + struct ehca_mr *e_origmr, + struct ehca_mr *e_newmr, + u64 *iova_start, + int acl, + struct ehca_pd *e_pd, + u32 *lkey, + u32 *rkey); + +int ehca_reg_internal_maxmr(struct ehca_shca *shca, + struct ehca_pd *e_pd, + struct ehca_mr **maxmr); + +int ehca_reg_maxmr(struct ehca_shca *shca, + struct ehca_mr *e_newmr, + u64 *iova_start, + int acl, + struct ehca_pd *e_pd, + u32 *lkey, + u32 *rkey); + +int ehca_dereg_internal_maxmr(struct ehca_shca *shca); + +int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array, + int num_phys_buf, + u64 *iova_start, + u64 *size); + +int ehca_fmr_check_page_list(struct ehca_mr *e_fmr, + u64 *page_list, + int list_len); + +int ehca_set_pagebuf(struct ehca_mr_pginfo *pginfo, + u32 number, + u64 *kpage); + +int ehca_mr_is_maxmr(u64 size, + u64 *iova_start); + +void ehca_mrmw_map_acl(int ib_acl, + u32 *hipz_acl); + +void ehca_mrmw_set_pgsize_hipz_acl(u32 pgsize, u32 *hipz_acl); + +void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl, + int *ib_acl); + +void ehca_mr_deletenew(struct ehca_mr *mr); + +int ehca_create_busmap(void); + +void ehca_destroy_busmap(void); + +extern struct ib_dma_mapping_ops ehca_dma_mapping_ops; +#endif /*_EHCA_MRMW_H_*/ diff --git a/drivers/staging/rdma/ehca/ehca_pd.c b/drivers/staging/rdma/ehca/ehca_pd.c new file mode 100644 index 000000000..351577a66 --- /dev/null +++ b/drivers/staging/rdma/ehca/ehca_pd.c @@ -0,0 +1,124 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * PD functions + * + * Authors: Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "ehca_tools.h" +#include "ehca_iverbs.h" + +static struct kmem_cache *pd_cache; + +struct ib_pd *ehca_alloc_pd(struct ib_device *device, + struct ib_ucontext *context, struct ib_udata *udata) +{ + struct ehca_pd *pd; + int i; + + pd = kmem_cache_zalloc(pd_cache, GFP_KERNEL); + if (!pd) { + ehca_err(device, "device=%p context=%p out of memory", + device, context); + return ERR_PTR(-ENOMEM); + } + + for (i = 0; i < 2; i++) { + INIT_LIST_HEAD(&pd->free[i]); + INIT_LIST_HEAD(&pd->full[i]); + } + mutex_init(&pd->lock); + + /* + * Kernel PD: when device = -1, 0 + * User PD: when context != -1 + */ + if (!context) { + /* + * Kernel PDs after init reuses always + * the one created in ehca_shca_reopen() + */ + struct ehca_shca *shca = container_of(device, struct ehca_shca, + ib_device); + pd->fw_pd.value = shca->pd->fw_pd.value; + } else + pd->fw_pd.value = (u64)pd; + + return &pd->ib_pd; +} + +int ehca_dealloc_pd(struct ib_pd *pd) +{ + struct ehca_pd *my_pd = container_of(pd, struct ehca_pd, ib_pd); + int i, leftovers = 0; + struct ipz_small_queue_page *page, *tmp; + + for (i = 0; i < 2; i++) { + list_splice(&my_pd->full[i], &my_pd->free[i]); + list_for_each_entry_safe(page, tmp, &my_pd->free[i], list) { + leftovers = 1; + free_page(page->page); + kmem_cache_free(small_qp_cache, page); + } + } + + if (leftovers) + ehca_warn(pd->device, + "Some small queue pages were not freed"); + + kmem_cache_free(pd_cache, my_pd); + + return 0; +} + +int ehca_init_pd_cache(void) +{ + pd_cache = kmem_cache_create("ehca_cache_pd", + sizeof(struct ehca_pd), 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!pd_cache) + return -ENOMEM; + return 0; +} + +void ehca_cleanup_pd_cache(void) +{ + if (pd_cache) + kmem_cache_destroy(pd_cache); +} diff --git a/drivers/staging/rdma/ehca/ehca_qes.h b/drivers/staging/rdma/ehca/ehca_qes.h new file mode 100644 index 000000000..90c4efa67 --- /dev/null +++ b/drivers/staging/rdma/ehca/ehca_qes.h @@ -0,0 +1,260 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Hardware request structures + * + * Authors: Waleri Fomin + * Reinhard Ernst + * Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +#ifndef _EHCA_QES_H_ +#define _EHCA_QES_H_ + +#include "ehca_tools.h" + +/* virtual scatter gather entry to specify remote addresses with length */ +struct ehca_vsgentry { + u64 vaddr; + u32 lkey; + u32 length; +}; + +#define GRH_FLAG_MASK EHCA_BMASK_IBM( 7, 7) +#define GRH_IPVERSION_MASK EHCA_BMASK_IBM( 0, 3) +#define GRH_TCLASS_MASK EHCA_BMASK_IBM( 4, 12) +#define GRH_FLOWLABEL_MASK EHCA_BMASK_IBM(13, 31) +#define GRH_PAYLEN_MASK EHCA_BMASK_IBM(32, 47) +#define GRH_NEXTHEADER_MASK EHCA_BMASK_IBM(48, 55) +#define GRH_HOPLIMIT_MASK EHCA_BMASK_IBM(56, 63) + +/* + * Unreliable Datagram Address Vector Format + * see IBTA Vol1 chapter 8.3 Global Routing Header + */ +struct ehca_ud_av { + u8 sl; + u8 lnh; + u16 dlid; + u8 reserved1; + u8 reserved2; + u8 reserved3; + u8 slid_path_bits; + u8 reserved4; + u8 ipd; + u8 reserved5; + u8 pmtu; + u32 reserved6; + u64 reserved7; + union { + struct { + u64 word_0; /* always set to 6 */ + /*should be 0x1B for IB transport */ + u64 word_1; + u64 word_2; + u64 word_3; + u64 word_4; + } grh; + struct { + u32 wd_0; + u32 wd_1; + /* DWord_1 --> SGID */ + + u32 sgid_wd3; + u32 sgid_wd2; + + u32 sgid_wd1; + u32 sgid_wd0; + /* DWord_3 --> DGID */ + + u32 dgid_wd3; + u32 dgid_wd2; + + u32 dgid_wd1; + u32 dgid_wd0; + } grh_l; + }; +}; + +/* maximum number of sg entries allowed in a WQE */ +#define MAX_WQE_SG_ENTRIES 252 + +#define WQE_OPTYPE_SEND 0x80 +#define WQE_OPTYPE_RDMAREAD 0x40 +#define WQE_OPTYPE_RDMAWRITE 0x20 +#define WQE_OPTYPE_CMPSWAP 0x10 +#define WQE_OPTYPE_FETCHADD 0x08 +#define WQE_OPTYPE_BIND 0x04 + +#define WQE_WRFLAG_REQ_SIGNAL_COM 0x80 +#define WQE_WRFLAG_FENCE 0x40 +#define WQE_WRFLAG_IMM_DATA_PRESENT 0x20 +#define WQE_WRFLAG_SOLIC_EVENT 0x10 + +#define WQEF_CACHE_HINT 0x80 +#define WQEF_CACHE_HINT_RD_WR 0x40 +#define WQEF_TIMED_WQE 0x20 +#define WQEF_PURGE 0x08 +#define WQEF_HIGH_NIBBLE 0xF0 + +#define MW_BIND_ACCESSCTRL_R_WRITE 0x40 +#define MW_BIND_ACCESSCTRL_R_READ 0x20 +#define MW_BIND_ACCESSCTRL_R_ATOMIC 0x10 + +struct ehca_wqe { + u64 work_request_id; + u8 optype; + u8 wr_flag; + u16 pkeyi; + u8 wqef; + u8 nr_of_data_seg; + u16 wqe_provided_slid; + u32 destination_qp_number; + u32 resync_psn_sqp; + u32 local_ee_context_qkey; + u32 immediate_data; + union { + struct { + u64 remote_virtual_address; + u32 rkey; + u32 reserved; + u64 atomic_1st_op_dma_len; + u64 atomic_2nd_op; + struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES]; + + } nud; + struct { + u64 ehca_ud_av_ptr; + u64 reserved1; + u64 reserved2; + u64 reserved3; + struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES]; + } ud_avp; + struct { + struct ehca_ud_av ud_av; + struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES - + 2]; + } ud_av; + struct { + u64 reserved0; + u64 reserved1; + u64 reserved2; + u64 reserved3; + struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES]; + } all_rcv; + + struct { + u64 reserved; + u32 rkey; + u32 old_rkey; + u64 reserved1; + u64 reserved2; + u64 virtual_address; + u32 reserved3; + u32 length; + u32 reserved4; + u16 reserved5; + u8 reserved6; + u8 lr_ctl; + u32 lkey; + u32 reserved7; + u64 reserved8; + u64 reserved9; + u64 reserved10; + u64 reserved11; + } bind; + struct { + u64 reserved12; + u64 reserved13; + u32 size; + u32 start; + } inline_data; + } u; + +}; + +#define WC_SEND_RECEIVE EHCA_BMASK_IBM(0, 0) +#define WC_IMM_DATA EHCA_BMASK_IBM(1, 1) +#define WC_GRH_PRESENT EHCA_BMASK_IBM(2, 2) +#define WC_SE_BIT EHCA_BMASK_IBM(3, 3) +#define WC_STATUS_ERROR_BIT 0x80000000 +#define WC_STATUS_REMOTE_ERROR_FLAGS 0x0000F800 +#define WC_STATUS_PURGE_BIT 0x10 +#define WC_SEND_RECEIVE_BIT 0x80 + +struct ehca_cqe { + u64 work_request_id; + u8 optype; + u8 w_completion_flags; + u16 reserved1; + u32 nr_bytes_transferred; + u32 immediate_data; + u32 local_qp_number; + u8 freed_resource_count; + u8 service_level; + u16 wqe_count; + u32 qp_token; + u32 qkey_ee_token; + u32 remote_qp_number; + u16 dlid; + u16 rlid; + u16 reserved2; + u16 pkey_index; + u32 cqe_timestamp; + u32 wqe_timestamp; + u8 wqe_timestamp_valid; + u8 reserved3; + u8 reserved4; + u8 cqe_flags; + u32 status; +}; + +struct ehca_eqe { + u64 entry; +}; + +struct ehca_mrte { + u64 starting_va; + u64 length; /* length of memory region in bytes*/ + u32 pd; + u8 key_instance; + u8 pagesize; + u8 mr_control; + u8 local_remote_access_ctrl; + u8 reserved[0x20 - 0x18]; + u64 at_pointer[4]; +}; +#endif /*_EHCA_QES_H_*/ diff --git a/drivers/staging/rdma/ehca/ehca_qp.c b/drivers/staging/rdma/ehca/ehca_qp.c new file mode 100644 index 000000000..2e89356c4 --- /dev/null +++ b/drivers/staging/rdma/ehca/ehca_qp.c @@ -0,0 +1,2257 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * QP functions + * + * Authors: Joachim Fenkes + * Stefan Roscher + * Waleri Fomin + * Hoang-Nam Nguyen + * Reinhard Ernst + * Heiko J Schick + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "ehca_classes.h" +#include "ehca_tools.h" +#include "ehca_qes.h" +#include "ehca_iverbs.h" +#include "hcp_if.h" +#include "hipz_fns.h" + +static struct kmem_cache *qp_cache; + +/* + * attributes not supported by query qp + */ +#define QP_ATTR_QUERY_NOT_SUPPORTED (IB_QP_ACCESS_FLAGS | \ + IB_QP_EN_SQD_ASYNC_NOTIFY) + +/* + * ehca (internal) qp state values + */ +enum ehca_qp_state { + EHCA_QPS_RESET = 1, + EHCA_QPS_INIT = 2, + EHCA_QPS_RTR = 3, + EHCA_QPS_RTS = 5, + EHCA_QPS_SQD = 6, + EHCA_QPS_SQE = 8, + EHCA_QPS_ERR = 128 +}; + +/* + * qp state transitions as defined by IB Arch Rel 1.1 page 431 + */ +enum ib_qp_statetrans { + IB_QPST_ANY2RESET, + IB_QPST_ANY2ERR, + IB_QPST_RESET2INIT, + IB_QPST_INIT2RTR, + IB_QPST_INIT2INIT, + IB_QPST_RTR2RTS, + IB_QPST_RTS2SQD, + IB_QPST_RTS2RTS, + IB_QPST_SQD2RTS, + IB_QPST_SQE2RTS, + IB_QPST_SQD2SQD, + IB_QPST_MAX /* nr of transitions, this must be last!!! */ +}; + +/* + * ib2ehca_qp_state maps IB to ehca qp_state + * returns ehca qp state corresponding to given ib qp state + */ +static inline enum ehca_qp_state ib2ehca_qp_state(enum ib_qp_state ib_qp_state) +{ + switch (ib_qp_state) { + case IB_QPS_RESET: + return EHCA_QPS_RESET; + case IB_QPS_INIT: + return EHCA_QPS_INIT; + case IB_QPS_RTR: + return EHCA_QPS_RTR; + case IB_QPS_RTS: + return EHCA_QPS_RTS; + case IB_QPS_SQD: + return EHCA_QPS_SQD; + case IB_QPS_SQE: + return EHCA_QPS_SQE; + case IB_QPS_ERR: + return EHCA_QPS_ERR; + default: + ehca_gen_err("invalid ib_qp_state=%x", ib_qp_state); + return -EINVAL; + } +} + +/* + * ehca2ib_qp_state maps ehca to IB qp_state + * returns ib qp state corresponding to given ehca qp state + */ +static inline enum ib_qp_state ehca2ib_qp_state(enum ehca_qp_state + ehca_qp_state) +{ + switch (ehca_qp_state) { + case EHCA_QPS_RESET: + return IB_QPS_RESET; + case EHCA_QPS_INIT: + return IB_QPS_INIT; + case EHCA_QPS_RTR: + return IB_QPS_RTR; + case EHCA_QPS_RTS: + return IB_QPS_RTS; + case EHCA_QPS_SQD: + return IB_QPS_SQD; + case EHCA_QPS_SQE: + return IB_QPS_SQE; + case EHCA_QPS_ERR: + return IB_QPS_ERR; + default: + ehca_gen_err("invalid ehca_qp_state=%x", ehca_qp_state); + return -EINVAL; + } +} + +/* + * ehca_qp_type used as index for req_attr and opt_attr of + * struct ehca_modqp_statetrans + */ +enum ehca_qp_type { + QPT_RC = 0, + QPT_UC = 1, + QPT_UD = 2, + QPT_SQP = 3, + QPT_MAX +}; + +/* + * ib2ehcaqptype maps Ib to ehca qp_type + * returns ehca qp type corresponding to ib qp type + */ +static inline enum ehca_qp_type ib2ehcaqptype(enum ib_qp_type ibqptype) +{ + switch (ibqptype) { + case IB_QPT_SMI: + case IB_QPT_GSI: + return QPT_SQP; + case IB_QPT_RC: + return QPT_RC; + case IB_QPT_UC: + return QPT_UC; + case IB_QPT_UD: + return QPT_UD; + default: + ehca_gen_err("Invalid ibqptype=%x", ibqptype); + return -EINVAL; + } +} + +static inline enum ib_qp_statetrans get_modqp_statetrans(int ib_fromstate, + int ib_tostate) +{ + int index = -EINVAL; + switch (ib_tostate) { + case IB_QPS_RESET: + index = IB_QPST_ANY2RESET; + break; + case IB_QPS_INIT: + switch (ib_fromstate) { + case IB_QPS_RESET: + index = IB_QPST_RESET2INIT; + break; + case IB_QPS_INIT: + index = IB_QPST_INIT2INIT; + break; + } + break; + case IB_QPS_RTR: + if (ib_fromstate == IB_QPS_INIT) + index = IB_QPST_INIT2RTR; + break; + case IB_QPS_RTS: + switch (ib_fromstate) { + case IB_QPS_RTR: + index = IB_QPST_RTR2RTS; + break; + case IB_QPS_RTS: + index = IB_QPST_RTS2RTS; + break; + case IB_QPS_SQD: + index = IB_QPST_SQD2RTS; + break; + case IB_QPS_SQE: + index = IB_QPST_SQE2RTS; + break; + } + break; + case IB_QPS_SQD: + if (ib_fromstate == IB_QPS_RTS) + index = IB_QPST_RTS2SQD; + break; + case IB_QPS_SQE: + break; + case IB_QPS_ERR: + index = IB_QPST_ANY2ERR; + break; + default: + break; + } + return index; +} + +/* + * ibqptype2servicetype returns hcp service type corresponding to given + * ib qp type used by create_qp() + */ +static inline int ibqptype2servicetype(enum ib_qp_type ibqptype) +{ + switch (ibqptype) { + case IB_QPT_SMI: + case IB_QPT_GSI: + return ST_UD; + case IB_QPT_RC: + return ST_RC; + case IB_QPT_UC: + return ST_UC; + case IB_QPT_UD: + return ST_UD; + case IB_QPT_RAW_IPV6: + return -EINVAL; + case IB_QPT_RAW_ETHERTYPE: + return -EINVAL; + default: + ehca_gen_err("Invalid ibqptype=%x", ibqptype); + return -EINVAL; + } +} + +/* + * init userspace queue info from ipz_queue data + */ +static inline void queue2resp(struct ipzu_queue_resp *resp, + struct ipz_queue *queue) +{ + resp->qe_size = queue->qe_size; + resp->act_nr_of_sg = queue->act_nr_of_sg; + resp->queue_length = queue->queue_length; + resp->pagesize = queue->pagesize; + resp->toggle_state = queue->toggle_state; + resp->offset = queue->offset; +} + +/* + * init_qp_queue initializes/constructs r/squeue and registers queue pages. + */ +static inline int init_qp_queue(struct ehca_shca *shca, + struct ehca_pd *pd, + struct ehca_qp *my_qp, + struct ipz_queue *queue, + int q_type, + u64 expected_hret, + struct ehca_alloc_queue_parms *parms, + int wqe_size) +{ + int ret, cnt, ipz_rc, nr_q_pages; + void *vpage; + u64 rpage, h_ret; + struct ib_device *ib_dev = &shca->ib_device; + struct ipz_adapter_handle ipz_hca_handle = shca->ipz_hca_handle; + + if (!parms->queue_size) + return 0; + + if (parms->is_small) { + nr_q_pages = 1; + ipz_rc = ipz_queue_ctor(pd, queue, nr_q_pages, + 128 << parms->page_size, + wqe_size, parms->act_nr_sges, 1); + } else { + nr_q_pages = parms->queue_size; + ipz_rc = ipz_queue_ctor(pd, queue, nr_q_pages, + EHCA_PAGESIZE, wqe_size, + parms->act_nr_sges, 0); + } + + if (!ipz_rc) { + ehca_err(ib_dev, "Cannot allocate page for queue. ipz_rc=%i", + ipz_rc); + return -EBUSY; + } + + /* register queue pages */ + for (cnt = 0; cnt < nr_q_pages; cnt++) { + vpage = ipz_qpageit_get_inc(queue); + if (!vpage) { + ehca_err(ib_dev, "ipz_qpageit_get_inc() " + "failed p_vpage= %p", vpage); + ret = -EINVAL; + goto init_qp_queue1; + } + rpage = __pa(vpage); + + h_ret = hipz_h_register_rpage_qp(ipz_hca_handle, + my_qp->ipz_qp_handle, + NULL, 0, q_type, + rpage, parms->is_small ? 0 : 1, + my_qp->galpas.kernel); + if (cnt == (nr_q_pages - 1)) { /* last page! */ + if (h_ret != expected_hret) { + ehca_err(ib_dev, "hipz_qp_register_rpage() " + "h_ret=%lli", h_ret); + ret = ehca2ib_return_code(h_ret); + goto init_qp_queue1; + } + vpage = ipz_qpageit_get_inc(&my_qp->ipz_rqueue); + if (vpage) { + ehca_err(ib_dev, "ipz_qpageit_get_inc() " + "should not succeed vpage=%p", vpage); + ret = -EINVAL; + goto init_qp_queue1; + } + } else { + if (h_ret != H_PAGE_REGISTERED) { + ehca_err(ib_dev, "hipz_qp_register_rpage() " + "h_ret=%lli", h_ret); + ret = ehca2ib_return_code(h_ret); + goto init_qp_queue1; + } + } + } + + ipz_qeit_reset(queue); + + return 0; + +init_qp_queue1: + ipz_queue_dtor(pd, queue); + return ret; +} + +static inline int ehca_calc_wqe_size(int act_nr_sge, int is_llqp) +{ + if (is_llqp) + return 128 << act_nr_sge; + else + return offsetof(struct ehca_wqe, + u.nud.sg_list[act_nr_sge]); +} + +static void ehca_determine_small_queue(struct ehca_alloc_queue_parms *queue, + int req_nr_sge, int is_llqp) +{ + u32 wqe_size, q_size; + int act_nr_sge = req_nr_sge; + + if (!is_llqp) + /* round up #SGEs so WQE size is a power of 2 */ + for (act_nr_sge = 4; act_nr_sge <= 252; + act_nr_sge = 4 + 2 * act_nr_sge) + if (act_nr_sge >= req_nr_sge) + break; + + wqe_size = ehca_calc_wqe_size(act_nr_sge, is_llqp); + q_size = wqe_size * (queue->max_wr + 1); + + if (q_size <= 512) + queue->page_size = 2; + else if (q_size <= 1024) + queue->page_size = 3; + else + queue->page_size = 0; + + queue->is_small = (queue->page_size != 0); +} + +/* needs to be called with cq->spinlock held */ +void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq) +{ + struct list_head *list, *node; + + /* TODO: support low latency QPs */ + if (qp->ext_type == EQPT_LLQP) + return; + + if (on_sq) { + list = &qp->send_cq->sqp_err_list; + node = &qp->sq_err_node; + } else { + list = &qp->recv_cq->rqp_err_list; + node = &qp->rq_err_node; + } + + if (list_empty(node)) + list_add_tail(node, list); + + return; +} + +static void del_from_err_list(struct ehca_cq *cq, struct list_head *node) +{ + unsigned long flags; + + spin_lock_irqsave(&cq->spinlock, flags); + + if (!list_empty(node)) + list_del_init(node); + + spin_unlock_irqrestore(&cq->spinlock, flags); +} + +static void reset_queue_map(struct ehca_queue_map *qmap) +{ + int i; + + qmap->tail = qmap->entries - 1; + qmap->left_to_poll = 0; + qmap->next_wqe_idx = 0; + for (i = 0; i < qmap->entries; i++) { + qmap->map[i].reported = 1; + qmap->map[i].cqe_req = 0; + } +} + +/* + * Create an ib_qp struct that is either a QP or an SRQ, depending on + * the value of the is_srq parameter. If init_attr and srq_init_attr share + * fields, the field out of init_attr is used. + */ +static struct ehca_qp *internal_create_qp( + struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata, int is_srq) +{ + struct ehca_qp *my_qp, *my_srq = NULL; + struct ehca_pd *my_pd = container_of(pd, struct ehca_pd, ib_pd); + struct ehca_shca *shca = container_of(pd->device, struct ehca_shca, + ib_device); + struct ib_ucontext *context = NULL; + u64 h_ret; + int is_llqp = 0, has_srq = 0, is_user = 0; + int qp_type, max_send_sge, max_recv_sge, ret; + + /* h_call's out parameters */ + struct ehca_alloc_qp_parms parms; + u32 swqe_size = 0, rwqe_size = 0, ib_qp_num; + unsigned long flags; + + if (!atomic_add_unless(&shca->num_qps, 1, shca->max_num_qps)) { + ehca_err(pd->device, "Unable to create QP, max number of %i " + "QPs reached.", shca->max_num_qps); + ehca_err(pd->device, "To increase the maximum number of QPs " + "use the number_of_qps module parameter.\n"); + return ERR_PTR(-ENOSPC); + } + + if (init_attr->create_flags) { + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + + memset(&parms, 0, sizeof(parms)); + qp_type = init_attr->qp_type; + + if (init_attr->sq_sig_type != IB_SIGNAL_REQ_WR && + init_attr->sq_sig_type != IB_SIGNAL_ALL_WR) { + ehca_err(pd->device, "init_attr->sg_sig_type=%x not allowed", + init_attr->sq_sig_type); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + + /* save LLQP info */ + if (qp_type & 0x80) { + is_llqp = 1; + parms.ext_type = EQPT_LLQP; + parms.ll_comp_flags = qp_type & LLQP_COMP_MASK; + } + qp_type &= 0x1F; + init_attr->qp_type &= 0x1F; + + /* handle SRQ base QPs */ + if (init_attr->srq) { + my_srq = container_of(init_attr->srq, struct ehca_qp, ib_srq); + + if (qp_type == IB_QPT_UC) { + ehca_err(pd->device, "UC with SRQ not supported"); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + + has_srq = 1; + parms.ext_type = EQPT_SRQBASE; + parms.srq_qpn = my_srq->real_qp_num; + } + + if (is_llqp && has_srq) { + ehca_err(pd->device, "LLQPs can't have an SRQ"); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + + /* handle SRQs */ + if (is_srq) { + parms.ext_type = EQPT_SRQ; + parms.srq_limit = srq_init_attr->attr.srq_limit; + if (init_attr->cap.max_recv_sge > 3) { + ehca_err(pd->device, "no more than three SGEs " + "supported for SRQ pd=%p max_sge=%x", + pd, init_attr->cap.max_recv_sge); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + } + + /* check QP type */ + if (qp_type != IB_QPT_UD && + qp_type != IB_QPT_UC && + qp_type != IB_QPT_RC && + qp_type != IB_QPT_SMI && + qp_type != IB_QPT_GSI) { + ehca_err(pd->device, "wrong QP Type=%x", qp_type); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + + if (is_llqp) { + switch (qp_type) { + case IB_QPT_RC: + if ((init_attr->cap.max_send_wr > 255) || + (init_attr->cap.max_recv_wr > 255)) { + ehca_err(pd->device, + "Invalid Number of max_sq_wr=%x " + "or max_rq_wr=%x for RC LLQP", + init_attr->cap.max_send_wr, + init_attr->cap.max_recv_wr); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + break; + case IB_QPT_UD: + if (!EHCA_BMASK_GET(HCA_CAP_UD_LL_QP, shca->hca_cap)) { + ehca_err(pd->device, "UD LLQP not supported " + "by this adapter"); + atomic_dec(&shca->num_qps); + return ERR_PTR(-ENOSYS); + } + if (!(init_attr->cap.max_send_sge <= 5 + && init_attr->cap.max_send_sge >= 1 + && init_attr->cap.max_recv_sge <= 5 + && init_attr->cap.max_recv_sge >= 1)) { + ehca_err(pd->device, + "Invalid Number of max_send_sge=%x " + "or max_recv_sge=%x for UD LLQP", + init_attr->cap.max_send_sge, + init_attr->cap.max_recv_sge); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } else if (init_attr->cap.max_send_wr > 255) { + ehca_err(pd->device, + "Invalid Number of " + "max_send_wr=%x for UD QP_TYPE=%x", + init_attr->cap.max_send_wr, qp_type); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + break; + default: + ehca_err(pd->device, "unsupported LL QP Type=%x", + qp_type); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + } else { + int max_sge = (qp_type == IB_QPT_UD || qp_type == IB_QPT_SMI + || qp_type == IB_QPT_GSI) ? 250 : 252; + + if (init_attr->cap.max_send_sge > max_sge + || init_attr->cap.max_recv_sge > max_sge) { + ehca_err(pd->device, "Invalid number of SGEs requested " + "send_sge=%x recv_sge=%x max_sge=%x", + init_attr->cap.max_send_sge, + init_attr->cap.max_recv_sge, max_sge); + atomic_dec(&shca->num_qps); + return ERR_PTR(-EINVAL); + } + } + + my_qp = kmem_cache_zalloc(qp_cache, GFP_KERNEL); + if (!my_qp) { + ehca_err(pd->device, "pd=%p not enough memory to alloc qp", pd); + atomic_dec(&shca->num_qps); + return ERR_PTR(-ENOMEM); + } + + if (pd->uobject && udata) { + is_user = 1; + context = pd->uobject->context; + } + + atomic_set(&my_qp->nr_events, 0); + init_waitqueue_head(&my_qp->wait_completion); + spin_lock_init(&my_qp->spinlock_s); + spin_lock_init(&my_qp->spinlock_r); + my_qp->qp_type = qp_type; + my_qp->ext_type = parms.ext_type; + my_qp->state = IB_QPS_RESET; + + if (init_attr->recv_cq) + my_qp->recv_cq = + container_of(init_attr->recv_cq, struct ehca_cq, ib_cq); + if (init_attr->send_cq) + my_qp->send_cq = + container_of(init_attr->send_cq, struct ehca_cq, ib_cq); + + idr_preload(GFP_KERNEL); + write_lock_irqsave(&ehca_qp_idr_lock, flags); + + ret = idr_alloc(&ehca_qp_idr, my_qp, 0, 0x2000000, GFP_NOWAIT); + if (ret >= 0) + my_qp->token = ret; + + write_unlock_irqrestore(&ehca_qp_idr_lock, flags); + idr_preload_end(); + if (ret < 0) { + if (ret == -ENOSPC) { + ret = -EINVAL; + ehca_err(pd->device, "Invalid number of qp"); + } else { + ret = -ENOMEM; + ehca_err(pd->device, "Can't allocate new idr entry."); + } + goto create_qp_exit0; + } + + if (has_srq) + parms.srq_token = my_qp->token; + + parms.servicetype = ibqptype2servicetype(qp_type); + if (parms.servicetype < 0) { + ret = -EINVAL; + ehca_err(pd->device, "Invalid qp_type=%x", qp_type); + goto create_qp_exit1; + } + + /* Always signal by WQE so we can hide circ. WQEs */ + parms.sigtype = HCALL_SIGT_BY_WQE; + + /* UD_AV CIRCUMVENTION */ + max_send_sge = init_attr->cap.max_send_sge; + max_recv_sge = init_attr->cap.max_recv_sge; + if (parms.servicetype == ST_UD && !is_llqp) { + max_send_sge += 2; + max_recv_sge += 2; + } + + parms.token = my_qp->token; + parms.eq_handle = shca->eq.ipz_eq_handle; + parms.pd = my_pd->fw_pd; + if (my_qp->send_cq) + parms.send_cq_handle = my_qp->send_cq->ipz_cq_handle; + if (my_qp->recv_cq) + parms.recv_cq_handle = my_qp->recv_cq->ipz_cq_handle; + + parms.squeue.max_wr = init_attr->cap.max_send_wr; + parms.rqueue.max_wr = init_attr->cap.max_recv_wr; + parms.squeue.max_sge = max_send_sge; + parms.rqueue.max_sge = max_recv_sge; + + /* RC QPs need one more SWQE for unsolicited ack circumvention */ + if (qp_type == IB_QPT_RC) + parms.squeue.max_wr++; + + if (EHCA_BMASK_GET(HCA_CAP_MINI_QP, shca->hca_cap)) { + if (HAS_SQ(my_qp)) + ehca_determine_small_queue( + &parms.squeue, max_send_sge, is_llqp); + if (HAS_RQ(my_qp)) + ehca_determine_small_queue( + &parms.rqueue, max_recv_sge, is_llqp); + parms.qp_storage = + (parms.squeue.is_small || parms.rqueue.is_small); + } + + h_ret = hipz_h_alloc_resource_qp(shca->ipz_hca_handle, &parms, is_user); + if (h_ret != H_SUCCESS) { + ehca_err(pd->device, "h_alloc_resource_qp() failed h_ret=%lli", + h_ret); + ret = ehca2ib_return_code(h_ret); + goto create_qp_exit1; + } + + ib_qp_num = my_qp->real_qp_num = parms.real_qp_num; + my_qp->ipz_qp_handle = parms.qp_handle; + my_qp->galpas = parms.galpas; + + swqe_size = ehca_calc_wqe_size(parms.squeue.act_nr_sges, is_llqp); + rwqe_size = ehca_calc_wqe_size(parms.rqueue.act_nr_sges, is_llqp); + + switch (qp_type) { + case IB_QPT_RC: + if (is_llqp) { + parms.squeue.act_nr_sges = 1; + parms.rqueue.act_nr_sges = 1; + } + /* hide the extra WQE */ + parms.squeue.act_nr_wqes--; + break; + case IB_QPT_UD: + case IB_QPT_GSI: + case IB_QPT_SMI: + /* UD circumvention */ + if (is_llqp) { + parms.squeue.act_nr_sges = 1; + parms.rqueue.act_nr_sges = 1; + } else { + parms.squeue.act_nr_sges -= 2; + parms.rqueue.act_nr_sges -= 2; + } + + if (IB_QPT_GSI == qp_type || IB_QPT_SMI == qp_type) { + parms.squeue.act_nr_wqes = init_attr->cap.max_send_wr; + parms.rqueue.act_nr_wqes = init_attr->cap.max_recv_wr; + parms.squeue.act_nr_sges = init_attr->cap.max_send_sge; + parms.rqueue.act_nr_sges = init_attr->cap.max_recv_sge; + ib_qp_num = (qp_type == IB_QPT_SMI) ? 0 : 1; + } + + break; + + default: + break; + } + + /* initialize r/squeue and register queue pages */ + if (HAS_SQ(my_qp)) { + ret = init_qp_queue( + shca, my_pd, my_qp, &my_qp->ipz_squeue, 0, + HAS_RQ(my_qp) ? H_PAGE_REGISTERED : H_SUCCESS, + &parms.squeue, swqe_size); + if (ret) { + ehca_err(pd->device, "Couldn't initialize squeue " + "and pages ret=%i", ret); + goto create_qp_exit2; + } + + if (!is_user) { + my_qp->sq_map.entries = my_qp->ipz_squeue.queue_length / + my_qp->ipz_squeue.qe_size; + my_qp->sq_map.map = vmalloc(my_qp->sq_map.entries * + sizeof(struct ehca_qmap_entry)); + if (!my_qp->sq_map.map) { + ehca_err(pd->device, "Couldn't allocate squeue " + "map ret=%i", ret); + goto create_qp_exit3; + } + INIT_LIST_HEAD(&my_qp->sq_err_node); + /* to avoid the generation of bogus flush CQEs */ + reset_queue_map(&my_qp->sq_map); + } + } + + if (HAS_RQ(my_qp)) { + ret = init_qp_queue( + shca, my_pd, my_qp, &my_qp->ipz_rqueue, 1, + H_SUCCESS, &parms.rqueue, rwqe_size); + if (ret) { + ehca_err(pd->device, "Couldn't initialize rqueue " + "and pages ret=%i", ret); + goto create_qp_exit4; + } + if (!is_user) { + my_qp->rq_map.entries = my_qp->ipz_rqueue.queue_length / + my_qp->ipz_rqueue.qe_size; + my_qp->rq_map.map = vmalloc(my_qp->rq_map.entries * + sizeof(struct ehca_qmap_entry)); + if (!my_qp->rq_map.map) { + ehca_err(pd->device, "Couldn't allocate squeue " + "map ret=%i", ret); + goto create_qp_exit5; + } + INIT_LIST_HEAD(&my_qp->rq_err_node); + /* to avoid the generation of bogus flush CQEs */ + reset_queue_map(&my_qp->rq_map); + } + } else if (init_attr->srq && !is_user) { + /* this is a base QP, use the queue map of the SRQ */ + my_qp->rq_map = my_srq->rq_map; + INIT_LIST_HEAD(&my_qp->rq_err_node); + + my_qp->ipz_rqueue = my_srq->ipz_rqueue; + } + + if (is_srq) { + my_qp->ib_srq.pd = &my_pd->ib_pd; + my_qp->ib_srq.device = my_pd->ib_pd.device; + + my_qp->ib_srq.srq_context = init_attr->qp_context; + my_qp->ib_srq.event_handler = init_attr->event_handler; + } else { + my_qp->ib_qp.qp_num = ib_qp_num; + my_qp->ib_qp.pd = &my_pd->ib_pd; + my_qp->ib_qp.device = my_pd->ib_pd.device; + + my_qp->ib_qp.recv_cq = init_attr->recv_cq; + my_qp->ib_qp.send_cq = init_attr->send_cq; + + my_qp->ib_qp.qp_type = qp_type; + my_qp->ib_qp.srq = init_attr->srq; + + my_qp->ib_qp.qp_context = init_attr->qp_context; + my_qp->ib_qp.event_handler = init_attr->event_handler; + } + + init_attr->cap.max_inline_data = 0; /* not supported yet */ + init_attr->cap.max_recv_sge = parms.rqueue.act_nr_sges; + init_attr->cap.max_recv_wr = parms.rqueue.act_nr_wqes; + init_attr->cap.max_send_sge = parms.squeue.act_nr_sges; + init_attr->cap.max_send_wr = parms.squeue.act_nr_wqes; + my_qp->init_attr = *init_attr; + + if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) { + shca->sport[init_attr->port_num - 1].ibqp_sqp[qp_type] = + &my_qp->ib_qp; + if (ehca_nr_ports < 0) { + /* alloc array to cache subsequent modify qp parms + * for autodetect mode + */ + my_qp->mod_qp_parm = + kzalloc(EHCA_MOD_QP_PARM_MAX * + sizeof(*my_qp->mod_qp_parm), + GFP_KERNEL); + if (!my_qp->mod_qp_parm) { + ehca_err(pd->device, + "Could not alloc mod_qp_parm"); + goto create_qp_exit5; + } + } + } + + /* NOTE: define_apq0() not supported yet */ + if (qp_type == IB_QPT_GSI) { + h_ret = ehca_define_sqp(shca, my_qp, init_attr); + if (h_ret != H_SUCCESS) { + kfree(my_qp->mod_qp_parm); + my_qp->mod_qp_parm = NULL; + /* the QP pointer is no longer valid */ + shca->sport[init_attr->port_num - 1].ibqp_sqp[qp_type] = + NULL; + ret = ehca2ib_return_code(h_ret); + goto create_qp_exit6; + } + } + + if (my_qp->send_cq) { + ret = ehca_cq_assign_qp(my_qp->send_cq, my_qp); + if (ret) { + ehca_err(pd->device, + "Couldn't assign qp to send_cq ret=%i", ret); + goto create_qp_exit7; + } + } + + /* copy queues, galpa data to user space */ + if (context && udata) { + struct ehca_create_qp_resp resp; + memset(&resp, 0, sizeof(resp)); + + resp.qp_num = my_qp->real_qp_num; + resp.token = my_qp->token; + resp.qp_type = my_qp->qp_type; + resp.ext_type = my_qp->ext_type; + resp.qkey = my_qp->qkey; + resp.real_qp_num = my_qp->real_qp_num; + + if (HAS_SQ(my_qp)) + queue2resp(&resp.ipz_squeue, &my_qp->ipz_squeue); + if (HAS_RQ(my_qp)) + queue2resp(&resp.ipz_rqueue, &my_qp->ipz_rqueue); + resp.fw_handle_ofs = (u32) + (my_qp->galpas.user.fw_handle & (PAGE_SIZE - 1)); + + if (ib_copy_to_udata(udata, &resp, sizeof resp)) { + ehca_err(pd->device, "Copy to udata failed"); + ret = -EINVAL; + goto create_qp_exit8; + } + } + + return my_qp; + +create_qp_exit8: + ehca_cq_unassign_qp(my_qp->send_cq, my_qp->real_qp_num); + +create_qp_exit7: + kfree(my_qp->mod_qp_parm); + +create_qp_exit6: + if (HAS_RQ(my_qp) && !is_user) + vfree(my_qp->rq_map.map); + +create_qp_exit5: + if (HAS_RQ(my_qp)) + ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue); + +create_qp_exit4: + if (HAS_SQ(my_qp) && !is_user) + vfree(my_qp->sq_map.map); + +create_qp_exit3: + if (HAS_SQ(my_qp)) + ipz_queue_dtor(my_pd, &my_qp->ipz_squeue); + +create_qp_exit2: + hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp); + +create_qp_exit1: + write_lock_irqsave(&ehca_qp_idr_lock, flags); + idr_remove(&ehca_qp_idr, my_qp->token); + write_unlock_irqrestore(&ehca_qp_idr_lock, flags); + +create_qp_exit0: + kmem_cache_free(qp_cache, my_qp); + atomic_dec(&shca->num_qps); + return ERR_PTR(ret); +} + +struct ib_qp *ehca_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr, + struct ib_udata *udata) +{ + struct ehca_qp *ret; + + ret = internal_create_qp(pd, qp_init_attr, NULL, udata, 0); + return IS_ERR(ret) ? (struct ib_qp *)ret : &ret->ib_qp; +} + +static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp, + struct ib_uobject *uobject); + +struct ib_srq *ehca_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata) +{ + struct ib_qp_init_attr qp_init_attr; + struct ehca_qp *my_qp; + struct ib_srq *ret; + struct ehca_shca *shca = container_of(pd->device, struct ehca_shca, + ib_device); + struct hcp_modify_qp_control_block *mqpcb; + u64 hret, update_mask; + + if (srq_init_attr->srq_type != IB_SRQT_BASIC) + return ERR_PTR(-ENOSYS); + + /* For common attributes, internal_create_qp() takes its info + * out of qp_init_attr, so copy all common attrs there. + */ + memset(&qp_init_attr, 0, sizeof(qp_init_attr)); + qp_init_attr.event_handler = srq_init_attr->event_handler; + qp_init_attr.qp_context = srq_init_attr->srq_context; + qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.qp_type = IB_QPT_RC; + qp_init_attr.cap.max_recv_wr = srq_init_attr->attr.max_wr; + qp_init_attr.cap.max_recv_sge = srq_init_attr->attr.max_sge; + + my_qp = internal_create_qp(pd, &qp_init_attr, srq_init_attr, udata, 1); + if (IS_ERR(my_qp)) + return (struct ib_srq *)my_qp; + + /* copy back return values */ + srq_init_attr->attr.max_wr = qp_init_attr.cap.max_recv_wr; + srq_init_attr->attr.max_sge = 3; + + /* drive SRQ into RTR state */ + mqpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!mqpcb) { + ehca_err(pd->device, "Could not get zeroed page for mqpcb " + "ehca_qp=%p qp_num=%x ", my_qp, my_qp->real_qp_num); + ret = ERR_PTR(-ENOMEM); + goto create_srq1; + } + + mqpcb->qp_state = EHCA_QPS_INIT; + mqpcb->prim_phys_port = 1; + update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1); + hret = hipz_h_modify_qp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + &my_qp->pf, + update_mask, + mqpcb, my_qp->galpas.kernel); + if (hret != H_SUCCESS) { + ehca_err(pd->device, "Could not modify SRQ to INIT " + "ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, my_qp->real_qp_num, hret); + goto create_srq2; + } + + mqpcb->qp_enable = 1; + update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_ENABLE, 1); + hret = hipz_h_modify_qp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + &my_qp->pf, + update_mask, + mqpcb, my_qp->galpas.kernel); + if (hret != H_SUCCESS) { + ehca_err(pd->device, "Could not enable SRQ " + "ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, my_qp->real_qp_num, hret); + goto create_srq2; + } + + mqpcb->qp_state = EHCA_QPS_RTR; + update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1); + hret = hipz_h_modify_qp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + &my_qp->pf, + update_mask, + mqpcb, my_qp->galpas.kernel); + if (hret != H_SUCCESS) { + ehca_err(pd->device, "Could not modify SRQ to RTR " + "ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, my_qp->real_qp_num, hret); + goto create_srq2; + } + + ehca_free_fw_ctrlblock(mqpcb); + + return &my_qp->ib_srq; + +create_srq2: + ret = ERR_PTR(ehca2ib_return_code(hret)); + ehca_free_fw_ctrlblock(mqpcb); + +create_srq1: + internal_destroy_qp(pd->device, my_qp, my_qp->ib_srq.uobject); + + return ret; +} + +/* + * prepare_sqe_rts called by internal_modify_qp() at trans sqe -> rts + * set purge bit of bad wqe and subsequent wqes to avoid reentering sqe + * returns total number of bad wqes in bad_wqe_cnt + */ +static int prepare_sqe_rts(struct ehca_qp *my_qp, struct ehca_shca *shca, + int *bad_wqe_cnt) +{ + u64 h_ret; + struct ipz_queue *squeue; + void *bad_send_wqe_p, *bad_send_wqe_v; + u64 q_ofs; + struct ehca_wqe *wqe; + int qp_num = my_qp->ib_qp.qp_num; + + /* get send wqe pointer */ + h_ret = hipz_h_disable_and_get_wqe(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, &my_qp->pf, + &bad_send_wqe_p, NULL, 2); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "hipz_h_disable_and_get_wqe() failed" + " ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, qp_num, h_ret); + return ehca2ib_return_code(h_ret); + } + bad_send_wqe_p = (void *)((u64)bad_send_wqe_p & (~(1L << 63))); + ehca_dbg(&shca->ib_device, "qp_num=%x bad_send_wqe_p=%p", + qp_num, bad_send_wqe_p); + /* convert wqe pointer to vadr */ + bad_send_wqe_v = __va((u64)bad_send_wqe_p); + if (ehca_debug_level >= 2) + ehca_dmp(bad_send_wqe_v, 32, "qp_num=%x bad_wqe", qp_num); + squeue = &my_qp->ipz_squeue; + if (ipz_queue_abs_to_offset(squeue, (u64)bad_send_wqe_p, &q_ofs)) { + ehca_err(&shca->ib_device, "failed to get wqe offset qp_num=%x" + " bad_send_wqe_p=%p", qp_num, bad_send_wqe_p); + return -EFAULT; + } + + /* loop sets wqe's purge bit */ + wqe = (struct ehca_wqe *)ipz_qeit_calc(squeue, q_ofs); + *bad_wqe_cnt = 0; + while (wqe->optype != 0xff && wqe->wqef != 0xff) { + if (ehca_debug_level >= 2) + ehca_dmp(wqe, 32, "qp_num=%x wqe", qp_num); + wqe->nr_of_data_seg = 0; /* suppress data access */ + wqe->wqef = WQEF_PURGE; /* WQE to be purged */ + q_ofs = ipz_queue_advance_offset(squeue, q_ofs); + wqe = (struct ehca_wqe *)ipz_qeit_calc(squeue, q_ofs); + *bad_wqe_cnt = (*bad_wqe_cnt)+1; + } + /* + * bad wqe will be reprocessed and ignored when pol_cq() is called, + * i.e. nr of wqes with flush error status is one less + */ + ehca_dbg(&shca->ib_device, "qp_num=%x flusherr_wqe_cnt=%x", + qp_num, (*bad_wqe_cnt)-1); + wqe->wqef = 0; + + return 0; +} + +static int calc_left_cqes(u64 wqe_p, struct ipz_queue *ipz_queue, + struct ehca_queue_map *qmap) +{ + void *wqe_v; + u64 q_ofs; + u32 wqe_idx; + unsigned int tail_idx; + + /* convert real to abs address */ + wqe_p = wqe_p & (~(1UL << 63)); + + wqe_v = __va(wqe_p); + + if (ipz_queue_abs_to_offset(ipz_queue, wqe_p, &q_ofs)) { + ehca_gen_err("Invalid offset for calculating left cqes " + "wqe_p=%#llx wqe_v=%p\n", wqe_p, wqe_v); + return -EFAULT; + } + + tail_idx = next_index(qmap->tail, qmap->entries); + wqe_idx = q_ofs / ipz_queue->qe_size; + + /* check all processed wqes, whether a cqe is requested or not */ + while (tail_idx != wqe_idx) { + if (qmap->map[tail_idx].cqe_req) + qmap->left_to_poll++; + tail_idx = next_index(tail_idx, qmap->entries); + } + /* save index in queue, where we have to start flushing */ + qmap->next_wqe_idx = wqe_idx; + return 0; +} + +static int check_for_left_cqes(struct ehca_qp *my_qp, struct ehca_shca *shca) +{ + u64 h_ret; + void *send_wqe_p, *recv_wqe_p; + int ret; + unsigned long flags; + int qp_num = my_qp->ib_qp.qp_num; + + /* this hcall is not supported on base QPs */ + if (my_qp->ext_type != EQPT_SRQBASE) { + /* get send and receive wqe pointer */ + h_ret = hipz_h_disable_and_get_wqe(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, &my_qp->pf, + &send_wqe_p, &recv_wqe_p, 4); + if (h_ret != H_SUCCESS) { + ehca_err(&shca->ib_device, "disable_and_get_wqe() " + "failed ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, qp_num, h_ret); + return ehca2ib_return_code(h_ret); + } + + /* + * acquire lock to ensure that nobody is polling the cq which + * could mean that the qmap->tail pointer is in an + * inconsistent state. + */ + spin_lock_irqsave(&my_qp->send_cq->spinlock, flags); + ret = calc_left_cqes((u64)send_wqe_p, &my_qp->ipz_squeue, + &my_qp->sq_map); + spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags); + if (ret) + return ret; + + + spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags); + ret = calc_left_cqes((u64)recv_wqe_p, &my_qp->ipz_rqueue, + &my_qp->rq_map); + spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags); + if (ret) + return ret; + } else { + spin_lock_irqsave(&my_qp->send_cq->spinlock, flags); + my_qp->sq_map.left_to_poll = 0; + my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail, + my_qp->sq_map.entries); + spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags); + + spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags); + my_qp->rq_map.left_to_poll = 0; + my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail, + my_qp->rq_map.entries); + spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags); + } + + /* this assures flush cqes being generated only for pending wqes */ + if ((my_qp->sq_map.left_to_poll == 0) && + (my_qp->rq_map.left_to_poll == 0)) { + spin_lock_irqsave(&my_qp->send_cq->spinlock, flags); + ehca_add_to_err_list(my_qp, 1); + spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags); + + if (HAS_RQ(my_qp)) { + spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags); + ehca_add_to_err_list(my_qp, 0); + spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, + flags); + } + } + + return 0; +} + +/* + * internal_modify_qp with circumvention to handle aqp0 properly + * smi_reset2init indicates if this is an internal reset-to-init-call for + * smi. This flag must always be zero if called from ehca_modify_qp()! + * This internal func was intorduced to avoid recursion of ehca_modify_qp()! + */ +static int internal_modify_qp(struct ib_qp *ibqp, + struct ib_qp_attr *attr, + int attr_mask, int smi_reset2init) +{ + enum ib_qp_state qp_cur_state, qp_new_state; + int cnt, qp_attr_idx, ret = 0; + enum ib_qp_statetrans statetrans; + struct hcp_modify_qp_control_block *mqpcb; + struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp); + struct ehca_shca *shca = + container_of(ibqp->pd->device, struct ehca_shca, ib_device); + u64 update_mask; + u64 h_ret; + int bad_wqe_cnt = 0; + int is_user = 0; + int squeue_locked = 0; + unsigned long flags = 0; + + /* do query_qp to obtain current attr values */ + mqpcb = ehca_alloc_fw_ctrlblock(GFP_ATOMIC); + if (!mqpcb) { + ehca_err(ibqp->device, "Could not get zeroed page for mqpcb " + "ehca_qp=%p qp_num=%x ", my_qp, ibqp->qp_num); + return -ENOMEM; + } + + h_ret = hipz_h_query_qp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + &my_qp->pf, + mqpcb, my_qp->galpas.kernel); + if (h_ret != H_SUCCESS) { + ehca_err(ibqp->device, "hipz_h_query_qp() failed " + "ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, ibqp->qp_num, h_ret); + ret = ehca2ib_return_code(h_ret); + goto modify_qp_exit1; + } + if (ibqp->uobject) + is_user = 1; + + qp_cur_state = ehca2ib_qp_state(mqpcb->qp_state); + + if (qp_cur_state == -EINVAL) { /* invalid qp state */ + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid current ehca_qp_state=%x " + "ehca_qp=%p qp_num=%x", + mqpcb->qp_state, my_qp, ibqp->qp_num); + goto modify_qp_exit1; + } + /* + * circumvention to set aqp0 initial state to init + * as expected by IB spec + */ + if (smi_reset2init == 0 && + ibqp->qp_type == IB_QPT_SMI && + qp_cur_state == IB_QPS_RESET && + (attr_mask & IB_QP_STATE) && + attr->qp_state == IB_QPS_INIT) { /* RESET -> INIT */ + struct ib_qp_attr smiqp_attr = { + .qp_state = IB_QPS_INIT, + .port_num = my_qp->init_attr.port_num, + .pkey_index = 0, + .qkey = 0 + }; + int smiqp_attr_mask = IB_QP_STATE | IB_QP_PORT | + IB_QP_PKEY_INDEX | IB_QP_QKEY; + int smirc = internal_modify_qp( + ibqp, &smiqp_attr, smiqp_attr_mask, 1); + if (smirc) { + ehca_err(ibqp->device, "SMI RESET -> INIT failed. " + "ehca_modify_qp() rc=%i", smirc); + ret = H_PARAMETER; + goto modify_qp_exit1; + } + qp_cur_state = IB_QPS_INIT; + ehca_dbg(ibqp->device, "SMI RESET -> INIT succeeded"); + } + /* is transmitted current state equal to "real" current state */ + if ((attr_mask & IB_QP_CUR_STATE) && + qp_cur_state != attr->cur_qp_state) { + ret = -EINVAL; + ehca_err(ibqp->device, + "Invalid IB_QP_CUR_STATE attr->curr_qp_state=%x <>" + " actual cur_qp_state=%x. ehca_qp=%p qp_num=%x", + attr->cur_qp_state, qp_cur_state, my_qp, ibqp->qp_num); + goto modify_qp_exit1; + } + + ehca_dbg(ibqp->device, "ehca_qp=%p qp_num=%x current qp_state=%x " + "new qp_state=%x attribute_mask=%x", + my_qp, ibqp->qp_num, qp_cur_state, attr->qp_state, attr_mask); + + qp_new_state = attr_mask & IB_QP_STATE ? attr->qp_state : qp_cur_state; + if (!smi_reset2init && + !ib_modify_qp_is_ok(qp_cur_state, qp_new_state, ibqp->qp_type, + attr_mask, IB_LINK_LAYER_UNSPECIFIED)) { + ret = -EINVAL; + ehca_err(ibqp->device, + "Invalid qp transition new_state=%x cur_state=%x " + "ehca_qp=%p qp_num=%x attr_mask=%x", qp_new_state, + qp_cur_state, my_qp, ibqp->qp_num, attr_mask); + goto modify_qp_exit1; + } + + mqpcb->qp_state = ib2ehca_qp_state(qp_new_state); + if (mqpcb->qp_state) + update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1); + else { + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid new qp state=%x " + "ehca_qp=%p qp_num=%x", + qp_new_state, my_qp, ibqp->qp_num); + goto modify_qp_exit1; + } + + /* retrieve state transition struct to get req and opt attrs */ + statetrans = get_modqp_statetrans(qp_cur_state, qp_new_state); + if (statetrans < 0) { + ret = -EINVAL; + ehca_err(ibqp->device, " qp_cur_state=%x " + "new_qp_state=%x State_xsition=%x ehca_qp=%p " + "qp_num=%x", qp_cur_state, qp_new_state, + statetrans, my_qp, ibqp->qp_num); + goto modify_qp_exit1; + } + + qp_attr_idx = ib2ehcaqptype(ibqp->qp_type); + + if (qp_attr_idx < 0) { + ret = qp_attr_idx; + ehca_err(ibqp->device, + "Invalid QP type=%x ehca_qp=%p qp_num=%x", + ibqp->qp_type, my_qp, ibqp->qp_num); + goto modify_qp_exit1; + } + + ehca_dbg(ibqp->device, + "ehca_qp=%p qp_num=%x qp_state_xsit=%x", + my_qp, ibqp->qp_num, statetrans); + + /* eHCA2 rev2 and higher require the SEND_GRH_FLAG to be set + * in non-LL UD QPs. + */ + if ((my_qp->qp_type == IB_QPT_UD) && + (my_qp->ext_type != EQPT_LLQP) && + (statetrans == IB_QPST_INIT2RTR) && + (shca->hw_level >= 0x22)) { + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1); + mqpcb->send_grh_flag = 1; + } + + /* sqe -> rts: set purge bit of bad wqe before actual trans */ + if ((my_qp->qp_type == IB_QPT_UD || + my_qp->qp_type == IB_QPT_GSI || + my_qp->qp_type == IB_QPT_SMI) && + statetrans == IB_QPST_SQE2RTS) { + /* mark next free wqe if kernel */ + if (!ibqp->uobject) { + struct ehca_wqe *wqe; + /* lock send queue */ + spin_lock_irqsave(&my_qp->spinlock_s, flags); + squeue_locked = 1; + /* mark next free wqe */ + wqe = (struct ehca_wqe *) + ipz_qeit_get(&my_qp->ipz_squeue); + wqe->optype = wqe->wqef = 0xff; + ehca_dbg(ibqp->device, "qp_num=%x next_free_wqe=%p", + ibqp->qp_num, wqe); + } + ret = prepare_sqe_rts(my_qp, shca, &bad_wqe_cnt); + if (ret) { + ehca_err(ibqp->device, "prepare_sqe_rts() failed " + "ehca_qp=%p qp_num=%x ret=%i", + my_qp, ibqp->qp_num, ret); + goto modify_qp_exit2; + } + } + + /* + * enable RDMA_Atomic_Control if reset->init und reliable con + * this is necessary since gen2 does not provide that flag, + * but pHyp requires it + */ + if (statetrans == IB_QPST_RESET2INIT && + (ibqp->qp_type == IB_QPT_RC || ibqp->qp_type == IB_QPT_UC)) { + mqpcb->rdma_atomic_ctrl = 3; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RDMA_ATOMIC_CTRL, 1); + } + /* circ. pHyp requires #RDMA/Atomic Resp Res for UC INIT -> RTR */ + if (statetrans == IB_QPST_INIT2RTR && + (ibqp->qp_type == IB_QPT_UC) && + !(attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)) { + mqpcb->rdma_nr_atomic_resp_res = 1; /* default to 1 */ + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES, 1); + } + + if (attr_mask & IB_QP_PKEY_INDEX) { + if (attr->pkey_index >= 16) { + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid pkey_index=%x. " + "ehca_qp=%p qp_num=%x max_pkey_index=f", + attr->pkey_index, my_qp, ibqp->qp_num); + goto modify_qp_exit2; + } + mqpcb->prim_p_key_idx = attr->pkey_index; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_P_KEY_IDX, 1); + } + if (attr_mask & IB_QP_PORT) { + struct ehca_sport *sport; + struct ehca_qp *aqp1; + if (attr->port_num < 1 || attr->port_num > shca->num_ports) { + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid port=%x. " + "ehca_qp=%p qp_num=%x num_ports=%x", + attr->port_num, my_qp, ibqp->qp_num, + shca->num_ports); + goto modify_qp_exit2; + } + sport = &shca->sport[attr->port_num - 1]; + if (!sport->ibqp_sqp[IB_QPT_GSI]) { + /* should not occur */ + ret = -EFAULT; + ehca_err(ibqp->device, "AQP1 was not created for " + "port=%x", attr->port_num); + goto modify_qp_exit2; + } + aqp1 = container_of(sport->ibqp_sqp[IB_QPT_GSI], + struct ehca_qp, ib_qp); + if (ibqp->qp_type != IB_QPT_GSI && + ibqp->qp_type != IB_QPT_SMI && + aqp1->mod_qp_parm) { + /* + * firmware will reject this modify_qp() because + * port is not activated/initialized fully + */ + ret = -EFAULT; + ehca_warn(ibqp->device, "Couldn't modify qp port=%x: " + "either port is being activated (try again) " + "or cabling issue", attr->port_num); + goto modify_qp_exit2; + } + mqpcb->prim_phys_port = attr->port_num; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_PHYS_PORT, 1); + } + if (attr_mask & IB_QP_QKEY) { + mqpcb->qkey = attr->qkey; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_QKEY, 1); + } + if (attr_mask & IB_QP_AV) { + mqpcb->dlid = attr->ah_attr.dlid; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DLID, 1); + mqpcb->source_path_bits = attr->ah_attr.src_path_bits; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS, 1); + mqpcb->service_level = attr->ah_attr.sl; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL, 1); + + if (ehca_calc_ipd(shca, mqpcb->prim_phys_port, + attr->ah_attr.static_rate, + &mqpcb->max_static_rate)) { + ret = -EINVAL; + goto modify_qp_exit2; + } + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE, 1); + + /* + * Always supply the GRH flag, even if it's zero, to give the + * hypervisor a clear "yes" or "no" instead of a "perhaps" + */ + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1); + + /* + * only if GRH is TRUE we might consider SOURCE_GID_IDX + * and DEST_GID otherwise phype will return H_ATTR_PARM!!! + */ + if (attr->ah_attr.ah_flags == IB_AH_GRH) { + mqpcb->send_grh_flag = 1; + + mqpcb->source_gid_idx = attr->ah_attr.grh.sgid_index; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX, 1); + + for (cnt = 0; cnt < 16; cnt++) + mqpcb->dest_gid.byte[cnt] = + attr->ah_attr.grh.dgid.raw[cnt]; + + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DEST_GID, 1); + mqpcb->flow_label = attr->ah_attr.grh.flow_label; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL, 1); + mqpcb->hop_limit = attr->ah_attr.grh.hop_limit; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT, 1); + mqpcb->traffic_class = attr->ah_attr.grh.traffic_class; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS, 1); + } + } + + if (attr_mask & IB_QP_PATH_MTU) { + /* store ld(MTU) */ + my_qp->mtu_shift = attr->path_mtu + 7; + mqpcb->path_mtu = attr->path_mtu; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PATH_MTU, 1); + } + if (attr_mask & IB_QP_TIMEOUT) { + mqpcb->timeout = attr->timeout; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT, 1); + } + if (attr_mask & IB_QP_RETRY_CNT) { + mqpcb->retry_count = attr->retry_cnt; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT, 1); + } + if (attr_mask & IB_QP_RNR_RETRY) { + mqpcb->rnr_retry_count = attr->rnr_retry; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT, 1); + } + if (attr_mask & IB_QP_RQ_PSN) { + mqpcb->receive_psn = attr->rq_psn; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RECEIVE_PSN, 1); + } + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { + mqpcb->rdma_nr_atomic_resp_res = attr->max_dest_rd_atomic < 3 ? + attr->max_dest_rd_atomic : 2; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES, 1); + } + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { + mqpcb->rdma_atomic_outst_dest_qp = attr->max_rd_atomic < 3 ? + attr->max_rd_atomic : 2; + update_mask |= + EHCA_BMASK_SET + (MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP, 1); + } + if (attr_mask & IB_QP_ALT_PATH) { + if (attr->alt_port_num < 1 + || attr->alt_port_num > shca->num_ports) { + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid alt_port=%x. " + "ehca_qp=%p qp_num=%x num_ports=%x", + attr->alt_port_num, my_qp, ibqp->qp_num, + shca->num_ports); + goto modify_qp_exit2; + } + mqpcb->alt_phys_port = attr->alt_port_num; + + if (attr->alt_pkey_index >= 16) { + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid alt_pkey_index=%x. " + "ehca_qp=%p qp_num=%x max_pkey_index=f", + attr->pkey_index, my_qp, ibqp->qp_num); + goto modify_qp_exit2; + } + mqpcb->alt_p_key_idx = attr->alt_pkey_index; + + mqpcb->timeout_al = attr->alt_timeout; + mqpcb->dlid_al = attr->alt_ah_attr.dlid; + mqpcb->source_path_bits_al = attr->alt_ah_attr.src_path_bits; + mqpcb->service_level_al = attr->alt_ah_attr.sl; + + if (ehca_calc_ipd(shca, mqpcb->alt_phys_port, + attr->alt_ah_attr.static_rate, + &mqpcb->max_static_rate_al)) { + ret = -EINVAL; + goto modify_qp_exit2; + } + + /* OpenIB doesn't support alternate retry counts - copy them */ + mqpcb->retry_count_al = mqpcb->retry_count; + mqpcb->rnr_retry_count_al = mqpcb->rnr_retry_count; + + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_ALT_PHYS_PORT, 1) + | EHCA_BMASK_SET(MQPCB_MASK_ALT_P_KEY_IDX, 1) + | EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_DLID_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT_AL, 1); + + /* + * Always supply the GRH flag, even if it's zero, to give the + * hypervisor a clear "yes" or "no" instead of a "perhaps" + */ + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG_AL, 1); + + /* + * only if GRH is TRUE we might consider SOURCE_GID_IDX + * and DEST_GID otherwise phype will return H_ATTR_PARM!!! + */ + if (attr->alt_ah_attr.ah_flags == IB_AH_GRH) { + mqpcb->send_grh_flag_al = 1; + + for (cnt = 0; cnt < 16; cnt++) + mqpcb->dest_gid_al.byte[cnt] = + attr->alt_ah_attr.grh.dgid.raw[cnt]; + mqpcb->source_gid_idx_al = + attr->alt_ah_attr.grh.sgid_index; + mqpcb->flow_label_al = attr->alt_ah_attr.grh.flow_label; + mqpcb->hop_limit_al = attr->alt_ah_attr.grh.hop_limit; + mqpcb->traffic_class_al = + attr->alt_ah_attr.grh.traffic_class; + + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_DEST_GID_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL_AL, 1) + | EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT_AL, 1) | + EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS_AL, 1); + } + } + + if (attr_mask & IB_QP_MIN_RNR_TIMER) { + mqpcb->min_rnr_nak_timer_field = attr->min_rnr_timer; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_MIN_RNR_NAK_TIMER_FIELD, 1); + } + + if (attr_mask & IB_QP_SQ_PSN) { + mqpcb->send_psn = attr->sq_psn; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_PSN, 1); + } + + if (attr_mask & IB_QP_DEST_QPN) { + mqpcb->dest_qp_nr = attr->dest_qp_num; + update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DEST_QP_NR, 1); + } + + if (attr_mask & IB_QP_PATH_MIG_STATE) { + if (attr->path_mig_state != IB_MIG_REARM + && attr->path_mig_state != IB_MIG_MIGRATED) { + ret = -EINVAL; + ehca_err(ibqp->device, "Invalid mig_state=%x", + attr->path_mig_state); + goto modify_qp_exit2; + } + mqpcb->path_migration_state = attr->path_mig_state + 1; + if (attr->path_mig_state == IB_MIG_REARM) + my_qp->mig_armed = 1; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_PATH_MIGRATION_STATE, 1); + } + + if (attr_mask & IB_QP_CAP) { + mqpcb->max_nr_outst_send_wr = attr->cap.max_send_wr+1; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_MAX_NR_OUTST_SEND_WR, 1); + mqpcb->max_nr_outst_recv_wr = attr->cap.max_recv_wr+1; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_MAX_NR_OUTST_RECV_WR, 1); + /* no support for max_send/recv_sge yet */ + } + + if (ehca_debug_level >= 2) + ehca_dmp(mqpcb, 4*70, "qp_num=%x", ibqp->qp_num); + + h_ret = hipz_h_modify_qp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + &my_qp->pf, + update_mask, + mqpcb, my_qp->galpas.kernel); + + if (h_ret != H_SUCCESS) { + ret = ehca2ib_return_code(h_ret); + ehca_err(ibqp->device, "hipz_h_modify_qp() failed h_ret=%lli " + "ehca_qp=%p qp_num=%x", h_ret, my_qp, ibqp->qp_num); + goto modify_qp_exit2; + } + + if ((my_qp->qp_type == IB_QPT_UD || + my_qp->qp_type == IB_QPT_GSI || + my_qp->qp_type == IB_QPT_SMI) && + statetrans == IB_QPST_SQE2RTS) { + /* doorbell to reprocessing wqes */ + iosync(); /* serialize GAL register access */ + hipz_update_sqa(my_qp, bad_wqe_cnt-1); + ehca_gen_dbg("doorbell for %x wqes", bad_wqe_cnt); + } + + if (statetrans == IB_QPST_RESET2INIT || + statetrans == IB_QPST_INIT2INIT) { + mqpcb->qp_enable = 1; + mqpcb->qp_state = EHCA_QPS_INIT; + update_mask = 0; + update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_ENABLE, 1); + + h_ret = hipz_h_modify_qp(shca->ipz_hca_handle, + my_qp->ipz_qp_handle, + &my_qp->pf, + update_mask, + mqpcb, + my_qp->galpas.kernel); + + if (h_ret != H_SUCCESS) { + ret = ehca2ib_return_code(h_ret); + ehca_err(ibqp->device, "ENABLE in context of " + "RESET_2_INIT failed! Maybe you didn't get " + "a LID h_ret=%lli ehca_qp=%p qp_num=%x", + h_ret, my_qp, ibqp->qp_num); + goto modify_qp_exit2; + } + } + if ((qp_new_state == IB_QPS_ERR) && (qp_cur_state != IB_QPS_ERR) + && !is_user) { + ret = check_for_left_cqes(my_qp, shca); + if (ret) + goto modify_qp_exit2; + } + + if (statetrans == IB_QPST_ANY2RESET) { + ipz_qeit_reset(&my_qp->ipz_rqueue); + ipz_qeit_reset(&my_qp->ipz_squeue); + + if (qp_cur_state == IB_QPS_ERR && !is_user) { + del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node); + + if (HAS_RQ(my_qp)) + del_from_err_list(my_qp->recv_cq, + &my_qp->rq_err_node); + } + if (!is_user) + reset_queue_map(&my_qp->sq_map); + + if (HAS_RQ(my_qp) && !is_user) + reset_queue_map(&my_qp->rq_map); + } + + if (attr_mask & IB_QP_QKEY) + my_qp->qkey = attr->qkey; + +modify_qp_exit2: + if (squeue_locked) { /* this means: sqe -> rts */ + spin_unlock_irqrestore(&my_qp->spinlock_s, flags); + my_qp->sqerr_purgeflag = 1; + } + +modify_qp_exit1: + ehca_free_fw_ctrlblock(mqpcb); + + return ret; +} + +int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, + struct ib_udata *udata) +{ + int ret = 0; + + struct ehca_shca *shca = container_of(ibqp->device, struct ehca_shca, + ib_device); + struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp); + + /* The if-block below caches qp_attr to be modified for GSI and SMI + * qps during the initialization by ib_mad. When the respective port + * is activated, ie we got an event PORT_ACTIVE, we'll replay the + * cached modify calls sequence, see ehca_recover_sqs() below. + * Why that is required: + * 1) If one port is connected, older code requires that port one + * to be connected and module option nr_ports=1 to be given by + * user, which is very inconvenient for end user. + * 2) Firmware accepts modify_qp() only if respective port has become + * active. Older code had a wait loop of 30sec create_qp()/ + * define_aqp1(), which is not appropriate in practice. This + * code now removes that wait loop, see define_aqp1(), and always + * reports all ports to ib_mad resp. users. Only activated ports + * will then usable for the users. + */ + if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) { + int port = my_qp->init_attr.port_num; + struct ehca_sport *sport = &shca->sport[port - 1]; + unsigned long flags; + spin_lock_irqsave(&sport->mod_sqp_lock, flags); + /* cache qp_attr only during init */ + if (my_qp->mod_qp_parm) { + struct ehca_mod_qp_parm *p; + if (my_qp->mod_qp_parm_idx >= EHCA_MOD_QP_PARM_MAX) { + ehca_err(&shca->ib_device, + "mod_qp_parm overflow state=%x port=%x" + " type=%x", attr->qp_state, + my_qp->init_attr.port_num, + ibqp->qp_type); + spin_unlock_irqrestore(&sport->mod_sqp_lock, + flags); + return -EINVAL; + } + p = &my_qp->mod_qp_parm[my_qp->mod_qp_parm_idx]; + p->mask = attr_mask; + p->attr = *attr; + my_qp->mod_qp_parm_idx++; + ehca_dbg(&shca->ib_device, + "Saved qp_attr for state=%x port=%x type=%x", + attr->qp_state, my_qp->init_attr.port_num, + ibqp->qp_type); + spin_unlock_irqrestore(&sport->mod_sqp_lock, flags); + goto out; + } + spin_unlock_irqrestore(&sport->mod_sqp_lock, flags); + } + + ret = internal_modify_qp(ibqp, attr, attr_mask, 0); + +out: + if ((ret == 0) && (attr_mask & IB_QP_STATE)) + my_qp->state = attr->qp_state; + + return ret; +} + +void ehca_recover_sqp(struct ib_qp *sqp) +{ + struct ehca_qp *my_sqp = container_of(sqp, struct ehca_qp, ib_qp); + int port = my_sqp->init_attr.port_num; + struct ib_qp_attr attr; + struct ehca_mod_qp_parm *qp_parm; + int i, qp_parm_idx, ret; + unsigned long flags, wr_cnt; + + if (!my_sqp->mod_qp_parm) + return; + ehca_dbg(sqp->device, "SQP port=%x qp_num=%x", port, sqp->qp_num); + + qp_parm = my_sqp->mod_qp_parm; + qp_parm_idx = my_sqp->mod_qp_parm_idx; + for (i = 0; i < qp_parm_idx; i++) { + attr = qp_parm[i].attr; + ret = internal_modify_qp(sqp, &attr, qp_parm[i].mask, 0); + if (ret) { + ehca_err(sqp->device, "Could not modify SQP port=%x " + "qp_num=%x ret=%x", port, sqp->qp_num, ret); + goto free_qp_parm; + } + ehca_dbg(sqp->device, "SQP port=%x qp_num=%x in state=%x", + port, sqp->qp_num, attr.qp_state); + } + + /* re-trigger posted recv wrs */ + wr_cnt = my_sqp->ipz_rqueue.current_q_offset / + my_sqp->ipz_rqueue.qe_size; + if (wr_cnt) { + spin_lock_irqsave(&my_sqp->spinlock_r, flags); + hipz_update_rqa(my_sqp, wr_cnt); + spin_unlock_irqrestore(&my_sqp->spinlock_r, flags); + ehca_dbg(sqp->device, "doorbell port=%x qp_num=%x wr_cnt=%lx", + port, sqp->qp_num, wr_cnt); + } + +free_qp_parm: + kfree(qp_parm); + /* this prevents subsequent calls to modify_qp() to cache qp_attr */ + my_sqp->mod_qp_parm = NULL; +} + +int ehca_query_qp(struct ib_qp *qp, + struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp); + struct ehca_shca *shca = container_of(qp->device, struct ehca_shca, + ib_device); + struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle; + struct hcp_modify_qp_control_block *qpcb; + int cnt, ret = 0; + u64 h_ret; + + if (qp_attr_mask & QP_ATTR_QUERY_NOT_SUPPORTED) { + ehca_err(qp->device, "Invalid attribute mask " + "ehca_qp=%p qp_num=%x qp_attr_mask=%x ", + my_qp, qp->qp_num, qp_attr_mask); + return -EINVAL; + } + + qpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!qpcb) { + ehca_err(qp->device, "Out of memory for qpcb " + "ehca_qp=%p qp_num=%x", my_qp, qp->qp_num); + return -ENOMEM; + } + + h_ret = hipz_h_query_qp(adapter_handle, + my_qp->ipz_qp_handle, + &my_qp->pf, + qpcb, my_qp->galpas.kernel); + + if (h_ret != H_SUCCESS) { + ret = ehca2ib_return_code(h_ret); + ehca_err(qp->device, "hipz_h_query_qp() failed " + "ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, qp->qp_num, h_ret); + goto query_qp_exit1; + } + + qp_attr->cur_qp_state = ehca2ib_qp_state(qpcb->qp_state); + qp_attr->qp_state = qp_attr->cur_qp_state; + + if (qp_attr->cur_qp_state == -EINVAL) { + ret = -EINVAL; + ehca_err(qp->device, "Got invalid ehca_qp_state=%x " + "ehca_qp=%p qp_num=%x", + qpcb->qp_state, my_qp, qp->qp_num); + goto query_qp_exit1; + } + + if (qp_attr->qp_state == IB_QPS_SQD) + qp_attr->sq_draining = 1; + + qp_attr->qkey = qpcb->qkey; + qp_attr->path_mtu = qpcb->path_mtu; + qp_attr->path_mig_state = qpcb->path_migration_state - 1; + qp_attr->rq_psn = qpcb->receive_psn; + qp_attr->sq_psn = qpcb->send_psn; + qp_attr->min_rnr_timer = qpcb->min_rnr_nak_timer_field; + qp_attr->cap.max_send_wr = qpcb->max_nr_outst_send_wr-1; + qp_attr->cap.max_recv_wr = qpcb->max_nr_outst_recv_wr-1; + /* UD_AV CIRCUMVENTION */ + if (my_qp->qp_type == IB_QPT_UD) { + qp_attr->cap.max_send_sge = + qpcb->actual_nr_sges_in_sq_wqe - 2; + qp_attr->cap.max_recv_sge = + qpcb->actual_nr_sges_in_rq_wqe - 2; + } else { + qp_attr->cap.max_send_sge = + qpcb->actual_nr_sges_in_sq_wqe; + qp_attr->cap.max_recv_sge = + qpcb->actual_nr_sges_in_rq_wqe; + } + + qp_attr->cap.max_inline_data = my_qp->sq_max_inline_data_size; + qp_attr->dest_qp_num = qpcb->dest_qp_nr; + + qp_attr->pkey_index = qpcb->prim_p_key_idx; + qp_attr->port_num = qpcb->prim_phys_port; + qp_attr->timeout = qpcb->timeout; + qp_attr->retry_cnt = qpcb->retry_count; + qp_attr->rnr_retry = qpcb->rnr_retry_count; + + qp_attr->alt_pkey_index = qpcb->alt_p_key_idx; + qp_attr->alt_port_num = qpcb->alt_phys_port; + qp_attr->alt_timeout = qpcb->timeout_al; + + qp_attr->max_dest_rd_atomic = qpcb->rdma_nr_atomic_resp_res; + qp_attr->max_rd_atomic = qpcb->rdma_atomic_outst_dest_qp; + + /* primary av */ + qp_attr->ah_attr.sl = qpcb->service_level; + + if (qpcb->send_grh_flag) { + qp_attr->ah_attr.ah_flags = IB_AH_GRH; + } + + qp_attr->ah_attr.static_rate = qpcb->max_static_rate; + qp_attr->ah_attr.dlid = qpcb->dlid; + qp_attr->ah_attr.src_path_bits = qpcb->source_path_bits; + qp_attr->ah_attr.port_num = qp_attr->port_num; + + /* primary GRH */ + qp_attr->ah_attr.grh.traffic_class = qpcb->traffic_class; + qp_attr->ah_attr.grh.hop_limit = qpcb->hop_limit; + qp_attr->ah_attr.grh.sgid_index = qpcb->source_gid_idx; + qp_attr->ah_attr.grh.flow_label = qpcb->flow_label; + + for (cnt = 0; cnt < 16; cnt++) + qp_attr->ah_attr.grh.dgid.raw[cnt] = + qpcb->dest_gid.byte[cnt]; + + /* alternate AV */ + qp_attr->alt_ah_attr.sl = qpcb->service_level_al; + if (qpcb->send_grh_flag_al) { + qp_attr->alt_ah_attr.ah_flags = IB_AH_GRH; + } + + qp_attr->alt_ah_attr.static_rate = qpcb->max_static_rate_al; + qp_attr->alt_ah_attr.dlid = qpcb->dlid_al; + qp_attr->alt_ah_attr.src_path_bits = qpcb->source_path_bits_al; + + /* alternate GRH */ + qp_attr->alt_ah_attr.grh.traffic_class = qpcb->traffic_class_al; + qp_attr->alt_ah_attr.grh.hop_limit = qpcb->hop_limit_al; + qp_attr->alt_ah_attr.grh.sgid_index = qpcb->source_gid_idx_al; + qp_attr->alt_ah_attr.grh.flow_label = qpcb->flow_label_al; + + for (cnt = 0; cnt < 16; cnt++) + qp_attr->alt_ah_attr.grh.dgid.raw[cnt] = + qpcb->dest_gid_al.byte[cnt]; + + /* return init attributes given in ehca_create_qp */ + if (qp_init_attr) + *qp_init_attr = my_qp->init_attr; + + if (ehca_debug_level >= 2) + ehca_dmp(qpcb, 4*70, "qp_num=%x", qp->qp_num); + +query_qp_exit1: + ehca_free_fw_ctrlblock(qpcb); + + return ret; +} + +int ehca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct ehca_qp *my_qp = + container_of(ibsrq, struct ehca_qp, ib_srq); + struct ehca_shca *shca = + container_of(ibsrq->pd->device, struct ehca_shca, ib_device); + struct hcp_modify_qp_control_block *mqpcb; + u64 update_mask; + u64 h_ret; + int ret = 0; + + mqpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!mqpcb) { + ehca_err(ibsrq->device, "Could not get zeroed page for mqpcb " + "ehca_qp=%p qp_num=%x ", my_qp, my_qp->real_qp_num); + return -ENOMEM; + } + + update_mask = 0; + if (attr_mask & IB_SRQ_LIMIT) { + attr_mask &= ~IB_SRQ_LIMIT; + update_mask |= + EHCA_BMASK_SET(MQPCB_MASK_CURR_SRQ_LIMIT, 1) + | EHCA_BMASK_SET(MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG, 1); + mqpcb->curr_srq_limit = attr->srq_limit; + mqpcb->qp_aff_asyn_ev_log_reg = + EHCA_BMASK_SET(QPX_AAELOG_RESET_SRQ_LIMIT, 1); + } + + /* by now, all bits in attr_mask should have been cleared */ + if (attr_mask) { + ehca_err(ibsrq->device, "invalid attribute mask bits set " + "attr_mask=%x", attr_mask); + ret = -EINVAL; + goto modify_srq_exit0; + } + + if (ehca_debug_level >= 2) + ehca_dmp(mqpcb, 4*70, "qp_num=%x", my_qp->real_qp_num); + + h_ret = hipz_h_modify_qp(shca->ipz_hca_handle, my_qp->ipz_qp_handle, + NULL, update_mask, mqpcb, + my_qp->galpas.kernel); + + if (h_ret != H_SUCCESS) { + ret = ehca2ib_return_code(h_ret); + ehca_err(ibsrq->device, "hipz_h_modify_qp() failed h_ret=%lli " + "ehca_qp=%p qp_num=%x", + h_ret, my_qp, my_qp->real_qp_num); + } + +modify_srq_exit0: + ehca_free_fw_ctrlblock(mqpcb); + + return ret; +} + +int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr) +{ + struct ehca_qp *my_qp = container_of(srq, struct ehca_qp, ib_srq); + struct ehca_shca *shca = container_of(srq->device, struct ehca_shca, + ib_device); + struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle; + struct hcp_modify_qp_control_block *qpcb; + int ret = 0; + u64 h_ret; + + qpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL); + if (!qpcb) { + ehca_err(srq->device, "Out of memory for qpcb " + "ehca_qp=%p qp_num=%x", my_qp, my_qp->real_qp_num); + return -ENOMEM; + } + + h_ret = hipz_h_query_qp(adapter_handle, my_qp->ipz_qp_handle, + NULL, qpcb, my_qp->galpas.kernel); + + if (h_ret != H_SUCCESS) { + ret = ehca2ib_return_code(h_ret); + ehca_err(srq->device, "hipz_h_query_qp() failed " + "ehca_qp=%p qp_num=%x h_ret=%lli", + my_qp, my_qp->real_qp_num, h_ret); + goto query_srq_exit1; + } + + srq_attr->max_wr = qpcb->max_nr_outst_recv_wr - 1; + srq_attr->max_sge = 3; + srq_attr->srq_limit = qpcb->curr_srq_limit; + + if (ehca_debug_level >= 2) + ehca_dmp(qpcb, 4*70, "qp_num=%x", my_qp->real_qp_num); + +query_srq_exit1: + ehca_free_fw_ctrlblock(qpcb); + + return ret; +} + +static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp, + struct ib_uobject *uobject) +{ + struct ehca_shca *shca = container_of(dev, struct ehca_shca, ib_device); + struct ehca_pd *my_pd = container_of(my_qp->ib_qp.pd, struct ehca_pd, + ib_pd); + struct ehca_sport *sport = &shca->sport[my_qp->init_attr.port_num - 1]; + u32 qp_num = my_qp->real_qp_num; + int ret; + u64 h_ret; + u8 port_num; + int is_user = 0; + enum ib_qp_type qp_type; + unsigned long flags; + + if (uobject) { + is_user = 1; + if (my_qp->mm_count_galpa || + my_qp->mm_count_rqueue || my_qp->mm_count_squeue) { + ehca_err(dev, "Resources still referenced in " + "user space qp_num=%x", qp_num); + return -EINVAL; + } + } + + if (my_qp->send_cq) { + ret = ehca_cq_unassign_qp(my_qp->send_cq, qp_num); + if (ret) { + ehca_err(dev, "Couldn't unassign qp from " + "send_cq ret=%i qp_num=%x cq_num=%x", ret, + qp_num, my_qp->send_cq->cq_number); + return ret; + } + } + + write_lock_irqsave(&ehca_qp_idr_lock, flags); + idr_remove(&ehca_qp_idr, my_qp->token); + write_unlock_irqrestore(&ehca_qp_idr_lock, flags); + + /* + * SRQs will never get into an error list and do not have a recv_cq, + * so we need to skip them here. + */ + if (HAS_RQ(my_qp) && !IS_SRQ(my_qp) && !is_user) + del_from_err_list(my_qp->recv_cq, &my_qp->rq_err_node); + + if (HAS_SQ(my_qp) && !is_user) + del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node); + + /* now wait until all pending events have completed */ + wait_event(my_qp->wait_completion, !atomic_read(&my_qp->nr_events)); + + h_ret = hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp); + if (h_ret != H_SUCCESS) { + ehca_err(dev, "hipz_h_destroy_qp() failed h_ret=%lli " + "ehca_qp=%p qp_num=%x", h_ret, my_qp, qp_num); + return ehca2ib_return_code(h_ret); + } + + port_num = my_qp->init_attr.port_num; + qp_type = my_qp->init_attr.qp_type; + + if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) { + spin_lock_irqsave(&sport->mod_sqp_lock, flags); + kfree(my_qp->mod_qp_parm); + my_qp->mod_qp_parm = NULL; + shca->sport[port_num - 1].ibqp_sqp[qp_type] = NULL; + spin_unlock_irqrestore(&sport->mod_sqp_lock, flags); + } + + /* no support for IB_QPT_SMI yet */ + if (qp_type == IB_QPT_GSI) { + struct ib_event event; + ehca_info(dev, "device %s: port %x is inactive.", + shca->ib_device.name, port_num); + event.device = &shca->ib_device; + event.event = IB_EVENT_PORT_ERR; + event.element.port_num = port_num; + shca->sport[port_num - 1].port_state = IB_PORT_DOWN; + ib_dispatch_event(&event); + } + + if (HAS_RQ(my_qp)) { + ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue); + if (!is_user) + vfree(my_qp->rq_map.map); + } + if (HAS_SQ(my_qp)) { + ipz_queue_dtor(my_pd, &my_qp->ipz_squeue); + if (!is_user) + vfree(my_qp->sq_map.map); + } + kmem_cache_free(qp_cache, my_qp); + atomic_dec(&shca->num_qps); + return 0; +} + +int ehca_destroy_qp(struct ib_qp *qp) +{ + return internal_destroy_qp(qp->device, + container_of(qp, struct ehca_qp, ib_qp), + qp->uobject); +} + +int ehca_destroy_srq(struct ib_srq *srq) +{ + return internal_destroy_qp(srq->device, + container_of(srq, struct ehca_qp, ib_srq), + srq->uobject); +} + +int ehca_init_qp_cache(void) +{ + qp_cache = kmem_cache_create("ehca_cache_qp", + sizeof(struct ehca_qp), 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!qp_cache) + return -ENOMEM; + return 0; +} + +void ehca_cleanup_qp_cache(void) +{ + if (qp_cache) + kmem_cache_destroy(qp_cache); +} diff --git a/drivers/staging/rdma/ehca/ehca_reqs.c b/drivers/staging/rdma/ehca/ehca_reqs.c new file mode 100644 index 000000000..47f949843 --- /dev/null +++ b/drivers/staging/rdma/ehca/ehca_reqs.c @@ -0,0 +1,953 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * post_send/recv, poll_cq, req_notify + * + * Authors: Hoang-Nam Nguyen + * Waleri Fomin + * Joachim Fenkes + * Reinhard Ernst + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +#include "ehca_classes.h" +#include "ehca_tools.h" +#include "ehca_qes.h" +#include "ehca_iverbs.h" +#include "hcp_if.h" +#include "hipz_fns.h" + +/* in RC traffic, insert an empty RDMA READ every this many packets */ +#define ACK_CIRC_THRESHOLD 2000000 + +static u64 replace_wr_id(u64 wr_id, u16 idx) +{ + u64 ret; + + ret = wr_id & ~QMAP_IDX_MASK; + ret |= idx & QMAP_IDX_MASK; + + return ret; +} + +static u16 get_app_wr_id(u64 wr_id) +{ + return wr_id & QMAP_IDX_MASK; +} + +static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue, + struct ehca_wqe *wqe_p, + struct ib_recv_wr *recv_wr, + u32 rq_map_idx) +{ + u8 cnt_ds; + if (unlikely((recv_wr->num_sge < 0) || + (recv_wr->num_sge > ipz_rqueue->act_nr_of_sg))) { + ehca_gen_err("Invalid number of WQE SGE. " + "num_sqe=%x max_nr_of_sg=%x", + recv_wr->num_sge, ipz_rqueue->act_nr_of_sg); + return -EINVAL; /* invalid SG list length */ + } + + /* clear wqe header until sglist */ + memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list)); + + wqe_p->work_request_id = replace_wr_id(recv_wr->wr_id, rq_map_idx); + wqe_p->nr_of_data_seg = recv_wr->num_sge; + + for (cnt_ds = 0; cnt_ds < recv_wr->num_sge; cnt_ds++) { + wqe_p->u.all_rcv.sg_list[cnt_ds].vaddr = + recv_wr->sg_list[cnt_ds].addr; + wqe_p->u.all_rcv.sg_list[cnt_ds].lkey = + recv_wr->sg_list[cnt_ds].lkey; + wqe_p->u.all_rcv.sg_list[cnt_ds].length = + recv_wr->sg_list[cnt_ds].length; + } + + if (ehca_debug_level >= 3) { + ehca_gen_dbg("RECEIVE WQE written into ipz_rqueue=%p", + ipz_rqueue); + ehca_dmp(wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "recv wqe"); + } + + return 0; +} + +#if defined(DEBUG_GSI_SEND_WR) + +/* need ib_mad struct */ +#include + +static void trace_send_wr_ud(const struct ib_send_wr *send_wr) +{ + int idx; + int j; + while (send_wr) { + struct ib_mad_hdr *mad_hdr = send_wr->wr.ud.mad_hdr; + struct ib_sge *sge = send_wr->sg_list; + ehca_gen_dbg("send_wr#%x wr_id=%lx num_sge=%x " + "send_flags=%x opcode=%x", idx, send_wr->wr_id, + send_wr->num_sge, send_wr->send_flags, + send_wr->opcode); + if (mad_hdr) { + ehca_gen_dbg("send_wr#%x mad_hdr base_version=%x " + "mgmt_class=%x class_version=%x method=%x " + "status=%x class_specific=%x tid=%lx " + "attr_id=%x resv=%x attr_mod=%x", + idx, mad_hdr->base_version, + mad_hdr->mgmt_class, + mad_hdr->class_version, mad_hdr->method, + mad_hdr->status, mad_hdr->class_specific, + mad_hdr->tid, mad_hdr->attr_id, + mad_hdr->resv, + mad_hdr->attr_mod); + } + for (j = 0; j < send_wr->num_sge; j++) { + u8 *data = __va(sge->addr); + ehca_gen_dbg("send_wr#%x sge#%x addr=%p length=%x " + "lkey=%x", + idx, j, data, sge->length, sge->lkey); + /* assume length is n*16 */ + ehca_dmp(data, sge->length, "send_wr#%x sge#%x", + idx, j); + sge++; + } /* eof for j */ + idx++; + send_wr = send_wr->next; + } /* eof while send_wr */ +} + +#endif /* DEBUG_GSI_SEND_WR */ + +static inline int ehca_write_swqe(struct ehca_qp *qp, + struct ehca_wqe *wqe_p, + const struct ib_send_wr *send_wr, + u32 sq_map_idx, + int hidden) +{ + u32 idx; + u64 dma_length; + struct ehca_av *my_av; + u32 remote_qkey = send_wr->wr.ud.remote_qkey; + struct ehca_qmap_entry *qmap_entry = &qp->sq_map.map[sq_map_idx]; + + if (unlikely((send_wr->num_sge < 0) || + (send_wr->num_sge > qp->ipz_squeue.act_nr_of_sg))) { + ehca_gen_err("Invalid number of WQE SGE. " + "num_sqe=%x max_nr_of_sg=%x", + send_wr->num_sge, qp->ipz_squeue.act_nr_of_sg); + return -EINVAL; /* invalid SG list length */ + } + + /* clear wqe header until sglist */ + memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list)); + + wqe_p->work_request_id = replace_wr_id(send_wr->wr_id, sq_map_idx); + + qmap_entry->app_wr_id = get_app_wr_id(send_wr->wr_id); + qmap_entry->reported = 0; + qmap_entry->cqe_req = 0; + + switch (send_wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + wqe_p->optype = WQE_OPTYPE_SEND; + break; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + wqe_p->optype = WQE_OPTYPE_RDMAWRITE; + break; + case IB_WR_RDMA_READ: + wqe_p->optype = WQE_OPTYPE_RDMAREAD; + break; + default: + ehca_gen_err("Invalid opcode=%x", send_wr->opcode); + return -EINVAL; /* invalid opcode */ + } + + wqe_p->wqef = (send_wr->opcode) & WQEF_HIGH_NIBBLE; + + wqe_p->wr_flag = 0; + + if ((send_wr->send_flags & IB_SEND_SIGNALED || + qp->init_attr.sq_sig_type == IB_SIGNAL_ALL_WR) + && !hidden) { + wqe_p->wr_flag |= WQE_WRFLAG_REQ_SIGNAL_COM; + qmap_entry->cqe_req = 1; + } + + if (send_wr->opcode == IB_WR_SEND_WITH_IMM || + send_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) { + /* this might not work as long as HW does not support it */ + wqe_p->immediate_data = be32_to_cpu(send_wr->ex.imm_data); + wqe_p->wr_flag |= WQE_WRFLAG_IMM_DATA_PRESENT; + } + + wqe_p->nr_of_data_seg = send_wr->num_sge; + + switch (qp->qp_type) { + case IB_QPT_SMI: + case IB_QPT_GSI: + /* no break is intential here */ + case IB_QPT_UD: + /* IB 1.2 spec C10-15 compliance */ + if (send_wr->wr.ud.remote_qkey & 0x80000000) + remote_qkey = qp->qkey; + + wqe_p->destination_qp_number = send_wr->wr.ud.remote_qpn << 8; + wqe_p->local_ee_context_qkey = remote_qkey; + if (unlikely(!send_wr->wr.ud.ah)) { + ehca_gen_err("wr.ud.ah is NULL. qp=%p", qp); + return -EINVAL; + } + if (unlikely(send_wr->wr.ud.remote_qpn == 0)) { + ehca_gen_err("dest QP# is 0. qp=%x", qp->real_qp_num); + return -EINVAL; + } + my_av = container_of(send_wr->wr.ud.ah, struct ehca_av, ib_ah); + wqe_p->u.ud_av.ud_av = my_av->av; + + /* + * omitted check of IB_SEND_INLINE + * since HW does not support it + */ + for (idx = 0; idx < send_wr->num_sge; idx++) { + wqe_p->u.ud_av.sg_list[idx].vaddr = + send_wr->sg_list[idx].addr; + wqe_p->u.ud_av.sg_list[idx].lkey = + send_wr->sg_list[idx].lkey; + wqe_p->u.ud_av.sg_list[idx].length = + send_wr->sg_list[idx].length; + } /* eof for idx */ + if (qp->qp_type == IB_QPT_SMI || + qp->qp_type == IB_QPT_GSI) + wqe_p->u.ud_av.ud_av.pmtu = 1; + if (qp->qp_type == IB_QPT_GSI) { + wqe_p->pkeyi = send_wr->wr.ud.pkey_index; +#ifdef DEBUG_GSI_SEND_WR + trace_send_wr_ud(send_wr); +#endif /* DEBUG_GSI_SEND_WR */ + } + break; + + case IB_QPT_UC: + if (send_wr->send_flags & IB_SEND_FENCE) + wqe_p->wr_flag |= WQE_WRFLAG_FENCE; + /* no break is intentional here */ + case IB_QPT_RC: + /* TODO: atomic not implemented */ + wqe_p->u.nud.remote_virtual_address = + send_wr->wr.rdma.remote_addr; + wqe_p->u.nud.rkey = send_wr->wr.rdma.rkey; + + /* + * omitted checking of IB_SEND_INLINE + * since HW does not support it + */ + dma_length = 0; + for (idx = 0; idx < send_wr->num_sge; idx++) { + wqe_p->u.nud.sg_list[idx].vaddr = + send_wr->sg_list[idx].addr; + wqe_p->u.nud.sg_list[idx].lkey = + send_wr->sg_list[idx].lkey; + wqe_p->u.nud.sg_list[idx].length = + send_wr->sg_list[idx].length; + dma_length += send_wr->sg_list[idx].length; + } /* eof idx */ + wqe_p->u.nud.atomic_1st_op_dma_len = dma_length; + + /* unsolicited ack circumvention */ + if (send_wr->opcode == IB_WR_RDMA_READ) { + /* on RDMA read, switch on and reset counters */ + qp->message_count = qp->packet_count = 0; + qp->unsol_ack_circ = 1; + } else + /* else estimate #packets */ + qp->packet_count += (dma_length >> qp->mtu_shift) + 1; + + break; + + default: + ehca_gen_err("Invalid qptype=%x", qp->qp_type); + return -EINVAL; + } + + if (ehca_debug_level >= 3) { + ehca_gen_dbg("SEND WQE written into queue qp=%p ", qp); + ehca_dmp( wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "send wqe"); + } + return 0; +} + +/* map_ib_wc_status converts raw cqe_status to ib_wc_status */ +static inline void map_ib_wc_status(u32 cqe_status, + enum ib_wc_status *wc_status) +{ + if (unlikely(cqe_status & WC_STATUS_ERROR_BIT)) { + switch (cqe_status & 0x3F) { + case 0x01: + case 0x21: + *wc_status = IB_WC_LOC_LEN_ERR; + break; + case 0x02: + case 0x22: + *wc_status = IB_WC_LOC_QP_OP_ERR; + break; + case 0x03: + case 0x23: + *wc_status = IB_WC_LOC_EEC_OP_ERR; + break; + case 0x04: + case 0x24: + *wc_status = IB_WC_LOC_PROT_ERR; + break; + case 0x05: + case 0x25: + *wc_status = IB_WC_WR_FLUSH_ERR; + break; + case 0x06: + *wc_status = IB_WC_MW_BIND_ERR; + break; + case 0x07: /* remote error - look into bits 20:24 */ + switch ((cqe_status + & WC_STATUS_REMOTE_ERROR_FLAGS) >> 11) { + case 0x0: + /* + * PSN Sequence Error! + * couldn't find a matching status! + */ + *wc_status = IB_WC_GENERAL_ERR; + break; + case 0x1: + *wc_status = IB_WC_REM_INV_REQ_ERR; + break; + case 0x2: + *wc_status = IB_WC_REM_ACCESS_ERR; + break; + case 0x3: + *wc_status = IB_WC_REM_OP_ERR; + break; + case 0x4: + *wc_status = IB_WC_REM_INV_RD_REQ_ERR; + break; + } + break; + case 0x08: + *wc_status = IB_WC_RETRY_EXC_ERR; + break; + case 0x09: + *wc_status = IB_WC_RNR_RETRY_EXC_ERR; + break; + case 0x0A: + case 0x2D: + *wc_status = IB_WC_REM_ABORT_ERR; + break; + case 0x0B: + case 0x2E: + *wc_status = IB_WC_INV_EECN_ERR; + break; + case 0x0C: + case 0x2F: + *wc_status = IB_WC_INV_EEC_STATE_ERR; + break; + case 0x0D: + *wc_status = IB_WC_BAD_RESP_ERR; + break; + case 0x10: + /* WQE purged */ + *wc_status = IB_WC_WR_FLUSH_ERR; + break; + default: + *wc_status = IB_WC_FATAL_ERR; + + } + } else + *wc_status = IB_WC_SUCCESS; +} + +static inline int post_one_send(struct ehca_qp *my_qp, + struct ib_send_wr *cur_send_wr, + int hidden) +{ + struct ehca_wqe *wqe_p; + int ret; + u32 sq_map_idx; + u64 start_offset = my_qp->ipz_squeue.current_q_offset; + + /* get pointer next to free WQE */ + wqe_p = ipz_qeit_get_inc(&my_qp->ipz_squeue); + if (unlikely(!wqe_p)) { + /* too many posted work requests: queue overflow */ + ehca_err(my_qp->ib_qp.device, "Too many posted WQEs " + "qp_num=%x", my_qp->ib_qp.qp_num); + return -ENOMEM; + } + + /* + * Get the index of the WQE in the send queue. The same index is used + * for writing into the sq_map. + */ + sq_map_idx = start_offset / my_qp->ipz_squeue.qe_size; + + /* write a SEND WQE into the QUEUE */ + ret = ehca_write_swqe(my_qp, wqe_p, cur_send_wr, sq_map_idx, hidden); + /* + * if something failed, + * reset the free entry pointer to the start value + */ + if (unlikely(ret)) { + my_qp->ipz_squeue.current_q_offset = start_offset; + ehca_err(my_qp->ib_qp.device, "Could not write WQE " + "qp_num=%x", my_qp->ib_qp.qp_num); + return -EINVAL; + } + + return 0; +} + +int ehca_post_send(struct ib_qp *qp, + struct ib_send_wr *send_wr, + struct ib_send_wr **bad_send_wr) +{ + struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp); + int wqe_cnt = 0; + int ret = 0; + unsigned long flags; + + /* Reject WR if QP is in RESET, INIT or RTR state */ + if (unlikely(my_qp->state < IB_QPS_RTS)) { + ehca_err(qp->device, "Invalid QP state qp_state=%d qpn=%x", + my_qp->state, qp->qp_num); + ret = -EINVAL; + goto out; + } + + /* LOCK the QUEUE */ + spin_lock_irqsave(&my_qp->spinlock_s, flags); + + /* Send an empty extra RDMA read if: + * 1) there has been an RDMA read on this connection before + * 2) no RDMA read occurred for ACK_CIRC_THRESHOLD link packets + * 3) we can be sure that any previous extra RDMA read has been + * processed so we don't overflow the SQ + */ + if (unlikely(my_qp->unsol_ack_circ && + my_qp->packet_count > ACK_CIRC_THRESHOLD && + my_qp->message_count > my_qp->init_attr.cap.max_send_wr)) { + /* insert an empty RDMA READ to fix up the remote QP state */ + struct ib_send_wr circ_wr; + memset(&circ_wr, 0, sizeof(circ_wr)); + circ_wr.opcode = IB_WR_RDMA_READ; + post_one_send(my_qp, &circ_wr, 1); /* ignore retcode */ + wqe_cnt++; + ehca_dbg(qp->device, "posted circ wr qp_num=%x", qp->qp_num); + my_qp->message_count = my_qp->packet_count = 0; + } + + /* loop processes list of send reqs */ + while (send_wr) { + ret = post_one_send(my_qp, send_wr, 0); + if (unlikely(ret)) { + goto post_send_exit0; + } + wqe_cnt++; + send_wr = send_wr->next; + } + +post_send_exit0: + iosync(); /* serialize GAL register access */ + hipz_update_sqa(my_qp, wqe_cnt); + if (unlikely(ret || ehca_debug_level >= 2)) + ehca_dbg(qp->device, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i", + my_qp, qp->qp_num, wqe_cnt, ret); + my_qp->message_count += wqe_cnt; + spin_unlock_irqrestore(&my_qp->spinlock_s, flags); + +out: + if (ret) + *bad_send_wr = send_wr; + return ret; +} + +static int internal_post_recv(struct ehca_qp *my_qp, + struct ib_device *dev, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr) +{ + struct ehca_wqe *wqe_p; + int wqe_cnt = 0; + int ret = 0; + u32 rq_map_idx; + unsigned long flags; + struct ehca_qmap_entry *qmap_entry; + + if (unlikely(!HAS_RQ(my_qp))) { + ehca_err(dev, "QP has no RQ ehca_qp=%p qp_num=%x ext_type=%d", + my_qp, my_qp->real_qp_num, my_qp->ext_type); + ret = -ENODEV; + goto out; + } + + /* LOCK the QUEUE */ + spin_lock_irqsave(&my_qp->spinlock_r, flags); + + /* loop processes list of recv reqs */ + while (recv_wr) { + u64 start_offset = my_qp->ipz_rqueue.current_q_offset; + /* get pointer next to free WQE */ + wqe_p = ipz_qeit_get_inc(&my_qp->ipz_rqueue); + if (unlikely(!wqe_p)) { + /* too many posted work requests: queue overflow */ + ret = -ENOMEM; + ehca_err(dev, "Too many posted WQEs " + "qp_num=%x", my_qp->real_qp_num); + goto post_recv_exit0; + } + /* + * Get the index of the WQE in the recv queue. The same index + * is used for writing into the rq_map. + */ + rq_map_idx = start_offset / my_qp->ipz_rqueue.qe_size; + + /* write a RECV WQE into the QUEUE */ + ret = ehca_write_rwqe(&my_qp->ipz_rqueue, wqe_p, recv_wr, + rq_map_idx); + /* + * if something failed, + * reset the free entry pointer to the start value + */ + if (unlikely(ret)) { + my_qp->ipz_rqueue.current_q_offset = start_offset; + ret = -EINVAL; + ehca_err(dev, "Could not write WQE " + "qp_num=%x", my_qp->real_qp_num); + goto post_recv_exit0; + } + + qmap_entry = &my_qp->rq_map.map[rq_map_idx]; + qmap_entry->app_wr_id = get_app_wr_id(recv_wr->wr_id); + qmap_entry->reported = 0; + qmap_entry->cqe_req = 1; + + wqe_cnt++; + recv_wr = recv_wr->next; + } /* eof for recv_wr */ + +post_recv_exit0: + iosync(); /* serialize GAL register access */ + hipz_update_rqa(my_qp, wqe_cnt); + if (unlikely(ret || ehca_debug_level >= 2)) + ehca_dbg(dev, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i", + my_qp, my_qp->real_qp_num, wqe_cnt, ret); + spin_unlock_irqrestore(&my_qp->spinlock_r, flags); + +out: + if (ret) + *bad_recv_wr = recv_wr; + + return ret; +} + +int ehca_post_recv(struct ib_qp *qp, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr) +{ + struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp); + + /* Reject WR if QP is in RESET state */ + if (unlikely(my_qp->state == IB_QPS_RESET)) { + ehca_err(qp->device, "Invalid QP state qp_state=%d qpn=%x", + my_qp->state, qp->qp_num); + *bad_recv_wr = recv_wr; + return -EINVAL; + } + + return internal_post_recv(my_qp, qp->device, recv_wr, bad_recv_wr); +} + +int ehca_post_srq_recv(struct ib_srq *srq, + struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr) +{ + return internal_post_recv(container_of(srq, struct ehca_qp, ib_srq), + srq->device, recv_wr, bad_recv_wr); +} + +/* + * ib_wc_opcode table converts ehca wc opcode to ib + * Since we use zero to indicate invalid opcode, the actual ib opcode must + * be decremented!!! + */ +static const u8 ib_wc_opcode[255] = { + [0x01] = IB_WC_RECV+1, + [0x02] = IB_WC_RECV_RDMA_WITH_IMM+1, + [0x04] = IB_WC_BIND_MW+1, + [0x08] = IB_WC_FETCH_ADD+1, + [0x10] = IB_WC_COMP_SWAP+1, + [0x20] = IB_WC_RDMA_WRITE+1, + [0x40] = IB_WC_RDMA_READ+1, + [0x80] = IB_WC_SEND+1 +}; + +/* internal function to poll one entry of cq */ +static inline int ehca_poll_cq_one(struct ib_cq *cq, struct ib_wc *wc) +{ + int ret = 0, qmap_tail_idx; + struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq); + struct ehca_cqe *cqe; + struct ehca_qp *my_qp; + struct ehca_qmap_entry *qmap_entry; + struct ehca_queue_map *qmap; + int cqe_count = 0, is_error; + +repoll: + cqe = (struct ehca_cqe *) + ipz_qeit_get_inc_valid(&my_cq->ipz_queue); + if (!cqe) { + ret = -EAGAIN; + if (ehca_debug_level >= 3) + ehca_dbg(cq->device, "Completion queue is empty " + "my_cq=%p cq_num=%x", my_cq, my_cq->cq_number); + goto poll_cq_one_exit0; + } + + /* prevents loads being reordered across this point */ + rmb(); + + cqe_count++; + if (unlikely(cqe->status & WC_STATUS_PURGE_BIT)) { + struct ehca_qp *qp; + int purgeflag; + unsigned long flags; + + qp = ehca_cq_get_qp(my_cq, cqe->local_qp_number); + if (!qp) { + ehca_err(cq->device, "cq_num=%x qp_num=%x " + "could not find qp -> ignore cqe", + my_cq->cq_number, cqe->local_qp_number); + ehca_dmp(cqe, 64, "cq_num=%x qp_num=%x", + my_cq->cq_number, cqe->local_qp_number); + /* ignore this purged cqe */ + goto repoll; + } + spin_lock_irqsave(&qp->spinlock_s, flags); + purgeflag = qp->sqerr_purgeflag; + spin_unlock_irqrestore(&qp->spinlock_s, flags); + + if (purgeflag) { + ehca_dbg(cq->device, + "Got CQE with purged bit qp_num=%x src_qp=%x", + cqe->local_qp_number, cqe->remote_qp_number); + if (ehca_debug_level >= 2) + ehca_dmp(cqe, 64, "qp_num=%x src_qp=%x", + cqe->local_qp_number, + cqe->remote_qp_number); + /* + * ignore this to avoid double cqes of bad wqe + * that caused sqe and turn off purge flag + */ + qp->sqerr_purgeflag = 0; + goto repoll; + } + } + + is_error = cqe->status & WC_STATUS_ERROR_BIT; + + /* trace error CQEs if debug_level >= 1, trace all CQEs if >= 3 */ + if (unlikely(ehca_debug_level >= 3 || (ehca_debug_level && is_error))) { + ehca_dbg(cq->device, + "Received %sCOMPLETION ehca_cq=%p cq_num=%x -----", + is_error ? "ERROR " : "", my_cq, my_cq->cq_number); + ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x", + my_cq, my_cq->cq_number); + ehca_dbg(cq->device, + "ehca_cq=%p cq_num=%x -------------------------", + my_cq, my_cq->cq_number); + } + + read_lock(&ehca_qp_idr_lock); + my_qp = idr_find(&ehca_qp_idr, cqe->qp_token); + read_unlock(&ehca_qp_idr_lock); + if (!my_qp) + goto repoll; + wc->qp = &my_qp->ib_qp; + + qmap_tail_idx = get_app_wr_id(cqe->work_request_id); + if (!(cqe->w_completion_flags & WC_SEND_RECEIVE_BIT)) + /* We got a send completion. */ + qmap = &my_qp->sq_map; + else + /* We got a receive completion. */ + qmap = &my_qp->rq_map; + + /* advance the tail pointer */ + qmap->tail = qmap_tail_idx; + + if (is_error) { + /* + * set left_to_poll to 0 because in error state, we will not + * get any additional CQEs + */ + my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail, + my_qp->sq_map.entries); + my_qp->sq_map.left_to_poll = 0; + ehca_add_to_err_list(my_qp, 1); + + my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail, + my_qp->rq_map.entries); + my_qp->rq_map.left_to_poll = 0; + if (HAS_RQ(my_qp)) + ehca_add_to_err_list(my_qp, 0); + } + + qmap_entry = &qmap->map[qmap_tail_idx]; + if (qmap_entry->reported) { + ehca_warn(cq->device, "Double cqe on qp_num=%#x", + my_qp->real_qp_num); + /* found a double cqe, discard it and read next one */ + goto repoll; + } + + wc->wr_id = replace_wr_id(cqe->work_request_id, qmap_entry->app_wr_id); + qmap_entry->reported = 1; + + /* if left_to_poll is decremented to 0, add the QP to the error list */ + if (qmap->left_to_poll > 0) { + qmap->left_to_poll--; + if ((my_qp->sq_map.left_to_poll == 0) && + (my_qp->rq_map.left_to_poll == 0)) { + ehca_add_to_err_list(my_qp, 1); + if (HAS_RQ(my_qp)) + ehca_add_to_err_list(my_qp, 0); + } + } + + /* eval ib_wc_opcode */ + wc->opcode = ib_wc_opcode[cqe->optype]-1; + if (unlikely(wc->opcode == -1)) { + ehca_err(cq->device, "Invalid cqe->OPType=%x cqe->status=%x " + "ehca_cq=%p cq_num=%x", + cqe->optype, cqe->status, my_cq, my_cq->cq_number); + /* dump cqe for other infos */ + ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x", + my_cq, my_cq->cq_number); + /* update also queue adder to throw away this entry!!! */ + goto repoll; + } + + /* eval ib_wc_status */ + if (unlikely(is_error)) { + /* complete with errors */ + map_ib_wc_status(cqe->status, &wc->status); + wc->vendor_err = wc->status; + } else + wc->status = IB_WC_SUCCESS; + + wc->byte_len = cqe->nr_bytes_transferred; + wc->pkey_index = cqe->pkey_index; + wc->slid = cqe->rlid; + wc->dlid_path_bits = cqe->dlid; + wc->src_qp = cqe->remote_qp_number; + /* + * HW has "Immed data present" and "GRH present" in bits 6 and 5. + * SW defines those in bits 1 and 0, so we can just shift and mask. + */ + wc->wc_flags = (cqe->w_completion_flags >> 5) & 3; + wc->ex.imm_data = cpu_to_be32(cqe->immediate_data); + wc->sl = cqe->service_level; + +poll_cq_one_exit0: + if (cqe_count > 0) + hipz_update_feca(my_cq, cqe_count); + + return ret; +} + +static int generate_flush_cqes(struct ehca_qp *my_qp, struct ib_cq *cq, + struct ib_wc *wc, int num_entries, + struct ipz_queue *ipz_queue, int on_sq) +{ + int nr = 0; + struct ehca_wqe *wqe; + u64 offset; + struct ehca_queue_map *qmap; + struct ehca_qmap_entry *qmap_entry; + + if (on_sq) + qmap = &my_qp->sq_map; + else + qmap = &my_qp->rq_map; + + qmap_entry = &qmap->map[qmap->next_wqe_idx]; + + while ((nr < num_entries) && (qmap_entry->reported == 0)) { + /* generate flush CQE */ + + memset(wc, 0, sizeof(*wc)); + + offset = qmap->next_wqe_idx * ipz_queue->qe_size; + wqe = (struct ehca_wqe *)ipz_qeit_calc(ipz_queue, offset); + if (!wqe) { + ehca_err(cq->device, "Invalid wqe offset=%#llx on " + "qp_num=%#x", offset, my_qp->real_qp_num); + return nr; + } + + wc->wr_id = replace_wr_id(wqe->work_request_id, + qmap_entry->app_wr_id); + + if (on_sq) { + switch (wqe->optype) { + case WQE_OPTYPE_SEND: + wc->opcode = IB_WC_SEND; + break; + case WQE_OPTYPE_RDMAWRITE: + wc->opcode = IB_WC_RDMA_WRITE; + break; + case WQE_OPTYPE_RDMAREAD: + wc->opcode = IB_WC_RDMA_READ; + break; + default: + ehca_err(cq->device, "Invalid optype=%x", + wqe->optype); + return nr; + } + } else + wc->opcode = IB_WC_RECV; + + if (wqe->wr_flag & WQE_WRFLAG_IMM_DATA_PRESENT) { + wc->ex.imm_data = wqe->immediate_data; + wc->wc_flags |= IB_WC_WITH_IMM; + } + + wc->status = IB_WC_WR_FLUSH_ERR; + + wc->qp = &my_qp->ib_qp; + + /* mark as reported and advance next_wqe pointer */ + qmap_entry->reported = 1; + qmap->next_wqe_idx = next_index(qmap->next_wqe_idx, + qmap->entries); + qmap_entry = &qmap->map[qmap->next_wqe_idx]; + + wc++; nr++; + } + + return nr; + +} + +int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc) +{ + struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq); + int nr; + struct ehca_qp *err_qp; + struct ib_wc *current_wc = wc; + int ret = 0; + unsigned long flags; + int entries_left = num_entries; + + if (num_entries < 1) { + ehca_err(cq->device, "Invalid num_entries=%d ehca_cq=%p " + "cq_num=%x", num_entries, my_cq, my_cq->cq_number); + ret = -EINVAL; + goto poll_cq_exit0; + } + + spin_lock_irqsave(&my_cq->spinlock, flags); + + /* generate flush cqes for send queues */ + list_for_each_entry(err_qp, &my_cq->sqp_err_list, sq_err_node) { + nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left, + &err_qp->ipz_squeue, 1); + entries_left -= nr; + current_wc += nr; + + if (entries_left == 0) + break; + } + + /* generate flush cqes for receive queues */ + list_for_each_entry(err_qp, &my_cq->rqp_err_list, rq_err_node) { + nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left, + &err_qp->ipz_rqueue, 0); + entries_left -= nr; + current_wc += nr; + + if (entries_left == 0) + break; + } + + for (nr = 0; nr < entries_left; nr++) { + ret = ehca_poll_cq_one(cq, current_wc); + if (ret) + break; + current_wc++; + } /* eof for nr */ + entries_left -= nr; + + spin_unlock_irqrestore(&my_cq->spinlock, flags); + if (ret == -EAGAIN || !ret) + ret = num_entries - entries_left; + +poll_cq_exit0: + return ret; +} + +int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags notify_flags) +{ + struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq); + int ret = 0; + + switch (notify_flags & IB_CQ_SOLICITED_MASK) { + case IB_CQ_SOLICITED: + hipz_set_cqx_n0(my_cq, 1); + break; + case IB_CQ_NEXT_COMP: + hipz_set_cqx_n1(my_cq, 1); + break; + default: + return -EINVAL; + } + + if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) { + unsigned long spl_flags; + spin_lock_irqsave(&my_cq->spinlock, spl_flags); + ret = ipz_qeit_is_valid(&my_cq->ipz_queue); + spin_unlock_irqrestore(&my_cq->spinlock, spl_flags); + } + + return ret; +} diff --git a/drivers/staging/rdma/ehca/ehca_sqp.c b/drivers/staging/rdma/ehca/ehca_sqp.c new file mode 100644 index 000000000..376b031c2 --- /dev/null +++ b/drivers/staging/rdma/ehca/ehca_sqp.c @@ -0,0 +1,245 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * SQP functions + * + * Authors: Khadija Souissi + * Heiko J Schick + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "ehca_classes.h" +#include "ehca_tools.h" +#include "ehca_iverbs.h" +#include "hcp_if.h" + +#define IB_MAD_STATUS_REDIRECT cpu_to_be16(0x0002) +#define IB_MAD_STATUS_UNSUP_VERSION cpu_to_be16(0x0004) +#define IB_MAD_STATUS_UNSUP_METHOD cpu_to_be16(0x0008) + +#define IB_PMA_CLASS_PORT_INFO cpu_to_be16(0x0001) + +/** + * ehca_define_sqp - Defines special queue pair 1 (GSI QP). When special queue + * pair is created successfully, the corresponding port gets active. + * + * Define Special Queue pair 0 (SMI QP) is still not supported. + * + * @qp_init_attr: Queue pair init attributes with port and queue pair type + */ + +u64 ehca_define_sqp(struct ehca_shca *shca, + struct ehca_qp *ehca_qp, + struct ib_qp_init_attr *qp_init_attr) +{ + u32 pma_qp_nr, bma_qp_nr; + u64 ret; + u8 port = qp_init_attr->port_num; + int counter; + + shca->sport[port - 1].port_state = IB_PORT_DOWN; + + switch (qp_init_attr->qp_type) { + case IB_QPT_SMI: + /* function not supported yet */ + break; + case IB_QPT_GSI: + ret = hipz_h_define_aqp1(shca->ipz_hca_handle, + ehca_qp->ipz_qp_handle, + ehca_qp->galpas.kernel, + (u32) qp_init_attr->port_num, + &pma_qp_nr, &bma_qp_nr); + + if (ret != H_SUCCESS) { + ehca_err(&shca->ib_device, + "Can't define AQP1 for port %x. h_ret=%lli", + port, ret); + return ret; + } + shca->sport[port - 1].pma_qp_nr = pma_qp_nr; + ehca_dbg(&shca->ib_device, "port=%x pma_qp_nr=%x", + port, pma_qp_nr); + break; + default: + ehca_err(&shca->ib_device, "invalid qp_type=%x", + qp_init_attr->qp_type); + return H_PARAMETER; + } + + if (ehca_nr_ports < 0) /* autodetect mode */ + return H_SUCCESS; + + for (counter = 0; + shca->sport[port - 1].port_state != IB_PORT_ACTIVE && + counter < ehca_port_act_time; + counter++) { + ehca_dbg(&shca->ib_device, "... wait until port %x is active", + port); + msleep_interruptible(1000); + } + + if (counter == ehca_port_act_time) { + ehca_err(&shca->ib_device, "Port %x is not active.", port); + return H_HARDWARE; + } + + return H_SUCCESS; +} + +struct ib_perf { + struct ib_mad_hdr mad_hdr; + u8 reserved[40]; + u8 data[192]; +} __attribute__ ((packed)); + +/* TC/SL/FL packed into 32 bits, as in ClassPortInfo */ +struct tcslfl { + u32 tc:8; + u32 sl:4; + u32 fl:20; +} __attribute__ ((packed)); + +/* IP Version/TC/FL packed into 32 bits, as in GRH */ +struct vertcfl { + u32 ver:4; + u32 tc:8; + u32 fl:20; +} __attribute__ ((packed)); + +static int ehca_process_perf(struct ib_device *ibdev, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + const struct ib_perf *in_perf = (const struct ib_perf *)in_mad; + struct ib_perf *out_perf = (struct ib_perf *)out_mad; + struct ib_class_port_info *poi = + (struct ib_class_port_info *)out_perf->data; + struct tcslfl *tcslfl = + (struct tcslfl *)&poi->redirect_tcslfl; + struct ehca_shca *shca = + container_of(ibdev, struct ehca_shca, ib_device); + struct ehca_sport *sport = &shca->sport[port_num - 1]; + + ehca_dbg(ibdev, "method=%x", in_perf->mad_hdr.method); + + *out_mad = *in_mad; + + if (in_perf->mad_hdr.class_version != 1) { + ehca_warn(ibdev, "Unsupported class_version=%x", + in_perf->mad_hdr.class_version); + out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_VERSION; + goto perf_reply; + } + + switch (in_perf->mad_hdr.method) { + case IB_MGMT_METHOD_GET: + case IB_MGMT_METHOD_SET: + /* set class port info for redirection */ + out_perf->mad_hdr.attr_id = IB_PMA_CLASS_PORT_INFO; + out_perf->mad_hdr.status = IB_MAD_STATUS_REDIRECT; + memset(poi, 0, sizeof(*poi)); + poi->base_version = 1; + poi->class_version = 1; + poi->resp_time_value = 18; + + /* copy local routing information from WC where applicable */ + tcslfl->sl = in_wc->sl; + poi->redirect_lid = + sport->saved_attr.lid | in_wc->dlid_path_bits; + poi->redirect_qp = sport->pma_qp_nr; + poi->redirect_qkey = IB_QP1_QKEY; + + ehca_query_pkey(ibdev, port_num, in_wc->pkey_index, + &poi->redirect_pkey); + + /* if request was globally routed, copy route info */ + if (in_grh) { + const struct vertcfl *vertcfl = + (const struct vertcfl *)&in_grh->version_tclass_flow; + memcpy(poi->redirect_gid, in_grh->dgid.raw, + sizeof(poi->redirect_gid)); + tcslfl->tc = vertcfl->tc; + tcslfl->fl = vertcfl->fl; + } else + /* else only fill in default GID */ + ehca_query_gid(ibdev, port_num, 0, + (union ib_gid *)&poi->redirect_gid); + + ehca_dbg(ibdev, "ehca_pma_lid=%x ehca_pma_qp=%x", + sport->saved_attr.lid, sport->pma_qp_nr); + break; + + case IB_MGMT_METHOD_GET_RESP: + return IB_MAD_RESULT_FAILURE; + + default: + out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_METHOD; + break; + } + +perf_reply: + out_perf->mad_hdr.method = IB_MGMT_METHOD_GET_RESP; + + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + int ret; + const struct ib_mad *in_mad = (const struct ib_mad *)in; + struct ib_mad *out_mad = (struct ib_mad *)out; + + if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || + *out_mad_size != sizeof(*out_mad))) + return IB_MAD_RESULT_FAILURE; + + if (!port_num || port_num > ibdev->phys_port_cnt || !in_wc) + return IB_MAD_RESULT_FAILURE; + + /* accept only pma request */ + if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT) + return IB_MAD_RESULT_SUCCESS; + + ehca_dbg(ibdev, "port_num=%x src_qp=%x", port_num, in_wc->src_qp); + ret = ehca_process_perf(ibdev, port_num, in_wc, in_grh, + in_mad, out_mad); + + return ret; +} diff --git a/drivers/staging/rdma/ehca/ehca_tools.h b/drivers/staging/rdma/ehca/ehca_tools.h new file mode 100644 index 000000000..d280b12aa --- /dev/null +++ b/drivers/staging/rdma/ehca/ehca_tools.h @@ -0,0 +1,155 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * auxiliary functions + * + * Authors: Christoph Raisch + * Hoang-Nam Nguyen + * Khadija Souissi + * Waleri Fomin + * Heiko J Schick + * + * Copyright (c) 2005 IBM Corporation + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +#ifndef EHCA_TOOLS_H +#define EHCA_TOOLS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +extern int ehca_debug_level; + +#define ehca_dbg(ib_dev, format, arg...) \ + do { \ + if (unlikely(ehca_debug_level)) \ + dev_printk(KERN_DEBUG, (ib_dev)->dma_device, \ + "PU%04x EHCA_DBG:%s " format "\n", \ + raw_smp_processor_id(), __func__, \ + ## arg); \ + } while (0) + +#define ehca_info(ib_dev, format, arg...) \ + dev_info((ib_dev)->dma_device, "PU%04x EHCA_INFO:%s " format "\n", \ + raw_smp_processor_id(), __func__, ## arg) + +#define ehca_warn(ib_dev, format, arg...) \ + dev_warn((ib_dev)->dma_device, "PU%04x EHCA_WARN:%s " format "\n", \ + raw_smp_processor_id(), __func__, ## arg) + +#define ehca_err(ib_dev, format, arg...) \ + dev_err((ib_dev)->dma_device, "PU%04x EHCA_ERR:%s " format "\n", \ + raw_smp_processor_id(), __func__, ## arg) + +/* use this one only if no ib_dev available */ +#define ehca_gen_dbg(format, arg...) \ + do { \ + if (unlikely(ehca_debug_level)) \ + printk(KERN_DEBUG "PU%04x EHCA_DBG:%s " format "\n", \ + raw_smp_processor_id(), __func__, ## arg); \ + } while (0) + +#define ehca_gen_warn(format, arg...) \ + printk(KERN_INFO "PU%04x EHCA_WARN:%s " format "\n", \ + raw_smp_processor_id(), __func__, ## arg) + +#define ehca_gen_err(format, arg...) \ + printk(KERN_ERR "PU%04x EHCA_ERR:%s " format "\n", \ + raw_smp_processor_id(), __func__, ## arg) + +/** + * ehca_dmp - printk a memory block, whose length is n*8 bytes. + * Each line has the following layout: + * adr=X ofs=Y <8 bytes hex> <8 bytes hex> + */ +#define ehca_dmp(adr, len, format, args...) \ + do { \ + unsigned int x; \ + unsigned int l = (unsigned int)(len); \ + unsigned char *deb = (unsigned char *)(adr); \ + for (x = 0; x < l; x += 16) { \ + printk(KERN_INFO "EHCA_DMP:%s " format \ + " adr=%p ofs=%04x %016llx %016llx\n", \ + __func__, ##args, deb, x, \ + *((u64 *)&deb[0]), *((u64 *)&deb[8])); \ + deb += 16; \ + } \ + } while (0) + +/* define a bitmask, little endian version */ +#define EHCA_BMASK(pos, length) (((pos) << 16) + (length)) + +/* define a bitmask, the ibm way... */ +#define EHCA_BMASK_IBM(from, to) (((63 - to) << 16) + ((to) - (from) + 1)) + +/* internal function, don't use */ +#define EHCA_BMASK_SHIFTPOS(mask) (((mask) >> 16) & 0xffff) + +/* internal function, don't use */ +#define EHCA_BMASK_MASK(mask) (~0ULL >> ((64 - (mask)) & 0xffff)) + +/** + * EHCA_BMASK_SET - return value shifted and masked by mask + * variable|=EHCA_BMASK_SET(MY_MASK,0x4711) ORs the bits in variable + * variable&=~EHCA_BMASK_SET(MY_MASK,-1) clears the bits from the mask + * in variable + */ +#define EHCA_BMASK_SET(mask, value) \ + ((EHCA_BMASK_MASK(mask) & ((u64)(value))) << EHCA_BMASK_SHIFTPOS(mask)) + +/** + * EHCA_BMASK_GET - extract a parameter from value by mask + */ +#define EHCA_BMASK_GET(mask, value) \ + (EHCA_BMASK_MASK(mask) & (((u64)(value)) >> EHCA_BMASK_SHIFTPOS(mask))) + +/* Converts ehca to ib return code */ +int ehca2ib_return_code(u64 ehca_rc); + +#endif /* EHCA_TOOLS_H */ diff --git a/drivers/staging/rdma/ehca/ehca_uverbs.c b/drivers/staging/rdma/ehca/ehca_uverbs.c new file mode 100644 index 000000000..1a1d5d99f --- /dev/null +++ b/drivers/staging/rdma/ehca/ehca_uverbs.c @@ -0,0 +1,309 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * userspace support verbs + * + * Authors: Christoph Raisch + * Hoang-Nam Nguyen + * Heiko J Schick + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "ehca_classes.h" +#include "ehca_iverbs.h" +#include "ehca_mrmw.h" +#include "ehca_tools.h" +#include "hcp_if.h" + +struct ib_ucontext *ehca_alloc_ucontext(struct ib_device *device, + struct ib_udata *udata) +{ + struct ehca_ucontext *my_context; + + my_context = kzalloc(sizeof *my_context, GFP_KERNEL); + if (!my_context) { + ehca_err(device, "Out of memory device=%p", device); + return ERR_PTR(-ENOMEM); + } + + return &my_context->ib_ucontext; +} + +int ehca_dealloc_ucontext(struct ib_ucontext *context) +{ + kfree(container_of(context, struct ehca_ucontext, ib_ucontext)); + return 0; +} + +static void ehca_mm_open(struct vm_area_struct *vma) +{ + u32 *count = (u32 *)vma->vm_private_data; + if (!count) { + ehca_gen_err("Invalid vma struct vm_start=%lx vm_end=%lx", + vma->vm_start, vma->vm_end); + return; + } + (*count)++; + if (!(*count)) + ehca_gen_err("Use count overflow vm_start=%lx vm_end=%lx", + vma->vm_start, vma->vm_end); + ehca_gen_dbg("vm_start=%lx vm_end=%lx count=%x", + vma->vm_start, vma->vm_end, *count); +} + +static void ehca_mm_close(struct vm_area_struct *vma) +{ + u32 *count = (u32 *)vma->vm_private_data; + if (!count) { + ehca_gen_err("Invalid vma struct vm_start=%lx vm_end=%lx", + vma->vm_start, vma->vm_end); + return; + } + (*count)--; + ehca_gen_dbg("vm_start=%lx vm_end=%lx count=%x", + vma->vm_start, vma->vm_end, *count); +} + +static const struct vm_operations_struct vm_ops = { + .open = ehca_mm_open, + .close = ehca_mm_close, +}; + +static int ehca_mmap_fw(struct vm_area_struct *vma, struct h_galpas *galpas, + u32 *mm_count) +{ + int ret; + u64 vsize, physical; + + vsize = vma->vm_end - vma->vm_start; + if (vsize < EHCA_PAGESIZE) { + ehca_gen_err("invalid vsize=%lx", vma->vm_end - vma->vm_start); + return -EINVAL; + } + + physical = galpas->user.fw_handle; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + ehca_gen_dbg("vsize=%llx physical=%llx", vsize, physical); + /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */ + ret = remap_4k_pfn(vma, vma->vm_start, physical >> EHCA_PAGESHIFT, + vma->vm_page_prot); + if (unlikely(ret)) { + ehca_gen_err("remap_pfn_range() failed ret=%i", ret); + return -ENOMEM; + } + + vma->vm_private_data = mm_count; + (*mm_count)++; + vma->vm_ops = &vm_ops; + + return 0; +} + +static int ehca_mmap_queue(struct vm_area_struct *vma, struct ipz_queue *queue, + u32 *mm_count) +{ + int ret; + u64 start, ofs; + struct page *page; + + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + start = vma->vm_start; + for (ofs = 0; ofs < queue->queue_length; ofs += PAGE_SIZE) { + u64 virt_addr = (u64)ipz_qeit_calc(queue, ofs); + page = virt_to_page(virt_addr); + ret = vm_insert_page(vma, start, page); + if (unlikely(ret)) { + ehca_gen_err("vm_insert_page() failed rc=%i", ret); + return ret; + } + start += PAGE_SIZE; + } + vma->vm_private_data = mm_count; + (*mm_count)++; + vma->vm_ops = &vm_ops; + + return 0; +} + +static int ehca_mmap_cq(struct vm_area_struct *vma, struct ehca_cq *cq, + u32 rsrc_type) +{ + int ret; + + switch (rsrc_type) { + case 0: /* galpa fw handle */ + ehca_dbg(cq->ib_cq.device, "cq_num=%x fw", cq->cq_number); + ret = ehca_mmap_fw(vma, &cq->galpas, &cq->mm_count_galpa); + if (unlikely(ret)) { + ehca_err(cq->ib_cq.device, + "ehca_mmap_fw() failed rc=%i cq_num=%x", + ret, cq->cq_number); + return ret; + } + break; + + case 1: /* cq queue_addr */ + ehca_dbg(cq->ib_cq.device, "cq_num=%x queue", cq->cq_number); + ret = ehca_mmap_queue(vma, &cq->ipz_queue, &cq->mm_count_queue); + if (unlikely(ret)) { + ehca_err(cq->ib_cq.device, + "ehca_mmap_queue() failed rc=%i cq_num=%x", + ret, cq->cq_number); + return ret; + } + break; + + default: + ehca_err(cq->ib_cq.device, "bad resource type=%x cq_num=%x", + rsrc_type, cq->cq_number); + return -EINVAL; + } + + return 0; +} + +static int ehca_mmap_qp(struct vm_area_struct *vma, struct ehca_qp *qp, + u32 rsrc_type) +{ + int ret; + + switch (rsrc_type) { + case 0: /* galpa fw handle */ + ehca_dbg(qp->ib_qp.device, "qp_num=%x fw", qp->ib_qp.qp_num); + ret = ehca_mmap_fw(vma, &qp->galpas, &qp->mm_count_galpa); + if (unlikely(ret)) { + ehca_err(qp->ib_qp.device, + "remap_pfn_range() failed ret=%i qp_num=%x", + ret, qp->ib_qp.qp_num); + return -ENOMEM; + } + break; + + case 1: /* qp rqueue_addr */ + ehca_dbg(qp->ib_qp.device, "qp_num=%x rq", qp->ib_qp.qp_num); + ret = ehca_mmap_queue(vma, &qp->ipz_rqueue, + &qp->mm_count_rqueue); + if (unlikely(ret)) { + ehca_err(qp->ib_qp.device, + "ehca_mmap_queue(rq) failed rc=%i qp_num=%x", + ret, qp->ib_qp.qp_num); + return ret; + } + break; + + case 2: /* qp squeue_addr */ + ehca_dbg(qp->ib_qp.device, "qp_num=%x sq", qp->ib_qp.qp_num); + ret = ehca_mmap_queue(vma, &qp->ipz_squeue, + &qp->mm_count_squeue); + if (unlikely(ret)) { + ehca_err(qp->ib_qp.device, + "ehca_mmap_queue(sq) failed rc=%i qp_num=%x", + ret, qp->ib_qp.qp_num); + return ret; + } + break; + + default: + ehca_err(qp->ib_qp.device, "bad resource type=%x qp=num=%x", + rsrc_type, qp->ib_qp.qp_num); + return -EINVAL; + } + + return 0; +} + +int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + u64 fileoffset = vma->vm_pgoff; + u32 idr_handle = fileoffset & 0x1FFFFFF; + u32 q_type = (fileoffset >> 27) & 0x1; /* CQ, QP,... */ + u32 rsrc_type = (fileoffset >> 25) & 0x3; /* sq,rq,cmnd_window */ + u32 ret; + struct ehca_cq *cq; + struct ehca_qp *qp; + struct ib_uobject *uobject; + + switch (q_type) { + case 0: /* CQ */ + read_lock(&ehca_cq_idr_lock); + cq = idr_find(&ehca_cq_idr, idr_handle); + read_unlock(&ehca_cq_idr_lock); + + /* make sure this mmap really belongs to the authorized user */ + if (!cq) + return -EINVAL; + + if (!cq->ib_cq.uobject || cq->ib_cq.uobject->context != context) + return -EINVAL; + + ret = ehca_mmap_cq(vma, cq, rsrc_type); + if (unlikely(ret)) { + ehca_err(cq->ib_cq.device, + "ehca_mmap_cq() failed rc=%i cq_num=%x", + ret, cq->cq_number); + return ret; + } + break; + + case 1: /* QP */ + read_lock(&ehca_qp_idr_lock); + qp = idr_find(&ehca_qp_idr, idr_handle); + read_unlock(&ehca_qp_idr_lock); + + /* make sure this mmap really belongs to the authorized user */ + if (!qp) + return -EINVAL; + + uobject = IS_SRQ(qp) ? qp->ib_srq.uobject : qp->ib_qp.uobject; + if (!uobject || uobject->context != context) + return -EINVAL; + + ret = ehca_mmap_qp(vma, qp, rsrc_type); + if (unlikely(ret)) { + ehca_err(qp->ib_qp.device, + "ehca_mmap_qp() failed rc=%i qp_num=%x", + ret, qp->ib_qp.qp_num); + return ret; + } + break; + + default: + ehca_gen_err("bad queue type %x", q_type); + return -EINVAL; + } + + return 0; +} diff --git a/drivers/staging/rdma/ehca/hcp_if.c b/drivers/staging/rdma/ehca/hcp_if.c new file mode 100644 index 000000000..89517ffb4 --- /dev/null +++ b/drivers/staging/rdma/ehca/hcp_if.c @@ -0,0 +1,949 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Firmware Infiniband Interface code for POWER + * + * Authors: Christoph Raisch + * Hoang-Nam Nguyen + * Joachim Fenkes + * Gerd Bayer + * Waleri Fomin + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include "ehca_tools.h" +#include "hcp_if.h" +#include "hcp_phyp.h" +#include "hipz_fns.h" +#include "ipz_pt_fn.h" + +#define H_ALL_RES_QP_ENHANCED_OPS EHCA_BMASK_IBM(9, 11) +#define H_ALL_RES_QP_PTE_PIN EHCA_BMASK_IBM(12, 12) +#define H_ALL_RES_QP_SERVICE_TYPE EHCA_BMASK_IBM(13, 15) +#define H_ALL_RES_QP_STORAGE EHCA_BMASK_IBM(16, 17) +#define H_ALL_RES_QP_LL_RQ_CQE_POSTING EHCA_BMASK_IBM(18, 18) +#define H_ALL_RES_QP_LL_SQ_CQE_POSTING EHCA_BMASK_IBM(19, 21) +#define H_ALL_RES_QP_SIGNALING_TYPE EHCA_BMASK_IBM(22, 23) +#define H_ALL_RES_QP_UD_AV_LKEY_CTRL EHCA_BMASK_IBM(31, 31) +#define H_ALL_RES_QP_SMALL_SQ_PAGE_SIZE EHCA_BMASK_IBM(32, 35) +#define H_ALL_RES_QP_SMALL_RQ_PAGE_SIZE EHCA_BMASK_IBM(36, 39) +#define H_ALL_RES_QP_RESOURCE_TYPE EHCA_BMASK_IBM(56, 63) + +#define H_ALL_RES_QP_MAX_OUTST_SEND_WR EHCA_BMASK_IBM(0, 15) +#define H_ALL_RES_QP_MAX_OUTST_RECV_WR EHCA_BMASK_IBM(16, 31) +#define H_ALL_RES_QP_MAX_SEND_SGE EHCA_BMASK_IBM(32, 39) +#define H_ALL_RES_QP_MAX_RECV_SGE EHCA_BMASK_IBM(40, 47) + +#define H_ALL_RES_QP_UD_AV_LKEY EHCA_BMASK_IBM(32, 63) +#define H_ALL_RES_QP_SRQ_QP_TOKEN EHCA_BMASK_IBM(0, 31) +#define H_ALL_RES_QP_SRQ_QP_HANDLE EHCA_BMASK_IBM(0, 64) +#define H_ALL_RES_QP_SRQ_LIMIT EHCA_BMASK_IBM(48, 63) +#define H_ALL_RES_QP_SRQ_QPN EHCA_BMASK_IBM(40, 63) + +#define H_ALL_RES_QP_ACT_OUTST_SEND_WR EHCA_BMASK_IBM(16, 31) +#define H_ALL_RES_QP_ACT_OUTST_RECV_WR EHCA_BMASK_IBM(48, 63) +#define H_ALL_RES_QP_ACT_SEND_SGE EHCA_BMASK_IBM(8, 15) +#define H_ALL_RES_QP_ACT_RECV_SGE EHCA_BMASK_IBM(24, 31) + +#define H_ALL_RES_QP_SQUEUE_SIZE_PAGES EHCA_BMASK_IBM(0, 31) +#define H_ALL_RES_QP_RQUEUE_SIZE_PAGES EHCA_BMASK_IBM(32, 63) + +#define H_MP_INIT_TYPE EHCA_BMASK_IBM(44, 47) +#define H_MP_SHUTDOWN EHCA_BMASK_IBM(48, 48) +#define H_MP_RESET_QKEY_CTR EHCA_BMASK_IBM(49, 49) + +#define HCALL4_REGS_FORMAT "r4=%lx r5=%lx r6=%lx r7=%lx" +#define HCALL7_REGS_FORMAT HCALL4_REGS_FORMAT " r8=%lx r9=%lx r10=%lx" +#define HCALL9_REGS_FORMAT HCALL7_REGS_FORMAT " r11=%lx r12=%lx" + +static DEFINE_SPINLOCK(hcall_lock); + +static long ehca_plpar_hcall_norets(unsigned long opcode, + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4, + unsigned long arg5, + unsigned long arg6, + unsigned long arg7) +{ + long ret; + int i, sleep_msecs; + unsigned long flags = 0; + + if (unlikely(ehca_debug_level >= 2)) + ehca_gen_dbg("opcode=%lx " HCALL7_REGS_FORMAT, + opcode, arg1, arg2, arg3, arg4, arg5, arg6, arg7); + + for (i = 0; i < 5; i++) { + /* serialize hCalls to work around firmware issue */ + if (ehca_lock_hcalls) + spin_lock_irqsave(&hcall_lock, flags); + + ret = plpar_hcall_norets(opcode, arg1, arg2, arg3, arg4, + arg5, arg6, arg7); + + if (ehca_lock_hcalls) + spin_unlock_irqrestore(&hcall_lock, flags); + + if (H_IS_LONG_BUSY(ret)) { + sleep_msecs = get_longbusy_msecs(ret); + msleep_interruptible(sleep_msecs); + continue; + } + + if (ret < H_SUCCESS) + ehca_gen_err("opcode=%lx ret=%li " HCALL7_REGS_FORMAT, + opcode, ret, arg1, arg2, arg3, + arg4, arg5, arg6, arg7); + else + if (unlikely(ehca_debug_level >= 2)) + ehca_gen_dbg("opcode=%lx ret=%li", opcode, ret); + + return ret; + } + + return H_BUSY; +} + +static long ehca_plpar_hcall9(unsigned long opcode, + unsigned long *outs, /* array of 9 outputs */ + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4, + unsigned long arg5, + unsigned long arg6, + unsigned long arg7, + unsigned long arg8, + unsigned long arg9) +{ + long ret; + int i, sleep_msecs; + unsigned long flags = 0; + + if (unlikely(ehca_debug_level >= 2)) + ehca_gen_dbg("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT, opcode, + arg1, arg2, arg3, arg4, arg5, + arg6, arg7, arg8, arg9); + + for (i = 0; i < 5; i++) { + /* serialize hCalls to work around firmware issue */ + if (ehca_lock_hcalls) + spin_lock_irqsave(&hcall_lock, flags); + + ret = plpar_hcall9(opcode, outs, + arg1, arg2, arg3, arg4, arg5, + arg6, arg7, arg8, arg9); + + if (ehca_lock_hcalls) + spin_unlock_irqrestore(&hcall_lock, flags); + + if (H_IS_LONG_BUSY(ret)) { + sleep_msecs = get_longbusy_msecs(ret); + msleep_interruptible(sleep_msecs); + continue; + } + + if (ret < H_SUCCESS) { + ehca_gen_err("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT, + opcode, arg1, arg2, arg3, arg4, arg5, + arg6, arg7, arg8, arg9); + ehca_gen_err("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT, + ret, outs[0], outs[1], outs[2], outs[3], + outs[4], outs[5], outs[6], outs[7], + outs[8]); + } else if (unlikely(ehca_debug_level >= 2)) + ehca_gen_dbg("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT, + ret, outs[0], outs[1], outs[2], outs[3], + outs[4], outs[5], outs[6], outs[7], + outs[8]); + return ret; + } + + return H_BUSY; +} + +u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle, + struct ehca_pfeq *pfeq, + const u32 neq_control, + const u32 number_of_entries, + struct ipz_eq_handle *eq_handle, + u32 *act_nr_of_entries, + u32 *act_pages, + u32 *eq_ist) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + u64 allocate_controls; + + /* resource type */ + allocate_controls = 3ULL; + + /* ISN is associated */ + if (neq_control != 1) + allocate_controls = (1ULL << (63 - 7)) | allocate_controls; + else /* notification event queue */ + allocate_controls = (1ULL << 63) | allocate_controls; + + ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs, + adapter_handle.handle, /* r4 */ + allocate_controls, /* r5 */ + number_of_entries, /* r6 */ + 0, 0, 0, 0, 0, 0); + eq_handle->handle = outs[0]; + *act_nr_of_entries = (u32)outs[3]; + *act_pages = (u32)outs[4]; + *eq_ist = (u32)outs[5]; + + if (ret == H_NOT_ENOUGH_RESOURCES) + ehca_gen_err("Not enough resource - ret=%lli ", ret); + + return ret; +} + +u64 hipz_h_reset_event(const struct ipz_adapter_handle adapter_handle, + struct ipz_eq_handle eq_handle, + const u64 event_mask) +{ + return ehca_plpar_hcall_norets(H_RESET_EVENTS, + adapter_handle.handle, /* r4 */ + eq_handle.handle, /* r5 */ + event_mask, /* r6 */ + 0, 0, 0, 0); +} + +u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle, + struct ehca_cq *cq, + struct ehca_alloc_cq_parms *param) +{ + int rc; + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs, + adapter_handle.handle, /* r4 */ + 2, /* r5 */ + param->eq_handle.handle, /* r6 */ + cq->token, /* r7 */ + param->nr_cqe, /* r8 */ + 0, 0, 0, 0); + cq->ipz_cq_handle.handle = outs[0]; + param->act_nr_of_entries = (u32)outs[3]; + param->act_pages = (u32)outs[4]; + + if (ret == H_SUCCESS) { + rc = hcp_galpas_ctor(&cq->galpas, 0, outs[5], outs[6]); + if (rc) { + ehca_gen_err("Could not establish HW access. rc=%d paddr=%#lx", + rc, outs[5]); + + ehca_plpar_hcall_norets(H_FREE_RESOURCE, + adapter_handle.handle, /* r4 */ + cq->ipz_cq_handle.handle, /* r5 */ + 0, 0, 0, 0, 0); + ret = H_NO_MEM; + } + } + + if (ret == H_NOT_ENOUGH_RESOURCES) + ehca_gen_err("Not enough resources. ret=%lli", ret); + + return ret; +} + +u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle, + struct ehca_alloc_qp_parms *parms, int is_user) +{ + int rc; + u64 ret; + u64 allocate_controls, max_r10_reg, r11, r12; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + allocate_controls = + EHCA_BMASK_SET(H_ALL_RES_QP_ENHANCED_OPS, parms->ext_type) + | EHCA_BMASK_SET(H_ALL_RES_QP_PTE_PIN, 0) + | EHCA_BMASK_SET(H_ALL_RES_QP_SERVICE_TYPE, parms->servicetype) + | EHCA_BMASK_SET(H_ALL_RES_QP_SIGNALING_TYPE, parms->sigtype) + | EHCA_BMASK_SET(H_ALL_RES_QP_STORAGE, parms->qp_storage) + | EHCA_BMASK_SET(H_ALL_RES_QP_SMALL_SQ_PAGE_SIZE, + parms->squeue.page_size) + | EHCA_BMASK_SET(H_ALL_RES_QP_SMALL_RQ_PAGE_SIZE, + parms->rqueue.page_size) + | EHCA_BMASK_SET(H_ALL_RES_QP_LL_RQ_CQE_POSTING, + !!(parms->ll_comp_flags & LLQP_RECV_COMP)) + | EHCA_BMASK_SET(H_ALL_RES_QP_LL_SQ_CQE_POSTING, + !!(parms->ll_comp_flags & LLQP_SEND_COMP)) + | EHCA_BMASK_SET(H_ALL_RES_QP_UD_AV_LKEY_CTRL, + parms->ud_av_l_key_ctl) + | EHCA_BMASK_SET(H_ALL_RES_QP_RESOURCE_TYPE, 1); + + max_r10_reg = + EHCA_BMASK_SET(H_ALL_RES_QP_MAX_OUTST_SEND_WR, + parms->squeue.max_wr + 1) + | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_OUTST_RECV_WR, + parms->rqueue.max_wr + 1) + | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_SEND_SGE, + parms->squeue.max_sge) + | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_RECV_SGE, + parms->rqueue.max_sge); + + r11 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_QP_TOKEN, parms->srq_token); + + if (parms->ext_type == EQPT_SRQ) + r12 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_LIMIT, parms->srq_limit); + else + r12 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_QPN, parms->srq_qpn); + + ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs, + adapter_handle.handle, /* r4 */ + allocate_controls, /* r5 */ + parms->send_cq_handle.handle, + parms->recv_cq_handle.handle, + parms->eq_handle.handle, + ((u64)parms->token << 32) | parms->pd.value, + max_r10_reg, r11, r12); + + parms->qp_handle.handle = outs[0]; + parms->real_qp_num = (u32)outs[1]; + parms->squeue.act_nr_wqes = + (u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_SEND_WR, outs[2]); + parms->rqueue.act_nr_wqes = + (u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_RECV_WR, outs[2]); + parms->squeue.act_nr_sges = + (u8)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_SEND_SGE, outs[3]); + parms->rqueue.act_nr_sges = + (u8)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_RECV_SGE, outs[3]); + parms->squeue.queue_size = + (u32)EHCA_BMASK_GET(H_ALL_RES_QP_SQUEUE_SIZE_PAGES, outs[4]); + parms->rqueue.queue_size = + (u32)EHCA_BMASK_GET(H_ALL_RES_QP_RQUEUE_SIZE_PAGES, outs[4]); + + if (ret == H_SUCCESS) { + rc = hcp_galpas_ctor(&parms->galpas, is_user, outs[6], outs[6]); + if (rc) { + ehca_gen_err("Could not establish HW access. rc=%d paddr=%#lx", + rc, outs[6]); + + ehca_plpar_hcall_norets(H_FREE_RESOURCE, + adapter_handle.handle, /* r4 */ + parms->qp_handle.handle, /* r5 */ + 0, 0, 0, 0, 0); + ret = H_NO_MEM; + } + } + + if (ret == H_NOT_ENOUGH_RESOURCES) + ehca_gen_err("Not enough resources. ret=%lli", ret); + + return ret; +} + +u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle, + const u8 port_id, + struct hipz_query_port *query_port_response_block) +{ + u64 ret; + u64 r_cb = __pa(query_port_response_block); + + if (r_cb & (EHCA_PAGESIZE-1)) { + ehca_gen_err("response block not page aligned"); + return H_PARAMETER; + } + + ret = ehca_plpar_hcall_norets(H_QUERY_PORT, + adapter_handle.handle, /* r4 */ + port_id, /* r5 */ + r_cb, /* r6 */ + 0, 0, 0, 0); + + if (ehca_debug_level >= 2) + ehca_dmp(query_port_response_block, 64, "response_block"); + + return ret; +} + +u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle, + const u8 port_id, const u32 port_cap, + const u8 init_type, const int modify_mask) +{ + u64 port_attributes = port_cap; + + if (modify_mask & IB_PORT_SHUTDOWN) + port_attributes |= EHCA_BMASK_SET(H_MP_SHUTDOWN, 1); + if (modify_mask & IB_PORT_INIT_TYPE) + port_attributes |= EHCA_BMASK_SET(H_MP_INIT_TYPE, init_type); + if (modify_mask & IB_PORT_RESET_QKEY_CNTR) + port_attributes |= EHCA_BMASK_SET(H_MP_RESET_QKEY_CTR, 1); + + return ehca_plpar_hcall_norets(H_MODIFY_PORT, + adapter_handle.handle, /* r4 */ + port_id, /* r5 */ + port_attributes, /* r6 */ + 0, 0, 0, 0); +} + +u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle, + struct hipz_query_hca *query_hca_rblock) +{ + u64 r_cb = __pa(query_hca_rblock); + + if (r_cb & (EHCA_PAGESIZE-1)) { + ehca_gen_err("response_block=%p not page aligned", + query_hca_rblock); + return H_PARAMETER; + } + + return ehca_plpar_hcall_norets(H_QUERY_HCA, + adapter_handle.handle, /* r4 */ + r_cb, /* r5 */ + 0, 0, 0, 0, 0); +} + +u64 hipz_h_register_rpage(const struct ipz_adapter_handle adapter_handle, + const u8 pagesize, + const u8 queue_type, + const u64 resource_handle, + const u64 logical_address_of_page, + u64 count) +{ + return ehca_plpar_hcall_norets(H_REGISTER_RPAGES, + adapter_handle.handle, /* r4 */ + (u64)queue_type | ((u64)pagesize) << 8, + /* r5 */ + resource_handle, /* r6 */ + logical_address_of_page, /* r7 */ + count, /* r8 */ + 0, 0); +} + +u64 hipz_h_register_rpage_eq(const struct ipz_adapter_handle adapter_handle, + const struct ipz_eq_handle eq_handle, + struct ehca_pfeq *pfeq, + const u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count) +{ + if (count != 1) { + ehca_gen_err("Ppage counter=%llx", count); + return H_PARAMETER; + } + return hipz_h_register_rpage(adapter_handle, + pagesize, + queue_type, + eq_handle.handle, + logical_address_of_page, count); +} + +u64 hipz_h_query_int_state(const struct ipz_adapter_handle adapter_handle, + u32 ist) +{ + u64 ret; + ret = ehca_plpar_hcall_norets(H_QUERY_INT_STATE, + adapter_handle.handle, /* r4 */ + ist, /* r5 */ + 0, 0, 0, 0, 0); + + if (ret != H_SUCCESS && ret != H_BUSY) + ehca_gen_err("Could not query interrupt state."); + + return ret; +} + +u64 hipz_h_register_rpage_cq(const struct ipz_adapter_handle adapter_handle, + const struct ipz_cq_handle cq_handle, + struct ehca_pfcq *pfcq, + const u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count, + const struct h_galpa gal) +{ + if (count != 1) { + ehca_gen_err("Page counter=%llx", count); + return H_PARAMETER; + } + + return hipz_h_register_rpage(adapter_handle, pagesize, queue_type, + cq_handle.handle, logical_address_of_page, + count); +} + +u64 hipz_h_register_rpage_qp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + const u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count, + const struct h_galpa galpa) +{ + if (count > 1) { + ehca_gen_err("Page counter=%llx", count); + return H_PARAMETER; + } + + return hipz_h_register_rpage(adapter_handle, pagesize, queue_type, + qp_handle.handle, logical_address_of_page, + count); +} + +u64 hipz_h_disable_and_get_wqe(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + void **log_addr_next_sq_wqe2processed, + void **log_addr_next_rq_wqe2processed, + int dis_and_get_function_code) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_DISABLE_AND_GETC, outs, + adapter_handle.handle, /* r4 */ + dis_and_get_function_code, /* r5 */ + qp_handle.handle, /* r6 */ + 0, 0, 0, 0, 0, 0); + if (log_addr_next_sq_wqe2processed) + *log_addr_next_sq_wqe2processed = (void *)outs[0]; + if (log_addr_next_rq_wqe2processed) + *log_addr_next_rq_wqe2processed = (void *)outs[1]; + + return ret; +} + +u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + const u64 update_mask, + struct hcp_modify_qp_control_block *mqpcb, + struct h_galpa gal) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + ret = ehca_plpar_hcall9(H_MODIFY_QP, outs, + adapter_handle.handle, /* r4 */ + qp_handle.handle, /* r5 */ + update_mask, /* r6 */ + __pa(mqpcb), /* r7 */ + 0, 0, 0, 0, 0); + + if (ret == H_NOT_ENOUGH_RESOURCES) + ehca_gen_err("Insufficient resources ret=%lli", ret); + + return ret; +} + +u64 hipz_h_query_qp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + struct hcp_modify_qp_control_block *qqpcb, + struct h_galpa gal) +{ + return ehca_plpar_hcall_norets(H_QUERY_QP, + adapter_handle.handle, /* r4 */ + qp_handle.handle, /* r5 */ + __pa(qqpcb), /* r6 */ + 0, 0, 0, 0); +} + +u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle, + struct ehca_qp *qp) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = hcp_galpas_dtor(&qp->galpas); + if (ret) { + ehca_gen_err("Could not destruct qp->galpas"); + return H_RESOURCE; + } + ret = ehca_plpar_hcall9(H_DISABLE_AND_GETC, outs, + adapter_handle.handle, /* r4 */ + /* function code */ + 1, /* r5 */ + qp->ipz_qp_handle.handle, /* r6 */ + 0, 0, 0, 0, 0, 0); + if (ret == H_HARDWARE) + ehca_gen_err("HCA not operational. ret=%lli", ret); + + ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE, + adapter_handle.handle, /* r4 */ + qp->ipz_qp_handle.handle, /* r5 */ + 0, 0, 0, 0, 0); + + if (ret == H_RESOURCE) + ehca_gen_err("Resource still in use. ret=%lli", ret); + + return ret; +} + +u64 hipz_h_define_aqp0(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u32 port) +{ + return ehca_plpar_hcall_norets(H_DEFINE_AQP0, + adapter_handle.handle, /* r4 */ + qp_handle.handle, /* r5 */ + port, /* r6 */ + 0, 0, 0, 0); +} + +u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u32 port, u32 * pma_qp_nr, + u32 * bma_qp_nr) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_DEFINE_AQP1, outs, + adapter_handle.handle, /* r4 */ + qp_handle.handle, /* r5 */ + port, /* r6 */ + 0, 0, 0, 0, 0, 0); + *pma_qp_nr = (u32)outs[0]; + *bma_qp_nr = (u32)outs[1]; + + if (ret == H_ALIAS_EXIST) + ehca_gen_err("AQP1 already exists. ret=%lli", ret); + + return ret; +} + +u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u16 mcg_dlid, + u64 subnet_prefix, u64 interface_id) +{ + u64 ret; + + ret = ehca_plpar_hcall_norets(H_ATTACH_MCQP, + adapter_handle.handle, /* r4 */ + qp_handle.handle, /* r5 */ + mcg_dlid, /* r6 */ + interface_id, /* r7 */ + subnet_prefix, /* r8 */ + 0, 0); + + if (ret == H_NOT_ENOUGH_RESOURCES) + ehca_gen_err("Not enough resources. ret=%lli", ret); + + return ret; +} + +u64 hipz_h_detach_mcqp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u16 mcg_dlid, + u64 subnet_prefix, u64 interface_id) +{ + return ehca_plpar_hcall_norets(H_DETACH_MCQP, + adapter_handle.handle, /* r4 */ + qp_handle.handle, /* r5 */ + mcg_dlid, /* r6 */ + interface_id, /* r7 */ + subnet_prefix, /* r8 */ + 0, 0); +} + +u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle, + struct ehca_cq *cq, + u8 force_flag) +{ + u64 ret; + + ret = hcp_galpas_dtor(&cq->galpas); + if (ret) { + ehca_gen_err("Could not destruct cp->galpas"); + return H_RESOURCE; + } + + ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE, + adapter_handle.handle, /* r4 */ + cq->ipz_cq_handle.handle, /* r5 */ + force_flag != 0 ? 1L : 0L, /* r6 */ + 0, 0, 0, 0); + + if (ret == H_RESOURCE) + ehca_gen_err("H_FREE_RESOURCE failed ret=%lli ", ret); + + return ret; +} + +u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle, + struct ehca_eq *eq) +{ + u64 ret; + + ret = hcp_galpas_dtor(&eq->galpas); + if (ret) { + ehca_gen_err("Could not destruct eq->galpas"); + return H_RESOURCE; + } + + ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE, + adapter_handle.handle, /* r4 */ + eq->ipz_eq_handle.handle, /* r5 */ + 0, 0, 0, 0, 0); + + if (ret == H_RESOURCE) + ehca_gen_err("Resource in use. ret=%lli ", ret); + + return ret; +} + +u64 hipz_h_alloc_resource_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const u64 vaddr, + const u64 length, + const u32 access_ctrl, + const struct ipz_pd pd, + struct ehca_mr_hipzout_parms *outparms) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs, + adapter_handle.handle, /* r4 */ + 5, /* r5 */ + vaddr, /* r6 */ + length, /* r7 */ + (((u64)access_ctrl) << 32ULL), /* r8 */ + pd.value, /* r9 */ + 0, 0, 0); + outparms->handle.handle = outs[0]; + outparms->lkey = (u32)outs[2]; + outparms->rkey = (u32)outs[3]; + + return ret; +} + +u64 hipz_h_register_rpage_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count) +{ + u64 ret; + + if (unlikely(ehca_debug_level >= 3)) { + if (count > 1) { + u64 *kpage; + int i; + kpage = __va(logical_address_of_page); + for (i = 0; i < count; i++) + ehca_gen_dbg("kpage[%d]=%p", + i, (void *)kpage[i]); + } else + ehca_gen_dbg("kpage=%p", + (void *)logical_address_of_page); + } + + if ((count > 1) && (logical_address_of_page & (EHCA_PAGESIZE-1))) { + ehca_gen_err("logical_address_of_page not on a 4k boundary " + "adapter_handle=%llx mr=%p mr_handle=%llx " + "pagesize=%x queue_type=%x " + "logical_address_of_page=%llx count=%llx", + adapter_handle.handle, mr, + mr->ipz_mr_handle.handle, pagesize, queue_type, + logical_address_of_page, count); + ret = H_PARAMETER; + } else + ret = hipz_h_register_rpage(adapter_handle, pagesize, + queue_type, + mr->ipz_mr_handle.handle, + logical_address_of_page, count); + return ret; +} + +u64 hipz_h_query_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + struct ehca_mr_hipzout_parms *outparms) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_QUERY_MR, outs, + adapter_handle.handle, /* r4 */ + mr->ipz_mr_handle.handle, /* r5 */ + 0, 0, 0, 0, 0, 0, 0); + outparms->len = outs[0]; + outparms->vaddr = outs[1]; + outparms->acl = outs[4] >> 32; + outparms->lkey = (u32)(outs[5] >> 32); + outparms->rkey = (u32)(outs[5] & (0xffffffff)); + + return ret; +} + +u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr) +{ + return ehca_plpar_hcall_norets(H_FREE_RESOURCE, + adapter_handle.handle, /* r4 */ + mr->ipz_mr_handle.handle, /* r5 */ + 0, 0, 0, 0, 0); +} + +u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const u64 vaddr_in, + const u64 length, + const u32 access_ctrl, + const struct ipz_pd pd, + const u64 mr_addr_cb, + struct ehca_mr_hipzout_parms *outparms) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_REREGISTER_PMR, outs, + adapter_handle.handle, /* r4 */ + mr->ipz_mr_handle.handle, /* r5 */ + vaddr_in, /* r6 */ + length, /* r7 */ + /* r8 */ + ((((u64)access_ctrl) << 32ULL) | pd.value), + mr_addr_cb, /* r9 */ + 0, 0, 0); + outparms->vaddr = outs[1]; + outparms->lkey = (u32)outs[2]; + outparms->rkey = (u32)outs[3]; + + return ret; +} + +u64 hipz_h_register_smr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const struct ehca_mr *orig_mr, + const u64 vaddr_in, + const u32 access_ctrl, + const struct ipz_pd pd, + struct ehca_mr_hipzout_parms *outparms) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_REGISTER_SMR, outs, + adapter_handle.handle, /* r4 */ + orig_mr->ipz_mr_handle.handle, /* r5 */ + vaddr_in, /* r6 */ + (((u64)access_ctrl) << 32ULL), /* r7 */ + pd.value, /* r8 */ + 0, 0, 0, 0); + outparms->handle.handle = outs[0]; + outparms->lkey = (u32)outs[2]; + outparms->rkey = (u32)outs[3]; + + return ret; +} + +u64 hipz_h_alloc_resource_mw(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mw *mw, + const struct ipz_pd pd, + struct ehca_mw_hipzout_parms *outparms) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs, + adapter_handle.handle, /* r4 */ + 6, /* r5 */ + pd.value, /* r6 */ + 0, 0, 0, 0, 0, 0); + outparms->handle.handle = outs[0]; + outparms->rkey = (u32)outs[3]; + + return ret; +} + +u64 hipz_h_query_mw(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mw *mw, + struct ehca_mw_hipzout_parms *outparms) +{ + u64 ret; + unsigned long outs[PLPAR_HCALL9_BUFSIZE]; + + ret = ehca_plpar_hcall9(H_QUERY_MW, outs, + adapter_handle.handle, /* r4 */ + mw->ipz_mw_handle.handle, /* r5 */ + 0, 0, 0, 0, 0, 0, 0); + outparms->rkey = (u32)outs[3]; + + return ret; +} + +u64 hipz_h_free_resource_mw(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mw *mw) +{ + return ehca_plpar_hcall_norets(H_FREE_RESOURCE, + adapter_handle.handle, /* r4 */ + mw->ipz_mw_handle.handle, /* r5 */ + 0, 0, 0, 0, 0); +} + +u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle, + const u64 ressource_handle, + void *rblock, + unsigned long *byte_count) +{ + u64 r_cb = __pa(rblock); + + if (r_cb & (EHCA_PAGESIZE-1)) { + ehca_gen_err("rblock not page aligned."); + return H_PARAMETER; + } + + return ehca_plpar_hcall_norets(H_ERROR_DATA, + adapter_handle.handle, + ressource_handle, + r_cb, + 0, 0, 0, 0); +} + +u64 hipz_h_eoi(int irq) +{ + unsigned long xirr; + + iosync(); + xirr = (0xffULL << 24) | irq; + + return plpar_hcall_norets(H_EOI, xirr); +} diff --git a/drivers/staging/rdma/ehca/hcp_if.h b/drivers/staging/rdma/ehca/hcp_if.h new file mode 100644 index 000000000..a46e514c3 --- /dev/null +++ b/drivers/staging/rdma/ehca/hcp_if.h @@ -0,0 +1,265 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Firmware Infiniband Interface code for POWER + * + * Authors: Christoph Raisch + * Hoang-Nam Nguyen + * Gerd Bayer + * Waleri Fomin + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __HCP_IF_H__ +#define __HCP_IF_H__ + +#include "ehca_classes.h" +#include "ehca_tools.h" +#include "hipz_hw.h" + +/* + * hipz_h_alloc_resource_eq allocates EQ resources in HW and FW, initialize + * resources, create the empty EQPT (ring). + */ +u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle, + struct ehca_pfeq *pfeq, + const u32 neq_control, + const u32 number_of_entries, + struct ipz_eq_handle *eq_handle, + u32 * act_nr_of_entries, + u32 * act_pages, + u32 * eq_ist); + +u64 hipz_h_reset_event(const struct ipz_adapter_handle adapter_handle, + struct ipz_eq_handle eq_handle, + const u64 event_mask); +/* + * hipz_h_allocate_resource_cq allocates CQ resources in HW and FW, initialize + * resources, create the empty CQPT (ring). + */ +u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle, + struct ehca_cq *cq, + struct ehca_alloc_cq_parms *param); + + +/* + * hipz_h_alloc_resource_qp allocates QP resources in HW and FW, + * initialize resources, create empty QPPTs (2 rings). + */ +u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle, + struct ehca_alloc_qp_parms *parms, int is_user); + +u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle, + const u8 port_id, + struct hipz_query_port *query_port_response_block); + +u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle, + const u8 port_id, const u32 port_cap, + const u8 init_type, const int modify_mask); + +u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle, + struct hipz_query_hca *query_hca_rblock); + +/* + * hipz_h_register_rpage internal function in hcp_if.h for all + * hcp_H_REGISTER_RPAGE calls. + */ +u64 hipz_h_register_rpage(const struct ipz_adapter_handle adapter_handle, + const u8 pagesize, + const u8 queue_type, + const u64 resource_handle, + const u64 logical_address_of_page, + u64 count); + +u64 hipz_h_register_rpage_eq(const struct ipz_adapter_handle adapter_handle, + const struct ipz_eq_handle eq_handle, + struct ehca_pfeq *pfeq, + const u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count); + +u64 hipz_h_query_int_state(const struct ipz_adapter_handle + hcp_adapter_handle, + u32 ist); + +u64 hipz_h_register_rpage_cq(const struct ipz_adapter_handle adapter_handle, + const struct ipz_cq_handle cq_handle, + struct ehca_pfcq *pfcq, + const u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count, + const struct h_galpa gal); + +u64 hipz_h_register_rpage_qp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + const u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count, + const struct h_galpa galpa); + +u64 hipz_h_disable_and_get_wqe(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + void **log_addr_next_sq_wqe_tb_processed, + void **log_addr_next_rq_wqe_tb_processed, + int dis_and_get_function_code); +enum hcall_sigt { + HCALL_SIGT_NO_CQE = 0, + HCALL_SIGT_BY_WQE = 1, + HCALL_SIGT_EVERY = 2 +}; + +u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + const u64 update_mask, + struct hcp_modify_qp_control_block *mqpcb, + struct h_galpa gal); + +u64 hipz_h_query_qp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct ehca_pfqp *pfqp, + struct hcp_modify_qp_control_block *qqpcb, + struct h_galpa gal); + +u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle, + struct ehca_qp *qp); + +u64 hipz_h_define_aqp0(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u32 port); + +u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u32 port, u32 * pma_qp_nr, + u32 * bma_qp_nr); + +u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u16 mcg_dlid, + u64 subnet_prefix, u64 interface_id); + +u64 hipz_h_detach_mcqp(const struct ipz_adapter_handle adapter_handle, + const struct ipz_qp_handle qp_handle, + struct h_galpa gal, + u16 mcg_dlid, + u64 subnet_prefix, u64 interface_id); + +u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle, + struct ehca_cq *cq, + u8 force_flag); + +u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle, + struct ehca_eq *eq); + +/* + * hipz_h_alloc_resource_mr allocates MR resources in HW and FW, initialize + * resources. + */ +u64 hipz_h_alloc_resource_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const u64 vaddr, + const u64 length, + const u32 access_ctrl, + const struct ipz_pd pd, + struct ehca_mr_hipzout_parms *outparms); + +/* hipz_h_register_rpage_mr registers MR resource pages in HW and FW */ +u64 hipz_h_register_rpage_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const u8 pagesize, + const u8 queue_type, + const u64 logical_address_of_page, + const u64 count); + +/* hipz_h_query_mr queries MR in HW and FW */ +u64 hipz_h_query_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + struct ehca_mr_hipzout_parms *outparms); + +/* hipz_h_free_resource_mr frees MR resources in HW and FW */ +u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr); + +/* hipz_h_reregister_pmr reregisters MR in HW and FW */ +u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const u64 vaddr_in, + const u64 length, + const u32 access_ctrl, + const struct ipz_pd pd, + const u64 mr_addr_cb, + struct ehca_mr_hipzout_parms *outparms); + +/* hipz_h_register_smr register shared MR in HW and FW */ +u64 hipz_h_register_smr(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mr *mr, + const struct ehca_mr *orig_mr, + const u64 vaddr_in, + const u32 access_ctrl, + const struct ipz_pd pd, + struct ehca_mr_hipzout_parms *outparms); + +/* + * hipz_h_alloc_resource_mw allocates MW resources in HW and FW, initialize + * resources. + */ +u64 hipz_h_alloc_resource_mw(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mw *mw, + const struct ipz_pd pd, + struct ehca_mw_hipzout_parms *outparms); + +/* hipz_h_query_mw queries MW in HW and FW */ +u64 hipz_h_query_mw(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mw *mw, + struct ehca_mw_hipzout_parms *outparms); + +/* hipz_h_free_resource_mw frees MW resources in HW and FW */ +u64 hipz_h_free_resource_mw(const struct ipz_adapter_handle adapter_handle, + const struct ehca_mw *mw); + +u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle, + const u64 ressource_handle, + void *rblock, + unsigned long *byte_count); +u64 hipz_h_eoi(int irq); + +#endif /* __HCP_IF_H__ */ diff --git a/drivers/staging/rdma/ehca/hcp_phyp.c b/drivers/staging/rdma/ehca/hcp_phyp.c new file mode 100644 index 000000000..077376ff3 --- /dev/null +++ b/drivers/staging/rdma/ehca/hcp_phyp.c @@ -0,0 +1,82 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * load store abstraction for ehca register access with tracing + * + * Authors: Christoph Raisch + * Hoang-Nam Nguyen + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "ehca_classes.h" +#include "hipz_hw.h" + +u64 hcall_map_page(u64 physaddr) +{ + return (u64)ioremap(physaddr, EHCA_PAGESIZE); +} + +int hcall_unmap_page(u64 mapaddr) +{ + iounmap((volatile void __iomem *) mapaddr); + return 0; +} + +int hcp_galpas_ctor(struct h_galpas *galpas, int is_user, + u64 paddr_kernel, u64 paddr_user) +{ + if (!is_user) { + galpas->kernel.fw_handle = hcall_map_page(paddr_kernel); + if (!galpas->kernel.fw_handle) + return -ENOMEM; + } else + galpas->kernel.fw_handle = 0; + + galpas->user.fw_handle = paddr_user; + + return 0; +} + +int hcp_galpas_dtor(struct h_galpas *galpas) +{ + if (galpas->kernel.fw_handle) { + int ret = hcall_unmap_page(galpas->kernel.fw_handle); + if (ret) + return ret; + } + + galpas->user.fw_handle = galpas->kernel.fw_handle = 0; + + return 0; +} diff --git a/drivers/staging/rdma/ehca/hcp_phyp.h b/drivers/staging/rdma/ehca/hcp_phyp.h new file mode 100644 index 000000000..d1b029910 --- /dev/null +++ b/drivers/staging/rdma/ehca/hcp_phyp.h @@ -0,0 +1,90 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * Firmware calls + * + * Authors: Christoph Raisch + * Hoang-Nam Nguyen + * Waleri Fomin + * Gerd Bayer + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __HCP_PHYP_H__ +#define __HCP_PHYP_H__ + + +/* + * eHCA page (mapped into memory) + * resource to access eHCA register pages in CPU address space +*/ +struct h_galpa { + u64 fw_handle; + /* for pSeries this is a 64bit memory address where + I/O memory is mapped into CPU address space (kv) */ +}; + +/* + * resource to access eHCA address space registers, all types + */ +struct h_galpas { + u32 pid; /*PID of userspace galpa checking */ + struct h_galpa user; /* user space accessible resource, + set to 0 if unused */ + struct h_galpa kernel; /* kernel space accessible resource, + set to 0 if unused */ +}; + +static inline u64 hipz_galpa_load(struct h_galpa galpa, u32 offset) +{ + u64 addr = galpa.fw_handle + offset; + return *(volatile u64 __force *)addr; +} + +static inline void hipz_galpa_store(struct h_galpa galpa, u32 offset, u64 value) +{ + u64 addr = galpa.fw_handle + offset; + *(volatile u64 __force *)addr = value; +} + +int hcp_galpas_ctor(struct h_galpas *galpas, int is_user, + u64 paddr_kernel, u64 paddr_user); + +int hcp_galpas_dtor(struct h_galpas *galpas); + +u64 hcall_map_page(u64 physaddr); + +int hcall_unmap_page(u64 mapaddr); + +#endif diff --git a/drivers/staging/rdma/ehca/hipz_fns.h b/drivers/staging/rdma/ehca/hipz_fns.h new file mode 100644 index 000000000..9dac93d02 --- /dev/null +++ b/drivers/staging/rdma/ehca/hipz_fns.h @@ -0,0 +1,68 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * HW abstraction register functions + * + * Authors: Christoph Raisch + * Reinhard Ernst + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __HIPZ_FNS_H__ +#define __HIPZ_FNS_H__ + +#include "ehca_classes.h" +#include "hipz_hw.h" + +#include "hipz_fns_core.h" + +#define hipz_galpa_store_eq(gal, offset, value) \ + hipz_galpa_store(gal, EQTEMM_OFFSET(offset), value) + +#define hipz_galpa_load_eq(gal, offset) \ + hipz_galpa_load(gal, EQTEMM_OFFSET(offset)) + +#define hipz_galpa_store_qped(gal, offset, value) \ + hipz_galpa_store(gal, QPEDMM_OFFSET(offset), value) + +#define hipz_galpa_load_qped(gal, offset) \ + hipz_galpa_load(gal, QPEDMM_OFFSET(offset)) + +#define hipz_galpa_store_mrmw(gal, offset, value) \ + hipz_galpa_store(gal, MRMWMM_OFFSET(offset), value) + +#define hipz_galpa_load_mrmw(gal, offset) \ + hipz_galpa_load(gal, MRMWMM_OFFSET(offset)) + +#endif diff --git a/drivers/staging/rdma/ehca/hipz_fns_core.h b/drivers/staging/rdma/ehca/hipz_fns_core.h new file mode 100644 index 000000000..868735fd3 --- /dev/null +++ b/drivers/staging/rdma/ehca/hipz_fns_core.h @@ -0,0 +1,100 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * HW abstraction register functions + * + * Authors: Christoph Raisch + * Heiko J Schick + * Hoang-Nam Nguyen + * Reinhard Ernst + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __HIPZ_FNS_CORE_H__ +#define __HIPZ_FNS_CORE_H__ + +#include "hcp_phyp.h" +#include "hipz_hw.h" + +#define hipz_galpa_store_cq(gal, offset, value) \ + hipz_galpa_store(gal, CQTEMM_OFFSET(offset), value) + +#define hipz_galpa_load_cq(gal, offset) \ + hipz_galpa_load(gal, CQTEMM_OFFSET(offset)) + +#define hipz_galpa_store_qp(gal, offset, value) \ + hipz_galpa_store(gal, QPTEMM_OFFSET(offset), value) +#define hipz_galpa_load_qp(gal, offset) \ + hipz_galpa_load(gal, QPTEMM_OFFSET(offset)) + +static inline void hipz_update_sqa(struct ehca_qp *qp, u16 nr_wqes) +{ + /* ringing doorbell :-) */ + hipz_galpa_store_qp(qp->galpas.kernel, qpx_sqa, + EHCA_BMASK_SET(QPX_SQADDER, nr_wqes)); +} + +static inline void hipz_update_rqa(struct ehca_qp *qp, u16 nr_wqes) +{ + /* ringing doorbell :-) */ + hipz_galpa_store_qp(qp->galpas.kernel, qpx_rqa, + EHCA_BMASK_SET(QPX_RQADDER, nr_wqes)); +} + +static inline void hipz_update_feca(struct ehca_cq *cq, u32 nr_cqes) +{ + hipz_galpa_store_cq(cq->galpas.kernel, cqx_feca, + EHCA_BMASK_SET(CQX_FECADDER, nr_cqes)); +} + +static inline void hipz_set_cqx_n0(struct ehca_cq *cq, u32 value) +{ + u64 cqx_n0_reg; + + hipz_galpa_store_cq(cq->galpas.kernel, cqx_n0, + EHCA_BMASK_SET(CQX_N0_GENERATE_SOLICITED_COMP_EVENT, + value)); + cqx_n0_reg = hipz_galpa_load_cq(cq->galpas.kernel, cqx_n0); +} + +static inline void hipz_set_cqx_n1(struct ehca_cq *cq, u32 value) +{ + u64 cqx_n1_reg; + + hipz_galpa_store_cq(cq->galpas.kernel, cqx_n1, + EHCA_BMASK_SET(CQX_N1_GENERATE_COMP_EVENT, value)); + cqx_n1_reg = hipz_galpa_load_cq(cq->galpas.kernel, cqx_n1); +} + +#endif /* __HIPZ_FNC_CORE_H__ */ diff --git a/drivers/staging/rdma/ehca/hipz_hw.h b/drivers/staging/rdma/ehca/hipz_hw.h new file mode 100644 index 000000000..bf996c7ac --- /dev/null +++ b/drivers/staging/rdma/ehca/hipz_hw.h @@ -0,0 +1,414 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * eHCA register definitions + * + * Authors: Waleri Fomin + * Christoph Raisch + * Reinhard Ernst + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __HIPZ_HW_H__ +#define __HIPZ_HW_H__ + +#include "ehca_tools.h" + +#define EHCA_MAX_MTU 4 + +/* QP Table Entry Memory Map */ +struct hipz_qptemm { + u64 qpx_hcr; + u64 qpx_c; + u64 qpx_herr; + u64 qpx_aer; +/* 0x20*/ + u64 qpx_sqa; + u64 qpx_sqc; + u64 qpx_rqa; + u64 qpx_rqc; +/* 0x40*/ + u64 qpx_st; + u64 qpx_pmstate; + u64 qpx_pmfa; + u64 qpx_pkey; +/* 0x60*/ + u64 qpx_pkeya; + u64 qpx_pkeyb; + u64 qpx_pkeyc; + u64 qpx_pkeyd; +/* 0x80*/ + u64 qpx_qkey; + u64 qpx_dqp; + u64 qpx_dlidp; + u64 qpx_portp; +/* 0xa0*/ + u64 qpx_slidp; + u64 qpx_slidpp; + u64 qpx_dlida; + u64 qpx_porta; +/* 0xc0*/ + u64 qpx_slida; + u64 qpx_slidpa; + u64 qpx_slvl; + u64 qpx_ipd; +/* 0xe0*/ + u64 qpx_mtu; + u64 qpx_lato; + u64 qpx_rlimit; + u64 qpx_rnrlimit; +/* 0x100*/ + u64 qpx_t; + u64 qpx_sqhp; + u64 qpx_sqptp; + u64 qpx_nspsn; +/* 0x120*/ + u64 qpx_nspsnhwm; + u64 reserved1; + u64 qpx_sdsi; + u64 qpx_sdsbc; +/* 0x140*/ + u64 qpx_sqwsize; + u64 qpx_sqwts; + u64 qpx_lsn; + u64 qpx_nssn; +/* 0x160 */ + u64 qpx_mor; + u64 qpx_cor; + u64 qpx_sqsize; + u64 qpx_erc; +/* 0x180*/ + u64 qpx_rnrrc; + u64 qpx_ernrwt; + u64 qpx_rnrresp; + u64 qpx_lmsna; +/* 0x1a0 */ + u64 qpx_sqhpc; + u64 qpx_sqcptp; + u64 qpx_sigt; + u64 qpx_wqecnt; +/* 0x1c0*/ + u64 qpx_rqhp; + u64 qpx_rqptp; + u64 qpx_rqsize; + u64 qpx_nrr; +/* 0x1e0*/ + u64 qpx_rdmac; + u64 qpx_nrpsn; + u64 qpx_lapsn; + u64 qpx_lcr; +/* 0x200*/ + u64 qpx_rwc; + u64 qpx_rwva; + u64 qpx_rdsi; + u64 qpx_rdsbc; +/* 0x220*/ + u64 qpx_rqwsize; + u64 qpx_crmsn; + u64 qpx_rdd; + u64 qpx_larpsn; +/* 0x240*/ + u64 qpx_pd; + u64 qpx_scqn; + u64 qpx_rcqn; + u64 qpx_aeqn; +/* 0x260*/ + u64 qpx_aaelog; + u64 qpx_ram; + u64 qpx_rdmaqe0; + u64 qpx_rdmaqe1; +/* 0x280*/ + u64 qpx_rdmaqe2; + u64 qpx_rdmaqe3; + u64 qpx_nrpsnhwm; +/* 0x298*/ + u64 reserved[(0x400 - 0x298) / 8]; +/* 0x400 extended data */ + u64 reserved_ext[(0x500 - 0x400) / 8]; +/* 0x500 */ + u64 reserved2[(0x1000 - 0x500) / 8]; +/* 0x1000 */ +}; + +#define QPX_SQADDER EHCA_BMASK_IBM(48, 63) +#define QPX_RQADDER EHCA_BMASK_IBM(48, 63) +#define QPX_AAELOG_RESET_SRQ_LIMIT EHCA_BMASK_IBM(3, 3) + +#define QPTEMM_OFFSET(x) offsetof(struct hipz_qptemm, x) + +/* MRMWPT Entry Memory Map */ +struct hipz_mrmwmm { + /* 0x00 */ + u64 mrx_hcr; + + u64 mrx_c; + u64 mrx_herr; + u64 mrx_aer; + /* 0x20 */ + u64 mrx_pp; + u64 reserved1; + u64 reserved2; + u64 reserved3; + /* 0x40 */ + u64 reserved4[(0x200 - 0x40) / 8]; + /* 0x200 */ + u64 mrx_ctl[64]; + +}; + +#define MRMWMM_OFFSET(x) offsetof(struct hipz_mrmwmm, x) + +struct hipz_qpedmm { + /* 0x00 */ + u64 reserved0[(0x400) / 8]; + /* 0x400 */ + u64 qpedx_phh; + u64 qpedx_ppsgp; + /* 0x410 */ + u64 qpedx_ppsgu; + u64 qpedx_ppdgp; + /* 0x420 */ + u64 qpedx_ppdgu; + u64 qpedx_aph; + /* 0x430 */ + u64 qpedx_apsgp; + u64 qpedx_apsgu; + /* 0x440 */ + u64 qpedx_apdgp; + u64 qpedx_apdgu; + /* 0x450 */ + u64 qpedx_apav; + u64 qpedx_apsav; + /* 0x460 */ + u64 qpedx_hcr; + u64 reserved1[4]; + /* 0x488 */ + u64 qpedx_rrl0; + /* 0x490 */ + u64 qpedx_rrrkey0; + u64 qpedx_rrva0; + /* 0x4a0 */ + u64 reserved2; + u64 qpedx_rrl1; + /* 0x4b0 */ + u64 qpedx_rrrkey1; + u64 qpedx_rrva1; + /* 0x4c0 */ + u64 reserved3; + u64 qpedx_rrl2; + /* 0x4d0 */ + u64 qpedx_rrrkey2; + u64 qpedx_rrva2; + /* 0x4e0 */ + u64 reserved4; + u64 qpedx_rrl3; + /* 0x4f0 */ + u64 qpedx_rrrkey3; + u64 qpedx_rrva3; +}; + +#define QPEDMM_OFFSET(x) offsetof(struct hipz_qpedmm, x) + +/* CQ Table Entry Memory Map */ +struct hipz_cqtemm { + u64 cqx_hcr; + u64 cqx_c; + u64 cqx_herr; + u64 cqx_aer; +/* 0x20 */ + u64 cqx_ptp; + u64 cqx_tp; + u64 cqx_fec; + u64 cqx_feca; +/* 0x40 */ + u64 cqx_ep; + u64 cqx_eq; +/* 0x50 */ + u64 reserved1; + u64 cqx_n0; +/* 0x60 */ + u64 cqx_n1; + u64 reserved2[(0x1000 - 0x60) / 8]; +/* 0x1000 */ +}; + +#define CQX_FEC_CQE_CNT EHCA_BMASK_IBM(32, 63) +#define CQX_FECADDER EHCA_BMASK_IBM(32, 63) +#define CQX_N0_GENERATE_SOLICITED_COMP_EVENT EHCA_BMASK_IBM(0, 0) +#define CQX_N1_GENERATE_COMP_EVENT EHCA_BMASK_IBM(0, 0) + +#define CQTEMM_OFFSET(x) offsetof(struct hipz_cqtemm, x) + +/* EQ Table Entry Memory Map */ +struct hipz_eqtemm { + u64 eqx_hcr; + u64 eqx_c; + + u64 eqx_herr; + u64 eqx_aer; +/* 0x20 */ + u64 eqx_ptp; + u64 eqx_tp; + u64 eqx_ssba; + u64 eqx_psba; + +/* 0x40 */ + u64 eqx_cec; + u64 eqx_meql; + u64 eqx_xisbi; + u64 eqx_xisc; +/* 0x60 */ + u64 eqx_it; + +}; + +#define EQTEMM_OFFSET(x) offsetof(struct hipz_eqtemm, x) + +/* access control defines for MR/MW */ +#define HIPZ_ACCESSCTRL_L_WRITE 0x00800000 +#define HIPZ_ACCESSCTRL_R_WRITE 0x00400000 +#define HIPZ_ACCESSCTRL_R_READ 0x00200000 +#define HIPZ_ACCESSCTRL_R_ATOMIC 0x00100000 +#define HIPZ_ACCESSCTRL_MW_BIND 0x00080000 + +/* query hca response block */ +struct hipz_query_hca { + u32 cur_reliable_dg; + u32 cur_qp; + u32 cur_cq; + u32 cur_eq; + u32 cur_mr; + u32 cur_mw; + u32 cur_ee_context; + u32 cur_mcast_grp; + u32 cur_qp_attached_mcast_grp; + u32 reserved1; + u32 cur_ipv6_qp; + u32 cur_eth_qp; + u32 cur_hp_mr; + u32 reserved2[3]; + u32 max_rd_domain; + u32 max_qp; + u32 max_cq; + u32 max_eq; + u32 max_mr; + u32 max_hp_mr; + u32 max_mw; + u32 max_mrwpte; + u32 max_special_mrwpte; + u32 max_rd_ee_context; + u32 max_mcast_grp; + u32 max_total_mcast_qp_attach; + u32 max_mcast_qp_attach; + u32 max_raw_ipv6_qp; + u32 max_raw_ethy_qp; + u32 internal_clock_frequency; + u32 max_pd; + u32 max_ah; + u32 max_cqe; + u32 max_wqes_wq; + u32 max_partitions; + u32 max_rr_ee_context; + u32 max_rr_qp; + u32 max_rr_hca; + u32 max_act_wqs_ee_context; + u32 max_act_wqs_qp; + u32 max_sge; + u32 max_sge_rd; + u32 memory_page_size_supported; + u64 max_mr_size; + u32 local_ca_ack_delay; + u32 num_ports; + u32 vendor_id; + u32 vendor_part_id; + u32 hw_ver; + u64 node_guid; + u64 hca_cap_indicators; + u32 data_counter_register_size; + u32 max_shared_rq; + u32 max_isns_eq; + u32 max_neq; +} __attribute__ ((packed)); + +#define HCA_CAP_AH_PORT_NR_CHECK EHCA_BMASK_IBM( 0, 0) +#define HCA_CAP_ATOMIC EHCA_BMASK_IBM( 1, 1) +#define HCA_CAP_AUTO_PATH_MIG EHCA_BMASK_IBM( 2, 2) +#define HCA_CAP_BAD_P_KEY_CTR EHCA_BMASK_IBM( 3, 3) +#define HCA_CAP_SQD_RTS_PORT_CHANGE EHCA_BMASK_IBM( 4, 4) +#define HCA_CAP_CUR_QP_STATE_MOD EHCA_BMASK_IBM( 5, 5) +#define HCA_CAP_INIT_TYPE EHCA_BMASK_IBM( 6, 6) +#define HCA_CAP_PORT_ACTIVE_EVENT EHCA_BMASK_IBM( 7, 7) +#define HCA_CAP_Q_KEY_VIOL_CTR EHCA_BMASK_IBM( 8, 8) +#define HCA_CAP_WQE_RESIZE EHCA_BMASK_IBM( 9, 9) +#define HCA_CAP_RAW_PACKET_MCAST EHCA_BMASK_IBM(10, 10) +#define HCA_CAP_SHUTDOWN_PORT EHCA_BMASK_IBM(11, 11) +#define HCA_CAP_RC_LL_QP EHCA_BMASK_IBM(12, 12) +#define HCA_CAP_SRQ EHCA_BMASK_IBM(13, 13) +#define HCA_CAP_UD_LL_QP EHCA_BMASK_IBM(16, 16) +#define HCA_CAP_RESIZE_MR EHCA_BMASK_IBM(17, 17) +#define HCA_CAP_MINI_QP EHCA_BMASK_IBM(18, 18) +#define HCA_CAP_H_ALLOC_RES_SYNC EHCA_BMASK_IBM(19, 19) + +/* query port response block */ +struct hipz_query_port { + u32 state; + u32 bad_pkey_cntr; + u32 lmc; + u32 lid; + u32 subnet_timeout; + u32 qkey_viol_cntr; + u32 sm_sl; + u32 sm_lid; + u32 capability_mask; + u32 init_type_reply; + u32 pkey_tbl_len; + u32 gid_tbl_len; + u64 gid_prefix; + u32 port_nr; + u16 pkey_entries[16]; + u8 reserved1[32]; + u32 trent_size; + u32 trbuf_size; + u64 max_msg_sz; + u32 max_mtu; + u32 vl_cap; + u32 phys_pstate; + u32 phys_state; + u32 phys_speed; + u32 phys_width; + u8 reserved2[1884]; + u64 guid_entries[255]; +} __attribute__ ((packed)); + +#endif diff --git a/drivers/staging/rdma/ehca/ipz_pt_fn.c b/drivers/staging/rdma/ehca/ipz_pt_fn.c new file mode 100644 index 000000000..7ffc748cb --- /dev/null +++ b/drivers/staging/rdma/ehca/ipz_pt_fn.c @@ -0,0 +1,289 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * internal queue handling + * + * Authors: Waleri Fomin + * Reinhard Ernst + * Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "ehca_tools.h" +#include "ipz_pt_fn.h" +#include "ehca_classes.h" + +#define PAGES_PER_KPAGE (PAGE_SIZE >> EHCA_PAGESHIFT) + +struct kmem_cache *small_qp_cache; + +void *ipz_qpageit_get_inc(struct ipz_queue *queue) +{ + void *ret = ipz_qeit_get(queue); + queue->current_q_offset += queue->pagesize; + if (queue->current_q_offset > queue->queue_length) { + queue->current_q_offset -= queue->pagesize; + ret = NULL; + } + if (((u64)ret) % queue->pagesize) { + ehca_gen_err("ERROR!! not at PAGE-Boundary"); + return NULL; + } + return ret; +} + +void *ipz_qeit_eq_get_inc(struct ipz_queue *queue) +{ + void *ret = ipz_qeit_get(queue); + u64 last_entry_in_q = queue->queue_length - queue->qe_size; + + queue->current_q_offset += queue->qe_size; + if (queue->current_q_offset > last_entry_in_q) { + queue->current_q_offset = 0; + queue->toggle_state = (~queue->toggle_state) & 1; + } + + return ret; +} + +int ipz_queue_abs_to_offset(struct ipz_queue *queue, u64 addr, u64 *q_offset) +{ + int i; + for (i = 0; i < queue->queue_length / queue->pagesize; i++) { + u64 page = __pa(queue->queue_pages[i]); + if (addr >= page && addr < page + queue->pagesize) { + *q_offset = addr - page + i * queue->pagesize; + return 0; + } + } + return -EINVAL; +} + +#if PAGE_SHIFT < EHCA_PAGESHIFT +#error Kernel pages must be at least as large than eHCA pages (4K) ! +#endif + +/* + * allocate pages for queue: + * outer loop allocates whole kernel pages (page aligned) and + * inner loop divides a kernel page into smaller hca queue pages + */ +static int alloc_queue_pages(struct ipz_queue *queue, const u32 nr_of_pages) +{ + int k, f = 0; + u8 *kpage; + + while (f < nr_of_pages) { + kpage = (u8 *)get_zeroed_page(GFP_KERNEL); + if (!kpage) + goto out; + + for (k = 0; k < PAGES_PER_KPAGE && f < nr_of_pages; k++) { + queue->queue_pages[f] = (struct ipz_page *)kpage; + kpage += EHCA_PAGESIZE; + f++; + } + } + return 1; + +out: + for (f = 0; f < nr_of_pages && queue->queue_pages[f]; + f += PAGES_PER_KPAGE) + free_page((unsigned long)(queue->queue_pages)[f]); + return 0; +} + +static int alloc_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd) +{ + int order = ilog2(queue->pagesize) - 9; + struct ipz_small_queue_page *page; + unsigned long bit; + + mutex_lock(&pd->lock); + + if (!list_empty(&pd->free[order])) + page = list_entry(pd->free[order].next, + struct ipz_small_queue_page, list); + else { + page = kmem_cache_zalloc(small_qp_cache, GFP_KERNEL); + if (!page) + goto out; + + page->page = get_zeroed_page(GFP_KERNEL); + if (!page->page) { + kmem_cache_free(small_qp_cache, page); + goto out; + } + + list_add(&page->list, &pd->free[order]); + } + + bit = find_first_zero_bit(page->bitmap, IPZ_SPAGE_PER_KPAGE >> order); + __set_bit(bit, page->bitmap); + page->fill++; + + if (page->fill == IPZ_SPAGE_PER_KPAGE >> order) + list_move(&page->list, &pd->full[order]); + + mutex_unlock(&pd->lock); + + queue->queue_pages[0] = (void *)(page->page | (bit << (order + 9))); + queue->small_page = page; + queue->offset = bit << (order + 9); + return 1; + +out: + ehca_err(pd->ib_pd.device, "failed to allocate small queue page"); + mutex_unlock(&pd->lock); + return 0; +} + +static void free_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd) +{ + int order = ilog2(queue->pagesize) - 9; + struct ipz_small_queue_page *page = queue->small_page; + unsigned long bit; + int free_page = 0; + + bit = ((unsigned long)queue->queue_pages[0] & ~PAGE_MASK) + >> (order + 9); + + mutex_lock(&pd->lock); + + __clear_bit(bit, page->bitmap); + page->fill--; + + if (page->fill == 0) { + list_del(&page->list); + free_page = 1; + } + + if (page->fill == (IPZ_SPAGE_PER_KPAGE >> order) - 1) + /* the page was full until we freed the chunk */ + list_move_tail(&page->list, &pd->free[order]); + + mutex_unlock(&pd->lock); + + if (free_page) { + free_page(page->page); + kmem_cache_free(small_qp_cache, page); + } +} + +int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue, + const u32 nr_of_pages, const u32 pagesize, + const u32 qe_size, const u32 nr_of_sg, + int is_small) +{ + if (pagesize > PAGE_SIZE) { + ehca_gen_err("FATAL ERROR: pagesize=%x " + "is greater than kernel page size", pagesize); + return 0; + } + + /* init queue fields */ + queue->queue_length = nr_of_pages * pagesize; + queue->pagesize = pagesize; + queue->qe_size = qe_size; + queue->act_nr_of_sg = nr_of_sg; + queue->current_q_offset = 0; + queue->toggle_state = 1; + queue->small_page = NULL; + + /* allocate queue page pointers */ + queue->queue_pages = kzalloc(nr_of_pages * sizeof(void *), + GFP_KERNEL | __GFP_NOWARN); + if (!queue->queue_pages) { + queue->queue_pages = vzalloc(nr_of_pages * sizeof(void *)); + if (!queue->queue_pages) { + ehca_gen_err("Couldn't allocate queue page list"); + return 0; + } + } + + /* allocate actual queue pages */ + if (is_small) { + if (!alloc_small_queue_page(queue, pd)) + goto ipz_queue_ctor_exit0; + } else + if (!alloc_queue_pages(queue, nr_of_pages)) + goto ipz_queue_ctor_exit0; + + return 1; + +ipz_queue_ctor_exit0: + ehca_gen_err("Couldn't alloc pages queue=%p " + "nr_of_pages=%x", queue, nr_of_pages); + kvfree(queue->queue_pages); + + return 0; +} + +int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue) +{ + int i, nr_pages; + + if (!queue || !queue->queue_pages) { + ehca_gen_dbg("queue or queue_pages is NULL"); + return 0; + } + + if (queue->small_page) + free_small_queue_page(queue, pd); + else { + nr_pages = queue->queue_length / queue->pagesize; + for (i = 0; i < nr_pages; i += PAGES_PER_KPAGE) + free_page((unsigned long)queue->queue_pages[i]); + } + + kvfree(queue->queue_pages); + + return 1; +} + +int ehca_init_small_qp_cache(void) +{ + small_qp_cache = kmem_cache_create("ehca_cache_small_qp", + sizeof(struct ipz_small_queue_page), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!small_qp_cache) + return -ENOMEM; + + return 0; +} + +void ehca_cleanup_small_qp_cache(void) +{ + kmem_cache_destroy(small_qp_cache); +} diff --git a/drivers/staging/rdma/ehca/ipz_pt_fn.h b/drivers/staging/rdma/ehca/ipz_pt_fn.h new file mode 100644 index 000000000..a801274ea --- /dev/null +++ b/drivers/staging/rdma/ehca/ipz_pt_fn.h @@ -0,0 +1,289 @@ +/* + * IBM eServer eHCA Infiniband device driver for Linux on POWER + * + * internal queue handling + * + * Authors: Waleri Fomin + * Reinhard Ernst + * Christoph Raisch + * + * Copyright (c) 2005 IBM Corporation + * + * All rights reserved. + * + * This source code is distributed under a dual license of GPL v2.0 and OpenIB + * BSD. + * + * OpenIB BSD License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials + * provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __IPZ_PT_FN_H__ +#define __IPZ_PT_FN_H__ + +#define EHCA_PAGESHIFT 12 +#define EHCA_PAGESIZE 4096UL +#define EHCA_PAGEMASK (~(EHCA_PAGESIZE-1)) +#define EHCA_PT_ENTRIES 512UL + +#include "ehca_tools.h" +#include "ehca_qes.h" + +struct ehca_pd; +struct ipz_small_queue_page; + +extern struct kmem_cache *small_qp_cache; + +/* struct generic ehca page */ +struct ipz_page { + u8 entries[EHCA_PAGESIZE]; +}; + +#define IPZ_SPAGE_PER_KPAGE (PAGE_SIZE / 512) + +struct ipz_small_queue_page { + unsigned long page; + unsigned long bitmap[IPZ_SPAGE_PER_KPAGE / BITS_PER_LONG]; + int fill; + void *mapped_addr; + u32 mmap_count; + struct list_head list; +}; + +/* struct generic queue in linux kernel virtual memory (kv) */ +struct ipz_queue { + u64 current_q_offset; /* current queue entry */ + + struct ipz_page **queue_pages; /* array of pages belonging to queue */ + u32 qe_size; /* queue entry size */ + u32 act_nr_of_sg; + u32 queue_length; /* queue length allocated in bytes */ + u32 pagesize; + u32 toggle_state; /* toggle flag - per page */ + u32 offset; /* save offset within page for small_qp */ + struct ipz_small_queue_page *small_page; +}; + +/* + * return current Queue Entry for a certain q_offset + * returns address (kv) of Queue Entry + */ +static inline void *ipz_qeit_calc(struct ipz_queue *queue, u64 q_offset) +{ + struct ipz_page *current_page; + if (q_offset >= queue->queue_length) + return NULL; + current_page = (queue->queue_pages)[q_offset >> EHCA_PAGESHIFT]; + return ¤t_page->entries[q_offset & (EHCA_PAGESIZE - 1)]; +} + +/* + * return current Queue Entry + * returns address (kv) of Queue Entry + */ +static inline void *ipz_qeit_get(struct ipz_queue *queue) +{ + return ipz_qeit_calc(queue, queue->current_q_offset); +} + +/* + * return current Queue Page , increment Queue Page iterator from + * page to page in struct ipz_queue, last increment will return 0! and + * NOT wrap + * returns address (kv) of Queue Page + * warning don't use in parallel with ipz_QE_get_inc() + */ +void *ipz_qpageit_get_inc(struct ipz_queue *queue); + +/* + * return current Queue Entry, increment Queue Entry iterator by one + * step in struct ipz_queue, will wrap in ringbuffer + * returns address (kv) of Queue Entry BEFORE increment + * warning don't use in parallel with ipz_qpageit_get_inc() + */ +static inline void *ipz_qeit_get_inc(struct ipz_queue *queue) +{ + void *ret = ipz_qeit_get(queue); + queue->current_q_offset += queue->qe_size; + if (queue->current_q_offset >= queue->queue_length) { + queue->current_q_offset = 0; + /* toggle the valid flag */ + queue->toggle_state = (~queue->toggle_state) & 1; + } + + return ret; +} + +/* + * return a bool indicating whether current Queue Entry is valid + */ +static inline int ipz_qeit_is_valid(struct ipz_queue *queue) +{ + struct ehca_cqe *cqe = ipz_qeit_get(queue); + return ((cqe->cqe_flags >> 7) == (queue->toggle_state & 1)); +} + +/* + * return current Queue Entry, increment Queue Entry iterator by one + * step in struct ipz_queue, will wrap in ringbuffer + * returns address (kv) of Queue Entry BEFORE increment + * returns 0 and does not increment, if wrong valid state + * warning don't use in parallel with ipz_qpageit_get_inc() + */ +static inline void *ipz_qeit_get_inc_valid(struct ipz_queue *queue) +{ + return ipz_qeit_is_valid(queue) ? ipz_qeit_get_inc(queue) : NULL; +} + +/* + * returns and resets Queue Entry iterator + * returns address (kv) of first Queue Entry + */ +static inline void *ipz_qeit_reset(struct ipz_queue *queue) +{ + queue->current_q_offset = 0; + return ipz_qeit_get(queue); +} + +/* + * return the q_offset corresponding to an absolute address + */ +int ipz_queue_abs_to_offset(struct ipz_queue *queue, u64 addr, u64 *q_offset); + +/* + * return the next queue offset. don't modify the queue. + */ +static inline u64 ipz_queue_advance_offset(struct ipz_queue *queue, u64 offset) +{ + offset += queue->qe_size; + if (offset >= queue->queue_length) offset = 0; + return offset; +} + +/* struct generic page table */ +struct ipz_pt { + u64 entries[EHCA_PT_ENTRIES]; +}; + +/* struct page table for a queue, only to be used in pf */ +struct ipz_qpt { + /* queue page tables (kv), use u64 because we know the element length */ + u64 *qpts; + u32 n_qpts; + u32 n_ptes; /* number of page table entries */ + u64 *current_pte_addr; +}; + +/* + * constructor for a ipz_queue_t, placement new for ipz_queue_t, + * new for all dependent datastructors + * all QP Tables are the same + * flow: + * allocate+pin queue + * see ipz_qpt_ctor() + * returns true if ok, false if out of memory + */ +int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue, + const u32 nr_of_pages, const u32 pagesize, + const u32 qe_size, const u32 nr_of_sg, + int is_small); + +/* + * destructor for a ipz_queue_t + * -# free queue + * see ipz_queue_ctor() + * returns true if ok, false if queue was NULL-ptr of free failed + */ +int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue); + +/* + * constructor for a ipz_qpt_t, + * placement new for struct ipz_queue, new for all dependent datastructors + * all QP Tables are the same, + * flow: + * -# allocate+pin queue + * -# initialise ptcb + * -# allocate+pin PTs + * -# link PTs to a ring, according to HCA Arch, set bit62 id needed + * -# the ring must have room for exactly nr_of_PTEs + * see ipz_qpt_ctor() + */ +void ipz_qpt_ctor(struct ipz_qpt *qpt, + const u32 nr_of_qes, + const u32 pagesize, + const u32 qe_size, + const u8 lowbyte, const u8 toggle, + u32 * act_nr_of_QEs, u32 * act_nr_of_pages); + +/* + * return current Queue Entry, increment Queue Entry iterator by one + * step in struct ipz_queue, will wrap in ringbuffer + * returns address (kv) of Queue Entry BEFORE increment + * warning don't use in parallel with ipz_qpageit_get_inc() + * warning unpredictable results may occur if steps>act_nr_of_queue_entries + * fix EQ page problems + */ +void *ipz_qeit_eq_get_inc(struct ipz_queue *queue); + +/* + * return current Event Queue Entry, increment Queue Entry iterator + * by one step in struct ipz_queue if valid, will wrap in ringbuffer + * returns address (kv) of Queue Entry BEFORE increment + * returns 0 and does not increment, if wrong valid state + * warning don't use in parallel with ipz_queue_QPageit_get_inc() + * warning unpredictable results may occur if steps>act_nr_of_queue_entries + */ +static inline void *ipz_eqit_eq_get_inc_valid(struct ipz_queue *queue) +{ + void *ret = ipz_qeit_get(queue); + u32 qe = *(u8 *)ret; + if ((qe >> 7) != (queue->toggle_state & 1)) + return NULL; + ipz_qeit_eq_get_inc(queue); /* this is a good one */ + return ret; +} + +static inline void *ipz_eqit_eq_peek_valid(struct ipz_queue *queue) +{ + void *ret = ipz_qeit_get(queue); + u32 qe = *(u8 *)ret; + if ((qe >> 7) != (queue->toggle_state & 1)) + return NULL; + return ret; +} + +/* returns address (GX) of first queue entry */ +static inline u64 ipz_qpt_get_firstpage(struct ipz_qpt *qpt) +{ + return be64_to_cpu(qpt->qpts[0]); +} + +/* returns address (kv) of first page of queue page table */ +static inline void *ipz_qpt_get_qpt(struct ipz_qpt *qpt) +{ + return qpt->qpts; +} + +#endif /* __IPZ_PT_FN_H__ */ diff --git a/drivers/staging/rdma/hfi1/Kconfig b/drivers/staging/rdma/hfi1/Kconfig new file mode 100644 index 000000000..fd25078ee --- /dev/null +++ b/drivers/staging/rdma/hfi1/Kconfig @@ -0,0 +1,37 @@ +config INFINIBAND_HFI1 + tristate "Intel OPA Gen1 support" + depends on X86_64 + default m + ---help--- + This is a low-level driver for Intel OPA Gen1 adapter. +config HFI1_DEBUG_SDMA_ORDER + bool "HFI1 SDMA Order debug" + depends on INFINIBAND_HFI1 + default n + ---help--- + This is a debug flag to test for out of order + sdma completions for unit testing +config HFI1_VERBS_31BIT_PSN + bool "HFI1 enable 31 bit PSN" + depends on INFINIBAND_HFI1 + default y + ---help--- + Setting this enables 31 BIT PSN + For verbs RC/UC +config SDMA_VERBOSITY + bool "Config SDMA Verbosity" + depends on INFINIBAND_HFI1 + default n + ---help--- + This is a configuration flag to enable verbose + SDMA debug +config PRESCAN_RXQ + bool "Enable prescanning of the RX queue for ECNs" + depends on INFINIBAND_HFI1 + default n + ---help--- + This option toggles the prescanning of the receive queue for + Explicit Congestion Notifications. If an ECN is detected, it + is processed as quickly as possible, the ECN is toggled off. + After the prescanning step, the receive queue is processed as + usual. diff --git a/drivers/staging/rdma/hfi1/Makefile b/drivers/staging/rdma/hfi1/Makefile new file mode 100644 index 000000000..2e5daa6cd --- /dev/null +++ b/drivers/staging/rdma/hfi1/Makefile @@ -0,0 +1,19 @@ +# +# HFI driver +# +# +# +# Called from the kernel module build system. +# +obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o + +hfi1-y := chip.o cq.o device.o diag.o dma.o driver.o eprom.o file_ops.o firmware.o \ + init.o intr.o keys.o mad.o mmap.o mr.o pcie.o pio.o pio_copy.o \ + qp.o qsfp.o rc.o ruc.o sdma.o srq.o sysfs.o trace.o twsi.o \ + uc.o ud.o user_pages.o user_sdma.o verbs_mcast.o verbs.o +hfi1-$(CONFIG_DEBUG_FS) += debugfs.o + +CFLAGS_trace.o = -I$(src) +ifdef MVERSION +CFLAGS_driver.o = -DHFI_DRIVER_VERSION_BASE=\"$(MVERSION)\" +endif diff --git a/drivers/staging/rdma/hfi1/TODO b/drivers/staging/rdma/hfi1/TODO new file mode 100644 index 000000000..05de0dad8 --- /dev/null +++ b/drivers/staging/rdma/hfi1/TODO @@ -0,0 +1,6 @@ +July, 2015 + +- Remove unneeded file entries in sysfs +- Remove software processing of IB protocol and place in library for use + by qib, ipath (if still present), hfi1, and eventually soft-roce + diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c new file mode 100644 index 000000000..aa58e597d --- /dev/null +++ b/drivers/staging/rdma/hfi1/chip.c @@ -0,0 +1,10798 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains all of the code that is specific to the HFI chip + */ + +#include +#include +#include +#include + +#include "hfi.h" +#include "trace.h" +#include "mad.h" +#include "pio.h" +#include "sdma.h" +#include "eprom.h" + +#define NUM_IB_PORTS 1 + +uint kdeth_qp; +module_param_named(kdeth_qp, kdeth_qp, uint, S_IRUGO); +MODULE_PARM_DESC(kdeth_qp, "Set the KDETH queue pair prefix"); + +uint num_vls = HFI1_MAX_VLS_SUPPORTED; +module_param(num_vls, uint, S_IRUGO); +MODULE_PARM_DESC(num_vls, "Set number of Virtual Lanes to use (1-8)"); + +/* + * Default time to aggregate two 10K packets from the idle state + * (timer not running). The timer starts at the end of the first packet, + * so only the time for one 10K packet and header plus a bit extra is needed. + * 10 * 1024 + 64 header byte = 10304 byte + * 10304 byte / 12.5 GB/s = 824.32ns + */ +uint rcv_intr_timeout = (824 + 16); /* 16 is for coalescing interrupt */ +module_param(rcv_intr_timeout, uint, S_IRUGO); +MODULE_PARM_DESC(rcv_intr_timeout, "Receive interrupt mitigation timeout in ns"); + +uint rcv_intr_count = 16; /* same as qib */ +module_param(rcv_intr_count, uint, S_IRUGO); +MODULE_PARM_DESC(rcv_intr_count, "Receive interrupt mitigation count"); + +ushort link_crc_mask = SUPPORTED_CRCS; +module_param(link_crc_mask, ushort, S_IRUGO); +MODULE_PARM_DESC(link_crc_mask, "CRCs to use on the link"); + +uint loopback; +module_param_named(loopback, loopback, uint, S_IRUGO); +MODULE_PARM_DESC(loopback, "Put into loopback mode (1 = serdes, 3 = external cable"); + +/* Other driver tunables */ +uint rcv_intr_dynamic = 1; /* enable dynamic mode for rcv int mitigation*/ +static ushort crc_14b_sideband = 1; +static uint use_flr = 1; +uint quick_linkup; /* skip LNI */ + +struct flag_table { + u64 flag; /* the flag */ + char *str; /* description string */ + u16 extra; /* extra information */ + u16 unused0; + u32 unused1; +}; + +/* str must be a string constant */ +#define FLAG_ENTRY(str, extra, flag) {flag, str, extra} +#define FLAG_ENTRY0(str, flag) {flag, str, 0} + +/* Send Error Consequences */ +#define SEC_WRITE_DROPPED 0x1 +#define SEC_PACKET_DROPPED 0x2 +#define SEC_SC_HALTED 0x4 /* per-context only */ +#define SEC_SPC_FREEZE 0x8 /* per-HFI only */ + +#define VL15CTXT 1 +#define MIN_KERNEL_KCTXTS 2 +#define NUM_MAP_REGS 32 + +/* Bit offset into the GUID which carries HFI id information */ +#define GUID_HFI_INDEX_SHIFT 39 + +/* extract the emulation revision */ +#define emulator_rev(dd) ((dd)->irev >> 8) +/* parallel and serial emulation versions are 3 and 4 respectively */ +#define is_emulator_p(dd) ((((dd)->irev) & 0xf) == 3) +#define is_emulator_s(dd) ((((dd)->irev) & 0xf) == 4) + +/* RSM fields */ + +/* packet type */ +#define IB_PACKET_TYPE 2ull +#define QW_SHIFT 6ull +/* QPN[7..1] */ +#define QPN_WIDTH 7ull + +/* LRH.BTH: QW 0, OFFSET 48 - for match */ +#define LRH_BTH_QW 0ull +#define LRH_BTH_BIT_OFFSET 48ull +#define LRH_BTH_OFFSET(off) ((LRH_BTH_QW << QW_SHIFT) | (off)) +#define LRH_BTH_MATCH_OFFSET LRH_BTH_OFFSET(LRH_BTH_BIT_OFFSET) +#define LRH_BTH_SELECT +#define LRH_BTH_MASK 3ull +#define LRH_BTH_VALUE 2ull + +/* LRH.SC[3..0] QW 0, OFFSET 56 - for match */ +#define LRH_SC_QW 0ull +#define LRH_SC_BIT_OFFSET 56ull +#define LRH_SC_OFFSET(off) ((LRH_SC_QW << QW_SHIFT) | (off)) +#define LRH_SC_MATCH_OFFSET LRH_SC_OFFSET(LRH_SC_BIT_OFFSET) +#define LRH_SC_MASK 128ull +#define LRH_SC_VALUE 0ull + +/* SC[n..0] QW 0, OFFSET 60 - for select */ +#define LRH_SC_SELECT_OFFSET ((LRH_SC_QW << QW_SHIFT) | (60ull)) + +/* QPN[m+n:1] QW 1, OFFSET 1 */ +#define QPN_SELECT_OFFSET ((1ull << QW_SHIFT) | (1ull)) + +/* defines to build power on SC2VL table */ +#define SC2VL_VAL( \ + num, \ + sc0, sc0val, \ + sc1, sc1val, \ + sc2, sc2val, \ + sc3, sc3val, \ + sc4, sc4val, \ + sc5, sc5val, \ + sc6, sc6val, \ + sc7, sc7val) \ +( \ + ((u64)(sc0val) << SEND_SC2VLT##num##_SC##sc0##_SHIFT) | \ + ((u64)(sc1val) << SEND_SC2VLT##num##_SC##sc1##_SHIFT) | \ + ((u64)(sc2val) << SEND_SC2VLT##num##_SC##sc2##_SHIFT) | \ + ((u64)(sc3val) << SEND_SC2VLT##num##_SC##sc3##_SHIFT) | \ + ((u64)(sc4val) << SEND_SC2VLT##num##_SC##sc4##_SHIFT) | \ + ((u64)(sc5val) << SEND_SC2VLT##num##_SC##sc5##_SHIFT) | \ + ((u64)(sc6val) << SEND_SC2VLT##num##_SC##sc6##_SHIFT) | \ + ((u64)(sc7val) << SEND_SC2VLT##num##_SC##sc7##_SHIFT) \ +) + +#define DC_SC_VL_VAL( \ + range, \ + e0, e0val, \ + e1, e1val, \ + e2, e2val, \ + e3, e3val, \ + e4, e4val, \ + e5, e5val, \ + e6, e6val, \ + e7, e7val, \ + e8, e8val, \ + e9, e9val, \ + e10, e10val, \ + e11, e11val, \ + e12, e12val, \ + e13, e13val, \ + e14, e14val, \ + e15, e15val) \ +( \ + ((u64)(e0val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e0##_SHIFT) | \ + ((u64)(e1val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e1##_SHIFT) | \ + ((u64)(e2val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e2##_SHIFT) | \ + ((u64)(e3val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e3##_SHIFT) | \ + ((u64)(e4val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e4##_SHIFT) | \ + ((u64)(e5val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e5##_SHIFT) | \ + ((u64)(e6val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e6##_SHIFT) | \ + ((u64)(e7val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e7##_SHIFT) | \ + ((u64)(e8val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e8##_SHIFT) | \ + ((u64)(e9val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e9##_SHIFT) | \ + ((u64)(e10val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e10##_SHIFT) | \ + ((u64)(e11val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e11##_SHIFT) | \ + ((u64)(e12val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e12##_SHIFT) | \ + ((u64)(e13val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e13##_SHIFT) | \ + ((u64)(e14val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e14##_SHIFT) | \ + ((u64)(e15val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e15##_SHIFT) \ +) + +/* all CceStatus sub-block freeze bits */ +#define ALL_FROZE (CCE_STATUS_SDMA_FROZE_SMASK \ + | CCE_STATUS_RXE_FROZE_SMASK \ + | CCE_STATUS_TXE_FROZE_SMASK \ + | CCE_STATUS_TXE_PIO_FROZE_SMASK) +/* all CceStatus sub-block TXE pause bits */ +#define ALL_TXE_PAUSE (CCE_STATUS_TXE_PIO_PAUSED_SMASK \ + | CCE_STATUS_TXE_PAUSED_SMASK \ + | CCE_STATUS_SDMA_PAUSED_SMASK) +/* all CceStatus sub-block RXE pause bits */ +#define ALL_RXE_PAUSE CCE_STATUS_RXE_PAUSED_SMASK + +/* + * CCE Error flags. + */ +static struct flag_table cce_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("CceCsrParityErr", + CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK), +/* 1*/ FLAG_ENTRY0("CceCsrReadBadAddrErr", + CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK), +/* 2*/ FLAG_ENTRY0("CceCsrWriteBadAddrErr", + CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK), +/* 3*/ FLAG_ENTRY0("CceTrgtAsyncFifoParityErr", + CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK), +/* 4*/ FLAG_ENTRY0("CceTrgtAccessErr", + CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK), +/* 5*/ FLAG_ENTRY0("CceRspdDataParityErr", + CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK), +/* 6*/ FLAG_ENTRY0("CceCli0AsyncFifoParityErr", + CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK), +/* 7*/ FLAG_ENTRY0("CceCsrCfgBusParityErr", + CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK), +/* 8*/ FLAG_ENTRY0("CceCli2AsyncFifoParityErr", + CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK), +/* 9*/ FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr", + CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK), +/*10*/ FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr", + CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK), +/*11*/ FLAG_ENTRY0("CceCli1AsyncFifoRxdmaParityError", + CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK), +/*12*/ FLAG_ENTRY0("CceCli1AsyncFifoDbgParityError", + CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK), +/*13*/ FLAG_ENTRY0("PcicRetryMemCorErr", + CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK), +/*14*/ FLAG_ENTRY0("PcicRetryMemCorErr", + CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK), +/*15*/ FLAG_ENTRY0("PcicPostHdQCorErr", + CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK), +/*16*/ FLAG_ENTRY0("PcicPostHdQCorErr", + CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK), +/*17*/ FLAG_ENTRY0("PcicPostHdQCorErr", + CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK), +/*18*/ FLAG_ENTRY0("PcicCplDatQCorErr", + CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK), +/*19*/ FLAG_ENTRY0("PcicNPostHQParityErr", + CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK), +/*20*/ FLAG_ENTRY0("PcicNPostDatQParityErr", + CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK), +/*21*/ FLAG_ENTRY0("PcicRetryMemUncErr", + CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK), +/*22*/ FLAG_ENTRY0("PcicRetrySotMemUncErr", + CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK), +/*23*/ FLAG_ENTRY0("PcicPostHdQUncErr", + CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK), +/*24*/ FLAG_ENTRY0("PcicPostDatQUncErr", + CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK), +/*25*/ FLAG_ENTRY0("PcicCplHdQUncErr", + CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK), +/*26*/ FLAG_ENTRY0("PcicCplDatQUncErr", + CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK), +/*27*/ FLAG_ENTRY0("PcicTransmitFrontParityErr", + CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK), +/*28*/ FLAG_ENTRY0("PcicTransmitBackParityErr", + CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK), +/*29*/ FLAG_ENTRY0("PcicReceiveParityErr", + CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK), +/*30*/ FLAG_ENTRY0("CceTrgtCplTimeoutErr", + CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK), +/*31*/ FLAG_ENTRY0("LATriggered", + CCE_ERR_STATUS_LA_TRIGGERED_SMASK), +/*32*/ FLAG_ENTRY0("CceSegReadBadAddrErr", + CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK), +/*33*/ FLAG_ENTRY0("CceSegWriteBadAddrErr", + CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK), +/*34*/ FLAG_ENTRY0("CceRcplAsyncFifoParityErr", + CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK), +/*35*/ FLAG_ENTRY0("CceRxdmaConvFifoParityErr", + CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK), +/*36*/ FLAG_ENTRY0("CceMsixTableCorErr", + CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK), +/*37*/ FLAG_ENTRY0("CceMsixTableUncErr", + CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK), +/*38*/ FLAG_ENTRY0("CceIntMapCorErr", + CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK), +/*39*/ FLAG_ENTRY0("CceIntMapUncErr", + CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK), +/*40*/ FLAG_ENTRY0("CceMsixCsrParityErr", + CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK), +/*41-63 reserved*/ +}; + +/* + * Misc Error flags + */ +#define MES(text) MISC_ERR_STATUS_MISC_##text##_ERR_SMASK +static struct flag_table misc_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("CSR_PARITY", MES(CSR_PARITY)), +/* 1*/ FLAG_ENTRY0("CSR_READ_BAD_ADDR", MES(CSR_READ_BAD_ADDR)), +/* 2*/ FLAG_ENTRY0("CSR_WRITE_BAD_ADDR", MES(CSR_WRITE_BAD_ADDR)), +/* 3*/ FLAG_ENTRY0("SBUS_WRITE_FAILED", MES(SBUS_WRITE_FAILED)), +/* 4*/ FLAG_ENTRY0("KEY_MISMATCH", MES(KEY_MISMATCH)), +/* 5*/ FLAG_ENTRY0("FW_AUTH_FAILED", MES(FW_AUTH_FAILED)), +/* 6*/ FLAG_ENTRY0("EFUSE_CSR_PARITY", MES(EFUSE_CSR_PARITY)), +/* 7*/ FLAG_ENTRY0("EFUSE_READ_BAD_ADDR", MES(EFUSE_READ_BAD_ADDR)), +/* 8*/ FLAG_ENTRY0("EFUSE_WRITE", MES(EFUSE_WRITE)), +/* 9*/ FLAG_ENTRY0("EFUSE_DONE_PARITY", MES(EFUSE_DONE_PARITY)), +/*10*/ FLAG_ENTRY0("INVALID_EEP_CMD", MES(INVALID_EEP_CMD)), +/*11*/ FLAG_ENTRY0("MBIST_FAIL", MES(MBIST_FAIL)), +/*12*/ FLAG_ENTRY0("PLL_LOCK_FAIL", MES(PLL_LOCK_FAIL)) +}; + +/* + * TXE PIO Error flags and consequences + */ +static struct flag_table pio_err_status_flags[] = { +/* 0*/ FLAG_ENTRY("PioWriteBadCtxt", + SEC_WRITE_DROPPED, + SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK), +/* 1*/ FLAG_ENTRY("PioWriteAddrParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK), +/* 2*/ FLAG_ENTRY("PioCsrParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK), +/* 3*/ FLAG_ENTRY("PioSbMemFifo0", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK), +/* 4*/ FLAG_ENTRY("PioSbMemFifo1", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK), +/* 5*/ FLAG_ENTRY("PioPccFifoParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK), +/* 6*/ FLAG_ENTRY("PioPecFifoParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK), +/* 7*/ FLAG_ENTRY("PioSbrdctlCrrelParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK), +/* 8*/ FLAG_ENTRY("PioSbrdctrlCrrelFifoParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK), +/* 9*/ FLAG_ENTRY("PioPktEvictFifoParityErr", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK), +/*10*/ FLAG_ENTRY("PioSmPktResetParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK), +/*11*/ FLAG_ENTRY("PioVlLenMemBank0Unc", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK), +/*12*/ FLAG_ENTRY("PioVlLenMemBank1Unc", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK), +/*13*/ FLAG_ENTRY("PioVlLenMemBank0Cor", + 0, + SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK), +/*14*/ FLAG_ENTRY("PioVlLenMemBank1Cor", + 0, + SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK), +/*15*/ FLAG_ENTRY("PioCreditRetFifoParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK), +/*16*/ FLAG_ENTRY("PioPpmcPblFifo", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK), +/*17*/ FLAG_ENTRY("PioInitSmIn", + 0, + SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK), +/*18*/ FLAG_ENTRY("PioPktEvictSmOrArbSm", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK), +/*19*/ FLAG_ENTRY("PioHostAddrMemUnc", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK), +/*20*/ FLAG_ENTRY("PioHostAddrMemCor", + 0, + SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK), +/*21*/ FLAG_ENTRY("PioWriteDataParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK), +/*22*/ FLAG_ENTRY("PioStateMachine", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK), +/*23*/ FLAG_ENTRY("PioWriteQwValidParity", + SEC_WRITE_DROPPED|SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK), +/*24*/ FLAG_ENTRY("PioBlockQwCountParity", + SEC_WRITE_DROPPED|SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK), +/*25*/ FLAG_ENTRY("PioVlfVlLenParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK), +/*26*/ FLAG_ENTRY("PioVlfSopParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK), +/*27*/ FLAG_ENTRY("PioVlFifoParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK), +/*28*/ FLAG_ENTRY("PioPpmcBqcMemParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK), +/*29*/ FLAG_ENTRY("PioPpmcSopLen", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK), +/*30-31 reserved*/ +/*32*/ FLAG_ENTRY("PioCurrentFreeCntParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK), +/*33*/ FLAG_ENTRY("PioLastReturnedCntParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK), +/*34*/ FLAG_ENTRY("PioPccSopHeadParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK), +/*35*/ FLAG_ENTRY("PioPecSopHeadParityErr", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK), +/*36-63 reserved*/ +}; + +/* TXE PIO errors that cause an SPC freeze */ +#define ALL_PIO_FREEZE_ERR \ + (SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK) + +/* + * TXE SDMA Error flags + */ +static struct flag_table sdma_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("SDmaRpyTagErr", + SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK), +/* 1*/ FLAG_ENTRY0("SDmaCsrParityErr", + SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK), +/* 2*/ FLAG_ENTRY0("SDmaPcieReqTrackingUncErr", + SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK), +/* 3*/ FLAG_ENTRY0("SDmaPcieReqTrackingCorErr", + SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK), +/*04-63 reserved*/ +}; + +/* TXE SDMA errors that cause an SPC freeze */ +#define ALL_SDMA_FREEZE_ERR \ + (SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK \ + | SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK \ + | SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK) + +/* + * TXE Egress Error flags + */ +#define SEES(text) SEND_EGRESS_ERR_STATUS_##text##_ERR_SMASK +static struct flag_table egress_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("TxPktIntegrityMemCorErr", SEES(TX_PKT_INTEGRITY_MEM_COR)), +/* 1*/ FLAG_ENTRY0("TxPktIntegrityMemUncErr", SEES(TX_PKT_INTEGRITY_MEM_UNC)), +/* 2 reserved */ +/* 3*/ FLAG_ENTRY0("TxEgressFifoUnderrunOrParityErr", + SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY)), +/* 4*/ FLAG_ENTRY0("TxLinkdownErr", SEES(TX_LINKDOWN)), +/* 5*/ FLAG_ENTRY0("TxIncorrectLinkStateErr", SEES(TX_INCORRECT_LINK_STATE)), +/* 6 reserved */ +/* 7*/ FLAG_ENTRY0("TxPioLaunchIntfParityErr", + SEES(TX_PIO_LAUNCH_INTF_PARITY)), +/* 8*/ FLAG_ENTRY0("TxSdmaLaunchIntfParityErr", + SEES(TX_SDMA_LAUNCH_INTF_PARITY)), +/* 9-10 reserved */ +/*11*/ FLAG_ENTRY0("TxSbrdCtlStateMachineParityErr", + SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY)), +/*12*/ FLAG_ENTRY0("TxIllegalVLErr", SEES(TX_ILLEGAL_VL)), +/*13*/ FLAG_ENTRY0("TxLaunchCsrParityErr", SEES(TX_LAUNCH_CSR_PARITY)), +/*14*/ FLAG_ENTRY0("TxSbrdCtlCsrParityErr", SEES(TX_SBRD_CTL_CSR_PARITY)), +/*15*/ FLAG_ENTRY0("TxConfigParityErr", SEES(TX_CONFIG_PARITY)), +/*16*/ FLAG_ENTRY0("TxSdma0DisallowedPacketErr", + SEES(TX_SDMA0_DISALLOWED_PACKET)), +/*17*/ FLAG_ENTRY0("TxSdma1DisallowedPacketErr", + SEES(TX_SDMA1_DISALLOWED_PACKET)), +/*18*/ FLAG_ENTRY0("TxSdma2DisallowedPacketErr", + SEES(TX_SDMA2_DISALLOWED_PACKET)), +/*19*/ FLAG_ENTRY0("TxSdma3DisallowedPacketErr", + SEES(TX_SDMA3_DISALLOWED_PACKET)), +/*20*/ FLAG_ENTRY0("TxSdma4DisallowedPacketErr", + SEES(TX_SDMA4_DISALLOWED_PACKET)), +/*21*/ FLAG_ENTRY0("TxSdma5DisallowedPacketErr", + SEES(TX_SDMA5_DISALLOWED_PACKET)), +/*22*/ FLAG_ENTRY0("TxSdma6DisallowedPacketErr", + SEES(TX_SDMA6_DISALLOWED_PACKET)), +/*23*/ FLAG_ENTRY0("TxSdma7DisallowedPacketErr", + SEES(TX_SDMA7_DISALLOWED_PACKET)), +/*24*/ FLAG_ENTRY0("TxSdma8DisallowedPacketErr", + SEES(TX_SDMA8_DISALLOWED_PACKET)), +/*25*/ FLAG_ENTRY0("TxSdma9DisallowedPacketErr", + SEES(TX_SDMA9_DISALLOWED_PACKET)), +/*26*/ FLAG_ENTRY0("TxSdma10DisallowedPacketErr", + SEES(TX_SDMA10_DISALLOWED_PACKET)), +/*27*/ FLAG_ENTRY0("TxSdma11DisallowedPacketErr", + SEES(TX_SDMA11_DISALLOWED_PACKET)), +/*28*/ FLAG_ENTRY0("TxSdma12DisallowedPacketErr", + SEES(TX_SDMA12_DISALLOWED_PACKET)), +/*29*/ FLAG_ENTRY0("TxSdma13DisallowedPacketErr", + SEES(TX_SDMA13_DISALLOWED_PACKET)), +/*30*/ FLAG_ENTRY0("TxSdma14DisallowedPacketErr", + SEES(TX_SDMA14_DISALLOWED_PACKET)), +/*31*/ FLAG_ENTRY0("TxSdma15DisallowedPacketErr", + SEES(TX_SDMA15_DISALLOWED_PACKET)), +/*32*/ FLAG_ENTRY0("TxLaunchFifo0UncOrParityErr", + SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY)), +/*33*/ FLAG_ENTRY0("TxLaunchFifo1UncOrParityErr", + SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY)), +/*34*/ FLAG_ENTRY0("TxLaunchFifo2UncOrParityErr", + SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY)), +/*35*/ FLAG_ENTRY0("TxLaunchFifo3UncOrParityErr", + SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY)), +/*36*/ FLAG_ENTRY0("TxLaunchFifo4UncOrParityErr", + SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY)), +/*37*/ FLAG_ENTRY0("TxLaunchFifo5UncOrParityErr", + SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY)), +/*38*/ FLAG_ENTRY0("TxLaunchFifo6UncOrParityErr", + SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY)), +/*39*/ FLAG_ENTRY0("TxLaunchFifo7UncOrParityErr", + SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY)), +/*40*/ FLAG_ENTRY0("TxLaunchFifo8UncOrParityErr", + SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY)), +/*41*/ FLAG_ENTRY0("TxCreditReturnParityErr", SEES(TX_CREDIT_RETURN_PARITY)), +/*42*/ FLAG_ENTRY0("TxSbHdrUncErr", SEES(TX_SB_HDR_UNC)), +/*43*/ FLAG_ENTRY0("TxReadSdmaMemoryUncErr", SEES(TX_READ_SDMA_MEMORY_UNC)), +/*44*/ FLAG_ENTRY0("TxReadPioMemoryUncErr", SEES(TX_READ_PIO_MEMORY_UNC)), +/*45*/ FLAG_ENTRY0("TxEgressFifoUncErr", SEES(TX_EGRESS_FIFO_UNC)), +/*46*/ FLAG_ENTRY0("TxHcrcInsertionErr", SEES(TX_HCRC_INSERTION)), +/*47*/ FLAG_ENTRY0("TxCreditReturnVLErr", SEES(TX_CREDIT_RETURN_VL)), +/*48*/ FLAG_ENTRY0("TxLaunchFifo0CorErr", SEES(TX_LAUNCH_FIFO0_COR)), +/*49*/ FLAG_ENTRY0("TxLaunchFifo1CorErr", SEES(TX_LAUNCH_FIFO1_COR)), +/*50*/ FLAG_ENTRY0("TxLaunchFifo2CorErr", SEES(TX_LAUNCH_FIFO2_COR)), +/*51*/ FLAG_ENTRY0("TxLaunchFifo3CorErr", SEES(TX_LAUNCH_FIFO3_COR)), +/*52*/ FLAG_ENTRY0("TxLaunchFifo4CorErr", SEES(TX_LAUNCH_FIFO4_COR)), +/*53*/ FLAG_ENTRY0("TxLaunchFifo5CorErr", SEES(TX_LAUNCH_FIFO5_COR)), +/*54*/ FLAG_ENTRY0("TxLaunchFifo6CorErr", SEES(TX_LAUNCH_FIFO6_COR)), +/*55*/ FLAG_ENTRY0("TxLaunchFifo7CorErr", SEES(TX_LAUNCH_FIFO7_COR)), +/*56*/ FLAG_ENTRY0("TxLaunchFifo8CorErr", SEES(TX_LAUNCH_FIFO8_COR)), +/*57*/ FLAG_ENTRY0("TxCreditOverrunErr", SEES(TX_CREDIT_OVERRUN)), +/*58*/ FLAG_ENTRY0("TxSbHdrCorErr", SEES(TX_SB_HDR_COR)), +/*59*/ FLAG_ENTRY0("TxReadSdmaMemoryCorErr", SEES(TX_READ_SDMA_MEMORY_COR)), +/*60*/ FLAG_ENTRY0("TxReadPioMemoryCorErr", SEES(TX_READ_PIO_MEMORY_COR)), +/*61*/ FLAG_ENTRY0("TxEgressFifoCorErr", SEES(TX_EGRESS_FIFO_COR)), +/*62*/ FLAG_ENTRY0("TxReadSdmaMemoryCsrUncErr", + SEES(TX_READ_SDMA_MEMORY_CSR_UNC)), +/*63*/ FLAG_ENTRY0("TxReadPioMemoryCsrUncErr", + SEES(TX_READ_PIO_MEMORY_CSR_UNC)), +}; + +/* + * TXE Egress Error Info flags + */ +#define SEEI(text) SEND_EGRESS_ERR_INFO_##text##_ERR_SMASK +static struct flag_table egress_err_info_flags[] = { +/* 0*/ FLAG_ENTRY0("Reserved", 0ull), +/* 1*/ FLAG_ENTRY0("VLErr", SEEI(VL)), +/* 2*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)), +/* 3*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)), +/* 4*/ FLAG_ENTRY0("PartitionKeyErr", SEEI(PARTITION_KEY)), +/* 5*/ FLAG_ENTRY0("SLIDErr", SEEI(SLID)), +/* 6*/ FLAG_ENTRY0("OpcodeErr", SEEI(OPCODE)), +/* 7*/ FLAG_ENTRY0("VLMappingErr", SEEI(VL_MAPPING)), +/* 8*/ FLAG_ENTRY0("RawErr", SEEI(RAW)), +/* 9*/ FLAG_ENTRY0("RawIPv6Err", SEEI(RAW_IPV6)), +/*10*/ FLAG_ENTRY0("GRHErr", SEEI(GRH)), +/*11*/ FLAG_ENTRY0("BypassErr", SEEI(BYPASS)), +/*12*/ FLAG_ENTRY0("KDETHPacketsErr", SEEI(KDETH_PACKETS)), +/*13*/ FLAG_ENTRY0("NonKDETHPacketsErr", SEEI(NON_KDETH_PACKETS)), +/*14*/ FLAG_ENTRY0("TooSmallIBPacketsErr", SEEI(TOO_SMALL_IB_PACKETS)), +/*15*/ FLAG_ENTRY0("TooSmallBypassPacketsErr", SEEI(TOO_SMALL_BYPASS_PACKETS)), +/*16*/ FLAG_ENTRY0("PbcTestErr", SEEI(PBC_TEST)), +/*17*/ FLAG_ENTRY0("BadPktLenErr", SEEI(BAD_PKT_LEN)), +/*18*/ FLAG_ENTRY0("TooLongIBPacketErr", SEEI(TOO_LONG_IB_PACKET)), +/*19*/ FLAG_ENTRY0("TooLongBypassPacketsErr", SEEI(TOO_LONG_BYPASS_PACKETS)), +/*20*/ FLAG_ENTRY0("PbcStaticRateControlErr", SEEI(PBC_STATIC_RATE_CONTROL)), +/*21*/ FLAG_ENTRY0("BypassBadPktLenErr", SEEI(BAD_PKT_LEN)), +}; + +/* TXE Egress errors that cause an SPC freeze */ +#define ALL_TXE_EGRESS_FREEZE_ERR \ + (SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY) \ + | SEES(TX_PIO_LAUNCH_INTF_PARITY) \ + | SEES(TX_SDMA_LAUNCH_INTF_PARITY) \ + | SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY) \ + | SEES(TX_LAUNCH_CSR_PARITY) \ + | SEES(TX_SBRD_CTL_CSR_PARITY) \ + | SEES(TX_CONFIG_PARITY) \ + | SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY) \ + | SEES(TX_CREDIT_RETURN_PARITY)) + +/* + * TXE Send error flags + */ +#define SES(name) SEND_ERR_STATUS_SEND_##name##_ERR_SMASK +static struct flag_table send_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("SDmaRpyTagErr", SES(CSR_PARITY)), +/* 1*/ FLAG_ENTRY0("SendCsrReadBadAddrErr", SES(CSR_READ_BAD_ADDR)), +/* 2*/ FLAG_ENTRY0("SendCsrWriteBadAddrErr", SES(CSR_WRITE_BAD_ADDR)) +}; + +/* + * TXE Send Context Error flags and consequences + */ +static struct flag_table sc_err_status_flags[] = { +/* 0*/ FLAG_ENTRY("InconsistentSop", + SEC_PACKET_DROPPED | SEC_SC_HALTED, + SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK), +/* 1*/ FLAG_ENTRY("DisallowedPacket", + SEC_PACKET_DROPPED | SEC_SC_HALTED, + SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK), +/* 2*/ FLAG_ENTRY("WriteCrossesBoundary", + SEC_WRITE_DROPPED | SEC_SC_HALTED, + SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK), +/* 3*/ FLAG_ENTRY("WriteOverflow", + SEC_WRITE_DROPPED | SEC_SC_HALTED, + SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK), +/* 4*/ FLAG_ENTRY("WriteOutOfBounds", + SEC_WRITE_DROPPED | SEC_SC_HALTED, + SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK), +/* 5-63 reserved*/ +}; + +/* + * RXE Receive Error flags + */ +#define RXES(name) RCV_ERR_STATUS_RX_##name##_ERR_SMASK +static struct flag_table rxe_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("RxDmaCsrCorErr", RXES(DMA_CSR_COR)), +/* 1*/ FLAG_ENTRY0("RxDcIntfParityErr", RXES(DC_INTF_PARITY)), +/* 2*/ FLAG_ENTRY0("RxRcvHdrUncErr", RXES(RCV_HDR_UNC)), +/* 3*/ FLAG_ENTRY0("RxRcvHdrCorErr", RXES(RCV_HDR_COR)), +/* 4*/ FLAG_ENTRY0("RxRcvDataUncErr", RXES(RCV_DATA_UNC)), +/* 5*/ FLAG_ENTRY0("RxRcvDataCorErr", RXES(RCV_DATA_COR)), +/* 6*/ FLAG_ENTRY0("RxRcvQpMapTableUncErr", RXES(RCV_QP_MAP_TABLE_UNC)), +/* 7*/ FLAG_ENTRY0("RxRcvQpMapTableCorErr", RXES(RCV_QP_MAP_TABLE_COR)), +/* 8*/ FLAG_ENTRY0("RxRcvCsrParityErr", RXES(RCV_CSR_PARITY)), +/* 9*/ FLAG_ENTRY0("RxDcSopEopParityErr", RXES(DC_SOP_EOP_PARITY)), +/*10*/ FLAG_ENTRY0("RxDmaFlagUncErr", RXES(DMA_FLAG_UNC)), +/*11*/ FLAG_ENTRY0("RxDmaFlagCorErr", RXES(DMA_FLAG_COR)), +/*12*/ FLAG_ENTRY0("RxRcvFsmEncodingErr", RXES(RCV_FSM_ENCODING)), +/*13*/ FLAG_ENTRY0("RxRbufFreeListUncErr", RXES(RBUF_FREE_LIST_UNC)), +/*14*/ FLAG_ENTRY0("RxRbufFreeListCorErr", RXES(RBUF_FREE_LIST_COR)), +/*15*/ FLAG_ENTRY0("RxRbufLookupDesRegUncErr", RXES(RBUF_LOOKUP_DES_REG_UNC)), +/*16*/ FLAG_ENTRY0("RxRbufLookupDesRegUncCorErr", + RXES(RBUF_LOOKUP_DES_REG_UNC_COR)), +/*17*/ FLAG_ENTRY0("RxRbufLookupDesUncErr", RXES(RBUF_LOOKUP_DES_UNC)), +/*18*/ FLAG_ENTRY0("RxRbufLookupDesCorErr", RXES(RBUF_LOOKUP_DES_COR)), +/*19*/ FLAG_ENTRY0("RxRbufBlockListReadUncErr", + RXES(RBUF_BLOCK_LIST_READ_UNC)), +/*20*/ FLAG_ENTRY0("RxRbufBlockListReadCorErr", + RXES(RBUF_BLOCK_LIST_READ_COR)), +/*21*/ FLAG_ENTRY0("RxRbufCsrQHeadBufNumParityErr", + RXES(RBUF_CSR_QHEAD_BUF_NUM_PARITY)), +/*22*/ FLAG_ENTRY0("RxRbufCsrQEntCntParityErr", + RXES(RBUF_CSR_QENT_CNT_PARITY)), +/*23*/ FLAG_ENTRY0("RxRbufCsrQNextBufParityErr", + RXES(RBUF_CSR_QNEXT_BUF_PARITY)), +/*24*/ FLAG_ENTRY0("RxRbufCsrQVldBitParityErr", + RXES(RBUF_CSR_QVLD_BIT_PARITY)), +/*25*/ FLAG_ENTRY0("RxRbufCsrQHdPtrParityErr", RXES(RBUF_CSR_QHD_PTR_PARITY)), +/*26*/ FLAG_ENTRY0("RxRbufCsrQTlPtrParityErr", RXES(RBUF_CSR_QTL_PTR_PARITY)), +/*27*/ FLAG_ENTRY0("RxRbufCsrQNumOfPktParityErr", + RXES(RBUF_CSR_QNUM_OF_PKT_PARITY)), +/*28*/ FLAG_ENTRY0("RxRbufCsrQEOPDWParityErr", RXES(RBUF_CSR_QEOPDW_PARITY)), +/*29*/ FLAG_ENTRY0("RxRbufCtxIdParityErr", RXES(RBUF_CTX_ID_PARITY)), +/*30*/ FLAG_ENTRY0("RxRBufBadLookupErr", RXES(RBUF_BAD_LOOKUP)), +/*31*/ FLAG_ENTRY0("RxRbufFullErr", RXES(RBUF_FULL)), +/*32*/ FLAG_ENTRY0("RxRbufEmptyErr", RXES(RBUF_EMPTY)), +/*33*/ FLAG_ENTRY0("RxRbufFlRdAddrParityErr", RXES(RBUF_FL_RD_ADDR_PARITY)), +/*34*/ FLAG_ENTRY0("RxRbufFlWrAddrParityErr", RXES(RBUF_FL_WR_ADDR_PARITY)), +/*35*/ FLAG_ENTRY0("RxRbufFlInitdoneParityErr", + RXES(RBUF_FL_INITDONE_PARITY)), +/*36*/ FLAG_ENTRY0("RxRbufFlInitWrAddrParityErr", + RXES(RBUF_FL_INIT_WR_ADDR_PARITY)), +/*37*/ FLAG_ENTRY0("RxRbufNextFreeBufUncErr", RXES(RBUF_NEXT_FREE_BUF_UNC)), +/*38*/ FLAG_ENTRY0("RxRbufNextFreeBufCorErr", RXES(RBUF_NEXT_FREE_BUF_COR)), +/*39*/ FLAG_ENTRY0("RxLookupDesPart1UncErr", RXES(LOOKUP_DES_PART1_UNC)), +/*40*/ FLAG_ENTRY0("RxLookupDesPart1UncCorErr", + RXES(LOOKUP_DES_PART1_UNC_COR)), +/*41*/ FLAG_ENTRY0("RxLookupDesPart2ParityErr", + RXES(LOOKUP_DES_PART2_PARITY)), +/*42*/ FLAG_ENTRY0("RxLookupRcvArrayUncErr", RXES(LOOKUP_RCV_ARRAY_UNC)), +/*43*/ FLAG_ENTRY0("RxLookupRcvArrayCorErr", RXES(LOOKUP_RCV_ARRAY_COR)), +/*44*/ FLAG_ENTRY0("RxLookupCsrParityErr", RXES(LOOKUP_CSR_PARITY)), +/*45*/ FLAG_ENTRY0("RxHqIntrCsrParityErr", RXES(HQ_INTR_CSR_PARITY)), +/*46*/ FLAG_ENTRY0("RxHqIntrFsmErr", RXES(HQ_INTR_FSM)), +/*47*/ FLAG_ENTRY0("RxRbufDescPart1UncErr", RXES(RBUF_DESC_PART1_UNC)), +/*48*/ FLAG_ENTRY0("RxRbufDescPart1CorErr", RXES(RBUF_DESC_PART1_COR)), +/*49*/ FLAG_ENTRY0("RxRbufDescPart2UncErr", RXES(RBUF_DESC_PART2_UNC)), +/*50*/ FLAG_ENTRY0("RxRbufDescPart2CorErr", RXES(RBUF_DESC_PART2_COR)), +/*51*/ FLAG_ENTRY0("RxDmaHdrFifoRdUncErr", RXES(DMA_HDR_FIFO_RD_UNC)), +/*52*/ FLAG_ENTRY0("RxDmaHdrFifoRdCorErr", RXES(DMA_HDR_FIFO_RD_COR)), +/*53*/ FLAG_ENTRY0("RxDmaDataFifoRdUncErr", RXES(DMA_DATA_FIFO_RD_UNC)), +/*54*/ FLAG_ENTRY0("RxDmaDataFifoRdCorErr", RXES(DMA_DATA_FIFO_RD_COR)), +/*55*/ FLAG_ENTRY0("RxRbufDataUncErr", RXES(RBUF_DATA_UNC)), +/*56*/ FLAG_ENTRY0("RxRbufDataCorErr", RXES(RBUF_DATA_COR)), +/*57*/ FLAG_ENTRY0("RxDmaCsrParityErr", RXES(DMA_CSR_PARITY)), +/*58*/ FLAG_ENTRY0("RxDmaEqFsmEncodingErr", RXES(DMA_EQ_FSM_ENCODING)), +/*59*/ FLAG_ENTRY0("RxDmaDqFsmEncodingErr", RXES(DMA_DQ_FSM_ENCODING)), +/*60*/ FLAG_ENTRY0("RxDmaCsrUncErr", RXES(DMA_CSR_UNC)), +/*61*/ FLAG_ENTRY0("RxCsrReadBadAddrErr", RXES(CSR_READ_BAD_ADDR)), +/*62*/ FLAG_ENTRY0("RxCsrWriteBadAddrErr", RXES(CSR_WRITE_BAD_ADDR)), +/*63*/ FLAG_ENTRY0("RxCsrParityErr", RXES(CSR_PARITY)) +}; + +/* RXE errors that will trigger an SPC freeze */ +#define ALL_RXE_FREEZE_ERR \ + (RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \ + | RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK) + +#define RXE_FREEZE_ABORT_MASK \ + (RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK | \ + RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK | \ + RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK) + +/* + * DCC Error Flags + */ +#define DCCE(name) DCC_ERR_FLG_##name##_SMASK +static struct flag_table dcc_err_flags[] = { + FLAG_ENTRY0("bad_l2_err", DCCE(BAD_L2_ERR)), + FLAG_ENTRY0("bad_sc_err", DCCE(BAD_SC_ERR)), + FLAG_ENTRY0("bad_mid_tail_err", DCCE(BAD_MID_TAIL_ERR)), + FLAG_ENTRY0("bad_preemption_err", DCCE(BAD_PREEMPTION_ERR)), + FLAG_ENTRY0("preemption_err", DCCE(PREEMPTION_ERR)), + FLAG_ENTRY0("preemptionvl15_err", DCCE(PREEMPTIONVL15_ERR)), + FLAG_ENTRY0("bad_vl_marker_err", DCCE(BAD_VL_MARKER_ERR)), + FLAG_ENTRY0("bad_dlid_target_err", DCCE(BAD_DLID_TARGET_ERR)), + FLAG_ENTRY0("bad_lver_err", DCCE(BAD_LVER_ERR)), + FLAG_ENTRY0("uncorrectable_err", DCCE(UNCORRECTABLE_ERR)), + FLAG_ENTRY0("bad_crdt_ack_err", DCCE(BAD_CRDT_ACK_ERR)), + FLAG_ENTRY0("unsup_pkt_type", DCCE(UNSUP_PKT_TYPE)), + FLAG_ENTRY0("bad_ctrl_flit_err", DCCE(BAD_CTRL_FLIT_ERR)), + FLAG_ENTRY0("event_cntr_parity_err", DCCE(EVENT_CNTR_PARITY_ERR)), + FLAG_ENTRY0("event_cntr_rollover_err", DCCE(EVENT_CNTR_ROLLOVER_ERR)), + FLAG_ENTRY0("link_err", DCCE(LINK_ERR)), + FLAG_ENTRY0("misc_cntr_rollover_err", DCCE(MISC_CNTR_ROLLOVER_ERR)), + FLAG_ENTRY0("bad_ctrl_dist_err", DCCE(BAD_CTRL_DIST_ERR)), + FLAG_ENTRY0("bad_tail_dist_err", DCCE(BAD_TAIL_DIST_ERR)), + FLAG_ENTRY0("bad_head_dist_err", DCCE(BAD_HEAD_DIST_ERR)), + FLAG_ENTRY0("nonvl15_state_err", DCCE(NONVL15_STATE_ERR)), + FLAG_ENTRY0("vl15_multi_err", DCCE(VL15_MULTI_ERR)), + FLAG_ENTRY0("bad_pkt_length_err", DCCE(BAD_PKT_LENGTH_ERR)), + FLAG_ENTRY0("unsup_vl_err", DCCE(UNSUP_VL_ERR)), + FLAG_ENTRY0("perm_nvl15_err", DCCE(PERM_NVL15_ERR)), + FLAG_ENTRY0("slid_zero_err", DCCE(SLID_ZERO_ERR)), + FLAG_ENTRY0("dlid_zero_err", DCCE(DLID_ZERO_ERR)), + FLAG_ENTRY0("length_mtu_err", DCCE(LENGTH_MTU_ERR)), + FLAG_ENTRY0("rx_early_drop_err", DCCE(RX_EARLY_DROP_ERR)), + FLAG_ENTRY0("late_short_err", DCCE(LATE_SHORT_ERR)), + FLAG_ENTRY0("late_long_err", DCCE(LATE_LONG_ERR)), + FLAG_ENTRY0("late_ebp_err", DCCE(LATE_EBP_ERR)), + FLAG_ENTRY0("fpe_tx_fifo_ovflw_err", DCCE(FPE_TX_FIFO_OVFLW_ERR)), + FLAG_ENTRY0("fpe_tx_fifo_unflw_err", DCCE(FPE_TX_FIFO_UNFLW_ERR)), + FLAG_ENTRY0("csr_access_blocked_host", DCCE(CSR_ACCESS_BLOCKED_HOST)), + FLAG_ENTRY0("csr_access_blocked_uc", DCCE(CSR_ACCESS_BLOCKED_UC)), + FLAG_ENTRY0("tx_ctrl_parity_err", DCCE(TX_CTRL_PARITY_ERR)), + FLAG_ENTRY0("tx_ctrl_parity_mbe_err", DCCE(TX_CTRL_PARITY_MBE_ERR)), + FLAG_ENTRY0("tx_sc_parity_err", DCCE(TX_SC_PARITY_ERR)), + FLAG_ENTRY0("rx_ctrl_parity_mbe_err", DCCE(RX_CTRL_PARITY_MBE_ERR)), + FLAG_ENTRY0("csr_parity_err", DCCE(CSR_PARITY_ERR)), + FLAG_ENTRY0("csr_inval_addr", DCCE(CSR_INVAL_ADDR)), + FLAG_ENTRY0("tx_byte_shft_parity_err", DCCE(TX_BYTE_SHFT_PARITY_ERR)), + FLAG_ENTRY0("rx_byte_shft_parity_err", DCCE(RX_BYTE_SHFT_PARITY_ERR)), + FLAG_ENTRY0("fmconfig_err", DCCE(FMCONFIG_ERR)), + FLAG_ENTRY0("rcvport_err", DCCE(RCVPORT_ERR)), +}; + +/* + * LCB error flags + */ +#define LCBE(name) DC_LCB_ERR_FLG_##name##_SMASK +static struct flag_table lcb_err_flags[] = { +/* 0*/ FLAG_ENTRY0("CSR_PARITY_ERR", LCBE(CSR_PARITY_ERR)), +/* 1*/ FLAG_ENTRY0("INVALID_CSR_ADDR", LCBE(INVALID_CSR_ADDR)), +/* 2*/ FLAG_ENTRY0("RST_FOR_FAILED_DESKEW", LCBE(RST_FOR_FAILED_DESKEW)), +/* 3*/ FLAG_ENTRY0("ALL_LNS_FAILED_REINIT_TEST", + LCBE(ALL_LNS_FAILED_REINIT_TEST)), +/* 4*/ FLAG_ENTRY0("LOST_REINIT_STALL_OR_TOS", LCBE(LOST_REINIT_STALL_OR_TOS)), +/* 5*/ FLAG_ENTRY0("TX_LESS_THAN_FOUR_LNS", LCBE(TX_LESS_THAN_FOUR_LNS)), +/* 6*/ FLAG_ENTRY0("RX_LESS_THAN_FOUR_LNS", LCBE(RX_LESS_THAN_FOUR_LNS)), +/* 7*/ FLAG_ENTRY0("SEQ_CRC_ERR", LCBE(SEQ_CRC_ERR)), +/* 8*/ FLAG_ENTRY0("REINIT_FROM_PEER", LCBE(REINIT_FROM_PEER)), +/* 9*/ FLAG_ENTRY0("REINIT_FOR_LN_DEGRADE", LCBE(REINIT_FOR_LN_DEGRADE)), +/*10*/ FLAG_ENTRY0("CRC_ERR_CNT_HIT_LIMIT", LCBE(CRC_ERR_CNT_HIT_LIMIT)), +/*11*/ FLAG_ENTRY0("RCLK_STOPPED", LCBE(RCLK_STOPPED)), +/*12*/ FLAG_ENTRY0("UNEXPECTED_REPLAY_MARKER", LCBE(UNEXPECTED_REPLAY_MARKER)), +/*13*/ FLAG_ENTRY0("UNEXPECTED_ROUND_TRIP_MARKER", + LCBE(UNEXPECTED_ROUND_TRIP_MARKER)), +/*14*/ FLAG_ENTRY0("ILLEGAL_NULL_LTP", LCBE(ILLEGAL_NULL_LTP)), +/*15*/ FLAG_ENTRY0("ILLEGAL_FLIT_ENCODING", LCBE(ILLEGAL_FLIT_ENCODING)), +/*16*/ FLAG_ENTRY0("FLIT_INPUT_BUF_OFLW", LCBE(FLIT_INPUT_BUF_OFLW)), +/*17*/ FLAG_ENTRY0("VL_ACK_INPUT_BUF_OFLW", LCBE(VL_ACK_INPUT_BUF_OFLW)), +/*18*/ FLAG_ENTRY0("VL_ACK_INPUT_PARITY_ERR", LCBE(VL_ACK_INPUT_PARITY_ERR)), +/*19*/ FLAG_ENTRY0("VL_ACK_INPUT_WRONG_CRC_MODE", + LCBE(VL_ACK_INPUT_WRONG_CRC_MODE)), +/*20*/ FLAG_ENTRY0("FLIT_INPUT_BUF_MBE", LCBE(FLIT_INPUT_BUF_MBE)), +/*21*/ FLAG_ENTRY0("FLIT_INPUT_BUF_SBE", LCBE(FLIT_INPUT_BUF_SBE)), +/*22*/ FLAG_ENTRY0("REPLAY_BUF_MBE", LCBE(REPLAY_BUF_MBE)), +/*23*/ FLAG_ENTRY0("REPLAY_BUF_SBE", LCBE(REPLAY_BUF_SBE)), +/*24*/ FLAG_ENTRY0("CREDIT_RETURN_FLIT_MBE", LCBE(CREDIT_RETURN_FLIT_MBE)), +/*25*/ FLAG_ENTRY0("RST_FOR_LINK_TIMEOUT", LCBE(RST_FOR_LINK_TIMEOUT)), +/*26*/ FLAG_ENTRY0("RST_FOR_INCOMPLT_RND_TRIP", + LCBE(RST_FOR_INCOMPLT_RND_TRIP)), +/*27*/ FLAG_ENTRY0("HOLD_REINIT", LCBE(HOLD_REINIT)), +/*28*/ FLAG_ENTRY0("NEG_EDGE_LINK_TRANSFER_ACTIVE", + LCBE(NEG_EDGE_LINK_TRANSFER_ACTIVE)), +/*29*/ FLAG_ENTRY0("REDUNDANT_FLIT_PARITY_ERR", + LCBE(REDUNDANT_FLIT_PARITY_ERR)) +}; + +/* + * DC8051 Error Flags + */ +#define D8E(name) DC_DC8051_ERR_FLG_##name##_SMASK +static struct flag_table dc8051_err_flags[] = { + FLAG_ENTRY0("SET_BY_8051", D8E(SET_BY_8051)), + FLAG_ENTRY0("LOST_8051_HEART_BEAT", D8E(LOST_8051_HEART_BEAT)), + FLAG_ENTRY0("CRAM_MBE", D8E(CRAM_MBE)), + FLAG_ENTRY0("CRAM_SBE", D8E(CRAM_SBE)), + FLAG_ENTRY0("DRAM_MBE", D8E(DRAM_MBE)), + FLAG_ENTRY0("DRAM_SBE", D8E(DRAM_SBE)), + FLAG_ENTRY0("IRAM_MBE", D8E(IRAM_MBE)), + FLAG_ENTRY0("IRAM_SBE", D8E(IRAM_SBE)), + FLAG_ENTRY0("UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES", + D8E(UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES)), + FLAG_ENTRY0("INVALID_CSR_ADDR", D8E(INVALID_CSR_ADDR)), +}; + +/* + * DC8051 Information Error flags + * + * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR field. + */ +static struct flag_table dc8051_info_err_flags[] = { + FLAG_ENTRY0("Spico ROM check failed", SPICO_ROM_FAILED), + FLAG_ENTRY0("Unknown frame received", UNKNOWN_FRAME), + FLAG_ENTRY0("Target BER not met", TARGET_BER_NOT_MET), + FLAG_ENTRY0("Serdes internal loopback failure", + FAILED_SERDES_INTERNAL_LOOPBACK), + FLAG_ENTRY0("Failed SerDes init", FAILED_SERDES_INIT), + FLAG_ENTRY0("Failed LNI(Polling)", FAILED_LNI_POLLING), + FLAG_ENTRY0("Failed LNI(Debounce)", FAILED_LNI_DEBOUNCE), + FLAG_ENTRY0("Failed LNI(EstbComm)", FAILED_LNI_ESTBCOMM), + FLAG_ENTRY0("Failed LNI(OptEq)", FAILED_LNI_OPTEQ), + FLAG_ENTRY0("Failed LNI(VerifyCap_1)", FAILED_LNI_VERIFY_CAP1), + FLAG_ENTRY0("Failed LNI(VerifyCap_2)", FAILED_LNI_VERIFY_CAP2), + FLAG_ENTRY0("Failed LNI(ConfigLT)", FAILED_LNI_CONFIGLT) +}; + +/* + * DC8051 Information Host Information flags + * + * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG field. + */ +static struct flag_table dc8051_info_host_msg_flags[] = { + FLAG_ENTRY0("Host request done", 0x0001), + FLAG_ENTRY0("BC SMA message", 0x0002), + FLAG_ENTRY0("BC PWR_MGM message", 0x0004), + FLAG_ENTRY0("BC Unknown message (BCC)", 0x0008), + FLAG_ENTRY0("BC Unknown message (LCB)", 0x0010), + FLAG_ENTRY0("External device config request", 0x0020), + FLAG_ENTRY0("VerifyCap all frames received", 0x0040), + FLAG_ENTRY0("LinkUp achieved", 0x0080), + FLAG_ENTRY0("Link going down", 0x0100), +}; + + +static u32 encoded_size(u32 size); +static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate); +static int set_physical_link_state(struct hfi1_devdata *dd, u64 state); +static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management, + u8 *continuous); +static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z, + u8 *vcu, u16 *vl15buf, u8 *crc_sizes); +static void read_vc_remote_link_width(struct hfi1_devdata *dd, + u8 *remote_tx_rate, u16 *link_widths); +static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits, + u8 *flag_bits, u16 *link_widths); +static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id, + u8 *device_rev); +static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed); +static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx); +static int read_tx_settings(struct hfi1_devdata *dd, u8 *enable_lane_tx, + u8 *tx_polarity_inversion, + u8 *rx_polarity_inversion, u8 *max_rate); +static void handle_sdma_eng_err(struct hfi1_devdata *dd, + unsigned int context, u64 err_status); +static void handle_qsfp_int(struct hfi1_devdata *dd, u32 source, u64 reg); +static void handle_dcc_err(struct hfi1_devdata *dd, + unsigned int context, u64 err_status); +static void handle_lcb_err(struct hfi1_devdata *dd, + unsigned int context, u64 err_status); +static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void set_partition_keys(struct hfi1_pportdata *); +static const char *link_state_name(u32 state); +static const char *link_state_reason_name(struct hfi1_pportdata *ppd, + u32 state); +static int do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data, + u64 *out_data); +static int read_idle_sma(struct hfi1_devdata *dd, u64 *data); +static int thermal_init(struct hfi1_devdata *dd); + +static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state, + int msecs); +static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc); +static void handle_temp_err(struct hfi1_devdata *); +static void dc_shutdown(struct hfi1_devdata *); +static void dc_start(struct hfi1_devdata *); + +/* + * Error interrupt table entry. This is used as input to the interrupt + * "clear down" routine used for all second tier error interrupt register. + * Second tier interrupt registers have a single bit representing them + * in the top-level CceIntStatus. + */ +struct err_reg_info { + u32 status; /* status CSR offset */ + u32 clear; /* clear CSR offset */ + u32 mask; /* mask CSR offset */ + void (*handler)(struct hfi1_devdata *dd, u32 source, u64 reg); + const char *desc; +}; + +#define NUM_MISC_ERRS (IS_GENERAL_ERR_END - IS_GENERAL_ERR_START) +#define NUM_DC_ERRS (IS_DC_END - IS_DC_START) +#define NUM_VARIOUS (IS_VARIOUS_END - IS_VARIOUS_START) + +/* + * Helpers for building HFI and DC error interrupt table entries. Different + * helpers are needed because of inconsistent register names. + */ +#define EE(reg, handler, desc) \ + { reg##_STATUS, reg##_CLEAR, reg##_MASK, \ + handler, desc } +#define DC_EE1(reg, handler, desc) \ + { reg##_FLG, reg##_FLG_CLR, reg##_FLG_EN, handler, desc } +#define DC_EE2(reg, handler, desc) \ + { reg##_FLG, reg##_CLR, reg##_EN, handler, desc } + +/* + * Table of the "misc" grouping of error interrupts. Each entry refers to + * another register containing more information. + */ +static const struct err_reg_info misc_errs[NUM_MISC_ERRS] = { +/* 0*/ EE(CCE_ERR, handle_cce_err, "CceErr"), +/* 1*/ EE(RCV_ERR, handle_rxe_err, "RxeErr"), +/* 2*/ EE(MISC_ERR, handle_misc_err, "MiscErr"), +/* 3*/ { 0, 0, 0, NULL }, /* reserved */ +/* 4*/ EE(SEND_PIO_ERR, handle_pio_err, "PioErr"), +/* 5*/ EE(SEND_DMA_ERR, handle_sdma_err, "SDmaErr"), +/* 6*/ EE(SEND_EGRESS_ERR, handle_egress_err, "EgressErr"), +/* 7*/ EE(SEND_ERR, handle_txe_err, "TxeErr") + /* the rest are reserved */ +}; + +/* + * Index into the Various section of the interrupt sources + * corresponding to the Critical Temperature interrupt. + */ +#define TCRIT_INT_SOURCE 4 + +/* + * SDMA error interrupt entry - refers to another register containing more + * information. + */ +static const struct err_reg_info sdma_eng_err = + EE(SEND_DMA_ENG_ERR, handle_sdma_eng_err, "SDmaEngErr"); + +static const struct err_reg_info various_err[NUM_VARIOUS] = { +/* 0*/ { 0, 0, 0, NULL }, /* PbcInt */ +/* 1*/ { 0, 0, 0, NULL }, /* GpioAssertInt */ +/* 2*/ EE(ASIC_QSFP1, handle_qsfp_int, "QSFP1"), +/* 3*/ EE(ASIC_QSFP2, handle_qsfp_int, "QSFP2"), +/* 4*/ { 0, 0, 0, NULL }, /* TCritInt */ + /* rest are reserved */ +}; + +/* + * The DC encoding of mtu_cap for 10K MTU in the DCC_CFG_PORT_CONFIG + * register can not be derived from the MTU value because 10K is not + * a power of 2. Therefore, we need a constant. Everything else can + * be calculated. + */ +#define DCC_CFG_PORT_MTU_CAP_10240 7 + +/* + * Table of the DC grouping of error interrupts. Each entry refers to + * another register containing more information. + */ +static const struct err_reg_info dc_errs[NUM_DC_ERRS] = { +/* 0*/ DC_EE1(DCC_ERR, handle_dcc_err, "DCC Err"), +/* 1*/ DC_EE2(DC_LCB_ERR, handle_lcb_err, "LCB Err"), +/* 2*/ DC_EE2(DC_DC8051_ERR, handle_8051_interrupt, "DC8051 Interrupt"), +/* 3*/ /* dc_lbm_int - special, see is_dc_int() */ + /* the rest are reserved */ +}; + +struct cntr_entry { + /* + * counter name + */ + char *name; + + /* + * csr to read for name (if applicable) + */ + u64 csr; + + /* + * offset into dd or ppd to store the counter's value + */ + int offset; + + /* + * flags + */ + u8 flags; + + /* + * accessor for stat element, context either dd or ppd + */ + u64 (*rw_cntr)(const struct cntr_entry *, + void *context, + int vl, + int mode, + u64 data); +}; + +#define C_RCV_HDR_OVF_FIRST C_RCV_HDR_OVF_0 +#define C_RCV_HDR_OVF_LAST C_RCV_HDR_OVF_159 + +#define CNTR_ELEM(name, csr, offset, flags, accessor) \ +{ \ + name, \ + csr, \ + offset, \ + flags, \ + accessor \ +} + +/* 32bit RXE */ +#define RXE32_PORT_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + RCV_COUNTER_ARRAY32), \ + 0, flags | CNTR_32BIT, \ + port_access_u32_csr) + +#define RXE32_DEV_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + RCV_COUNTER_ARRAY32), \ + 0, flags | CNTR_32BIT, \ + dev_access_u32_csr) + +/* 64bit RXE */ +#define RXE64_PORT_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + RCV_COUNTER_ARRAY64), \ + 0, flags, \ + port_access_u64_csr) + +#define RXE64_DEV_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + RCV_COUNTER_ARRAY64), \ + 0, flags, \ + dev_access_u64_csr) + +#define OVR_LBL(ctx) C_RCV_HDR_OVF_ ## ctx +#define OVR_ELM(ctx) \ +CNTR_ELEM("RcvHdrOvr" #ctx, \ + (RCV_HDR_OVFL_CNT + ctx*0x100), \ + 0, CNTR_NORMAL, port_access_u64_csr) + +/* 32bit TXE */ +#define TXE32_PORT_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + SEND_COUNTER_ARRAY32), \ + 0, flags | CNTR_32BIT, \ + port_access_u32_csr) + +/* 64bit TXE */ +#define TXE64_PORT_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + SEND_COUNTER_ARRAY64), \ + 0, flags, \ + port_access_u64_csr) + +# define TX64_DEV_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name,\ + counter * 8 + SEND_COUNTER_ARRAY64, \ + 0, \ + flags, \ + dev_access_u64_csr) + +/* CCE */ +#define CCE_PERF_DEV_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + CCE_COUNTER_ARRAY32), \ + 0, flags | CNTR_32BIT, \ + dev_access_u32_csr) + +#define CCE_INT_DEV_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + CCE_INT_COUNTER_ARRAY32), \ + 0, flags | CNTR_32BIT, \ + dev_access_u32_csr) + +/* DC */ +#define DC_PERF_CNTR(name, counter, flags) \ +CNTR_ELEM(#name, \ + counter, \ + 0, \ + flags, \ + dev_access_u64_csr) + +#define DC_PERF_CNTR_LCB(name, counter, flags) \ +CNTR_ELEM(#name, \ + counter, \ + 0, \ + flags, \ + dc_access_lcb_cntr) + +/* ibp counters */ +#define SW_IBP_CNTR(name, cntr) \ +CNTR_ELEM(#name, \ + 0, \ + 0, \ + CNTR_SYNTH, \ + access_ibp_##cntr) + +u64 read_csr(const struct hfi1_devdata *dd, u32 offset) +{ + u64 val; + + if (dd->flags & HFI1_PRESENT) { + val = readq((void __iomem *)dd->kregbase + offset); + return val; + } + return -1; +} + +void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value) +{ + if (dd->flags & HFI1_PRESENT) + writeq(value, (void __iomem *)dd->kregbase + offset); +} + +void __iomem *get_csr_addr( + struct hfi1_devdata *dd, + u32 offset) +{ + return (void __iomem *)dd->kregbase + offset; +} + +static inline u64 read_write_csr(const struct hfi1_devdata *dd, u32 csr, + int mode, u64 value) +{ + u64 ret; + + + if (mode == CNTR_MODE_R) { + ret = read_csr(dd, csr); + } else if (mode == CNTR_MODE_W) { + write_csr(dd, csr, value); + ret = value; + } else { + dd_dev_err(dd, "Invalid cntr register access mode"); + return 0; + } + + hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, ret, mode); + return ret; +} + +/* Dev Access */ +static u64 dev_access_u32_csr(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + if (vl != CNTR_INVALID_VL) + return 0; + return read_write_csr(dd, entry->csr, mode, data); +} + +static u64 dev_access_u64_csr(const struct cntr_entry *entry, void *context, + int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + u64 val = 0; + u64 csr = entry->csr; + + if (entry->flags & CNTR_VL) { + if (vl == CNTR_INVALID_VL) + return 0; + csr += 8 * vl; + } else { + if (vl != CNTR_INVALID_VL) + return 0; + } + + val = read_write_csr(dd, csr, mode, data); + return val; +} + +static u64 dc_access_lcb_cntr(const struct cntr_entry *entry, void *context, + int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + u32 csr = entry->csr; + int ret = 0; + + if (vl != CNTR_INVALID_VL) + return 0; + if (mode == CNTR_MODE_R) + ret = read_lcb_csr(dd, csr, &data); + else if (mode == CNTR_MODE_W) + ret = write_lcb_csr(dd, csr, data); + + if (ret) { + dd_dev_err(dd, "Could not acquire LCB for counter 0x%x", csr); + return 0; + } + + hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, data, mode); + return data; +} + +/* Port Access */ +static u64 port_access_u32_csr(const struct cntr_entry *entry, void *context, + int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; + + if (vl != CNTR_INVALID_VL) + return 0; + return read_write_csr(ppd->dd, entry->csr, mode, data); +} + +static u64 port_access_u64_csr(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; + u64 val; + u64 csr = entry->csr; + + if (entry->flags & CNTR_VL) { + if (vl == CNTR_INVALID_VL) + return 0; + csr += 8 * vl; + } else { + if (vl != CNTR_INVALID_VL) + return 0; + } + val = read_write_csr(ppd->dd, csr, mode, data); + return val; +} + +/* Software defined */ +static inline u64 read_write_sw(struct hfi1_devdata *dd, u64 *cntr, int mode, + u64 data) +{ + u64 ret; + + if (mode == CNTR_MODE_R) { + ret = *cntr; + } else if (mode == CNTR_MODE_W) { + *cntr = data; + ret = data; + } else { + dd_dev_err(dd, "Invalid cntr sw access mode"); + return 0; + } + + hfi1_cdbg(CNTR, "val 0x%llx mode %d", ret, mode); + + return ret; +} + +static u64 access_sw_link_dn_cnt(const struct cntr_entry *entry, void *context, + int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; + + if (vl != CNTR_INVALID_VL) + return 0; + return read_write_sw(ppd->dd, &ppd->link_downed, mode, data); +} + +static u64 access_sw_link_up_cnt(const struct cntr_entry *entry, void *context, + int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; + + if (vl != CNTR_INVALID_VL) + return 0; + return read_write_sw(ppd->dd, &ppd->link_up, mode, data); +} + +static u64 access_sw_xmit_discards(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; + + if (vl != CNTR_INVALID_VL) + return 0; + + return read_write_sw(ppd->dd, &ppd->port_xmit_discards, mode, data); +} + +static u64 access_xmit_constraint_errs(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; + + if (vl != CNTR_INVALID_VL) + return 0; + + return read_write_sw(ppd->dd, &ppd->port_xmit_constraint_errors, + mode, data); +} + +static u64 access_rcv_constraint_errs(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; + + if (vl != CNTR_INVALID_VL) + return 0; + + return read_write_sw(ppd->dd, &ppd->port_rcv_constraint_errors, + mode, data); +} + +u64 get_all_cpu_total(u64 __percpu *cntr) +{ + int cpu; + u64 counter = 0; + + for_each_possible_cpu(cpu) + counter += *per_cpu_ptr(cntr, cpu); + return counter; +} + +static u64 read_write_cpu(struct hfi1_devdata *dd, u64 *z_val, + u64 __percpu *cntr, + int vl, int mode, u64 data) +{ + + u64 ret = 0; + + if (vl != CNTR_INVALID_VL) + return 0; + + if (mode == CNTR_MODE_R) { + ret = get_all_cpu_total(cntr) - *z_val; + } else if (mode == CNTR_MODE_W) { + /* A write can only zero the counter */ + if (data == 0) + *z_val = get_all_cpu_total(cntr); + else + dd_dev_err(dd, "Per CPU cntrs can only be zeroed"); + } else { + dd_dev_err(dd, "Invalid cntr sw cpu access mode"); + return 0; + } + + return ret; +} + +static u64 access_sw_cpu_intr(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return read_write_cpu(dd, &dd->z_int_counter, dd->int_counter, vl, + mode, data); +} + +static u64 access_sw_cpu_rcv_limit(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return read_write_cpu(dd, &dd->z_rcv_limit, dd->rcv_limit, vl, + mode, data); +} + +static u64 access_sw_pio_wait(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->verbs_dev.n_piowait; +} + +static u64 access_sw_vtx_wait(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->verbs_dev.n_txwait; +} + +static u64 access_sw_kmem_wait(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->verbs_dev.n_kmem_wait; +} + +#define def_access_sw_cpu(cntr) \ +static u64 access_sw_cpu_##cntr(const struct cntr_entry *entry, \ + void *context, int vl, int mode, u64 data) \ +{ \ + struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; \ + return read_write_cpu(ppd->dd, &ppd->ibport_data.z_ ##cntr, \ + ppd->ibport_data.cntr, vl, \ + mode, data); \ +} + +def_access_sw_cpu(rc_acks); +def_access_sw_cpu(rc_qacks); +def_access_sw_cpu(rc_delayed_comp); + +#define def_access_ibp_counter(cntr) \ +static u64 access_ibp_##cntr(const struct cntr_entry *entry, \ + void *context, int vl, int mode, u64 data) \ +{ \ + struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; \ + \ + if (vl != CNTR_INVALID_VL) \ + return 0; \ + \ + return read_write_sw(ppd->dd, &ppd->ibport_data.n_ ##cntr, \ + mode, data); \ +} + +def_access_ibp_counter(loop_pkts); +def_access_ibp_counter(rc_resends); +def_access_ibp_counter(rnr_naks); +def_access_ibp_counter(other_naks); +def_access_ibp_counter(rc_timeouts); +def_access_ibp_counter(pkt_drops); +def_access_ibp_counter(dmawait); +def_access_ibp_counter(rc_seqnak); +def_access_ibp_counter(rc_dupreq); +def_access_ibp_counter(rdma_seq); +def_access_ibp_counter(unaligned); +def_access_ibp_counter(seq_naks); + +static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = { +[C_RCV_OVF] = RXE32_DEV_CNTR_ELEM(RcvOverflow, RCV_BUF_OVFL_CNT, CNTR_SYNTH), +[C_RX_TID_FULL] = RXE32_DEV_CNTR_ELEM(RxTIDFullEr, RCV_TID_FULL_ERR_CNT, + CNTR_NORMAL), +[C_RX_TID_INVALID] = RXE32_DEV_CNTR_ELEM(RxTIDInvalid, RCV_TID_VALID_ERR_CNT, + CNTR_NORMAL), +[C_RX_TID_FLGMS] = RXE32_DEV_CNTR_ELEM(RxTidFLGMs, + RCV_TID_FLOW_GEN_MISMATCH_CNT, + CNTR_NORMAL), +[C_RX_CTX_RHQS] = RXE32_DEV_CNTR_ELEM(RxCtxRHQS, RCV_CONTEXT_RHQ_STALL, + CNTR_NORMAL), +[C_RX_CTX_EGRS] = RXE32_DEV_CNTR_ELEM(RxCtxEgrS, RCV_CONTEXT_EGR_STALL, + CNTR_NORMAL), +[C_RCV_TID_FLSMS] = RXE32_DEV_CNTR_ELEM(RxTidFLSMs, + RCV_TID_FLOW_SEQ_MISMATCH_CNT, CNTR_NORMAL), +[C_CCE_PCI_CR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciCrSt, + CCE_PCIE_POSTED_CRDT_STALL_CNT, CNTR_NORMAL), +[C_CCE_PCI_TR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciTrSt, CCE_PCIE_TRGT_STALL_CNT, + CNTR_NORMAL), +[C_CCE_PIO_WR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePioWrSt, CCE_PIO_WR_STALL_CNT, + CNTR_NORMAL), +[C_CCE_ERR_INT] = CCE_INT_DEV_CNTR_ELEM(CceErrInt, CCE_ERR_INT_CNT, + CNTR_NORMAL), +[C_CCE_SDMA_INT] = CCE_INT_DEV_CNTR_ELEM(CceSdmaInt, CCE_SDMA_INT_CNT, + CNTR_NORMAL), +[C_CCE_MISC_INT] = CCE_INT_DEV_CNTR_ELEM(CceMiscInt, CCE_MISC_INT_CNT, + CNTR_NORMAL), +[C_CCE_RCV_AV_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvAvInt, CCE_RCV_AVAIL_INT_CNT, + CNTR_NORMAL), +[C_CCE_RCV_URG_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvUrgInt, + CCE_RCV_URGENT_INT_CNT, CNTR_NORMAL), +[C_CCE_SEND_CR_INT] = CCE_INT_DEV_CNTR_ELEM(CceSndCrInt, + CCE_SEND_CREDIT_INT_CNT, CNTR_NORMAL), +[C_DC_UNC_ERR] = DC_PERF_CNTR(DcUnctblErr, DCC_ERR_UNCORRECTABLE_CNT, + CNTR_SYNTH), +[C_DC_RCV_ERR] = DC_PERF_CNTR(DcRecvErr, DCC_ERR_PORTRCV_ERR_CNT, CNTR_SYNTH), +[C_DC_FM_CFG_ERR] = DC_PERF_CNTR(DcFmCfgErr, DCC_ERR_FMCONFIG_ERR_CNT, + CNTR_SYNTH), +[C_DC_RMT_PHY_ERR] = DC_PERF_CNTR(DcRmtPhyErr, DCC_ERR_RCVREMOTE_PHY_ERR_CNT, + CNTR_SYNTH), +[C_DC_DROPPED_PKT] = DC_PERF_CNTR(DcDroppedPkt, DCC_ERR_DROPPED_PKT_CNT, + CNTR_SYNTH), +[C_DC_MC_XMIT_PKTS] = DC_PERF_CNTR(DcMcXmitPkts, + DCC_PRF_PORT_XMIT_MULTICAST_CNT, CNTR_SYNTH), +[C_DC_MC_RCV_PKTS] = DC_PERF_CNTR(DcMcRcvPkts, + DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT, + CNTR_SYNTH), +[C_DC_XMIT_CERR] = DC_PERF_CNTR(DcXmitCorr, + DCC_PRF_PORT_XMIT_CORRECTABLE_CNT, CNTR_SYNTH), +[C_DC_RCV_CERR] = DC_PERF_CNTR(DcRcvCorrCnt, DCC_PRF_PORT_RCV_CORRECTABLE_CNT, + CNTR_SYNTH), +[C_DC_RCV_FCC] = DC_PERF_CNTR(DcRxFCntl, DCC_PRF_RX_FLOW_CRTL_CNT, + CNTR_SYNTH), +[C_DC_XMIT_FCC] = DC_PERF_CNTR(DcXmitFCntl, DCC_PRF_TX_FLOW_CRTL_CNT, + CNTR_SYNTH), +[C_DC_XMIT_FLITS] = DC_PERF_CNTR(DcXmitFlits, DCC_PRF_PORT_XMIT_DATA_CNT, + CNTR_SYNTH), +[C_DC_RCV_FLITS] = DC_PERF_CNTR(DcRcvFlits, DCC_PRF_PORT_RCV_DATA_CNT, + CNTR_SYNTH), +[C_DC_XMIT_PKTS] = DC_PERF_CNTR(DcXmitPkts, DCC_PRF_PORT_XMIT_PKTS_CNT, + CNTR_SYNTH), +[C_DC_RCV_PKTS] = DC_PERF_CNTR(DcRcvPkts, DCC_PRF_PORT_RCV_PKTS_CNT, + CNTR_SYNTH), +[C_DC_RX_FLIT_VL] = DC_PERF_CNTR(DcRxFlitVl, DCC_PRF_PORT_VL_RCV_DATA_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_RX_PKT_VL] = DC_PERF_CNTR(DcRxPktVl, DCC_PRF_PORT_VL_RCV_PKTS_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_RCV_FCN] = DC_PERF_CNTR(DcRcvFcn, DCC_PRF_PORT_RCV_FECN_CNT, CNTR_SYNTH), +[C_DC_RCV_FCN_VL] = DC_PERF_CNTR(DcRcvFcnVl, DCC_PRF_PORT_VL_RCV_FECN_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_RCV_BCN] = DC_PERF_CNTR(DcRcvBcn, DCC_PRF_PORT_RCV_BECN_CNT, CNTR_SYNTH), +[C_DC_RCV_BCN_VL] = DC_PERF_CNTR(DcRcvBcnVl, DCC_PRF_PORT_VL_RCV_BECN_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_RCV_BBL] = DC_PERF_CNTR(DcRcvBbl, DCC_PRF_PORT_RCV_BUBBLE_CNT, + CNTR_SYNTH), +[C_DC_RCV_BBL_VL] = DC_PERF_CNTR(DcRcvBblVl, DCC_PRF_PORT_VL_RCV_BUBBLE_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_MARK_FECN] = DC_PERF_CNTR(DcMarkFcn, DCC_PRF_PORT_MARK_FECN_CNT, + CNTR_SYNTH), +[C_DC_MARK_FECN_VL] = DC_PERF_CNTR(DcMarkFcnVl, DCC_PRF_PORT_VL_MARK_FECN_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_TOTAL_CRC] = + DC_PERF_CNTR_LCB(DcTotCrc, DC_LCB_ERR_INFO_TOTAL_CRC_ERR, + CNTR_SYNTH), +[C_DC_CRC_LN0] = DC_PERF_CNTR_LCB(DcCrcLn0, DC_LCB_ERR_INFO_CRC_ERR_LN0, + CNTR_SYNTH), +[C_DC_CRC_LN1] = DC_PERF_CNTR_LCB(DcCrcLn1, DC_LCB_ERR_INFO_CRC_ERR_LN1, + CNTR_SYNTH), +[C_DC_CRC_LN2] = DC_PERF_CNTR_LCB(DcCrcLn2, DC_LCB_ERR_INFO_CRC_ERR_LN2, + CNTR_SYNTH), +[C_DC_CRC_LN3] = DC_PERF_CNTR_LCB(DcCrcLn3, DC_LCB_ERR_INFO_CRC_ERR_LN3, + CNTR_SYNTH), +[C_DC_CRC_MULT_LN] = + DC_PERF_CNTR_LCB(DcMultLn, DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN, + CNTR_SYNTH), +[C_DC_TX_REPLAY] = DC_PERF_CNTR_LCB(DcTxReplay, DC_LCB_ERR_INFO_TX_REPLAY_CNT, + CNTR_SYNTH), +[C_DC_RX_REPLAY] = DC_PERF_CNTR_LCB(DcRxReplay, DC_LCB_ERR_INFO_RX_REPLAY_CNT, + CNTR_SYNTH), +[C_DC_SEQ_CRC_CNT] = + DC_PERF_CNTR_LCB(DcLinkSeqCrc, DC_LCB_ERR_INFO_SEQ_CRC_CNT, + CNTR_SYNTH), +[C_DC_ESC0_ONLY_CNT] = + DC_PERF_CNTR_LCB(DcEsc0, DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT, + CNTR_SYNTH), +[C_DC_ESC0_PLUS1_CNT] = + DC_PERF_CNTR_LCB(DcEsc1, DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT, + CNTR_SYNTH), +[C_DC_ESC0_PLUS2_CNT] = + DC_PERF_CNTR_LCB(DcEsc0Plus2, DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT, + CNTR_SYNTH), +[C_DC_REINIT_FROM_PEER_CNT] = + DC_PERF_CNTR_LCB(DcReinitPeer, DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT, + CNTR_SYNTH), +[C_DC_SBE_CNT] = DC_PERF_CNTR_LCB(DcSbe, DC_LCB_ERR_INFO_SBE_CNT, + CNTR_SYNTH), +[C_DC_MISC_FLG_CNT] = + DC_PERF_CNTR_LCB(DcMiscFlg, DC_LCB_ERR_INFO_MISC_FLG_CNT, + CNTR_SYNTH), +[C_DC_PRF_GOOD_LTP_CNT] = + DC_PERF_CNTR_LCB(DcGoodLTP, DC_LCB_PRF_GOOD_LTP_CNT, CNTR_SYNTH), +[C_DC_PRF_ACCEPTED_LTP_CNT] = + DC_PERF_CNTR_LCB(DcAccLTP, DC_LCB_PRF_ACCEPTED_LTP_CNT, + CNTR_SYNTH), +[C_DC_PRF_RX_FLIT_CNT] = + DC_PERF_CNTR_LCB(DcPrfRxFlit, DC_LCB_PRF_RX_FLIT_CNT, CNTR_SYNTH), +[C_DC_PRF_TX_FLIT_CNT] = + DC_PERF_CNTR_LCB(DcPrfTxFlit, DC_LCB_PRF_TX_FLIT_CNT, CNTR_SYNTH), +[C_DC_PRF_CLK_CNTR] = + DC_PERF_CNTR_LCB(DcPrfClk, DC_LCB_PRF_CLK_CNTR, CNTR_SYNTH), +[C_DC_PG_DBG_FLIT_CRDTS_CNT] = + DC_PERF_CNTR_LCB(DcFltCrdts, DC_LCB_PG_DBG_FLIT_CRDTS_CNT, CNTR_SYNTH), +[C_DC_PG_STS_PAUSE_COMPLETE_CNT] = + DC_PERF_CNTR_LCB(DcPauseComp, DC_LCB_PG_STS_PAUSE_COMPLETE_CNT, + CNTR_SYNTH), +[C_DC_PG_STS_TX_SBE_CNT] = + DC_PERF_CNTR_LCB(DcStsTxSbe, DC_LCB_PG_STS_TX_SBE_CNT, CNTR_SYNTH), +[C_DC_PG_STS_TX_MBE_CNT] = + DC_PERF_CNTR_LCB(DcStsTxMbe, DC_LCB_PG_STS_TX_MBE_CNT, + CNTR_SYNTH), +[C_SW_CPU_INTR] = CNTR_ELEM("Intr", 0, 0, CNTR_NORMAL, + access_sw_cpu_intr), +[C_SW_CPU_RCV_LIM] = CNTR_ELEM("RcvLimit", 0, 0, CNTR_NORMAL, + access_sw_cpu_rcv_limit), +[C_SW_VTX_WAIT] = CNTR_ELEM("vTxWait", 0, 0, CNTR_NORMAL, + access_sw_vtx_wait), +[C_SW_PIO_WAIT] = CNTR_ELEM("PioWait", 0, 0, CNTR_NORMAL, + access_sw_pio_wait), +[C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL, + access_sw_kmem_wait), +}; + +static struct cntr_entry port_cntrs[PORT_CNTR_LAST] = { +[C_TX_UNSUP_VL] = TXE32_PORT_CNTR_ELEM(TxUnVLErr, SEND_UNSUP_VL_ERR_CNT, + CNTR_NORMAL), +[C_TX_INVAL_LEN] = TXE32_PORT_CNTR_ELEM(TxInvalLen, SEND_LEN_ERR_CNT, + CNTR_NORMAL), +[C_TX_MM_LEN_ERR] = TXE32_PORT_CNTR_ELEM(TxMMLenErr, SEND_MAX_MIN_LEN_ERR_CNT, + CNTR_NORMAL), +[C_TX_UNDERRUN] = TXE32_PORT_CNTR_ELEM(TxUnderrun, SEND_UNDERRUN_CNT, + CNTR_NORMAL), +[C_TX_FLOW_STALL] = TXE32_PORT_CNTR_ELEM(TxFlowStall, SEND_FLOW_STALL_CNT, + CNTR_NORMAL), +[C_TX_DROPPED] = TXE32_PORT_CNTR_ELEM(TxDropped, SEND_DROPPED_PKT_CNT, + CNTR_NORMAL), +[C_TX_HDR_ERR] = TXE32_PORT_CNTR_ELEM(TxHdrErr, SEND_HEADERS_ERR_CNT, + CNTR_NORMAL), +[C_TX_PKT] = TXE64_PORT_CNTR_ELEM(TxPkt, SEND_DATA_PKT_CNT, CNTR_NORMAL), +[C_TX_WORDS] = TXE64_PORT_CNTR_ELEM(TxWords, SEND_DWORD_CNT, CNTR_NORMAL), +[C_TX_WAIT] = TXE64_PORT_CNTR_ELEM(TxWait, SEND_WAIT_CNT, CNTR_SYNTH), +[C_TX_FLIT_VL] = TXE64_PORT_CNTR_ELEM(TxFlitVL, SEND_DATA_VL0_CNT, + CNTR_SYNTH | CNTR_VL), +[C_TX_PKT_VL] = TXE64_PORT_CNTR_ELEM(TxPktVL, SEND_DATA_PKT_VL0_CNT, + CNTR_SYNTH | CNTR_VL), +[C_TX_WAIT_VL] = TXE64_PORT_CNTR_ELEM(TxWaitVL, SEND_WAIT_VL0_CNT, + CNTR_SYNTH | CNTR_VL), +[C_RX_PKT] = RXE64_PORT_CNTR_ELEM(RxPkt, RCV_DATA_PKT_CNT, CNTR_NORMAL), +[C_RX_WORDS] = RXE64_PORT_CNTR_ELEM(RxWords, RCV_DWORD_CNT, CNTR_NORMAL), +[C_SW_LINK_DOWN] = CNTR_ELEM("SwLinkDown", 0, 0, CNTR_SYNTH | CNTR_32BIT, + access_sw_link_dn_cnt), +[C_SW_LINK_UP] = CNTR_ELEM("SwLinkUp", 0, 0, CNTR_SYNTH | CNTR_32BIT, + access_sw_link_up_cnt), +[C_SW_XMIT_DSCD] = CNTR_ELEM("XmitDscd", 0, 0, CNTR_SYNTH | CNTR_32BIT, + access_sw_xmit_discards), +[C_SW_XMIT_DSCD_VL] = CNTR_ELEM("XmitDscdVl", 0, 0, + CNTR_SYNTH | CNTR_32BIT | CNTR_VL, + access_sw_xmit_discards), +[C_SW_XMIT_CSTR_ERR] = CNTR_ELEM("XmitCstrErr", 0, 0, CNTR_SYNTH, + access_xmit_constraint_errs), +[C_SW_RCV_CSTR_ERR] = CNTR_ELEM("RcvCstrErr", 0, 0, CNTR_SYNTH, + access_rcv_constraint_errs), +[C_SW_IBP_LOOP_PKTS] = SW_IBP_CNTR(LoopPkts, loop_pkts), +[C_SW_IBP_RC_RESENDS] = SW_IBP_CNTR(RcResend, rc_resends), +[C_SW_IBP_RNR_NAKS] = SW_IBP_CNTR(RnrNak, rnr_naks), +[C_SW_IBP_OTHER_NAKS] = SW_IBP_CNTR(OtherNak, other_naks), +[C_SW_IBP_RC_TIMEOUTS] = SW_IBP_CNTR(RcTimeOut, rc_timeouts), +[C_SW_IBP_PKT_DROPS] = SW_IBP_CNTR(PktDrop, pkt_drops), +[C_SW_IBP_DMA_WAIT] = SW_IBP_CNTR(DmaWait, dmawait), +[C_SW_IBP_RC_SEQNAK] = SW_IBP_CNTR(RcSeqNak, rc_seqnak), +[C_SW_IBP_RC_DUPREQ] = SW_IBP_CNTR(RcDupRew, rc_dupreq), +[C_SW_IBP_RDMA_SEQ] = SW_IBP_CNTR(RdmaSeq, rdma_seq), +[C_SW_IBP_UNALIGNED] = SW_IBP_CNTR(Unaligned, unaligned), +[C_SW_IBP_SEQ_NAK] = SW_IBP_CNTR(SeqNak, seq_naks), +[C_SW_CPU_RC_ACKS] = CNTR_ELEM("RcAcks", 0, 0, CNTR_NORMAL, + access_sw_cpu_rc_acks), +[C_SW_CPU_RC_QACKS] = CNTR_ELEM("RcQacks", 0, 0, CNTR_NORMAL, + access_sw_cpu_rc_qacks), +[C_SW_CPU_RC_DELAYED_COMP] = CNTR_ELEM("RcDelayComp", 0, 0, CNTR_NORMAL, + access_sw_cpu_rc_delayed_comp), +[OVR_LBL(0)] = OVR_ELM(0), [OVR_LBL(1)] = OVR_ELM(1), +[OVR_LBL(2)] = OVR_ELM(2), [OVR_LBL(3)] = OVR_ELM(3), +[OVR_LBL(4)] = OVR_ELM(4), [OVR_LBL(5)] = OVR_ELM(5), +[OVR_LBL(6)] = OVR_ELM(6), [OVR_LBL(7)] = OVR_ELM(7), +[OVR_LBL(8)] = OVR_ELM(8), [OVR_LBL(9)] = OVR_ELM(9), +[OVR_LBL(10)] = OVR_ELM(10), [OVR_LBL(11)] = OVR_ELM(11), +[OVR_LBL(12)] = OVR_ELM(12), [OVR_LBL(13)] = OVR_ELM(13), +[OVR_LBL(14)] = OVR_ELM(14), [OVR_LBL(15)] = OVR_ELM(15), +[OVR_LBL(16)] = OVR_ELM(16), [OVR_LBL(17)] = OVR_ELM(17), +[OVR_LBL(18)] = OVR_ELM(18), [OVR_LBL(19)] = OVR_ELM(19), +[OVR_LBL(20)] = OVR_ELM(20), [OVR_LBL(21)] = OVR_ELM(21), +[OVR_LBL(22)] = OVR_ELM(22), [OVR_LBL(23)] = OVR_ELM(23), +[OVR_LBL(24)] = OVR_ELM(24), [OVR_LBL(25)] = OVR_ELM(25), +[OVR_LBL(26)] = OVR_ELM(26), [OVR_LBL(27)] = OVR_ELM(27), +[OVR_LBL(28)] = OVR_ELM(28), [OVR_LBL(29)] = OVR_ELM(29), +[OVR_LBL(30)] = OVR_ELM(30), [OVR_LBL(31)] = OVR_ELM(31), +[OVR_LBL(32)] = OVR_ELM(32), [OVR_LBL(33)] = OVR_ELM(33), +[OVR_LBL(34)] = OVR_ELM(34), [OVR_LBL(35)] = OVR_ELM(35), +[OVR_LBL(36)] = OVR_ELM(36), [OVR_LBL(37)] = OVR_ELM(37), +[OVR_LBL(38)] = OVR_ELM(38), [OVR_LBL(39)] = OVR_ELM(39), +[OVR_LBL(40)] = OVR_ELM(40), [OVR_LBL(41)] = OVR_ELM(41), +[OVR_LBL(42)] = OVR_ELM(42), [OVR_LBL(43)] = OVR_ELM(43), +[OVR_LBL(44)] = OVR_ELM(44), [OVR_LBL(45)] = OVR_ELM(45), +[OVR_LBL(46)] = OVR_ELM(46), [OVR_LBL(47)] = OVR_ELM(47), +[OVR_LBL(48)] = OVR_ELM(48), [OVR_LBL(49)] = OVR_ELM(49), +[OVR_LBL(50)] = OVR_ELM(50), [OVR_LBL(51)] = OVR_ELM(51), +[OVR_LBL(52)] = OVR_ELM(52), [OVR_LBL(53)] = OVR_ELM(53), +[OVR_LBL(54)] = OVR_ELM(54), [OVR_LBL(55)] = OVR_ELM(55), +[OVR_LBL(56)] = OVR_ELM(56), [OVR_LBL(57)] = OVR_ELM(57), +[OVR_LBL(58)] = OVR_ELM(58), [OVR_LBL(59)] = OVR_ELM(59), +[OVR_LBL(60)] = OVR_ELM(60), [OVR_LBL(61)] = OVR_ELM(61), +[OVR_LBL(62)] = OVR_ELM(62), [OVR_LBL(63)] = OVR_ELM(63), +[OVR_LBL(64)] = OVR_ELM(64), [OVR_LBL(65)] = OVR_ELM(65), +[OVR_LBL(66)] = OVR_ELM(66), [OVR_LBL(67)] = OVR_ELM(67), +[OVR_LBL(68)] = OVR_ELM(68), [OVR_LBL(69)] = OVR_ELM(69), +[OVR_LBL(70)] = OVR_ELM(70), [OVR_LBL(71)] = OVR_ELM(71), +[OVR_LBL(72)] = OVR_ELM(72), [OVR_LBL(73)] = OVR_ELM(73), +[OVR_LBL(74)] = OVR_ELM(74), [OVR_LBL(75)] = OVR_ELM(75), +[OVR_LBL(76)] = OVR_ELM(76), [OVR_LBL(77)] = OVR_ELM(77), +[OVR_LBL(78)] = OVR_ELM(78), [OVR_LBL(79)] = OVR_ELM(79), +[OVR_LBL(80)] = OVR_ELM(80), [OVR_LBL(81)] = OVR_ELM(81), +[OVR_LBL(82)] = OVR_ELM(82), [OVR_LBL(83)] = OVR_ELM(83), +[OVR_LBL(84)] = OVR_ELM(84), [OVR_LBL(85)] = OVR_ELM(85), +[OVR_LBL(86)] = OVR_ELM(86), [OVR_LBL(87)] = OVR_ELM(87), +[OVR_LBL(88)] = OVR_ELM(88), [OVR_LBL(89)] = OVR_ELM(89), +[OVR_LBL(90)] = OVR_ELM(90), [OVR_LBL(91)] = OVR_ELM(91), +[OVR_LBL(92)] = OVR_ELM(92), [OVR_LBL(93)] = OVR_ELM(93), +[OVR_LBL(94)] = OVR_ELM(94), [OVR_LBL(95)] = OVR_ELM(95), +[OVR_LBL(96)] = OVR_ELM(96), [OVR_LBL(97)] = OVR_ELM(97), +[OVR_LBL(98)] = OVR_ELM(98), [OVR_LBL(99)] = OVR_ELM(99), +[OVR_LBL(100)] = OVR_ELM(100), [OVR_LBL(101)] = OVR_ELM(101), +[OVR_LBL(102)] = OVR_ELM(102), [OVR_LBL(103)] = OVR_ELM(103), +[OVR_LBL(104)] = OVR_ELM(104), [OVR_LBL(105)] = OVR_ELM(105), +[OVR_LBL(106)] = OVR_ELM(106), [OVR_LBL(107)] = OVR_ELM(107), +[OVR_LBL(108)] = OVR_ELM(108), [OVR_LBL(109)] = OVR_ELM(109), +[OVR_LBL(110)] = OVR_ELM(110), [OVR_LBL(111)] = OVR_ELM(111), +[OVR_LBL(112)] = OVR_ELM(112), [OVR_LBL(113)] = OVR_ELM(113), +[OVR_LBL(114)] = OVR_ELM(114), [OVR_LBL(115)] = OVR_ELM(115), +[OVR_LBL(116)] = OVR_ELM(116), [OVR_LBL(117)] = OVR_ELM(117), +[OVR_LBL(118)] = OVR_ELM(118), [OVR_LBL(119)] = OVR_ELM(119), +[OVR_LBL(120)] = OVR_ELM(120), [OVR_LBL(121)] = OVR_ELM(121), +[OVR_LBL(122)] = OVR_ELM(122), [OVR_LBL(123)] = OVR_ELM(123), +[OVR_LBL(124)] = OVR_ELM(124), [OVR_LBL(125)] = OVR_ELM(125), +[OVR_LBL(126)] = OVR_ELM(126), [OVR_LBL(127)] = OVR_ELM(127), +[OVR_LBL(128)] = OVR_ELM(128), [OVR_LBL(129)] = OVR_ELM(129), +[OVR_LBL(130)] = OVR_ELM(130), [OVR_LBL(131)] = OVR_ELM(131), +[OVR_LBL(132)] = OVR_ELM(132), [OVR_LBL(133)] = OVR_ELM(133), +[OVR_LBL(134)] = OVR_ELM(134), [OVR_LBL(135)] = OVR_ELM(135), +[OVR_LBL(136)] = OVR_ELM(136), [OVR_LBL(137)] = OVR_ELM(137), +[OVR_LBL(138)] = OVR_ELM(138), [OVR_LBL(139)] = OVR_ELM(139), +[OVR_LBL(140)] = OVR_ELM(140), [OVR_LBL(141)] = OVR_ELM(141), +[OVR_LBL(142)] = OVR_ELM(142), [OVR_LBL(143)] = OVR_ELM(143), +[OVR_LBL(144)] = OVR_ELM(144), [OVR_LBL(145)] = OVR_ELM(145), +[OVR_LBL(146)] = OVR_ELM(146), [OVR_LBL(147)] = OVR_ELM(147), +[OVR_LBL(148)] = OVR_ELM(148), [OVR_LBL(149)] = OVR_ELM(149), +[OVR_LBL(150)] = OVR_ELM(150), [OVR_LBL(151)] = OVR_ELM(151), +[OVR_LBL(152)] = OVR_ELM(152), [OVR_LBL(153)] = OVR_ELM(153), +[OVR_LBL(154)] = OVR_ELM(154), [OVR_LBL(155)] = OVR_ELM(155), +[OVR_LBL(156)] = OVR_ELM(156), [OVR_LBL(157)] = OVR_ELM(157), +[OVR_LBL(158)] = OVR_ELM(158), [OVR_LBL(159)] = OVR_ELM(159), +}; + +/* ======================================================================== */ + +/* return true if this is chip revision revision a0 */ +int is_a0(struct hfi1_devdata *dd) +{ + return ((dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT) + & CCE_REVISION_CHIP_REV_MINOR_MASK) == 0; +} + +/* return true if this is chip revision revision a */ +int is_ax(struct hfi1_devdata *dd) +{ + u8 chip_rev_minor = + dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT + & CCE_REVISION_CHIP_REV_MINOR_MASK; + return (chip_rev_minor & 0xf0) == 0; +} + +/* return true if this is chip revision revision b */ +int is_bx(struct hfi1_devdata *dd) +{ + u8 chip_rev_minor = + dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT + & CCE_REVISION_CHIP_REV_MINOR_MASK; + return !!(chip_rev_minor & 0x10); +} + +/* + * Append string s to buffer buf. Arguments curp and len are the current + * position and remaining length, respectively. + * + * return 0 on success, 1 on out of room + */ +static int append_str(char *buf, char **curp, int *lenp, const char *s) +{ + char *p = *curp; + int len = *lenp; + int result = 0; /* success */ + char c; + + /* add a comma, if first in the buffer */ + if (p != buf) { + if (len == 0) { + result = 1; /* out of room */ + goto done; + } + *p++ = ','; + len--; + } + + /* copy the string */ + while ((c = *s++) != 0) { + if (len == 0) { + result = 1; /* out of room */ + goto done; + } + *p++ = c; + len--; + } + +done: + /* write return values */ + *curp = p; + *lenp = len; + + return result; +} + +/* + * Using the given flag table, print a comma separated string into + * the buffer. End in '*' if the buffer is too short. + */ +static char *flag_string(char *buf, int buf_len, u64 flags, + struct flag_table *table, int table_size) +{ + char extra[32]; + char *p = buf; + int len = buf_len; + int no_room = 0; + int i; + + /* make sure there is at least 2 so we can form "*" */ + if (len < 2) + return ""; + + len--; /* leave room for a nul */ + for (i = 0; i < table_size; i++) { + if (flags & table[i].flag) { + no_room = append_str(buf, &p, &len, table[i].str); + if (no_room) + break; + flags &= ~table[i].flag; + } + } + + /* any undocumented bits left? */ + if (!no_room && flags) { + snprintf(extra, sizeof(extra), "bits 0x%llx", flags); + no_room = append_str(buf, &p, &len, extra); + } + + /* add * if ran out of room */ + if (no_room) { + /* may need to back up to add space for a '*' */ + if (len == 0) + --p; + *p++ = '*'; + } + + /* add final nul - space already allocated above */ + *p = 0; + return buf; +} + +/* first 8 CCE error interrupt source names */ +static const char * const cce_misc_names[] = { + "CceErrInt", /* 0 */ + "RxeErrInt", /* 1 */ + "MiscErrInt", /* 2 */ + "Reserved3", /* 3 */ + "PioErrInt", /* 4 */ + "SDmaErrInt", /* 5 */ + "EgressErrInt", /* 6 */ + "TxeErrInt" /* 7 */ +}; + +/* + * Return the miscellaneous error interrupt name. + */ +static char *is_misc_err_name(char *buf, size_t bsize, unsigned int source) +{ + if (source < ARRAY_SIZE(cce_misc_names)) + strncpy(buf, cce_misc_names[source], bsize); + else + snprintf(buf, + bsize, + "Reserved%u", + source + IS_GENERAL_ERR_START); + + return buf; +} + +/* + * Return the SDMA engine error interrupt name. + */ +static char *is_sdma_eng_err_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "SDmaEngErrInt%u", source); + return buf; +} + +/* + * Return the send context error interrupt name. + */ +static char *is_sendctxt_err_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "SendCtxtErrInt%u", source); + return buf; +} + +static const char * const various_names[] = { + "PbcInt", + "GpioAssertInt", + "Qsfp1Int", + "Qsfp2Int", + "TCritInt" +}; + +/* + * Return the various interrupt name. + */ +static char *is_various_name(char *buf, size_t bsize, unsigned int source) +{ + if (source < ARRAY_SIZE(various_names)) + strncpy(buf, various_names[source], bsize); + else + snprintf(buf, bsize, "Reserved%u", source+IS_VARIOUS_START); + return buf; +} + +/* + * Return the DC interrupt name. + */ +static char *is_dc_name(char *buf, size_t bsize, unsigned int source) +{ + static const char * const dc_int_names[] = { + "common", + "lcb", + "8051", + "lbm" /* local block merge */ + }; + + if (source < ARRAY_SIZE(dc_int_names)) + snprintf(buf, bsize, "dc_%s_int", dc_int_names[source]); + else + snprintf(buf, bsize, "DCInt%u", source); + return buf; +} + +static const char * const sdma_int_names[] = { + "SDmaInt", + "SdmaIdleInt", + "SdmaProgressInt", +}; + +/* + * Return the SDMA engine interrupt name. + */ +static char *is_sdma_eng_name(char *buf, size_t bsize, unsigned int source) +{ + /* what interrupt */ + unsigned int what = source / TXE_NUM_SDMA_ENGINES; + /* which engine */ + unsigned int which = source % TXE_NUM_SDMA_ENGINES; + + if (likely(what < 3)) + snprintf(buf, bsize, "%s%u", sdma_int_names[what], which); + else + snprintf(buf, bsize, "Invalid SDMA interrupt %u", source); + return buf; +} + +/* + * Return the receive available interrupt name. + */ +static char *is_rcv_avail_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "RcvAvailInt%u", source); + return buf; +} + +/* + * Return the receive urgent interrupt name. + */ +static char *is_rcv_urgent_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "RcvUrgentInt%u", source); + return buf; +} + +/* + * Return the send credit interrupt name. + */ +static char *is_send_credit_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "SendCreditInt%u", source); + return buf; +} + +/* + * Return the reserved interrupt name. + */ +static char *is_reserved_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "Reserved%u", source + IS_RESERVED_START); + return buf; +} + +static char *cce_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + cce_err_status_flags, ARRAY_SIZE(cce_err_status_flags)); +} + +static char *rxe_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + rxe_err_status_flags, ARRAY_SIZE(rxe_err_status_flags)); +} + +static char *misc_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, misc_err_status_flags, + ARRAY_SIZE(misc_err_status_flags)); +} + +static char *pio_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + pio_err_status_flags, ARRAY_SIZE(pio_err_status_flags)); +} + +static char *sdma_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + sdma_err_status_flags, + ARRAY_SIZE(sdma_err_status_flags)); +} + +static char *egress_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + egress_err_status_flags, ARRAY_SIZE(egress_err_status_flags)); +} + +static char *egress_err_info_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + egress_err_info_flags, ARRAY_SIZE(egress_err_info_flags)); +} + +static char *send_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + send_err_status_flags, + ARRAY_SIZE(send_err_status_flags)); +} + +static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + + /* + * For most these errors, there is nothing that can be done except + * report or record it. + */ + dd_dev_info(dd, "CCE Error: %s\n", + cce_err_status_string(buf, sizeof(buf), reg)); + + if ((reg & CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK) + && is_a0(dd) + && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)) { + /* this error requires a manual drop into SPC freeze mode */ + /* then a fix up */ + start_freeze_handling(dd->pport, FREEZE_SELF); + } +} + +/* + * Check counters for receive errors that do not have an interrupt + * associated with them. + */ +#define RCVERR_CHECK_TIME 10 +static void update_rcverr_timer(unsigned long opaque) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque; + struct hfi1_pportdata *ppd = dd->pport; + u32 cur_ovfl_cnt = read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL); + + if (dd->rcv_ovfl_cnt < cur_ovfl_cnt && + ppd->port_error_action & OPA_PI_MASK_EX_BUFFER_OVERRUN) { + dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__); + set_link_down_reason(ppd, + OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN, 0, + OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN); + queue_work(ppd->hfi1_wq, &ppd->link_bounce_work); + } + dd->rcv_ovfl_cnt = (u32) cur_ovfl_cnt; + + mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME); +} + +static int init_rcverr(struct hfi1_devdata *dd) +{ + init_timer(&dd->rcverr_timer); + dd->rcverr_timer.function = update_rcverr_timer; + dd->rcverr_timer.data = (unsigned long) dd; + /* Assume the hardware counter has been reset */ + dd->rcv_ovfl_cnt = 0; + return mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME); +} + +static void free_rcverr(struct hfi1_devdata *dd) +{ + if (dd->rcverr_timer.data) + del_timer_sync(&dd->rcverr_timer); + dd->rcverr_timer.data = 0; +} + +static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + + dd_dev_info(dd, "Receive Error: %s\n", + rxe_err_status_string(buf, sizeof(buf), reg)); + + if (reg & ALL_RXE_FREEZE_ERR) { + int flags = 0; + + /* + * Freeze mode recovery is disabled for the errors + * in RXE_FREEZE_ABORT_MASK + */ + if (is_a0(dd) && (reg & RXE_FREEZE_ABORT_MASK)) + flags = FREEZE_ABORT; + + start_freeze_handling(dd->pport, flags); + } +} + +static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + + dd_dev_info(dd, "Misc Error: %s", + misc_err_status_string(buf, sizeof(buf), reg)); +} + +static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + + dd_dev_info(dd, "PIO Error: %s\n", + pio_err_status_string(buf, sizeof(buf), reg)); + + if (reg & ALL_PIO_FREEZE_ERR) + start_freeze_handling(dd->pport, 0); +} + +static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + + dd_dev_info(dd, "SDMA Error: %s\n", + sdma_err_status_string(buf, sizeof(buf), reg)); + + if (reg & ALL_SDMA_FREEZE_ERR) + start_freeze_handling(dd->pport, 0); +} + +static void count_port_inactive(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd = dd->pport; + + if (ppd->port_xmit_discards < ~(u64)0) + ppd->port_xmit_discards++; +} + +/* + * We have had a "disallowed packet" error during egress. Determine the + * integrity check which failed, and update relevant error counter, etc. + * + * Note that the SEND_EGRESS_ERR_INFO register has only a single + * bit of state per integrity check, and so we can miss the reason for an + * egress error if more than one packet fails the same integrity check + * since we cleared the corresponding bit in SEND_EGRESS_ERR_INFO. + */ +static void handle_send_egress_err_info(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd = dd->pport; + u64 src = read_csr(dd, SEND_EGRESS_ERR_SOURCE); /* read first */ + u64 info = read_csr(dd, SEND_EGRESS_ERR_INFO); + char buf[96]; + + /* clear down all observed info as quickly as possible after read */ + write_csr(dd, SEND_EGRESS_ERR_INFO, info); + + dd_dev_info(dd, + "Egress Error Info: 0x%llx, %s Egress Error Src 0x%llx\n", + info, egress_err_info_string(buf, sizeof(buf), info), src); + + /* Eventually add other counters for each bit */ + + if (info & SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK) { + if (ppd->port_xmit_discards < ~(u64)0) + ppd->port_xmit_discards++; + } +} + +/* + * Input value is a bit position within the SEND_EGRESS_ERR_STATUS + * register. Does it represent a 'port inactive' error? + */ +static inline int port_inactive_err(u64 posn) +{ + return (posn >= SEES(TX_LINKDOWN) && + posn <= SEES(TX_INCORRECT_LINK_STATE)); +} + +/* + * Input value is a bit position within the SEND_EGRESS_ERR_STATUS + * register. Does it represent a 'disallowed packet' error? + */ +static inline int disallowed_pkt_err(u64 posn) +{ + return (posn >= SEES(TX_SDMA0_DISALLOWED_PACKET) && + posn <= SEES(TX_SDMA15_DISALLOWED_PACKET)); +} + +static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + u64 reg_copy = reg, handled = 0; + char buf[96]; + + if (reg & ALL_TXE_EGRESS_FREEZE_ERR) + start_freeze_handling(dd->pport, 0); + if (is_a0(dd) && (reg & + SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK) + && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)) + start_freeze_handling(dd->pport, 0); + + while (reg_copy) { + int posn = fls64(reg_copy); + /* + * fls64() returns a 1-based offset, but we generally + * want 0-based offsets. + */ + int shift = posn - 1; + + if (port_inactive_err(shift)) { + count_port_inactive(dd); + handled |= (1ULL << shift); + } else if (disallowed_pkt_err(shift)) { + handle_send_egress_err_info(dd); + handled |= (1ULL << shift); + } + clear_bit(shift, (unsigned long *)®_copy); + } + + reg &= ~handled; + + if (reg) + dd_dev_info(dd, "Egress Error: %s\n", + egress_err_status_string(buf, sizeof(buf), reg)); +} + +static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + + dd_dev_info(dd, "Send Error: %s\n", + send_err_status_string(buf, sizeof(buf), reg)); + +} + +/* + * The maximum number of times the error clear down will loop before + * blocking a repeating error. This value is arbitrary. + */ +#define MAX_CLEAR_COUNT 20 + +/* + * Clear and handle an error register. All error interrupts are funneled + * through here to have a central location to correctly handle single- + * or multi-shot errors. + * + * For non per-context registers, call this routine with a context value + * of 0 so the per-context offset is zero. + * + * If the handler loops too many times, assume that something is wrong + * and can't be fixed, so mask the error bits. + */ +static void interrupt_clear_down(struct hfi1_devdata *dd, + u32 context, + const struct err_reg_info *eri) +{ + u64 reg; + u32 count; + + /* read in a loop until no more errors are seen */ + count = 0; + while (1) { + reg = read_kctxt_csr(dd, context, eri->status); + if (reg == 0) + break; + write_kctxt_csr(dd, context, eri->clear, reg); + if (likely(eri->handler)) + eri->handler(dd, context, reg); + count++; + if (count > MAX_CLEAR_COUNT) { + u64 mask; + + dd_dev_err(dd, "Repeating %s bits 0x%llx - masking\n", + eri->desc, reg); + /* + * Read-modify-write so any other masked bits + * remain masked. + */ + mask = read_kctxt_csr(dd, context, eri->mask); + mask &= ~reg; + write_kctxt_csr(dd, context, eri->mask, mask); + break; + } + } +} + +/* + * CCE block "misc" interrupt. Source is < 16. + */ +static void is_misc_err_int(struct hfi1_devdata *dd, unsigned int source) +{ + const struct err_reg_info *eri = &misc_errs[source]; + + if (eri->handler) { + interrupt_clear_down(dd, 0, eri); + } else { + dd_dev_err(dd, "Unexpected misc interrupt (%u) - reserved\n", + source); + } +} + +static char *send_context_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + sc_err_status_flags, ARRAY_SIZE(sc_err_status_flags)); +} + +/* + * Send context error interrupt. Source (hw_context) is < 160. + * + * All send context errors cause the send context to halt. The normal + * clear-down mechanism cannot be used because we cannot clear the + * error bits until several other long-running items are done first. + * This is OK because with the context halted, nothing else is going + * to happen on it anyway. + */ +static void is_sendctxt_err_int(struct hfi1_devdata *dd, + unsigned int hw_context) +{ + struct send_context_info *sci; + struct send_context *sc; + char flags[96]; + u64 status; + u32 sw_index; + + sw_index = dd->hw_to_sw[hw_context]; + if (sw_index >= dd->num_send_contexts) { + dd_dev_err(dd, + "out of range sw index %u for send context %u\n", + sw_index, hw_context); + return; + } + sci = &dd->send_contexts[sw_index]; + sc = sci->sc; + if (!sc) { + dd_dev_err(dd, "%s: context %u(%u): no sc?\n", __func__, + sw_index, hw_context); + return; + } + + /* tell the software that a halt has begun */ + sc_stop(sc, SCF_HALTED); + + status = read_kctxt_csr(dd, hw_context, SEND_CTXT_ERR_STATUS); + + dd_dev_info(dd, "Send Context %u(%u) Error: %s\n", sw_index, hw_context, + send_context_err_status_string(flags, sizeof(flags), status)); + + if (status & SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK) + handle_send_egress_err_info(dd); + + /* + * Automatically restart halted kernel contexts out of interrupt + * context. User contexts must ask the driver to restart the context. + */ + if (sc->type != SC_USER) + queue_work(dd->pport->hfi1_wq, &sc->halt_work); +} + +static void handle_sdma_eng_err(struct hfi1_devdata *dd, + unsigned int source, u64 status) +{ + struct sdma_engine *sde; + + sde = &dd->per_sdma[source]; +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx, + slashstrip(__FILE__), __LINE__, __func__); + dd_dev_err(sde->dd, "CONFIG SDMA(%u) source: %u status 0x%llx\n", + sde->this_idx, source, (unsigned long long)status); +#endif + sdma_engine_error(sde, status); +} + +/* + * CCE block SDMA error interrupt. Source is < 16. + */ +static void is_sdma_eng_err_int(struct hfi1_devdata *dd, unsigned int source) +{ +#ifdef CONFIG_SDMA_VERBOSITY + struct sdma_engine *sde = &dd->per_sdma[source]; + + dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx, + slashstrip(__FILE__), __LINE__, __func__); + dd_dev_err(dd, "CONFIG SDMA(%u) source: %u\n", sde->this_idx, + source); + sdma_dumpstate(sde); +#endif + interrupt_clear_down(dd, source, &sdma_eng_err); +} + +/* + * CCE block "various" interrupt. Source is < 8. + */ +static void is_various_int(struct hfi1_devdata *dd, unsigned int source) +{ + const struct err_reg_info *eri = &various_err[source]; + + /* + * TCritInt cannot go through interrupt_clear_down() + * because it is not a second tier interrupt. The handler + * should be called directly. + */ + if (source == TCRIT_INT_SOURCE) + handle_temp_err(dd); + else if (eri->handler) + interrupt_clear_down(dd, 0, eri); + else + dd_dev_info(dd, + "%s: Unimplemented/reserved interrupt %d\n", + __func__, source); +} + +static void handle_qsfp_int(struct hfi1_devdata *dd, u32 src_ctx, u64 reg) +{ + /* source is always zero */ + struct hfi1_pportdata *ppd = dd->pport; + unsigned long flags; + u64 qsfp_int_mgmt = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N); + + if (reg & QSFP_HFI0_MODPRST_N) { + + dd_dev_info(dd, "%s: ModPresent triggered QSFP interrupt\n", + __func__); + + if (!qsfp_mod_present(ppd)) { + ppd->driver_link_ready = 0; + /* + * Cable removed, reset all our information about the + * cache and cable capabilities + */ + + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + /* + * We don't set cache_refresh_required here as we expect + * an interrupt when a cable is inserted + */ + ppd->qsfp_info.cache_valid = 0; + ppd->qsfp_info.qsfp_interrupt_functional = 0; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, + flags); + write_csr(dd, + dd->hfi1_id ? + ASIC_QSFP2_INVERT : + ASIC_QSFP1_INVERT, + qsfp_int_mgmt); + if (ppd->host_link_state == HLS_DN_POLL) { + /* + * The link is still in POLL. This means + * that the normal link down processing + * will not happen. We have to do it here + * before turning the DC off. + */ + queue_work(ppd->hfi1_wq, &ppd->link_down_work); + } + } else { + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + ppd->qsfp_info.cache_valid = 0; + ppd->qsfp_info.cache_refresh_required = 1; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, + flags); + + qsfp_int_mgmt &= ~(u64)QSFP_HFI0_MODPRST_N; + write_csr(dd, + dd->hfi1_id ? + ASIC_QSFP2_INVERT : + ASIC_QSFP1_INVERT, + qsfp_int_mgmt); + } + } + + if (reg & QSFP_HFI0_INT_N) { + + dd_dev_info(dd, "%s: IntN triggered QSFP interrupt\n", + __func__); + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + ppd->qsfp_info.check_interrupt_flags = 1; + ppd->qsfp_info.qsfp_interrupt_functional = 1; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags); + } + + /* Schedule the QSFP work only if there is a cable attached. */ + if (qsfp_mod_present(ppd)) + queue_work(ppd->hfi1_wq, &ppd->qsfp_info.qsfp_work); +} + +static int request_host_lcb_access(struct hfi1_devdata *dd) +{ + int ret; + + ret = do_8051_command(dd, HCMD_MISC, + (u64)HCMD_MISC_REQUEST_LCB_ACCESS << LOAD_DATA_FIELD_ID_SHIFT, + NULL); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, "%s: command failed with error %d\n", + __func__, ret); + } + return ret == HCMD_SUCCESS ? 0 : -EBUSY; +} + +static int request_8051_lcb_access(struct hfi1_devdata *dd) +{ + int ret; + + ret = do_8051_command(dd, HCMD_MISC, + (u64)HCMD_MISC_GRANT_LCB_ACCESS << LOAD_DATA_FIELD_ID_SHIFT, + NULL); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, "%s: command failed with error %d\n", + __func__, ret); + } + return ret == HCMD_SUCCESS ? 0 : -EBUSY; +} + +/* + * Set the LCB selector - allow host access. The DCC selector always + * points to the host. + */ +static inline void set_host_lcb_access(struct hfi1_devdata *dd) +{ + write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL, + DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK + | DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK); +} + +/* + * Clear the LCB selector - allow 8051 access. The DCC selector always + * points to the host. + */ +static inline void set_8051_lcb_access(struct hfi1_devdata *dd) +{ + write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL, + DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK); +} + +/* + * Acquire LCB access from the 8051. If the host already has access, + * just increment a counter. Otherwise, inform the 8051 that the + * host is taking access. + * + * Returns: + * 0 on success + * -EBUSY if the 8051 has control and cannot be disturbed + * -errno if unable to acquire access from the 8051 + */ +int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok) +{ + struct hfi1_pportdata *ppd = dd->pport; + int ret = 0; + + /* + * Use the host link state lock so the operation of this routine + * { link state check, selector change, count increment } can occur + * as a unit against a link state change. Otherwise there is a + * race between the state change and the count increment. + */ + if (sleep_ok) { + mutex_lock(&ppd->hls_lock); + } else { + while (!mutex_trylock(&ppd->hls_lock)) + udelay(1); + } + + /* this access is valid only when the link is up */ + if ((ppd->host_link_state & HLS_UP) == 0) { + dd_dev_info(dd, "%s: link state %s not up\n", + __func__, link_state_name(ppd->host_link_state)); + ret = -EBUSY; + goto done; + } + + if (dd->lcb_access_count == 0) { + ret = request_host_lcb_access(dd); + if (ret) { + dd_dev_err(dd, + "%s: unable to acquire LCB access, err %d\n", + __func__, ret); + goto done; + } + set_host_lcb_access(dd); + } + dd->lcb_access_count++; +done: + mutex_unlock(&ppd->hls_lock); + return ret; +} + +/* + * Release LCB access by decrementing the use count. If the count is moving + * from 1 to 0, inform 8051 that it has control back. + * + * Returns: + * 0 on success + * -errno if unable to release access to the 8051 + */ +int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok) +{ + int ret = 0; + + /* + * Use the host link state lock because the acquire needed it. + * Here, we only need to keep { selector change, count decrement } + * as a unit. + */ + if (sleep_ok) { + mutex_lock(&dd->pport->hls_lock); + } else { + while (!mutex_trylock(&dd->pport->hls_lock)) + udelay(1); + } + + if (dd->lcb_access_count == 0) { + dd_dev_err(dd, "%s: LCB access count is zero. Skipping.\n", + __func__); + goto done; + } + + if (dd->lcb_access_count == 1) { + set_8051_lcb_access(dd); + ret = request_8051_lcb_access(dd); + if (ret) { + dd_dev_err(dd, + "%s: unable to release LCB access, err %d\n", + __func__, ret); + /* restore host access if the grant didn't work */ + set_host_lcb_access(dd); + goto done; + } + } + dd->lcb_access_count--; +done: + mutex_unlock(&dd->pport->hls_lock); + return ret; +} + +/* + * Initialize LCB access variables and state. Called during driver load, + * after most of the initialization is finished. + * + * The DC default is LCB access on for the host. The driver defaults to + * leaving access to the 8051. Assign access now - this constrains the call + * to this routine to be after all LCB set-up is done. In particular, after + * hf1_init_dd() -> set_up_interrupts() -> clear_all_interrupts() + */ +static void init_lcb_access(struct hfi1_devdata *dd) +{ + dd->lcb_access_count = 0; +} + +/* + * Write a response back to a 8051 request. + */ +static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data) +{ + write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, + DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK + | (u64)return_code << DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT + | (u64)rsp_data << DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT); +} + +/* + * Handle requests from the 8051. + */ +static void handle_8051_request(struct hfi1_devdata *dd) +{ + u64 reg; + u16 data; + u8 type; + + reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1); + if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0) + return; /* no request */ + + /* zero out COMPLETED so the response is seen */ + write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, 0); + + /* extract request details */ + type = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT) + & DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK; + data = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT) + & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK; + + switch (type) { + case HREQ_LOAD_CONFIG: + case HREQ_SAVE_CONFIG: + case HREQ_READ_CONFIG: + case HREQ_SET_TX_EQ_ABS: + case HREQ_SET_TX_EQ_REL: + case HREQ_ENABLE: + dd_dev_info(dd, "8051 request: request 0x%x not supported\n", + type); + hreq_response(dd, HREQ_NOT_SUPPORTED, 0); + break; + + case HREQ_CONFIG_DONE: + hreq_response(dd, HREQ_SUCCESS, 0); + break; + + case HREQ_INTERFACE_TEST: + hreq_response(dd, HREQ_SUCCESS, data); + break; + + default: + dd_dev_err(dd, "8051 request: unknown request 0x%x\n", type); + hreq_response(dd, HREQ_NOT_SUPPORTED, 0); + break; + } +} + +static void write_global_credit(struct hfi1_devdata *dd, + u8 vau, u16 total, u16 shared) +{ + write_csr(dd, SEND_CM_GLOBAL_CREDIT, + ((u64)total + << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT) + | ((u64)shared + << SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT) + | ((u64)vau << SEND_CM_GLOBAL_CREDIT_AU_SHIFT)); +} + +/* + * Set up initial VL15 credits of the remote. Assumes the rest of + * the CM credit registers are zero from a previous global or credit reset . + */ +void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf) +{ + /* leave shared count at zero for both global and VL15 */ + write_global_credit(dd, vau, vl15buf, 0); + + /* We may need some credits for another VL when sending packets + * with the snoop interface. Dividing it down the middle for VL15 + * and VL0 should suffice. + */ + if (unlikely(dd->hfi1_snoop.mode_flag == HFI1_PORT_SNOOP_MODE)) { + write_csr(dd, SEND_CM_CREDIT_VL15, (u64)(vl15buf >> 1) + << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT); + write_csr(dd, SEND_CM_CREDIT_VL, (u64)(vl15buf >> 1) + << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT); + } else { + write_csr(dd, SEND_CM_CREDIT_VL15, (u64)vl15buf + << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT); + } +} + +/* + * Zero all credit details from the previous connection and + * reset the CM manager's internal counters. + */ +void reset_link_credits(struct hfi1_devdata *dd) +{ + int i; + + /* remove all previous VL credit limits */ + for (i = 0; i < TXE_NUM_DATA_VL; i++) + write_csr(dd, SEND_CM_CREDIT_VL + (8*i), 0); + write_csr(dd, SEND_CM_CREDIT_VL15, 0); + write_global_credit(dd, 0, 0, 0); + /* reset the CM block */ + pio_send_control(dd, PSC_CM_RESET); +} + +/* convert a vCU to a CU */ +static u32 vcu_to_cu(u8 vcu) +{ + return 1 << vcu; +} + +/* convert a CU to a vCU */ +static u8 cu_to_vcu(u32 cu) +{ + return ilog2(cu); +} + +/* convert a vAU to an AU */ +static u32 vau_to_au(u8 vau) +{ + return 8 * (1 << vau); +} + +static void set_linkup_defaults(struct hfi1_pportdata *ppd) +{ + ppd->sm_trap_qp = 0x0; + ppd->sa_qp = 0x1; +} + +/* + * Graceful LCB shutdown. This leaves the LCB FIFOs in reset. + */ +static void lcb_shutdown(struct hfi1_devdata *dd, int abort) +{ + u64 reg; + + /* clear lcb run: LCB_CFG_RUN.EN = 0 */ + write_csr(dd, DC_LCB_CFG_RUN, 0); + /* set tx fifo reset: LCB_CFG_TX_FIFOS_RESET.VAL = 1 */ + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, + 1ull << DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT); + /* set dcc reset csr: DCC_CFG_RESET.{reset_lcb,reset_rx_fpe} = 1 */ + dd->lcb_err_en = read_csr(dd, DC_LCB_ERR_EN); + reg = read_csr(dd, DCC_CFG_RESET); + write_csr(dd, DCC_CFG_RESET, + reg + | (1ull << DCC_CFG_RESET_RESET_LCB_SHIFT) + | (1ull << DCC_CFG_RESET_RESET_RX_FPE_SHIFT)); + (void) read_csr(dd, DCC_CFG_RESET); /* make sure the write completed */ + if (!abort) { + udelay(1); /* must hold for the longer of 16cclks or 20ns */ + write_csr(dd, DCC_CFG_RESET, reg); + write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en); + } +} + +/* + * This routine should be called after the link has been transitioned to + * OFFLINE (OFFLINE state has the side effect of putting the SerDes into + * reset). + * + * The expectation is that the caller of this routine would have taken + * care of properly transitioning the link into the correct state. + */ +static void dc_shutdown(struct hfi1_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->dc8051_lock, flags); + if (dd->dc_shutdown) { + spin_unlock_irqrestore(&dd->dc8051_lock, flags); + return; + } + dd->dc_shutdown = 1; + spin_unlock_irqrestore(&dd->dc8051_lock, flags); + /* Shutdown the LCB */ + lcb_shutdown(dd, 1); + /* Going to OFFLINE would have causes the 8051 to put the + * SerDes into reset already. Just need to shut down the 8051, + * itself. */ + write_csr(dd, DC_DC8051_CFG_RST, 0x1); +} + +/* Calling this after the DC has been brought out of reset should not + * do any damage. */ +static void dc_start(struct hfi1_devdata *dd) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&dd->dc8051_lock, flags); + if (!dd->dc_shutdown) + goto done; + spin_unlock_irqrestore(&dd->dc8051_lock, flags); + /* Take the 8051 out of reset */ + write_csr(dd, DC_DC8051_CFG_RST, 0ull); + /* Wait until 8051 is ready */ + ret = wait_fm_ready(dd, TIMEOUT_8051_START); + if (ret) { + dd_dev_err(dd, "%s: timeout starting 8051 firmware\n", + __func__); + } + /* Take away reset for LCB and RX FPE (set in lcb_shutdown). */ + write_csr(dd, DCC_CFG_RESET, 0x10); + /* lcb_shutdown() with abort=1 does not restore these */ + write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en); + spin_lock_irqsave(&dd->dc8051_lock, flags); + dd->dc_shutdown = 0; +done: + spin_unlock_irqrestore(&dd->dc8051_lock, flags); +} + +/* + * These LCB adjustments are for the Aurora SerDes core in the FPGA. + */ +static void adjust_lcb_for_fpga_serdes(struct hfi1_devdata *dd) +{ + u64 rx_radr, tx_radr; + u32 version; + + if (dd->icode != ICODE_FPGA_EMULATION) + return; + + /* + * These LCB defaults on emulator _s are good, nothing to do here: + * LCB_CFG_TX_FIFOS_RADR + * LCB_CFG_RX_FIFOS_RADR + * LCB_CFG_LN_DCLK + * LCB_CFG_IGNORE_LOST_RCLK + */ + if (is_emulator_s(dd)) + return; + /* else this is _p */ + + version = emulator_rev(dd); + if (!is_a0(dd)) + version = 0x2d; /* all B0 use 0x2d or higher settings */ + + if (version <= 0x12) { + /* release 0x12 and below */ + + /* + * LCB_CFG_RX_FIFOS_RADR.RST_VAL = 0x9 + * LCB_CFG_RX_FIFOS_RADR.OK_TO_JUMP_VAL = 0x9 + * LCB_CFG_RX_FIFOS_RADR.DO_NOT_JUMP_VAL = 0xa + */ + rx_radr = + 0xaull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT + | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT + | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT; + /* + * LCB_CFG_TX_FIFOS_RADR.ON_REINIT = 0 (default) + * LCB_CFG_TX_FIFOS_RADR.RST_VAL = 6 + */ + tx_radr = 6ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT; + } else if (version <= 0x18) { + /* release 0x13 up to 0x18 */ + /* LCB_CFG_RX_FIFOS_RADR = 0x988 */ + rx_radr = + 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT + | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT + | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT; + tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT; + } else if (version == 0x19) { + /* release 0x19 */ + /* LCB_CFG_RX_FIFOS_RADR = 0xa99 */ + rx_radr = + 0xAull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT + | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT + | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT; + tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT; + } else if (version == 0x1a) { + /* release 0x1a */ + /* LCB_CFG_RX_FIFOS_RADR = 0x988 */ + rx_radr = + 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT + | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT + | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT; + tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT; + write_csr(dd, DC_LCB_CFG_LN_DCLK, 1ull); + } else { + /* release 0x1b and higher */ + /* LCB_CFG_RX_FIFOS_RADR = 0x877 */ + rx_radr = + 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT + | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT + | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT; + tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT; + } + + write_csr(dd, DC_LCB_CFG_RX_FIFOS_RADR, rx_radr); + /* LCB_CFG_IGNORE_LOST_RCLK.EN = 1 */ + write_csr(dd, DC_LCB_CFG_IGNORE_LOST_RCLK, + DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK); + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RADR, tx_radr); +} + +/* + * Handle a SMA idle message + * + * This is a work-queue function outside of the interrupt. + */ +void handle_sma_message(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + sma_message_work); + struct hfi1_devdata *dd = ppd->dd; + u64 msg; + int ret; + + /* msg is bytes 1-4 of the 40-bit idle message - the command code + is stripped off */ + ret = read_idle_sma(dd, &msg); + if (ret) + return; + dd_dev_info(dd, "%s: SMA message 0x%llx\n", __func__, msg); + /* + * React to the SMA message. Byte[1] (0 for us) is the command. + */ + switch (msg & 0xff) { + case SMA_IDLE_ARM: + /* + * See OPAv1 table 9-14 - HFI and External Switch Ports Key + * State Transitions + * + * Only expected in INIT or ARMED, discard otherwise. + */ + if (ppd->host_link_state & (HLS_UP_INIT | HLS_UP_ARMED)) + ppd->neighbor_normal = 1; + break; + case SMA_IDLE_ACTIVE: + /* + * See OPAv1 table 9-14 - HFI and External Switch Ports Key + * State Transitions + * + * Can activate the node. Discard otherwise. + */ + if (ppd->host_link_state == HLS_UP_ARMED + && ppd->is_active_optimize_enabled) { + ppd->neighbor_normal = 1; + ret = set_link_state(ppd, HLS_UP_ACTIVE); + if (ret) + dd_dev_err( + dd, + "%s: received Active SMA idle message, couldn't set link to Active\n", + __func__); + } + break; + default: + dd_dev_err(dd, + "%s: received unexpected SMA idle message 0x%llx\n", + __func__, msg); + break; + } +} + +static void adjust_rcvctrl(struct hfi1_devdata *dd, u64 add, u64 clear) +{ + u64 rcvctrl; + unsigned long flags; + + spin_lock_irqsave(&dd->rcvctrl_lock, flags); + rcvctrl = read_csr(dd, RCV_CTRL); + rcvctrl |= add; + rcvctrl &= ~clear; + write_csr(dd, RCV_CTRL, rcvctrl); + spin_unlock_irqrestore(&dd->rcvctrl_lock, flags); +} + +static inline void add_rcvctrl(struct hfi1_devdata *dd, u64 add) +{ + adjust_rcvctrl(dd, add, 0); +} + +static inline void clear_rcvctrl(struct hfi1_devdata *dd, u64 clear) +{ + adjust_rcvctrl(dd, 0, clear); +} + +/* + * Called from all interrupt handlers to start handling an SPC freeze. + */ +void start_freeze_handling(struct hfi1_pportdata *ppd, int flags) +{ + struct hfi1_devdata *dd = ppd->dd; + struct send_context *sc; + int i; + + if (flags & FREEZE_SELF) + write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK); + + /* enter frozen mode */ + dd->flags |= HFI1_FROZEN; + + /* notify all SDMA engines that they are going into a freeze */ + sdma_freeze_notify(dd, !!(flags & FREEZE_LINK_DOWN)); + + /* do halt pre-handling on all enabled send contexts */ + for (i = 0; i < dd->num_send_contexts; i++) { + sc = dd->send_contexts[i].sc; + if (sc && (sc->flags & SCF_ENABLED)) + sc_stop(sc, SCF_FROZEN | SCF_HALTED); + } + + /* Send context are frozen. Notify user space */ + hfi1_set_uevent_bits(ppd, _HFI1_EVENT_FROZEN_BIT); + + if (flags & FREEZE_ABORT) { + dd_dev_err(dd, + "Aborted freeze recovery. Please REBOOT system\n"); + return; + } + /* queue non-interrupt handler */ + queue_work(ppd->hfi1_wq, &ppd->freeze_work); +} + +/* + * Wait until all 4 sub-blocks indicate that they have frozen or unfrozen, + * depending on the "freeze" parameter. + * + * No need to return an error if it times out, our only option + * is to proceed anyway. + */ +static void wait_for_freeze_status(struct hfi1_devdata *dd, int freeze) +{ + unsigned long timeout; + u64 reg; + + timeout = jiffies + msecs_to_jiffies(FREEZE_STATUS_TIMEOUT); + while (1) { + reg = read_csr(dd, CCE_STATUS); + if (freeze) { + /* waiting until all indicators are set */ + if ((reg & ALL_FROZE) == ALL_FROZE) + return; /* all done */ + } else { + /* waiting until all indicators are clear */ + if ((reg & ALL_FROZE) == 0) + return; /* all done */ + } + + if (time_after(jiffies, timeout)) { + dd_dev_err(dd, + "Time out waiting for SPC %sfreeze, bits 0x%llx, expecting 0x%llx, continuing", + freeze ? "" : "un", + reg & ALL_FROZE, + freeze ? ALL_FROZE : 0ull); + return; + } + usleep_range(80, 120); + } +} + +/* + * Do all freeze handling for the RXE block. + */ +static void rxe_freeze(struct hfi1_devdata *dd) +{ + int i; + + /* disable port */ + clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); + + /* disable all receive contexts */ + for (i = 0; i < dd->num_rcv_contexts; i++) + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS, i); +} + +/* + * Unfreeze handling for the RXE block - kernel contexts only. + * This will also enable the port. User contexts will do unfreeze + * handling on a per-context basis as they call into the driver. + * + */ +static void rxe_kernel_unfreeze(struct hfi1_devdata *dd) +{ + int i; + + /* enable all kernel contexts */ + for (i = 0; i < dd->n_krcv_queues; i++) + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB, i); + + /* enable port */ + add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); +} + +/* + * Non-interrupt SPC freeze handling. + * + * This is a work-queue function outside of the triggering interrupt. + */ +void handle_freeze(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + freeze_work); + struct hfi1_devdata *dd = ppd->dd; + + /* wait for freeze indicators on all affected blocks */ + dd_dev_info(dd, "Entering SPC freeze\n"); + wait_for_freeze_status(dd, 1); + + /* SPC is now frozen */ + + /* do send PIO freeze steps */ + pio_freeze(dd); + + /* do send DMA freeze steps */ + sdma_freeze(dd); + + /* do send egress freeze steps - nothing to do */ + + /* do receive freeze steps */ + rxe_freeze(dd); + + /* + * Unfreeze the hardware - clear the freeze, wait for each + * block's frozen bit to clear, then clear the frozen flag. + */ + write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK); + wait_for_freeze_status(dd, 0); + + if (is_a0(dd)) { + write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK); + wait_for_freeze_status(dd, 1); + write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK); + wait_for_freeze_status(dd, 0); + } + + /* do send PIO unfreeze steps for kernel contexts */ + pio_kernel_unfreeze(dd); + + /* do send DMA unfreeze steps */ + sdma_unfreeze(dd); + + /* do send egress unfreeze steps - nothing to do */ + + /* do receive unfreeze steps for kernel contexts */ + rxe_kernel_unfreeze(dd); + + /* + * The unfreeze procedure touches global device registers when + * it disables and re-enables RXE. Mark the device unfrozen + * after all that is done so other parts of the driver waiting + * for the device to unfreeze don't do things out of order. + * + * The above implies that the meaning of HFI1_FROZEN flag is + * "Device has gone into freeze mode and freeze mode handling + * is still in progress." + * + * The flag will be removed when freeze mode processing has + * completed. + */ + dd->flags &= ~HFI1_FROZEN; + wake_up(&dd->event_queue); + + /* no longer frozen */ + dd_dev_err(dd, "Exiting SPC freeze\n"); +} + +/* + * Handle a link up interrupt from the 8051. + * + * This is a work-queue function outside of the interrupt. + */ +void handle_link_up(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + link_up_work); + set_link_state(ppd, HLS_UP_INIT); + + /* cache the read of DC_LCB_STS_ROUND_TRIP_LTP_CNT */ + read_ltp_rtt(ppd->dd); + /* + * OPA specifies that certain counters are cleared on a transition + * to link up, so do that. + */ + clear_linkup_counters(ppd->dd); + /* + * And (re)set link up default values. + */ + set_linkup_defaults(ppd); + + /* enforce link speed enabled */ + if ((ppd->link_speed_active & ppd->link_speed_enabled) == 0) { + /* oops - current speed is not enabled, bounce */ + dd_dev_err(ppd->dd, + "Link speed active 0x%x is outside enabled 0x%x, downing link\n", + ppd->link_speed_active, ppd->link_speed_enabled); + set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SPEED_POLICY, 0, + OPA_LINKDOWN_REASON_SPEED_POLICY); + set_link_state(ppd, HLS_DN_OFFLINE); + start_link(ppd); + } +} + +/* Several pieces of LNI information were cached for SMA in ppd. + * Reset these on link down */ +static void reset_neighbor_info(struct hfi1_pportdata *ppd) +{ + ppd->neighbor_guid = 0; + ppd->neighbor_port_number = 0; + ppd->neighbor_type = 0; + ppd->neighbor_fm_security = 0; +} + +/* + * Handle a link down interrupt from the 8051. + * + * This is a work-queue function outside of the interrupt. + */ +void handle_link_down(struct work_struct *work) +{ + u8 lcl_reason, neigh_reason = 0; + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + link_down_work); + + /* go offline first, then deal with reasons */ + set_link_state(ppd, HLS_DN_OFFLINE); + + lcl_reason = 0; + read_planned_down_reason_code(ppd->dd, &neigh_reason); + + /* + * If no reason, assume peer-initiated but missed + * LinkGoingDown idle flits. + */ + if (neigh_reason == 0) + lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN; + + set_link_down_reason(ppd, lcl_reason, neigh_reason, 0); + + reset_neighbor_info(ppd); + + /* disable the port */ + clear_rcvctrl(ppd->dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); + + /* If there is no cable attached, turn the DC off. Otherwise, + * start the link bring up. */ + if (!qsfp_mod_present(ppd)) + dc_shutdown(ppd->dd); + else + start_link(ppd); +} + +void handle_link_bounce(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + link_bounce_work); + + /* + * Only do something if the link is currently up. + */ + if (ppd->host_link_state & HLS_UP) { + set_link_state(ppd, HLS_DN_OFFLINE); + start_link(ppd); + } else { + dd_dev_info(ppd->dd, "%s: link not up (%s), nothing to do\n", + __func__, link_state_name(ppd->host_link_state)); + } +} + +/* + * Mask conversion: Capability exchange to Port LTP. The capability + * exchange has an implicit 16b CRC that is mandatory. + */ +static int cap_to_port_ltp(int cap) +{ + int port_ltp = PORT_LTP_CRC_MODE_16; /* this mode is mandatory */ + + if (cap & CAP_CRC_14B) + port_ltp |= PORT_LTP_CRC_MODE_14; + if (cap & CAP_CRC_48B) + port_ltp |= PORT_LTP_CRC_MODE_48; + if (cap & CAP_CRC_12B_16B_PER_LANE) + port_ltp |= PORT_LTP_CRC_MODE_PER_LANE; + + return port_ltp; +} + +/* + * Convert an OPA Port LTP mask to capability mask + */ +int port_ltp_to_cap(int port_ltp) +{ + int cap_mask = 0; + + if (port_ltp & PORT_LTP_CRC_MODE_14) + cap_mask |= CAP_CRC_14B; + if (port_ltp & PORT_LTP_CRC_MODE_48) + cap_mask |= CAP_CRC_48B; + if (port_ltp & PORT_LTP_CRC_MODE_PER_LANE) + cap_mask |= CAP_CRC_12B_16B_PER_LANE; + + return cap_mask; +} + +/* + * Convert a single DC LCB CRC mode to an OPA Port LTP mask. + */ +static int lcb_to_port_ltp(int lcb_crc) +{ + int port_ltp = 0; + + if (lcb_crc == LCB_CRC_12B_16B_PER_LANE) + port_ltp = PORT_LTP_CRC_MODE_PER_LANE; + else if (lcb_crc == LCB_CRC_48B) + port_ltp = PORT_LTP_CRC_MODE_48; + else if (lcb_crc == LCB_CRC_14B) + port_ltp = PORT_LTP_CRC_MODE_14; + else + port_ltp = PORT_LTP_CRC_MODE_16; + + return port_ltp; +} + +/* + * Our neighbor has indicated that we are allowed to act as a fabric + * manager, so place the full management partition key in the second + * (0-based) pkey array position (see OPAv1, section 20.2.2.6.8). Note + * that we should already have the limited management partition key in + * array element 1, and also that the port is not yet up when + * add_full_mgmt_pkey() is invoked. + */ +static void add_full_mgmt_pkey(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + + /* Sanity check - ppd->pkeys[2] should be 0 */ + if (ppd->pkeys[2] != 0) + dd_dev_err(dd, "%s pkey[2] already set to 0x%x, resetting it to 0x%x\n", + __func__, ppd->pkeys[2], FULL_MGMT_P_KEY); + ppd->pkeys[2] = FULL_MGMT_P_KEY; + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0); +} + +/* + * Convert the given link width to the OPA link width bitmask. + */ +static u16 link_width_to_bits(struct hfi1_devdata *dd, u16 width) +{ + switch (width) { + case 0: + /* + * Simulator and quick linkup do not set the width. + * Just set it to 4x without complaint. + */ + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR || quick_linkup) + return OPA_LINK_WIDTH_4X; + return 0; /* no lanes up */ + case 1: return OPA_LINK_WIDTH_1X; + case 2: return OPA_LINK_WIDTH_2X; + case 3: return OPA_LINK_WIDTH_3X; + default: + dd_dev_info(dd, "%s: invalid width %d, using 4\n", + __func__, width); + /* fall through */ + case 4: return OPA_LINK_WIDTH_4X; + } +} + +/* + * Do a population count on the bottom nibble. + */ +static const u8 bit_counts[16] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 +}; +static inline u8 nibble_to_count(u8 nibble) +{ + return bit_counts[nibble & 0xf]; +} + +/* + * Read the active lane information from the 8051 registers and return + * their widths. + * + * Active lane information is found in these 8051 registers: + * enable_lane_tx + * enable_lane_rx + */ +static void get_link_widths(struct hfi1_devdata *dd, u16 *tx_width, + u16 *rx_width) +{ + u16 tx, rx; + u8 enable_lane_rx; + u8 enable_lane_tx; + u8 tx_polarity_inversion; + u8 rx_polarity_inversion; + u8 max_rate; + + /* read the active lanes */ + read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion, + &rx_polarity_inversion, &max_rate); + read_local_lni(dd, &enable_lane_rx); + + /* convert to counts */ + tx = nibble_to_count(enable_lane_tx); + rx = nibble_to_count(enable_lane_rx); + + /* + * Set link_speed_active here, overriding what was set in + * handle_verify_cap(). The ASIC 8051 firmware does not correctly + * set the max_rate field in handle_verify_cap until v0.19. + */ + if ((dd->icode == ICODE_RTL_SILICON) + && (dd->dc8051_ver < dc8051_ver(0, 19))) { + /* max_rate: 0 = 12.5G, 1 = 25G */ + switch (max_rate) { + case 0: + dd->pport[0].link_speed_active = OPA_LINK_SPEED_12_5G; + break; + default: + dd_dev_err(dd, + "%s: unexpected max rate %d, using 25Gb\n", + __func__, (int)max_rate); + /* fall through */ + case 1: + dd->pport[0].link_speed_active = OPA_LINK_SPEED_25G; + break; + } + } + + dd_dev_info(dd, + "Fabric active lanes (width): tx 0x%x (%d), rx 0x%x (%d)\n", + enable_lane_tx, tx, enable_lane_rx, rx); + *tx_width = link_width_to_bits(dd, tx); + *rx_width = link_width_to_bits(dd, rx); +} + +/* + * Read verify_cap_local_fm_link_width[1] to obtain the link widths. + * Valid after the end of VerifyCap and during LinkUp. Does not change + * after link up. I.e. look elsewhere for downgrade information. + * + * Bits are: + * + bits [7:4] contain the number of active transmitters + * + bits [3:0] contain the number of active receivers + * These are numbers 1 through 4 and can be different values if the + * link is asymmetric. + * + * verify_cap_local_fm_link_width[0] retains its original value. + */ +static void get_linkup_widths(struct hfi1_devdata *dd, u16 *tx_width, + u16 *rx_width) +{ + u16 widths, tx, rx; + u8 misc_bits, local_flags; + u16 active_tx, active_rx; + + read_vc_local_link_width(dd, &misc_bits, &local_flags, &widths); + tx = widths >> 12; + rx = (widths >> 8) & 0xf; + + *tx_width = link_width_to_bits(dd, tx); + *rx_width = link_width_to_bits(dd, rx); + + /* print the active widths */ + get_link_widths(dd, &active_tx, &active_rx); +} + +/* + * Set ppd->link_width_active and ppd->link_width_downgrade_active using + * hardware information when the link first comes up. + * + * The link width is not available until after VerifyCap.AllFramesReceived + * (the trigger for handle_verify_cap), so this is outside that routine + * and should be called when the 8051 signals linkup. + */ +void get_linkup_link_widths(struct hfi1_pportdata *ppd) +{ + u16 tx_width, rx_width; + + /* get end-of-LNI link widths */ + get_linkup_widths(ppd->dd, &tx_width, &rx_width); + + /* use tx_width as the link is supposed to be symmetric on link up */ + ppd->link_width_active = tx_width; + /* link width downgrade active (LWD.A) starts out matching LW.A */ + ppd->link_width_downgrade_tx_active = ppd->link_width_active; + ppd->link_width_downgrade_rx_active = ppd->link_width_active; + /* per OPA spec, on link up LWD.E resets to LWD.S */ + ppd->link_width_downgrade_enabled = ppd->link_width_downgrade_supported; + /* cache the active egress rate (units {10^6 bits/sec]) */ + ppd->current_egress_rate = active_egress_rate(ppd); +} + +/* + * Handle a verify capabilities interrupt from the 8051. + * + * This is a work-queue function outside of the interrupt. + */ +void handle_verify_cap(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + link_vc_work); + struct hfi1_devdata *dd = ppd->dd; + u64 reg; + u8 power_management; + u8 continious; + u8 vcu; + u8 vau; + u8 z; + u16 vl15buf; + u16 link_widths; + u16 crc_mask; + u16 crc_val; + u16 device_id; + u16 active_tx, active_rx; + u8 partner_supported_crc; + u8 remote_tx_rate; + u8 device_rev; + + set_link_state(ppd, HLS_VERIFY_CAP); + + lcb_shutdown(dd, 0); + adjust_lcb_for_fpga_serdes(dd); + + /* + * These are now valid: + * remote VerifyCap fields in the general LNI config + * CSR DC8051_STS_REMOTE_GUID + * CSR DC8051_STS_REMOTE_NODE_TYPE + * CSR DC8051_STS_REMOTE_FM_SECURITY + * CSR DC8051_STS_REMOTE_PORT_NO + */ + + read_vc_remote_phy(dd, &power_management, &continious); + read_vc_remote_fabric( + dd, + &vau, + &z, + &vcu, + &vl15buf, + &partner_supported_crc); + read_vc_remote_link_width(dd, &remote_tx_rate, &link_widths); + read_remote_device_id(dd, &device_id, &device_rev); + /* + * And the 'MgmtAllowed' information, which is exchanged during + * LNI, is also be available at this point. + */ + read_mgmt_allowed(dd, &ppd->mgmt_allowed); + /* print the active widths */ + get_link_widths(dd, &active_tx, &active_rx); + dd_dev_info(dd, + "Peer PHY: power management 0x%x, continuous updates 0x%x\n", + (int)power_management, (int)continious); + dd_dev_info(dd, + "Peer Fabric: vAU %d, Z %d, vCU %d, vl15 credits 0x%x, CRC sizes 0x%x\n", + (int)vau, + (int)z, + (int)vcu, + (int)vl15buf, + (int)partner_supported_crc); + dd_dev_info(dd, "Peer Link Width: tx rate 0x%x, widths 0x%x\n", + (u32)remote_tx_rate, (u32)link_widths); + dd_dev_info(dd, "Peer Device ID: 0x%04x, Revision 0x%02x\n", + (u32)device_id, (u32)device_rev); + /* + * The peer vAU value just read is the peer receiver value. HFI does + * not support a transmit vAU of 0 (AU == 8). We advertised that + * with Z=1 in the fabric capabilities sent to the peer. The peer + * will see our Z=1, and, if it advertised a vAU of 0, will move its + * receive to vAU of 1 (AU == 16). Do the same here. We do not care + * about the peer Z value - our sent vAU is 3 (hardwired) and is not + * subject to the Z value exception. + */ + if (vau == 0) + vau = 1; + set_up_vl15(dd, vau, vl15buf); + + /* set up the LCB CRC mode */ + crc_mask = ppd->port_crc_mode_enabled & partner_supported_crc; + + /* order is important: use the lowest bit in common */ + if (crc_mask & CAP_CRC_14B) + crc_val = LCB_CRC_14B; + else if (crc_mask & CAP_CRC_48B) + crc_val = LCB_CRC_48B; + else if (crc_mask & CAP_CRC_12B_16B_PER_LANE) + crc_val = LCB_CRC_12B_16B_PER_LANE; + else + crc_val = LCB_CRC_16B; + + dd_dev_info(dd, "Final LCB CRC mode: %d\n", (int)crc_val); + write_csr(dd, DC_LCB_CFG_CRC_MODE, + (u64)crc_val << DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT); + + /* set (14b only) or clear sideband credit */ + reg = read_csr(dd, SEND_CM_CTRL); + if (crc_val == LCB_CRC_14B && crc_14b_sideband) { + write_csr(dd, SEND_CM_CTRL, + reg | SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK); + } else { + write_csr(dd, SEND_CM_CTRL, + reg & ~SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK); + } + + ppd->link_speed_active = 0; /* invalid value */ + if (dd->dc8051_ver < dc8051_ver(0, 20)) { + /* remote_tx_rate: 0 = 12.5G, 1 = 25G */ + switch (remote_tx_rate) { + case 0: + ppd->link_speed_active = OPA_LINK_SPEED_12_5G; + break; + case 1: + ppd->link_speed_active = OPA_LINK_SPEED_25G; + break; + } + } else { + /* actual rate is highest bit of the ANDed rates */ + u8 rate = remote_tx_rate & ppd->local_tx_rate; + + if (rate & 2) + ppd->link_speed_active = OPA_LINK_SPEED_25G; + else if (rate & 1) + ppd->link_speed_active = OPA_LINK_SPEED_12_5G; + } + if (ppd->link_speed_active == 0) { + dd_dev_err(dd, "%s: unexpected remote tx rate %d, using 25Gb\n", + __func__, (int)remote_tx_rate); + ppd->link_speed_active = OPA_LINK_SPEED_25G; + } + + /* + * Cache the values of the supported, enabled, and active + * LTP CRC modes to return in 'portinfo' queries. But the bit + * flags that are returned in the portinfo query differ from + * what's in the link_crc_mask, crc_sizes, and crc_val + * variables. Convert these here. + */ + ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8; + /* supported crc modes */ + ppd->port_ltp_crc_mode |= + cap_to_port_ltp(ppd->port_crc_mode_enabled) << 4; + /* enabled crc modes */ + ppd->port_ltp_crc_mode |= lcb_to_port_ltp(crc_val); + /* active crc mode */ + + /* set up the remote credit return table */ + assign_remote_cm_au_table(dd, vcu); + + /* + * The LCB is reset on entry to handle_verify_cap(), so this must + * be applied on every link up. + * + * Adjust LCB error kill enable to kill the link if + * these RBUF errors are seen: + * REPLAY_BUF_MBE_SMASK + * FLIT_INPUT_BUF_MBE_SMASK + */ + if (is_a0(dd)) { /* fixed in B0 */ + reg = read_csr(dd, DC_LCB_CFG_LINK_KILL_EN); + reg |= DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK + | DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK; + write_csr(dd, DC_LCB_CFG_LINK_KILL_EN, reg); + } + + /* pull LCB fifos out of reset - all fifo clocks must be stable */ + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0); + + /* give 8051 access to the LCB CSRs */ + write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */ + set_8051_lcb_access(dd); + + ppd->neighbor_guid = + read_csr(dd, DC_DC8051_STS_REMOTE_GUID); + ppd->neighbor_port_number = read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) & + DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK; + ppd->neighbor_type = + read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) & + DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK; + ppd->neighbor_fm_security = + read_csr(dd, DC_DC8051_STS_REMOTE_FM_SECURITY) & + DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK; + dd_dev_info(dd, + "Neighbor Guid: %llx Neighbor type %d MgmtAllowed %d FM security bypass %d\n", + ppd->neighbor_guid, ppd->neighbor_type, + ppd->mgmt_allowed, ppd->neighbor_fm_security); + if (ppd->mgmt_allowed) + add_full_mgmt_pkey(ppd); + + /* tell the 8051 to go to LinkUp */ + set_link_state(ppd, HLS_GOING_UP); +} + +/* + * Apply the link width downgrade enabled policy against the current active + * link widths. + * + * Called when the enabled policy changes or the active link widths change. + */ +void apply_link_downgrade_policy(struct hfi1_pportdata *ppd, int refresh_widths) +{ + int skip = 1; + int do_bounce = 0; + u16 lwde = ppd->link_width_downgrade_enabled; + u16 tx, rx; + + mutex_lock(&ppd->hls_lock); + /* only apply if the link is up */ + if (ppd->host_link_state & HLS_UP) + skip = 0; + mutex_unlock(&ppd->hls_lock); + if (skip) + return; + + if (refresh_widths) { + get_link_widths(ppd->dd, &tx, &rx); + ppd->link_width_downgrade_tx_active = tx; + ppd->link_width_downgrade_rx_active = rx; + } + + if (lwde == 0) { + /* downgrade is disabled */ + + /* bounce if not at starting active width */ + if ((ppd->link_width_active != + ppd->link_width_downgrade_tx_active) + || (ppd->link_width_active != + ppd->link_width_downgrade_rx_active)) { + dd_dev_err(ppd->dd, + "Link downgrade is disabled and link has downgraded, downing link\n"); + dd_dev_err(ppd->dd, + " original 0x%x, tx active 0x%x, rx active 0x%x\n", + ppd->link_width_active, + ppd->link_width_downgrade_tx_active, + ppd->link_width_downgrade_rx_active); + do_bounce = 1; + } + } else if ((lwde & ppd->link_width_downgrade_tx_active) == 0 + || (lwde & ppd->link_width_downgrade_rx_active) == 0) { + /* Tx or Rx is outside the enabled policy */ + dd_dev_err(ppd->dd, + "Link is outside of downgrade allowed, downing link\n"); + dd_dev_err(ppd->dd, + " enabled 0x%x, tx active 0x%x, rx active 0x%x\n", + lwde, + ppd->link_width_downgrade_tx_active, + ppd->link_width_downgrade_rx_active); + do_bounce = 1; + } + + if (do_bounce) { + set_link_down_reason(ppd, OPA_LINKDOWN_REASON_WIDTH_POLICY, 0, + OPA_LINKDOWN_REASON_WIDTH_POLICY); + set_link_state(ppd, HLS_DN_OFFLINE); + start_link(ppd); + } +} + +/* + * Handle a link downgrade interrupt from the 8051. + * + * This is a work-queue function outside of the interrupt. + */ +void handle_link_downgrade(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + link_downgrade_work); + + dd_dev_info(ppd->dd, "8051: Link width downgrade\n"); + apply_link_downgrade_policy(ppd, 1); +} + +static char *dcc_err_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, dcc_err_flags, + ARRAY_SIZE(dcc_err_flags)); +} + +static char *lcb_err_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, lcb_err_flags, + ARRAY_SIZE(lcb_err_flags)); +} + +static char *dc8051_err_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, dc8051_err_flags, + ARRAY_SIZE(dc8051_err_flags)); +} + +static char *dc8051_info_err_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, dc8051_info_err_flags, + ARRAY_SIZE(dc8051_info_err_flags)); +} + +static char *dc8051_info_host_msg_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, dc8051_info_host_msg_flags, + ARRAY_SIZE(dc8051_info_host_msg_flags)); +} + +static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + struct hfi1_pportdata *ppd = dd->pport; + u64 info, err, host_msg; + int queue_link_down = 0; + char buf[96]; + + /* look at the flags */ + if (reg & DC_DC8051_ERR_FLG_SET_BY_8051_SMASK) { + /* 8051 information set by firmware */ + /* read DC8051_DBG_ERR_INFO_SET_BY_8051 for details */ + info = read_csr(dd, DC_DC8051_DBG_ERR_INFO_SET_BY_8051); + err = (info >> DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT) + & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK; + host_msg = (info >> + DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT) + & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK; + + /* + * Handle error flags. + */ + if (err & FAILED_LNI) { + /* + * LNI error indications are cleared by the 8051 + * only when starting polling. Only pay attention + * to them when in the states that occur during + * LNI. + */ + if (ppd->host_link_state + & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) { + queue_link_down = 1; + dd_dev_info(dd, "Link error: %s\n", + dc8051_info_err_string(buf, + sizeof(buf), + err & FAILED_LNI)); + } + err &= ~(u64)FAILED_LNI; + } + if (err) { + /* report remaining errors, but do not do anything */ + dd_dev_err(dd, "8051 info error: %s\n", + dc8051_info_err_string(buf, sizeof(buf), err)); + } + + /* + * Handle host message flags. + */ + if (host_msg & HOST_REQ_DONE) { + /* + * Presently, the driver does a busy wait for + * host requests to complete. This is only an + * informational message. + * NOTE: The 8051 clears the host message + * information *on the next 8051 command*. + * Therefore, when linkup is achieved, + * this flag will still be set. + */ + host_msg &= ~(u64)HOST_REQ_DONE; + } + if (host_msg & BC_SMA_MSG) { + queue_work(ppd->hfi1_wq, &ppd->sma_message_work); + host_msg &= ~(u64)BC_SMA_MSG; + } + if (host_msg & LINKUP_ACHIEVED) { + dd_dev_info(dd, "8051: Link up\n"); + queue_work(ppd->hfi1_wq, &ppd->link_up_work); + host_msg &= ~(u64)LINKUP_ACHIEVED; + } + if (host_msg & EXT_DEVICE_CFG_REQ) { + handle_8051_request(dd); + host_msg &= ~(u64)EXT_DEVICE_CFG_REQ; + } + if (host_msg & VERIFY_CAP_FRAME) { + queue_work(ppd->hfi1_wq, &ppd->link_vc_work); + host_msg &= ~(u64)VERIFY_CAP_FRAME; + } + if (host_msg & LINK_GOING_DOWN) { + const char *extra = ""; + /* no downgrade action needed if going down */ + if (host_msg & LINK_WIDTH_DOWNGRADED) { + host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED; + extra = " (ignoring downgrade)"; + } + dd_dev_info(dd, "8051: Link down%s\n", extra); + queue_link_down = 1; + host_msg &= ~(u64)LINK_GOING_DOWN; + } + if (host_msg & LINK_WIDTH_DOWNGRADED) { + queue_work(ppd->hfi1_wq, &ppd->link_downgrade_work); + host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED; + } + if (host_msg) { + /* report remaining messages, but do not do anything */ + dd_dev_info(dd, "8051 info host message: %s\n", + dc8051_info_host_msg_string(buf, sizeof(buf), + host_msg)); + } + + reg &= ~DC_DC8051_ERR_FLG_SET_BY_8051_SMASK; + } + if (reg & DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK) { + /* + * Lost the 8051 heartbeat. If this happens, we + * receive constant interrupts about it. Disable + * the interrupt after the first. + */ + dd_dev_err(dd, "Lost 8051 heartbeat\n"); + write_csr(dd, DC_DC8051_ERR_EN, + read_csr(dd, DC_DC8051_ERR_EN) + & ~DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK); + + reg &= ~DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK; + } + if (reg) { + /* report the error, but do not do anything */ + dd_dev_err(dd, "8051 error: %s\n", + dc8051_err_string(buf, sizeof(buf), reg)); + } + + if (queue_link_down) { + /* if the link is already going down or disabled, do not + * queue another */ + if ((ppd->host_link_state + & (HLS_GOING_OFFLINE|HLS_LINK_COOLDOWN)) + || ppd->link_enabled == 0) { + dd_dev_info(dd, "%s: not queuing link down\n", + __func__); + } else { + queue_work(ppd->hfi1_wq, &ppd->link_down_work); + } + } +} + +static const char * const fm_config_txt[] = { +[0] = + "BadHeadDist: Distance violation between two head flits", +[1] = + "BadTailDist: Distance violation between two tail flits", +[2] = + "BadCtrlDist: Distance violation between two credit control flits", +[3] = + "BadCrdAck: Credits return for unsupported VL", +[4] = + "UnsupportedVLMarker: Received VL Marker", +[5] = + "BadPreempt: Exceeded the preemption nesting level", +[6] = + "BadControlFlit: Received unsupported control flit", +/* no 7 */ +[8] = + "UnsupportedVLMarker: Received VL Marker for unconfigured or disabled VL", +}; + +static const char * const port_rcv_txt[] = { +[1] = + "BadPktLen: Illegal PktLen", +[2] = + "PktLenTooLong: Packet longer than PktLen", +[3] = + "PktLenTooShort: Packet shorter than PktLen", +[4] = + "BadSLID: Illegal SLID (0, using multicast as SLID, does not include security validation of SLID)", +[5] = + "BadDLID: Illegal DLID (0, doesn't match HFI)", +[6] = + "BadL2: Illegal L2 opcode", +[7] = + "BadSC: Unsupported SC", +[9] = + "BadRC: Illegal RC", +[11] = + "PreemptError: Preempting with same VL", +[12] = + "PreemptVL15: Preempting a VL15 packet", +}; + +#define OPA_LDR_FMCONFIG_OFFSET 16 +#define OPA_LDR_PORTRCV_OFFSET 0 +static void handle_dcc_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + u64 info, hdr0, hdr1; + const char *extra; + char buf[96]; + struct hfi1_pportdata *ppd = dd->pport; + u8 lcl_reason = 0; + int do_bounce = 0; + + if (reg & DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK) { + if (!(dd->err_info_uncorrectable & OPA_EI_STATUS_SMASK)) { + info = read_csr(dd, DCC_ERR_INFO_UNCORRECTABLE); + dd->err_info_uncorrectable = info & OPA_EI_CODE_SMASK; + /* set status bit */ + dd->err_info_uncorrectable |= OPA_EI_STATUS_SMASK; + } + reg &= ~DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK; + } + + if (reg & DCC_ERR_FLG_LINK_ERR_SMASK) { + struct hfi1_pportdata *ppd = dd->pport; + /* this counter saturates at (2^32) - 1 */ + if (ppd->link_downed < (u32)UINT_MAX) + ppd->link_downed++; + reg &= ~DCC_ERR_FLG_LINK_ERR_SMASK; + } + + if (reg & DCC_ERR_FLG_FMCONFIG_ERR_SMASK) { + u8 reason_valid = 1; + + info = read_csr(dd, DCC_ERR_INFO_FMCONFIG); + if (!(dd->err_info_fmconfig & OPA_EI_STATUS_SMASK)) { + dd->err_info_fmconfig = info & OPA_EI_CODE_SMASK; + /* set status bit */ + dd->err_info_fmconfig |= OPA_EI_STATUS_SMASK; + } + switch (info) { + case 0: + case 1: + case 2: + case 3: + case 4: + case 5: + case 6: + extra = fm_config_txt[info]; + break; + case 8: + extra = fm_config_txt[info]; + if (ppd->port_error_action & + OPA_PI_MASK_FM_CFG_UNSUPPORTED_VL_MARKER) { + do_bounce = 1; + /* + * lcl_reason cannot be derived from info + * for this error + */ + lcl_reason = + OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER; + } + break; + default: + reason_valid = 0; + snprintf(buf, sizeof(buf), "reserved%lld", info); + extra = buf; + break; + } + + if (reason_valid && !do_bounce) { + do_bounce = ppd->port_error_action & + (1 << (OPA_LDR_FMCONFIG_OFFSET + info)); + lcl_reason = info + OPA_LINKDOWN_REASON_BAD_HEAD_DIST; + } + + /* just report this */ + dd_dev_info(dd, "DCC Error: fmconfig error: %s\n", extra); + reg &= ~DCC_ERR_FLG_FMCONFIG_ERR_SMASK; + } + + if (reg & DCC_ERR_FLG_RCVPORT_ERR_SMASK) { + u8 reason_valid = 1; + + info = read_csr(dd, DCC_ERR_INFO_PORTRCV); + hdr0 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR0); + hdr1 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR1); + if (!(dd->err_info_rcvport.status_and_code & + OPA_EI_STATUS_SMASK)) { + dd->err_info_rcvport.status_and_code = + info & OPA_EI_CODE_SMASK; + /* set status bit */ + dd->err_info_rcvport.status_and_code |= + OPA_EI_STATUS_SMASK; + /* save first 2 flits in the packet that caused + * the error */ + dd->err_info_rcvport.packet_flit1 = hdr0; + dd->err_info_rcvport.packet_flit2 = hdr1; + } + switch (info) { + case 1: + case 2: + case 3: + case 4: + case 5: + case 6: + case 7: + case 9: + case 11: + case 12: + extra = port_rcv_txt[info]; + break; + default: + reason_valid = 0; + snprintf(buf, sizeof(buf), "reserved%lld", info); + extra = buf; + break; + } + + if (reason_valid && !do_bounce) { + do_bounce = ppd->port_error_action & + (1 << (OPA_LDR_PORTRCV_OFFSET + info)); + lcl_reason = info + OPA_LINKDOWN_REASON_RCV_ERROR_0; + } + + /* just report this */ + dd_dev_info(dd, "DCC Error: PortRcv error: %s\n", extra); + dd_dev_info(dd, " hdr0 0x%llx, hdr1 0x%llx\n", + hdr0, hdr1); + + reg &= ~DCC_ERR_FLG_RCVPORT_ERR_SMASK; + } + + if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK) { + /* informative only */ + dd_dev_info(dd, "8051 access to LCB blocked\n"); + reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK; + } + if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK) { + /* informative only */ + dd_dev_info(dd, "host access to LCB blocked\n"); + reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK; + } + + /* report any remaining errors */ + if (reg) + dd_dev_info(dd, "DCC Error: %s\n", + dcc_err_string(buf, sizeof(buf), reg)); + + if (lcl_reason == 0) + lcl_reason = OPA_LINKDOWN_REASON_UNKNOWN; + + if (do_bounce) { + dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__); + set_link_down_reason(ppd, lcl_reason, 0, lcl_reason); + queue_work(ppd->hfi1_wq, &ppd->link_bounce_work); + } +} + +static void handle_lcb_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + + dd_dev_info(dd, "LCB Error: %s\n", + lcb_err_string(buf, sizeof(buf), reg)); +} + +/* + * CCE block DC interrupt. Source is < 8. + */ +static void is_dc_int(struct hfi1_devdata *dd, unsigned int source) +{ + const struct err_reg_info *eri = &dc_errs[source]; + + if (eri->handler) { + interrupt_clear_down(dd, 0, eri); + } else if (source == 3 /* dc_lbm_int */) { + /* + * This indicates that a parity error has occurred on the + * address/control lines presented to the LBM. The error + * is a single pulse, there is no associated error flag, + * and it is non-maskable. This is because if a parity + * error occurs on the request the request is dropped. + * This should never occur, but it is nice to know if it + * ever does. + */ + dd_dev_err(dd, "Parity error in DC LBM block\n"); + } else { + dd_dev_err(dd, "Invalid DC interrupt %u\n", source); + } +} + +/* + * TX block send credit interrupt. Source is < 160. + */ +static void is_send_credit_int(struct hfi1_devdata *dd, unsigned int source) +{ + sc_group_release_update(dd, source); +} + +/* + * TX block SDMA interrupt. Source is < 48. + * + * SDMA interrupts are grouped by type: + * + * 0 - N-1 = SDma + * N - 2N-1 = SDmaProgress + * 2N - 3N-1 = SDmaIdle + */ +static void is_sdma_eng_int(struct hfi1_devdata *dd, unsigned int source) +{ + /* what interrupt */ + unsigned int what = source / TXE_NUM_SDMA_ENGINES; + /* which engine */ + unsigned int which = source % TXE_NUM_SDMA_ENGINES; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", which, + slashstrip(__FILE__), __LINE__, __func__); + sdma_dumpstate(&dd->per_sdma[which]); +#endif + + if (likely(what < 3 && which < dd->num_sdma)) { + sdma_engine_interrupt(&dd->per_sdma[which], 1ull << source); + } else { + /* should not happen */ + dd_dev_err(dd, "Invalid SDMA interrupt 0x%x\n", source); + } +} + +/* + * RX block receive available interrupt. Source is < 160. + */ +static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source) +{ + struct hfi1_ctxtdata *rcd; + char *err_detail; + + if (likely(source < dd->num_rcv_contexts)) { + rcd = dd->rcd[source]; + if (rcd) { + if (source < dd->first_user_ctxt) + rcd->do_interrupt(rcd); + else + handle_user_interrupt(rcd); + return; /* OK */ + } + /* received an interrupt, but no rcd */ + err_detail = "dataless"; + } else { + /* received an interrupt, but are not using that context */ + err_detail = "out of range"; + } + dd_dev_err(dd, "unexpected %s receive available context interrupt %u\n", + err_detail, source); +} + +/* + * RX block receive urgent interrupt. Source is < 160. + */ +static void is_rcv_urgent_int(struct hfi1_devdata *dd, unsigned int source) +{ + struct hfi1_ctxtdata *rcd; + char *err_detail; + + if (likely(source < dd->num_rcv_contexts)) { + rcd = dd->rcd[source]; + if (rcd) { + /* only pay attention to user urgent interrupts */ + if (source >= dd->first_user_ctxt) + handle_user_interrupt(rcd); + return; /* OK */ + } + /* received an interrupt, but no rcd */ + err_detail = "dataless"; + } else { + /* received an interrupt, but are not using that context */ + err_detail = "out of range"; + } + dd_dev_err(dd, "unexpected %s receive urgent context interrupt %u\n", + err_detail, source); +} + +/* + * Reserved range interrupt. Should not be called in normal operation. + */ +static void is_reserved_int(struct hfi1_devdata *dd, unsigned int source) +{ + char name[64]; + + dd_dev_err(dd, "unexpected %s interrupt\n", + is_reserved_name(name, sizeof(name), source)); +} + +static const struct is_table is_table[] = { +/* start end + name func interrupt func */ +{ IS_GENERAL_ERR_START, IS_GENERAL_ERR_END, + is_misc_err_name, is_misc_err_int }, +{ IS_SDMAENG_ERR_START, IS_SDMAENG_ERR_END, + is_sdma_eng_err_name, is_sdma_eng_err_int }, +{ IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, + is_sendctxt_err_name, is_sendctxt_err_int }, +{ IS_SDMA_START, IS_SDMA_END, + is_sdma_eng_name, is_sdma_eng_int }, +{ IS_VARIOUS_START, IS_VARIOUS_END, + is_various_name, is_various_int }, +{ IS_DC_START, IS_DC_END, + is_dc_name, is_dc_int }, +{ IS_RCVAVAIL_START, IS_RCVAVAIL_END, + is_rcv_avail_name, is_rcv_avail_int }, +{ IS_RCVURGENT_START, IS_RCVURGENT_END, + is_rcv_urgent_name, is_rcv_urgent_int }, +{ IS_SENDCREDIT_START, IS_SENDCREDIT_END, + is_send_credit_name, is_send_credit_int}, +{ IS_RESERVED_START, IS_RESERVED_END, + is_reserved_name, is_reserved_int}, +}; + +/* + * Interrupt source interrupt - called when the given source has an interrupt. + * Source is a bit index into an array of 64-bit integers. + */ +static void is_interrupt(struct hfi1_devdata *dd, unsigned int source) +{ + const struct is_table *entry; + + /* avoids a double compare by walking the table in-order */ + for (entry = &is_table[0]; entry->is_name; entry++) { + if (source < entry->end) { + trace_hfi1_interrupt(dd, entry, source); + entry->is_int(dd, source - entry->start); + return; + } + } + /* fell off the end */ + dd_dev_err(dd, "invalid interrupt source %u\n", source); +} + +/* + * General interrupt handler. This is able to correctly handle + * all interrupts in case INTx is used. + */ +static irqreturn_t general_interrupt(int irq, void *data) +{ + struct hfi1_devdata *dd = data; + u64 regs[CCE_NUM_INT_CSRS]; + u32 bit; + int i; + + this_cpu_inc(*dd->int_counter); + + /* phase 1: scan and clear all handled interrupts */ + for (i = 0; i < CCE_NUM_INT_CSRS; i++) { + if (dd->gi_mask[i] == 0) { + regs[i] = 0; /* used later */ + continue; + } + regs[i] = read_csr(dd, CCE_INT_STATUS + (8 * i)) & + dd->gi_mask[i]; + /* only clear if anything is set */ + if (regs[i]) + write_csr(dd, CCE_INT_CLEAR + (8 * i), regs[i]); + } + + /* phase 2: call the appropriate handler */ + for_each_set_bit(bit, (unsigned long *)®s[0], + CCE_NUM_INT_CSRS*64) { + is_interrupt(dd, bit); + } + + return IRQ_HANDLED; +} + +static irqreturn_t sdma_interrupt(int irq, void *data) +{ + struct sdma_engine *sde = data; + struct hfi1_devdata *dd = sde->dd; + u64 status; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx, + slashstrip(__FILE__), __LINE__, __func__); + sdma_dumpstate(sde); +#endif + + this_cpu_inc(*dd->int_counter); + + /* This read_csr is really bad in the hot path */ + status = read_csr(dd, + CCE_INT_STATUS + (8*(IS_SDMA_START/64))) + & sde->imask; + if (likely(status)) { + /* clear the interrupt(s) */ + write_csr(dd, + CCE_INT_CLEAR + (8*(IS_SDMA_START/64)), + status); + + /* handle the interrupt(s) */ + sdma_engine_interrupt(sde, status); + } else + dd_dev_err(dd, "SDMA engine %u interrupt, but no status bits set\n", + sde->this_idx); + + return IRQ_HANDLED; +} + +/* + * NOTE: this routine expects to be on its own MSI-X interrupt. If + * multiple receive contexts share the same MSI-X interrupt, then this + * routine must check for who received it. + */ +static irqreturn_t receive_context_interrupt(int irq, void *data) +{ + struct hfi1_ctxtdata *rcd = data; + struct hfi1_devdata *dd = rcd->dd; + + trace_hfi1_receive_interrupt(dd, rcd->ctxt); + this_cpu_inc(*dd->int_counter); + + /* clear the interrupt */ + write_csr(rcd->dd, CCE_INT_CLEAR + (8*rcd->ireg), rcd->imask); + + /* handle the interrupt */ + rcd->do_interrupt(rcd); + + return IRQ_HANDLED; +} + +/* ========================================================================= */ + +u32 read_physical_state(struct hfi1_devdata *dd) +{ + u64 reg; + + reg = read_csr(dd, DC_DC8051_STS_CUR_STATE); + return (reg >> DC_DC8051_STS_CUR_STATE_PORT_SHIFT) + & DC_DC8051_STS_CUR_STATE_PORT_MASK; +} + +static u32 read_logical_state(struct hfi1_devdata *dd) +{ + u64 reg; + + reg = read_csr(dd, DCC_CFG_PORT_CONFIG); + return (reg >> DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT) + & DCC_CFG_PORT_CONFIG_LINK_STATE_MASK; +} + +static void set_logical_state(struct hfi1_devdata *dd, u32 chip_lstate) +{ + u64 reg; + + reg = read_csr(dd, DCC_CFG_PORT_CONFIG); + /* clear current state, set new state */ + reg &= ~DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK; + reg |= (u64)chip_lstate << DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT; + write_csr(dd, DCC_CFG_PORT_CONFIG, reg); +} + +/* + * Use the 8051 to read a LCB CSR. + */ +static int read_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 *data) +{ + u32 regno; + int ret; + + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { + if (acquire_lcb_access(dd, 0) == 0) { + *data = read_csr(dd, addr); + release_lcb_access(dd, 0); + return 0; + } + return -EBUSY; + } + + /* register is an index of LCB registers: (offset - base) / 8 */ + regno = (addr - DC_LCB_CFG_RUN) >> 3; + ret = do_8051_command(dd, HCMD_READ_LCB_CSR, regno, data); + if (ret != HCMD_SUCCESS) + return -EBUSY; + return 0; +} + +/* + * Read an LCB CSR. Access may not be in host control, so check. + * Return 0 on success, -EBUSY on failure. + */ +int read_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 *data) +{ + struct hfi1_pportdata *ppd = dd->pport; + + /* if up, go through the 8051 for the value */ + if (ppd->host_link_state & HLS_UP) + return read_lcb_via_8051(dd, addr, data); + /* if going up or down, no access */ + if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE)) + return -EBUSY; + /* otherwise, host has access */ + *data = read_csr(dd, addr); + return 0; +} + +/* + * Use the 8051 to write a LCB CSR. + */ +static int write_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 data) +{ + + if (acquire_lcb_access(dd, 0) == 0) { + write_csr(dd, addr, data); + release_lcb_access(dd, 0); + return 0; + } + return -EBUSY; +} + +/* + * Write an LCB CSR. Access may not be in host control, so check. + * Return 0 on success, -EBUSY on failure. + */ +int write_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 data) +{ + struct hfi1_pportdata *ppd = dd->pport; + + /* if up, go through the 8051 for the value */ + if (ppd->host_link_state & HLS_UP) + return write_lcb_via_8051(dd, addr, data); + /* if going up or down, no access */ + if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE)) + return -EBUSY; + /* otherwise, host has access */ + write_csr(dd, addr, data); + return 0; +} + +/* + * Returns: + * < 0 = Linux error, not able to get access + * > 0 = 8051 command RETURN_CODE + */ +static int do_8051_command( + struct hfi1_devdata *dd, + u32 type, + u64 in_data, + u64 *out_data) +{ + u64 reg, completed; + int return_code; + unsigned long flags; + unsigned long timeout; + + hfi1_cdbg(DC8051, "type %d, data 0x%012llx", type, in_data); + + /* + * Alternative to holding the lock for a long time: + * - keep busy wait - have other users bounce off + */ + spin_lock_irqsave(&dd->dc8051_lock, flags); + + /* We can't send any commands to the 8051 if it's in reset */ + if (dd->dc_shutdown) { + return_code = -ENODEV; + goto fail; + } + + /* + * If an 8051 host command timed out previously, then the 8051 is + * stuck. + * + * On first timeout, attempt to reset and restart the entire DC + * block (including 8051). (Is this too big of a hammer?) + * + * If the 8051 times out a second time, the reset did not bring it + * back to healthy life. In that case, fail any subsequent commands. + */ + if (dd->dc8051_timed_out) { + if (dd->dc8051_timed_out > 1) { + dd_dev_err(dd, + "Previous 8051 host command timed out, skipping command %u\n", + type); + return_code = -ENXIO; + goto fail; + } + spin_unlock_irqrestore(&dd->dc8051_lock, flags); + dc_shutdown(dd); + dc_start(dd); + spin_lock_irqsave(&dd->dc8051_lock, flags); + } + + /* + * If there is no timeout, then the 8051 command interface is + * waiting for a command. + */ + + /* + * Do two writes: the first to stabilize the type and req_data, the + * second to activate. + */ + reg = ((u64)type & DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK) + << DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT + | (in_data & DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK) + << DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT; + write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg); + reg |= DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK; + write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg); + + /* wait for completion, alternate: interrupt */ + timeout = jiffies + msecs_to_jiffies(DC8051_COMMAND_TIMEOUT); + while (1) { + reg = read_csr(dd, DC_DC8051_CFG_HOST_CMD_1); + completed = reg & DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK; + if (completed) + break; + if (time_after(jiffies, timeout)) { + dd->dc8051_timed_out++; + dd_dev_err(dd, "8051 host command %u timeout\n", type); + if (out_data) + *out_data = 0; + return_code = -ETIMEDOUT; + goto fail; + } + udelay(2); + } + + if (out_data) { + *out_data = (reg >> DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT) + & DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK; + if (type == HCMD_READ_LCB_CSR) { + /* top 16 bits are in a different register */ + *out_data |= (read_csr(dd, DC_DC8051_CFG_EXT_DEV_1) + & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK) + << (48 + - DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT); + } + } + return_code = (reg >> DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT) + & DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK; + dd->dc8051_timed_out = 0; + /* + * Clear command for next user. + */ + write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, 0); + +fail: + spin_unlock_irqrestore(&dd->dc8051_lock, flags); + + return return_code; +} + +static int set_physical_link_state(struct hfi1_devdata *dd, u64 state) +{ + return do_8051_command(dd, HCMD_CHANGE_PHY_STATE, state, NULL); +} + +static int load_8051_config(struct hfi1_devdata *dd, u8 field_id, + u8 lane_id, u32 config_data) +{ + u64 data; + int ret; + + data = (u64)field_id << LOAD_DATA_FIELD_ID_SHIFT + | (u64)lane_id << LOAD_DATA_LANE_ID_SHIFT + | (u64)config_data << LOAD_DATA_DATA_SHIFT; + ret = do_8051_command(dd, HCMD_LOAD_CONFIG_DATA, data, NULL); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, + "load 8051 config: field id %d, lane %d, err %d\n", + (int)field_id, (int)lane_id, ret); + } + return ret; +} + +/* + * Read the 8051 firmware "registers". Use the RAM directly. Always + * set the result, even on error. + * Return 0 on success, -errno on failure + */ +static int read_8051_config(struct hfi1_devdata *dd, u8 field_id, u8 lane_id, + u32 *result) +{ + u64 big_data; + u32 addr; + int ret; + + /* address start depends on the lane_id */ + if (lane_id < 4) + addr = (4 * NUM_GENERAL_FIELDS) + + (lane_id * 4 * NUM_LANE_FIELDS); + else + addr = 0; + addr += field_id * 4; + + /* read is in 8-byte chunks, hardware will truncate the address down */ + ret = read_8051_data(dd, addr, 8, &big_data); + + if (ret == 0) { + /* extract the 4 bytes we want */ + if (addr & 0x4) + *result = (u32)(big_data >> 32); + else + *result = (u32)big_data; + } else { + *result = 0; + dd_dev_err(dd, "%s: direct read failed, lane %d, field %d!\n", + __func__, lane_id, field_id); + } + + return ret; +} + +static int write_vc_local_phy(struct hfi1_devdata *dd, u8 power_management, + u8 continuous) +{ + u32 frame; + + frame = continuous << CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT + | power_management << POWER_MANAGEMENT_SHIFT; + return load_8051_config(dd, VERIFY_CAP_LOCAL_PHY, + GENERAL_CONFIG, frame); +} + +static int write_vc_local_fabric(struct hfi1_devdata *dd, u8 vau, u8 z, u8 vcu, + u16 vl15buf, u8 crc_sizes) +{ + u32 frame; + + frame = (u32)vau << VAU_SHIFT + | (u32)z << Z_SHIFT + | (u32)vcu << VCU_SHIFT + | (u32)vl15buf << VL15BUF_SHIFT + | (u32)crc_sizes << CRC_SIZES_SHIFT; + return load_8051_config(dd, VERIFY_CAP_LOCAL_FABRIC, + GENERAL_CONFIG, frame); +} + +static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits, + u8 *flag_bits, u16 *link_widths) +{ + u32 frame; + + read_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG, + &frame); + *misc_bits = (frame >> MISC_CONFIG_BITS_SHIFT) & MISC_CONFIG_BITS_MASK; + *flag_bits = (frame >> LOCAL_FLAG_BITS_SHIFT) & LOCAL_FLAG_BITS_MASK; + *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK; +} + +static int write_vc_local_link_width(struct hfi1_devdata *dd, + u8 misc_bits, + u8 flag_bits, + u16 link_widths) +{ + u32 frame; + + frame = (u32)misc_bits << MISC_CONFIG_BITS_SHIFT + | (u32)flag_bits << LOCAL_FLAG_BITS_SHIFT + | (u32)link_widths << LINK_WIDTH_SHIFT; + return load_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG, + frame); +} + +static int write_local_device_id(struct hfi1_devdata *dd, u16 device_id, + u8 device_rev) +{ + u32 frame; + + frame = ((u32)device_id << LOCAL_DEVICE_ID_SHIFT) + | ((u32)device_rev << LOCAL_DEVICE_REV_SHIFT); + return load_8051_config(dd, LOCAL_DEVICE_ID, GENERAL_CONFIG, frame); +} + +static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id, + u8 *device_rev) +{ + u32 frame; + + read_8051_config(dd, REMOTE_DEVICE_ID, GENERAL_CONFIG, &frame); + *device_id = (frame >> REMOTE_DEVICE_ID_SHIFT) & REMOTE_DEVICE_ID_MASK; + *device_rev = (frame >> REMOTE_DEVICE_REV_SHIFT) + & REMOTE_DEVICE_REV_MASK; +} + +void read_misc_status(struct hfi1_devdata *dd, u8 *ver_a, u8 *ver_b) +{ + u32 frame; + + read_8051_config(dd, MISC_STATUS, GENERAL_CONFIG, &frame); + *ver_a = (frame >> STS_FM_VERSION_A_SHIFT) & STS_FM_VERSION_A_MASK; + *ver_b = (frame >> STS_FM_VERSION_B_SHIFT) & STS_FM_VERSION_B_MASK; +} + +static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management, + u8 *continuous) +{ + u32 frame; + + read_8051_config(dd, VERIFY_CAP_REMOTE_PHY, GENERAL_CONFIG, &frame); + *power_management = (frame >> POWER_MANAGEMENT_SHIFT) + & POWER_MANAGEMENT_MASK; + *continuous = (frame >> CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT) + & CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK; +} + +static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z, + u8 *vcu, u16 *vl15buf, u8 *crc_sizes) +{ + u32 frame; + + read_8051_config(dd, VERIFY_CAP_REMOTE_FABRIC, GENERAL_CONFIG, &frame); + *vau = (frame >> VAU_SHIFT) & VAU_MASK; + *z = (frame >> Z_SHIFT) & Z_MASK; + *vcu = (frame >> VCU_SHIFT) & VCU_MASK; + *vl15buf = (frame >> VL15BUF_SHIFT) & VL15BUF_MASK; + *crc_sizes = (frame >> CRC_SIZES_SHIFT) & CRC_SIZES_MASK; +} + +static void read_vc_remote_link_width(struct hfi1_devdata *dd, + u8 *remote_tx_rate, + u16 *link_widths) +{ + u32 frame; + + read_8051_config(dd, VERIFY_CAP_REMOTE_LINK_WIDTH, GENERAL_CONFIG, + &frame); + *remote_tx_rate = (frame >> REMOTE_TX_RATE_SHIFT) + & REMOTE_TX_RATE_MASK; + *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK; +} + +static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx) +{ + u32 frame; + + read_8051_config(dd, LOCAL_LNI_INFO, GENERAL_CONFIG, &frame); + *enable_lane_rx = (frame >> ENABLE_LANE_RX_SHIFT) & ENABLE_LANE_RX_MASK; +} + +static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed) +{ + u32 frame; + + read_8051_config(dd, REMOTE_LNI_INFO, GENERAL_CONFIG, &frame); + *mgmt_allowed = (frame >> MGMT_ALLOWED_SHIFT) & MGMT_ALLOWED_MASK; +} + +static void read_last_local_state(struct hfi1_devdata *dd, u32 *lls) +{ + read_8051_config(dd, LAST_LOCAL_STATE_COMPLETE, GENERAL_CONFIG, lls); +} + +static void read_last_remote_state(struct hfi1_devdata *dd, u32 *lrs) +{ + read_8051_config(dd, LAST_REMOTE_STATE_COMPLETE, GENERAL_CONFIG, lrs); +} + +void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality) +{ + u32 frame; + int ret; + + *link_quality = 0; + if (dd->pport->host_link_state & HLS_UP) { + ret = read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG, + &frame); + if (ret == 0) + *link_quality = (frame >> LINK_QUALITY_SHIFT) + & LINK_QUALITY_MASK; + } +} + +static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc) +{ + u32 frame; + + read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG, &frame); + *pdrrc = (frame >> DOWN_REMOTE_REASON_SHIFT) & DOWN_REMOTE_REASON_MASK; +} + +static int read_tx_settings(struct hfi1_devdata *dd, + u8 *enable_lane_tx, + u8 *tx_polarity_inversion, + u8 *rx_polarity_inversion, + u8 *max_rate) +{ + u32 frame; + int ret; + + ret = read_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, &frame); + *enable_lane_tx = (frame >> ENABLE_LANE_TX_SHIFT) + & ENABLE_LANE_TX_MASK; + *tx_polarity_inversion = (frame >> TX_POLARITY_INVERSION_SHIFT) + & TX_POLARITY_INVERSION_MASK; + *rx_polarity_inversion = (frame >> RX_POLARITY_INVERSION_SHIFT) + & RX_POLARITY_INVERSION_MASK; + *max_rate = (frame >> MAX_RATE_SHIFT) & MAX_RATE_MASK; + return ret; +} + +static int write_tx_settings(struct hfi1_devdata *dd, + u8 enable_lane_tx, + u8 tx_polarity_inversion, + u8 rx_polarity_inversion, + u8 max_rate) +{ + u32 frame; + + /* no need to mask, all variable sizes match field widths */ + frame = enable_lane_tx << ENABLE_LANE_TX_SHIFT + | tx_polarity_inversion << TX_POLARITY_INVERSION_SHIFT + | rx_polarity_inversion << RX_POLARITY_INVERSION_SHIFT + | max_rate << MAX_RATE_SHIFT; + return load_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, frame); +} + +static void check_fabric_firmware_versions(struct hfi1_devdata *dd) +{ + u32 frame, version, prod_id; + int ret, lane; + + /* 4 lanes */ + for (lane = 0; lane < 4; lane++) { + ret = read_8051_config(dd, SPICO_FW_VERSION, lane, &frame); + if (ret) { + dd_dev_err( + dd, + "Unable to read lane %d firmware details\n", + lane); + continue; + } + version = (frame >> SPICO_ROM_VERSION_SHIFT) + & SPICO_ROM_VERSION_MASK; + prod_id = (frame >> SPICO_ROM_PROD_ID_SHIFT) + & SPICO_ROM_PROD_ID_MASK; + dd_dev_info(dd, + "Lane %d firmware: version 0x%04x, prod_id 0x%04x\n", + lane, version, prod_id); + } +} + +/* + * Read an idle LCB message. + * + * Returns 0 on success, -EINVAL on error + */ +static int read_idle_message(struct hfi1_devdata *dd, u64 type, u64 *data_out) +{ + int ret; + + ret = do_8051_command(dd, HCMD_READ_LCB_IDLE_MSG, + type, data_out); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, "read idle message: type %d, err %d\n", + (u32)type, ret); + return -EINVAL; + } + dd_dev_info(dd, "%s: read idle message 0x%llx\n", __func__, *data_out); + /* return only the payload as we already know the type */ + *data_out >>= IDLE_PAYLOAD_SHIFT; + return 0; +} + +/* + * Read an idle SMA message. To be done in response to a notification from + * the 8051. + * + * Returns 0 on success, -EINVAL on error + */ +static int read_idle_sma(struct hfi1_devdata *dd, u64 *data) +{ + return read_idle_message(dd, + (u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT, data); +} + +/* + * Send an idle LCB message. + * + * Returns 0 on success, -EINVAL on error + */ +static int send_idle_message(struct hfi1_devdata *dd, u64 data) +{ + int ret; + + dd_dev_info(dd, "%s: sending idle message 0x%llx\n", __func__, data); + ret = do_8051_command(dd, HCMD_SEND_LCB_IDLE_MSG, data, NULL); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, "send idle message: data 0x%llx, err %d\n", + data, ret); + return -EINVAL; + } + return 0; +} + +/* + * Send an idle SMA message. + * + * Returns 0 on success, -EINVAL on error + */ +int send_idle_sma(struct hfi1_devdata *dd, u64 message) +{ + u64 data; + + data = ((message & IDLE_PAYLOAD_MASK) << IDLE_PAYLOAD_SHIFT) + | ((u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT); + return send_idle_message(dd, data); +} + +/* + * Initialize the LCB then do a quick link up. This may or may not be + * in loopback. + * + * return 0 on success, -errno on error + */ +static int do_quick_linkup(struct hfi1_devdata *dd) +{ + u64 reg; + unsigned long timeout; + int ret; + + lcb_shutdown(dd, 0); + + if (loopback) { + /* LCB_CFG_LOOPBACK.VAL = 2 */ + /* LCB_CFG_LANE_WIDTH.VAL = 0 */ + write_csr(dd, DC_LCB_CFG_LOOPBACK, + IB_PACKET_TYPE << DC_LCB_CFG_LOOPBACK_VAL_SHIFT); + write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0); + } + + /* start the LCBs */ + /* LCB_CFG_TX_FIFOS_RESET.VAL = 0 */ + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0); + + /* simulator only loopback steps */ + if (loopback && dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { + /* LCB_CFG_RUN.EN = 1 */ + write_csr(dd, DC_LCB_CFG_RUN, + 1ull << DC_LCB_CFG_RUN_EN_SHIFT); + + /* watch LCB_STS_LINK_TRANSFER_ACTIVE */ + timeout = jiffies + msecs_to_jiffies(10); + while (1) { + reg = read_csr(dd, + DC_LCB_STS_LINK_TRANSFER_ACTIVE); + if (reg) + break; + if (time_after(jiffies, timeout)) { + dd_dev_err(dd, + "timeout waiting for LINK_TRANSFER_ACTIVE\n"); + return -ETIMEDOUT; + } + udelay(2); + } + + write_csr(dd, DC_LCB_CFG_ALLOW_LINK_UP, + 1ull << DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT); + } + + if (!loopback) { + /* + * When doing quick linkup and not in loopback, both + * sides must be done with LCB set-up before either + * starts the quick linkup. Put a delay here so that + * both sides can be started and have a chance to be + * done with LCB set up before resuming. + */ + dd_dev_err(dd, + "Pausing for peer to be finished with LCB set up\n"); + msleep(5000); + dd_dev_err(dd, + "Continuing with quick linkup\n"); + } + + write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */ + set_8051_lcb_access(dd); + + /* + * State "quick" LinkUp request sets the physical link state to + * LinkUp without a verify capability sequence. + * This state is in simulator v37 and later. + */ + ret = set_physical_link_state(dd, PLS_QUICK_LINKUP); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, + "%s: set physical link state to quick LinkUp failed with return %d\n", + __func__, ret); + + set_host_lcb_access(dd); + write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */ + + if (ret >= 0) + ret = -EINVAL; + return ret; + } + + return 0; /* success */ +} + +/* + * Set the SerDes to internal loopback mode. + * Returns 0 on success, -errno on error. + */ +static int set_serdes_loopback_mode(struct hfi1_devdata *dd) +{ + int ret; + + ret = set_physical_link_state(dd, PLS_INTERNAL_SERDES_LOOPBACK); + if (ret == HCMD_SUCCESS) + return 0; + dd_dev_err(dd, + "Set physical link state to SerDes Loopback failed with return %d\n", + ret); + if (ret >= 0) + ret = -EINVAL; + return ret; +} + +/* + * Do all special steps to set up loopback. + */ +static int init_loopback(struct hfi1_devdata *dd) +{ + dd_dev_info(dd, "Entering loopback mode\n"); + + /* all loopbacks should disable self GUID check */ + write_csr(dd, DC_DC8051_CFG_MODE, + (read_csr(dd, DC_DC8051_CFG_MODE) | DISABLE_SELF_GUID_CHECK)); + + /* + * The simulator has only one loopback option - LCB. Switch + * to that option, which includes quick link up. + * + * Accept all valid loopback values. + */ + if ((dd->icode == ICODE_FUNCTIONAL_SIMULATOR) + && (loopback == LOOPBACK_SERDES + || loopback == LOOPBACK_LCB + || loopback == LOOPBACK_CABLE)) { + loopback = LOOPBACK_LCB; + quick_linkup = 1; + return 0; + } + + /* handle serdes loopback */ + if (loopback == LOOPBACK_SERDES) { + /* internal serdes loopack needs quick linkup on RTL */ + if (dd->icode == ICODE_RTL_SILICON) + quick_linkup = 1; + return set_serdes_loopback_mode(dd); + } + + /* LCB loopback - handled at poll time */ + if (loopback == LOOPBACK_LCB) { + quick_linkup = 1; /* LCB is always quick linkup */ + + /* not supported in emulation due to emulation RTL changes */ + if (dd->icode == ICODE_FPGA_EMULATION) { + dd_dev_err(dd, + "LCB loopback not supported in emulation\n"); + return -EINVAL; + } + return 0; + } + + /* external cable loopback requires no extra steps */ + if (loopback == LOOPBACK_CABLE) + return 0; + + dd_dev_err(dd, "Invalid loopback mode %d\n", loopback); + return -EINVAL; +} + +/* + * Translate from the OPA_LINK_WIDTH handed to us by the FM to bits + * used in the Verify Capability link width attribute. + */ +static u16 opa_to_vc_link_widths(u16 opa_widths) +{ + int i; + u16 result = 0; + + static const struct link_bits { + u16 from; + u16 to; + } opa_link_xlate[] = { + { OPA_LINK_WIDTH_1X, 1 << (1-1) }, + { OPA_LINK_WIDTH_2X, 1 << (2-1) }, + { OPA_LINK_WIDTH_3X, 1 << (3-1) }, + { OPA_LINK_WIDTH_4X, 1 << (4-1) }, + }; + + for (i = 0; i < ARRAY_SIZE(opa_link_xlate); i++) { + if (opa_widths & opa_link_xlate[i].from) + result |= opa_link_xlate[i].to; + } + return result; +} + +/* + * Set link attributes before moving to polling. + */ +static int set_local_link_attributes(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u8 enable_lane_tx; + u8 tx_polarity_inversion; + u8 rx_polarity_inversion; + int ret; + + /* reset our fabric serdes to clear any lingering problems */ + fabric_serdes_reset(dd); + + /* set the local tx rate - need to read-modify-write */ + ret = read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion, + &rx_polarity_inversion, &ppd->local_tx_rate); + if (ret) + goto set_local_link_attributes_fail; + + if (dd->dc8051_ver < dc8051_ver(0, 20)) { + /* set the tx rate to the fastest enabled */ + if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G) + ppd->local_tx_rate = 1; + else + ppd->local_tx_rate = 0; + } else { + /* set the tx rate to all enabled */ + ppd->local_tx_rate = 0; + if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G) + ppd->local_tx_rate |= 2; + if (ppd->link_speed_enabled & OPA_LINK_SPEED_12_5G) + ppd->local_tx_rate |= 1; + } + ret = write_tx_settings(dd, enable_lane_tx, tx_polarity_inversion, + rx_polarity_inversion, ppd->local_tx_rate); + if (ret != HCMD_SUCCESS) + goto set_local_link_attributes_fail; + + /* + * DC supports continuous updates. + */ + ret = write_vc_local_phy(dd, 0 /* no power management */, + 1 /* continuous updates */); + if (ret != HCMD_SUCCESS) + goto set_local_link_attributes_fail; + + /* z=1 in the next call: AU of 0 is not supported by the hardware */ + ret = write_vc_local_fabric(dd, dd->vau, 1, dd->vcu, dd->vl15_init, + ppd->port_crc_mode_enabled); + if (ret != HCMD_SUCCESS) + goto set_local_link_attributes_fail; + + ret = write_vc_local_link_width(dd, 0, 0, + opa_to_vc_link_widths(ppd->link_width_enabled)); + if (ret != HCMD_SUCCESS) + goto set_local_link_attributes_fail; + + /* let peer know who we are */ + ret = write_local_device_id(dd, dd->pcidev->device, dd->minrev); + if (ret == HCMD_SUCCESS) + return 0; + +set_local_link_attributes_fail: + dd_dev_err(dd, + "Failed to set local link attributes, return 0x%x\n", + ret); + return ret; +} + +/* + * Call this to start the link. Schedule a retry if the cable is not + * present or if unable to start polling. Do not do anything if the + * link is disabled. Returns 0 if link is disabled or moved to polling + */ +int start_link(struct hfi1_pportdata *ppd) +{ + if (!ppd->link_enabled) { + dd_dev_info(ppd->dd, + "%s: stopping link start because link is disabled\n", + __func__); + return 0; + } + if (!ppd->driver_link_ready) { + dd_dev_info(ppd->dd, + "%s: stopping link start because driver is not ready\n", + __func__); + return 0; + } + + if (qsfp_mod_present(ppd) || loopback == LOOPBACK_SERDES || + loopback == LOOPBACK_LCB || + ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR) + return set_link_state(ppd, HLS_DN_POLL); + + dd_dev_info(ppd->dd, + "%s: stopping link start because no cable is present\n", + __func__); + return -EAGAIN; +} + +static void reset_qsfp(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 mask, qsfp_mask; + + mask = (u64)QSFP_HFI0_RESET_N; + qsfp_mask = read_csr(dd, + dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE); + qsfp_mask |= mask; + write_csr(dd, + dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE, + qsfp_mask); + + qsfp_mask = read_csr(dd, + dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT); + qsfp_mask &= ~mask; + write_csr(dd, + dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, + qsfp_mask); + + udelay(10); + + qsfp_mask |= mask; + write_csr(dd, + dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, + qsfp_mask); +} + +static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd, + u8 *qsfp_interrupt_status) +{ + struct hfi1_devdata *dd = ppd->dd; + + if ((qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_ALARM) || + (qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_WARNING)) + dd_dev_info(dd, + "%s: QSFP cable on fire\n", + __func__); + + if ((qsfp_interrupt_status[0] & QSFP_LOW_TEMP_ALARM) || + (qsfp_interrupt_status[0] & QSFP_LOW_TEMP_WARNING)) + dd_dev_info(dd, + "%s: QSFP cable temperature too low\n", + __func__); + + if ((qsfp_interrupt_status[1] & QSFP_HIGH_VCC_ALARM) || + (qsfp_interrupt_status[1] & QSFP_HIGH_VCC_WARNING)) + dd_dev_info(dd, + "%s: QSFP supply voltage too high\n", + __func__); + + if ((qsfp_interrupt_status[1] & QSFP_LOW_VCC_ALARM) || + (qsfp_interrupt_status[1] & QSFP_LOW_VCC_WARNING)) + dd_dev_info(dd, + "%s: QSFP supply voltage too low\n", + __func__); + + /* Byte 2 is vendor specific */ + + if ((qsfp_interrupt_status[3] & QSFP_HIGH_POWER_ALARM) || + (qsfp_interrupt_status[3] & QSFP_HIGH_POWER_WARNING)) + dd_dev_info(dd, + "%s: Cable RX channel 1/2 power too high\n", + __func__); + + if ((qsfp_interrupt_status[3] & QSFP_LOW_POWER_ALARM) || + (qsfp_interrupt_status[3] & QSFP_LOW_POWER_WARNING)) + dd_dev_info(dd, + "%s: Cable RX channel 1/2 power too low\n", + __func__); + + if ((qsfp_interrupt_status[4] & QSFP_HIGH_POWER_ALARM) || + (qsfp_interrupt_status[4] & QSFP_HIGH_POWER_WARNING)) + dd_dev_info(dd, + "%s: Cable RX channel 3/4 power too high\n", + __func__); + + if ((qsfp_interrupt_status[4] & QSFP_LOW_POWER_ALARM) || + (qsfp_interrupt_status[4] & QSFP_LOW_POWER_WARNING)) + dd_dev_info(dd, + "%s: Cable RX channel 3/4 power too low\n", + __func__); + + if ((qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_ALARM) || + (qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_WARNING)) + dd_dev_info(dd, + "%s: Cable TX channel 1/2 bias too high\n", + __func__); + + if ((qsfp_interrupt_status[5] & QSFP_LOW_BIAS_ALARM) || + (qsfp_interrupt_status[5] & QSFP_LOW_BIAS_WARNING)) + dd_dev_info(dd, + "%s: Cable TX channel 1/2 bias too low\n", + __func__); + + if ((qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_ALARM) || + (qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_WARNING)) + dd_dev_info(dd, + "%s: Cable TX channel 3/4 bias too high\n", + __func__); + + if ((qsfp_interrupt_status[6] & QSFP_LOW_BIAS_ALARM) || + (qsfp_interrupt_status[6] & QSFP_LOW_BIAS_WARNING)) + dd_dev_info(dd, + "%s: Cable TX channel 3/4 bias too low\n", + __func__); + + if ((qsfp_interrupt_status[7] & QSFP_HIGH_POWER_ALARM) || + (qsfp_interrupt_status[7] & QSFP_HIGH_POWER_WARNING)) + dd_dev_info(dd, + "%s: Cable TX channel 1/2 power too high\n", + __func__); + + if ((qsfp_interrupt_status[7] & QSFP_LOW_POWER_ALARM) || + (qsfp_interrupt_status[7] & QSFP_LOW_POWER_WARNING)) + dd_dev_info(dd, + "%s: Cable TX channel 1/2 power too low\n", + __func__); + + if ((qsfp_interrupt_status[8] & QSFP_HIGH_POWER_ALARM) || + (qsfp_interrupt_status[8] & QSFP_HIGH_POWER_WARNING)) + dd_dev_info(dd, + "%s: Cable TX channel 3/4 power too high\n", + __func__); + + if ((qsfp_interrupt_status[8] & QSFP_LOW_POWER_ALARM) || + (qsfp_interrupt_status[8] & QSFP_LOW_POWER_WARNING)) + dd_dev_info(dd, + "%s: Cable TX channel 3/4 power too low\n", + __func__); + + /* Bytes 9-10 and 11-12 are reserved */ + /* Bytes 13-15 are vendor specific */ + + return 0; +} + +static int do_pre_lni_host_behaviors(struct hfi1_pportdata *ppd) +{ + refresh_qsfp_cache(ppd, &ppd->qsfp_info); + + return 0; +} + +static int do_qsfp_intr_fallback(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u8 qsfp_interrupt_status = 0; + + if (qsfp_read(ppd, dd->hfi1_id, 2, &qsfp_interrupt_status, 1) + != 1) { + dd_dev_info(dd, + "%s: Failed to read status of QSFP module\n", + __func__); + return -EIO; + } + + /* We don't care about alarms & warnings with a non-functional INT_N */ + if (!(qsfp_interrupt_status & QSFP_DATA_NOT_READY)) + do_pre_lni_host_behaviors(ppd); + + return 0; +} + +/* This routine will only be scheduled if the QSFP module is present */ +static void qsfp_event(struct work_struct *work) +{ + struct qsfp_data *qd; + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + + qd = container_of(work, struct qsfp_data, qsfp_work); + ppd = qd->ppd; + dd = ppd->dd; + + /* Sanity check */ + if (!qsfp_mod_present(ppd)) + return; + + /* + * Turn DC back on after cables has been + * re-inserted. Up until now, the DC has been in + * reset to save power. + */ + dc_start(dd); + + if (qd->cache_refresh_required) { + msleep(3000); + reset_qsfp(ppd); + + /* Check for QSFP interrupt after t_init (SFF 8679) + * + extra + */ + msleep(3000); + if (!qd->qsfp_interrupt_functional) { + if (do_qsfp_intr_fallback(ppd) < 0) + dd_dev_info(dd, "%s: QSFP fallback failed\n", + __func__); + ppd->driver_link_ready = 1; + start_link(ppd); + } + } + + if (qd->check_interrupt_flags) { + u8 qsfp_interrupt_status[16] = {0,}; + + if (qsfp_read(ppd, dd->hfi1_id, 6, + &qsfp_interrupt_status[0], 16) != 16) { + dd_dev_info(dd, + "%s: Failed to read status of QSFP module\n", + __func__); + } else { + unsigned long flags; + u8 data_status; + + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + ppd->qsfp_info.check_interrupt_flags = 0; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, + flags); + + if (qsfp_read(ppd, dd->hfi1_id, 2, &data_status, 1) + != 1) { + dd_dev_info(dd, + "%s: Failed to read status of QSFP module\n", + __func__); + } + if (!(data_status & QSFP_DATA_NOT_READY)) { + do_pre_lni_host_behaviors(ppd); + start_link(ppd); + } else + handle_qsfp_error_conditions(ppd, + qsfp_interrupt_status); + } + } +} + +void init_qsfp(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 qsfp_mask; + + if (loopback == LOOPBACK_SERDES || loopback == LOOPBACK_LCB || + ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR || + !HFI1_CAP_IS_KSET(QSFP_ENABLED)) { + ppd->driver_link_ready = 1; + return; + } + + ppd->qsfp_info.ppd = ppd; + INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event); + + qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N); + /* Clear current status to avoid spurious interrupts */ + write_csr(dd, + dd->hfi1_id ? + ASIC_QSFP2_CLEAR : + ASIC_QSFP1_CLEAR, + qsfp_mask); + + /* Handle active low nature of INT_N and MODPRST_N pins */ + if (qsfp_mod_present(ppd)) + qsfp_mask &= ~(u64)QSFP_HFI0_MODPRST_N; + write_csr(dd, + dd->hfi1_id ? ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT, + qsfp_mask); + + /* Allow only INT_N and MODPRST_N to trigger QSFP interrupts */ + qsfp_mask |= (u64)QSFP_HFI0_MODPRST_N; + write_csr(dd, + dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK, + qsfp_mask); + + if (qsfp_mod_present(ppd)) { + msleep(3000); + reset_qsfp(ppd); + + /* Check for QSFP interrupt after t_init (SFF 8679) + * + extra + */ + msleep(3000); + if (!ppd->qsfp_info.qsfp_interrupt_functional) { + if (do_qsfp_intr_fallback(ppd) < 0) + dd_dev_info(dd, + "%s: QSFP fallback failed\n", + __func__); + ppd->driver_link_ready = 1; + } + } +} + +int bringup_serdes(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 guid; + int ret; + + if (HFI1_CAP_IS_KSET(EXTENDED_PSN)) + add_rcvctrl(dd, RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK); + + guid = ppd->guid; + if (!guid) { + if (dd->base_guid) + guid = dd->base_guid + ppd->port - 1; + ppd->guid = guid; + } + + /* the link defaults to enabled */ + ppd->link_enabled = 1; + /* Set linkinit_reason on power up per OPA spec */ + ppd->linkinit_reason = OPA_LINKINIT_REASON_LINKUP; + + if (loopback) { + ret = init_loopback(dd); + if (ret < 0) + return ret; + } + + return start_link(ppd); +} + +void hfi1_quiet_serdes(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + + /* + * Shut down the link and keep it down. First turn off that the + * driver wants to allow the link to be up (driver_link_ready). + * Then make sure the link is not automatically restarted + * (link_enabled). Cancel any pending restart. And finally + * go offline. + */ + ppd->driver_link_ready = 0; + ppd->link_enabled = 0; + + set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SMA_DISABLED, 0, + OPA_LINKDOWN_REASON_SMA_DISABLED); + set_link_state(ppd, HLS_DN_OFFLINE); + + /* disable the port */ + clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); +} + +static inline int init_cpu_counters(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + int i; + + ppd = (struct hfi1_pportdata *)(dd + 1); + for (i = 0; i < dd->num_pports; i++, ppd++) { + ppd->ibport_data.rc_acks = NULL; + ppd->ibport_data.rc_qacks = NULL; + ppd->ibport_data.rc_acks = alloc_percpu(u64); + ppd->ibport_data.rc_qacks = alloc_percpu(u64); + ppd->ibport_data.rc_delayed_comp = alloc_percpu(u64); + if ((ppd->ibport_data.rc_acks == NULL) || + (ppd->ibport_data.rc_delayed_comp == NULL) || + (ppd->ibport_data.rc_qacks == NULL)) + return -ENOMEM; + } + + return 0; +} + +static const char * const pt_names[] = { + "expected", + "eager", + "invalid" +}; + +static const char *pt_name(u32 type) +{ + return type >= ARRAY_SIZE(pt_names) ? "unknown" : pt_names[type]; +} + +/* + * index is the index into the receive array + */ +void hfi1_put_tid(struct hfi1_devdata *dd, u32 index, + u32 type, unsigned long pa, u16 order) +{ + u64 reg; + void __iomem *base = (dd->rcvarray_wc ? dd->rcvarray_wc : + (dd->kregbase + RCV_ARRAY)); + + if (!(dd->flags & HFI1_PRESENT)) + goto done; + + if (type == PT_INVALID) { + pa = 0; + } else if (type > PT_INVALID) { + dd_dev_err(dd, + "unexpected receive array type %u for index %u, not handled\n", + type, index); + goto done; + } + + hfi1_cdbg(TID, "type %s, index 0x%x, pa 0x%lx, bsize 0x%lx", + pt_name(type), index, pa, (unsigned long)order); + +#define RT_ADDR_SHIFT 12 /* 4KB kernel address boundary */ + reg = RCV_ARRAY_RT_WRITE_ENABLE_SMASK + | (u64)order << RCV_ARRAY_RT_BUF_SIZE_SHIFT + | ((pa >> RT_ADDR_SHIFT) & RCV_ARRAY_RT_ADDR_MASK) + << RCV_ARRAY_RT_ADDR_SHIFT; + writeq(reg, base + (index * 8)); + + if (type == PT_EAGER) + /* + * Eager entries are written one-by-one so we have to push them + * after we write the entry. + */ + flush_wc(); +done: + return; +} + +void hfi1_clear_tids(struct hfi1_ctxtdata *rcd) +{ + struct hfi1_devdata *dd = rcd->dd; + u32 i; + + /* this could be optimized */ + for (i = rcd->eager_base; i < rcd->eager_base + + rcd->egrbufs.alloced; i++) + hfi1_put_tid(dd, i, PT_INVALID, 0, 0); + + for (i = rcd->expected_base; + i < rcd->expected_base + rcd->expected_count; i++) + hfi1_put_tid(dd, i, PT_INVALID, 0, 0); +} + +int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd, + struct hfi1_ctxt_info *kinfo) +{ + kinfo->runtime_flags = (HFI1_MISC_GET() << HFI1_CAP_USER_SHIFT) | + HFI1_CAP_UGET(MASK) | HFI1_CAP_KGET(K2U); + return 0; +} + +struct hfi1_message_header *hfi1_get_msgheader( + struct hfi1_devdata *dd, __le32 *rhf_addr) +{ + u32 offset = rhf_hdrq_offset(rhf_to_cpu(rhf_addr)); + + return (struct hfi1_message_header *) + (rhf_addr - dd->rhf_offset + offset); +} + +static const char * const ib_cfg_name_strings[] = { + "HFI1_IB_CFG_LIDLMC", + "HFI1_IB_CFG_LWID_DG_ENB", + "HFI1_IB_CFG_LWID_ENB", + "HFI1_IB_CFG_LWID", + "HFI1_IB_CFG_SPD_ENB", + "HFI1_IB_CFG_SPD", + "HFI1_IB_CFG_RXPOL_ENB", + "HFI1_IB_CFG_LREV_ENB", + "HFI1_IB_CFG_LINKLATENCY", + "HFI1_IB_CFG_HRTBT", + "HFI1_IB_CFG_OP_VLS", + "HFI1_IB_CFG_VL_HIGH_CAP", + "HFI1_IB_CFG_VL_LOW_CAP", + "HFI1_IB_CFG_OVERRUN_THRESH", + "HFI1_IB_CFG_PHYERR_THRESH", + "HFI1_IB_CFG_LINKDEFAULT", + "HFI1_IB_CFG_PKEYS", + "HFI1_IB_CFG_MTU", + "HFI1_IB_CFG_LSTATE", + "HFI1_IB_CFG_VL_HIGH_LIMIT", + "HFI1_IB_CFG_PMA_TICKS", + "HFI1_IB_CFG_PORT" +}; + +static const char *ib_cfg_name(int which) +{ + if (which < 0 || which >= ARRAY_SIZE(ib_cfg_name_strings)) + return "invalid"; + return ib_cfg_name_strings[which]; +} + +int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which) +{ + struct hfi1_devdata *dd = ppd->dd; + int val = 0; + + switch (which) { + case HFI1_IB_CFG_LWID_ENB: /* allowed Link-width */ + val = ppd->link_width_enabled; + break; + case HFI1_IB_CFG_LWID: /* currently active Link-width */ + val = ppd->link_width_active; + break; + case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */ + val = ppd->link_speed_enabled; + break; + case HFI1_IB_CFG_SPD: /* current Link speed */ + val = ppd->link_speed_active; + break; + + case HFI1_IB_CFG_RXPOL_ENB: /* Auto-RX-polarity enable */ + case HFI1_IB_CFG_LREV_ENB: /* Auto-Lane-reversal enable */ + case HFI1_IB_CFG_LINKLATENCY: + goto unimplemented; + + case HFI1_IB_CFG_OP_VLS: + val = ppd->vls_operational; + break; + case HFI1_IB_CFG_VL_HIGH_CAP: /* VL arb high priority table size */ + val = VL_ARB_HIGH_PRIO_TABLE_SIZE; + break; + case HFI1_IB_CFG_VL_LOW_CAP: /* VL arb low priority table size */ + val = VL_ARB_LOW_PRIO_TABLE_SIZE; + break; + case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + val = ppd->overrun_threshold; + break; + case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + val = ppd->phy_error_threshold; + break; + case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + val = dd->link_default; + break; + + case HFI1_IB_CFG_HRTBT: /* Heartbeat off/enable/auto */ + case HFI1_IB_CFG_PMA_TICKS: + default: +unimplemented: + if (HFI1_CAP_IS_KSET(PRINT_UNIMPL)) + dd_dev_info( + dd, + "%s: which %s: not implemented\n", + __func__, + ib_cfg_name(which)); + break; + } + + return val; +} + +/* + * The largest MAD packet size. + */ +#define MAX_MAD_PACKET 2048 + +/* + * Return the maximum header bytes that can go on the _wire_ + * for this device. This count includes the ICRC which is + * not part of the packet held in memory but it is appended + * by the HW. + * This is dependent on the device's receive header entry size. + * HFI allows this to be set per-receive context, but the + * driver presently enforces a global value. + */ +u32 lrh_max_header_bytes(struct hfi1_devdata *dd) +{ + /* + * The maximum non-payload (MTU) bytes in LRH.PktLen are + * the Receive Header Entry Size minus the PBC (or RHF) size + * plus one DW for the ICRC appended by HW. + * + * dd->rcd[0].rcvhdrqentsize is in DW. + * We use rcd[0] as all context will have the same value. Also, + * the first kernel context would have been allocated by now so + * we are guaranteed a valid value. + */ + return (dd->rcd[0]->rcvhdrqentsize - 2/*PBC/RHF*/ + 1/*ICRC*/) << 2; +} + +/* + * Set Send Length + * @ppd - per port data + * + * Set the MTU by limiting how many DWs may be sent. The SendLenCheck* + * registers compare against LRH.PktLen, so use the max bytes included + * in the LRH. + * + * This routine changes all VL values except VL15, which it maintains at + * the same value. + */ +static void set_send_length(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 max_hb = lrh_max_header_bytes(dd), maxvlmtu = 0, dcmtu; + u64 len1 = 0, len2 = (((dd->vld[15].mtu + max_hb) >> 2) + & SEND_LEN_CHECK1_LEN_VL15_MASK) << + SEND_LEN_CHECK1_LEN_VL15_SHIFT; + int i; + + for (i = 0; i < ppd->vls_supported; i++) { + if (dd->vld[i].mtu > maxvlmtu) + maxvlmtu = dd->vld[i].mtu; + if (i <= 3) + len1 |= (((dd->vld[i].mtu + max_hb) >> 2) + & SEND_LEN_CHECK0_LEN_VL0_MASK) << + ((i % 4) * SEND_LEN_CHECK0_LEN_VL1_SHIFT); + else + len2 |= (((dd->vld[i].mtu + max_hb) >> 2) + & SEND_LEN_CHECK1_LEN_VL4_MASK) << + ((i % 4) * SEND_LEN_CHECK1_LEN_VL5_SHIFT); + } + write_csr(dd, SEND_LEN_CHECK0, len1); + write_csr(dd, SEND_LEN_CHECK1, len2); + /* adjust kernel credit return thresholds based on new MTUs */ + /* all kernel receive contexts have the same hdrqentsize */ + for (i = 0; i < ppd->vls_supported; i++) { + sc_set_cr_threshold(dd->vld[i].sc, + sc_mtu_to_threshold(dd->vld[i].sc, dd->vld[i].mtu, + dd->rcd[0]->rcvhdrqentsize)); + } + sc_set_cr_threshold(dd->vld[15].sc, + sc_mtu_to_threshold(dd->vld[15].sc, dd->vld[15].mtu, + dd->rcd[0]->rcvhdrqentsize)); + + /* Adjust maximum MTU for the port in DC */ + dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 : + (ilog2(maxvlmtu >> 8) + 1); + len1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG); + len1 &= ~DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK; + len1 |= ((u64)dcmtu & DCC_CFG_PORT_CONFIG_MTU_CAP_MASK) << + DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT; + write_csr(ppd->dd, DCC_CFG_PORT_CONFIG, len1); +} + +static void set_lidlmc(struct hfi1_pportdata *ppd) +{ + int i; + u64 sreg = 0; + struct hfi1_devdata *dd = ppd->dd; + u32 mask = ~((1U << ppd->lmc) - 1); + u64 c1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG1); + + if (dd->hfi1_snoop.mode_flag) + dd_dev_info(dd, "Set lid/lmc while snooping"); + + c1 &= ~(DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK + | DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK); + c1 |= ((ppd->lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK) + << DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT)| + ((mask & DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK) + << DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT); + write_csr(ppd->dd, DCC_CFG_PORT_CONFIG1, c1); + + /* + * Iterate over all the send contexts and set their SLID check + */ + sreg = ((mask & SEND_CTXT_CHECK_SLID_MASK_MASK) << + SEND_CTXT_CHECK_SLID_MASK_SHIFT) | + (((ppd->lid & mask) & SEND_CTXT_CHECK_SLID_VALUE_MASK) << + SEND_CTXT_CHECK_SLID_VALUE_SHIFT); + + for (i = 0; i < dd->chip_send_contexts; i++) { + hfi1_cdbg(LINKVERB, "SendContext[%d].SLID_CHECK = 0x%x", + i, (u32)sreg); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, sreg); + } + + /* Now we have to do the same thing for the sdma engines */ + sdma_update_lmc(dd, mask, ppd->lid); +} + +static int wait_phy_linkstate(struct hfi1_devdata *dd, u32 state, u32 msecs) +{ + unsigned long timeout; + u32 curr_state; + + timeout = jiffies + msecs_to_jiffies(msecs); + while (1) { + curr_state = read_physical_state(dd); + if (curr_state == state) + break; + if (time_after(jiffies, timeout)) { + dd_dev_err(dd, + "timeout waiting for phy link state 0x%x, current state is 0x%x\n", + state, curr_state); + return -ETIMEDOUT; + } + usleep_range(1950, 2050); /* sleep 2ms-ish */ + } + + return 0; +} + +/* + * Helper for set_link_state(). Do not call except from that routine. + * Expects ppd->hls_mutex to be held. + * + * @rem_reason value to be sent to the neighbor + * + * LinkDownReasons only set if transition succeeds. + */ +static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 pstate, previous_state; + u32 last_local_state; + u32 last_remote_state; + int ret; + int do_transition; + int do_wait; + + previous_state = ppd->host_link_state; + ppd->host_link_state = HLS_GOING_OFFLINE; + pstate = read_physical_state(dd); + if (pstate == PLS_OFFLINE) { + do_transition = 0; /* in right state */ + do_wait = 0; /* ...no need to wait */ + } else if ((pstate & 0xff) == PLS_OFFLINE) { + do_transition = 0; /* in an offline transient state */ + do_wait = 1; /* ...wait for it to settle */ + } else { + do_transition = 1; /* need to move to offline */ + do_wait = 1; /* ...will need to wait */ + } + + if (do_transition) { + ret = set_physical_link_state(dd, + PLS_OFFLINE | (rem_reason << 8)); + + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to transition to Offline link state, return %d\n", + ret); + return -EINVAL; + } + if (ppd->offline_disabled_reason == OPA_LINKDOWN_REASON_NONE) + ppd->offline_disabled_reason = + OPA_LINKDOWN_REASON_TRANSIENT; + } + + if (do_wait) { + /* it can take a while for the link to go down */ + ret = wait_phy_linkstate(dd, PLS_OFFLINE, 5000); + if (ret < 0) + return ret; + } + + /* make sure the logical state is also down */ + wait_logical_linkstate(ppd, IB_PORT_DOWN, 1000); + + /* + * Now in charge of LCB - must be after the physical state is + * offline.quiet and before host_link_state is changed. + */ + set_host_lcb_access(dd); + write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */ + ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */ + + /* + * The LNI has a mandatory wait time after the physical state + * moves to Offline.Quiet. The wait time may be different + * depending on how the link went down. The 8051 firmware + * will observe the needed wait time and only move to ready + * when that is completed. The largest of the quiet timeouts + * is 2.5s, so wait that long and then a bit more. + */ + ret = wait_fm_ready(dd, 3000); + if (ret) { + dd_dev_err(dd, + "After going offline, timed out waiting for the 8051 to become ready to accept host requests\n"); + /* state is really offline, so make it so */ + ppd->host_link_state = HLS_DN_OFFLINE; + return ret; + } + + /* + * The state is now offline and the 8051 is ready to accept host + * requests. + * - change our state + * - notify others if we were previously in a linkup state + */ + ppd->host_link_state = HLS_DN_OFFLINE; + if (previous_state & HLS_UP) { + /* went down while link was up */ + handle_linkup_change(dd, 0); + } else if (previous_state + & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) { + /* went down while attempting link up */ + /* byte 1 of last_*_state is the failure reason */ + read_last_local_state(dd, &last_local_state); + read_last_remote_state(dd, &last_remote_state); + dd_dev_err(dd, + "LNI failure last states: local 0x%08x, remote 0x%08x\n", + last_local_state, last_remote_state); + } + + /* the active link width (downgrade) is 0 on link down */ + ppd->link_width_active = 0; + ppd->link_width_downgrade_tx_active = 0; + ppd->link_width_downgrade_rx_active = 0; + ppd->current_egress_rate = 0; + return 0; +} + +/* return the link state name */ +static const char *link_state_name(u32 state) +{ + const char *name; + int n = ilog2(state); + static const char * const names[] = { + [__HLS_UP_INIT_BP] = "INIT", + [__HLS_UP_ARMED_BP] = "ARMED", + [__HLS_UP_ACTIVE_BP] = "ACTIVE", + [__HLS_DN_DOWNDEF_BP] = "DOWNDEF", + [__HLS_DN_POLL_BP] = "POLL", + [__HLS_DN_DISABLE_BP] = "DISABLE", + [__HLS_DN_OFFLINE_BP] = "OFFLINE", + [__HLS_VERIFY_CAP_BP] = "VERIFY_CAP", + [__HLS_GOING_UP_BP] = "GOING_UP", + [__HLS_GOING_OFFLINE_BP] = "GOING_OFFLINE", + [__HLS_LINK_COOLDOWN_BP] = "LINK_COOLDOWN" + }; + + name = n < ARRAY_SIZE(names) ? names[n] : NULL; + return name ? name : "unknown"; +} + +/* return the link state reason name */ +static const char *link_state_reason_name(struct hfi1_pportdata *ppd, u32 state) +{ + if (state == HLS_UP_INIT) { + switch (ppd->linkinit_reason) { + case OPA_LINKINIT_REASON_LINKUP: + return "(LINKUP)"; + case OPA_LINKINIT_REASON_FLAPPING: + return "(FLAPPING)"; + case OPA_LINKINIT_OUTSIDE_POLICY: + return "(OUTSIDE_POLICY)"; + case OPA_LINKINIT_QUARANTINED: + return "(QUARANTINED)"; + case OPA_LINKINIT_INSUFIC_CAPABILITY: + return "(INSUFIC_CAPABILITY)"; + default: + break; + } + } + return ""; +} + +/* + * driver_physical_state - convert the driver's notion of a port's + * state (an HLS_*) into a physical state (a {IB,OPA}_PORTPHYSSTATE_*). + * Return -1 (converted to a u32) to indicate error. + */ +u32 driver_physical_state(struct hfi1_pportdata *ppd) +{ + switch (ppd->host_link_state) { + case HLS_UP_INIT: + case HLS_UP_ARMED: + case HLS_UP_ACTIVE: + return IB_PORTPHYSSTATE_LINKUP; + case HLS_DN_POLL: + return IB_PORTPHYSSTATE_POLLING; + case HLS_DN_DISABLE: + return IB_PORTPHYSSTATE_DISABLED; + case HLS_DN_OFFLINE: + return OPA_PORTPHYSSTATE_OFFLINE; + case HLS_VERIFY_CAP: + return IB_PORTPHYSSTATE_POLLING; + case HLS_GOING_UP: + return IB_PORTPHYSSTATE_POLLING; + case HLS_GOING_OFFLINE: + return OPA_PORTPHYSSTATE_OFFLINE; + case HLS_LINK_COOLDOWN: + return OPA_PORTPHYSSTATE_OFFLINE; + case HLS_DN_DOWNDEF: + default: + dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n", + ppd->host_link_state); + return -1; + } +} + +/* + * driver_logical_state - convert the driver's notion of a port's + * state (an HLS_*) into a logical state (a IB_PORT_*). Return -1 + * (converted to a u32) to indicate error. + */ +u32 driver_logical_state(struct hfi1_pportdata *ppd) +{ + if (ppd->host_link_state && !(ppd->host_link_state & HLS_UP)) + return IB_PORT_DOWN; + + switch (ppd->host_link_state & HLS_UP) { + case HLS_UP_INIT: + return IB_PORT_INIT; + case HLS_UP_ARMED: + return IB_PORT_ARMED; + case HLS_UP_ACTIVE: + return IB_PORT_ACTIVE; + default: + dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n", + ppd->host_link_state); + return -1; + } +} + +void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason, + u8 neigh_reason, u8 rem_reason) +{ + if (ppd->local_link_down_reason.latest == 0 && + ppd->neigh_link_down_reason.latest == 0) { + ppd->local_link_down_reason.latest = lcl_reason; + ppd->neigh_link_down_reason.latest = neigh_reason; + ppd->remote_link_down_reason = rem_reason; + } +} + +/* + * Change the physical and/or logical link state. + * + * Do not call this routine while inside an interrupt. It contains + * calls to routines that can take multiple seconds to finish. + * + * Returns 0 on success, -errno on failure. + */ +int set_link_state(struct hfi1_pportdata *ppd, u32 state) +{ + struct hfi1_devdata *dd = ppd->dd; + struct ib_event event = {.device = NULL}; + int ret1, ret = 0; + int was_up, is_down; + int orig_new_state, poll_bounce; + + mutex_lock(&ppd->hls_lock); + + orig_new_state = state; + if (state == HLS_DN_DOWNDEF) + state = dd->link_default; + + /* interpret poll -> poll as a link bounce */ + poll_bounce = ppd->host_link_state == HLS_DN_POLL + && state == HLS_DN_POLL; + + dd_dev_info(dd, "%s: current %s, new %s %s%s\n", __func__, + link_state_name(ppd->host_link_state), + link_state_name(orig_new_state), + poll_bounce ? "(bounce) " : "", + link_state_reason_name(ppd, state)); + + was_up = !!(ppd->host_link_state & HLS_UP); + + /* + * If we're going to a (HLS_*) link state that implies the logical + * link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then + * reset is_sm_config_started to 0. + */ + if (!(state & (HLS_UP_ARMED | HLS_UP_ACTIVE))) + ppd->is_sm_config_started = 0; + + /* + * Do nothing if the states match. Let a poll to poll link bounce + * go through. + */ + if (ppd->host_link_state == state && !poll_bounce) + goto done; + + switch (state) { + case HLS_UP_INIT: + if (ppd->host_link_state == HLS_DN_POLL && (quick_linkup + || dd->icode == ICODE_FUNCTIONAL_SIMULATOR)) { + /* + * Quick link up jumps from polling to here. + * + * Whether in normal or loopback mode, the + * simulator jumps from polling to link up. + * Accept that here. + */ + /* OK */; + } else if (ppd->host_link_state != HLS_GOING_UP) { + goto unexpected; + } + + ppd->host_link_state = HLS_UP_INIT; + ret = wait_logical_linkstate(ppd, IB_PORT_INIT, 1000); + if (ret) { + /* logical state didn't change, stay at going_up */ + ppd->host_link_state = HLS_GOING_UP; + dd_dev_err(dd, + "%s: logical state did not change to INIT\n", + __func__); + } else { + /* clear old transient LINKINIT_REASON code */ + if (ppd->linkinit_reason >= OPA_LINKINIT_REASON_CLEAR) + ppd->linkinit_reason = + OPA_LINKINIT_REASON_LINKUP; + + /* enable the port */ + add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); + + handle_linkup_change(dd, 1); + } + break; + case HLS_UP_ARMED: + if (ppd->host_link_state != HLS_UP_INIT) + goto unexpected; + + ppd->host_link_state = HLS_UP_ARMED; + set_logical_state(dd, LSTATE_ARMED); + ret = wait_logical_linkstate(ppd, IB_PORT_ARMED, 1000); + if (ret) { + /* logical state didn't change, stay at init */ + ppd->host_link_state = HLS_UP_INIT; + dd_dev_err(dd, + "%s: logical state did not change to ARMED\n", + __func__); + } + /* + * The simulator does not currently implement SMA messages, + * so neighbor_normal is not set. Set it here when we first + * move to Armed. + */ + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) + ppd->neighbor_normal = 1; + break; + case HLS_UP_ACTIVE: + if (ppd->host_link_state != HLS_UP_ARMED) + goto unexpected; + + ppd->host_link_state = HLS_UP_ACTIVE; + set_logical_state(dd, LSTATE_ACTIVE); + ret = wait_logical_linkstate(ppd, IB_PORT_ACTIVE, 1000); + if (ret) { + /* logical state didn't change, stay at armed */ + ppd->host_link_state = HLS_UP_ARMED; + dd_dev_err(dd, + "%s: logical state did not change to ACTIVE\n", + __func__); + } else { + + /* tell all engines to go running */ + sdma_all_running(dd); + + /* Signal the IB layer that the port has went active */ + event.device = &dd->verbs_dev.ibdev; + event.element.port_num = ppd->port; + event.event = IB_EVENT_PORT_ACTIVE; + } + break; + case HLS_DN_POLL: + if ((ppd->host_link_state == HLS_DN_DISABLE || + ppd->host_link_state == HLS_DN_OFFLINE) && + dd->dc_shutdown) + dc_start(dd); + /* Hand LED control to the DC */ + write_csr(dd, DCC_CFG_LED_CNTRL, 0); + + if (ppd->host_link_state != HLS_DN_OFFLINE) { + u8 tmp = ppd->link_enabled; + + ret = goto_offline(ppd, ppd->remote_link_down_reason); + if (ret) { + ppd->link_enabled = tmp; + break; + } + ppd->remote_link_down_reason = 0; + + if (ppd->driver_link_ready) + ppd->link_enabled = 1; + } + + ret = set_local_link_attributes(ppd); + if (ret) + break; + + ppd->port_error_action = 0; + ppd->host_link_state = HLS_DN_POLL; + + if (quick_linkup) { + /* quick linkup does not go into polling */ + ret = do_quick_linkup(dd); + } else { + ret1 = set_physical_link_state(dd, PLS_POLLING); + if (ret1 != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to transition to Polling link state, return 0x%x\n", + ret1); + ret = -EINVAL; + } + } + ppd->offline_disabled_reason = OPA_LINKDOWN_REASON_NONE; + /* + * If an error occurred above, go back to offline. The + * caller may reschedule another attempt. + */ + if (ret) + goto_offline(ppd, 0); + break; + case HLS_DN_DISABLE: + /* link is disabled */ + ppd->link_enabled = 0; + + /* allow any state to transition to disabled */ + + /* must transition to offline first */ + if (ppd->host_link_state != HLS_DN_OFFLINE) { + ret = goto_offline(ppd, ppd->remote_link_down_reason); + if (ret) + break; + ppd->remote_link_down_reason = 0; + } + + ret1 = set_physical_link_state(dd, PLS_DISABLED); + if (ret1 != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to transition to Disabled link state, return 0x%x\n", + ret1); + ret = -EINVAL; + break; + } + ppd->host_link_state = HLS_DN_DISABLE; + dc_shutdown(dd); + break; + case HLS_DN_OFFLINE: + if (ppd->host_link_state == HLS_DN_DISABLE) + dc_start(dd); + + /* allow any state to transition to offline */ + ret = goto_offline(ppd, ppd->remote_link_down_reason); + if (!ret) + ppd->remote_link_down_reason = 0; + break; + case HLS_VERIFY_CAP: + if (ppd->host_link_state != HLS_DN_POLL) + goto unexpected; + ppd->host_link_state = HLS_VERIFY_CAP; + break; + case HLS_GOING_UP: + if (ppd->host_link_state != HLS_VERIFY_CAP) + goto unexpected; + + ret1 = set_physical_link_state(dd, PLS_LINKUP); + if (ret1 != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to transition to link up state, return 0x%x\n", + ret1); + ret = -EINVAL; + break; + } + ppd->host_link_state = HLS_GOING_UP; + break; + + case HLS_GOING_OFFLINE: /* transient within goto_offline() */ + case HLS_LINK_COOLDOWN: /* transient within goto_offline() */ + default: + dd_dev_info(dd, "%s: state 0x%x: not supported\n", + __func__, state); + ret = -EINVAL; + break; + } + + is_down = !!(ppd->host_link_state & (HLS_DN_POLL | + HLS_DN_DISABLE | HLS_DN_OFFLINE)); + + if (was_up && is_down && ppd->local_link_down_reason.sma == 0 && + ppd->neigh_link_down_reason.sma == 0) { + ppd->local_link_down_reason.sma = + ppd->local_link_down_reason.latest; + ppd->neigh_link_down_reason.sma = + ppd->neigh_link_down_reason.latest; + } + + goto done; + +unexpected: + dd_dev_err(dd, "%s: unexpected state transition from %s to %s\n", + __func__, link_state_name(ppd->host_link_state), + link_state_name(state)); + ret = -EINVAL; + +done: + mutex_unlock(&ppd->hls_lock); + + if (event.device) + ib_dispatch_event(&event); + + return ret; +} + +int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val) +{ + u64 reg; + int ret = 0; + + switch (which) { + case HFI1_IB_CFG_LIDLMC: + set_lidlmc(ppd); + break; + case HFI1_IB_CFG_VL_HIGH_LIMIT: + /* + * The VL Arbitrator high limit is sent in units of 4k + * bytes, while HFI stores it in units of 64 bytes. + */ + val *= 4096/64; + reg = ((u64)val & SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK) + << SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT; + write_csr(ppd->dd, SEND_HIGH_PRIORITY_LIMIT, reg); + break; + case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + /* HFI only supports POLL as the default link down state */ + if (val != HLS_DN_POLL) + ret = -EINVAL; + break; + case HFI1_IB_CFG_OP_VLS: + if (ppd->vls_operational != val) { + ppd->vls_operational = val; + if (!ppd->port) + ret = -EINVAL; + else + ret = sdma_map_init( + ppd->dd, + ppd->port - 1, + val, + NULL); + } + break; + /* + * For link width, link width downgrade, and speed enable, always AND + * the setting with what is actually supported. This has two benefits. + * First, enabled can't have unsupported values, no matter what the + * SM or FM might want. Second, the ALL_SUPPORTED wildcards that mean + * "fill in with your supported value" have all the bits in the + * field set, so simply ANDing with supported has the desired result. + */ + case HFI1_IB_CFG_LWID_ENB: /* set allowed Link-width */ + ppd->link_width_enabled = val & ppd->link_width_supported; + break; + case HFI1_IB_CFG_LWID_DG_ENB: /* set allowed link width downgrade */ + ppd->link_width_downgrade_enabled = + val & ppd->link_width_downgrade_supported; + break; + case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */ + ppd->link_speed_enabled = val & ppd->link_speed_supported; + break; + case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + /* + * HFI does not follow IB specs, save this value + * so we can report it, if asked. + */ + ppd->overrun_threshold = val; + break; + case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + /* + * HFI does not follow IB specs, save this value + * so we can report it, if asked. + */ + ppd->phy_error_threshold = val; + break; + + case HFI1_IB_CFG_MTU: + set_send_length(ppd); + break; + + case HFI1_IB_CFG_PKEYS: + if (HFI1_CAP_IS_KSET(PKEY_CHECK)) + set_partition_keys(ppd); + break; + + default: + if (HFI1_CAP_IS_KSET(PRINT_UNIMPL)) + dd_dev_info(ppd->dd, + "%s: which %s, val 0x%x: not implemented\n", + __func__, ib_cfg_name(which), val); + break; + } + return ret; +} + +/* begin functions related to vl arbitration table caching */ +static void init_vl_arb_caches(struct hfi1_pportdata *ppd) +{ + int i; + + BUILD_BUG_ON(VL_ARB_TABLE_SIZE != + VL_ARB_LOW_PRIO_TABLE_SIZE); + BUILD_BUG_ON(VL_ARB_TABLE_SIZE != + VL_ARB_HIGH_PRIO_TABLE_SIZE); + + /* + * Note that we always return values directly from the + * 'vl_arb_cache' (and do no CSR reads) in response to a + * 'Get(VLArbTable)'. This is obviously correct after a + * 'Set(VLArbTable)', since the cache will then be up to + * date. But it's also correct prior to any 'Set(VLArbTable)' + * since then both the cache, and the relevant h/w registers + * will be zeroed. + */ + + for (i = 0; i < MAX_PRIO_TABLE; i++) + spin_lock_init(&ppd->vl_arb_cache[i].lock); +} + +/* + * vl_arb_lock_cache + * + * All other vl_arb_* functions should be called only after locking + * the cache. + */ +static inline struct vl_arb_cache * +vl_arb_lock_cache(struct hfi1_pportdata *ppd, int idx) +{ + if (idx != LO_PRIO_TABLE && idx != HI_PRIO_TABLE) + return NULL; + spin_lock(&ppd->vl_arb_cache[idx].lock); + return &ppd->vl_arb_cache[idx]; +} + +static inline void vl_arb_unlock_cache(struct hfi1_pportdata *ppd, int idx) +{ + spin_unlock(&ppd->vl_arb_cache[idx].lock); +} + +static void vl_arb_get_cache(struct vl_arb_cache *cache, + struct ib_vl_weight_elem *vl) +{ + memcpy(vl, cache->table, VL_ARB_TABLE_SIZE * sizeof(*vl)); +} + +static void vl_arb_set_cache(struct vl_arb_cache *cache, + struct ib_vl_weight_elem *vl) +{ + memcpy(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl)); +} + +static int vl_arb_match_cache(struct vl_arb_cache *cache, + struct ib_vl_weight_elem *vl) +{ + return !memcmp(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl)); +} +/* end functions related to vl arbitration table caching */ + +static int set_vl_weights(struct hfi1_pportdata *ppd, u32 target, + u32 size, struct ib_vl_weight_elem *vl) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 reg; + unsigned int i, is_up = 0; + int drain, ret = 0; + + mutex_lock(&ppd->hls_lock); + + if (ppd->host_link_state & HLS_UP) + is_up = 1; + + drain = !is_ax(dd) && is_up; + + if (drain) + /* + * Before adjusting VL arbitration weights, empty per-VL + * FIFOs, otherwise a packet whose VL weight is being + * set to 0 could get stuck in a FIFO with no chance to + * egress. + */ + ret = stop_drain_data_vls(dd); + + if (ret) { + dd_dev_err( + dd, + "%s: cannot stop/drain VLs - refusing to change VL arbitration weights\n", + __func__); + goto err; + } + + for (i = 0; i < size; i++, vl++) { + /* + * NOTE: The low priority shift and mask are used here, but + * they are the same for both the low and high registers. + */ + reg = (((u64)vl->vl & SEND_LOW_PRIORITY_LIST_VL_MASK) + << SEND_LOW_PRIORITY_LIST_VL_SHIFT) + | (((u64)vl->weight + & SEND_LOW_PRIORITY_LIST_WEIGHT_MASK) + << SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT); + write_csr(dd, target + (i * 8), reg); + } + pio_send_control(dd, PSC_GLOBAL_VLARB_ENABLE); + + if (drain) + open_fill_data_vls(dd); /* reopen all VLs */ + +err: + mutex_unlock(&ppd->hls_lock); + + return ret; +} + +/* + * Read one credit merge VL register. + */ +static void read_one_cm_vl(struct hfi1_devdata *dd, u32 csr, + struct vl_limit *vll) +{ + u64 reg = read_csr(dd, csr); + + vll->dedicated = cpu_to_be16( + (reg >> SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT) + & SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK); + vll->shared = cpu_to_be16( + (reg >> SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT) + & SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK); +} + +/* + * Read the current credit merge limits. + */ +static int get_buffer_control(struct hfi1_devdata *dd, + struct buffer_control *bc, u16 *overall_limit) +{ + u64 reg; + int i; + + /* not all entries are filled in */ + memset(bc, 0, sizeof(*bc)); + + /* OPA and HFI have a 1-1 mapping */ + for (i = 0; i < TXE_NUM_DATA_VL; i++) + read_one_cm_vl(dd, SEND_CM_CREDIT_VL + (8*i), &bc->vl[i]); + + /* NOTE: assumes that VL* and VL15 CSRs are bit-wise identical */ + read_one_cm_vl(dd, SEND_CM_CREDIT_VL15, &bc->vl[15]); + + reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT); + bc->overall_shared_limit = cpu_to_be16( + (reg >> SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT) + & SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK); + if (overall_limit) + *overall_limit = (reg + >> SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT) + & SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK; + return sizeof(struct buffer_control); +} + +static int get_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp) +{ + u64 reg; + int i; + + /* each register contains 16 SC->VLnt mappings, 4 bits each */ + reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_15_0); + for (i = 0; i < sizeof(u64); i++) { + u8 byte = *(((u8 *)®) + i); + + dp->vlnt[2 * i] = byte & 0xf; + dp->vlnt[(2 * i) + 1] = (byte & 0xf0) >> 4; + } + + reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_31_16); + for (i = 0; i < sizeof(u64); i++) { + u8 byte = *(((u8 *)®) + i); + + dp->vlnt[16 + (2 * i)] = byte & 0xf; + dp->vlnt[16 + (2 * i) + 1] = (byte & 0xf0) >> 4; + } + return sizeof(struct sc2vlnt); +} + +static void get_vlarb_preempt(struct hfi1_devdata *dd, u32 nelems, + struct ib_vl_weight_elem *vl) +{ + unsigned int i; + + for (i = 0; i < nelems; i++, vl++) { + vl->vl = 0xf; + vl->weight = 0; + } +} + +static void set_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp) +{ + write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0, + DC_SC_VL_VAL(15_0, + 0, dp->vlnt[0] & 0xf, + 1, dp->vlnt[1] & 0xf, + 2, dp->vlnt[2] & 0xf, + 3, dp->vlnt[3] & 0xf, + 4, dp->vlnt[4] & 0xf, + 5, dp->vlnt[5] & 0xf, + 6, dp->vlnt[6] & 0xf, + 7, dp->vlnt[7] & 0xf, + 8, dp->vlnt[8] & 0xf, + 9, dp->vlnt[9] & 0xf, + 10, dp->vlnt[10] & 0xf, + 11, dp->vlnt[11] & 0xf, + 12, dp->vlnt[12] & 0xf, + 13, dp->vlnt[13] & 0xf, + 14, dp->vlnt[14] & 0xf, + 15, dp->vlnt[15] & 0xf)); + write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16, + DC_SC_VL_VAL(31_16, + 16, dp->vlnt[16] & 0xf, + 17, dp->vlnt[17] & 0xf, + 18, dp->vlnt[18] & 0xf, + 19, dp->vlnt[19] & 0xf, + 20, dp->vlnt[20] & 0xf, + 21, dp->vlnt[21] & 0xf, + 22, dp->vlnt[22] & 0xf, + 23, dp->vlnt[23] & 0xf, + 24, dp->vlnt[24] & 0xf, + 25, dp->vlnt[25] & 0xf, + 26, dp->vlnt[26] & 0xf, + 27, dp->vlnt[27] & 0xf, + 28, dp->vlnt[28] & 0xf, + 29, dp->vlnt[29] & 0xf, + 30, dp->vlnt[30] & 0xf, + 31, dp->vlnt[31] & 0xf)); +} + +static void nonzero_msg(struct hfi1_devdata *dd, int idx, const char *what, + u16 limit) +{ + if (limit != 0) + dd_dev_info(dd, "Invalid %s limit %d on VL %d, ignoring\n", + what, (int)limit, idx); +} + +/* change only the shared limit portion of SendCmGLobalCredit */ +static void set_global_shared(struct hfi1_devdata *dd, u16 limit) +{ + u64 reg; + + reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT); + reg &= ~SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK; + reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT; + write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg); +} + +/* change only the total credit limit portion of SendCmGLobalCredit */ +static void set_global_limit(struct hfi1_devdata *dd, u16 limit) +{ + u64 reg; + + reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT); + reg &= ~SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK; + reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT; + write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg); +} + +/* set the given per-VL shared limit */ +static void set_vl_shared(struct hfi1_devdata *dd, int vl, u16 limit) +{ + u64 reg; + u32 addr; + + if (vl < TXE_NUM_DATA_VL) + addr = SEND_CM_CREDIT_VL + (8 * vl); + else + addr = SEND_CM_CREDIT_VL15; + + reg = read_csr(dd, addr); + reg &= ~SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK; + reg |= (u64)limit << SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT; + write_csr(dd, addr, reg); +} + +/* set the given per-VL dedicated limit */ +static void set_vl_dedicated(struct hfi1_devdata *dd, int vl, u16 limit) +{ + u64 reg; + u32 addr; + + if (vl < TXE_NUM_DATA_VL) + addr = SEND_CM_CREDIT_VL + (8 * vl); + else + addr = SEND_CM_CREDIT_VL15; + + reg = read_csr(dd, addr); + reg &= ~SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK; + reg |= (u64)limit << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT; + write_csr(dd, addr, reg); +} + +/* spin until the given per-VL status mask bits clear */ +static void wait_for_vl_status_clear(struct hfi1_devdata *dd, u64 mask, + const char *which) +{ + unsigned long timeout; + u64 reg; + + timeout = jiffies + msecs_to_jiffies(VL_STATUS_CLEAR_TIMEOUT); + while (1) { + reg = read_csr(dd, SEND_CM_CREDIT_USED_STATUS) & mask; + + if (reg == 0) + return; /* success */ + if (time_after(jiffies, timeout)) + break; /* timed out */ + udelay(1); + } + + dd_dev_err(dd, + "%s credit change status not clearing after %dms, mask 0x%llx, not clear 0x%llx\n", + which, VL_STATUS_CLEAR_TIMEOUT, mask, reg); + /* + * If this occurs, it is likely there was a credit loss on the link. + * The only recovery from that is a link bounce. + */ + dd_dev_err(dd, + "Continuing anyway. A credit loss may occur. Suggest a link bounce\n"); +} + +/* + * The number of credits on the VLs may be changed while everything + * is "live", but the following algorithm must be followed due to + * how the hardware is actually implemented. In particular, + * Return_Credit_Status[] is the only correct status check. + * + * if (reducing Global_Shared_Credit_Limit or any shared limit changing) + * set Global_Shared_Credit_Limit = 0 + * use_all_vl = 1 + * mask0 = all VLs that are changing either dedicated or shared limits + * set Shared_Limit[mask0] = 0 + * spin until Return_Credit_Status[use_all_vl ? all VL : mask0] == 0 + * if (changing any dedicated limit) + * mask1 = all VLs that are lowering dedicated limits + * lower Dedicated_Limit[mask1] + * spin until Return_Credit_Status[mask1] == 0 + * raise Dedicated_Limits + * raise Shared_Limits + * raise Global_Shared_Credit_Limit + * + * lower = if the new limit is lower, set the limit to the new value + * raise = if the new limit is higher than the current value (may be changed + * earlier in the algorithm), set the new limit to the new value + */ +static int set_buffer_control(struct hfi1_devdata *dd, + struct buffer_control *new_bc) +{ + u64 changing_mask, ld_mask, stat_mask; + int change_count; + int i, use_all_mask; + int this_shared_changing; + /* + * A0: add the variable any_shared_limit_changing below and in the + * algorithm above. If removing A0 support, it can be removed. + */ + int any_shared_limit_changing; + struct buffer_control cur_bc; + u8 changing[OPA_MAX_VLS]; + u8 lowering_dedicated[OPA_MAX_VLS]; + u16 cur_total; + u32 new_total = 0; + const u64 all_mask = + SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK; + +#define valid_vl(idx) ((idx) < TXE_NUM_DATA_VL || (idx) == 15) +#define NUM_USABLE_VLS 16 /* look at VL15 and less */ + + + /* find the new total credits, do sanity check on unused VLs */ + for (i = 0; i < OPA_MAX_VLS; i++) { + if (valid_vl(i)) { + new_total += be16_to_cpu(new_bc->vl[i].dedicated); + continue; + } + nonzero_msg(dd, i, "dedicated", + be16_to_cpu(new_bc->vl[i].dedicated)); + nonzero_msg(dd, i, "shared", + be16_to_cpu(new_bc->vl[i].shared)); + new_bc->vl[i].dedicated = 0; + new_bc->vl[i].shared = 0; + } + new_total += be16_to_cpu(new_bc->overall_shared_limit); + if (new_total > (u32)dd->link_credits) + return -EINVAL; + /* fetch the current values */ + get_buffer_control(dd, &cur_bc, &cur_total); + + /* + * Create the masks we will use. + */ + memset(changing, 0, sizeof(changing)); + memset(lowering_dedicated, 0, sizeof(lowering_dedicated)); + /* NOTE: Assumes that the individual VL bits are adjacent and in + increasing order */ + stat_mask = + SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK; + changing_mask = 0; + ld_mask = 0; + change_count = 0; + any_shared_limit_changing = 0; + for (i = 0; i < NUM_USABLE_VLS; i++, stat_mask <<= 1) { + if (!valid_vl(i)) + continue; + this_shared_changing = new_bc->vl[i].shared + != cur_bc.vl[i].shared; + if (this_shared_changing) + any_shared_limit_changing = 1; + if (new_bc->vl[i].dedicated != cur_bc.vl[i].dedicated + || this_shared_changing) { + changing[i] = 1; + changing_mask |= stat_mask; + change_count++; + } + if (be16_to_cpu(new_bc->vl[i].dedicated) < + be16_to_cpu(cur_bc.vl[i].dedicated)) { + lowering_dedicated[i] = 1; + ld_mask |= stat_mask; + } + } + + /* bracket the credit change with a total adjustment */ + if (new_total > cur_total) + set_global_limit(dd, new_total); + + /* + * Start the credit change algorithm. + */ + use_all_mask = 0; + if ((be16_to_cpu(new_bc->overall_shared_limit) < + be16_to_cpu(cur_bc.overall_shared_limit)) + || (is_a0(dd) && any_shared_limit_changing)) { + set_global_shared(dd, 0); + cur_bc.overall_shared_limit = 0; + use_all_mask = 1; + } + + for (i = 0; i < NUM_USABLE_VLS; i++) { + if (!valid_vl(i)) + continue; + + if (changing[i]) { + set_vl_shared(dd, i, 0); + cur_bc.vl[i].shared = 0; + } + } + + wait_for_vl_status_clear(dd, use_all_mask ? all_mask : changing_mask, + "shared"); + + if (change_count > 0) { + for (i = 0; i < NUM_USABLE_VLS; i++) { + if (!valid_vl(i)) + continue; + + if (lowering_dedicated[i]) { + set_vl_dedicated(dd, i, + be16_to_cpu(new_bc->vl[i].dedicated)); + cur_bc.vl[i].dedicated = + new_bc->vl[i].dedicated; + } + } + + wait_for_vl_status_clear(dd, ld_mask, "dedicated"); + + /* now raise all dedicated that are going up */ + for (i = 0; i < NUM_USABLE_VLS; i++) { + if (!valid_vl(i)) + continue; + + if (be16_to_cpu(new_bc->vl[i].dedicated) > + be16_to_cpu(cur_bc.vl[i].dedicated)) + set_vl_dedicated(dd, i, + be16_to_cpu(new_bc->vl[i].dedicated)); + } + } + + /* next raise all shared that are going up */ + for (i = 0; i < NUM_USABLE_VLS; i++) { + if (!valid_vl(i)) + continue; + + if (be16_to_cpu(new_bc->vl[i].shared) > + be16_to_cpu(cur_bc.vl[i].shared)) + set_vl_shared(dd, i, be16_to_cpu(new_bc->vl[i].shared)); + } + + /* finally raise the global shared */ + if (be16_to_cpu(new_bc->overall_shared_limit) > + be16_to_cpu(cur_bc.overall_shared_limit)) + set_global_shared(dd, + be16_to_cpu(new_bc->overall_shared_limit)); + + /* bracket the credit change with a total adjustment */ + if (new_total < cur_total) + set_global_limit(dd, new_total); + return 0; +} + +/* + * Read the given fabric manager table. Return the size of the + * table (in bytes) on success, and a negative error code on + * failure. + */ +int fm_get_table(struct hfi1_pportdata *ppd, int which, void *t) + +{ + int size; + struct vl_arb_cache *vlc; + + switch (which) { + case FM_TBL_VL_HIGH_ARB: + size = 256; + /* + * OPA specifies 128 elements (of 2 bytes each), though + * HFI supports only 16 elements in h/w. + */ + vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE); + vl_arb_get_cache(vlc, t); + vl_arb_unlock_cache(ppd, HI_PRIO_TABLE); + break; + case FM_TBL_VL_LOW_ARB: + size = 256; + /* + * OPA specifies 128 elements (of 2 bytes each), though + * HFI supports only 16 elements in h/w. + */ + vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE); + vl_arb_get_cache(vlc, t); + vl_arb_unlock_cache(ppd, LO_PRIO_TABLE); + break; + case FM_TBL_BUFFER_CONTROL: + size = get_buffer_control(ppd->dd, t, NULL); + break; + case FM_TBL_SC2VLNT: + size = get_sc2vlnt(ppd->dd, t); + break; + case FM_TBL_VL_PREEMPT_ELEMS: + size = 256; + /* OPA specifies 128 elements, of 2 bytes each */ + get_vlarb_preempt(ppd->dd, OPA_MAX_VLS, t); + break; + case FM_TBL_VL_PREEMPT_MATRIX: + size = 256; + /* + * OPA specifies that this is the same size as the VL + * arbitration tables (i.e., 256 bytes). + */ + break; + default: + return -EINVAL; + } + return size; +} + +/* + * Write the given fabric manager table. + */ +int fm_set_table(struct hfi1_pportdata *ppd, int which, void *t) +{ + int ret = 0; + struct vl_arb_cache *vlc; + + switch (which) { + case FM_TBL_VL_HIGH_ARB: + vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE); + if (vl_arb_match_cache(vlc, t)) { + vl_arb_unlock_cache(ppd, HI_PRIO_TABLE); + break; + } + vl_arb_set_cache(vlc, t); + vl_arb_unlock_cache(ppd, HI_PRIO_TABLE); + ret = set_vl_weights(ppd, SEND_HIGH_PRIORITY_LIST, + VL_ARB_HIGH_PRIO_TABLE_SIZE, t); + break; + case FM_TBL_VL_LOW_ARB: + vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE); + if (vl_arb_match_cache(vlc, t)) { + vl_arb_unlock_cache(ppd, LO_PRIO_TABLE); + break; + } + vl_arb_set_cache(vlc, t); + vl_arb_unlock_cache(ppd, LO_PRIO_TABLE); + ret = set_vl_weights(ppd, SEND_LOW_PRIORITY_LIST, + VL_ARB_LOW_PRIO_TABLE_SIZE, t); + break; + case FM_TBL_BUFFER_CONTROL: + ret = set_buffer_control(ppd->dd, t); + break; + case FM_TBL_SC2VLNT: + set_sc2vlnt(ppd->dd, t); + break; + default: + ret = -EINVAL; + } + return ret; +} + +/* + * Disable all data VLs. + * + * Return 0 if disabled, non-zero if the VLs cannot be disabled. + */ +static int disable_data_vls(struct hfi1_devdata *dd) +{ + if (is_a0(dd)) + return 1; + + pio_send_control(dd, PSC_DATA_VL_DISABLE); + + return 0; +} + +/* + * open_fill_data_vls() - the counterpart to stop_drain_data_vls(). + * Just re-enables all data VLs (the "fill" part happens + * automatically - the name was chosen for symmetry with + * stop_drain_data_vls()). + * + * Return 0 if successful, non-zero if the VLs cannot be enabled. + */ +int open_fill_data_vls(struct hfi1_devdata *dd) +{ + if (is_a0(dd)) + return 1; + + pio_send_control(dd, PSC_DATA_VL_ENABLE); + + return 0; +} + +/* + * drain_data_vls() - assumes that disable_data_vls() has been called, + * wait for occupancy (of per-VL FIFOs) for all contexts, and SDMA + * engines to drop to 0. + */ +static void drain_data_vls(struct hfi1_devdata *dd) +{ + sc_wait(dd); + sdma_wait(dd); + pause_for_credit_return(dd); +} + +/* + * stop_drain_data_vls() - disable, then drain all per-VL fifos. + * + * Use open_fill_data_vls() to resume using data VLs. This pair is + * meant to be used like this: + * + * stop_drain_data_vls(dd); + * // do things with per-VL resources + * open_fill_data_vls(dd); + */ +int stop_drain_data_vls(struct hfi1_devdata *dd) +{ + int ret; + + ret = disable_data_vls(dd); + if (ret == 0) + drain_data_vls(dd); + + return ret; +} + +/* + * Convert a nanosecond time to a cclock count. No matter how slow + * the cclock, a non-zero ns will always have a non-zero result. + */ +u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns) +{ + u32 cclocks; + + if (dd->icode == ICODE_FPGA_EMULATION) + cclocks = (ns * 1000) / FPGA_CCLOCK_PS; + else /* simulation pretends to be ASIC */ + cclocks = (ns * 1000) / ASIC_CCLOCK_PS; + if (ns && !cclocks) /* if ns nonzero, must be at least 1 */ + cclocks = 1; + return cclocks; +} + +/* + * Convert a cclock count to nanoseconds. Not matter how slow + * the cclock, a non-zero cclocks will always have a non-zero result. + */ +u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclocks) +{ + u32 ns; + + if (dd->icode == ICODE_FPGA_EMULATION) + ns = (cclocks * FPGA_CCLOCK_PS) / 1000; + else /* simulation pretends to be ASIC */ + ns = (cclocks * ASIC_CCLOCK_PS) / 1000; + if (cclocks && !ns) + ns = 1; + return ns; +} + +/* + * Dynamically adjust the receive interrupt timeout for a context based on + * incoming packet rate. + * + * NOTE: Dynamic adjustment does not allow rcv_intr_count to be zero. + */ +static void adjust_rcv_timeout(struct hfi1_ctxtdata *rcd, u32 npkts) +{ + struct hfi1_devdata *dd = rcd->dd; + u32 timeout = rcd->rcvavail_timeout; + + /* + * This algorithm doubles or halves the timeout depending on whether + * the number of packets received in this interrupt were less than or + * greater equal the interrupt count. + * + * The calculations below do not allow a steady state to be achieved. + * Only at the endpoints it is possible to have an unchanging + * timeout. + */ + if (npkts < rcv_intr_count) { + /* + * Not enough packets arrived before the timeout, adjust + * timeout downward. + */ + if (timeout < 2) /* already at minimum? */ + return; + timeout >>= 1; + } else { + /* + * More than enough packets arrived before the timeout, adjust + * timeout upward. + */ + if (timeout >= dd->rcv_intr_timeout_csr) /* already at max? */ + return; + timeout = min(timeout << 1, dd->rcv_intr_timeout_csr); + } + + rcd->rcvavail_timeout = timeout; + /* timeout cannot be larger than rcv_intr_timeout_csr which has already + been verified to be in range */ + write_kctxt_csr(dd, rcd->ctxt, RCV_AVAIL_TIME_OUT, + (u64)timeout << RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT); +} + +void update_usrhead(struct hfi1_ctxtdata *rcd, u32 hd, u32 updegr, u32 egrhd, + u32 intr_adjust, u32 npkts) +{ + struct hfi1_devdata *dd = rcd->dd; + u64 reg; + u32 ctxt = rcd->ctxt; + + /* + * Need to write timeout register before updating RcvHdrHead to ensure + * that a new value is used when the HW decides to restart counting. + */ + if (intr_adjust) + adjust_rcv_timeout(rcd, npkts); + if (updegr) { + reg = (egrhd & RCV_EGR_INDEX_HEAD_HEAD_MASK) + << RCV_EGR_INDEX_HEAD_HEAD_SHIFT; + write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, reg); + } + mmiowb(); + reg = ((u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT) | + (((u64)hd & RCV_HDR_HEAD_HEAD_MASK) + << RCV_HDR_HEAD_HEAD_SHIFT); + write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg); + mmiowb(); +} + +u32 hdrqempty(struct hfi1_ctxtdata *rcd) +{ + u32 head, tail; + + head = (read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_HEAD) + & RCV_HDR_HEAD_HEAD_SMASK) >> RCV_HDR_HEAD_HEAD_SHIFT; + + if (rcd->rcvhdrtail_kvaddr) + tail = get_rcvhdrtail(rcd); + else + tail = read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL); + + return head == tail; +} + +/* + * Context Control and Receive Array encoding for buffer size: + * 0x0 invalid + * 0x1 4 KB + * 0x2 8 KB + * 0x3 16 KB + * 0x4 32 KB + * 0x5 64 KB + * 0x6 128 KB + * 0x7 256 KB + * 0x8 512 KB (Receive Array only) + * 0x9 1 MB (Receive Array only) + * 0xa 2 MB (Receive Array only) + * + * 0xB-0xF - reserved (Receive Array only) + * + * + * This routine assumes that the value has already been sanity checked. + */ +static u32 encoded_size(u32 size) +{ + switch (size) { + case 4*1024: return 0x1; + case 8*1024: return 0x2; + case 16*1024: return 0x3; + case 32*1024: return 0x4; + case 64*1024: return 0x5; + case 128*1024: return 0x6; + case 256*1024: return 0x7; + case 512*1024: return 0x8; + case 1*1024*1024: return 0x9; + case 2*1024*1024: return 0xa; + } + return 0x1; /* if invalid, go with the minimum size */ +} + +void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt) +{ + struct hfi1_ctxtdata *rcd; + u64 rcvctrl, reg; + int did_enable = 0; + + rcd = dd->rcd[ctxt]; + if (!rcd) + return; + + hfi1_cdbg(RCVCTRL, "ctxt %d op 0x%x", ctxt, op); + + rcvctrl = read_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL); + /* if the context already enabled, don't do the extra steps */ + if ((op & HFI1_RCVCTRL_CTXT_ENB) + && !(rcvctrl & RCV_CTXT_CTRL_ENABLE_SMASK)) { + /* reset the tail and hdr addresses, and sequence count */ + write_kctxt_csr(dd, ctxt, RCV_HDR_ADDR, + rcd->rcvhdrq_phys); + if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) + write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR, + rcd->rcvhdrqtailaddr_phys); + rcd->seq_cnt = 1; + + /* reset the cached receive header queue head value */ + rcd->head = 0; + + /* + * Zero the receive header queue so we don't get false + * positives when checking the sequence number. The + * sequence numbers could land exactly on the same spot. + * E.g. a rcd restart before the receive header wrapped. + */ + memset(rcd->rcvhdrq, 0, rcd->rcvhdrq_size); + + /* starting timeout */ + rcd->rcvavail_timeout = dd->rcv_intr_timeout_csr; + + /* enable the context */ + rcvctrl |= RCV_CTXT_CTRL_ENABLE_SMASK; + + /* clean the egr buffer size first */ + rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK; + rcvctrl |= ((u64)encoded_size(rcd->egrbufs.rcvtid_size) + & RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK) + << RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT; + + /* zero RcvHdrHead - set RcvHdrHead.Counter after enable */ + write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0); + did_enable = 1; + + /* zero RcvEgrIndexHead */ + write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, 0); + + /* set eager count and base index */ + reg = (((u64)(rcd->egrbufs.alloced >> RCV_SHIFT) + & RCV_EGR_CTRL_EGR_CNT_MASK) + << RCV_EGR_CTRL_EGR_CNT_SHIFT) | + (((rcd->eager_base >> RCV_SHIFT) + & RCV_EGR_CTRL_EGR_BASE_INDEX_MASK) + << RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT); + write_kctxt_csr(dd, ctxt, RCV_EGR_CTRL, reg); + + /* + * Set TID (expected) count and base index. + * rcd->expected_count is set to individual RcvArray entries, + * not pairs, and the CSR takes a pair-count in groups of + * four, so divide by 8. + */ + reg = (((rcd->expected_count >> RCV_SHIFT) + & RCV_TID_CTRL_TID_PAIR_CNT_MASK) + << RCV_TID_CTRL_TID_PAIR_CNT_SHIFT) | + (((rcd->expected_base >> RCV_SHIFT) + & RCV_TID_CTRL_TID_BASE_INDEX_MASK) + << RCV_TID_CTRL_TID_BASE_INDEX_SHIFT); + write_kctxt_csr(dd, ctxt, RCV_TID_CTRL, reg); + if (ctxt == VL15CTXT) + write_csr(dd, RCV_VL15, VL15CTXT); + } + if (op & HFI1_RCVCTRL_CTXT_DIS) { + write_csr(dd, RCV_VL15, 0); + rcvctrl &= ~RCV_CTXT_CTRL_ENABLE_SMASK; + } + if (op & HFI1_RCVCTRL_INTRAVAIL_ENB) + rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK; + if (op & HFI1_RCVCTRL_INTRAVAIL_DIS) + rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK; + if (op & HFI1_RCVCTRL_TAILUPD_ENB && rcd->rcvhdrqtailaddr_phys) + rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK; + if (op & HFI1_RCVCTRL_TAILUPD_DIS) + rcvctrl &= ~RCV_CTXT_CTRL_TAIL_UPD_SMASK; + if (op & HFI1_RCVCTRL_TIDFLOW_ENB) + rcvctrl |= RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK; + if (op & HFI1_RCVCTRL_TIDFLOW_DIS) + rcvctrl &= ~RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK; + if (op & HFI1_RCVCTRL_ONE_PKT_EGR_ENB) { + /* In one-packet-per-eager mode, the size comes from + the RcvArray entry. */ + rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK; + rcvctrl |= RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK; + } + if (op & HFI1_RCVCTRL_ONE_PKT_EGR_DIS) + rcvctrl &= ~RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK; + if (op & HFI1_RCVCTRL_NO_RHQ_DROP_ENB) + rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK; + if (op & HFI1_RCVCTRL_NO_RHQ_DROP_DIS) + rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK; + if (op & HFI1_RCVCTRL_NO_EGR_DROP_ENB) + rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK; + if (op & HFI1_RCVCTRL_NO_EGR_DROP_DIS) + rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK; + rcd->rcvctrl = rcvctrl; + hfi1_cdbg(RCVCTRL, "ctxt %d rcvctrl 0x%llx\n", ctxt, rcvctrl); + write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcd->rcvctrl); + + /* work around sticky RcvCtxtStatus.BlockedRHQFull */ + if (did_enable + && (rcvctrl & RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK)) { + reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS); + if (reg != 0) { + dd_dev_info(dd, "ctxt %d status %lld (blocked)\n", + ctxt, reg); + read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD); + write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x10); + write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x00); + read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD); + reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS); + dd_dev_info(dd, "ctxt %d status %lld (%s blocked)\n", + ctxt, reg, reg == 0 ? "not" : "still"); + } + } + + if (did_enable) { + /* + * The interrupt timeout and count must be set after + * the context is enabled to take effect. + */ + /* set interrupt timeout */ + write_kctxt_csr(dd, ctxt, RCV_AVAIL_TIME_OUT, + (u64)rcd->rcvavail_timeout << + RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT); + + /* set RcvHdrHead.Counter, zero RcvHdrHead.Head (again) */ + reg = (u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT; + write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg); + } + + if (op & (HFI1_RCVCTRL_TAILUPD_DIS | HFI1_RCVCTRL_CTXT_DIS)) + /* + * If the context has been disabled and the Tail Update has + * been cleared, clear the RCV_HDR_TAIL_ADDR CSR so + * it doesn't contain an address that is invalid. + */ + write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR, 0); +} + +u32 hfi1_read_cntrs(struct hfi1_devdata *dd, loff_t pos, char **namep, + u64 **cntrp) +{ + int ret; + u64 val = 0; + + if (namep) { + ret = dd->cntrnameslen; + if (pos != 0) { + dd_dev_err(dd, "read_cntrs does not support indexing"); + return 0; + } + *namep = dd->cntrnames; + } else { + const struct cntr_entry *entry; + int i, j; + + ret = (dd->ndevcntrs) * sizeof(u64); + if (pos != 0) { + dd_dev_err(dd, "read_cntrs does not support indexing"); + return 0; + } + + /* Get the start of the block of counters */ + *cntrp = dd->cntrs; + + /* + * Now go and fill in each counter in the block. + */ + for (i = 0; i < DEV_CNTR_LAST; i++) { + entry = &dev_cntrs[i]; + hfi1_cdbg(CNTR, "reading %s", entry->name); + if (entry->flags & CNTR_DISABLED) { + /* Nothing */ + hfi1_cdbg(CNTR, "\tDisabled\n"); + } else { + if (entry->flags & CNTR_VL) { + hfi1_cdbg(CNTR, "\tPer VL\n"); + for (j = 0; j < C_VL_COUNT; j++) { + val = entry->rw_cntr(entry, + dd, j, + CNTR_MODE_R, + 0); + hfi1_cdbg( + CNTR, + "\t\tRead 0x%llx for %d\n", + val, j); + dd->cntrs[entry->offset + j] = + val; + } + } else { + val = entry->rw_cntr(entry, dd, + CNTR_INVALID_VL, + CNTR_MODE_R, 0); + dd->cntrs[entry->offset] = val; + hfi1_cdbg(CNTR, "\tRead 0x%llx", val); + } + } + } + } + return ret; +} + +/* + * Used by sysfs to create files for hfi stats to read + */ +u32 hfi1_read_portcntrs(struct hfi1_devdata *dd, loff_t pos, u32 port, + char **namep, u64 **cntrp) +{ + int ret; + u64 val = 0; + + if (namep) { + ret = dd->portcntrnameslen; + if (pos != 0) { + dd_dev_err(dd, "index not supported"); + return 0; + } + *namep = dd->portcntrnames; + } else { + const struct cntr_entry *entry; + struct hfi1_pportdata *ppd; + int i, j; + + ret = (dd->nportcntrs) * sizeof(u64); + if (pos != 0) { + dd_dev_err(dd, "indexing not supported"); + return 0; + } + ppd = (struct hfi1_pportdata *)(dd + 1 + port); + *cntrp = ppd->cntrs; + + for (i = 0; i < PORT_CNTR_LAST; i++) { + entry = &port_cntrs[i]; + hfi1_cdbg(CNTR, "reading %s", entry->name); + if (entry->flags & CNTR_DISABLED) { + /* Nothing */ + hfi1_cdbg(CNTR, "\tDisabled\n"); + continue; + } + + if (entry->flags & CNTR_VL) { + hfi1_cdbg(CNTR, "\tPer VL"); + for (j = 0; j < C_VL_COUNT; j++) { + val = entry->rw_cntr(entry, ppd, j, + CNTR_MODE_R, + 0); + hfi1_cdbg( + CNTR, + "\t\tRead 0x%llx for %d", + val, j); + ppd->cntrs[entry->offset + j] = val; + } + } else { + val = entry->rw_cntr(entry, ppd, + CNTR_INVALID_VL, + CNTR_MODE_R, + 0); + ppd->cntrs[entry->offset] = val; + hfi1_cdbg(CNTR, "\tRead 0x%llx", val); + } + } + } + return ret; +} + +static void free_cntrs(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + int i; + + if (dd->synth_stats_timer.data) + del_timer_sync(&dd->synth_stats_timer); + dd->synth_stats_timer.data = 0; + ppd = (struct hfi1_pportdata *)(dd + 1); + for (i = 0; i < dd->num_pports; i++, ppd++) { + kfree(ppd->cntrs); + kfree(ppd->scntrs); + free_percpu(ppd->ibport_data.rc_acks); + free_percpu(ppd->ibport_data.rc_qacks); + free_percpu(ppd->ibport_data.rc_delayed_comp); + ppd->cntrs = NULL; + ppd->scntrs = NULL; + ppd->ibport_data.rc_acks = NULL; + ppd->ibport_data.rc_qacks = NULL; + ppd->ibport_data.rc_delayed_comp = NULL; + } + kfree(dd->portcntrnames); + dd->portcntrnames = NULL; + kfree(dd->cntrs); + dd->cntrs = NULL; + kfree(dd->scntrs); + dd->scntrs = NULL; + kfree(dd->cntrnames); + dd->cntrnames = NULL; +} + +#define CNTR_MAX 0xFFFFFFFFFFFFFFFFULL +#define CNTR_32BIT_MAX 0x00000000FFFFFFFF + +static u64 read_dev_port_cntr(struct hfi1_devdata *dd, struct cntr_entry *entry, + u64 *psval, void *context, int vl) +{ + u64 val; + u64 sval = *psval; + + if (entry->flags & CNTR_DISABLED) { + dd_dev_err(dd, "Counter %s not enabled", entry->name); + return 0; + } + + hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval); + + val = entry->rw_cntr(entry, context, vl, CNTR_MODE_R, 0); + + /* If its a synthetic counter there is more work we need to do */ + if (entry->flags & CNTR_SYNTH) { + if (sval == CNTR_MAX) { + /* No need to read already saturated */ + return CNTR_MAX; + } + + if (entry->flags & CNTR_32BIT) { + /* 32bit counters can wrap multiple times */ + u64 upper = sval >> 32; + u64 lower = (sval << 32) >> 32; + + if (lower > val) { /* hw wrapped */ + if (upper == CNTR_32BIT_MAX) + val = CNTR_MAX; + else + upper++; + } + + if (val != CNTR_MAX) + val = (upper << 32) | val; + + } else { + /* If we rolled we are saturated */ + if ((val < sval) || (val > CNTR_MAX)) + val = CNTR_MAX; + } + } + + *psval = val; + + hfi1_cdbg(CNTR, "\tNew val=0x%llx", val); + + return val; +} + +static u64 write_dev_port_cntr(struct hfi1_devdata *dd, + struct cntr_entry *entry, + u64 *psval, void *context, int vl, u64 data) +{ + u64 val; + + if (entry->flags & CNTR_DISABLED) { + dd_dev_err(dd, "Counter %s not enabled", entry->name); + return 0; + } + + hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval); + + if (entry->flags & CNTR_SYNTH) { + *psval = data; + if (entry->flags & CNTR_32BIT) { + val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W, + (data << 32) >> 32); + val = data; /* return the full 64bit value */ + } else { + val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W, + data); + } + } else { + val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W, data); + } + + *psval = val; + + hfi1_cdbg(CNTR, "\tNew val=0x%llx", val); + + return val; +} + +u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl) +{ + struct cntr_entry *entry; + u64 *sval; + + entry = &dev_cntrs[index]; + sval = dd->scntrs + entry->offset; + + if (vl != CNTR_INVALID_VL) + sval += vl; + + return read_dev_port_cntr(dd, entry, sval, dd, vl); +} + +u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data) +{ + struct cntr_entry *entry; + u64 *sval; + + entry = &dev_cntrs[index]; + sval = dd->scntrs + entry->offset; + + if (vl != CNTR_INVALID_VL) + sval += vl; + + return write_dev_port_cntr(dd, entry, sval, dd, vl, data); +} + +u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl) +{ + struct cntr_entry *entry; + u64 *sval; + + entry = &port_cntrs[index]; + sval = ppd->scntrs + entry->offset; + + if (vl != CNTR_INVALID_VL) + sval += vl; + + if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) && + (index <= C_RCV_HDR_OVF_LAST)) { + /* We do not want to bother for disabled contexts */ + return 0; + } + + return read_dev_port_cntr(ppd->dd, entry, sval, ppd, vl); +} + +u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data) +{ + struct cntr_entry *entry; + u64 *sval; + + entry = &port_cntrs[index]; + sval = ppd->scntrs + entry->offset; + + if (vl != CNTR_INVALID_VL) + sval += vl; + + if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) && + (index <= C_RCV_HDR_OVF_LAST)) { + /* We do not want to bother for disabled contexts */ + return 0; + } + + return write_dev_port_cntr(ppd->dd, entry, sval, ppd, vl, data); +} + +static void update_synth_timer(unsigned long opaque) +{ + u64 cur_tx; + u64 cur_rx; + u64 total_flits; + u8 update = 0; + int i, j, vl; + struct hfi1_pportdata *ppd; + struct cntr_entry *entry; + + struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque; + + /* + * Rather than keep beating on the CSRs pick a minimal set that we can + * check to watch for potential roll over. We can do this by looking at + * the number of flits sent/recv. If the total flits exceeds 32bits then + * we have to iterate all the counters and update. + */ + entry = &dev_cntrs[C_DC_RCV_FLITS]; + cur_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0); + + entry = &dev_cntrs[C_DC_XMIT_FLITS]; + cur_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0); + + hfi1_cdbg( + CNTR, + "[%d] curr tx=0x%llx rx=0x%llx :: last tx=0x%llx rx=0x%llx\n", + dd->unit, cur_tx, cur_rx, dd->last_tx, dd->last_rx); + + if ((cur_tx < dd->last_tx) || (cur_rx < dd->last_rx)) { + /* + * May not be strictly necessary to update but it won't hurt and + * simplifies the logic here. + */ + update = 1; + hfi1_cdbg(CNTR, "[%d] Tripwire counter rolled, updating", + dd->unit); + } else { + total_flits = (cur_tx - dd->last_tx) + (cur_rx - dd->last_rx); + hfi1_cdbg(CNTR, + "[%d] total flits 0x%llx limit 0x%llx\n", dd->unit, + total_flits, (u64)CNTR_32BIT_MAX); + if (total_flits >= CNTR_32BIT_MAX) { + hfi1_cdbg(CNTR, "[%d] 32bit limit hit, updating", + dd->unit); + update = 1; + } + } + + if (update) { + hfi1_cdbg(CNTR, "[%d] Updating dd and ppd counters", dd->unit); + for (i = 0; i < DEV_CNTR_LAST; i++) { + entry = &dev_cntrs[i]; + if (entry->flags & CNTR_VL) { + for (vl = 0; vl < C_VL_COUNT; vl++) + read_dev_cntr(dd, i, vl); + } else { + read_dev_cntr(dd, i, CNTR_INVALID_VL); + } + } + ppd = (struct hfi1_pportdata *)(dd + 1); + for (i = 0; i < dd->num_pports; i++, ppd++) { + for (j = 0; j < PORT_CNTR_LAST; j++) { + entry = &port_cntrs[j]; + if (entry->flags & CNTR_VL) { + for (vl = 0; vl < C_VL_COUNT; vl++) + read_port_cntr(ppd, j, vl); + } else { + read_port_cntr(ppd, j, CNTR_INVALID_VL); + } + } + } + + /* + * We want the value in the register. The goal is to keep track + * of the number of "ticks" not the counter value. In other + * words if the register rolls we want to notice it and go ahead + * and force an update. + */ + entry = &dev_cntrs[C_DC_XMIT_FLITS]; + dd->last_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, + CNTR_MODE_R, 0); + + entry = &dev_cntrs[C_DC_RCV_FLITS]; + dd->last_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, + CNTR_MODE_R, 0); + + hfi1_cdbg(CNTR, "[%d] setting last tx/rx to 0x%llx 0x%llx", + dd->unit, dd->last_tx, dd->last_rx); + + } else { + hfi1_cdbg(CNTR, "[%d] No update necessary", dd->unit); + } + +mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME); +} + +#define C_MAX_NAME 13 /* 12 chars + one for /0 */ +static int init_cntrs(struct hfi1_devdata *dd) +{ + int i, rcv_ctxts, index, j; + size_t sz; + char *p; + char name[C_MAX_NAME]; + struct hfi1_pportdata *ppd; + + /* set up the stats timer; the add_timer is done at the end */ + init_timer(&dd->synth_stats_timer); + dd->synth_stats_timer.function = update_synth_timer; + dd->synth_stats_timer.data = (unsigned long) dd; + + /***********************/ + /* per device counters */ + /***********************/ + + /* size names and determine how many we have*/ + dd->ndevcntrs = 0; + sz = 0; + index = 0; + + for (i = 0; i < DEV_CNTR_LAST; i++) { + hfi1_dbg_early("Init cntr %s\n", dev_cntrs[i].name); + if (dev_cntrs[i].flags & CNTR_DISABLED) { + hfi1_dbg_early("\tSkipping %s\n", dev_cntrs[i].name); + continue; + } + + if (dev_cntrs[i].flags & CNTR_VL) { + hfi1_dbg_early("\tProcessing VL cntr\n"); + dev_cntrs[i].offset = index; + for (j = 0; j < C_VL_COUNT; j++) { + memset(name, '\0', C_MAX_NAME); + snprintf(name, C_MAX_NAME, "%s%d", + dev_cntrs[i].name, + vl_from_idx(j)); + sz += strlen(name); + sz++; + hfi1_dbg_early("\t\t%s\n", name); + dd->ndevcntrs++; + index++; + } + } else { + /* +1 for newline */ + sz += strlen(dev_cntrs[i].name) + 1; + dd->ndevcntrs++; + dev_cntrs[i].offset = index; + index++; + hfi1_dbg_early("\tAdding %s\n", dev_cntrs[i].name); + } + } + + /* allocate space for the counter values */ + dd->cntrs = kcalloc(index, sizeof(u64), GFP_KERNEL); + if (!dd->cntrs) + goto bail; + + dd->scntrs = kcalloc(index, sizeof(u64), GFP_KERNEL); + if (!dd->scntrs) + goto bail; + + + /* allocate space for the counter names */ + dd->cntrnameslen = sz; + dd->cntrnames = kmalloc(sz, GFP_KERNEL); + if (!dd->cntrnames) + goto bail; + + /* fill in the names */ + for (p = dd->cntrnames, i = 0, index = 0; i < DEV_CNTR_LAST; i++) { + if (dev_cntrs[i].flags & CNTR_DISABLED) { + /* Nothing */ + } else { + if (dev_cntrs[i].flags & CNTR_VL) { + for (j = 0; j < C_VL_COUNT; j++) { + memset(name, '\0', C_MAX_NAME); + snprintf(name, C_MAX_NAME, "%s%d", + dev_cntrs[i].name, + vl_from_idx(j)); + memcpy(p, name, strlen(name)); + p += strlen(name); + *p++ = '\n'; + } + } else { + memcpy(p, dev_cntrs[i].name, + strlen(dev_cntrs[i].name)); + p += strlen(dev_cntrs[i].name); + *p++ = '\n'; + } + index++; + } + } + + /*********************/ + /* per port counters */ + /*********************/ + + /* + * Go through the counters for the overflows and disable the ones we + * don't need. This varies based on platform so we need to do it + * dynamically here. + */ + rcv_ctxts = dd->num_rcv_contexts; + for (i = C_RCV_HDR_OVF_FIRST + rcv_ctxts; + i <= C_RCV_HDR_OVF_LAST; i++) { + port_cntrs[i].flags |= CNTR_DISABLED; + } + + /* size port counter names and determine how many we have*/ + sz = 0; + dd->nportcntrs = 0; + for (i = 0; i < PORT_CNTR_LAST; i++) { + hfi1_dbg_early("Init pcntr %s\n", port_cntrs[i].name); + if (port_cntrs[i].flags & CNTR_DISABLED) { + hfi1_dbg_early("\tSkipping %s\n", port_cntrs[i].name); + continue; + } + + if (port_cntrs[i].flags & CNTR_VL) { + hfi1_dbg_early("\tProcessing VL cntr\n"); + port_cntrs[i].offset = dd->nportcntrs; + for (j = 0; j < C_VL_COUNT; j++) { + memset(name, '\0', C_MAX_NAME); + snprintf(name, C_MAX_NAME, "%s%d", + port_cntrs[i].name, + vl_from_idx(j)); + sz += strlen(name); + sz++; + hfi1_dbg_early("\t\t%s\n", name); + dd->nportcntrs++; + } + } else { + /* +1 for newline */ + sz += strlen(port_cntrs[i].name) + 1; + port_cntrs[i].offset = dd->nportcntrs; + dd->nportcntrs++; + hfi1_dbg_early("\tAdding %s\n", port_cntrs[i].name); + } + } + + /* allocate space for the counter names */ + dd->portcntrnameslen = sz; + dd->portcntrnames = kmalloc(sz, GFP_KERNEL); + if (!dd->portcntrnames) + goto bail; + + /* fill in port cntr names */ + for (p = dd->portcntrnames, i = 0; i < PORT_CNTR_LAST; i++) { + if (port_cntrs[i].flags & CNTR_DISABLED) + continue; + + if (port_cntrs[i].flags & CNTR_VL) { + for (j = 0; j < C_VL_COUNT; j++) { + memset(name, '\0', C_MAX_NAME); + snprintf(name, C_MAX_NAME, "%s%d", + port_cntrs[i].name, + vl_from_idx(j)); + memcpy(p, name, strlen(name)); + p += strlen(name); + *p++ = '\n'; + } + } else { + memcpy(p, port_cntrs[i].name, + strlen(port_cntrs[i].name)); + p += strlen(port_cntrs[i].name); + *p++ = '\n'; + } + } + + /* allocate per port storage for counter values */ + ppd = (struct hfi1_pportdata *)(dd + 1); + for (i = 0; i < dd->num_pports; i++, ppd++) { + ppd->cntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL); + if (!ppd->cntrs) + goto bail; + + ppd->scntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL); + if (!ppd->scntrs) + goto bail; + } + + /* CPU counters need to be allocated and zeroed */ + if (init_cpu_counters(dd)) + goto bail; + + mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME); + return 0; +bail: + free_cntrs(dd); + return -ENOMEM; +} + + +static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate) +{ + switch (chip_lstate) { + default: + dd_dev_err(dd, + "Unknown logical state 0x%x, reporting IB_PORT_DOWN\n", + chip_lstate); + /* fall through */ + case LSTATE_DOWN: + return IB_PORT_DOWN; + case LSTATE_INIT: + return IB_PORT_INIT; + case LSTATE_ARMED: + return IB_PORT_ARMED; + case LSTATE_ACTIVE: + return IB_PORT_ACTIVE; + } +} + +u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate) +{ + /* look at the HFI meta-states only */ + switch (chip_pstate & 0xf0) { + default: + dd_dev_err(dd, "Unexpected chip physical state of 0x%x\n", + chip_pstate); + /* fall through */ + case PLS_DISABLED: + return IB_PORTPHYSSTATE_DISABLED; + case PLS_OFFLINE: + return OPA_PORTPHYSSTATE_OFFLINE; + case PLS_POLLING: + return IB_PORTPHYSSTATE_POLLING; + case PLS_CONFIGPHY: + return IB_PORTPHYSSTATE_TRAINING; + case PLS_LINKUP: + return IB_PORTPHYSSTATE_LINKUP; + case PLS_PHYTEST: + return IB_PORTPHYSSTATE_PHY_TEST; + } +} + +/* return the OPA port logical state name */ +const char *opa_lstate_name(u32 lstate) +{ + static const char * const port_logical_names[] = { + "PORT_NOP", + "PORT_DOWN", + "PORT_INIT", + "PORT_ARMED", + "PORT_ACTIVE", + "PORT_ACTIVE_DEFER", + }; + if (lstate < ARRAY_SIZE(port_logical_names)) + return port_logical_names[lstate]; + return "unknown"; +} + +/* return the OPA port physical state name */ +const char *opa_pstate_name(u32 pstate) +{ + static const char * const port_physical_names[] = { + "PHYS_NOP", + "reserved1", + "PHYS_POLL", + "PHYS_DISABLED", + "PHYS_TRAINING", + "PHYS_LINKUP", + "PHYS_LINK_ERR_RECOVER", + "PHYS_PHY_TEST", + "reserved8", + "PHYS_OFFLINE", + "PHYS_GANGED", + "PHYS_TEST", + }; + if (pstate < ARRAY_SIZE(port_physical_names)) + return port_physical_names[pstate]; + return "unknown"; +} + +/* + * Read the hardware link state and set the driver's cached value of it. + * Return the (new) current value. + */ +u32 get_logical_state(struct hfi1_pportdata *ppd) +{ + u32 new_state; + + new_state = chip_to_opa_lstate(ppd->dd, read_logical_state(ppd->dd)); + if (new_state != ppd->lstate) { + dd_dev_info(ppd->dd, "logical state changed to %s (0x%x)\n", + opa_lstate_name(new_state), new_state); + ppd->lstate = new_state; + } + /* + * Set port status flags in the page mapped into userspace + * memory. Do it here to ensure a reliable state - this is + * the only function called by all state handling code. + * Always set the flags due to the fact that the cache value + * might have been changed explicitly outside of this + * function. + */ + if (ppd->statusp) { + switch (ppd->lstate) { + case IB_PORT_DOWN: + case IB_PORT_INIT: + *ppd->statusp &= ~(HFI1_STATUS_IB_CONF | + HFI1_STATUS_IB_READY); + break; + case IB_PORT_ARMED: + *ppd->statusp |= HFI1_STATUS_IB_CONF; + break; + case IB_PORT_ACTIVE: + *ppd->statusp |= HFI1_STATUS_IB_READY; + break; + } + } + return ppd->lstate; +} + +/** + * wait_logical_linkstate - wait for an IB link state change to occur + * @ppd: port device + * @state: the state to wait for + * @msecs: the number of milliseconds to wait + * + * Wait up to msecs milliseconds for IB link state change to occur. + * For now, take the easy polling route. + * Returns 0 if state reached, otherwise -ETIMEDOUT. + */ +static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state, + int msecs) +{ + unsigned long timeout; + + timeout = jiffies + msecs_to_jiffies(msecs); + while (1) { + if (get_logical_state(ppd) == state) + return 0; + if (time_after(jiffies, timeout)) + break; + msleep(20); + } + dd_dev_err(ppd->dd, "timeout waiting for link state 0x%x\n", state); + + return -ETIMEDOUT; +} + +u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd) +{ + static u32 remembered_state = 0xff; + u32 pstate; + u32 ib_pstate; + + pstate = read_physical_state(ppd->dd); + ib_pstate = chip_to_opa_pstate(ppd->dd, pstate); + if (remembered_state != ib_pstate) { + dd_dev_info(ppd->dd, + "%s: physical state changed to %s (0x%x), phy 0x%x\n", + __func__, opa_pstate_name(ib_pstate), ib_pstate, + pstate); + remembered_state = ib_pstate; + } + return ib_pstate; +} + +/* + * Read/modify/write ASIC_QSFP register bits as selected by mask + * data: 0 or 1 in the positions depending on what needs to be written + * dir: 0 for read, 1 for write + * mask: select by setting + * I2CCLK (bit 0) + * I2CDATA (bit 1) + */ +u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir, + u32 mask) +{ + u64 qsfp_oe, target_oe; + + target_oe = target ? ASIC_QSFP2_OE : ASIC_QSFP1_OE; + if (mask) { + /* We are writing register bits, so lock access */ + dir &= mask; + data &= mask; + + qsfp_oe = read_csr(dd, target_oe); + qsfp_oe = (qsfp_oe & ~(u64)mask) | (u64)dir; + write_csr(dd, target_oe, qsfp_oe); + } + /* We are exclusively reading bits here, but it is unlikely + * we'll get valid data when we set the direction of the pin + * in the same call, so read should call this function again + * to get valid data + */ + return read_csr(dd, target ? ASIC_QSFP2_IN : ASIC_QSFP1_IN); +} + +#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \ +(r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK) + +#define SET_STATIC_RATE_CONTROL_SMASK(r) \ +(r |= SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK) + +int hfi1_init_ctxt(struct send_context *sc) +{ + if (sc != NULL) { + struct hfi1_devdata *dd = sc->dd; + u64 reg; + u8 set = (sc->type == SC_USER ? + HFI1_CAP_IS_USET(STATIC_RATE_CTRL) : + HFI1_CAP_IS_KSET(STATIC_RATE_CTRL)); + reg = read_kctxt_csr(dd, sc->hw_context, + SEND_CTXT_CHECK_ENABLE); + if (set) + CLEAR_STATIC_RATE_CONTROL_SMASK(reg); + else + SET_STATIC_RATE_CONTROL_SMASK(reg); + write_kctxt_csr(dd, sc->hw_context, + SEND_CTXT_CHECK_ENABLE, reg); + } + return 0; +} + +int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp) +{ + int ret = 0; + u64 reg; + + if (dd->icode != ICODE_RTL_SILICON) { + if (HFI1_CAP_IS_KSET(PRINT_UNIMPL)) + dd_dev_info(dd, "%s: tempsense not supported by HW\n", + __func__); + return -EINVAL; + } + reg = read_csr(dd, ASIC_STS_THERM); + temp->curr = ((reg >> ASIC_STS_THERM_CURR_TEMP_SHIFT) & + ASIC_STS_THERM_CURR_TEMP_MASK); + temp->lo_lim = ((reg >> ASIC_STS_THERM_LO_TEMP_SHIFT) & + ASIC_STS_THERM_LO_TEMP_MASK); + temp->hi_lim = ((reg >> ASIC_STS_THERM_HI_TEMP_SHIFT) & + ASIC_STS_THERM_HI_TEMP_MASK); + temp->crit_lim = ((reg >> ASIC_STS_THERM_CRIT_TEMP_SHIFT) & + ASIC_STS_THERM_CRIT_TEMP_MASK); + /* triggers is a 3-bit value - 1 bit per trigger. */ + temp->triggers = (u8)((reg >> ASIC_STS_THERM_LOW_SHIFT) & 0x7); + + return ret; +} + +/* ========================================================================= */ + +/* + * Enable/disable chip from delivering interrupts. + */ +void set_intr_state(struct hfi1_devdata *dd, u32 enable) +{ + int i; + + /* + * In HFI, the mask needs to be 1 to allow interrupts. + */ + if (enable) { + u64 cce_int_mask; + const int qsfp1_int_smask = QSFP1_INT % 64; + const int qsfp2_int_smask = QSFP2_INT % 64; + + /* enable all interrupts */ + for (i = 0; i < CCE_NUM_INT_CSRS; i++) + write_csr(dd, CCE_INT_MASK + (8*i), ~(u64)0); + + /* + * disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0 + * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR, + * therefore just one of QSFP1_INT/QSFP2_INT can be used to find + * the index of the appropriate CSR in the CCEIntMask CSR array + */ + cce_int_mask = read_csr(dd, CCE_INT_MASK + + (8*(QSFP1_INT/64))); + if (dd->hfi1_id) { + cce_int_mask &= ~((u64)1 << qsfp1_int_smask); + write_csr(dd, CCE_INT_MASK + (8*(QSFP1_INT/64)), + cce_int_mask); + } else { + cce_int_mask &= ~((u64)1 << qsfp2_int_smask); + write_csr(dd, CCE_INT_MASK + (8*(QSFP2_INT/64)), + cce_int_mask); + } + } else { + for (i = 0; i < CCE_NUM_INT_CSRS; i++) + write_csr(dd, CCE_INT_MASK + (8*i), 0ull); + } +} + +/* + * Clear all interrupt sources on the chip. + */ +static void clear_all_interrupts(struct hfi1_devdata *dd) +{ + int i; + + for (i = 0; i < CCE_NUM_INT_CSRS; i++) + write_csr(dd, CCE_INT_CLEAR + (8*i), ~(u64)0); + + write_csr(dd, CCE_ERR_CLEAR, ~(u64)0); + write_csr(dd, MISC_ERR_CLEAR, ~(u64)0); + write_csr(dd, RCV_ERR_CLEAR, ~(u64)0); + write_csr(dd, SEND_ERR_CLEAR, ~(u64)0); + write_csr(dd, SEND_PIO_ERR_CLEAR, ~(u64)0); + write_csr(dd, SEND_DMA_ERR_CLEAR, ~(u64)0); + write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~(u64)0); + for (i = 0; i < dd->chip_send_contexts; i++) + write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~(u64)0); + for (i = 0; i < dd->chip_sdma_engines; i++) + write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~(u64)0); + + write_csr(dd, DCC_ERR_FLG_CLR, ~(u64)0); + write_csr(dd, DC_LCB_ERR_CLR, ~(u64)0); + write_csr(dd, DC_DC8051_ERR_CLR, ~(u64)0); +} + +/* Move to pcie.c? */ +static void disable_intx(struct pci_dev *pdev) +{ + pci_intx(pdev, 0); +} + +static void clean_up_interrupts(struct hfi1_devdata *dd) +{ + int i; + + /* remove irqs - must happen before disabling/turning off */ + if (dd->num_msix_entries) { + /* MSI-X */ + struct hfi1_msix_entry *me = dd->msix_entries; + + for (i = 0; i < dd->num_msix_entries; i++, me++) { + if (me->arg == NULL) /* => no irq, no affinity */ + break; + irq_set_affinity_hint(dd->msix_entries[i].msix.vector, + NULL); + free_irq(me->msix.vector, me->arg); + } + } else { + /* INTx */ + if (dd->requested_intx_irq) { + free_irq(dd->pcidev->irq, dd); + dd->requested_intx_irq = 0; + } + } + + /* turn off interrupts */ + if (dd->num_msix_entries) { + /* MSI-X */ + hfi1_nomsix(dd); + } else { + /* INTx */ + disable_intx(dd->pcidev); + } + + /* clean structures */ + for (i = 0; i < dd->num_msix_entries; i++) + free_cpumask_var(dd->msix_entries[i].mask); + kfree(dd->msix_entries); + dd->msix_entries = NULL; + dd->num_msix_entries = 0; +} + +/* + * Remap the interrupt source from the general handler to the given MSI-X + * interrupt. + */ +static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr) +{ + u64 reg; + int m, n; + + /* clear from the handled mask of the general interrupt */ + m = isrc / 64; + n = isrc % 64; + dd->gi_mask[m] &= ~((u64)1 << n); + + /* direct the chip source to the given MSI-X interrupt */ + m = isrc / 8; + n = isrc % 8; + reg = read_csr(dd, CCE_INT_MAP + (8*m)); + reg &= ~((u64)0xff << (8*n)); + reg |= ((u64)msix_intr & 0xff) << (8*n); + write_csr(dd, CCE_INT_MAP + (8*m), reg); +} + +static void remap_sdma_interrupts(struct hfi1_devdata *dd, + int engine, int msix_intr) +{ + /* + * SDMA engine interrupt sources grouped by type, rather than + * engine. Per-engine interrupts are as follows: + * SDMA + * SDMAProgress + * SDMAIdle + */ + remap_intr(dd, IS_SDMA_START + 0*TXE_NUM_SDMA_ENGINES + engine, + msix_intr); + remap_intr(dd, IS_SDMA_START + 1*TXE_NUM_SDMA_ENGINES + engine, + msix_intr); + remap_intr(dd, IS_SDMA_START + 2*TXE_NUM_SDMA_ENGINES + engine, + msix_intr); +} + +static void remap_receive_available_interrupt(struct hfi1_devdata *dd, + int rx, int msix_intr) +{ + remap_intr(dd, IS_RCVAVAIL_START + rx, msix_intr); +} + +static int request_intx_irq(struct hfi1_devdata *dd) +{ + int ret; + + snprintf(dd->intx_name, sizeof(dd->intx_name), DRIVER_NAME"_%d", + dd->unit); + ret = request_irq(dd->pcidev->irq, general_interrupt, + IRQF_SHARED, dd->intx_name, dd); + if (ret) + dd_dev_err(dd, "unable to request INTx interrupt, err %d\n", + ret); + else + dd->requested_intx_irq = 1; + return ret; +} + +static int request_msix_irqs(struct hfi1_devdata *dd) +{ + const struct cpumask *local_mask; + cpumask_var_t def, rcv; + bool def_ret, rcv_ret; + int first_general, last_general; + int first_sdma, last_sdma; + int first_rx, last_rx; + int first_cpu, restart_cpu, curr_cpu; + int rcv_cpu, sdma_cpu; + int i, ret = 0, possible; + int ht; + + /* calculate the ranges we are going to use */ + first_general = 0; + first_sdma = last_general = first_general + 1; + first_rx = last_sdma = first_sdma + dd->num_sdma; + last_rx = first_rx + dd->n_krcv_queues; + + /* + * Interrupt affinity. + * + * non-rcv avail gets a default mask that + * starts as possible cpus with threads reset + * and each rcv avail reset. + * + * rcv avail gets node relative 1 wrapping back + * to the node relative 1 as necessary. + * + */ + local_mask = cpumask_of_pcibus(dd->pcidev->bus); + /* if first cpu is invalid, use NUMA 0 */ + if (cpumask_first(local_mask) >= nr_cpu_ids) + local_mask = topology_core_cpumask(0); + + def_ret = zalloc_cpumask_var(&def, GFP_KERNEL); + rcv_ret = zalloc_cpumask_var(&rcv, GFP_KERNEL); + if (!def_ret || !rcv_ret) + goto bail; + /* use local mask as default */ + cpumask_copy(def, local_mask); + possible = cpumask_weight(def); + /* disarm threads from default */ + ht = cpumask_weight( + topology_sibling_cpumask(cpumask_first(local_mask))); + for (i = possible/ht; i < possible; i++) + cpumask_clear_cpu(i, def); + /* reset possible */ + possible = cpumask_weight(def); + /* def now has full cores on chosen node*/ + first_cpu = cpumask_first(def); + if (nr_cpu_ids >= first_cpu) + first_cpu++; + restart_cpu = first_cpu; + curr_cpu = restart_cpu; + + for (i = first_cpu; i < dd->n_krcv_queues + first_cpu; i++) { + cpumask_clear_cpu(curr_cpu, def); + cpumask_set_cpu(curr_cpu, rcv); + if (curr_cpu >= possible) + curr_cpu = restart_cpu; + else + curr_cpu++; + } + /* def mask has non-rcv, rcv has recv mask */ + rcv_cpu = cpumask_first(rcv); + sdma_cpu = cpumask_first(def); + + /* + * Sanity check - the code expects all SDMA chip source + * interrupts to be in the same CSR, starting at bit 0. Verify + * that this is true by checking the bit location of the start. + */ + BUILD_BUG_ON(IS_SDMA_START % 64); + + for (i = 0; i < dd->num_msix_entries; i++) { + struct hfi1_msix_entry *me = &dd->msix_entries[i]; + const char *err_info; + irq_handler_t handler; + void *arg; + int idx; + struct hfi1_ctxtdata *rcd = NULL; + struct sdma_engine *sde = NULL; + + /* obtain the arguments to request_irq */ + if (first_general <= i && i < last_general) { + idx = i - first_general; + handler = general_interrupt; + arg = dd; + snprintf(me->name, sizeof(me->name), + DRIVER_NAME"_%d", dd->unit); + err_info = "general"; + } else if (first_sdma <= i && i < last_sdma) { + idx = i - first_sdma; + sde = &dd->per_sdma[idx]; + handler = sdma_interrupt; + arg = sde; + snprintf(me->name, sizeof(me->name), + DRIVER_NAME"_%d sdma%d", dd->unit, idx); + err_info = "sdma"; + remap_sdma_interrupts(dd, idx, i); + } else if (first_rx <= i && i < last_rx) { + idx = i - first_rx; + rcd = dd->rcd[idx]; + /* no interrupt if no rcd */ + if (!rcd) + continue; + /* + * Set the interrupt register and mask for this + * context's interrupt. + */ + rcd->ireg = (IS_RCVAVAIL_START+idx) / 64; + rcd->imask = ((u64)1) << + ((IS_RCVAVAIL_START+idx) % 64); + handler = receive_context_interrupt; + arg = rcd; + snprintf(me->name, sizeof(me->name), + DRIVER_NAME"_%d kctxt%d", dd->unit, idx); + err_info = "receive context"; + remap_receive_available_interrupt(dd, idx, i); + } else { + /* not in our expected range - complain, then + ignore it */ + dd_dev_err(dd, + "Unexpected extra MSI-X interrupt %d\n", i); + continue; + } + /* no argument, no interrupt */ + if (arg == NULL) + continue; + /* make sure the name is terminated */ + me->name[sizeof(me->name)-1] = 0; + + ret = request_irq(me->msix.vector, handler, 0, me->name, arg); + if (ret) { + dd_dev_err(dd, + "unable to allocate %s interrupt, vector %d, index %d, err %d\n", + err_info, me->msix.vector, idx, ret); + return ret; + } + /* + * assign arg after request_irq call, so it will be + * cleaned up + */ + me->arg = arg; + + if (!zalloc_cpumask_var( + &dd->msix_entries[i].mask, + GFP_KERNEL)) + goto bail; + if (handler == sdma_interrupt) { + dd_dev_info(dd, "sdma engine %d cpu %d\n", + sde->this_idx, sdma_cpu); + cpumask_set_cpu(sdma_cpu, dd->msix_entries[i].mask); + sdma_cpu = cpumask_next(sdma_cpu, def); + if (sdma_cpu >= nr_cpu_ids) + sdma_cpu = cpumask_first(def); + } else if (handler == receive_context_interrupt) { + dd_dev_info(dd, "rcv ctxt %d cpu %d\n", + rcd->ctxt, rcv_cpu); + cpumask_set_cpu(rcv_cpu, dd->msix_entries[i].mask); + rcv_cpu = cpumask_next(rcv_cpu, rcv); + if (rcv_cpu >= nr_cpu_ids) + rcv_cpu = cpumask_first(rcv); + } else { + /* otherwise first def */ + dd_dev_info(dd, "%s cpu %d\n", + err_info, cpumask_first(def)); + cpumask_set_cpu( + cpumask_first(def), dd->msix_entries[i].mask); + } + irq_set_affinity_hint( + dd->msix_entries[i].msix.vector, + dd->msix_entries[i].mask); + } + +out: + free_cpumask_var(def); + free_cpumask_var(rcv); + return ret; +bail: + ret = -ENOMEM; + goto out; +} + +/* + * Set the general handler to accept all interrupts, remap all + * chip interrupts back to MSI-X 0. + */ +static void reset_interrupts(struct hfi1_devdata *dd) +{ + int i; + + /* all interrupts handled by the general handler */ + for (i = 0; i < CCE_NUM_INT_CSRS; i++) + dd->gi_mask[i] = ~(u64)0; + + /* all chip interrupts map to MSI-X 0 */ + for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++) + write_csr(dd, CCE_INT_MAP + (8*i), 0); +} + +static int set_up_interrupts(struct hfi1_devdata *dd) +{ + struct hfi1_msix_entry *entries; + u32 total, request; + int i, ret; + int single_interrupt = 0; /* we expect to have all the interrupts */ + + /* + * Interrupt count: + * 1 general, "slow path" interrupt (includes the SDMA engines + * slow source, SDMACleanupDone) + * N interrupts - one per used SDMA engine + * M interrupt - one per kernel receive context + */ + total = 1 + dd->num_sdma + dd->n_krcv_queues; + + entries = kcalloc(total, sizeof(*entries), GFP_KERNEL); + if (!entries) { + dd_dev_err(dd, "cannot allocate msix table\n"); + ret = -ENOMEM; + goto fail; + } + /* 1-1 MSI-X entry assignment */ + for (i = 0; i < total; i++) + entries[i].msix.entry = i; + + /* ask for MSI-X interrupts */ + request = total; + request_msix(dd, &request, entries); + + if (request == 0) { + /* using INTx */ + /* dd->num_msix_entries already zero */ + kfree(entries); + single_interrupt = 1; + dd_dev_err(dd, "MSI-X failed, using INTx interrupts\n"); + } else { + /* using MSI-X */ + dd->num_msix_entries = request; + dd->msix_entries = entries; + + if (request != total) { + /* using MSI-X, with reduced interrupts */ + dd_dev_err( + dd, + "cannot handle reduced interrupt case, want %u, got %u\n", + total, request); + ret = -EINVAL; + goto fail; + } + dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total); + } + + /* mask all interrupts */ + set_intr_state(dd, 0); + /* clear all pending interrupts */ + clear_all_interrupts(dd); + + /* reset general handler mask, chip MSI-X mappings */ + reset_interrupts(dd); + + if (single_interrupt) + ret = request_intx_irq(dd); + else + ret = request_msix_irqs(dd); + if (ret) + goto fail; + + return 0; + +fail: + clean_up_interrupts(dd); + return ret; +} + +/* + * Set up context values in dd. Sets: + * + * num_rcv_contexts - number of contexts being used + * n_krcv_queues - number of kernel contexts + * first_user_ctxt - first non-kernel context in array of contexts + * freectxts - number of free user contexts + * num_send_contexts - number of PIO send contexts being used + */ +static int set_up_context_variables(struct hfi1_devdata *dd) +{ + int num_kernel_contexts; + int num_user_contexts; + int total_contexts; + int ret; + unsigned ngroups; + + /* + * Kernel contexts: (to be fixed later): + * - min or 2 or 1 context/numa + * - Context 0 - default/errors + * - Context 1 - VL15 + */ + if (n_krcvqs) + num_kernel_contexts = n_krcvqs + MIN_KERNEL_KCTXTS; + else + num_kernel_contexts = num_online_nodes(); + num_kernel_contexts = + max_t(int, MIN_KERNEL_KCTXTS, num_kernel_contexts); + /* + * Every kernel receive context needs an ACK send context. + * one send context is allocated for each VL{0-7} and VL15 + */ + if (num_kernel_contexts > (dd->chip_send_contexts - num_vls - 1)) { + dd_dev_err(dd, + "Reducing # kernel rcv contexts to: %d, from %d\n", + (int)(dd->chip_send_contexts - num_vls - 1), + (int)num_kernel_contexts); + num_kernel_contexts = dd->chip_send_contexts - num_vls - 1; + } + /* + * User contexts: (to be fixed later) + * - set to num_rcv_contexts if non-zero + * - default to 1 user context per CPU + */ + if (num_rcv_contexts) + num_user_contexts = num_rcv_contexts; + else + num_user_contexts = num_online_cpus(); + + total_contexts = num_kernel_contexts + num_user_contexts; + + /* + * Adjust the counts given a global max. + */ + if (total_contexts > dd->chip_rcv_contexts) { + dd_dev_err(dd, + "Reducing # user receive contexts to: %d, from %d\n", + (int)(dd->chip_rcv_contexts - num_kernel_contexts), + (int)num_user_contexts); + num_user_contexts = dd->chip_rcv_contexts - num_kernel_contexts; + /* recalculate */ + total_contexts = num_kernel_contexts + num_user_contexts; + } + + /* the first N are kernel contexts, the rest are user contexts */ + dd->num_rcv_contexts = total_contexts; + dd->n_krcv_queues = num_kernel_contexts; + dd->first_user_ctxt = num_kernel_contexts; + dd->freectxts = num_user_contexts; + dd_dev_info(dd, + "rcv contexts: chip %d, used %d (kernel %d, user %d)\n", + (int)dd->chip_rcv_contexts, + (int)dd->num_rcv_contexts, + (int)dd->n_krcv_queues, + (int)dd->num_rcv_contexts - dd->n_krcv_queues); + + /* + * Receive array allocation: + * All RcvArray entries are divided into groups of 8. This + * is required by the hardware and will speed up writes to + * consecutive entries by using write-combining of the entire + * cacheline. + * + * The number of groups are evenly divided among all contexts. + * any left over groups will be given to the first N user + * contexts. + */ + dd->rcv_entries.group_size = RCV_INCREMENT; + ngroups = dd->chip_rcv_array_count / dd->rcv_entries.group_size; + dd->rcv_entries.ngroups = ngroups / dd->num_rcv_contexts; + dd->rcv_entries.nctxt_extra = ngroups - + (dd->num_rcv_contexts * dd->rcv_entries.ngroups); + dd_dev_info(dd, "RcvArray groups %u, ctxts extra %u\n", + dd->rcv_entries.ngroups, + dd->rcv_entries.nctxt_extra); + if (dd->rcv_entries.ngroups * dd->rcv_entries.group_size > + MAX_EAGER_ENTRIES * 2) { + dd->rcv_entries.ngroups = (MAX_EAGER_ENTRIES * 2) / + dd->rcv_entries.group_size; + dd_dev_info(dd, + "RcvArray group count too high, change to %u\n", + dd->rcv_entries.ngroups); + dd->rcv_entries.nctxt_extra = 0; + } + /* + * PIO send contexts + */ + ret = init_sc_pools_and_sizes(dd); + if (ret >= 0) { /* success */ + dd->num_send_contexts = ret; + dd_dev_info( + dd, + "send contexts: chip %d, used %d (kernel %d, ack %d, user %d)\n", + dd->chip_send_contexts, + dd->num_send_contexts, + dd->sc_sizes[SC_KERNEL].count, + dd->sc_sizes[SC_ACK].count, + dd->sc_sizes[SC_USER].count); + ret = 0; /* success */ + } + + return ret; +} + +/* + * Set the device/port partition key table. The MAD code + * will ensure that, at least, the partial management + * partition key is present in the table. + */ +static void set_partition_keys(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 reg = 0; + int i; + + dd_dev_info(dd, "Setting partition keys\n"); + for (i = 0; i < hfi1_get_npkeys(dd); i++) { + reg |= (ppd->pkeys[i] & + RCV_PARTITION_KEY_PARTITION_KEY_A_MASK) << + ((i % 4) * + RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT); + /* Each register holds 4 PKey values. */ + if ((i % 4) == 3) { + write_csr(dd, RCV_PARTITION_KEY + + ((i - 3) * 2), reg); + reg = 0; + } + } + + /* Always enable HW pkeys check when pkeys table is set */ + add_rcvctrl(dd, RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK); +} + +/* + * These CSRs and memories are uninitialized on reset and must be + * written before reading to set the ECC/parity bits. + * + * NOTE: All user context CSRs that are not mmaped write-only + * (e.g. the TID flows) must be initialized even if the driver never + * reads them. + */ +static void write_uninitialized_csrs_and_memories(struct hfi1_devdata *dd) +{ + int i, j; + + /* CceIntMap */ + for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++) + write_csr(dd, CCE_INT_MAP+(8*i), 0); + + /* SendCtxtCreditReturnAddr */ + for (i = 0; i < dd->chip_send_contexts; i++) + write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0); + + /* PIO Send buffers */ + /* SDMA Send buffers */ + /* These are not normally read, and (presently) have no method + to be read, so are not pre-initialized */ + + /* RcvHdrAddr */ + /* RcvHdrTailAddr */ + /* RcvTidFlowTable */ + for (i = 0; i < dd->chip_rcv_contexts; i++) { + write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0); + write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0); + for (j = 0; j < RXE_NUM_TID_FLOWS; j++) + write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE+(8*j), 0); + } + + /* RcvArray */ + for (i = 0; i < dd->chip_rcv_array_count; i++) + write_csr(dd, RCV_ARRAY + (8*i), + RCV_ARRAY_RT_WRITE_ENABLE_SMASK); + + /* RcvQPMapTable */ + for (i = 0; i < 32; i++) + write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0); +} + +/* + * Use the ctrl_bits in CceCtrl to clear the status_bits in CceStatus. + */ +static void clear_cce_status(struct hfi1_devdata *dd, u64 status_bits, + u64 ctrl_bits) +{ + unsigned long timeout; + u64 reg; + + /* is the condition present? */ + reg = read_csr(dd, CCE_STATUS); + if ((reg & status_bits) == 0) + return; + + /* clear the condition */ + write_csr(dd, CCE_CTRL, ctrl_bits); + + /* wait for the condition to clear */ + timeout = jiffies + msecs_to_jiffies(CCE_STATUS_TIMEOUT); + while (1) { + reg = read_csr(dd, CCE_STATUS); + if ((reg & status_bits) == 0) + return; + if (time_after(jiffies, timeout)) { + dd_dev_err(dd, + "Timeout waiting for CceStatus to clear bits 0x%llx, remaining 0x%llx\n", + status_bits, reg & status_bits); + return; + } + udelay(1); + } +} + +/* set CCE CSRs to chip reset defaults */ +static void reset_cce_csrs(struct hfi1_devdata *dd) +{ + int i; + + /* CCE_REVISION read-only */ + /* CCE_REVISION2 read-only */ + /* CCE_CTRL - bits clear automatically */ + /* CCE_STATUS read-only, use CceCtrl to clear */ + clear_cce_status(dd, ALL_FROZE, CCE_CTRL_SPC_UNFREEZE_SMASK); + clear_cce_status(dd, ALL_TXE_PAUSE, CCE_CTRL_TXE_RESUME_SMASK); + clear_cce_status(dd, ALL_RXE_PAUSE, CCE_CTRL_RXE_RESUME_SMASK); + for (i = 0; i < CCE_NUM_SCRATCH; i++) + write_csr(dd, CCE_SCRATCH + (8 * i), 0); + /* CCE_ERR_STATUS read-only */ + write_csr(dd, CCE_ERR_MASK, 0); + write_csr(dd, CCE_ERR_CLEAR, ~0ull); + /* CCE_ERR_FORCE leave alone */ + for (i = 0; i < CCE_NUM_32_BIT_COUNTERS; i++) + write_csr(dd, CCE_COUNTER_ARRAY32 + (8 * i), 0); + write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_RESETCSR); + /* CCE_PCIE_CTRL leave alone */ + for (i = 0; i < CCE_NUM_MSIX_VECTORS; i++) { + write_csr(dd, CCE_MSIX_TABLE_LOWER + (8 * i), 0); + write_csr(dd, CCE_MSIX_TABLE_UPPER + (8 * i), + CCE_MSIX_TABLE_UPPER_RESETCSR); + } + for (i = 0; i < CCE_NUM_MSIX_PBAS; i++) { + /* CCE_MSIX_PBA read-only */ + write_csr(dd, CCE_MSIX_INT_GRANTED, ~0ull); + write_csr(dd, CCE_MSIX_VEC_CLR_WITHOUT_INT, ~0ull); + } + for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++) + write_csr(dd, CCE_INT_MAP, 0); + for (i = 0; i < CCE_NUM_INT_CSRS; i++) { + /* CCE_INT_STATUS read-only */ + write_csr(dd, CCE_INT_MASK + (8 * i), 0); + write_csr(dd, CCE_INT_CLEAR + (8 * i), ~0ull); + /* CCE_INT_FORCE leave alone */ + /* CCE_INT_BLOCKED read-only */ + } + for (i = 0; i < CCE_NUM_32_BIT_INT_COUNTERS; i++) + write_csr(dd, CCE_INT_COUNTER_ARRAY32 + (8 * i), 0); +} + +/* set ASIC CSRs to chip reset defaults */ +static void reset_asic_csrs(struct hfi1_devdata *dd) +{ + static DEFINE_MUTEX(asic_mutex); + static int called; + int i; + + /* + * If the HFIs are shared between separate nodes or VMs, + * then more will need to be done here. One idea is a module + * parameter that returns early, letting the first power-on or + * a known first load do the reset and blocking all others. + */ + + /* + * These CSRs should only be reset once - the first one here will + * do the work. Use a mutex so that a non-first caller waits until + * the first is finished before it can proceed. + */ + mutex_lock(&asic_mutex); + if (called) + goto done; + called = 1; + + if (dd->icode != ICODE_FPGA_EMULATION) { + /* emulation does not have an SBus - leave these alone */ + /* + * All writes to ASIC_CFG_SBUS_REQUEST do something. + * Notes: + * o The reset is not zero if aimed at the core. See the + * SBus documentation for details. + * o If the SBus firmware has been updated (e.g. by the BIOS), + * will the reset revert that? + */ + /* ASIC_CFG_SBUS_REQUEST leave alone */ + write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0); + } + /* ASIC_SBUS_RESULT read-only */ + write_csr(dd, ASIC_STS_SBUS_COUNTERS, 0); + for (i = 0; i < ASIC_NUM_SCRATCH; i++) + write_csr(dd, ASIC_CFG_SCRATCH + (8 * i), 0); + write_csr(dd, ASIC_CFG_MUTEX, 0); /* this will clear it */ + write_csr(dd, ASIC_CFG_DRV_STR, 0); + write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0); + /* ASIC_STS_THERM read-only */ + /* ASIC_CFG_RESET leave alone */ + + write_csr(dd, ASIC_PCIE_SD_HOST_CMD, 0); + /* ASIC_PCIE_SD_HOST_STATUS read-only */ + write_csr(dd, ASIC_PCIE_SD_INTRPT_DATA_CODE, 0); + write_csr(dd, ASIC_PCIE_SD_INTRPT_ENABLE, 0); + /* ASIC_PCIE_SD_INTRPT_PROGRESS read-only */ + write_csr(dd, ASIC_PCIE_SD_INTRPT_STATUS, ~0ull); /* clear */ + /* ASIC_HFI0_PCIE_SD_INTRPT_RSPD_DATA read-only */ + /* ASIC_HFI1_PCIE_SD_INTRPT_RSPD_DATA read-only */ + for (i = 0; i < 16; i++) + write_csr(dd, ASIC_PCIE_SD_INTRPT_LIST + (8 * i), 0); + + /* ASIC_GPIO_IN read-only */ + write_csr(dd, ASIC_GPIO_OE, 0); + write_csr(dd, ASIC_GPIO_INVERT, 0); + write_csr(dd, ASIC_GPIO_OUT, 0); + write_csr(dd, ASIC_GPIO_MASK, 0); + /* ASIC_GPIO_STATUS read-only */ + write_csr(dd, ASIC_GPIO_CLEAR, ~0ull); + /* ASIC_GPIO_FORCE leave alone */ + + /* ASIC_QSFP1_IN read-only */ + write_csr(dd, ASIC_QSFP1_OE, 0); + write_csr(dd, ASIC_QSFP1_INVERT, 0); + write_csr(dd, ASIC_QSFP1_OUT, 0); + write_csr(dd, ASIC_QSFP1_MASK, 0); + /* ASIC_QSFP1_STATUS read-only */ + write_csr(dd, ASIC_QSFP1_CLEAR, ~0ull); + /* ASIC_QSFP1_FORCE leave alone */ + + /* ASIC_QSFP2_IN read-only */ + write_csr(dd, ASIC_QSFP2_OE, 0); + write_csr(dd, ASIC_QSFP2_INVERT, 0); + write_csr(dd, ASIC_QSFP2_OUT, 0); + write_csr(dd, ASIC_QSFP2_MASK, 0); + /* ASIC_QSFP2_STATUS read-only */ + write_csr(dd, ASIC_QSFP2_CLEAR, ~0ull); + /* ASIC_QSFP2_FORCE leave alone */ + + write_csr(dd, ASIC_EEP_CTL_STAT, ASIC_EEP_CTL_STAT_RESETCSR); + /* this also writes a NOP command, clearing paging mode */ + write_csr(dd, ASIC_EEP_ADDR_CMD, 0); + write_csr(dd, ASIC_EEP_DATA, 0); + +done: + mutex_unlock(&asic_mutex); +} + +/* set MISC CSRs to chip reset defaults */ +static void reset_misc_csrs(struct hfi1_devdata *dd) +{ + int i; + + for (i = 0; i < 32; i++) { + write_csr(dd, MISC_CFG_RSA_R2 + (8 * i), 0); + write_csr(dd, MISC_CFG_RSA_SIGNATURE + (8 * i), 0); + write_csr(dd, MISC_CFG_RSA_MODULUS + (8 * i), 0); + } + /* MISC_CFG_SHA_PRELOAD leave alone - always reads 0 and can + only be written 128-byte chunks */ + /* init RSA engine to clear lingering errors */ + write_csr(dd, MISC_CFG_RSA_CMD, 1); + write_csr(dd, MISC_CFG_RSA_MU, 0); + write_csr(dd, MISC_CFG_FW_CTRL, 0); + /* MISC_STS_8051_DIGEST read-only */ + /* MISC_STS_SBM_DIGEST read-only */ + /* MISC_STS_PCIE_DIGEST read-only */ + /* MISC_STS_FAB_DIGEST read-only */ + /* MISC_ERR_STATUS read-only */ + write_csr(dd, MISC_ERR_MASK, 0); + write_csr(dd, MISC_ERR_CLEAR, ~0ull); + /* MISC_ERR_FORCE leave alone */ +} + +/* set TXE CSRs to chip reset defaults */ +static void reset_txe_csrs(struct hfi1_devdata *dd) +{ + int i; + + /* + * TXE Kernel CSRs + */ + write_csr(dd, SEND_CTRL, 0); + __cm_reset(dd, 0); /* reset CM internal state */ + /* SEND_CONTEXTS read-only */ + /* SEND_DMA_ENGINES read-only */ + /* SEND_PIO_MEM_SIZE read-only */ + /* SEND_DMA_MEM_SIZE read-only */ + write_csr(dd, SEND_HIGH_PRIORITY_LIMIT, 0); + pio_reset_all(dd); /* SEND_PIO_INIT_CTXT */ + /* SEND_PIO_ERR_STATUS read-only */ + write_csr(dd, SEND_PIO_ERR_MASK, 0); + write_csr(dd, SEND_PIO_ERR_CLEAR, ~0ull); + /* SEND_PIO_ERR_FORCE leave alone */ + /* SEND_DMA_ERR_STATUS read-only */ + write_csr(dd, SEND_DMA_ERR_MASK, 0); + write_csr(dd, SEND_DMA_ERR_CLEAR, ~0ull); + /* SEND_DMA_ERR_FORCE leave alone */ + /* SEND_EGRESS_ERR_STATUS read-only */ + write_csr(dd, SEND_EGRESS_ERR_MASK, 0); + write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~0ull); + /* SEND_EGRESS_ERR_FORCE leave alone */ + write_csr(dd, SEND_BTH_QP, 0); + write_csr(dd, SEND_STATIC_RATE_CONTROL, 0); + write_csr(dd, SEND_SC2VLT0, 0); + write_csr(dd, SEND_SC2VLT1, 0); + write_csr(dd, SEND_SC2VLT2, 0); + write_csr(dd, SEND_SC2VLT3, 0); + write_csr(dd, SEND_LEN_CHECK0, 0); + write_csr(dd, SEND_LEN_CHECK1, 0); + /* SEND_ERR_STATUS read-only */ + write_csr(dd, SEND_ERR_MASK, 0); + write_csr(dd, SEND_ERR_CLEAR, ~0ull); + /* SEND_ERR_FORCE read-only */ + for (i = 0; i < VL_ARB_LOW_PRIO_TABLE_SIZE; i++) + write_csr(dd, SEND_LOW_PRIORITY_LIST + (8*i), 0); + for (i = 0; i < VL_ARB_HIGH_PRIO_TABLE_SIZE; i++) + write_csr(dd, SEND_HIGH_PRIORITY_LIST + (8*i), 0); + for (i = 0; i < dd->chip_send_contexts/NUM_CONTEXTS_PER_SET; i++) + write_csr(dd, SEND_CONTEXT_SET_CTRL + (8*i), 0); + for (i = 0; i < TXE_NUM_32_BIT_COUNTER; i++) + write_csr(dd, SEND_COUNTER_ARRAY32 + (8*i), 0); + for (i = 0; i < TXE_NUM_64_BIT_COUNTER; i++) + write_csr(dd, SEND_COUNTER_ARRAY64 + (8*i), 0); + write_csr(dd, SEND_CM_CTRL, SEND_CM_CTRL_RESETCSR); + write_csr(dd, SEND_CM_GLOBAL_CREDIT, + SEND_CM_GLOBAL_CREDIT_RESETCSR); + /* SEND_CM_CREDIT_USED_STATUS read-only */ + write_csr(dd, SEND_CM_TIMER_CTRL, 0); + write_csr(dd, SEND_CM_LOCAL_AU_TABLE0_TO3, 0); + write_csr(dd, SEND_CM_LOCAL_AU_TABLE4_TO7, 0); + write_csr(dd, SEND_CM_REMOTE_AU_TABLE0_TO3, 0); + write_csr(dd, SEND_CM_REMOTE_AU_TABLE4_TO7, 0); + for (i = 0; i < TXE_NUM_DATA_VL; i++) + write_csr(dd, SEND_CM_CREDIT_VL + (8*i), 0); + write_csr(dd, SEND_CM_CREDIT_VL15, 0); + /* SEND_CM_CREDIT_USED_VL read-only */ + /* SEND_CM_CREDIT_USED_VL15 read-only */ + /* SEND_EGRESS_CTXT_STATUS read-only */ + /* SEND_EGRESS_SEND_DMA_STATUS read-only */ + write_csr(dd, SEND_EGRESS_ERR_INFO, ~0ull); + /* SEND_EGRESS_ERR_INFO read-only */ + /* SEND_EGRESS_ERR_SOURCE read-only */ + + /* + * TXE Per-Context CSRs + */ + for (i = 0; i < dd->chip_send_contexts; i++) { + write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_CTRL, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_FORCE, 0); + write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, 0); + write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~0ull); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_ENABLE, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_VL, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_JOB_KEY, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_PARTITION_KEY, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_OPCODE, 0); + } + + /* + * TXE Per-SDMA CSRs + */ + for (i = 0; i < dd->chip_sdma_engines; i++) { + write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0); + /* SEND_DMA_STATUS read-only */ + write_kctxt_csr(dd, i, SEND_DMA_BASE_ADDR, 0); + write_kctxt_csr(dd, i, SEND_DMA_LEN_GEN, 0); + write_kctxt_csr(dd, i, SEND_DMA_TAIL, 0); + /* SEND_DMA_HEAD read-only */ + write_kctxt_csr(dd, i, SEND_DMA_HEAD_ADDR, 0); + write_kctxt_csr(dd, i, SEND_DMA_PRIORITY_THLD, 0); + /* SEND_DMA_IDLE_CNT read-only */ + write_kctxt_csr(dd, i, SEND_DMA_RELOAD_CNT, 0); + write_kctxt_csr(dd, i, SEND_DMA_DESC_CNT, 0); + /* SEND_DMA_DESC_FETCHED_CNT read-only */ + /* SEND_DMA_ENG_ERR_STATUS read-only */ + write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, 0); + write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~0ull); + /* SEND_DMA_ENG_ERR_FORCE leave alone */ + write_kctxt_csr(dd, i, SEND_DMA_CHECK_ENABLE, 0); + write_kctxt_csr(dd, i, SEND_DMA_CHECK_VL, 0); + write_kctxt_csr(dd, i, SEND_DMA_CHECK_JOB_KEY, 0); + write_kctxt_csr(dd, i, SEND_DMA_CHECK_PARTITION_KEY, 0); + write_kctxt_csr(dd, i, SEND_DMA_CHECK_SLID, 0); + write_kctxt_csr(dd, i, SEND_DMA_CHECK_OPCODE, 0); + write_kctxt_csr(dd, i, SEND_DMA_MEMORY, 0); + } +} + +/* + * Expect on entry: + * o Packet ingress is disabled, i.e. RcvCtrl.RcvPortEnable == 0 + */ +static void init_rbufs(struct hfi1_devdata *dd) +{ + u64 reg; + int count; + + /* + * Wait for DMA to stop: RxRbufPktPending and RxPktInProgress are + * clear. + */ + count = 0; + while (1) { + reg = read_csr(dd, RCV_STATUS); + if ((reg & (RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK + | RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK)) == 0) + break; + /* + * Give up after 1ms - maximum wait time. + * + * RBuf size is 148KiB. Slowest possible is PCIe Gen1 x1 at + * 250MB/s bandwidth. Lower rate to 66% for overhead to get: + * 148 KB / (66% * 250MB/s) = 920us + */ + if (count++ > 500) { + dd_dev_err(dd, + "%s: in-progress DMA not clearing: RcvStatus 0x%llx, continuing\n", + __func__, reg); + break; + } + udelay(2); /* do not busy-wait the CSR */ + } + + /* start the init - expect RcvCtrl to be 0 */ + write_csr(dd, RCV_CTRL, RCV_CTRL_RX_RBUF_INIT_SMASK); + + /* + * Read to force the write of Rcvtrl.RxRbufInit. There is a brief + * period after the write before RcvStatus.RxRbufInitDone is valid. + * The delay in the first run through the loop below is sufficient and + * required before the first read of RcvStatus.RxRbufInintDone. + */ + read_csr(dd, RCV_CTRL); + + /* wait for the init to finish */ + count = 0; + while (1) { + /* delay is required first time through - see above */ + udelay(2); /* do not busy-wait the CSR */ + reg = read_csr(dd, RCV_STATUS); + if (reg & (RCV_STATUS_RX_RBUF_INIT_DONE_SMASK)) + break; + + /* give up after 100us - slowest possible at 33MHz is 73us */ + if (count++ > 50) { + dd_dev_err(dd, + "%s: RcvStatus.RxRbufInit not set, continuing\n", + __func__); + break; + } + } +} + +/* set RXE CSRs to chip reset defaults */ +static void reset_rxe_csrs(struct hfi1_devdata *dd) +{ + int i, j; + + /* + * RXE Kernel CSRs + */ + write_csr(dd, RCV_CTRL, 0); + init_rbufs(dd); + /* RCV_STATUS read-only */ + /* RCV_CONTEXTS read-only */ + /* RCV_ARRAY_CNT read-only */ + /* RCV_BUF_SIZE read-only */ + write_csr(dd, RCV_BTH_QP, 0); + write_csr(dd, RCV_MULTICAST, 0); + write_csr(dd, RCV_BYPASS, 0); + write_csr(dd, RCV_VL15, 0); + /* this is a clear-down */ + write_csr(dd, RCV_ERR_INFO, + RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK); + /* RCV_ERR_STATUS read-only */ + write_csr(dd, RCV_ERR_MASK, 0); + write_csr(dd, RCV_ERR_CLEAR, ~0ull); + /* RCV_ERR_FORCE leave alone */ + for (i = 0; i < 32; i++) + write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0); + for (i = 0; i < 4; i++) + write_csr(dd, RCV_PARTITION_KEY + (8 * i), 0); + for (i = 0; i < RXE_NUM_32_BIT_COUNTERS; i++) + write_csr(dd, RCV_COUNTER_ARRAY32 + (8 * i), 0); + for (i = 0; i < RXE_NUM_64_BIT_COUNTERS; i++) + write_csr(dd, RCV_COUNTER_ARRAY64 + (8 * i), 0); + for (i = 0; i < RXE_NUM_RSM_INSTANCES; i++) { + write_csr(dd, RCV_RSM_CFG + (8 * i), 0); + write_csr(dd, RCV_RSM_SELECT + (8 * i), 0); + write_csr(dd, RCV_RSM_MATCH + (8 * i), 0); + } + for (i = 0; i < 32; i++) + write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), 0); + + /* + * RXE Kernel and User Per-Context CSRs + */ + for (i = 0; i < dd->chip_rcv_contexts; i++) { + /* kernel */ + write_kctxt_csr(dd, i, RCV_CTXT_CTRL, 0); + /* RCV_CTXT_STATUS read-only */ + write_kctxt_csr(dd, i, RCV_EGR_CTRL, 0); + write_kctxt_csr(dd, i, RCV_TID_CTRL, 0); + write_kctxt_csr(dd, i, RCV_KEY_CTRL, 0); + write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0); + write_kctxt_csr(dd, i, RCV_HDR_CNT, 0); + write_kctxt_csr(dd, i, RCV_HDR_ENT_SIZE, 0); + write_kctxt_csr(dd, i, RCV_HDR_SIZE, 0); + write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0); + write_kctxt_csr(dd, i, RCV_AVAIL_TIME_OUT, 0); + write_kctxt_csr(dd, i, RCV_HDR_OVFL_CNT, 0); + + /* user */ + /* RCV_HDR_TAIL read-only */ + write_uctxt_csr(dd, i, RCV_HDR_HEAD, 0); + /* RCV_EGR_INDEX_TAIL read-only */ + write_uctxt_csr(dd, i, RCV_EGR_INDEX_HEAD, 0); + /* RCV_EGR_OFFSET_TAIL read-only */ + for (j = 0; j < RXE_NUM_TID_FLOWS; j++) { + write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE + (8 * j), + 0); + } + } +} + +/* + * Set sc2vl tables. + * + * They power on to zeros, so to avoid send context errors + * they need to be set: + * + * SC 0-7 -> VL 0-7 (respectively) + * SC 15 -> VL 15 + * otherwise + * -> VL 0 + */ +static void init_sc2vl_tables(struct hfi1_devdata *dd) +{ + int i; + /* init per architecture spec, constrained by hardware capability */ + + /* HFI maps sent packets */ + write_csr(dd, SEND_SC2VLT0, SC2VL_VAL( + 0, + 0, 0, 1, 1, + 2, 2, 3, 3, + 4, 4, 5, 5, + 6, 6, 7, 7)); + write_csr(dd, SEND_SC2VLT1, SC2VL_VAL( + 1, + 8, 0, 9, 0, + 10, 0, 11, 0, + 12, 0, 13, 0, + 14, 0, 15, 15)); + write_csr(dd, SEND_SC2VLT2, SC2VL_VAL( + 2, + 16, 0, 17, 0, + 18, 0, 19, 0, + 20, 0, 21, 0, + 22, 0, 23, 0)); + write_csr(dd, SEND_SC2VLT3, SC2VL_VAL( + 3, + 24, 0, 25, 0, + 26, 0, 27, 0, + 28, 0, 29, 0, + 30, 0, 31, 0)); + + /* DC maps received packets */ + write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0, DC_SC_VL_VAL( + 15_0, + 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, + 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 15)); + write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16, DC_SC_VL_VAL( + 31_16, + 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0, + 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0)); + + /* initialize the cached sc2vl values consistently with h/w */ + for (i = 0; i < 32; i++) { + if (i < 8 || i == 15) + *((u8 *)(dd->sc2vl) + i) = (u8)i; + else + *((u8 *)(dd->sc2vl) + i) = 0; + } +} + +/* + * Read chip sizes and then reset parts to sane, disabled, values. We cannot + * depend on the chip going through a power-on reset - a driver may be loaded + * and unloaded many times. + * + * Do not write any CSR values to the chip in this routine - there may be + * a reset following the (possible) FLR in this routine. + * + */ +static void init_chip(struct hfi1_devdata *dd) +{ + int i; + + /* + * Put the HFI CSRs in a known state. + * Combine this with a DC reset. + * + * Stop the device from doing anything while we do a + * reset. We know there are no other active users of + * the device since we are now in charge. Turn off + * off all outbound and inbound traffic and make sure + * the device does not generate any interrupts. + */ + + /* disable send contexts and SDMA engines */ + write_csr(dd, SEND_CTRL, 0); + for (i = 0; i < dd->chip_send_contexts; i++) + write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0); + for (i = 0; i < dd->chip_sdma_engines; i++) + write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0); + /* disable port (turn off RXE inbound traffic) and contexts */ + write_csr(dd, RCV_CTRL, 0); + for (i = 0; i < dd->chip_rcv_contexts; i++) + write_csr(dd, RCV_CTXT_CTRL, 0); + /* mask all interrupt sources */ + for (i = 0; i < CCE_NUM_INT_CSRS; i++) + write_csr(dd, CCE_INT_MASK + (8*i), 0ull); + + /* + * DC Reset: do a full DC reset before the register clear. + * A recommended length of time to hold is one CSR read, + * so reread the CceDcCtrl. Then, hold the DC in reset + * across the clear. + */ + write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK); + (void) read_csr(dd, CCE_DC_CTRL); + + if (use_flr) { + /* + * A FLR will reset the SPC core and part of the PCIe. + * The parts that need to be restored have already been + * saved. + */ + dd_dev_info(dd, "Resetting CSRs with FLR\n"); + + /* do the FLR, the DC reset will remain */ + hfi1_pcie_flr(dd); + + /* restore command and BARs */ + restore_pci_variables(dd); + + if (is_a0(dd)) { + dd_dev_info(dd, "Resetting CSRs with FLR\n"); + hfi1_pcie_flr(dd); + restore_pci_variables(dd); + } + + } else { + dd_dev_info(dd, "Resetting CSRs with writes\n"); + reset_cce_csrs(dd); + reset_txe_csrs(dd); + reset_rxe_csrs(dd); + reset_asic_csrs(dd); + reset_misc_csrs(dd); + } + /* clear the DC reset */ + write_csr(dd, CCE_DC_CTRL, 0); + /* Set the LED off */ + if (is_a0(dd)) + setextled(dd, 0); + /* + * Clear the QSFP reset. + * A0 leaves the out lines floating on power on, then on an FLR + * enforces a 0 on all out pins. The driver does not touch + * ASIC_QSFPn_OUT otherwise. This leaves RESET_N low and + * anything plugged constantly in reset, if it pays attention + * to RESET_N. + * A prime example of this is SiPh. For now, set all pins high. + * I2CCLK and I2CDAT will change per direction, and INT_N and + * MODPRS_N are input only and their value is ignored. + */ + if (is_a0(dd)) { + write_csr(dd, ASIC_QSFP1_OUT, 0x1f); + write_csr(dd, ASIC_QSFP2_OUT, 0x1f); + } +} + +static void init_early_variables(struct hfi1_devdata *dd) +{ + int i; + + /* assign link credit variables */ + dd->vau = CM_VAU; + dd->link_credits = CM_GLOBAL_CREDITS; + if (is_a0(dd)) + dd->link_credits--; + dd->vcu = cu_to_vcu(hfi1_cu); + /* enough room for 8 MAD packets plus header - 17K */ + dd->vl15_init = (8 * (2048 + 128)) / vau_to_au(dd->vau); + if (dd->vl15_init > dd->link_credits) + dd->vl15_init = dd->link_credits; + + write_uninitialized_csrs_and_memories(dd); + + if (HFI1_CAP_IS_KSET(PKEY_CHECK)) + for (i = 0; i < dd->num_pports; i++) { + struct hfi1_pportdata *ppd = &dd->pport[i]; + + set_partition_keys(ppd); + } + init_sc2vl_tables(dd); +} + +static void init_kdeth_qp(struct hfi1_devdata *dd) +{ + /* user changed the KDETH_QP */ + if (kdeth_qp != 0 && kdeth_qp >= 0xff) { + /* out of range or illegal value */ + dd_dev_err(dd, "Invalid KDETH queue pair prefix, ignoring"); + kdeth_qp = 0; + } + if (kdeth_qp == 0) /* not set, or failed range check */ + kdeth_qp = DEFAULT_KDETH_QP; + + write_csr(dd, SEND_BTH_QP, + (kdeth_qp & SEND_BTH_QP_KDETH_QP_MASK) + << SEND_BTH_QP_KDETH_QP_SHIFT); + + write_csr(dd, RCV_BTH_QP, + (kdeth_qp & RCV_BTH_QP_KDETH_QP_MASK) + << RCV_BTH_QP_KDETH_QP_SHIFT); +} + +/** + * init_qpmap_table + * @dd - device data + * @first_ctxt - first context + * @last_ctxt - first context + * + * This return sets the qpn mapping table that + * is indexed by qpn[8:1]. + * + * The routine will round robin the 256 settings + * from first_ctxt to last_ctxt. + * + * The first/last looks ahead to having specialized + * receive contexts for mgmt and bypass. Normal + * verbs traffic will assumed to be on a range + * of receive contexts. + */ +static void init_qpmap_table(struct hfi1_devdata *dd, + u32 first_ctxt, + u32 last_ctxt) +{ + u64 reg = 0; + u64 regno = RCV_QP_MAP_TABLE; + int i; + u64 ctxt = first_ctxt; + + for (i = 0; i < 256;) { + if (ctxt == VL15CTXT) { + ctxt++; + if (ctxt > last_ctxt) + ctxt = first_ctxt; + continue; + } + reg |= ctxt << (8 * (i % 8)); + i++; + ctxt++; + if (ctxt > last_ctxt) + ctxt = first_ctxt; + if (i % 8 == 0) { + write_csr(dd, regno, reg); + reg = 0; + regno += 8; + } + } + if (i % 8) + write_csr(dd, regno, reg); + + add_rcvctrl(dd, RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK + | RCV_CTRL_RCV_BYPASS_ENABLE_SMASK); +} + +/** + * init_qos - init RX qos + * @dd - device data + * @first_context + * + * This routine initializes Rule 0 and the + * RSM map table to implement qos. + * + * If all of the limit tests succeed, + * qos is applied based on the array + * interpretation of krcvqs where + * entry 0 is VL0. + * + * The number of vl bits (n) and the number of qpn + * bits (m) are computed to feed both the RSM map table + * and the single rule. + * + */ +static void init_qos(struct hfi1_devdata *dd, u32 first_ctxt) +{ + u8 max_by_vl = 0; + unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m; + u64 *rsmmap; + u64 reg; + u8 rxcontext = is_a0(dd) ? 0 : 0xff; /* 0 is default if a0 ver. */ + + /* validate */ + if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS || + num_vls == 1 || + krcvqsset <= 1) + goto bail; + for (i = 0; i < min_t(unsigned, num_vls, krcvqsset); i++) + if (krcvqs[i] > max_by_vl) + max_by_vl = krcvqs[i]; + if (max_by_vl > 32) + goto bail; + qpns_per_vl = __roundup_pow_of_two(max_by_vl); + /* determine bits vl */ + n = ilog2(num_vls); + /* determine bits for qpn */ + m = ilog2(qpns_per_vl); + if ((m + n) > 7) + goto bail; + if (num_vls * qpns_per_vl > dd->chip_rcv_contexts) + goto bail; + rsmmap = kmalloc_array(NUM_MAP_REGS, sizeof(u64), GFP_KERNEL); + memset(rsmmap, rxcontext, NUM_MAP_REGS * sizeof(u64)); + /* init the local copy of the table */ + for (i = 0, ctxt = first_ctxt; i < num_vls; i++) { + unsigned tctxt; + + for (qpn = 0, tctxt = ctxt; + krcvqs[i] && qpn < qpns_per_vl; qpn++) { + unsigned idx, regoff, regidx; + + /* generate index <= 128 */ + idx = (qpn << n) ^ i; + regoff = (idx % 8) * 8; + regidx = idx / 8; + reg = rsmmap[regidx]; + /* replace 0xff with context number */ + reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK + << regoff); + reg |= (u64)(tctxt++) << regoff; + rsmmap[regidx] = reg; + if (tctxt == ctxt + krcvqs[i]) + tctxt = ctxt; + } + ctxt += krcvqs[i]; + } + /* flush cached copies to chip */ + for (i = 0; i < NUM_MAP_REGS; i++) + write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rsmmap[i]); + /* add rule0 */ + write_csr(dd, RCV_RSM_CFG /* + (8 * 0) */, + RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK + << RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT | + 2ull << RCV_RSM_CFG_PACKET_TYPE_SHIFT); + write_csr(dd, RCV_RSM_SELECT /* + (8 * 0) */, + LRH_BTH_MATCH_OFFSET + << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT | + LRH_SC_MATCH_OFFSET << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT | + LRH_SC_SELECT_OFFSET << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT | + ((u64)n) << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT | + QPN_SELECT_OFFSET << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT | + ((u64)m + (u64)n) << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT); + write_csr(dd, RCV_RSM_MATCH /* + (8 * 0) */, + LRH_BTH_MASK << RCV_RSM_MATCH_MASK1_SHIFT | + LRH_BTH_VALUE << RCV_RSM_MATCH_VALUE1_SHIFT | + LRH_SC_MASK << RCV_RSM_MATCH_MASK2_SHIFT | + LRH_SC_VALUE << RCV_RSM_MATCH_VALUE2_SHIFT); + /* Enable RSM */ + add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK); + kfree(rsmmap); + /* map everything else (non-VL15) to context 0 */ + init_qpmap_table( + dd, + 0, + 0); + dd->qos_shift = n + 1; + return; +bail: + dd->qos_shift = 1; + init_qpmap_table( + dd, + dd->n_krcv_queues > MIN_KERNEL_KCTXTS ? MIN_KERNEL_KCTXTS : 0, + dd->n_krcv_queues - 1); +} + +static void init_rxe(struct hfi1_devdata *dd) +{ + /* enable all receive errors */ + write_csr(dd, RCV_ERR_MASK, ~0ull); + /* setup QPN map table - start where VL15 context leaves off */ + init_qos( + dd, + dd->n_krcv_queues > MIN_KERNEL_KCTXTS ? MIN_KERNEL_KCTXTS : 0); + /* + * make sure RcvCtrl.RcvWcb <= PCIe Device Control + * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config + * space, PciCfgCap2.MaxPayloadSize in HFI). There is only one + * invalid configuration: RcvCtrl.RcvWcb set to its max of 256 and + * Max_PayLoad_Size set to its minimum of 128. + * + * Presently, RcvCtrl.RcvWcb is not modified from its default of 0 + * (64 bytes). Max_Payload_Size is possibly modified upward in + * tune_pcie_caps() which is called after this routine. + */ +} + +static void init_other(struct hfi1_devdata *dd) +{ + /* enable all CCE errors */ + write_csr(dd, CCE_ERR_MASK, ~0ull); + /* enable *some* Misc errors */ + write_csr(dd, MISC_ERR_MASK, DRIVER_MISC_MASK); + /* enable all DC errors, except LCB */ + write_csr(dd, DCC_ERR_FLG_EN, ~0ull); + write_csr(dd, DC_DC8051_ERR_EN, ~0ull); +} + +/* + * Fill out the given AU table using the given CU. A CU is defined in terms + * AUs. The table is a an encoding: given the index, how many AUs does that + * represent? + * + * NOTE: Assumes that the register layout is the same for the + * local and remote tables. + */ +static void assign_cm_au_table(struct hfi1_devdata *dd, u32 cu, + u32 csr0to3, u32 csr4to7) +{ + write_csr(dd, csr0to3, + 0ull << + SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT + | 1ull << + SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT + | 2ull * cu << + SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT + | 4ull * cu << + SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT); + write_csr(dd, csr4to7, + 8ull * cu << + SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT + | 16ull * cu << + SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT + | 32ull * cu << + SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT + | 64ull * cu << + SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT); + +} + +static void assign_local_cm_au_table(struct hfi1_devdata *dd, u8 vcu) +{ + assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_LOCAL_AU_TABLE0_TO3, + SEND_CM_LOCAL_AU_TABLE4_TO7); +} + +void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu) +{ + assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_REMOTE_AU_TABLE0_TO3, + SEND_CM_REMOTE_AU_TABLE4_TO7); +} + +static void init_txe(struct hfi1_devdata *dd) +{ + int i; + + /* enable all PIO, SDMA, general, and Egress errors */ + write_csr(dd, SEND_PIO_ERR_MASK, ~0ull); + write_csr(dd, SEND_DMA_ERR_MASK, ~0ull); + write_csr(dd, SEND_ERR_MASK, ~0ull); + write_csr(dd, SEND_EGRESS_ERR_MASK, ~0ull); + + /* enable all per-context and per-SDMA engine errors */ + for (i = 0; i < dd->chip_send_contexts; i++) + write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, ~0ull); + for (i = 0; i < dd->chip_sdma_engines; i++) + write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, ~0ull); + + /* set the local CU to AU mapping */ + assign_local_cm_au_table(dd, dd->vcu); + + /* + * Set reasonable default for Credit Return Timer + * Don't set on Simulator - causes it to choke. + */ + if (dd->icode != ICODE_FUNCTIONAL_SIMULATOR) + write_csr(dd, SEND_CM_TIMER_CTRL, HFI1_CREDIT_RETURN_RATE); +} + +int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey) +{ + struct hfi1_ctxtdata *rcd = dd->rcd[ctxt]; + unsigned sctxt; + int ret = 0; + u64 reg; + + if (!rcd || !rcd->sc) { + ret = -EINVAL; + goto done; + } + sctxt = rcd->sc->hw_context; + reg = SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK | /* mask is always 1's */ + ((jkey & SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK) << + SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT); + /* JOB_KEY_ALLOW_PERMISSIVE is not allowed by default */ + if (HFI1_CAP_KGET_MASK(rcd->flags, ALLOW_PERM_JKEY)) + reg |= SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK; + write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, reg); + /* + * Enable send-side J_KEY integrity check, unless this is A0 h/w + * (due to A0 erratum). + */ + if (!is_a0(dd)) { + reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE); + reg |= SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; + write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg); + } + + /* Enable J_KEY check on receive context. */ + reg = RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK | + ((jkey & RCV_KEY_CTRL_JOB_KEY_VALUE_MASK) << + RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT); + write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, reg); +done: + return ret; +} + +int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt) +{ + struct hfi1_ctxtdata *rcd = dd->rcd[ctxt]; + unsigned sctxt; + int ret = 0; + u64 reg; + + if (!rcd || !rcd->sc) { + ret = -EINVAL; + goto done; + } + sctxt = rcd->sc->hw_context; + write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, 0); + /* + * Disable send-side J_KEY integrity check, unless this is A0 h/w. + * This check would not have been enabled for A0 h/w, see + * set_ctxt_jkey(). + */ + if (!is_a0(dd)) { + reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE); + reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; + write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg); + } + /* Turn off the J_KEY on the receive side */ + write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, 0); +done: + return ret; +} + +int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey) +{ + struct hfi1_ctxtdata *rcd; + unsigned sctxt; + int ret = 0; + u64 reg; + + if (ctxt < dd->num_rcv_contexts) + rcd = dd->rcd[ctxt]; + else { + ret = -EINVAL; + goto done; + } + if (!rcd || !rcd->sc) { + ret = -EINVAL; + goto done; + } + sctxt = rcd->sc->hw_context; + reg = ((u64)pkey & SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK) << + SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT; + write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg); + reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE); + reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK; + write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg); +done: + return ret; +} + +int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt) +{ + struct hfi1_ctxtdata *rcd; + unsigned sctxt; + int ret = 0; + u64 reg; + + if (ctxt < dd->num_rcv_contexts) + rcd = dd->rcd[ctxt]; + else { + ret = -EINVAL; + goto done; + } + if (!rcd || !rcd->sc) { + ret = -EINVAL; + goto done; + } + sctxt = rcd->sc->hw_context; + reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE); + reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK; + write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg); + write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, 0); +done: + return ret; +} + +/* + * Start doing the clean up the the chip. Our clean up happens in multiple + * stages and this is just the first. + */ +void hfi1_start_cleanup(struct hfi1_devdata *dd) +{ + free_cntrs(dd); + free_rcverr(dd); + clean_up_interrupts(dd); +} + +#define HFI_BASE_GUID(dev) \ + ((dev)->base_guid & ~(1ULL << GUID_HFI_INDEX_SHIFT)) + +/* + * Certain chip functions need to be initialized only once per asic + * instead of per-device. This function finds the peer device and + * checks whether that chip initialization needs to be done by this + * device. + */ +static void asic_should_init(struct hfi1_devdata *dd) +{ + unsigned long flags; + struct hfi1_devdata *tmp, *peer = NULL; + + spin_lock_irqsave(&hfi1_devs_lock, flags); + /* Find our peer device */ + list_for_each_entry(tmp, &hfi1_dev_list, list) { + if ((HFI_BASE_GUID(dd) == HFI_BASE_GUID(tmp)) && + dd->unit != tmp->unit) { + peer = tmp; + break; + } + } + + /* + * "Claim" the ASIC for initialization if it hasn't been + " "claimed" yet. + */ + if (!peer || !(peer->flags & HFI1_DO_INIT_ASIC)) + dd->flags |= HFI1_DO_INIT_ASIC; + spin_unlock_irqrestore(&hfi1_devs_lock, flags); +} + +/** + * Allocate an initialize the device structure for the hfi. + * @dev: the pci_dev for hfi1_ib device + * @ent: pci_device_id struct for this dev + * + * Also allocates, initializes, and returns the devdata struct for this + * device instance + * + * This is global, and is called directly at init to set up the + * chip-specific function pointers for later use. + */ +struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + struct hfi1_devdata *dd; + struct hfi1_pportdata *ppd; + u64 reg; + int i, ret; + static const char * const inames[] = { /* implementation names */ + "RTL silicon", + "RTL VCS simulation", + "RTL FPGA emulation", + "Functional simulator" + }; + + dd = hfi1_alloc_devdata(pdev, + NUM_IB_PORTS * sizeof(struct hfi1_pportdata)); + if (IS_ERR(dd)) + goto bail; + ppd = dd->pport; + for (i = 0; i < dd->num_pports; i++, ppd++) { + int vl; + /* init common fields */ + hfi1_init_pportdata(pdev, ppd, dd, 0, 1); + /* DC supports 4 link widths */ + ppd->link_width_supported = + OPA_LINK_WIDTH_1X | OPA_LINK_WIDTH_2X | + OPA_LINK_WIDTH_3X | OPA_LINK_WIDTH_4X; + ppd->link_width_downgrade_supported = + ppd->link_width_supported; + /* start out enabling only 4X */ + ppd->link_width_enabled = OPA_LINK_WIDTH_4X; + ppd->link_width_downgrade_enabled = + ppd->link_width_downgrade_supported; + /* link width active is 0 when link is down */ + /* link width downgrade active is 0 when link is down */ + + if (num_vls < HFI1_MIN_VLS_SUPPORTED + || num_vls > HFI1_MAX_VLS_SUPPORTED) { + hfi1_early_err(&pdev->dev, + "Invalid num_vls %u, using %u VLs\n", + num_vls, HFI1_MAX_VLS_SUPPORTED); + num_vls = HFI1_MAX_VLS_SUPPORTED; + } + ppd->vls_supported = num_vls; + ppd->vls_operational = ppd->vls_supported; + /* Set the default MTU. */ + for (vl = 0; vl < num_vls; vl++) + dd->vld[vl].mtu = hfi1_max_mtu; + dd->vld[15].mtu = MAX_MAD_PACKET; + /* + * Set the initial values to reasonable default, will be set + * for real when link is up. + */ + ppd->lstate = IB_PORT_DOWN; + ppd->overrun_threshold = 0x4; + ppd->phy_error_threshold = 0xf; + ppd->port_crc_mode_enabled = link_crc_mask; + /* initialize supported LTP CRC mode */ + ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8; + /* initialize enabled LTP CRC mode */ + ppd->port_ltp_crc_mode |= cap_to_port_ltp(link_crc_mask) << 4; + /* start in offline */ + ppd->host_link_state = HLS_DN_OFFLINE; + init_vl_arb_caches(ppd); + } + + dd->link_default = HLS_DN_POLL; + + /* + * Do remaining PCIe setup and save PCIe values in dd. + * Any error printing is already done by the init code. + * On return, we have the chip mapped. + */ + ret = hfi1_pcie_ddinit(dd, pdev, ent); + if (ret < 0) + goto bail_free; + + /* verify that reads actually work, save revision for reset check */ + dd->revision = read_csr(dd, CCE_REVISION); + if (dd->revision == ~(u64)0) { + dd_dev_err(dd, "cannot read chip CSRs\n"); + ret = -EINVAL; + goto bail_cleanup; + } + dd->majrev = (dd->revision >> CCE_REVISION_CHIP_REV_MAJOR_SHIFT) + & CCE_REVISION_CHIP_REV_MAJOR_MASK; + dd->minrev = (dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT) + & CCE_REVISION_CHIP_REV_MINOR_MASK; + + /* obtain the hardware ID - NOT related to unit, which is a + software enumeration */ + reg = read_csr(dd, CCE_REVISION2); + dd->hfi1_id = (reg >> CCE_REVISION2_HFI_ID_SHIFT) + & CCE_REVISION2_HFI_ID_MASK; + /* the variable size will remove unwanted bits */ + dd->icode = reg >> CCE_REVISION2_IMPL_CODE_SHIFT; + dd->irev = reg >> CCE_REVISION2_IMPL_REVISION_SHIFT; + dd_dev_info(dd, "Implementation: %s, revision 0x%x\n", + dd->icode < ARRAY_SIZE(inames) ? inames[dd->icode] : "unknown", + (int)dd->irev); + + /* speeds the hardware can support */ + dd->pport->link_speed_supported = OPA_LINK_SPEED_25G; + /* speeds allowed to run at */ + dd->pport->link_speed_enabled = dd->pport->link_speed_supported; + /* give a reasonable active value, will be set on link up */ + dd->pport->link_speed_active = OPA_LINK_SPEED_25G; + + dd->chip_rcv_contexts = read_csr(dd, RCV_CONTEXTS); + dd->chip_send_contexts = read_csr(dd, SEND_CONTEXTS); + dd->chip_sdma_engines = read_csr(dd, SEND_DMA_ENGINES); + dd->chip_pio_mem_size = read_csr(dd, SEND_PIO_MEM_SIZE); + dd->chip_sdma_mem_size = read_csr(dd, SEND_DMA_MEM_SIZE); + /* fix up link widths for emulation _p */ + ppd = dd->pport; + if (dd->icode == ICODE_FPGA_EMULATION && is_emulator_p(dd)) { + ppd->link_width_supported = + ppd->link_width_enabled = + ppd->link_width_downgrade_supported = + ppd->link_width_downgrade_enabled = + OPA_LINK_WIDTH_1X; + } + /* insure num_vls isn't larger than number of sdma engines */ + if (HFI1_CAP_IS_KSET(SDMA) && num_vls > dd->chip_sdma_engines) { + dd_dev_err(dd, "num_vls %u too large, using %u VLs\n", + num_vls, HFI1_MAX_VLS_SUPPORTED); + ppd->vls_supported = num_vls = HFI1_MAX_VLS_SUPPORTED; + ppd->vls_operational = ppd->vls_supported; + } + + /* + * Convert the ns parameter to the 64 * cclocks used in the CSR. + * Limit the max if larger than the field holds. If timeout is + * non-zero, then the calculated field will be at least 1. + * + * Must be after icode is set up - the cclock rate depends + * on knowing the hardware being used. + */ + dd->rcv_intr_timeout_csr = ns_to_cclock(dd, rcv_intr_timeout) / 64; + if (dd->rcv_intr_timeout_csr > + RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK) + dd->rcv_intr_timeout_csr = + RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK; + else if (dd->rcv_intr_timeout_csr == 0 && rcv_intr_timeout) + dd->rcv_intr_timeout_csr = 1; + + /* obtain chip sizes, reset chip CSRs */ + init_chip(dd); + + /* read in the PCIe link speed information */ + ret = pcie_speeds(dd); + if (ret) + goto bail_cleanup; + + /* needs to be done before we look for the peer device */ + read_guid(dd); + + asic_should_init(dd); + + /* read in firmware */ + ret = hfi1_firmware_init(dd); + if (ret) + goto bail_cleanup; + + /* + * In general, the PCIe Gen3 transition must occur after the + * chip has been idled (so it won't initiate any PCIe transactions + * e.g. an interrupt) and before the driver changes any registers + * (the transition will reset the registers). + * + * In particular, place this call after: + * - init_chip() - the chip will not initiate any PCIe transactions + * - pcie_speeds() - reads the current link speed + * - hfi1_firmware_init() - the needed firmware is ready to be + * downloaded + */ + ret = do_pcie_gen3_transition(dd); + if (ret) + goto bail_cleanup; + + /* start setting dd values and adjusting CSRs */ + init_early_variables(dd); + + parse_platform_config(dd); + + /* add board names as they are defined */ + dd->boardname = kmalloc(64, GFP_KERNEL); + if (!dd->boardname) + goto bail_cleanup; + snprintf(dd->boardname, 64, "Board ID 0x%llx", + dd->revision >> CCE_REVISION_BOARD_ID_LOWER_NIBBLE_SHIFT + & CCE_REVISION_BOARD_ID_LOWER_NIBBLE_MASK); + + snprintf(dd->boardversion, BOARD_VERS_MAX, + "ChipABI %u.%u, %s, ChipRev %u.%u, SW Compat %llu\n", + HFI1_CHIP_VERS_MAJ, HFI1_CHIP_VERS_MIN, + dd->boardname, + (u32)dd->majrev, + (u32)dd->minrev, + (dd->revision >> CCE_REVISION_SW_SHIFT) + & CCE_REVISION_SW_MASK); + + ret = set_up_context_variables(dd); + if (ret) + goto bail_cleanup; + + /* set initial RXE CSRs */ + init_rxe(dd); + /* set initial TXE CSRs */ + init_txe(dd); + /* set initial non-RXE, non-TXE CSRs */ + init_other(dd); + /* set up KDETH QP prefix in both RX and TX CSRs */ + init_kdeth_qp(dd); + + /* send contexts must be set up before receive contexts */ + ret = init_send_contexts(dd); + if (ret) + goto bail_cleanup; + + ret = hfi1_create_ctxts(dd); + if (ret) + goto bail_cleanup; + + dd->rcvhdrsize = DEFAULT_RCVHDRSIZE; + /* + * rcd[0] is guaranteed to be valid by this point. Also, all + * context are using the same value, as per the module parameter. + */ + dd->rhf_offset = dd->rcd[0]->rcvhdrqentsize - sizeof(u64) / sizeof(u32); + + ret = init_pervl_scs(dd); + if (ret) + goto bail_cleanup; + + /* sdma init */ + for (i = 0; i < dd->num_pports; ++i) { + ret = sdma_init(dd, i); + if (ret) + goto bail_cleanup; + } + + /* use contexts created by hfi1_create_ctxts */ + ret = set_up_interrupts(dd); + if (ret) + goto bail_cleanup; + + /* set up LCB access - must be after set_up_interrupts() */ + init_lcb_access(dd); + + snprintf(dd->serial, SERIAL_MAX, "0x%08llx\n", + dd->base_guid & 0xFFFFFF); + + dd->oui1 = dd->base_guid >> 56 & 0xFF; + dd->oui2 = dd->base_guid >> 48 & 0xFF; + dd->oui3 = dd->base_guid >> 40 & 0xFF; + + ret = load_firmware(dd); /* asymmetric with dispose_firmware() */ + if (ret) + goto bail_clear_intr; + check_fabric_firmware_versions(dd); + + thermal_init(dd); + + ret = init_cntrs(dd); + if (ret) + goto bail_clear_intr; + + ret = init_rcverr(dd); + if (ret) + goto bail_free_cntrs; + + ret = eprom_init(dd); + if (ret) + goto bail_free_rcverr; + + goto bail; + +bail_free_rcverr: + free_rcverr(dd); +bail_free_cntrs: + free_cntrs(dd); +bail_clear_intr: + clean_up_interrupts(dd); +bail_cleanup: + hfi1_pcie_ddcleanup(dd); +bail_free: + hfi1_free_devdata(dd); + dd = ERR_PTR(ret); +bail: + return dd; +} + +static u16 delay_cycles(struct hfi1_pportdata *ppd, u32 desired_egress_rate, + u32 dw_len) +{ + u32 delta_cycles; + u32 current_egress_rate = ppd->current_egress_rate; + /* rates here are in units of 10^6 bits/sec */ + + if (desired_egress_rate == -1) + return 0; /* shouldn't happen */ + + if (desired_egress_rate >= current_egress_rate) + return 0; /* we can't help go faster, only slower */ + + delta_cycles = egress_cycles(dw_len * 4, desired_egress_rate) - + egress_cycles(dw_len * 4, current_egress_rate); + + return (u16)delta_cycles; +} + + +/** + * create_pbc - build a pbc for transmission + * @flags: special case flags or-ed in built pbc + * @srate: static rate + * @vl: vl + * @dwlen: dword length (header words + data words + pbc words) + * + * Create a PBC with the given flags, rate, VL, and length. + * + * NOTE: The PBC created will not insert any HCRC - all callers but one are + * for verbs, which does not use this PSM feature. The lone other caller + * is for the diagnostic interface which calls this if the user does not + * supply their own PBC. + */ +u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl, + u32 dw_len) +{ + u64 pbc, delay = 0; + + if (unlikely(srate_mbs)) + delay = delay_cycles(ppd, srate_mbs, dw_len); + + pbc = flags + | (delay << PBC_STATIC_RATE_CONTROL_COUNT_SHIFT) + | ((u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT) + | (vl & PBC_VL_MASK) << PBC_VL_SHIFT + | (dw_len & PBC_LENGTH_DWS_MASK) + << PBC_LENGTH_DWS_SHIFT; + + return pbc; +} + +#define SBUS_THERMAL 0x4f +#define SBUS_THERM_MONITOR_MODE 0x1 + +#define THERM_FAILURE(dev, ret, reason) \ + dd_dev_err((dd), \ + "Thermal sensor initialization failed: %s (%d)\n", \ + (reason), (ret)) + +/* + * Initialize the Avago Thermal sensor. + * + * After initialization, enable polling of thermal sensor through + * SBus interface. In order for this to work, the SBus Master + * firmware has to be loaded due to the fact that the HW polling + * logic uses SBus interrupts, which are not supported with + * default firmware. Otherwise, no data will be returned through + * the ASIC_STS_THERM CSR. + */ +static int thermal_init(struct hfi1_devdata *dd) +{ + int ret = 0; + + if (dd->icode != ICODE_RTL_SILICON || + !(dd->flags & HFI1_DO_INIT_ASIC)) + return ret; + + acquire_hw_mutex(dd); + dd_dev_info(dd, "Initializing thermal sensor\n"); + /* Thermal Sensor Initialization */ + /* Step 1: Reset the Thermal SBus Receiver */ + ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0, + RESET_SBUS_RECEIVER, 0); + if (ret) { + THERM_FAILURE(dd, ret, "Bus Reset"); + goto done; + } + /* Step 2: Set Reset bit in Thermal block */ + ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0, + WRITE_SBUS_RECEIVER, 0x1); + if (ret) { + THERM_FAILURE(dd, ret, "Therm Block Reset"); + goto done; + } + /* Step 3: Write clock divider value (100MHz -> 2MHz) */ + ret = sbus_request_slow(dd, SBUS_THERMAL, 0x1, + WRITE_SBUS_RECEIVER, 0x32); + if (ret) { + THERM_FAILURE(dd, ret, "Write Clock Div"); + goto done; + } + /* Step 4: Select temperature mode */ + ret = sbus_request_slow(dd, SBUS_THERMAL, 0x3, + WRITE_SBUS_RECEIVER, + SBUS_THERM_MONITOR_MODE); + if (ret) { + THERM_FAILURE(dd, ret, "Write Mode Sel"); + goto done; + } + /* Step 5: De-assert block reset and start conversion */ + ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0, + WRITE_SBUS_RECEIVER, 0x2); + if (ret) { + THERM_FAILURE(dd, ret, "Write Reset Deassert"); + goto done; + } + /* Step 5.1: Wait for first conversion (21.5ms per spec) */ + msleep(22); + + /* Enable polling of thermal readings */ + write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1); +done: + release_hw_mutex(dd); + return ret; +} + +static void handle_temp_err(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd = &dd->pport[0]; + /* + * Thermal Critical Interrupt + * Put the device into forced freeze mode, take link down to + * offline, and put DC into reset. + */ + dd_dev_emerg(dd, + "Critical temperature reached! Forcing device into freeze mode!\n"); + dd->flags |= HFI1_FORCED_FREEZE; + start_freeze_handling(ppd, FREEZE_SELF|FREEZE_ABORT); + /* + * Shut DC down as much and as quickly as possible. + * + * Step 1: Take the link down to OFFLINE. This will cause the + * 8051 to put the Serdes in reset. However, we don't want to + * go through the entire link state machine since we want to + * shutdown ASAP. Furthermore, this is not a graceful shutdown + * but rather an attempt to save the chip. + * Code below is almost the same as quiet_serdes() but avoids + * all the extra work and the sleeps. + */ + ppd->driver_link_ready = 0; + ppd->link_enabled = 0; + set_physical_link_state(dd, PLS_OFFLINE | + (OPA_LINKDOWN_REASON_SMA_DISABLED << 8)); + /* + * Step 2: Shutdown LCB and 8051 + * After shutdown, do not restore DC_CFG_RESET value. + */ + dc_shutdown(dd); +} diff --git a/drivers/staging/rdma/hfi1/chip.h b/drivers/staging/rdma/hfi1/chip.h new file mode 100644 index 000000000..f89a432c7 --- /dev/null +++ b/drivers/staging/rdma/hfi1/chip.h @@ -0,0 +1,1035 @@ +#ifndef _CHIP_H +#define _CHIP_H +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains all of the defines that is specific to the HFI chip + */ + +/* sizes */ +#define CCE_NUM_MSIX_VECTORS 256 +#define CCE_NUM_INT_CSRS 12 +#define CCE_NUM_INT_MAP_CSRS 96 +#define NUM_INTERRUPT_SOURCES 768 +#define RXE_NUM_CONTEXTS 160 +#define RXE_PER_CONTEXT_SIZE 0x1000 /* 4k */ +#define RXE_NUM_TID_FLOWS 32 +#define RXE_NUM_DATA_VL 8 +#define TXE_NUM_CONTEXTS 160 +#define TXE_NUM_SDMA_ENGINES 16 +#define NUM_CONTEXTS_PER_SET 8 +#define VL_ARB_HIGH_PRIO_TABLE_SIZE 16 +#define VL_ARB_LOW_PRIO_TABLE_SIZE 16 +#define VL_ARB_TABLE_SIZE 16 +#define TXE_NUM_32_BIT_COUNTER 7 +#define TXE_NUM_64_BIT_COUNTER 30 +#define TXE_NUM_DATA_VL 8 +#define TXE_PIO_SIZE (32 * 0x100000) /* 32 MB */ +#define PIO_BLOCK_SIZE 64 /* bytes */ +#define SDMA_BLOCK_SIZE 64 /* bytes */ +#define RCV_BUF_BLOCK_SIZE 64 /* bytes */ +#define PIO_CMASK 0x7ff /* counter mask for free and fill counters */ +#define MAX_EAGER_ENTRIES 2048 /* max receive eager entries */ +#define MAX_TID_PAIR_ENTRIES 1024 /* max receive expected pairs */ +/* Virtual? Allocation Unit, defined as AU = 8*2^vAU, 64 bytes, AU is fixed + at 64 bytes for all generation one devices */ +#define CM_VAU 3 +/* HFI link credit count, AKA receive buffer depth (RBUF_DEPTH) */ +#define CM_GLOBAL_CREDITS 0x940 +/* Number of PKey entries in the HW */ +#define MAX_PKEY_VALUES 16 + +#include "chip_registers.h" + +#define RXE_PER_CONTEXT_USER (RXE + RXE_PER_CONTEXT_OFFSET) +#define TXE_PIO_SEND (TXE + TXE_PIO_SEND_OFFSET) + +/* PBC flags */ +#define PBC_INTR (1ull << 31) +#define PBC_DC_INFO_SHIFT (30) +#define PBC_DC_INFO (1ull << PBC_DC_INFO_SHIFT) +#define PBC_TEST_EBP (1ull << 29) +#define PBC_PACKET_BYPASS (1ull << 28) +#define PBC_CREDIT_RETURN (1ull << 25) +#define PBC_INSERT_BYPASS_ICRC (1ull << 24) +#define PBC_TEST_BAD_ICRC (1ull << 23) +#define PBC_FECN (1ull << 22) + +/* PbcInsertHcrc field settings */ +#define PBC_IHCRC_LKDETH 0x0 /* insert @ local KDETH offset */ +#define PBC_IHCRC_GKDETH 0x1 /* insert @ global KDETH offset */ +#define PBC_IHCRC_NONE 0x2 /* no HCRC inserted */ + +/* PBC fields */ +#define PBC_STATIC_RATE_CONTROL_COUNT_SHIFT 32 +#define PBC_STATIC_RATE_CONTROL_COUNT_MASK 0xffffull +#define PBC_STATIC_RATE_CONTROL_COUNT_SMASK \ + (PBC_STATIC_RATE_CONTROL_COUNT_MASK << \ + PBC_STATIC_RATE_CONTROL_COUNT_SHIFT) + +#define PBC_INSERT_HCRC_SHIFT 26 +#define PBC_INSERT_HCRC_MASK 0x3ull +#define PBC_INSERT_HCRC_SMASK \ + (PBC_INSERT_HCRC_MASK << PBC_INSERT_HCRC_SHIFT) + +#define PBC_VL_SHIFT 12 +#define PBC_VL_MASK 0xfull +#define PBC_VL_SMASK (PBC_VL_MASK << PBC_VL_SHIFT) + +#define PBC_LENGTH_DWS_SHIFT 0 +#define PBC_LENGTH_DWS_MASK 0xfffull +#define PBC_LENGTH_DWS_SMASK \ + (PBC_LENGTH_DWS_MASK << PBC_LENGTH_DWS_SHIFT) + +/* Credit Return Fields */ +#define CR_COUNTER_SHIFT 0 +#define CR_COUNTER_MASK 0x7ffull +#define CR_COUNTER_SMASK (CR_COUNTER_MASK << CR_COUNTER_SHIFT) + +#define CR_STATUS_SHIFT 11 +#define CR_STATUS_MASK 0x1ull +#define CR_STATUS_SMASK (CR_STATUS_MASK << CR_STATUS_SHIFT) + +#define CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT 12 +#define CR_CREDIT_RETURN_DUE_TO_PBC_MASK 0x1ull +#define CR_CREDIT_RETURN_DUE_TO_PBC_SMASK \ + (CR_CREDIT_RETURN_DUE_TO_PBC_MASK << \ + CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT) + +#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT 13 +#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK 0x1ull +#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK \ + (CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK << \ + CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT) + +#define CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT 14 +#define CR_CREDIT_RETURN_DUE_TO_ERR_MASK 0x1ull +#define CR_CREDIT_RETURN_DUE_TO_ERR_SMASK \ + (CR_CREDIT_RETURN_DUE_TO_ERR_MASK << \ + CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT) + +#define CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT 15 +#define CR_CREDIT_RETURN_DUE_TO_FORCE_MASK 0x1ull +#define CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK \ + (CR_CREDIT_RETURN_DUE_TO_FORCE_MASK << \ + CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT) + +/* interrupt source numbers */ +#define IS_GENERAL_ERR_START 0 +#define IS_SDMAENG_ERR_START 16 +#define IS_SENDCTXT_ERR_START 32 +#define IS_SDMA_START 192 /* includes SDmaProgress,SDmaIdle */ +#define IS_VARIOUS_START 240 +#define IS_DC_START 248 +#define IS_RCVAVAIL_START 256 +#define IS_RCVURGENT_START 416 +#define IS_SENDCREDIT_START 576 +#define IS_RESERVED_START 736 +#define IS_MAX_SOURCES 768 + +/* derived interrupt source values */ +#define IS_GENERAL_ERR_END IS_SDMAENG_ERR_START +#define IS_SDMAENG_ERR_END IS_SENDCTXT_ERR_START +#define IS_SENDCTXT_ERR_END IS_SDMA_START +#define IS_SDMA_END IS_VARIOUS_START +#define IS_VARIOUS_END IS_DC_START +#define IS_DC_END IS_RCVAVAIL_START +#define IS_RCVAVAIL_END IS_RCVURGENT_START +#define IS_RCVURGENT_END IS_SENDCREDIT_START +#define IS_SENDCREDIT_END IS_RESERVED_START +#define IS_RESERVED_END IS_MAX_SOURCES + +/* absolute interrupt numbers for QSFP1Int and QSFP2Int */ +#define QSFP1_INT 242 +#define QSFP2_INT 243 + +/* DCC_CFG_PORT_CONFIG logical link states */ +#define LSTATE_DOWN 0x1 +#define LSTATE_INIT 0x2 +#define LSTATE_ARMED 0x3 +#define LSTATE_ACTIVE 0x4 + +/* DC8051_STS_CUR_STATE port values (physical link states) */ +#define PLS_DISABLED 0x30 +#define PLS_OFFLINE 0x90 +#define PLS_OFFLINE_QUIET 0x90 +#define PLS_OFFLINE_PLANNED_DOWN_INFORM 0x91 +#define PLS_OFFLINE_READY_TO_QUIET_LT 0x92 +#define PLS_OFFLINE_REPORT_FAILURE 0x93 +#define PLS_OFFLINE_READY_TO_QUIET_BCC 0x94 +#define PLS_POLLING 0x20 +#define PLS_POLLING_QUIET 0x20 +#define PLS_POLLING_ACTIVE 0x21 +#define PLS_CONFIGPHY 0x40 +#define PLS_CONFIGPHY_DEBOUCE 0x40 +#define PLS_CONFIGPHY_ESTCOMM 0x41 +#define PLS_CONFIGPHY_ESTCOMM_TXRX_HUNT 0x42 +#define PLS_CONFIGPHY_ESTcOMM_LOCAL_COMPLETE 0x43 +#define PLS_CONFIGPHY_OPTEQ 0x44 +#define PLS_CONFIGPHY_OPTEQ_OPTIMIZING 0x44 +#define PLS_CONFIGPHY_OPTEQ_LOCAL_COMPLETE 0x45 +#define PLS_CONFIGPHY_VERIFYCAP 0x46 +#define PLS_CONFIGPHY_VERIFYCAP_EXCHANGE 0x46 +#define PLS_CONFIGPHY_VERIFYCAP_LOCAL_COMPLETE 0x47 +#define PLS_CONFIGLT 0x48 +#define PLS_CONFIGLT_CONFIGURE 0x48 +#define PLS_CONFIGLT_LINK_TRANSFER_ACTIVE 0x49 +#define PLS_LINKUP 0x50 +#define PLS_PHYTEST 0xB0 +#define PLS_INTERNAL_SERDES_LOOPBACK 0xe1 +#define PLS_QUICK_LINKUP 0xe2 + +/* DC_DC8051_CFG_HOST_CMD_0.REQ_TYPE - 8051 host commands */ +#define HCMD_LOAD_CONFIG_DATA 0x01 +#define HCMD_READ_CONFIG_DATA 0x02 +#define HCMD_CHANGE_PHY_STATE 0x03 +#define HCMD_SEND_LCB_IDLE_MSG 0x04 +#define HCMD_MISC 0x05 +#define HCMD_READ_LCB_IDLE_MSG 0x06 +#define HCMD_READ_LCB_CSR 0x07 +#define HCMD_INTERFACE_TEST 0xff + +/* DC_DC8051_CFG_HOST_CMD_1.RETURN_CODE - 8051 host command return */ +#define HCMD_SUCCESS 2 + +/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR - error flags */ +#define SPICO_ROM_FAILED (1 << 0) +#define UNKNOWN_FRAME (1 << 1) +#define TARGET_BER_NOT_MET (1 << 2) +#define FAILED_SERDES_INTERNAL_LOOPBACK (1 << 3) +#define FAILED_SERDES_INIT (1 << 4) +#define FAILED_LNI_POLLING (1 << 5) +#define FAILED_LNI_DEBOUNCE (1 << 6) +#define FAILED_LNI_ESTBCOMM (1 << 7) +#define FAILED_LNI_OPTEQ (1 << 8) +#define FAILED_LNI_VERIFY_CAP1 (1 << 9) +#define FAILED_LNI_VERIFY_CAP2 (1 << 10) +#define FAILED_LNI_CONFIGLT (1 << 11) + +#define FAILED_LNI (FAILED_LNI_POLLING | FAILED_LNI_DEBOUNCE \ + | FAILED_LNI_ESTBCOMM | FAILED_LNI_OPTEQ \ + | FAILED_LNI_VERIFY_CAP1 \ + | FAILED_LNI_VERIFY_CAP2 \ + | FAILED_LNI_CONFIGLT) + +/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG - host message flags */ +#define HOST_REQ_DONE (1 << 0) +#define BC_PWR_MGM_MSG (1 << 1) +#define BC_SMA_MSG (1 << 2) +#define BC_BCC_UNKOWN_MSG (1 << 3) +#define BC_IDLE_UNKNOWN_MSG (1 << 4) +#define EXT_DEVICE_CFG_REQ (1 << 5) +#define VERIFY_CAP_FRAME (1 << 6) +#define LINKUP_ACHIEVED (1 << 7) +#define LINK_GOING_DOWN (1 << 8) +#define LINK_WIDTH_DOWNGRADED (1 << 9) + +/* DC_DC8051_CFG_EXT_DEV_1.REQ_TYPE - 8051 host requests */ +#define HREQ_LOAD_CONFIG 0x01 +#define HREQ_SAVE_CONFIG 0x02 +#define HREQ_READ_CONFIG 0x03 +#define HREQ_SET_TX_EQ_ABS 0x04 +#define HREQ_SET_TX_EQ_REL 0x05 +#define HREQ_ENABLE 0x06 +#define HREQ_CONFIG_DONE 0xfe +#define HREQ_INTERFACE_TEST 0xff + +/* DC_DC8051_CFG_EXT_DEV_0.RETURN_CODE - 8051 host request return codes */ +#define HREQ_INVALID 0x01 +#define HREQ_SUCCESS 0x02 +#define HREQ_NOT_SUPPORTED 0x03 +#define HREQ_FEATURE_NOT_SUPPORTED 0x04 /* request specific feature */ +#define HREQ_REQUEST_REJECTED 0xfe +#define HREQ_EXECUTION_ONGOING 0xff + +/* MISC host command functions */ +#define HCMD_MISC_REQUEST_LCB_ACCESS 0x1 +#define HCMD_MISC_GRANT_LCB_ACCESS 0x2 + +/* idle flit message types */ +#define IDLE_PHYSICAL_LINK_MGMT 0x1 +#define IDLE_CRU 0x2 +#define IDLE_SMA 0x3 +#define IDLE_POWER_MGMT 0x4 + +/* idle flit message send fields (both send and read) */ +#define IDLE_PAYLOAD_MASK 0xffffffffffull /* 40 bits */ +#define IDLE_PAYLOAD_SHIFT 8 +#define IDLE_MSG_TYPE_MASK 0xf +#define IDLE_MSG_TYPE_SHIFT 0 + +/* idle flit message read fields */ +#define READ_IDLE_MSG_TYPE_MASK 0xf +#define READ_IDLE_MSG_TYPE_SHIFT 0 + +/* SMA idle flit payload commands */ +#define SMA_IDLE_ARM 1 +#define SMA_IDLE_ACTIVE 2 + +/* DC_DC8051_CFG_MODE.GENERAL bits */ +#define DISABLE_SELF_GUID_CHECK 0x2 + +/* + * Eager buffer minimum and maximum sizes supported by the hardware. + * All power-of-two sizes in between are supported as well. + * MAX_EAGER_BUFFER_TOTAL is the maximum size of memory + * allocatable for Eager buffer to a single context. All others + * are limits for the RcvArray entries. + */ +#define MIN_EAGER_BUFFER (4 * 1024) +#define MAX_EAGER_BUFFER (256 * 1024) +#define MAX_EAGER_BUFFER_TOTAL (64 * (1 << 20)) /* max per ctxt 64MB */ +#define MAX_EXPECTED_BUFFER (2048 * 1024) + +/* + * Receive expected base and count and eager base and count increment - + * the CSR fields hold multiples of this value. + */ +#define RCV_SHIFT 3 +#define RCV_INCREMENT (1 << RCV_SHIFT) + +/* + * Receive header queue entry increment - the CSR holds multiples of + * this value. + */ +#define HDRQ_SIZE_SHIFT 5 +#define HDRQ_INCREMENT (1 << HDRQ_SIZE_SHIFT) + +/* + * Freeze handling flags + */ +#define FREEZE_ABORT 0x01 /* do not do recovery */ +#define FREEZE_SELF 0x02 /* initiate the freeze */ +#define FREEZE_LINK_DOWN 0x04 /* link is down */ + +/* + * Chip implementation codes. + */ +#define ICODE_RTL_SILICON 0x00 +#define ICODE_RTL_VCS_SIMULATION 0x01 +#define ICODE_FPGA_EMULATION 0x02 +#define ICODE_FUNCTIONAL_SIMULATOR 0x03 + +/* + * 8051 data memory size. + */ +#define DC8051_DATA_MEM_SIZE 0x1000 + +/* + * 8051 firmware registers + */ +#define NUM_GENERAL_FIELDS 0x17 +#define NUM_LANE_FIELDS 0x8 + +/* 8051 general register Field IDs */ +#define TX_SETTINGS 0x06 +#define VERIFY_CAP_LOCAL_PHY 0x07 +#define VERIFY_CAP_LOCAL_FABRIC 0x08 +#define VERIFY_CAP_LOCAL_LINK_WIDTH 0x09 +#define LOCAL_DEVICE_ID 0x0a +#define LOCAL_LNI_INFO 0x0c +#define REMOTE_LNI_INFO 0x0d +#define MISC_STATUS 0x0e +#define VERIFY_CAP_REMOTE_PHY 0x0f +#define VERIFY_CAP_REMOTE_FABRIC 0x10 +#define VERIFY_CAP_REMOTE_LINK_WIDTH 0x11 +#define LAST_LOCAL_STATE_COMPLETE 0x12 +#define LAST_REMOTE_STATE_COMPLETE 0x13 +#define LINK_QUALITY_INFO 0x14 +#define REMOTE_DEVICE_ID 0x15 + +/* Lane ID for general configuration registers */ +#define GENERAL_CONFIG 4 + +/* LOAD_DATA 8051 command shifts and fields */ +#define LOAD_DATA_FIELD_ID_SHIFT 40 +#define LOAD_DATA_FIELD_ID_MASK 0xfull +#define LOAD_DATA_LANE_ID_SHIFT 32 +#define LOAD_DATA_LANE_ID_MASK 0xfull +#define LOAD_DATA_DATA_SHIFT 0x0 +#define LOAD_DATA_DATA_MASK 0xffffffffull + +/* READ_DATA 8051 command shifts and fields */ +#define READ_DATA_FIELD_ID_SHIFT 40 +#define READ_DATA_FIELD_ID_MASK 0xffull +#define READ_DATA_LANE_ID_SHIFT 32 +#define READ_DATA_LANE_ID_MASK 0xffull +#define READ_DATA_DATA_SHIFT 0x0 +#define READ_DATA_DATA_MASK 0xffffffffull + +/* TX settings fields */ +#define ENABLE_LANE_TX_SHIFT 0 +#define ENABLE_LANE_TX_MASK 0xff +#define TX_POLARITY_INVERSION_SHIFT 8 +#define TX_POLARITY_INVERSION_MASK 0xff +#define RX_POLARITY_INVERSION_SHIFT 16 +#define RX_POLARITY_INVERSION_MASK 0xff +#define MAX_RATE_SHIFT 24 +#define MAX_RATE_MASK 0xff + +/* verify capability PHY fields */ +#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT 0x4 +#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK 0x1 +#define POWER_MANAGEMENT_SHIFT 0x0 +#define POWER_MANAGEMENT_MASK 0xf + +/* 8051 lane register Field IDs */ +#define SPICO_FW_VERSION 0x7 /* SPICO firmware version */ + +/* SPICO firmware version fields */ +#define SPICO_ROM_VERSION_SHIFT 0 +#define SPICO_ROM_VERSION_MASK 0xffff +#define SPICO_ROM_PROD_ID_SHIFT 16 +#define SPICO_ROM_PROD_ID_MASK 0xffff + +/* verify capability fabric fields */ +#define VAU_SHIFT 0 +#define VAU_MASK 0x0007 +#define Z_SHIFT 3 +#define Z_MASK 0x0001 +#define VCU_SHIFT 4 +#define VCU_MASK 0x0007 +#define VL15BUF_SHIFT 8 +#define VL15BUF_MASK 0x0fff +#define CRC_SIZES_SHIFT 20 +#define CRC_SIZES_MASK 0x7 + +/* verify capability local link width fields */ +#define LINK_WIDTH_SHIFT 0 /* also for remote link width */ +#define LINK_WIDTH_MASK 0xffff /* also for remote link width */ +#define LOCAL_FLAG_BITS_SHIFT 16 +#define LOCAL_FLAG_BITS_MASK 0xff +#define MISC_CONFIG_BITS_SHIFT 24 +#define MISC_CONFIG_BITS_MASK 0xff + +/* verify capability remote link width fields */ +#define REMOTE_TX_RATE_SHIFT 16 +#define REMOTE_TX_RATE_MASK 0xff + +/* LOCAL_DEVICE_ID fields */ +#define LOCAL_DEVICE_REV_SHIFT 0 +#define LOCAL_DEVICE_REV_MASK 0xff +#define LOCAL_DEVICE_ID_SHIFT 8 +#define LOCAL_DEVICE_ID_MASK 0xffff + +/* REMOTE_DEVICE_ID fields */ +#define REMOTE_DEVICE_REV_SHIFT 0 +#define REMOTE_DEVICE_REV_MASK 0xff +#define REMOTE_DEVICE_ID_SHIFT 8 +#define REMOTE_DEVICE_ID_MASK 0xffff + +/* local LNI link width fields */ +#define ENABLE_LANE_RX_SHIFT 16 +#define ENABLE_LANE_RX_MASK 0xff + +/* mask, shift for reading 'mgmt_enabled' value from REMOTE_LNI_INFO field */ +#define MGMT_ALLOWED_SHIFT 23 +#define MGMT_ALLOWED_MASK 0x1 + +/* mask, shift for 'link_quality' within LINK_QUALITY_INFO field */ +#define LINK_QUALITY_SHIFT 24 +#define LINK_QUALITY_MASK 0x7 + +/* + * mask, shift for reading 'planned_down_remote_reason_code' + * from LINK_QUALITY_INFO field + */ +#define DOWN_REMOTE_REASON_SHIFT 16 +#define DOWN_REMOTE_REASON_MASK 0xff + +/* verify capability PHY power management bits */ +#define PWRM_BER_CONTROL 0x1 +#define PWRM_BANDWIDTH_CONTROL 0x2 + +/* verify capability fabric CRC size bits */ +enum { + CAP_CRC_14B = (1 << 0), /* 14b CRC */ + CAP_CRC_48B = (1 << 1), /* 48b CRC */ + CAP_CRC_12B_16B_PER_LANE = (1 << 2) /* 12b-16b per lane CRC */ +}; + +#define SUPPORTED_CRCS (CAP_CRC_14B | CAP_CRC_48B) + +/* misc status version fields */ +#define STS_FM_VERSION_A_SHIFT 16 +#define STS_FM_VERSION_A_MASK 0xff +#define STS_FM_VERSION_B_SHIFT 24 +#define STS_FM_VERSION_B_MASK 0xff + +/* LCB_CFG_CRC_MODE TX_VAL and RX_VAL CRC mode values */ +#define LCB_CRC_16B 0x0 /* 16b CRC */ +#define LCB_CRC_14B 0x1 /* 14b CRC */ +#define LCB_CRC_48B 0x2 /* 48b CRC */ +#define LCB_CRC_12B_16B_PER_LANE 0x3 /* 12b-16b per lane CRC */ + +/* the following enum is (almost) a copy/paste of the definition + * in the OPA spec, section 20.2.2.6.8 (PortInfo) */ +enum { + PORT_LTP_CRC_MODE_NONE = 0, + PORT_LTP_CRC_MODE_14 = 1, /* 14-bit LTP CRC mode (optional) */ + PORT_LTP_CRC_MODE_16 = 2, /* 16-bit LTP CRC mode */ + PORT_LTP_CRC_MODE_48 = 4, + /* 48-bit overlapping LTP CRC mode (optional) */ + PORT_LTP_CRC_MODE_PER_LANE = 8 + /* 12 to 16 bit per lane LTP CRC mode (optional) */ +}; + +/* timeouts */ +#define LINK_RESTART_DELAY 1000 /* link restart delay, in ms */ +#define TIMEOUT_8051_START 5000 /* 8051 start timeout, in ms */ +#define DC8051_COMMAND_TIMEOUT 20000 /* DC8051 command timeout, in ms */ +#define FREEZE_STATUS_TIMEOUT 20 /* wait for freeze indicators, in ms */ +#define VL_STATUS_CLEAR_TIMEOUT 5000 /* per-VL status clear, in ms */ +#define CCE_STATUS_TIMEOUT 10 /* time to clear CCE Status, in ms */ + +/* cclock tick time, in picoseconds per tick: 1/speed * 10^12 */ +#define ASIC_CCLOCK_PS 1242 /* 805 MHz */ +#define FPGA_CCLOCK_PS 30300 /* 33 MHz */ + +/* + * Mask of enabled MISC errors. Do not enable the two RSA engine errors - + * see firmware.c:run_rsa() for details. + */ +#define DRIVER_MISC_MASK \ + (~(MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK \ + | MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK)) + +/* valid values for the loopback module parameter */ +#define LOOPBACK_NONE 0 /* no loopback - default */ +#define LOOPBACK_SERDES 1 +#define LOOPBACK_LCB 2 +#define LOOPBACK_CABLE 3 /* external cable */ + +/* read and write hardware registers */ +u64 read_csr(const struct hfi1_devdata *dd, u32 offset); +void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value); + +/* + * The *_kctxt_* flavor of the CSR read/write functions are for + * per-context or per-SDMA CSRs that are not mappable to user-space. + * Their spacing is not a PAGE_SIZE multiple. + */ +static inline u64 read_kctxt_csr(const struct hfi1_devdata *dd, int ctxt, + u32 offset0) +{ + /* kernel per-context CSRs are separated by 0x100 */ + return read_csr(dd, offset0 + (0x100 * ctxt)); +} + +static inline void write_kctxt_csr(struct hfi1_devdata *dd, int ctxt, + u32 offset0, u64 value) +{ + /* kernel per-context CSRs are separated by 0x100 */ + write_csr(dd, offset0 + (0x100 * ctxt), value); +} + +int read_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 *data); +int write_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 data); + +void __iomem *get_csr_addr( + struct hfi1_devdata *dd, + u32 offset); + +static inline void __iomem *get_kctxt_csr_addr( + struct hfi1_devdata *dd, + int ctxt, + u32 offset0) +{ + return get_csr_addr(dd, offset0 + (0x100 * ctxt)); +} + +/* + * The *_uctxt_* flavor of the CSR read/write functions are for + * per-context CSRs that are mappable to user space. All these CSRs + * are spaced by a PAGE_SIZE multiple in order to be mappable to + * different processes without exposing other contexts' CSRs + */ +static inline u64 read_uctxt_csr(const struct hfi1_devdata *dd, int ctxt, + u32 offset0) +{ + /* user per-context CSRs are separated by 0x1000 */ + return read_csr(dd, offset0 + (0x1000 * ctxt)); +} + +static inline void write_uctxt_csr(struct hfi1_devdata *dd, int ctxt, + u32 offset0, u64 value) +{ + /* user per-context CSRs are separated by 0x1000 */ + write_csr(dd, offset0 + (0x1000 * ctxt), value); +} + +u64 create_pbc(struct hfi1_pportdata *ppd, u64, int, u32, u32); + +/* firmware.c */ +#define NUM_PCIE_SERDES 16 /* number of PCIe serdes on the SBus */ +extern const u8 pcie_serdes_broadcast[]; +extern const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES]; +/* SBus commands */ +#define RESET_SBUS_RECEIVER 0x20 +#define WRITE_SBUS_RECEIVER 0x21 +void sbus_request(struct hfi1_devdata *dd, + u8 receiver_addr, u8 data_addr, u8 command, u32 data_in); +int sbus_request_slow(struct hfi1_devdata *dd, + u8 receiver_addr, u8 data_addr, u8 command, u32 data_in); +void set_sbus_fast_mode(struct hfi1_devdata *dd); +void clear_sbus_fast_mode(struct hfi1_devdata *dd); +int hfi1_firmware_init(struct hfi1_devdata *dd); +int load_pcie_firmware(struct hfi1_devdata *dd); +int load_firmware(struct hfi1_devdata *dd); +void dispose_firmware(void); +int acquire_hw_mutex(struct hfi1_devdata *dd); +void release_hw_mutex(struct hfi1_devdata *dd); +void fabric_serdes_reset(struct hfi1_devdata *dd); +int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result); + +/* chip.c */ +void read_misc_status(struct hfi1_devdata *dd, u8 *ver_a, u8 *ver_b); +void read_guid(struct hfi1_devdata *dd); +int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout); +void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason, + u8 neigh_reason, u8 rem_reason); +int set_link_state(struct hfi1_pportdata *, u32 state); +int port_ltp_to_cap(int port_ltp); +void handle_verify_cap(struct work_struct *work); +void handle_freeze(struct work_struct *work); +void handle_link_up(struct work_struct *work); +void handle_link_down(struct work_struct *work); +void handle_link_downgrade(struct work_struct *work); +void handle_link_bounce(struct work_struct *work); +void handle_sma_message(struct work_struct *work); +void start_freeze_handling(struct hfi1_pportdata *ppd, int flags); +int send_idle_sma(struct hfi1_devdata *dd, u64 message); +int start_link(struct hfi1_pportdata *ppd); +void init_qsfp(struct hfi1_pportdata *ppd); +int bringup_serdes(struct hfi1_pportdata *ppd); +void set_intr_state(struct hfi1_devdata *dd, u32 enable); +void apply_link_downgrade_policy(struct hfi1_pportdata *ppd, + int refresh_widths); +void update_usrhead(struct hfi1_ctxtdata *, u32, u32, u32, u32, u32); +int stop_drain_data_vls(struct hfi1_devdata *dd); +int open_fill_data_vls(struct hfi1_devdata *dd); +u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns); +u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclock); +void get_linkup_link_widths(struct hfi1_pportdata *ppd); +void read_ltp_rtt(struct hfi1_devdata *dd); +void clear_linkup_counters(struct hfi1_devdata *dd); +u32 hdrqempty(struct hfi1_ctxtdata *rcd); +int is_a0(struct hfi1_devdata *dd); +int is_ax(struct hfi1_devdata *dd); +int is_bx(struct hfi1_devdata *dd); +u32 read_physical_state(struct hfi1_devdata *dd); +u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate); +u32 get_logical_state(struct hfi1_pportdata *ppd); +const char *opa_lstate_name(u32 lstate); +const char *opa_pstate_name(u32 pstate); +u32 driver_physical_state(struct hfi1_pportdata *ppd); +u32 driver_logical_state(struct hfi1_pportdata *ppd); + +int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok); +int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok); +#define LCB_START DC_LCB_CSRS +#define LCB_END DC_8051_CSRS /* next block is 8051 */ +static inline int is_lcb_offset(u32 offset) +{ + return (offset >= LCB_START && offset < LCB_END); +} + +extern uint num_vls; + +extern uint disable_integrity; +u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl); +u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data); +u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl); +u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data); + +/* Per VL indexes */ +enum { + C_VL_0 = 0, + C_VL_1, + C_VL_2, + C_VL_3, + C_VL_4, + C_VL_5, + C_VL_6, + C_VL_7, + C_VL_15, + C_VL_COUNT +}; + +static inline int vl_from_idx(int idx) +{ + return (idx == C_VL_15 ? 15 : idx); +} + +static inline int idx_from_vl(int vl) +{ + return (vl == 15 ? C_VL_15 : vl); +} + +/* Per device counter indexes */ +enum { + C_RCV_OVF = 0, + C_RX_TID_FULL, + C_RX_TID_INVALID, + C_RX_TID_FLGMS, + C_RX_CTX_RHQS, + C_RX_CTX_EGRS, + C_RCV_TID_FLSMS, + C_CCE_PCI_CR_ST, + C_CCE_PCI_TR_ST, + C_CCE_PIO_WR_ST, + C_CCE_ERR_INT, + C_CCE_SDMA_INT, + C_CCE_MISC_INT, + C_CCE_RCV_AV_INT, + C_CCE_RCV_URG_INT, + C_CCE_SEND_CR_INT, + C_DC_UNC_ERR, + C_DC_RCV_ERR, + C_DC_FM_CFG_ERR, + C_DC_RMT_PHY_ERR, + C_DC_DROPPED_PKT, + C_DC_MC_XMIT_PKTS, + C_DC_MC_RCV_PKTS, + C_DC_XMIT_CERR, + C_DC_RCV_CERR, + C_DC_RCV_FCC, + C_DC_XMIT_FCC, + C_DC_XMIT_FLITS, + C_DC_RCV_FLITS, + C_DC_XMIT_PKTS, + C_DC_RCV_PKTS, + C_DC_RX_FLIT_VL, + C_DC_RX_PKT_VL, + C_DC_RCV_FCN, + C_DC_RCV_FCN_VL, + C_DC_RCV_BCN, + C_DC_RCV_BCN_VL, + C_DC_RCV_BBL, + C_DC_RCV_BBL_VL, + C_DC_MARK_FECN, + C_DC_MARK_FECN_VL, + C_DC_TOTAL_CRC, + C_DC_CRC_LN0, + C_DC_CRC_LN1, + C_DC_CRC_LN2, + C_DC_CRC_LN3, + C_DC_CRC_MULT_LN, + C_DC_TX_REPLAY, + C_DC_RX_REPLAY, + C_DC_SEQ_CRC_CNT, + C_DC_ESC0_ONLY_CNT, + C_DC_ESC0_PLUS1_CNT, + C_DC_ESC0_PLUS2_CNT, + C_DC_REINIT_FROM_PEER_CNT, + C_DC_SBE_CNT, + C_DC_MISC_FLG_CNT, + C_DC_PRF_GOOD_LTP_CNT, + C_DC_PRF_ACCEPTED_LTP_CNT, + C_DC_PRF_RX_FLIT_CNT, + C_DC_PRF_TX_FLIT_CNT, + C_DC_PRF_CLK_CNTR, + C_DC_PG_DBG_FLIT_CRDTS_CNT, + C_DC_PG_STS_PAUSE_COMPLETE_CNT, + C_DC_PG_STS_TX_SBE_CNT, + C_DC_PG_STS_TX_MBE_CNT, + C_SW_CPU_INTR, + C_SW_CPU_RCV_LIM, + C_SW_VTX_WAIT, + C_SW_PIO_WAIT, + C_SW_KMEM_WAIT, + DEV_CNTR_LAST /* Must be kept last */ +}; + +/* Per port counter indexes */ +enum { + C_TX_UNSUP_VL = 0, + C_TX_INVAL_LEN, + C_TX_MM_LEN_ERR, + C_TX_UNDERRUN, + C_TX_FLOW_STALL, + C_TX_DROPPED, + C_TX_HDR_ERR, + C_TX_PKT, + C_TX_WORDS, + C_TX_WAIT, + C_TX_FLIT_VL, + C_TX_PKT_VL, + C_TX_WAIT_VL, + C_RX_PKT, + C_RX_WORDS, + C_SW_LINK_DOWN, + C_SW_LINK_UP, + C_SW_XMIT_DSCD, + C_SW_XMIT_DSCD_VL, + C_SW_XMIT_CSTR_ERR, + C_SW_RCV_CSTR_ERR, + C_SW_IBP_LOOP_PKTS, + C_SW_IBP_RC_RESENDS, + C_SW_IBP_RNR_NAKS, + C_SW_IBP_OTHER_NAKS, + C_SW_IBP_RC_TIMEOUTS, + C_SW_IBP_PKT_DROPS, + C_SW_IBP_DMA_WAIT, + C_SW_IBP_RC_SEQNAK, + C_SW_IBP_RC_DUPREQ, + C_SW_IBP_RDMA_SEQ, + C_SW_IBP_UNALIGNED, + C_SW_IBP_SEQ_NAK, + C_SW_CPU_RC_ACKS, + C_SW_CPU_RC_QACKS, + C_SW_CPU_RC_DELAYED_COMP, + C_RCV_HDR_OVF_0, + C_RCV_HDR_OVF_1, + C_RCV_HDR_OVF_2, + C_RCV_HDR_OVF_3, + C_RCV_HDR_OVF_4, + C_RCV_HDR_OVF_5, + C_RCV_HDR_OVF_6, + C_RCV_HDR_OVF_7, + C_RCV_HDR_OVF_8, + C_RCV_HDR_OVF_9, + C_RCV_HDR_OVF_10, + C_RCV_HDR_OVF_11, + C_RCV_HDR_OVF_12, + C_RCV_HDR_OVF_13, + C_RCV_HDR_OVF_14, + C_RCV_HDR_OVF_15, + C_RCV_HDR_OVF_16, + C_RCV_HDR_OVF_17, + C_RCV_HDR_OVF_18, + C_RCV_HDR_OVF_19, + C_RCV_HDR_OVF_20, + C_RCV_HDR_OVF_21, + C_RCV_HDR_OVF_22, + C_RCV_HDR_OVF_23, + C_RCV_HDR_OVF_24, + C_RCV_HDR_OVF_25, + C_RCV_HDR_OVF_26, + C_RCV_HDR_OVF_27, + C_RCV_HDR_OVF_28, + C_RCV_HDR_OVF_29, + C_RCV_HDR_OVF_30, + C_RCV_HDR_OVF_31, + C_RCV_HDR_OVF_32, + C_RCV_HDR_OVF_33, + C_RCV_HDR_OVF_34, + C_RCV_HDR_OVF_35, + C_RCV_HDR_OVF_36, + C_RCV_HDR_OVF_37, + C_RCV_HDR_OVF_38, + C_RCV_HDR_OVF_39, + C_RCV_HDR_OVF_40, + C_RCV_HDR_OVF_41, + C_RCV_HDR_OVF_42, + C_RCV_HDR_OVF_43, + C_RCV_HDR_OVF_44, + C_RCV_HDR_OVF_45, + C_RCV_HDR_OVF_46, + C_RCV_HDR_OVF_47, + C_RCV_HDR_OVF_48, + C_RCV_HDR_OVF_49, + C_RCV_HDR_OVF_50, + C_RCV_HDR_OVF_51, + C_RCV_HDR_OVF_52, + C_RCV_HDR_OVF_53, + C_RCV_HDR_OVF_54, + C_RCV_HDR_OVF_55, + C_RCV_HDR_OVF_56, + C_RCV_HDR_OVF_57, + C_RCV_HDR_OVF_58, + C_RCV_HDR_OVF_59, + C_RCV_HDR_OVF_60, + C_RCV_HDR_OVF_61, + C_RCV_HDR_OVF_62, + C_RCV_HDR_OVF_63, + C_RCV_HDR_OVF_64, + C_RCV_HDR_OVF_65, + C_RCV_HDR_OVF_66, + C_RCV_HDR_OVF_67, + C_RCV_HDR_OVF_68, + C_RCV_HDR_OVF_69, + C_RCV_HDR_OVF_70, + C_RCV_HDR_OVF_71, + C_RCV_HDR_OVF_72, + C_RCV_HDR_OVF_73, + C_RCV_HDR_OVF_74, + C_RCV_HDR_OVF_75, + C_RCV_HDR_OVF_76, + C_RCV_HDR_OVF_77, + C_RCV_HDR_OVF_78, + C_RCV_HDR_OVF_79, + C_RCV_HDR_OVF_80, + C_RCV_HDR_OVF_81, + C_RCV_HDR_OVF_82, + C_RCV_HDR_OVF_83, + C_RCV_HDR_OVF_84, + C_RCV_HDR_OVF_85, + C_RCV_HDR_OVF_86, + C_RCV_HDR_OVF_87, + C_RCV_HDR_OVF_88, + C_RCV_HDR_OVF_89, + C_RCV_HDR_OVF_90, + C_RCV_HDR_OVF_91, + C_RCV_HDR_OVF_92, + C_RCV_HDR_OVF_93, + C_RCV_HDR_OVF_94, + C_RCV_HDR_OVF_95, + C_RCV_HDR_OVF_96, + C_RCV_HDR_OVF_97, + C_RCV_HDR_OVF_98, + C_RCV_HDR_OVF_99, + C_RCV_HDR_OVF_100, + C_RCV_HDR_OVF_101, + C_RCV_HDR_OVF_102, + C_RCV_HDR_OVF_103, + C_RCV_HDR_OVF_104, + C_RCV_HDR_OVF_105, + C_RCV_HDR_OVF_106, + C_RCV_HDR_OVF_107, + C_RCV_HDR_OVF_108, + C_RCV_HDR_OVF_109, + C_RCV_HDR_OVF_110, + C_RCV_HDR_OVF_111, + C_RCV_HDR_OVF_112, + C_RCV_HDR_OVF_113, + C_RCV_HDR_OVF_114, + C_RCV_HDR_OVF_115, + C_RCV_HDR_OVF_116, + C_RCV_HDR_OVF_117, + C_RCV_HDR_OVF_118, + C_RCV_HDR_OVF_119, + C_RCV_HDR_OVF_120, + C_RCV_HDR_OVF_121, + C_RCV_HDR_OVF_122, + C_RCV_HDR_OVF_123, + C_RCV_HDR_OVF_124, + C_RCV_HDR_OVF_125, + C_RCV_HDR_OVF_126, + C_RCV_HDR_OVF_127, + C_RCV_HDR_OVF_128, + C_RCV_HDR_OVF_129, + C_RCV_HDR_OVF_130, + C_RCV_HDR_OVF_131, + C_RCV_HDR_OVF_132, + C_RCV_HDR_OVF_133, + C_RCV_HDR_OVF_134, + C_RCV_HDR_OVF_135, + C_RCV_HDR_OVF_136, + C_RCV_HDR_OVF_137, + C_RCV_HDR_OVF_138, + C_RCV_HDR_OVF_139, + C_RCV_HDR_OVF_140, + C_RCV_HDR_OVF_141, + C_RCV_HDR_OVF_142, + C_RCV_HDR_OVF_143, + C_RCV_HDR_OVF_144, + C_RCV_HDR_OVF_145, + C_RCV_HDR_OVF_146, + C_RCV_HDR_OVF_147, + C_RCV_HDR_OVF_148, + C_RCV_HDR_OVF_149, + C_RCV_HDR_OVF_150, + C_RCV_HDR_OVF_151, + C_RCV_HDR_OVF_152, + C_RCV_HDR_OVF_153, + C_RCV_HDR_OVF_154, + C_RCV_HDR_OVF_155, + C_RCV_HDR_OVF_156, + C_RCV_HDR_OVF_157, + C_RCV_HDR_OVF_158, + C_RCV_HDR_OVF_159, + PORT_CNTR_LAST /* Must be kept last */ +}; + +u64 get_all_cpu_total(u64 __percpu *cntr); +void hfi1_start_cleanup(struct hfi1_devdata *dd); +void hfi1_clear_tids(struct hfi1_ctxtdata *rcd); +struct hfi1_message_header *hfi1_get_msgheader( + struct hfi1_devdata *dd, __le32 *rhf_addr); +int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd, + struct hfi1_ctxt_info *kinfo); +u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir, + u32 mask); +int hfi1_init_ctxt(struct send_context *sc); +void hfi1_put_tid(struct hfi1_devdata *dd, u32 index, + u32 type, unsigned long pa, u16 order); +void hfi1_quiet_serdes(struct hfi1_pportdata *ppd); +void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt); +u32 hfi1_read_cntrs(struct hfi1_devdata *dd, loff_t pos, char **namep, + u64 **cntrp); +u32 hfi1_read_portcntrs(struct hfi1_devdata *dd, loff_t pos, u32 port, + char **namep, u64 **cntrp); +u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd); +int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which); +int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val); +int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey); +int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt); +int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey); +int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt); +void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality); + +/* + * Interrupt source table. + * + * Each entry is an interrupt source "type". It is ordered by increasing + * number. + */ +struct is_table { + int start; /* interrupt source type start */ + int end; /* interrupt source type end */ + /* routine that returns the name of the interrupt source */ + char *(*is_name)(char *name, size_t size, unsigned int source); + /* routine to call when receiving an interrupt */ + void (*is_int)(struct hfi1_devdata *dd, unsigned int source); +}; + +#endif /* _CHIP_H */ diff --git a/drivers/staging/rdma/hfi1/chip_registers.h b/drivers/staging/rdma/hfi1/chip_registers.h new file mode 100644 index 000000000..bf45de29d --- /dev/null +++ b/drivers/staging/rdma/hfi1/chip_registers.h @@ -0,0 +1,1292 @@ +#ifndef DEF_CHIP_REG +#define DEF_CHIP_REG + +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#define CORE 0x000000000000 +#define CCE (CORE + 0x000000000000) +#define ASIC (CORE + 0x000000400000) +#define MISC (CORE + 0x000000500000) +#define DC_TOP_CSRS (CORE + 0x000000600000) +#define CHIP_DEBUG (CORE + 0x000000700000) +#define RXE (CORE + 0x000001000000) +#define TXE (CORE + 0x000001800000) +#define DCC_CSRS (DC_TOP_CSRS + 0x000000000000) +#define DC_LCB_CSRS (DC_TOP_CSRS + 0x000000001000) +#define DC_8051_CSRS (DC_TOP_CSRS + 0x000000002000) +#define PCIE 0 + +#define ASIC_NUM_SCRATCH 4 +#define CCE_ERR_INT_CNT 0 +#define CCE_MISC_INT_CNT 2 +#define CCE_NUM_32_BIT_COUNTERS 3 +#define CCE_NUM_32_BIT_INT_COUNTERS 6 +#define CCE_NUM_INT_CSRS 12 +#define CCE_NUM_INT_MAP_CSRS 96 +#define CCE_NUM_MSIX_PBAS 4 +#define CCE_NUM_MSIX_VECTORS 256 +#define CCE_NUM_SCRATCH 4 +#define CCE_PCIE_POSTED_CRDT_STALL_CNT 2 +#define CCE_PCIE_TRGT_STALL_CNT 0 +#define CCE_PIO_WR_STALL_CNT 1 +#define CCE_RCV_AVAIL_INT_CNT 3 +#define CCE_RCV_URGENT_INT_CNT 4 +#define CCE_SDMA_INT_CNT 1 +#define CCE_SEND_CREDIT_INT_CNT 5 +#define DCC_CFG_LED_CNTRL (DCC_CSRS + 0x000000000040) +#define DCC_CFG_LED_CNTRL_LED_CNTRL_SMASK 0x10ull +#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SHIFT 0 +#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SMASK 0xFull +#define DCC_CFG_PORT_CONFIG (DCC_CSRS + 0x000000000008) +#define DCC_CFG_PORT_CONFIG1 (DCC_CSRS + 0x000000000010) +#define DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK 0xFFFFull +#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT 16 +#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK 0xFFFF0000ull +#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK 0xFFFFull +#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT 0 +#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK 0xFFFFull +#define DCC_CFG_PORT_CONFIG_LINK_STATE_MASK 0x7ull +#define DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT 48 +#define DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK 0x7000000000000ull +#define DCC_CFG_PORT_CONFIG_MTU_CAP_MASK 0x7ull +#define DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT 32 +#define DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK 0x700000000ull +#define DCC_CFG_RESET (DCC_CSRS + 0x000000000000) +#define DCC_CFG_RESET_RESET_LCB_SHIFT 0 +#define DCC_CFG_RESET_RESET_RX_FPE_SHIFT 2 +#define DCC_CFG_SC_VL_TABLE_15_0 (DCC_CSRS + 0x000000000028) +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY0_SHIFT 0 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY10_SHIFT 40 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY11_SHIFT 44 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY12_SHIFT 48 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY13_SHIFT 52 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY14_SHIFT 56 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY15_SHIFT 60 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY1_SHIFT 4 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY2_SHIFT 8 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY3_SHIFT 12 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY4_SHIFT 16 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY5_SHIFT 20 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY6_SHIFT 24 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY7_SHIFT 28 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY8_SHIFT 32 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY9_SHIFT 36 +#define DCC_CFG_SC_VL_TABLE_31_16 (DCC_CSRS + 0x000000000030) +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY16_SHIFT 0 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY17_SHIFT 4 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY18_SHIFT 8 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY19_SHIFT 12 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY20_SHIFT 16 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY21_SHIFT 20 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY22_SHIFT 24 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY23_SHIFT 28 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY24_SHIFT 32 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY25_SHIFT 36 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY26_SHIFT 40 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY27_SHIFT 44 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY28_SHIFT 48 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY29_SHIFT 52 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY30_SHIFT 56 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY31_SHIFT 60 +#define DCC_ERR_DROPPED_PKT_CNT (DCC_CSRS + 0x000000000120) +#define DCC_ERR_FLG (DCC_CSRS + 0x000000000050) +#define DCC_ERR_FLG_BAD_CRDT_ACK_ERR_SMASK 0x4000ull +#define DCC_ERR_FLG_BAD_CTRL_DIST_ERR_SMASK 0x200000ull +#define DCC_ERR_FLG_BAD_CTRL_FLIT_ERR_SMASK 0x10000ull +#define DCC_ERR_FLG_BAD_DLID_TARGET_ERR_SMASK 0x200ull +#define DCC_ERR_FLG_BAD_HEAD_DIST_ERR_SMASK 0x800000ull +#define DCC_ERR_FLG_BAD_L2_ERR_SMASK 0x2ull +#define DCC_ERR_FLG_BAD_LVER_ERR_SMASK 0x400ull +#define DCC_ERR_FLG_BAD_MID_TAIL_ERR_SMASK 0x8ull +#define DCC_ERR_FLG_BAD_PKT_LENGTH_ERR_SMASK 0x4000000ull +#define DCC_ERR_FLG_BAD_PREEMPTION_ERR_SMASK 0x10ull +#define DCC_ERR_FLG_BAD_SC_ERR_SMASK 0x4ull +#define DCC_ERR_FLG_BAD_TAIL_DIST_ERR_SMASK 0x400000ull +#define DCC_ERR_FLG_BAD_VL_MARKER_ERR_SMASK 0x80ull +#define DCC_ERR_FLG_CLR (DCC_CSRS + 0x000000000060) +#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull +#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull +#define DCC_ERR_FLG_CSR_INVAL_ADDR_SMASK 0x400000000000ull +#define DCC_ERR_FLG_CSR_PARITY_ERR_SMASK 0x200000000000ull +#define DCC_ERR_FLG_DLID_ZERO_ERR_SMASK 0x40000000ull +#define DCC_ERR_FLG_EN (DCC_CSRS + 0x000000000058) +#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull +#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull +#define DCC_ERR_FLG_EVENT_CNTR_PARITY_ERR_SMASK 0x20000ull +#define DCC_ERR_FLG_EVENT_CNTR_ROLLOVER_ERR_SMASK 0x40000ull +#define DCC_ERR_FLG_FMCONFIG_ERR_SMASK 0x40000000000000ull +#define DCC_ERR_FLG_FPE_TX_FIFO_OVFLW_ERR_SMASK 0x2000000000ull +#define DCC_ERR_FLG_FPE_TX_FIFO_UNFLW_ERR_SMASK 0x4000000000ull +#define DCC_ERR_FLG_LATE_EBP_ERR_SMASK 0x1000000000ull +#define DCC_ERR_FLG_LATE_LONG_ERR_SMASK 0x800000000ull +#define DCC_ERR_FLG_LATE_SHORT_ERR_SMASK 0x400000000ull +#define DCC_ERR_FLG_LENGTH_MTU_ERR_SMASK 0x80000000ull +#define DCC_ERR_FLG_LINK_ERR_SMASK 0x80000ull +#define DCC_ERR_FLG_MISC_CNTR_ROLLOVER_ERR_SMASK 0x100000ull +#define DCC_ERR_FLG_NONVL15_STATE_ERR_SMASK 0x1000000ull +#define DCC_ERR_FLG_PERM_NVL15_ERR_SMASK 0x10000000ull +#define DCC_ERR_FLG_PREEMPTION_ERR_SMASK 0x20ull +#define DCC_ERR_FLG_PREEMPTIONVL15_ERR_SMASK 0x40ull +#define DCC_ERR_FLG_RCVPORT_ERR_SMASK 0x80000000000000ull +#define DCC_ERR_FLG_RX_BYTE_SHFT_PARITY_ERR_SMASK 0x1000000000000ull +#define DCC_ERR_FLG_RX_CTRL_PARITY_MBE_ERR_SMASK 0x100000000000ull +#define DCC_ERR_FLG_RX_EARLY_DROP_ERR_SMASK 0x200000000ull +#define DCC_ERR_FLG_SLID_ZERO_ERR_SMASK 0x20000000ull +#define DCC_ERR_FLG_TX_BYTE_SHFT_PARITY_ERR_SMASK 0x800000000000ull +#define DCC_ERR_FLG_TX_CTRL_PARITY_ERR_SMASK 0x20000000000ull +#define DCC_ERR_FLG_TX_CTRL_PARITY_MBE_ERR_SMASK 0x40000000000ull +#define DCC_ERR_FLG_TX_SC_PARITY_ERR_SMASK 0x80000000000ull +#define DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK 0x2000ull +#define DCC_ERR_FLG_UNSUP_PKT_TYPE_SMASK 0x8000ull +#define DCC_ERR_FLG_UNSUP_VL_ERR_SMASK 0x8000000ull +#define DCC_ERR_FLG_VL15_MULTI_ERR_SMASK 0x2000000ull +#define DCC_ERR_FMCONFIG_ERR_CNT (DCC_CSRS + 0x000000000110) +#define DCC_ERR_INFO_FMCONFIG (DCC_CSRS + 0x000000000090) +#define DCC_ERR_INFO_PORTRCV (DCC_CSRS + 0x000000000078) +#define DCC_ERR_INFO_PORTRCV_HDR0 (DCC_CSRS + 0x000000000080) +#define DCC_ERR_INFO_PORTRCV_HDR1 (DCC_CSRS + 0x000000000088) +#define DCC_ERR_INFO_UNCORRECTABLE (DCC_CSRS + 0x000000000098) +#define DCC_ERR_PORTRCV_ERR_CNT (DCC_CSRS + 0x000000000108) +#define DCC_ERR_RCVREMOTE_PHY_ERR_CNT (DCC_CSRS + 0x000000000118) +#define DCC_ERR_UNCORRECTABLE_CNT (DCC_CSRS + 0x000000000100) +#define DCC_PRF_PORT_MARK_FECN_CNT (DCC_CSRS + 0x000000000330) +#define DCC_PRF_PORT_RCV_BECN_CNT (DCC_CSRS + 0x000000000290) +#define DCC_PRF_PORT_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E0) +#define DCC_PRF_PORT_RCV_CORRECTABLE_CNT (DCC_CSRS + 0x000000000140) +#define DCC_PRF_PORT_RCV_DATA_CNT (DCC_CSRS + 0x000000000198) +#define DCC_PRF_PORT_RCV_FECN_CNT (DCC_CSRS + 0x000000000240) +#define DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT (DCC_CSRS + 0x000000000130) +#define DCC_PRF_PORT_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001A8) +#define DCC_PRF_PORT_VL_MARK_FECN_CNT (DCC_CSRS + 0x000000000338) +#define DCC_PRF_PORT_VL_RCV_BECN_CNT (DCC_CSRS + 0x000000000298) +#define DCC_PRF_PORT_VL_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E8) +#define DCC_PRF_PORT_VL_RCV_DATA_CNT (DCC_CSRS + 0x0000000001B0) +#define DCC_PRF_PORT_VL_RCV_FECN_CNT (DCC_CSRS + 0x000000000248) +#define DCC_PRF_PORT_VL_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001F8) +#define DCC_PRF_PORT_XMIT_CORRECTABLE_CNT (DCC_CSRS + 0x000000000138) +#define DCC_PRF_PORT_XMIT_DATA_CNT (DCC_CSRS + 0x000000000190) +#define DCC_PRF_PORT_XMIT_MULTICAST_CNT (DCC_CSRS + 0x000000000128) +#define DCC_PRF_PORT_XMIT_PKTS_CNT (DCC_CSRS + 0x0000000001A0) +#define DCC_PRF_RX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000180) +#define DCC_PRF_TX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000188) +#define DC_DC8051_CFG_CSR_ACCESS_SEL (DC_8051_CSRS + 0x000000000110) +#define DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK 0x2ull +#define DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK 0x1ull +#define DC_DC8051_CFG_EXT_DEV_0 (DC_8051_CSRS + 0x000000000118) +#define DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK 0x1ull +#define DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT 8 +#define DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT 16 +#define DC_DC8051_CFG_EXT_DEV_1 (DC_8051_CSRS + 0x000000000120) +#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK 0xFFFFull +#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT 16 +#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK 0xFFFF0000ull +#define DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK 0x1ull +#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK 0xFFull +#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT 8 +#define DC_DC8051_CFG_HOST_CMD_0 (DC_8051_CSRS + 0x000000000028) +#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK 0xFFFFFFFFFFFFull +#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT 16 +#define DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK 0x1ull +#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK 0xFFull +#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT 8 +#define DC_DC8051_CFG_HOST_CMD_1 (DC_8051_CSRS + 0x000000000030) +#define DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK 0x1ull +#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK 0xFFull +#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT 8 +#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK 0xFFFFFFFFFFFFull +#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT 16 +#define DC_DC8051_CFG_LOCAL_GUID (DC_8051_CSRS + 0x000000000038) +#define DC_DC8051_CFG_MODE (DC_8051_CSRS + 0x000000000070) +#define DC_DC8051_CFG_RAM_ACCESS_CTRL (DC_8051_CSRS + 0x000000000008) +#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK 0x7FFFull +#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT 0 +#define DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK 0x1000000ull +#define DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK 0x10000ull +#define DC_DC8051_CFG_RAM_ACCESS_SETUP (DC_8051_CSRS + 0x000000000000) +#define DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK 0x100ull +#define DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK 0x1ull +#define DC_DC8051_CFG_RAM_ACCESS_STATUS (DC_8051_CSRS + 0x000000000018) +#define DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK 0x10000ull +#define DC_DC8051_CFG_RAM_ACCESS_WR_DATA (DC_8051_CSRS + 0x000000000010) +#define DC_DC8051_CFG_RAM_ACCESS_RD_DATA (DC_8051_CSRS + 0x000000000020) +#define DC_DC8051_CFG_RST (DC_8051_CSRS + 0x000000000068) +#define DC_DC8051_CFG_RST_CRAM_SMASK 0x2ull +#define DC_DC8051_CFG_RST_DRAM_SMASK 0x4ull +#define DC_DC8051_CFG_RST_IRAM_SMASK 0x8ull +#define DC_DC8051_CFG_RST_M8051W_SMASK 0x1ull +#define DC_DC8051_CFG_RST_SFR_SMASK 0x10ull +#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051 (DC_8051_CSRS + 0x0000000000D8) +#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK 0xFFFFFFFFull +#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT 16 +#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK 0xFFFFull +#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT 0 +#define DC_DC8051_ERR_CLR (DC_8051_CSRS + 0x0000000000E8) +#define DC_DC8051_ERR_EN (DC_8051_CSRS + 0x0000000000F0) +#define DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK 0x2ull +#define DC_DC8051_ERR_FLG (DC_8051_CSRS + 0x0000000000E0) +#define DC_DC8051_ERR_FLG_CRAM_MBE_SMASK 0x4ull +#define DC_DC8051_ERR_FLG_CRAM_SBE_SMASK 0x8ull +#define DC_DC8051_ERR_FLG_DRAM_MBE_SMASK 0x10ull +#define DC_DC8051_ERR_FLG_DRAM_SBE_SMASK 0x20ull +#define DC_DC8051_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x400ull +#define DC_DC8051_ERR_FLG_IRAM_MBE_SMASK 0x40ull +#define DC_DC8051_ERR_FLG_IRAM_SBE_SMASK 0x80ull +#define DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK 0x2ull +#define DC_DC8051_ERR_FLG_SET_BY_8051_SMASK 0x1ull +#define DC_DC8051_ERR_FLG_UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES_SMASK 0x100ull +#define DC_DC8051_STS_CUR_STATE (DC_8051_CSRS + 0x000000000060) +#define DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK 0xFFull +#define DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT 16 +#define DC_DC8051_STS_CUR_STATE_PORT_MASK 0xFFull +#define DC_DC8051_STS_CUR_STATE_PORT_SHIFT 0 +#define DC_DC8051_STS_LOCAL_FM_SECURITY (DC_8051_CSRS + 0x000000000050) +#define DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK 0x1ull +#define DC_DC8051_STS_REMOTE_FM_SECURITY (DC_8051_CSRS + 0x000000000058) +#define DC_DC8051_STS_REMOTE_GUID (DC_8051_CSRS + 0x000000000040) +#define DC_DC8051_STS_REMOTE_NODE_TYPE (DC_8051_CSRS + 0x000000000048) +#define DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK 0x3ull +#define DC_DC8051_STS_REMOTE_PORT_NO (DC_8051_CSRS + 0x000000000130) +#define DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK 0xFFull +#define DC_LCB_CFG_ALLOW_LINK_UP (DC_LCB_CSRS + 0x000000000128) +#define DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT 0 +#define DC_LCB_CFG_CRC_MODE (DC_LCB_CSRS + 0x000000000058) +#define DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT 0 +#define DC_LCB_CFG_IGNORE_LOST_RCLK (DC_LCB_CSRS + 0x000000000020) +#define DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK 0x1ull +#define DC_LCB_CFG_LANE_WIDTH (DC_LCB_CSRS + 0x000000000100) +#define DC_LCB_CFG_LINK_KILL_EN (DC_LCB_CSRS + 0x000000000120) +#define DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull +#define DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK 0x400000ull +#define DC_LCB_CFG_LN_DCLK (DC_LCB_CSRS + 0x000000000060) +#define DC_LCB_CFG_LOOPBACK (DC_LCB_CSRS + 0x0000000000F8) +#define DC_LCB_CFG_LOOPBACK_VAL_SHIFT 0 +#define DC_LCB_CFG_RUN (DC_LCB_CSRS + 0x000000000000) +#define DC_LCB_CFG_RUN_EN_SHIFT 0 +#define DC_LCB_CFG_RX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000018) +#define DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT 8 +#define DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT 4 +#define DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT 0 +#define DC_LCB_CFG_TX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000010) +#define DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT 0 +#define DC_LCB_CFG_TX_FIFOS_RESET (DC_LCB_CSRS + 0x000000000008) +#define DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT 0 +#define DC_LCB_ERR_CLR (DC_LCB_CSRS + 0x000000000308) +#define DC_LCB_ERR_EN (DC_LCB_CSRS + 0x000000000310) +#define DC_LCB_ERR_FLG (DC_LCB_CSRS + 0x000000000300) +#define DC_LCB_ERR_FLG_REDUNDANT_FLIT_PARITY_ERR_SMASK 0x20000000ull +#define DC_LCB_ERR_FLG_NEG_EDGE_LINK_TRANSFER_ACTIVE_SMASK 0x10000000ull +#define DC_LCB_ERR_FLG_HOLD_REINIT_SMASK 0x8000000ull +#define DC_LCB_ERR_FLG_RST_FOR_INCOMPLT_RND_TRIP_SMASK 0x4000000ull +#define DC_LCB_ERR_FLG_RST_FOR_LINK_TIMEOUT_SMASK 0x2000000ull +#define DC_LCB_ERR_FLG_CREDIT_RETURN_FLIT_MBE_SMASK 0x1000000ull +#define DC_LCB_ERR_FLG_REPLAY_BUF_SBE_SMASK 0x800000ull +#define DC_LCB_ERR_FLG_REPLAY_BUF_MBE_SMASK 0x400000ull +#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_SBE_SMASK 0x200000ull +#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull +#define DC_LCB_ERR_FLG_VL_ACK_INPUT_WRONG_CRC_MODE_SMASK 0x80000ull +#define DC_LCB_ERR_FLG_VL_ACK_INPUT_PARITY_ERR_SMASK 0x40000ull +#define DC_LCB_ERR_FLG_VL_ACK_INPUT_BUF_OFLW_SMASK 0x20000ull +#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_OFLW_SMASK 0x10000ull +#define DC_LCB_ERR_FLG_ILLEGAL_FLIT_ENCODING_SMASK 0x8000ull +#define DC_LCB_ERR_FLG_ILLEGAL_NULL_LTP_SMASK 0x4000ull +#define DC_LCB_ERR_FLG_UNEXPECTED_ROUND_TRIP_MARKER_SMASK 0x2000ull +#define DC_LCB_ERR_FLG_UNEXPECTED_REPLAY_MARKER_SMASK 0x1000ull +#define DC_LCB_ERR_FLG_RCLK_STOPPED_SMASK 0x800ull +#define DC_LCB_ERR_FLG_CRC_ERR_CNT_HIT_LIMIT_SMASK 0x400ull +#define DC_LCB_ERR_FLG_REINIT_FOR_LN_DEGRADE_SMASK 0x200ull +#define DC_LCB_ERR_FLG_REINIT_FROM_PEER_SMASK 0x100ull +#define DC_LCB_ERR_FLG_SEQ_CRC_ERR_SMASK 0x80ull +#define DC_LCB_ERR_FLG_RX_LESS_THAN_FOUR_LNS_SMASK 0x40ull +#define DC_LCB_ERR_FLG_TX_LESS_THAN_FOUR_LNS_SMASK 0x20ull +#define DC_LCB_ERR_FLG_LOST_REINIT_STALL_OR_TOS_SMASK 0x10ull +#define DC_LCB_ERR_FLG_ALL_LNS_FAILED_REINIT_TEST_SMASK 0x8ull +#define DC_LCB_ERR_FLG_RST_FOR_FAILED_DESKEW_SMASK 0x4ull +#define DC_LCB_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x2ull +#define DC_LCB_ERR_FLG_CSR_PARITY_ERR_SMASK 0x1ull +#define DC_LCB_ERR_INFO_CRC_ERR_LN0 (DC_LCB_CSRS + 0x000000000328) +#define DC_LCB_ERR_INFO_CRC_ERR_LN1 (DC_LCB_CSRS + 0x000000000330) +#define DC_LCB_ERR_INFO_CRC_ERR_LN2 (DC_LCB_CSRS + 0x000000000338) +#define DC_LCB_ERR_INFO_CRC_ERR_LN3 (DC_LCB_CSRS + 0x000000000340) +#define DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN (DC_LCB_CSRS + 0x000000000348) +#define DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT (DC_LCB_CSRS + 0x000000000368) +#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT (DC_LCB_CSRS + 0x000000000370) +#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT (DC_LCB_CSRS + 0x000000000378) +#define DC_LCB_ERR_INFO_MISC_FLG_CNT (DC_LCB_CSRS + 0x000000000390) +#define DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT (DC_LCB_CSRS + 0x000000000380) +#define DC_LCB_ERR_INFO_RX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000358) +#define DC_LCB_ERR_INFO_SBE_CNT (DC_LCB_CSRS + 0x000000000388) +#define DC_LCB_ERR_INFO_SEQ_CRC_CNT (DC_LCB_CSRS + 0x000000000360) +#define DC_LCB_ERR_INFO_TOTAL_CRC_ERR (DC_LCB_CSRS + 0x000000000320) +#define DC_LCB_ERR_INFO_TX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000350) +#define DC_LCB_PG_DBG_FLIT_CRDTS_CNT (DC_LCB_CSRS + 0x000000000580) +#define DC_LCB_PG_STS_PAUSE_COMPLETE_CNT (DC_LCB_CSRS + 0x0000000005F8) +#define DC_LCB_PG_STS_TX_MBE_CNT (DC_LCB_CSRS + 0x000000000608) +#define DC_LCB_PG_STS_TX_SBE_CNT (DC_LCB_CSRS + 0x000000000600) +#define DC_LCB_PRF_ACCEPTED_LTP_CNT (DC_LCB_CSRS + 0x000000000408) +#define DC_LCB_PRF_CLK_CNTR (DC_LCB_CSRS + 0x000000000420) +#define DC_LCB_PRF_GOOD_LTP_CNT (DC_LCB_CSRS + 0x000000000400) +#define DC_LCB_PRF_RX_FLIT_CNT (DC_LCB_CSRS + 0x000000000410) +#define DC_LCB_PRF_TX_FLIT_CNT (DC_LCB_CSRS + 0x000000000418) +#define DC_LCB_STS_LINK_TRANSFER_ACTIVE (DC_LCB_CSRS + 0x000000000468) +#define DC_LCB_STS_ROUND_TRIP_LTP_CNT (DC_LCB_CSRS + 0x0000000004B0) +#define RCV_BUF_OVFL_CNT 10 +#define RCV_CONTEXT_EGR_STALL 22 +#define RCV_CONTEXT_RHQ_STALL 21 +#define RCV_DATA_PKT_CNT 0 +#define RCV_DWORD_CNT 1 +#define RCV_TID_FLOW_GEN_MISMATCH_CNT 20 +#define RCV_TID_FLOW_SEQ_MISMATCH_CNT 23 +#define RCV_TID_FULL_ERR_CNT 18 +#define RCV_TID_VALID_ERR_CNT 19 +#define RXE_NUM_32_BIT_COUNTERS 24 +#define RXE_NUM_64_BIT_COUNTERS 2 +#define RXE_NUM_RSM_INSTANCES 4 +#define RXE_NUM_TID_FLOWS 32 +#define RXE_PER_CONTEXT_OFFSET 0x0300000 +#define SEND_DATA_PKT_CNT 0 +#define SEND_DATA_PKT_VL0_CNT 12 +#define SEND_DATA_VL0_CNT 3 +#define SEND_DROPPED_PKT_CNT 5 +#define SEND_DWORD_CNT 1 +#define SEND_FLOW_STALL_CNT 4 +#define SEND_HEADERS_ERR_CNT 6 +#define SEND_LEN_ERR_CNT 1 +#define SEND_MAX_MIN_LEN_ERR_CNT 2 +#define SEND_UNDERRUN_CNT 3 +#define SEND_UNSUP_VL_ERR_CNT 0 +#define SEND_WAIT_CNT 2 +#define SEND_WAIT_VL0_CNT 21 +#define TXE_PIO_SEND_OFFSET 0x0800000 +#define ASIC_CFG_DRV_STR (ASIC + 0x000000000048) +#define ASIC_CFG_MUTEX (ASIC + 0x000000000040) +#define ASIC_CFG_SBUS_EXECUTE (ASIC + 0x000000000008) +#define ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK 0x1ull +#define ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK 0x2ull +#define ASIC_CFG_SBUS_REQUEST (ASIC + 0x000000000000) +#define ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT 16 +#define ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT 8 +#define ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT 32 +#define ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT 0 +#define ASIC_CFG_SCRATCH (ASIC + 0x000000000020) +#define ASIC_CFG_THERM_POLL_EN (ASIC + 0x000000000050) +#define ASIC_EEP_ADDR_CMD (ASIC + 0x000000000308) +#define ASIC_EEP_ADDR_CMD_EP_ADDR_MASK 0xFFFFFFull +#define ASIC_EEP_CTL_STAT (ASIC + 0x000000000300) +#define ASIC_EEP_CTL_STAT_EP_RESET_SMASK 0x4ull +#define ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT 8 +#define ASIC_EEP_CTL_STAT_RESETCSR 0x0000000083818000ull +#define ASIC_EEP_DATA (ASIC + 0x000000000310) +#define ASIC_GPIO_CLEAR (ASIC + 0x000000000230) +#define ASIC_GPIO_FORCE (ASIC + 0x000000000238) +#define ASIC_GPIO_IN (ASIC + 0x000000000200) +#define ASIC_GPIO_INVERT (ASIC + 0x000000000210) +#define ASIC_GPIO_MASK (ASIC + 0x000000000220) +#define ASIC_GPIO_OE (ASIC + 0x000000000208) +#define ASIC_GPIO_OUT (ASIC + 0x000000000218) +#define ASIC_PCIE_SD_HOST_CMD (ASIC + 0x000000000100) +#define ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT 0 +#define ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK 0x400ull +#define ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT 2 +#define ASIC_PCIE_SD_HOST_CMD_TIMER_MASK 0xFFFFFull +#define ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT 12 +#define ASIC_PCIE_SD_HOST_STATUS (ASIC + 0x000000000108) +#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK 0x7ull +#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT 2 +#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK 0x3ull +#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT 0 +#define ASIC_PCIE_SD_INTRPT_DATA_CODE (ASIC + 0x000000000110) +#define ASIC_PCIE_SD_INTRPT_ENABLE (ASIC + 0x000000000118) +#define ASIC_PCIE_SD_INTRPT_LIST (ASIC + 0x000000000180) +#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT 16 +#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT 0 +#define ASIC_PCIE_SD_INTRPT_STATUS (ASIC + 0x000000000128) +#define ASIC_QSFP1_CLEAR (ASIC + 0x000000000270) +#define ASIC_QSFP1_FORCE (ASIC + 0x000000000278) +#define ASIC_QSFP1_IN (ASIC + 0x000000000240) +#define ASIC_QSFP1_INVERT (ASIC + 0x000000000250) +#define ASIC_QSFP1_MASK (ASIC + 0x000000000260) +#define ASIC_QSFP1_OE (ASIC + 0x000000000248) +#define ASIC_QSFP1_OUT (ASIC + 0x000000000258) +#define ASIC_QSFP1_STATUS (ASIC + 0x000000000268) +#define ASIC_QSFP2_CLEAR (ASIC + 0x0000000002B0) +#define ASIC_QSFP2_FORCE (ASIC + 0x0000000002B8) +#define ASIC_QSFP2_IN (ASIC + 0x000000000280) +#define ASIC_QSFP2_INVERT (ASIC + 0x000000000290) +#define ASIC_QSFP2_MASK (ASIC + 0x0000000002A0) +#define ASIC_QSFP2_OE (ASIC + 0x000000000288) +#define ASIC_QSFP2_OUT (ASIC + 0x000000000298) +#define ASIC_QSFP2_STATUS (ASIC + 0x0000000002A8) +#define ASIC_STS_SBUS_COUNTERS (ASIC + 0x000000000018) +#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_MASK 0xFFFFull +#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_SHIFT 0 +#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_MASK 0xFFFFull +#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_SHIFT 16 +#define ASIC_STS_SBUS_RESULT (ASIC + 0x000000000010) +#define ASIC_STS_SBUS_RESULT_DONE_SMASK 0x1ull +#define ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK 0x2ull +#define ASIC_STS_THERM (ASIC + 0x000000000058) +#define ASIC_STS_THERM_CRIT_TEMP_MASK 0x7FFull +#define ASIC_STS_THERM_CRIT_TEMP_SHIFT 18 +#define ASIC_STS_THERM_CURR_TEMP_MASK 0x7FFull +#define ASIC_STS_THERM_CURR_TEMP_SHIFT 2 +#define ASIC_STS_THERM_HI_TEMP_MASK 0x7FFull +#define ASIC_STS_THERM_HI_TEMP_SHIFT 50 +#define ASIC_STS_THERM_LO_TEMP_MASK 0x7FFull +#define ASIC_STS_THERM_LO_TEMP_SHIFT 34 +#define ASIC_STS_THERM_LOW_SHIFT 13 +#define CCE_COUNTER_ARRAY32 (CCE + 0x000000000060) +#define CCE_CTRL (CCE + 0x000000000010) +#define CCE_CTRL_RXE_RESUME_SMASK 0x800ull +#define CCE_CTRL_SPC_FREEZE_SMASK 0x100ull +#define CCE_CTRL_SPC_UNFREEZE_SMASK 0x200ull +#define CCE_CTRL_TXE_RESUME_SMASK 0x2000ull +#define CCE_DC_CTRL (CCE + 0x0000000000B8) +#define CCE_DC_CTRL_DC_RESET_SMASK 0x1ull +#define CCE_DC_CTRL_RESETCSR 0x0000000000000001ull +#define CCE_ERR_CLEAR (CCE + 0x000000000050) +#define CCE_ERR_MASK (CCE + 0x000000000048) +#define CCE_ERR_STATUS (CCE + 0x000000000040) +#define CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK 0x40ull +#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK 0x1000ull +#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK \ + 0x200ull +#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK \ + 0x800ull +#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK \ + 0x400ull +#define CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK 0x100ull +#define CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK 0x80ull +#define CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK 0x1ull +#define CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull +#define CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull +#define CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK 0x4000000000ull +#define CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK 0x8000000000ull +#define CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK 0x10000000000ull +#define CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK 0x1000000000ull +#define CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK 0x2000000000ull +#define CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK 0x400000000ull +#define CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK 0x20ull +#define CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK 0x800000000ull +#define CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK 0x100000000ull +#define CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK 0x200000000ull +#define CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK 0x10ull +#define CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK 0x8ull +#define CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK 0x40000000ull +#define CCE_ERR_STATUS_LA_TRIGGERED_SMASK 0x80000000ull +#define CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK 0x40000ull +#define CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK 0x4000000ull +#define CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK 0x20000ull +#define CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK 0x2000000ull +#define CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK 0x100000ull +#define CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK 0x80000ull +#define CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK 0x10000ull +#define CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK 0x1000000ull +#define CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK 0x8000ull +#define CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK 0x800000ull +#define CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK 0x20000000ull +#define CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK 0x2000ull +#define CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK 0x200000ull +#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK 0x4000ull +#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK 0x400000ull +#define CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK 0x10000000ull +#define CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK 0x8000000ull +#define CCE_INT_CLEAR (CCE + 0x000000110A00) +#define CCE_INT_COUNTER_ARRAY32 (CCE + 0x000000110D00) +#define CCE_INT_FORCE (CCE + 0x000000110B00) +#define CCE_INT_MAP (CCE + 0x000000110500) +#define CCE_INT_MASK (CCE + 0x000000110900) +#define CCE_INT_STATUS (CCE + 0x000000110800) +#define CCE_MSIX_INT_GRANTED (CCE + 0x000000110200) +#define CCE_MSIX_TABLE_LOWER (CCE + 0x000000100000) +#define CCE_MSIX_TABLE_UPPER (CCE + 0x000000100008) +#define CCE_MSIX_TABLE_UPPER_RESETCSR 0x0000000100000000ull +#define CCE_MSIX_VEC_CLR_WITHOUT_INT (CCE + 0x000000110400) +#define CCE_REVISION (CCE + 0x000000000000) +#define CCE_REVISION2 (CCE + 0x000000000008) +#define CCE_REVISION2_HFI_ID_MASK 0x1ull +#define CCE_REVISION2_HFI_ID_SHIFT 0 +#define CCE_REVISION2_IMPL_CODE_SHIFT 8 +#define CCE_REVISION2_IMPL_REVISION_SHIFT 16 +#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_MASK 0xFull +#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_SHIFT 32 +#define CCE_REVISION_CHIP_REV_MAJOR_MASK 0xFFull +#define CCE_REVISION_CHIP_REV_MAJOR_SHIFT 8 +#define CCE_REVISION_CHIP_REV_MINOR_MASK 0xFFull +#define CCE_REVISION_CHIP_REV_MINOR_SHIFT 0 +#define CCE_REVISION_SW_MASK 0xFFull +#define CCE_REVISION_SW_SHIFT 24 +#define CCE_SCRATCH (CCE + 0x000000000020) +#define CCE_STATUS (CCE + 0x000000000018) +#define CCE_STATUS_RXE_FROZE_SMASK 0x2ull +#define CCE_STATUS_RXE_PAUSED_SMASK 0x20ull +#define CCE_STATUS_SDMA_FROZE_SMASK 0x1ull +#define CCE_STATUS_SDMA_PAUSED_SMASK 0x10ull +#define CCE_STATUS_TXE_FROZE_SMASK 0x4ull +#define CCE_STATUS_TXE_PAUSED_SMASK 0x40ull +#define CCE_STATUS_TXE_PIO_FROZE_SMASK 0x8ull +#define CCE_STATUS_TXE_PIO_PAUSED_SMASK 0x80ull +#define MISC_CFG_FW_CTRL (MISC + 0x000000001000) +#define MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK 0x2ull +#define MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT 2 +#define MISC_CFG_FW_CTRL_RSA_STATUS_SMASK 0xCull +#define MISC_CFG_RSA_CMD (MISC + 0x000000000A08) +#define MISC_CFG_RSA_MODULUS (MISC + 0x000000000400) +#define MISC_CFG_RSA_MU (MISC + 0x000000000A10) +#define MISC_CFG_RSA_R2 (MISC + 0x000000000000) +#define MISC_CFG_RSA_SIGNATURE (MISC + 0x000000000200) +#define MISC_CFG_SHA_PRELOAD (MISC + 0x000000000A00) +#define MISC_ERR_CLEAR (MISC + 0x000000002010) +#define MISC_ERR_MASK (MISC + 0x000000002008) +#define MISC_ERR_STATUS (MISC + 0x000000002000) +#define MISC_ERR_STATUS_MISC_PLL_LOCK_FAIL_ERR_SMASK 0x1000ull +#define MISC_ERR_STATUS_MISC_MBIST_FAIL_ERR_SMASK 0x800ull +#define MISC_ERR_STATUS_MISC_INVALID_EEP_CMD_ERR_SMASK 0x400ull +#define MISC_ERR_STATUS_MISC_EFUSE_DONE_PARITY_ERR_SMASK 0x200ull +#define MISC_ERR_STATUS_MISC_EFUSE_WRITE_ERR_SMASK 0x100ull +#define MISC_ERR_STATUS_MISC_EFUSE_READ_BAD_ADDR_ERR_SMASK 0x80ull +#define MISC_ERR_STATUS_MISC_EFUSE_CSR_PARITY_ERR_SMASK 0x40ull +#define MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK 0x20ull +#define MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK 0x10ull +#define MISC_ERR_STATUS_MISC_SBUS_WRITE_FAILED_ERR_SMASK 0x8ull +#define MISC_ERR_STATUS_MISC_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull +#define MISC_ERR_STATUS_MISC_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull +#define MISC_ERR_STATUS_MISC_CSR_PARITY_ERR_SMASK 0x1ull +#define PCI_CFG_MSIX0 (PCIE + 0x0000000000B0) +#define PCI_CFG_REG1 (PCIE + 0x000000000004) +#define PCI_CFG_REG11 (PCIE + 0x00000000002C) +#define PCIE_CFG_SPCIE1 (PCIE + 0x00000000014C) +#define PCIE_CFG_SPCIE2 (PCIE + 0x000000000150) +#define PCIE_CFG_TPH2 (PCIE + 0x000000000180) +#define RCV_ARRAY (RXE + 0x000000200000) +#define RCV_ARRAY_CNT (RXE + 0x000000000018) +#define RCV_ARRAY_RT_ADDR_MASK 0xFFFFFFFFFull +#define RCV_ARRAY_RT_ADDR_SHIFT 0 +#define RCV_ARRAY_RT_BUF_SIZE_SHIFT 36 +#define RCV_ARRAY_RT_WRITE_ENABLE_SMASK 0x8000000000000000ull +#define RCV_AVAIL_TIME_OUT (RXE + 0x000000100050) +#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK 0xFFull +#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT 0 +#define RCV_BTH_QP (RXE + 0x000000000028) +#define RCV_BTH_QP_KDETH_QP_MASK 0xFFull +#define RCV_BTH_QP_KDETH_QP_SHIFT 16 +#define RCV_BYPASS (RXE + 0x000000000038) +#define RCV_CONTEXTS (RXE + 0x000000000010) +#define RCV_COUNTER_ARRAY32 (RXE + 0x000000000400) +#define RCV_COUNTER_ARRAY64 (RXE + 0x000000000500) +#define RCV_CTRL (RXE + 0x000000000000) +#define RCV_CTRL_RCV_BYPASS_ENABLE_SMASK 0x10ull +#define RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK 0x40ull +#define RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK 0x4ull +#define RCV_CTRL_RCV_PORT_ENABLE_SMASK 0x1ull +#define RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK 0x2ull +#define RCV_CTRL_RCV_RSM_ENABLE_SMASK 0x20ull +#define RCV_CTRL_RX_RBUF_INIT_SMASK 0x200ull +#define RCV_CTXT_CTRL (RXE + 0x000000100000) +#define RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK 0x4ull +#define RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK 0x8ull +#define RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK 0x7ull +#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT 8 +#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK 0x700ull +#define RCV_CTXT_CTRL_ENABLE_SMASK 0x1ull +#define RCV_CTXT_CTRL_INTR_AVAIL_SMASK 0x20ull +#define RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK 0x2ull +#define RCV_CTXT_CTRL_TAIL_UPD_SMASK 0x40ull +#define RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK 0x10ull +#define RCV_CTXT_STATUS (RXE + 0x000000100008) +#define RCV_EGR_CTRL (RXE + 0x000000100010) +#define RCV_EGR_CTRL_EGR_BASE_INDEX_MASK 0x1FFFull +#define RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT 0 +#define RCV_EGR_CTRL_EGR_CNT_MASK 0x1FFull +#define RCV_EGR_CTRL_EGR_CNT_SHIFT 32 +#define RCV_EGR_INDEX_HEAD (RXE + 0x000000300018) +#define RCV_EGR_INDEX_HEAD_HEAD_MASK 0x7FFull +#define RCV_EGR_INDEX_HEAD_HEAD_SHIFT 0 +#define RCV_ERR_CLEAR (RXE + 0x000000000070) +#define RCV_ERR_INFO (RXE + 0x000000000050) +#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK 0x1Full +#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK 0x20ull +#define RCV_ERR_MASK (RXE + 0x000000000068) +#define RCV_ERR_STATUS (RXE + 0x000000000060) +#define RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK 0x8000000000000000ull +#define RCV_ERR_STATUS_RX_CSR_READ_BAD_ADDR_ERR_SMASK 0x2000000000000000ull +#define RCV_ERR_STATUS_RX_CSR_WRITE_BAD_ADDR_ERR_SMASK \ + 0x4000000000000000ull +#define RCV_ERR_STATUS_RX_DC_INTF_PARITY_ERR_SMASK 0x2ull +#define RCV_ERR_STATUS_RX_DC_SOP_EOP_PARITY_ERR_SMASK 0x200ull +#define RCV_ERR_STATUS_RX_DMA_CSR_COR_ERR_SMASK 0x1ull +#define RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK 0x200000000000000ull +#define RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK 0x1000000000000000ull +#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_COR_ERR_SMASK \ + 0x40000000000000ull +#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \ + 0x20000000000000ull +#define RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \ + 0x800000000000000ull +#define RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \ + 0x400000000000000ull +#define RCV_ERR_STATUS_RX_DMA_FLAG_COR_ERR_SMASK 0x800ull +#define RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK 0x400ull +#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_COR_ERR_SMASK 0x10000000000000ull +#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK 0x8000000000000ull +#define RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK 0x200000000000ull +#define RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK 0x400000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK 0x100000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \ + 0x10000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK 0x8000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \ + 0x20000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_COR_ERR_SMASK 0x80000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK 0x40000000000ull +#define RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK 0x40000000ull +#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_COR_ERR_SMASK 0x100000ull +#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK 0x80000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK 0x400000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK 0x10000000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK 0x2000000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \ + 0x200000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK 0x800000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \ + 0x8000000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK 0x4000000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK 0x1000000ull +#define RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK 0x20000000ull +#define RCV_ERR_STATUS_RX_RBUF_DATA_COR_ERR_SMASK 0x100000000000000ull +#define RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK 0x80000000000000ull +#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK 0x1000000000000ull +#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK 0x800000000000ull +#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_COR_ERR_SMASK 0x4000000000000ull +#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK 0x2000000000000ull +#define RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK 0x100000000ull +#define RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK 0x800000000ull +#define RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \ + 0x1000000000ull +#define RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK 0x200000000ull +#define RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK 0x400000000ull +#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_COR_ERR_SMASK 0x4000ull +#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK 0x2000ull +#define RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK 0x80000000ull +#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_COR_ERR_SMASK 0x40000ull +#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK 0x10000ull +#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK 0x8000ull +#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK 0x20000ull +#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_COR_ERR_SMASK 0x4000000000ull +#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK 0x2000000000ull +#define RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK 0x100ull +#define RCV_ERR_STATUS_RX_RCV_DATA_COR_ERR_SMASK 0x20ull +#define RCV_ERR_STATUS_RX_RCV_DATA_UNC_ERR_SMASK 0x10ull +#define RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK 0x1000ull +#define RCV_ERR_STATUS_RX_RCV_HDR_COR_ERR_SMASK 0x8ull +#define RCV_ERR_STATUS_RX_RCV_HDR_UNC_ERR_SMASK 0x4ull +#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_COR_ERR_SMASK 0x80ull +#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK 0x40ull +#define RCV_HDR_ADDR (RXE + 0x000000100028) +#define RCV_HDR_CNT (RXE + 0x000000100030) +#define RCV_HDR_CNT_CNT_MASK 0x1FFull +#define RCV_HDR_CNT_CNT_SHIFT 0 +#define RCV_HDR_ENT_SIZE (RXE + 0x000000100038) +#define RCV_HDR_ENT_SIZE_ENT_SIZE_MASK 0x7ull +#define RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT 0 +#define RCV_HDR_HEAD (RXE + 0x000000300008) +#define RCV_HDR_HEAD_COUNTER_MASK 0xFFull +#define RCV_HDR_HEAD_COUNTER_SHIFT 32 +#define RCV_HDR_HEAD_HEAD_MASK 0x7FFFFull +#define RCV_HDR_HEAD_HEAD_SHIFT 0 +#define RCV_HDR_HEAD_HEAD_SMASK 0x7FFFFull +#define RCV_HDR_OVFL_CNT (RXE + 0x000000100058) +#define RCV_HDR_SIZE (RXE + 0x000000100040) +#define RCV_HDR_SIZE_HDR_SIZE_MASK 0x1Full +#define RCV_HDR_SIZE_HDR_SIZE_SHIFT 0 +#define RCV_HDR_TAIL (RXE + 0x000000300000) +#define RCV_HDR_TAIL_ADDR (RXE + 0x000000100048) +#define RCV_KEY_CTRL (RXE + 0x000000100020) +#define RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK 0x200000000ull +#define RCV_KEY_CTRL_JOB_KEY_VALUE_MASK 0xFFFFull +#define RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT 0 +#define RCV_MULTICAST (RXE + 0x000000000030) +#define RCV_PARTITION_KEY (RXE + 0x000000000200) +#define RCV_PARTITION_KEY_PARTITION_KEY_A_MASK 0xFFFFull +#define RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT 16 +#define RCV_QP_MAP_TABLE (RXE + 0x000000000100) +#define RCV_RSM_CFG (RXE + 0x000000000600) +#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK 0x1ull +#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT 0 +#define RCV_RSM_CFG_PACKET_TYPE_SHIFT 60 +#define RCV_RSM_MAP_TABLE (RXE + 0x000000000900) +#define RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK 0xFFull +#define RCV_RSM_MATCH (RXE + 0x000000000800) +#define RCV_RSM_MATCH_MASK1_SHIFT 0 +#define RCV_RSM_MATCH_MASK2_SHIFT 16 +#define RCV_RSM_MATCH_VALUE1_SHIFT 8 +#define RCV_RSM_MATCH_VALUE2_SHIFT 24 +#define RCV_RSM_SELECT (RXE + 0x000000000700) +#define RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT 0 +#define RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT 16 +#define RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT 32 +#define RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT 44 +#define RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT 48 +#define RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT 60 +#define RCV_STATUS (RXE + 0x000000000008) +#define RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK 0x1ull +#define RCV_STATUS_RX_RBUF_INIT_DONE_SMASK 0x200ull +#define RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK 0x40ull +#define RCV_TID_CTRL (RXE + 0x000000100018) +#define RCV_TID_CTRL_TID_BASE_INDEX_MASK 0x1FFFull +#define RCV_TID_CTRL_TID_BASE_INDEX_SHIFT 0 +#define RCV_TID_CTRL_TID_PAIR_CNT_MASK 0x1FFull +#define RCV_TID_CTRL_TID_PAIR_CNT_SHIFT 32 +#define RCV_TID_FLOW_TABLE (RXE + 0x000000300800) +#define RCV_VL15 (RXE + 0x000000000048) +#define SEND_BTH_QP (TXE + 0x0000000000A0) +#define SEND_BTH_QP_KDETH_QP_MASK 0xFFull +#define SEND_BTH_QP_KDETH_QP_SHIFT 16 +#define SEND_CM_CREDIT_USED_STATUS (TXE + 0x000000000510) +#define SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK \ + 0x1000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK \ + 0x8000000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK \ + 0x2000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK \ + 0x4000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK \ + 0x8000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK \ + 0x10000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK \ + 0x20000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK \ + 0x40000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK \ + 0x80000000000000ull +#define SEND_CM_CREDIT_VL (TXE + 0x000000000600) +#define SEND_CM_CREDIT_VL15 (TXE + 0x000000000678) +#define SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT 0 +#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK 0xFFFFull +#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT 0 +#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK 0xFFFFull +#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK 0xFFFFull +#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT 16 +#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK 0xFFFF0000ull +#define SEND_CM_CTRL (TXE + 0x000000000500) +#define SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK 0x8ull +#define SEND_CM_CTRL_RESETCSR 0x0000000000000020ull +#define SEND_CM_GLOBAL_CREDIT (TXE + 0x000000000508) +#define SEND_CM_GLOBAL_CREDIT_AU_SHIFT 16 +#define SEND_CM_GLOBAL_CREDIT_RESETCSR 0x0000094000030000ull +#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK 0xFFFFull +#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT 0 +#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK 0xFFFFull +#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK 0xFFFFull +#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT 32 +#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK 0xFFFF00000000ull +#define SEND_CM_LOCAL_AU_TABLE0_TO3 (TXE + 0x000000000520) +#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT 0 +#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT 16 +#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT 32 +#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT 48 +#define SEND_CM_LOCAL_AU_TABLE4_TO7 (TXE + 0x000000000528) +#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT 0 +#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT 16 +#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT 32 +#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT 48 +#define SEND_CM_REMOTE_AU_TABLE0_TO3 (TXE + 0x000000000530) +#define SEND_CM_REMOTE_AU_TABLE4_TO7 (TXE + 0x000000000538) +#define SEND_CM_TIMER_CTRL (TXE + 0x000000000518) +#define SEND_CONTEXTS (TXE + 0x000000000010) +#define SEND_CONTEXT_SET_CTRL (TXE + 0x000000000200) +#define SEND_COUNTER_ARRAY32 (TXE + 0x000000000300) +#define SEND_COUNTER_ARRAY64 (TXE + 0x000000000400) +#define SEND_CTRL (TXE + 0x000000000000) +#define SEND_CTRL_CM_RESET_SMASK 0x4ull +#define SEND_CTRL_SEND_ENABLE_SMASK 0x1ull +#define SEND_CTRL_VL_ARBITER_ENABLE_SMASK 0x2ull +#define SEND_CTXT_CHECK_ENABLE (TXE + 0x000000100080) +#define SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK \ + 0x200000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK 0x800ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK 0x400ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK 0x1000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK 0x2000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \ + 0x100000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK 0x10000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \ + 0x80000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK \ + 0x40000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \ + 0x8000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK \ + 0x4000ull +#define SEND_CTXT_CHECK_JOB_KEY (TXE + 0x000000100090) +#define SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK 0x100000000ull +#define SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK 0xFFFF0000ull +#define SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK 0xFFFFull +#define SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT 0 +#define SEND_CTXT_CHECK_OPCODE (TXE + 0x0000001000A8) +#define SEND_CTXT_CHECK_OPCODE_MASK_SHIFT 8 +#define SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT 0 +#define SEND_CTXT_CHECK_PARTITION_KEY (TXE + 0x000000100098) +#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK 0xFFFFull +#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT 0 +#define SEND_CTXT_CHECK_SLID (TXE + 0x0000001000A0) +#define SEND_CTXT_CHECK_SLID_MASK_MASK 0xFFFFull +#define SEND_CTXT_CHECK_SLID_MASK_SHIFT 16 +#define SEND_CTXT_CHECK_SLID_VALUE_MASK 0xFFFFull +#define SEND_CTXT_CHECK_SLID_VALUE_SHIFT 0 +#define SEND_CTXT_CHECK_VL (TXE + 0x000000100088) +#define SEND_CTXT_CREDIT_CTRL (TXE + 0x000000100010) +#define SEND_CTXT_CREDIT_CTRL_CREDIT_INTR_SMASK 0x20000ull +#define SEND_CTXT_CREDIT_CTRL_EARLY_RETURN_SMASK 0x10000ull +#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_MASK 0x7FFull +#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SHIFT 0 +#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SMASK 0x7FFull +#define SEND_CTXT_CREDIT_FORCE (TXE + 0x000000100028) +#define SEND_CTXT_CREDIT_FORCE_FORCE_RETURN_SMASK 0x1ull +#define SEND_CTXT_CREDIT_RETURN_ADDR (TXE + 0x000000100020) +#define SEND_CTXT_CREDIT_RETURN_ADDR_ADDRESS_SMASK 0xFFFFFFFFFFC0ull +#define SEND_CTXT_CTRL (TXE + 0x000000100000) +#define SEND_CTXT_CTRL_CTXT_BASE_MASK 0x3FFFull +#define SEND_CTXT_CTRL_CTXT_BASE_SHIFT 32 +#define SEND_CTXT_CTRL_CTXT_DEPTH_MASK 0x7FFull +#define SEND_CTXT_CTRL_CTXT_DEPTH_SHIFT 48 +#define SEND_CTXT_CTRL_CTXT_ENABLE_SMASK 0x1ull +#define SEND_CTXT_ERR_CLEAR (TXE + 0x000000100050) +#define SEND_CTXT_ERR_MASK (TXE + 0x000000100048) +#define SEND_CTXT_ERR_STATUS (TXE + 0x000000100040) +#define SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK 0x2ull +#define SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK 0x1ull +#define SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK 0x4ull +#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK 0x10ull +#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK 0x8ull +#define SEND_CTXT_STATUS (TXE + 0x000000100008) +#define SEND_CTXT_STATUS_CTXT_HALTED_SMASK 0x1ull +#define SEND_DMA_BASE_ADDR (TXE + 0x000000200010) +#define SEND_DMA_CHECK_ENABLE (TXE + 0x000000200080) +#define SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull +#define SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull +#define SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull +#define SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull +#define SEND_DMA_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull +#define SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull +#define SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull +#define SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK 0x200000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \ + 0x100000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \ + 0x80000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK 0x40000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \ + 0x8000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK 0x4000ull +#define SEND_DMA_CHECK_JOB_KEY (TXE + 0x000000200090) +#define SEND_DMA_CHECK_OPCODE (TXE + 0x0000002000A8) +#define SEND_DMA_CHECK_PARTITION_KEY (TXE + 0x000000200098) +#define SEND_DMA_CHECK_SLID (TXE + 0x0000002000A0) +#define SEND_DMA_CHECK_SLID_MASK_MASK 0xFFFFull +#define SEND_DMA_CHECK_SLID_MASK_SHIFT 16 +#define SEND_DMA_CHECK_SLID_VALUE_MASK 0xFFFFull +#define SEND_DMA_CHECK_SLID_VALUE_SHIFT 0 +#define SEND_DMA_CHECK_VL (TXE + 0x000000200088) +#define SEND_DMA_CTRL (TXE + 0x000000200000) +#define SEND_DMA_CTRL_SDMA_CLEANUP_SMASK 0x4ull +#define SEND_DMA_CTRL_SDMA_ENABLE_SMASK 0x1ull +#define SEND_DMA_CTRL_SDMA_HALT_SMASK 0x2ull +#define SEND_DMA_CTRL_SDMA_INT_ENABLE_SMASK 0x8ull +#define SEND_DMA_DESC_CNT (TXE + 0x000000200050) +#define SEND_DMA_DESC_CNT_CNT_MASK 0xFFFFull +#define SEND_DMA_DESC_CNT_CNT_SHIFT 0 +#define SEND_DMA_ENG_ERR_CLEAR (TXE + 0x000000200070) +#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK 0x1ull +#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT 18 +#define SEND_DMA_ENG_ERR_MASK (TXE + 0x000000200068) +#define SEND_DMA_ENG_ERR_STATUS (TXE + 0x000000200060) +#define SEND_DMA_ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK 0x8000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK 0x4000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK 0x10ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK 0x2ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK 0x40ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK 0x800ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK 0x1000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK \ + 0x40000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK 0x400ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK \ + 0x20000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK 0x80ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK 0x20ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK \ + 0x100ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK \ + 0x10000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK 0x8ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK 0x2000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK 0x4ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK 0x1ull +#define SEND_DMA_ENGINES (TXE + 0x000000000018) +#define SEND_DMA_ERR_CLEAR (TXE + 0x000000000070) +#define SEND_DMA_ERR_MASK (TXE + 0x000000000068) +#define SEND_DMA_ERR_STATUS (TXE + 0x000000000060) +#define SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK 0x2ull +#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK 0x8ull +#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK 0x4ull +#define SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK 0x1ull +#define SEND_DMA_HEAD (TXE + 0x000000200028) +#define SEND_DMA_HEAD_ADDR (TXE + 0x000000200030) +#define SEND_DMA_LEN_GEN (TXE + 0x000000200018) +#define SEND_DMA_LEN_GEN_GENERATION_SHIFT 16 +#define SEND_DMA_LEN_GEN_LENGTH_SHIFT 6 +#define SEND_DMA_MEMORY (TXE + 0x0000002000B0) +#define SEND_DMA_MEMORY_SDMA_MEMORY_CNT_SHIFT 16 +#define SEND_DMA_MEMORY_SDMA_MEMORY_INDEX_SHIFT 0 +#define SEND_DMA_MEM_SIZE (TXE + 0x000000000028) +#define SEND_DMA_PRIORITY_THLD (TXE + 0x000000200038) +#define SEND_DMA_RELOAD_CNT (TXE + 0x000000200048) +#define SEND_DMA_STATUS (TXE + 0x000000200008) +#define SEND_DMA_STATUS_ENG_CLEANED_UP_SMASK 0x200000000000000ull +#define SEND_DMA_STATUS_ENG_HALTED_SMASK 0x100000000000000ull +#define SEND_DMA_TAIL (TXE + 0x000000200020) +#define SEND_EGRESS_CTXT_STATUS (TXE + 0x000000000800) +#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK 0x10000ull +#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT 0 +#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK \ + 0x3FFFull +#define SEND_EGRESS_ERR_CLEAR (TXE + 0x000000000090) +#define SEND_EGRESS_ERR_INFO (TXE + 0x000000000F00) +#define SEND_EGRESS_ERR_INFO_BAD_PKT_LEN_ERR_SMASK 0x20000ull +#define SEND_EGRESS_ERR_INFO_BYPASS_ERR_SMASK 0x800ull +#define SEND_EGRESS_ERR_INFO_GRH_ERR_SMASK 0x400ull +#define SEND_EGRESS_ERR_INFO_JOB_KEY_ERR_SMASK 0x4ull +#define SEND_EGRESS_ERR_INFO_KDETH_PACKETS_ERR_SMASK 0x1000ull +#define SEND_EGRESS_ERR_INFO_NON_KDETH_PACKETS_ERR_SMASK 0x2000ull +#define SEND_EGRESS_ERR_INFO_OPCODE_ERR_SMASK 0x20ull +#define SEND_EGRESS_ERR_INFO_PARTITION_KEY_ERR_SMASK 0x8ull +#define SEND_EGRESS_ERR_INFO_PBC_STATIC_RATE_CONTROL_ERR_SMASK 0x100000ull +#define SEND_EGRESS_ERR_INFO_PBC_TEST_ERR_SMASK 0x10000ull +#define SEND_EGRESS_ERR_INFO_RAW_ERR_SMASK 0x100ull +#define SEND_EGRESS_ERR_INFO_RAW_IPV6_ERR_SMASK 0x200ull +#define SEND_EGRESS_ERR_INFO_SLID_ERR_SMASK 0x10ull +#define SEND_EGRESS_ERR_INFO_TOO_LONG_BYPASS_PACKETS_ERR_SMASK 0x80000ull +#define SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK 0x40000ull +#define SEND_EGRESS_ERR_INFO_TOO_SMALL_BYPASS_PACKETS_ERR_SMASK 0x8000ull +#define SEND_EGRESS_ERR_INFO_TOO_SMALL_IB_PACKETS_ERR_SMASK 0x4000ull +#define SEND_EGRESS_ERR_INFO_VL_ERR_SMASK 0x2ull +#define SEND_EGRESS_ERR_INFO_VL_MAPPING_ERR_SMASK 0x40ull +#define SEND_EGRESS_ERR_MASK (TXE + 0x000000000088) +#define SEND_EGRESS_ERR_SOURCE (TXE + 0x000000000F08) +#define SEND_EGRESS_ERR_STATUS (TXE + 0x000000000080) +#define SEND_EGRESS_ERR_STATUS_TX_CONFIG_PARITY_ERR_SMASK 0x8000ull +#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_OVERRUN_ERR_SMASK \ + 0x200000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_PARITY_ERR_SMASK \ + 0x20000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK \ + 0x800000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_COR_ERR_SMASK \ + 0x2000000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNC_ERR_SMASK \ + 0x200000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR_SMASK \ + 0x8ull +#define SEND_EGRESS_ERR_STATUS_TX_HCRC_INSERTION_ERR_SMASK \ + 0x400000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_ILLEGAL_VL_ERR_SMASK 0x1000ull +#define SEND_EGRESS_ERR_STATUS_TX_INCORRECT_LINK_STATE_ERR_SMASK 0x20ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_CSR_PARITY_ERR_SMASK 0x2000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_COR_ERR_SMASK \ + 0x1000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR_SMASK \ + 0x100000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_COR_ERR_SMASK \ + 0x2000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR_SMASK \ + 0x200000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_COR_ERR_SMASK \ + 0x4000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR_SMASK \ + 0x400000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_COR_ERR_SMASK \ + 0x8000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR_SMASK \ + 0x800000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_COR_ERR_SMASK \ + 0x10000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR_SMASK \ + 0x1000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_COR_ERR_SMASK \ + 0x20000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR_SMASK \ + 0x2000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_COR_ERR_SMASK \ + 0x40000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR_SMASK \ + 0x4000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_COR_ERR_SMASK \ + 0x80000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR_SMASK \ + 0x8000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_COR_ERR_SMASK \ + 0x100000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR_SMASK \ + 0x10000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LINKDOWN_ERR_SMASK 0x10ull +#define SEND_EGRESS_ERR_STATUS_TX_PIO_LAUNCH_INTF_PARITY_ERR_SMASK 0x80ull +#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_COR_ERR_SMASK 0x1ull +#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_UNC_ERR_SMASK 0x2ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_COR_ERR_SMASK \ + 0x1000000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_CSR_UNC_ERR_SMASK \ + 0x8000000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_UNC_ERR_SMASK \ + 0x100000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_COR_ERR_SMASK \ + 0x800000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_CSR_UNC_ERR_SMASK \ + 0x4000000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_UNC_ERR_SMASK \ + 0x80000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_COR_ERR_SMASK 0x400000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_UNC_ERR_SMASK 0x40000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_CSR_PARITY_ERR_SMASK 0x4000ull +#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR_SMASK \ + 0x800ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA0_DISALLOWED_PACKET_ERR_SMASK \ + 0x10000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA10_DISALLOWED_PACKET_ERR_SMASK \ + 0x4000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA11_DISALLOWED_PACKET_ERR_SMASK \ + 0x8000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA12_DISALLOWED_PACKET_ERR_SMASK \ + 0x10000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA13_DISALLOWED_PACKET_ERR_SMASK \ + 0x20000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA14_DISALLOWED_PACKET_ERR_SMASK \ + 0x40000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA15_DISALLOWED_PACKET_ERR_SMASK \ + 0x80000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA1_DISALLOWED_PACKET_ERR_SMASK \ + 0x20000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA2_DISALLOWED_PACKET_ERR_SMASK \ + 0x40000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA3_DISALLOWED_PACKET_ERR_SMASK \ + 0x80000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA4_DISALLOWED_PACKET_ERR_SMASK \ + 0x100000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA5_DISALLOWED_PACKET_ERR_SMASK \ + 0x200000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA6_DISALLOWED_PACKET_ERR_SMASK \ + 0x400000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA7_DISALLOWED_PACKET_ERR_SMASK \ + 0x800000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA8_DISALLOWED_PACKET_ERR_SMASK \ + 0x1000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA9_DISALLOWED_PACKET_ERR_SMASK \ + 0x2000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA_LAUNCH_INTF_PARITY_ERR_SMASK \ + 0x100ull +#define SEND_EGRESS_SEND_DMA_STATUS (TXE + 0x000000000E00) +#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT 0 +#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \ + 0x3FFFull +#define SEND_ERR_CLEAR (TXE + 0x0000000000F0) +#define SEND_ERR_MASK (TXE + 0x0000000000E8) +#define SEND_ERR_STATUS (TXE + 0x0000000000E0) +#define SEND_ERR_STATUS_SEND_CSR_PARITY_ERR_SMASK 0x1ull +#define SEND_ERR_STATUS_SEND_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull +#define SEND_ERR_STATUS_SEND_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull +#define SEND_HIGH_PRIORITY_LIMIT (TXE + 0x000000000030) +#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK 0x3FFFull +#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT 0 +#define SEND_HIGH_PRIORITY_LIST (TXE + 0x000000000180) +#define SEND_LEN_CHECK0 (TXE + 0x0000000000D0) +#define SEND_LEN_CHECK0_LEN_VL0_MASK 0xFFFull +#define SEND_LEN_CHECK0_LEN_VL1_SHIFT 12 +#define SEND_LEN_CHECK1 (TXE + 0x0000000000D8) +#define SEND_LEN_CHECK1_LEN_VL15_MASK 0xFFFull +#define SEND_LEN_CHECK1_LEN_VL15_SHIFT 48 +#define SEND_LEN_CHECK1_LEN_VL4_MASK 0xFFFull +#define SEND_LEN_CHECK1_LEN_VL5_SHIFT 12 +#define SEND_LOW_PRIORITY_LIST (TXE + 0x000000000100) +#define SEND_LOW_PRIORITY_LIST_VL_MASK 0x7ull +#define SEND_LOW_PRIORITY_LIST_VL_SHIFT 16 +#define SEND_LOW_PRIORITY_LIST_WEIGHT_MASK 0xFFull +#define SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT 0 +#define SEND_PIO_ERR_CLEAR (TXE + 0x000000000050) +#define SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull +#define SEND_PIO_ERR_MASK (TXE + 0x000000000048) +#define SEND_PIO_ERR_STATUS (TXE + 0x000000000040) +#define SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \ + 0x1000000ull +#define SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK 0x8000ull +#define SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK 0x4ull +#define SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \ + 0x100000000ull +#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK 0x100000ull +#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK 0x80000ull +#define SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull +#define SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \ + 0x200000000ull +#define SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK 0x20ull +#define SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \ + 0x400000000ull +#define SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK 0x40ull +#define SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK \ + 0x800000000ull +#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK 0x200ull +#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK 0x40000ull +#define SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK 0x10000000ull +#define SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK 0x10000ull +#define SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK 0x20000000ull +#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK 0x8ull +#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK 0x10ull +#define SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK 0x80ull +#define SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \ + 0x100ull +#define SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK 0x400ull +#define SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK 0x400000ull +#define SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK 0x8000000ull +#define SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK 0x4000000ull +#define SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK 0x2000000ull +#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK 0x2000ull +#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK 0x800ull +#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK 0x4000ull +#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK 0x1000ull +#define SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK 0x2ull +#define SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK 0x1ull +#define SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK 0x200000ull +#define SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK 0x800000ull +#define SEND_PIO_INIT_CTXT (TXE + 0x000000000038) +#define SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK 0x1ull +#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK 0xFFull +#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT 8 +#define SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK 0x8ull +#define SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK 0x4ull +#define SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK 0x2ull +#define SEND_PIO_MEM_SIZE (TXE + 0x000000000020) +#define SEND_SC2VLT0 (TXE + 0x0000000000B0) +#define SEND_SC2VLT0_SC0_SHIFT 0 +#define SEND_SC2VLT0_SC1_SHIFT 8 +#define SEND_SC2VLT0_SC2_SHIFT 16 +#define SEND_SC2VLT0_SC3_SHIFT 24 +#define SEND_SC2VLT0_SC4_SHIFT 32 +#define SEND_SC2VLT0_SC5_SHIFT 40 +#define SEND_SC2VLT0_SC6_SHIFT 48 +#define SEND_SC2VLT0_SC7_SHIFT 56 +#define SEND_SC2VLT1 (TXE + 0x0000000000B8) +#define SEND_SC2VLT1_SC10_SHIFT 16 +#define SEND_SC2VLT1_SC11_SHIFT 24 +#define SEND_SC2VLT1_SC12_SHIFT 32 +#define SEND_SC2VLT1_SC13_SHIFT 40 +#define SEND_SC2VLT1_SC14_SHIFT 48 +#define SEND_SC2VLT1_SC15_SHIFT 56 +#define SEND_SC2VLT1_SC8_SHIFT 0 +#define SEND_SC2VLT1_SC9_SHIFT 8 +#define SEND_SC2VLT2 (TXE + 0x0000000000C0) +#define SEND_SC2VLT2_SC16_SHIFT 0 +#define SEND_SC2VLT2_SC17_SHIFT 8 +#define SEND_SC2VLT2_SC18_SHIFT 16 +#define SEND_SC2VLT2_SC19_SHIFT 24 +#define SEND_SC2VLT2_SC20_SHIFT 32 +#define SEND_SC2VLT2_SC21_SHIFT 40 +#define SEND_SC2VLT2_SC22_SHIFT 48 +#define SEND_SC2VLT2_SC23_SHIFT 56 +#define SEND_SC2VLT3 (TXE + 0x0000000000C8) +#define SEND_SC2VLT3_SC24_SHIFT 0 +#define SEND_SC2VLT3_SC25_SHIFT 8 +#define SEND_SC2VLT3_SC26_SHIFT 16 +#define SEND_SC2VLT3_SC27_SHIFT 24 +#define SEND_SC2VLT3_SC28_SHIFT 32 +#define SEND_SC2VLT3_SC29_SHIFT 40 +#define SEND_SC2VLT3_SC30_SHIFT 48 +#define SEND_SC2VLT3_SC31_SHIFT 56 +#define SEND_STATIC_RATE_CONTROL (TXE + 0x0000000000A8) +#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT 0 +#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK 0xFFFFull +#define PCIE_CFG_REG_PL2 (PCIE + 0x000000000708) +#define PCIE_CFG_REG_PL102 (PCIE + 0x000000000898) +#define PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT 12 +#define PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT 6 +#define PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT 0 +#define PCIE_CFG_REG_PL103 (PCIE + 0x00000000089C) +#define PCIE_CFG_REG_PL105 (PCIE + 0x0000000008A4) +#define PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK 0x1ull +#define PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT 24 +#define PCIE_CFG_REG_PL100 (PCIE + 0x000000000890) +#define PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK 0x400ull +#define PCIE_CFG_REG_PL101 (PCIE + 0x000000000894) +#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT 6 +#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT 0 +#define PCIE_CFG_REG_PL106 (PCIE + 0x0000000008A8) +#define PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT 8 +#define PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK 0x20ull +#define PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK 0x10ull +#define CCE_INT_BLOCKED (CCE + 0x000000110C00) +#define SEND_DMA_IDLE_CNT (TXE + 0x000000200040) +#define SEND_DMA_DESC_FETCHED_CNT (TXE + 0x000000200058) + +#endif /* DEF_CHIP_REG */ diff --git a/drivers/staging/rdma/hfi1/common.h b/drivers/staging/rdma/hfi1/common.h new file mode 100644 index 000000000..5f2293729 --- /dev/null +++ b/drivers/staging/rdma/hfi1/common.h @@ -0,0 +1,415 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _COMMON_H +#define _COMMON_H + +#include + +/* + * This file contains defines, structures, etc. that are used + * to communicate between kernel and user code. + */ + +/* version of protocol header (known to chip also). In the long run, + * we should be able to generate and accept a range of version numbers; + * for now we only accept one, and it's compiled in. + */ +#define IPS_PROTO_VERSION 2 + +/* + * These are compile time constants that you may want to enable or disable + * if you are trying to debug problems with code or performance. + * HFI1_VERBOSE_TRACING define as 1 if you want additional tracing in + * fast path code + * HFI1_TRACE_REGWRITES define as 1 if you want register writes to be + * traced in fast path code + * _HFI1_TRACING define as 0 if you want to remove all tracing in a + * compilation unit + */ + +/* + * If a packet's QP[23:16] bits match this value, then it is + * a PSM packet and the hardware will expect a KDETH header + * following the BTH. + */ +#define DEFAULT_KDETH_QP 0x80 + +/* driver/hw feature set bitmask */ +#define HFI1_CAP_USER_SHIFT 24 +#define HFI1_CAP_MASK ((1UL << HFI1_CAP_USER_SHIFT) - 1) +/* locked flag - if set, only HFI1_CAP_WRITABLE_MASK bits can be set */ +#define HFI1_CAP_LOCKED_SHIFT 63 +#define HFI1_CAP_LOCKED_MASK 0x1ULL +#define HFI1_CAP_LOCKED_SMASK (HFI1_CAP_LOCKED_MASK << HFI1_CAP_LOCKED_SHIFT) +/* extra bits used between kernel and user processes */ +#define HFI1_CAP_MISC_SHIFT (HFI1_CAP_USER_SHIFT * 2) +#define HFI1_CAP_MISC_MASK ((1ULL << (HFI1_CAP_LOCKED_SHIFT - \ + HFI1_CAP_MISC_SHIFT)) - 1) + +#define HFI1_CAP_KSET(cap) ({ hfi1_cap_mask |= HFI1_CAP_##cap; hfi1_cap_mask; }) +#define HFI1_CAP_KCLEAR(cap) \ + ({ \ + hfi1_cap_mask &= ~HFI1_CAP_##cap; \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_USET(cap) \ + ({ \ + hfi1_cap_mask |= (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_UCLEAR(cap) \ + ({ \ + hfi1_cap_mask &= ~(HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_SET(cap) \ + ({ \ + hfi1_cap_mask |= (HFI1_CAP_##cap | (HFI1_CAP_##cap << \ + HFI1_CAP_USER_SHIFT)); \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_CLEAR(cap) \ + ({ \ + hfi1_cap_mask &= ~(HFI1_CAP_##cap | \ + (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT)); \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_LOCK() \ + ({ hfi1_cap_mask |= HFI1_CAP_LOCKED_SMASK; hfi1_cap_mask; }) +#define HFI1_CAP_LOCKED() (!!(hfi1_cap_mask & HFI1_CAP_LOCKED_SMASK)) +/* + * The set of capability bits that can be changed after initial load + * This set is the same for kernel and user contexts. However, for + * user contexts, the set can be further filtered by using the + * HFI1_CAP_RESERVED_MASK bits. + */ +#define HFI1_CAP_WRITABLE_MASK (HFI1_CAP_SDMA_AHG | \ + HFI1_CAP_HDRSUPP | \ + HFI1_CAP_MULTI_PKT_EGR | \ + HFI1_CAP_NODROP_RHQ_FULL | \ + HFI1_CAP_NODROP_EGR_FULL | \ + HFI1_CAP_ALLOW_PERM_JKEY | \ + HFI1_CAP_STATIC_RATE_CTRL | \ + HFI1_CAP_PRINT_UNIMPL) +/* + * A set of capability bits that are "global" and are not allowed to be + * set in the user bitmask. + */ +#define HFI1_CAP_RESERVED_MASK ((HFI1_CAP_SDMA | \ + HFI1_CAP_USE_SDMA_HEAD | \ + HFI1_CAP_EXTENDED_PSN | \ + HFI1_CAP_PRINT_UNIMPL | \ + HFI1_CAP_QSFP_ENABLED | \ + HFI1_CAP_NO_INTEGRITY | \ + HFI1_CAP_PKEY_CHECK) << \ + HFI1_CAP_USER_SHIFT) +/* + * Set of capabilities that need to be enabled for kernel context in + * order to be allowed for user contexts, as well. + */ +#define HFI1_CAP_MUST_HAVE_KERN (HFI1_CAP_STATIC_RATE_CTRL) +/* Default enabled capabilities (both kernel and user) */ +#define HFI1_CAP_MASK_DEFAULT (HFI1_CAP_HDRSUPP | \ + HFI1_CAP_NODROP_RHQ_FULL | \ + HFI1_CAP_NODROP_EGR_FULL | \ + HFI1_CAP_SDMA | \ + HFI1_CAP_PRINT_UNIMPL | \ + HFI1_CAP_STATIC_RATE_CTRL | \ + HFI1_CAP_QSFP_ENABLED | \ + HFI1_CAP_PKEY_CHECK | \ + HFI1_CAP_MULTI_PKT_EGR | \ + HFI1_CAP_EXTENDED_PSN | \ + ((HFI1_CAP_HDRSUPP | \ + HFI1_CAP_MULTI_PKT_EGR | \ + HFI1_CAP_STATIC_RATE_CTRL | \ + HFI1_CAP_PKEY_CHECK | \ + HFI1_CAP_EARLY_CREDIT_RETURN) << \ + HFI1_CAP_USER_SHIFT)) +/* + * A bitmask of kernel/global capabilities that should be communicated + * to user level processes. + */ +#define HFI1_CAP_K2U (HFI1_CAP_SDMA | \ + HFI1_CAP_EXTENDED_PSN | \ + HFI1_CAP_PKEY_CHECK | \ + HFI1_CAP_NO_INTEGRITY) + +#define HFI1_USER_SWVERSION ((HFI1_USER_SWMAJOR << 16) | HFI1_USER_SWMINOR) + +#ifndef HFI1_KERN_TYPE +#define HFI1_KERN_TYPE 0 +#endif + +/* + * Similarly, this is the kernel version going back to the user. It's + * slightly different, in that we want to tell if the driver was built as + * part of a Intel release, or from the driver from openfabrics.org, + * kernel.org, or a standard distribution, for support reasons. + * The high bit is 0 for non-Intel and 1 for Intel-built/supplied. + * + * It's returned by the driver to the user code during initialization in the + * spi_sw_version field of hfi1_base_info, so the user code can in turn + * check for compatibility with the kernel. +*/ +#define HFI1_KERN_SWVERSION ((HFI1_KERN_TYPE << 31) | HFI1_USER_SWVERSION) + +/* + * Define the driver version number. This is something that refers only + * to the driver itself, not the software interfaces it supports. + */ +#ifndef HFI1_DRIVER_VERSION_BASE +#define HFI1_DRIVER_VERSION_BASE "0.9-248" +#endif + +/* create the final driver version string */ +#ifdef HFI1_IDSTR +#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE " " HFI1_IDSTR +#else +#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE +#endif + +/* + * Diagnostics can send a packet by writing the following + * struct to the diag packet special file. + * + * This allows a custom PBC qword, so that special modes and deliberate + * changes to CRCs can be used. + */ +#define _DIAG_PKT_VERS 1 +struct diag_pkt { + __u16 version; /* structure version */ + __u16 unit; /* which device */ + __u16 sw_index; /* send sw index to use */ + __u16 len; /* data length, in bytes */ + __u16 port; /* port number */ + __u16 unused; + __u32 flags; /* call flags */ + __u64 data; /* user data pointer */ + __u64 pbc; /* PBC for the packet */ +}; + +/* diag_pkt flags */ +#define F_DIAGPKT_WAIT 0x1 /* wait until packet is sent */ + +/* + * The next set of defines are for packet headers, and chip register + * and memory bits that are visible to and/or used by user-mode software. + */ + +/* + * Receive Header Flags + */ +#define RHF_PKT_LEN_SHIFT 0 +#define RHF_PKT_LEN_MASK 0xfffull +#define RHF_PKT_LEN_SMASK (RHF_PKT_LEN_MASK << RHF_PKT_LEN_SHIFT) + +#define RHF_RCV_TYPE_SHIFT 12 +#define RHF_RCV_TYPE_MASK 0x7ull +#define RHF_RCV_TYPE_SMASK (RHF_RCV_TYPE_MASK << RHF_RCV_TYPE_SHIFT) + +#define RHF_USE_EGR_BFR_SHIFT 15 +#define RHF_USE_EGR_BFR_MASK 0x1ull +#define RHF_USE_EGR_BFR_SMASK (RHF_USE_EGR_BFR_MASK << RHF_USE_EGR_BFR_SHIFT) + +#define RHF_EGR_INDEX_SHIFT 16 +#define RHF_EGR_INDEX_MASK 0x7ffull +#define RHF_EGR_INDEX_SMASK (RHF_EGR_INDEX_MASK << RHF_EGR_INDEX_SHIFT) + +#define RHF_DC_INFO_SHIFT 27 +#define RHF_DC_INFO_MASK 0x1ull +#define RHF_DC_INFO_SMASK (RHF_DC_INFO_MASK << RHF_DC_INFO_SHIFT) + +#define RHF_RCV_SEQ_SHIFT 28 +#define RHF_RCV_SEQ_MASK 0xfull +#define RHF_RCV_SEQ_SMASK (RHF_RCV_SEQ_MASK << RHF_RCV_SEQ_SHIFT) + +#define RHF_EGR_OFFSET_SHIFT 32 +#define RHF_EGR_OFFSET_MASK 0xfffull +#define RHF_EGR_OFFSET_SMASK (RHF_EGR_OFFSET_MASK << RHF_EGR_OFFSET_SHIFT) +#define RHF_HDRQ_OFFSET_SHIFT 44 +#define RHF_HDRQ_OFFSET_MASK 0x1ffull +#define RHF_HDRQ_OFFSET_SMASK (RHF_HDRQ_OFFSET_MASK << RHF_HDRQ_OFFSET_SHIFT) +#define RHF_K_HDR_LEN_ERR (0x1ull << 53) +#define RHF_DC_UNC_ERR (0x1ull << 54) +#define RHF_DC_ERR (0x1ull << 55) +#define RHF_RCV_TYPE_ERR_SHIFT 56 +#define RHF_RCV_TYPE_ERR_MASK 0x7ul +#define RHF_RCV_TYPE_ERR_SMASK (RHF_RCV_TYPE_ERR_MASK << RHF_RCV_TYPE_ERR_SHIFT) +#define RHF_TID_ERR (0x1ull << 59) +#define RHF_LEN_ERR (0x1ull << 60) +#define RHF_ECC_ERR (0x1ull << 61) +#define RHF_VCRC_ERR (0x1ull << 62) +#define RHF_ICRC_ERR (0x1ull << 63) + +#define RHF_ERROR_SMASK 0xffe0000000000000ull /* bits 63:53 */ + +/* RHF receive types */ +#define RHF_RCV_TYPE_EXPECTED 0 +#define RHF_RCV_TYPE_EAGER 1 +#define RHF_RCV_TYPE_IB 2 /* normal IB, IB Raw, or IPv6 */ +#define RHF_RCV_TYPE_ERROR 3 +#define RHF_RCV_TYPE_BYPASS 4 +#define RHF_RCV_TYPE_INVALID5 5 +#define RHF_RCV_TYPE_INVALID6 6 +#define RHF_RCV_TYPE_INVALID7 7 + +/* RHF receive type error - expected packet errors */ +#define RHF_RTE_EXPECTED_FLOW_SEQ_ERR 0x2 +#define RHF_RTE_EXPECTED_FLOW_GEN_ERR 0x4 + +/* RHF receive type error - eager packet errors */ +#define RHF_RTE_EAGER_NO_ERR 0x0 + +/* RHF receive type error - IB packet errors */ +#define RHF_RTE_IB_NO_ERR 0x0 + +/* RHF receive type error - error packet errors */ +#define RHF_RTE_ERROR_NO_ERR 0x0 +#define RHF_RTE_ERROR_OP_CODE_ERR 0x1 +#define RHF_RTE_ERROR_KHDR_MIN_LEN_ERR 0x2 +#define RHF_RTE_ERROR_KHDR_HCRC_ERR 0x3 +#define RHF_RTE_ERROR_KHDR_KVER_ERR 0x4 +#define RHF_RTE_ERROR_CONTEXT_ERR 0x5 +#define RHF_RTE_ERROR_KHDR_TID_ERR 0x6 + +/* RHF receive type error - bypass packet errors */ +#define RHF_RTE_BYPASS_NO_ERR 0x0 + +/* + * This structure contains the first field common to all protocols + * that employ this chip. + */ +struct hfi1_message_header { + __be16 lrh[4]; +}; + +/* IB - LRH header constants */ +#define HFI1_LRH_GRH 0x0003 /* 1. word of IB LRH - next header: GRH */ +#define HFI1_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */ + +/* misc. */ +#define SIZE_OF_CRC 1 + +#define LIM_MGMT_P_KEY 0x7FFF +#define FULL_MGMT_P_KEY 0xFFFF + +#define DEFAULT_P_KEY LIM_MGMT_P_KEY +#define HFI1_PERMISSIVE_LID 0xFFFF +#define HFI1_AETH_CREDIT_SHIFT 24 +#define HFI1_AETH_CREDIT_MASK 0x1F +#define HFI1_AETH_CREDIT_INVAL 0x1F +#define HFI1_MSN_MASK 0xFFFFFF +#define HFI1_QPN_MASK 0xFFFFFF +#define HFI1_FECN_SHIFT 31 +#define HFI1_FECN_MASK 1 +#define HFI1_FECN_SMASK (1 << HFI1_FECN_SHIFT) +#define HFI1_BECN_SHIFT 30 +#define HFI1_BECN_MASK 1 +#define HFI1_BECN_SMASK (1 << HFI1_BECN_SHIFT) +#define HFI1_MULTICAST_LID_BASE 0xC000 + +static inline __u64 rhf_to_cpu(const __le32 *rbuf) +{ + return __le64_to_cpu(*((__le64 *)rbuf)); +} + +static inline u64 rhf_err_flags(u64 rhf) +{ + return rhf & RHF_ERROR_SMASK; +} + +static inline u32 rhf_rcv_type(u64 rhf) +{ + return (rhf >> RHF_RCV_TYPE_SHIFT) & RHF_RCV_TYPE_MASK; +} + +static inline u32 rhf_rcv_type_err(u64 rhf) +{ + return (rhf >> RHF_RCV_TYPE_ERR_SHIFT) & RHF_RCV_TYPE_ERR_MASK; +} + +/* return size is in bytes, not DWORDs */ +static inline u32 rhf_pkt_len(u64 rhf) +{ + return ((rhf & RHF_PKT_LEN_SMASK) >> RHF_PKT_LEN_SHIFT) << 2; +} + +static inline u32 rhf_egr_index(u64 rhf) +{ + return (rhf >> RHF_EGR_INDEX_SHIFT) & RHF_EGR_INDEX_MASK; +} + +static inline u32 rhf_rcv_seq(u64 rhf) +{ + return (rhf >> RHF_RCV_SEQ_SHIFT) & RHF_RCV_SEQ_MASK; +} + +/* returned offset is in DWORDS */ +static inline u32 rhf_hdrq_offset(u64 rhf) +{ + return (rhf >> RHF_HDRQ_OFFSET_SHIFT) & RHF_HDRQ_OFFSET_MASK; +} + +static inline u64 rhf_use_egr_bfr(u64 rhf) +{ + return rhf & RHF_USE_EGR_BFR_SMASK; +} + +static inline u64 rhf_dc_info(u64 rhf) +{ + return rhf & RHF_DC_INFO_SMASK; +} + +static inline u32 rhf_egr_buf_offset(u64 rhf) +{ + return (rhf >> RHF_EGR_OFFSET_SHIFT) & RHF_EGR_OFFSET_MASK; +} +#endif /* _COMMON_H */ diff --git a/drivers/staging/rdma/hfi1/cq.c b/drivers/staging/rdma/hfi1/cq.c new file mode 100644 index 000000000..4f046ffe7 --- /dev/null +++ b/drivers/staging/rdma/hfi1/cq.c @@ -0,0 +1,558 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include + +#include "verbs.h" +#include "hfi.h" + +/** + * hfi1_cq_enter - add a new entry to the completion queue + * @cq: completion queue + * @entry: work completion entry to add + * @sig: true if @entry is a solicited entry + * + * This may be called with qp->s_lock held. + */ +void hfi1_cq_enter(struct hfi1_cq *cq, struct ib_wc *entry, int solicited) +{ + struct hfi1_cq_wc *wc; + unsigned long flags; + u32 head; + u32 next; + + spin_lock_irqsave(&cq->lock, flags); + + /* + * Note that the head pointer might be writable by user processes. + * Take care to verify it is a sane value. + */ + wc = cq->queue; + head = wc->head; + if (head >= (unsigned) cq->ibcq.cqe) { + head = cq->ibcq.cqe; + next = 0; + } else + next = head + 1; + if (unlikely(next == wc->tail)) { + spin_unlock_irqrestore(&cq->lock, flags); + if (cq->ibcq.event_handler) { + struct ib_event ev; + + ev.device = cq->ibcq.device; + ev.element.cq = &cq->ibcq; + ev.event = IB_EVENT_CQ_ERR; + cq->ibcq.event_handler(&ev, cq->ibcq.cq_context); + } + return; + } + if (cq->ip) { + wc->uqueue[head].wr_id = entry->wr_id; + wc->uqueue[head].status = entry->status; + wc->uqueue[head].opcode = entry->opcode; + wc->uqueue[head].vendor_err = entry->vendor_err; + wc->uqueue[head].byte_len = entry->byte_len; + wc->uqueue[head].ex.imm_data = + (__u32 __force)entry->ex.imm_data; + wc->uqueue[head].qp_num = entry->qp->qp_num; + wc->uqueue[head].src_qp = entry->src_qp; + wc->uqueue[head].wc_flags = entry->wc_flags; + wc->uqueue[head].pkey_index = entry->pkey_index; + wc->uqueue[head].slid = entry->slid; + wc->uqueue[head].sl = entry->sl; + wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits; + wc->uqueue[head].port_num = entry->port_num; + /* Make sure entry is written before the head index. */ + smp_wmb(); + } else + wc->kqueue[head] = *entry; + wc->head = next; + + if (cq->notify == IB_CQ_NEXT_COMP || + (cq->notify == IB_CQ_SOLICITED && + (solicited || entry->status != IB_WC_SUCCESS))) { + struct kthread_worker *worker; + /* + * This will cause send_complete() to be called in + * another thread. + */ + smp_read_barrier_depends(); /* see hfi1_cq_exit */ + worker = cq->dd->worker; + if (likely(worker)) { + cq->notify = IB_CQ_NONE; + cq->triggered++; + queue_kthread_work(worker, &cq->comptask); + } + } + + spin_unlock_irqrestore(&cq->lock, flags); +} + +/** + * hfi1_poll_cq - poll for work completion entries + * @ibcq: the completion queue to poll + * @num_entries: the maximum number of entries to return + * @entry: pointer to array where work completions are placed + * + * Returns the number of completion entries polled. + * + * This may be called from interrupt context. Also called by ib_poll_cq() + * in the generic verbs code. + */ +int hfi1_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) +{ + struct hfi1_cq *cq = to_icq(ibcq); + struct hfi1_cq_wc *wc; + unsigned long flags; + int npolled; + u32 tail; + + /* The kernel can only poll a kernel completion queue */ + if (cq->ip) { + npolled = -EINVAL; + goto bail; + } + + spin_lock_irqsave(&cq->lock, flags); + + wc = cq->queue; + tail = wc->tail; + if (tail > (u32) cq->ibcq.cqe) + tail = (u32) cq->ibcq.cqe; + for (npolled = 0; npolled < num_entries; ++npolled, ++entry) { + if (tail == wc->head) + break; + /* The kernel doesn't need a RMB since it has the lock. */ + *entry = wc->kqueue[tail]; + if (tail >= cq->ibcq.cqe) + tail = 0; + else + tail++; + } + wc->tail = tail; + + spin_unlock_irqrestore(&cq->lock, flags); + +bail: + return npolled; +} + +static void send_complete(struct kthread_work *work) +{ + struct hfi1_cq *cq = container_of(work, struct hfi1_cq, comptask); + + /* + * The completion handler will most likely rearm the notification + * and poll for all pending entries. If a new completion entry + * is added while we are in this routine, queue_work() + * won't call us again until we return so we check triggered to + * see if we need to call the handler again. + */ + for (;;) { + u8 triggered = cq->triggered; + + /* + * IPoIB connected mode assumes the callback is from a + * soft IRQ. We simulate this by blocking "bottom halves". + * See the implementation for ipoib_cm_handle_tx_wc(), + * netif_tx_lock_bh() and netif_tx_lock(). + */ + local_bh_disable(); + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); + local_bh_enable(); + + if (cq->triggered == triggered) + return; + } +} + +/** + * hfi1_create_cq - create a completion queue + * @ibdev: the device this completion queue is attached to + * @attr: creation attributes + * @context: unused by the driver + * @udata: user data for libibverbs.so + * + * Returns a pointer to the completion queue or negative errno values + * for failure. + * + * Called by ib_create_cq() in the generic verbs code. + */ +struct ib_cq *hfi1_create_cq( + struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct hfi1_ibdev *dev = to_idev(ibdev); + struct hfi1_cq *cq; + struct hfi1_cq_wc *wc; + struct ib_cq *ret; + u32 sz; + unsigned int entries = attr->cqe; + + if (attr->flags) + return ERR_PTR(-EINVAL); + + if (entries < 1 || entries > hfi1_max_cqes) + return ERR_PTR(-EINVAL); + + /* Allocate the completion queue structure. */ + cq = kmalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + return ERR_PTR(-ENOMEM); + + /* + * Allocate the completion queue entries and head/tail pointers. + * This is allocated separately so that it can be resized and + * also mapped into user space. + * We need to use vmalloc() in order to support mmap and large + * numbers of entries. + */ + sz = sizeof(*wc); + if (udata && udata->outlen >= sizeof(__u64)) + sz += sizeof(struct ib_uverbs_wc) * (entries + 1); + else + sz += sizeof(struct ib_wc) * (entries + 1); + wc = vmalloc_user(sz); + if (!wc) { + ret = ERR_PTR(-ENOMEM); + goto bail_cq; + } + + /* + * Return the address of the WC as the offset to mmap. + * See hfi1_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + int err; + + cq->ip = hfi1_create_mmap_info(dev, sz, context, wc); + if (!cq->ip) { + ret = ERR_PTR(-ENOMEM); + goto bail_wc; + } + + err = ib_copy_to_udata(udata, &cq->ip->offset, + sizeof(cq->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } else + cq->ip = NULL; + + spin_lock(&dev->n_cqs_lock); + if (dev->n_cqs_allocated == hfi1_max_cqs) { + spin_unlock(&dev->n_cqs_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + dev->n_cqs_allocated++; + spin_unlock(&dev->n_cqs_lock); + + if (cq->ip) { + spin_lock_irq(&dev->pending_lock); + list_add(&cq->ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + /* + * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe. + * The number of entries should be >= the number requested or return + * an error. + */ + cq->dd = dd_from_dev(dev); + cq->ibcq.cqe = entries; + cq->notify = IB_CQ_NONE; + cq->triggered = 0; + spin_lock_init(&cq->lock); + init_kthread_work(&cq->comptask, send_complete); + wc->head = 0; + wc->tail = 0; + cq->queue = wc; + + ret = &cq->ibcq; + + goto done; + +bail_ip: + kfree(cq->ip); +bail_wc: + vfree(wc); +bail_cq: + kfree(cq); +done: + return ret; +} + +/** + * hfi1_destroy_cq - destroy a completion queue + * @ibcq: the completion queue to destroy. + * + * Returns 0 for success. + * + * Called by ib_destroy_cq() in the generic verbs code. + */ +int hfi1_destroy_cq(struct ib_cq *ibcq) +{ + struct hfi1_ibdev *dev = to_idev(ibcq->device); + struct hfi1_cq *cq = to_icq(ibcq); + + flush_kthread_work(&cq->comptask); + spin_lock(&dev->n_cqs_lock); + dev->n_cqs_allocated--; + spin_unlock(&dev->n_cqs_lock); + if (cq->ip) + kref_put(&cq->ip->ref, hfi1_release_mmap_info); + else + vfree(cq->queue); + kfree(cq); + + return 0; +} + +/** + * hfi1_req_notify_cq - change the notification type for a completion queue + * @ibcq: the completion queue + * @notify_flags: the type of notification to request + * + * Returns 0 for success. + * + * This may be called from interrupt context. Also called by + * ib_req_notify_cq() in the generic verbs code. + */ +int hfi1_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) +{ + struct hfi1_cq *cq = to_icq(ibcq); + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&cq->lock, flags); + /* + * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow + * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2). + */ + if (cq->notify != IB_CQ_NEXT_COMP) + cq->notify = notify_flags & IB_CQ_SOLICITED_MASK; + + if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) && + cq->queue->head != cq->queue->tail) + ret = 1; + + spin_unlock_irqrestore(&cq->lock, flags); + + return ret; +} + +/** + * hfi1_resize_cq - change the size of the CQ + * @ibcq: the completion queue + * + * Returns 0 for success. + */ +int hfi1_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) +{ + struct hfi1_cq *cq = to_icq(ibcq); + struct hfi1_cq_wc *old_wc; + struct hfi1_cq_wc *wc; + u32 head, tail, n; + int ret; + u32 sz; + + if (cqe < 1 || cqe > hfi1_max_cqes) { + ret = -EINVAL; + goto bail; + } + + /* + * Need to use vmalloc() if we want to support large #s of entries. + */ + sz = sizeof(*wc); + if (udata && udata->outlen >= sizeof(__u64)) + sz += sizeof(struct ib_uverbs_wc) * (cqe + 1); + else + sz += sizeof(struct ib_wc) * (cqe + 1); + wc = vmalloc_user(sz); + if (!wc) { + ret = -ENOMEM; + goto bail; + } + + /* Check that we can write the offset to mmap. */ + if (udata && udata->outlen >= sizeof(__u64)) { + __u64 offset = 0; + + ret = ib_copy_to_udata(udata, &offset, sizeof(offset)); + if (ret) + goto bail_free; + } + + spin_lock_irq(&cq->lock); + /* + * Make sure head and tail are sane since they + * might be user writable. + */ + old_wc = cq->queue; + head = old_wc->head; + if (head > (u32) cq->ibcq.cqe) + head = (u32) cq->ibcq.cqe; + tail = old_wc->tail; + if (tail > (u32) cq->ibcq.cqe) + tail = (u32) cq->ibcq.cqe; + if (head < tail) + n = cq->ibcq.cqe + 1 + head - tail; + else + n = head - tail; + if (unlikely((u32)cqe < n)) { + ret = -EINVAL; + goto bail_unlock; + } + for (n = 0; tail != head; n++) { + if (cq->ip) + wc->uqueue[n] = old_wc->uqueue[tail]; + else + wc->kqueue[n] = old_wc->kqueue[tail]; + if (tail == (u32) cq->ibcq.cqe) + tail = 0; + else + tail++; + } + cq->ibcq.cqe = cqe; + wc->head = n; + wc->tail = 0; + cq->queue = wc; + spin_unlock_irq(&cq->lock); + + vfree(old_wc); + + if (cq->ip) { + struct hfi1_ibdev *dev = to_idev(ibcq->device); + struct hfi1_mmap_info *ip = cq->ip; + + hfi1_update_mmap_info(dev, ip, sz, wc); + + /* + * Return the offset to mmap. + * See hfi1_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + ret = ib_copy_to_udata(udata, &ip->offset, + sizeof(ip->offset)); + if (ret) + goto bail; + } + + spin_lock_irq(&dev->pending_lock); + if (list_empty(&ip->pending_mmaps)) + list_add(&ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + ret = 0; + goto bail; + +bail_unlock: + spin_unlock_irq(&cq->lock); +bail_free: + vfree(wc); +bail: + return ret; +} + +int hfi1_cq_init(struct hfi1_devdata *dd) +{ + int ret = 0; + int cpu; + struct task_struct *task; + + if (dd->worker) + return 0; + dd->worker = kzalloc(sizeof(*dd->worker), GFP_KERNEL); + if (!dd->worker) + return -ENOMEM; + init_kthread_worker(dd->worker); + task = kthread_create_on_node( + kthread_worker_fn, + dd->worker, + dd->assigned_node_id, + "hfi1_cq%d", dd->unit); + if (IS_ERR(task)) + goto task_fail; + cpu = cpumask_first(cpumask_of_node(dd->assigned_node_id)); + kthread_bind(task, cpu); + wake_up_process(task); +out: + return ret; +task_fail: + ret = PTR_ERR(task); + kfree(dd->worker); + dd->worker = NULL; + goto out; +} + +void hfi1_cq_exit(struct hfi1_devdata *dd) +{ + struct kthread_worker *worker; + + worker = dd->worker; + if (!worker) + return; + /* blocks future queuing from send_complete() */ + dd->worker = NULL; + smp_wmb(); /* See hfi1_cq_enter */ + flush_kthread_worker(worker); + kthread_stop(worker->task); + kfree(worker); +} diff --git a/drivers/staging/rdma/hfi1/debugfs.c b/drivers/staging/rdma/hfi1/debugfs.c new file mode 100644 index 000000000..acd2269e9 --- /dev/null +++ b/drivers/staging/rdma/hfi1/debugfs.c @@ -0,0 +1,899 @@ +#ifdef CONFIG_DEBUG_FS +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include +#include +#include +#include + +#include "hfi.h" +#include "debugfs.h" +#include "device.h" +#include "qp.h" +#include "sdma.h" + +static struct dentry *hfi1_dbg_root; + +#define private2dd(file) (file_inode(file)->i_private) +#define private2ppd(file) (file_inode(file)->i_private) + +#define DEBUGFS_SEQ_FILE_OPS(name) \ +static const struct seq_operations _##name##_seq_ops = { \ + .start = _##name##_seq_start, \ + .next = _##name##_seq_next, \ + .stop = _##name##_seq_stop, \ + .show = _##name##_seq_show \ +} +#define DEBUGFS_SEQ_FILE_OPEN(name) \ +static int _##name##_open(struct inode *inode, struct file *s) \ +{ \ + struct seq_file *seq; \ + int ret; \ + ret = seq_open(s, &_##name##_seq_ops); \ + if (ret) \ + return ret; \ + seq = s->private_data; \ + seq->private = inode->i_private; \ + return 0; \ +} + +#define DEBUGFS_FILE_OPS(name) \ +static const struct file_operations _##name##_file_ops = { \ + .owner = THIS_MODULE, \ + .open = _##name##_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = seq_release \ +} + +#define DEBUGFS_FILE_CREATE(name, parent, data, ops, mode) \ +do { \ + struct dentry *ent; \ + ent = debugfs_create_file(name, mode, parent, \ + data, ops); \ + if (!ent) \ + pr_warn("create of %s failed\n", name); \ +} while (0) + + +#define DEBUGFS_SEQ_FILE_CREATE(name, parent, data) \ + DEBUGFS_FILE_CREATE(#name, parent, data, &_##name##_file_ops, S_IRUGO) + +static void *_opcode_stats_seq_start(struct seq_file *s, loff_t *pos) +__acquires(RCU) +{ + struct hfi1_opcode_stats_perctx *opstats; + + rcu_read_lock(); + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + +static void *_opcode_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct hfi1_opcode_stats_perctx *opstats; + + ++*pos; + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + + +static void _opcode_stats_seq_stop(struct seq_file *s, void *v) +__releases(RCU) +{ + rcu_read_unlock(); +} + +static int _opcode_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos = v; + loff_t i = *spos, j; + u64 n_packets = 0, n_bytes = 0; + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + + for (j = 0; j < dd->first_user_ctxt; j++) { + if (!dd->rcd[j]) + continue; + n_packets += dd->rcd[j]->opstats->stats[i].n_packets; + n_bytes += dd->rcd[j]->opstats->stats[i].n_bytes; + } + if (!n_packets && !n_bytes) + return SEQ_SKIP; + seq_printf(s, "%02llx %llu/%llu\n", i, + (unsigned long long) n_packets, + (unsigned long long) n_bytes); + + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(opcode_stats); +DEBUGFS_SEQ_FILE_OPEN(opcode_stats) +DEBUGFS_FILE_OPS(opcode_stats); + +static void *_ctx_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + + if (!*pos) + return SEQ_START_TOKEN; + if (*pos >= dd->first_user_ctxt) + return NULL; + return pos; +} + +static void *_ctx_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + + if (v == SEQ_START_TOKEN) + return pos; + + ++*pos; + if (*pos >= dd->first_user_ctxt) + return NULL; + return pos; +} + +static void _ctx_stats_seq_stop(struct seq_file *s, void *v) +{ + /* nothing allocated */ +} + +static int _ctx_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos; + loff_t i, j; + u64 n_packets = 0; + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + + if (v == SEQ_START_TOKEN) { + seq_puts(s, "Ctx:npkts\n"); + return 0; + } + + spos = v; + i = *spos; + + if (!dd->rcd[i]) + return SEQ_SKIP; + + for (j = 0; j < ARRAY_SIZE(dd->rcd[i]->opstats->stats); j++) + n_packets += dd->rcd[i]->opstats->stats[j].n_packets; + + if (!n_packets) + return SEQ_SKIP; + + seq_printf(s, " %llu:%llu\n", i, n_packets); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(ctx_stats); +DEBUGFS_SEQ_FILE_OPEN(ctx_stats) +DEBUGFS_FILE_OPS(ctx_stats); + +static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos) +__acquires(RCU) +{ + struct qp_iter *iter; + loff_t n = *pos; + + rcu_read_lock(); + iter = qp_iter_init(s->private); + if (!iter) + return NULL; + + while (n--) { + if (qp_iter_next(iter)) { + kfree(iter); + return NULL; + } + } + + return iter; +} + +static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr, + loff_t *pos) +{ + struct qp_iter *iter = iter_ptr; + + (*pos)++; + + if (qp_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr) +__releases(RCU) +{ + rcu_read_unlock(); +} + +static int _qp_stats_seq_show(struct seq_file *s, void *iter_ptr) +{ + struct qp_iter *iter = iter_ptr; + + if (!iter) + return 0; + + qp_iter_print(s, iter); + + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(qp_stats); +DEBUGFS_SEQ_FILE_OPEN(qp_stats) +DEBUGFS_FILE_OPS(qp_stats); + +static void *_sdes_seq_start(struct seq_file *s, loff_t *pos) +__acquires(RCU) +{ + struct hfi1_ibdev *ibd; + struct hfi1_devdata *dd; + + rcu_read_lock(); + ibd = (struct hfi1_ibdev *)s->private; + dd = dd_from_dev(ibd); + if (!dd->per_sdma || *pos >= dd->num_sdma) + return NULL; + return pos; +} + +static void *_sdes_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + + ++*pos; + if (!dd->per_sdma || *pos >= dd->num_sdma) + return NULL; + return pos; +} + + +static void _sdes_seq_stop(struct seq_file *s, void *v) +__releases(RCU) +{ + rcu_read_unlock(); +} + +static int _sdes_seq_show(struct seq_file *s, void *v) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + loff_t *spos = v; + loff_t i = *spos; + + sdma_seqfile_dump_sde(s, &dd->per_sdma[i]); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(sdes); +DEBUGFS_SEQ_FILE_OPEN(sdes) +DEBUGFS_FILE_OPS(sdes); + +/* read the per-device counters */ +static ssize_t dev_counters_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 *counters; + size_t avail; + struct hfi1_devdata *dd; + ssize_t rval; + + rcu_read_lock(); + dd = private2dd(file); + avail = hfi1_read_cntrs(dd, *ppos, NULL, &counters); + rval = simple_read_from_buffer(buf, count, ppos, counters, avail); + rcu_read_unlock(); + return rval; +} + +/* read the per-device counters */ +static ssize_t dev_names_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char *names; + size_t avail; + struct hfi1_devdata *dd; + ssize_t rval; + + rcu_read_lock(); + dd = private2dd(file); + avail = hfi1_read_cntrs(dd, *ppos, &names, NULL); + rval = simple_read_from_buffer(buf, count, ppos, names, avail); + rcu_read_unlock(); + return rval; +} + +struct counter_info { + char *name; + const struct file_operations ops; +}; + +/* + * Could use file_inode(file)->i_ino to figure out which file, + * instead of separate routine for each, but for now, this works... + */ + +/* read the per-port names (same for each port) */ +static ssize_t portnames_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char *names; + size_t avail; + struct hfi1_devdata *dd; + ssize_t rval; + + rcu_read_lock(); + dd = private2dd(file); + /* port number n/a here since names are constant */ + avail = hfi1_read_portcntrs(dd, *ppos, 0, &names, NULL); + rval = simple_read_from_buffer(buf, count, ppos, names, avail); + rcu_read_unlock(); + return rval; +} + +/* read the per-port counters */ +static ssize_t portcntrs_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 *counters; + size_t avail; + struct hfi1_devdata *dd; + struct hfi1_pportdata *ppd; + ssize_t rval; + + rcu_read_lock(); + ppd = private2ppd(file); + dd = ppd->dd; + avail = hfi1_read_portcntrs(dd, *ppos, ppd->port - 1, NULL, &counters); + rval = simple_read_from_buffer(buf, count, ppos, counters, avail); + rcu_read_unlock(); + return rval; +} + +/* + * read the per-port QSFP data for ppd + */ +static ssize_t qsfp_debugfs_dump(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct hfi1_pportdata *ppd; + char *tmp; + int ret; + + rcu_read_lock(); + ppd = private2ppd(file); + tmp = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!tmp) { + rcu_read_unlock(); + return -ENOMEM; + } + + ret = qsfp_dump(ppd, tmp, PAGE_SIZE); + if (ret > 0) + ret = simple_read_from_buffer(buf, count, ppos, tmp, ret); + rcu_read_unlock(); + kfree(tmp); + return ret; +} + +/* Do an i2c write operation on the chain for the given HFI. */ +static ssize_t __i2c_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, u32 target) +{ + struct hfi1_pportdata *ppd; + char *buff; + int ret; + int i2c_addr; + int offset; + int total_written; + + rcu_read_lock(); + ppd = private2ppd(file); + + buff = kmalloc(count, GFP_KERNEL); + if (!buff) { + ret = -ENOMEM; + goto _return; + } + + ret = copy_from_user(buff, buf, count); + if (ret > 0) { + ret = -EFAULT; + goto _free; + } + + i2c_addr = (*ppos >> 16) & 0xff; + offset = *ppos & 0xffff; + + total_written = i2c_write(ppd, target, i2c_addr, offset, buff, count); + if (total_written < 0) { + ret = total_written; + goto _free; + } + + *ppos += total_written; + + ret = total_written; + + _free: + kfree(buff); + _return: + rcu_read_unlock(); + return ret; +} + +/* Do an i2c write operation on chain for HFI 0. */ +static ssize_t i2c1_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return __i2c_debugfs_write(file, buf, count, ppos, 0); +} + +/* Do an i2c write operation on chain for HFI 1. */ +static ssize_t i2c2_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return __i2c_debugfs_write(file, buf, count, ppos, 1); +} + +/* Do an i2c read operation on the chain for the given HFI. */ +static ssize_t __i2c_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos, u32 target) +{ + struct hfi1_pportdata *ppd; + char *buff; + int ret; + int i2c_addr; + int offset; + int total_read; + + rcu_read_lock(); + ppd = private2ppd(file); + + buff = kmalloc(count, GFP_KERNEL); + if (!buff) { + ret = -ENOMEM; + goto _return; + } + + i2c_addr = (*ppos >> 16) & 0xff; + offset = *ppos & 0xffff; + + total_read = i2c_read(ppd, target, i2c_addr, offset, buff, count); + if (total_read < 0) { + ret = total_read; + goto _free; + } + + *ppos += total_read; + + ret = copy_to_user(buf, buff, total_read); + if (ret > 0) { + ret = -EFAULT; + goto _free; + } + + ret = total_read; + + _free: + kfree(buff); + _return: + rcu_read_unlock(); + return ret; +} + +/* Do an i2c read operation on chain for HFI 0. */ +static ssize_t i2c1_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return __i2c_debugfs_read(file, buf, count, ppos, 0); +} + +/* Do an i2c read operation on chain for HFI 1. */ +static ssize_t i2c2_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return __i2c_debugfs_read(file, buf, count, ppos, 1); +} + +/* Do a QSFP write operation on the i2c chain for the given HFI. */ +static ssize_t __qsfp_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, u32 target) +{ + struct hfi1_pportdata *ppd; + char *buff; + int ret; + int total_written; + + rcu_read_lock(); + if (*ppos + count > QSFP_PAGESIZE * 4) { /* base page + page00-page03 */ + ret = -EINVAL; + goto _return; + } + + ppd = private2ppd(file); + + buff = kmalloc(count, GFP_KERNEL); + if (!buff) { + ret = -ENOMEM; + goto _return; + } + + ret = copy_from_user(buff, buf, count); + if (ret > 0) { + ret = -EFAULT; + goto _free; + } + + total_written = qsfp_write(ppd, target, *ppos, buff, count); + if (total_written < 0) { + ret = total_written; + goto _free; + } + + *ppos += total_written; + + ret = total_written; + + _free: + kfree(buff); + _return: + rcu_read_unlock(); + return ret; +} + +/* Do a QSFP write operation on i2c chain for HFI 0. */ +static ssize_t qsfp1_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return __qsfp_debugfs_write(file, buf, count, ppos, 0); +} + +/* Do a QSFP write operation on i2c chain for HFI 1. */ +static ssize_t qsfp2_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return __qsfp_debugfs_write(file, buf, count, ppos, 1); +} + +/* Do a QSFP read operation on the i2c chain for the given HFI. */ +static ssize_t __qsfp_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos, u32 target) +{ + struct hfi1_pportdata *ppd; + char *buff; + int ret; + int total_read; + + rcu_read_lock(); + if (*ppos + count > QSFP_PAGESIZE * 4) { /* base page + page00-page03 */ + ret = -EINVAL; + goto _return; + } + + ppd = private2ppd(file); + + buff = kmalloc(count, GFP_KERNEL); + if (!buff) { + ret = -ENOMEM; + goto _return; + } + + total_read = qsfp_read(ppd, target, *ppos, buff, count); + if (total_read < 0) { + ret = total_read; + goto _free; + } + + *ppos += total_read; + + ret = copy_to_user(buf, buff, total_read); + if (ret > 0) { + ret = -EFAULT; + goto _free; + } + + ret = total_read; + + _free: + kfree(buff); + _return: + rcu_read_unlock(); + return ret; +} + +/* Do a QSFP read operation on i2c chain for HFI 0. */ +static ssize_t qsfp1_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return __qsfp_debugfs_read(file, buf, count, ppos, 0); +} + +/* Do a QSFP read operation on i2c chain for HFI 1. */ +static ssize_t qsfp2_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return __qsfp_debugfs_read(file, buf, count, ppos, 1); +} + +#define DEBUGFS_OPS(nm, readroutine, writeroutine) \ +{ \ + .name = nm, \ + .ops = { \ + .read = readroutine, \ + .write = writeroutine, \ + .llseek = generic_file_llseek, \ + }, \ +} + +static const struct counter_info cntr_ops[] = { + DEBUGFS_OPS("counter_names", dev_names_read, NULL), + DEBUGFS_OPS("counters", dev_counters_read, NULL), + DEBUGFS_OPS("portcounter_names", portnames_read, NULL), +}; + +static const struct counter_info port_cntr_ops[] = { + DEBUGFS_OPS("port%dcounters", portcntrs_debugfs_read, NULL), + DEBUGFS_OPS("i2c1", i2c1_debugfs_read, i2c1_debugfs_write), + DEBUGFS_OPS("i2c2", i2c2_debugfs_read, i2c2_debugfs_write), + DEBUGFS_OPS("qsfp_dump%d", qsfp_debugfs_dump, NULL), + DEBUGFS_OPS("qsfp1", qsfp1_debugfs_read, qsfp1_debugfs_write), + DEBUGFS_OPS("qsfp2", qsfp2_debugfs_read, qsfp2_debugfs_write), +}; + +void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd) +{ + char name[sizeof("port0counters") + 1]; + char link[10]; + struct hfi1_devdata *dd = dd_from_dev(ibd); + struct hfi1_pportdata *ppd; + int unit = dd->unit; + int i, j; + + if (!hfi1_dbg_root) + return; + snprintf(name, sizeof(name), "%s_%d", class_name(), unit); + snprintf(link, sizeof(link), "%d", unit); + ibd->hfi1_ibdev_dbg = debugfs_create_dir(name, hfi1_dbg_root); + if (!ibd->hfi1_ibdev_dbg) { + pr_warn("create of %s failed\n", name); + return; + } + ibd->hfi1_ibdev_link = + debugfs_create_symlink(link, hfi1_dbg_root, name); + if (!ibd->hfi1_ibdev_link) { + pr_warn("create of %s symlink failed\n", name); + return; + } + DEBUGFS_SEQ_FILE_CREATE(opcode_stats, ibd->hfi1_ibdev_dbg, ibd); + DEBUGFS_SEQ_FILE_CREATE(ctx_stats, ibd->hfi1_ibdev_dbg, ibd); + DEBUGFS_SEQ_FILE_CREATE(qp_stats, ibd->hfi1_ibdev_dbg, ibd); + DEBUGFS_SEQ_FILE_CREATE(sdes, ibd->hfi1_ibdev_dbg, ibd); + /* dev counter files */ + for (i = 0; i < ARRAY_SIZE(cntr_ops); i++) + DEBUGFS_FILE_CREATE(cntr_ops[i].name, + ibd->hfi1_ibdev_dbg, + dd, + &cntr_ops[i].ops, S_IRUGO); + /* per port files */ + for (ppd = dd->pport, j = 0; j < dd->num_pports; j++, ppd++) + for (i = 0; i < ARRAY_SIZE(port_cntr_ops); i++) { + snprintf(name, + sizeof(name), + port_cntr_ops[i].name, + j + 1); + DEBUGFS_FILE_CREATE(name, + ibd->hfi1_ibdev_dbg, + ppd, + &port_cntr_ops[i].ops, + port_cntr_ops[i].ops.write == NULL ? + S_IRUGO : S_IRUGO|S_IWUSR); + } +} + +void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd) +{ + if (!hfi1_dbg_root) + goto out; + debugfs_remove(ibd->hfi1_ibdev_link); + debugfs_remove_recursive(ibd->hfi1_ibdev_dbg); +out: + ibd->hfi1_ibdev_dbg = NULL; + synchronize_rcu(); +} + +/* + * driver stats field names, one line per stat, single string. Used by + * programs like hfistats to print the stats in a way which works for + * different versions of drivers, without changing program source. + * if hfi1_ib_stats changes, this needs to change. Names need to be + * 12 chars or less (w/o newline), for proper display by hfistats utility. + */ +static const char * const hfi1_statnames[] = { + /* must be element 0*/ + "KernIntr", + "ErrorIntr", + "Tx_Errs", + "Rcv_Errs", + "H/W_Errs", + "NoPIOBufs", + "CtxtsOpen", + "RcvLen_Errs", + "EgrBufFull", + "EgrHdrFull" +}; + +static void *_driver_stats_names_seq_start(struct seq_file *s, loff_t *pos) +__acquires(RCU) +{ + rcu_read_lock(); + if (*pos >= ARRAY_SIZE(hfi1_statnames)) + return NULL; + return pos; +} + +static void *_driver_stats_names_seq_next( + struct seq_file *s, + void *v, + loff_t *pos) +{ + ++*pos; + if (*pos >= ARRAY_SIZE(hfi1_statnames)) + return NULL; + return pos; +} + +static void _driver_stats_names_seq_stop(struct seq_file *s, void *v) +__releases(RCU) +{ + rcu_read_unlock(); +} + +static int _driver_stats_names_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos = v; + + seq_printf(s, "%s\n", hfi1_statnames[*spos]); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(driver_stats_names); +DEBUGFS_SEQ_FILE_OPEN(driver_stats_names) +DEBUGFS_FILE_OPS(driver_stats_names); + +static void *_driver_stats_seq_start(struct seq_file *s, loff_t *pos) +__acquires(RCU) +{ + rcu_read_lock(); + if (*pos >= ARRAY_SIZE(hfi1_statnames)) + return NULL; + return pos; +} + +static void *_driver_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + ++*pos; + if (*pos >= ARRAY_SIZE(hfi1_statnames)) + return NULL; + return pos; +} + +static void _driver_stats_seq_stop(struct seq_file *s, void *v) +__releases(RCU) +{ + rcu_read_unlock(); +} + +static u64 hfi1_sps_ints(void) +{ + unsigned long flags; + struct hfi1_devdata *dd; + u64 sps_ints = 0; + + spin_lock_irqsave(&hfi1_devs_lock, flags); + list_for_each_entry(dd, &hfi1_dev_list, list) { + sps_ints += get_all_cpu_total(dd->int_counter); + } + spin_unlock_irqrestore(&hfi1_devs_lock, flags); + return sps_ints; +} + +static int _driver_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos = v; + char *buffer; + u64 *stats = (u64 *)&hfi1_stats; + size_t sz = seq_get_buf(s, &buffer); + + if (sz < sizeof(u64)) + return SEQ_SKIP; + /* special case for interrupts */ + if (*spos == 0) + *(u64 *)buffer = hfi1_sps_ints(); + else + *(u64 *)buffer = stats[*spos]; + seq_commit(s, sizeof(u64)); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(driver_stats); +DEBUGFS_SEQ_FILE_OPEN(driver_stats) +DEBUGFS_FILE_OPS(driver_stats); + +void hfi1_dbg_init(void) +{ + hfi1_dbg_root = debugfs_create_dir(DRIVER_NAME, NULL); + if (!hfi1_dbg_root) + pr_warn("init of debugfs failed\n"); + DEBUGFS_SEQ_FILE_CREATE(driver_stats_names, hfi1_dbg_root, NULL); + DEBUGFS_SEQ_FILE_CREATE(driver_stats, hfi1_dbg_root, NULL); +} + +void hfi1_dbg_exit(void) +{ + debugfs_remove_recursive(hfi1_dbg_root); + hfi1_dbg_root = NULL; +} + +#endif diff --git a/drivers/staging/rdma/hfi1/debugfs.h b/drivers/staging/rdma/hfi1/debugfs.h new file mode 100644 index 000000000..92d6fe146 --- /dev/null +++ b/drivers/staging/rdma/hfi1/debugfs.h @@ -0,0 +1,78 @@ +#ifndef _HFI1_DEBUGFS_H +#define _HFI1_DEBUGFS_H +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +struct hfi1_ibdev; +#ifdef CONFIG_DEBUG_FS +void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd); +void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd); +void hfi1_dbg_init(void); +void hfi1_dbg_exit(void); +#else +static inline void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd) +{ +} + +void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd) +{ +} + +void hfi1_dbg_init(void) +{ +} + +void hfi1_dbg_exit(void) +{ +} + +#endif + +#endif /* _HFI1_DEBUGFS_H */ diff --git a/drivers/staging/rdma/hfi1/device.c b/drivers/staging/rdma/hfi1/device.c new file mode 100644 index 000000000..bc26a5392 --- /dev/null +++ b/drivers/staging/rdma/hfi1/device.c @@ -0,0 +1,184 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include + +#include "hfi.h" +#include "device.h" + +static struct class *class; +static struct class *user_class; +static dev_t hfi1_dev; + +int hfi1_cdev_init(int minor, const char *name, + const struct file_operations *fops, + struct cdev *cdev, struct device **devp, + bool user_accessible) +{ + const dev_t dev = MKDEV(MAJOR(hfi1_dev), minor); + struct device *device = NULL; + int ret; + + cdev_init(cdev, fops); + cdev->owner = THIS_MODULE; + kobject_set_name(&cdev->kobj, name); + + ret = cdev_add(cdev, dev, 1); + if (ret < 0) { + pr_err("Could not add cdev for minor %d, %s (err %d)\n", + minor, name, -ret); + goto done; + } + + if (user_accessible) + device = device_create(user_class, NULL, dev, NULL, "%s", name); + else + device = device_create(class, NULL, dev, NULL, "%s", name); + + if (!IS_ERR(device)) + goto done; + ret = PTR_ERR(device); + device = NULL; + pr_err("Could not create device for minor %d, %s (err %d)\n", + minor, name, -ret); + cdev_del(cdev); +done: + *devp = device; + return ret; +} + +void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp) +{ + struct device *device = *devp; + + if (device) { + device_unregister(device); + *devp = NULL; + + cdev_del(cdev); + } +} + +static const char *hfi1_class_name = "hfi1"; + +const char *class_name(void) +{ + return hfi1_class_name; +} + +static char *hfi1_devnode(struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0600; + return kasprintf(GFP_KERNEL, "%s", dev_name(dev)); +} + +static const char *hfi1_class_name_user = "hfi1_user"; +const char *class_name_user(void) +{ + return hfi1_class_name_user; +} + +static char *hfi1_user_devnode(struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0666; + return kasprintf(GFP_KERNEL, "%s", dev_name(dev)); +} + +int __init dev_init(void) +{ + int ret; + + ret = alloc_chrdev_region(&hfi1_dev, 0, HFI1_NMINORS, DRIVER_NAME); + if (ret < 0) { + pr_err("Could not allocate chrdev region (err %d)\n", -ret); + goto done; + } + + class = class_create(THIS_MODULE, class_name()); + if (IS_ERR(class)) { + ret = PTR_ERR(class); + pr_err("Could not create device class (err %d)\n", -ret); + unregister_chrdev_region(hfi1_dev, HFI1_NMINORS); + goto done; + } + class->devnode = hfi1_devnode; + + user_class = class_create(THIS_MODULE, class_name_user()); + if (IS_ERR(user_class)) { + ret = PTR_ERR(user_class); + pr_err("Could not create device class for user accessible files (err %d)\n", + -ret); + class_destroy(class); + class = NULL; + user_class = NULL; + unregister_chrdev_region(hfi1_dev, HFI1_NMINORS); + goto done; + } + user_class->devnode = hfi1_user_devnode; + +done: + return ret; +} + +void dev_cleanup(void) +{ + class_destroy(class); + class = NULL; + + class_destroy(user_class); + user_class = NULL; + + unregister_chrdev_region(hfi1_dev, HFI1_NMINORS); +} diff --git a/drivers/staging/rdma/hfi1/device.h b/drivers/staging/rdma/hfi1/device.h new file mode 100644 index 000000000..2850ff739 --- /dev/null +++ b/drivers/staging/rdma/hfi1/device.h @@ -0,0 +1,62 @@ +#ifndef _HFI1_DEVICE_H +#define _HFI1_DEVICE_H +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +int hfi1_cdev_init(int minor, const char *name, + const struct file_operations *fops, + struct cdev *cdev, struct device **devp, + bool user_accessible); +void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp); +const char *class_name(void); +int __init dev_init(void); +void dev_cleanup(void); + +#endif /* _HFI1_DEVICE_H */ diff --git a/drivers/staging/rdma/hfi1/diag.c b/drivers/staging/rdma/hfi1/diag.c new file mode 100644 index 000000000..3e8d5ac4c --- /dev/null +++ b/drivers/staging/rdma/hfi1/diag.c @@ -0,0 +1,1873 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains support for diagnostic functions. It is accessed by + * opening the hfi1_diag device, normally minor number 129. Diagnostic use + * of the chip may render the chip or board unusable until the driver + * is unloaded, or in some cases, until the system is rebooted. + * + * Accesses to the chip through this interface are not similar to going + * through the /sys/bus/pci resource mmap interface. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "hfi.h" +#include "device.h" +#include "common.h" +#include "trace.h" + +#undef pr_fmt +#define pr_fmt(fmt) DRIVER_NAME ": " fmt +#define snoop_dbg(fmt, ...) \ + hfi1_cdbg(SNOOP, fmt, ##__VA_ARGS__) + +/* Snoop option mask */ +#define SNOOP_DROP_SEND (1 << 0) +#define SNOOP_USE_METADATA (1 << 1) + +static u8 snoop_flags; + +/* + * Extract packet length from LRH header. + * Why & 0x7FF? Because len is only 11 bits in case it wasn't 0'd we throw the + * bogus bits away. This is in Dwords so multiply by 4 to get size in bytes + */ +#define HFI1_GET_PKT_LEN(x) (((be16_to_cpu((x)->lrh[2]) & 0x7FF)) << 2) + +enum hfi1_filter_status { + HFI1_FILTER_HIT, + HFI1_FILTER_ERR, + HFI1_FILTER_MISS +}; + +/* snoop processing functions */ +rhf_rcv_function_ptr snoop_rhf_rcv_functions[8] = { + [RHF_RCV_TYPE_EXPECTED] = snoop_recv_handler, + [RHF_RCV_TYPE_EAGER] = snoop_recv_handler, + [RHF_RCV_TYPE_IB] = snoop_recv_handler, + [RHF_RCV_TYPE_ERROR] = snoop_recv_handler, + [RHF_RCV_TYPE_BYPASS] = snoop_recv_handler, + [RHF_RCV_TYPE_INVALID5] = process_receive_invalid, + [RHF_RCV_TYPE_INVALID6] = process_receive_invalid, + [RHF_RCV_TYPE_INVALID7] = process_receive_invalid +}; + +/* Snoop packet structure */ +struct snoop_packet { + struct list_head list; + u32 total_len; + u8 data[]; +}; + +/* Do not make these an enum or it will blow up the capture_md */ +#define PKT_DIR_EGRESS 0x0 +#define PKT_DIR_INGRESS 0x1 + +/* Packet capture metadata returned to the user with the packet. */ +struct capture_md { + u8 port; + u8 dir; + u8 reserved[6]; + union { + u64 pbc; + u64 rhf; + } u; +}; + +static atomic_t diagpkt_count = ATOMIC_INIT(0); +static struct cdev diagpkt_cdev; +static struct device *diagpkt_device; + +static ssize_t diagpkt_write(struct file *fp, const char __user *data, + size_t count, loff_t *off); + +static const struct file_operations diagpkt_file_ops = { + .owner = THIS_MODULE, + .write = diagpkt_write, + .llseek = noop_llseek, +}; + +/* + * This is used for communication with user space for snoop extended IOCTLs + */ +struct hfi1_link_info { + __be64 node_guid; + u8 port_mode; + u8 port_state; + u16 link_speed_active; + u16 link_width_active; + u16 vl15_init; + u8 port_number; + /* + * Add padding to make this a full IB SMP payload. Note: changing the + * size of this structure will make the IOCTLs created with _IOWR + * change. + * Be sure to run tests on all IOCTLs when making changes to this + * structure. + */ + u8 res[47]; +}; + +/* + * This starts our ioctl sequence numbers *way* off from the ones + * defined in ib_core. + */ +#define SNOOP_CAPTURE_VERSION 0x1 + +#define IB_IOCTL_MAGIC 0x1b /* See Documentation/ioctl-number.txt */ +#define HFI1_SNOOP_IOC_MAGIC IB_IOCTL_MAGIC +#define HFI1_SNOOP_IOC_BASE_SEQ 0x80 + +#define HFI1_SNOOP_IOCGETLINKSTATE \ + _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ) +#define HFI1_SNOOP_IOCSETLINKSTATE \ + _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+1) +#define HFI1_SNOOP_IOCCLEARQUEUE \ + _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+2) +#define HFI1_SNOOP_IOCCLEARFILTER \ + _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+3) +#define HFI1_SNOOP_IOCSETFILTER \ + _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+4) +#define HFI1_SNOOP_IOCGETVERSION \ + _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+5) +#define HFI1_SNOOP_IOCSET_OPTS \ + _IO(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+6) + +/* + * These offsets +6/+7 could change, but these are already known and used + * IOCTL numbers so don't change them without a good reason. + */ +#define HFI1_SNOOP_IOCGETLINKSTATE_EXTRA \ + _IOWR(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+6, \ + struct hfi1_link_info) +#define HFI1_SNOOP_IOCSETLINKSTATE_EXTRA \ + _IOWR(HFI1_SNOOP_IOC_MAGIC, HFI1_SNOOP_IOC_BASE_SEQ+7, \ + struct hfi1_link_info) + +static int hfi1_snoop_open(struct inode *in, struct file *fp); +static ssize_t hfi1_snoop_read(struct file *fp, char __user *data, + size_t pkt_len, loff_t *off); +static ssize_t hfi1_snoop_write(struct file *fp, const char __user *data, + size_t count, loff_t *off); +static long hfi1_ioctl(struct file *fp, unsigned int cmd, unsigned long arg); +static unsigned int hfi1_snoop_poll(struct file *fp, + struct poll_table_struct *wait); +static int hfi1_snoop_release(struct inode *in, struct file *fp); + +struct hfi1_packet_filter_command { + int opcode; + int length; + void *value_ptr; +}; + +/* Can't re-use PKT_DIR_*GRESS here because 0 means no packets for this */ +#define HFI1_SNOOP_INGRESS 0x1 +#define HFI1_SNOOP_EGRESS 0x2 + +enum hfi1_packet_filter_opcodes { + FILTER_BY_LID, + FILTER_BY_DLID, + FILTER_BY_MAD_MGMT_CLASS, + FILTER_BY_QP_NUMBER, + FILTER_BY_PKT_TYPE, + FILTER_BY_SERVICE_LEVEL, + FILTER_BY_PKEY, + FILTER_BY_DIRECTION, +}; + +static const struct file_operations snoop_file_ops = { + .owner = THIS_MODULE, + .open = hfi1_snoop_open, + .read = hfi1_snoop_read, + .unlocked_ioctl = hfi1_ioctl, + .poll = hfi1_snoop_poll, + .write = hfi1_snoop_write, + .release = hfi1_snoop_release +}; + +struct hfi1_filter_array { + int (*filter)(void *, void *, void *); +}; + +static int hfi1_filter_lid(void *ibhdr, void *packet_data, void *value); +static int hfi1_filter_dlid(void *ibhdr, void *packet_data, void *value); +static int hfi1_filter_mad_mgmt_class(void *ibhdr, void *packet_data, + void *value); +static int hfi1_filter_qp_number(void *ibhdr, void *packet_data, void *value); +static int hfi1_filter_ibpacket_type(void *ibhdr, void *packet_data, + void *value); +static int hfi1_filter_ib_service_level(void *ibhdr, void *packet_data, + void *value); +static int hfi1_filter_ib_pkey(void *ibhdr, void *packet_data, void *value); +static int hfi1_filter_direction(void *ibhdr, void *packet_data, void *value); + +static struct hfi1_filter_array hfi1_filters[] = { + { hfi1_filter_lid }, + { hfi1_filter_dlid }, + { hfi1_filter_mad_mgmt_class }, + { hfi1_filter_qp_number }, + { hfi1_filter_ibpacket_type }, + { hfi1_filter_ib_service_level }, + { hfi1_filter_ib_pkey }, + { hfi1_filter_direction }, +}; + +#define HFI1_MAX_FILTERS ARRAY_SIZE(hfi1_filters) +#define HFI1_DIAG_MINOR_BASE 129 + +static int hfi1_snoop_add(struct hfi1_devdata *dd, const char *name); + +int hfi1_diag_add(struct hfi1_devdata *dd) +{ + char name[16]; + int ret = 0; + + snprintf(name, sizeof(name), "%s_diagpkt%d", class_name(), + dd->unit); + /* + * Do this for each device as opposed to the normal diagpkt + * interface which is one per host + */ + ret = hfi1_snoop_add(dd, name); + if (ret) + dd_dev_err(dd, "Unable to init snoop/capture device"); + + snprintf(name, sizeof(name), "%s_diagpkt", class_name()); + if (atomic_inc_return(&diagpkt_count) == 1) { + ret = hfi1_cdev_init(HFI1_DIAGPKT_MINOR, name, + &diagpkt_file_ops, &diagpkt_cdev, + &diagpkt_device, false); + } + + return ret; +} + +/* this must be called w/ dd->snoop_in_lock held */ +static void drain_snoop_list(struct list_head *queue) +{ + struct list_head *pos, *q; + struct snoop_packet *packet; + + list_for_each_safe(pos, q, queue) { + packet = list_entry(pos, struct snoop_packet, list); + list_del(pos); + kfree(packet); + } +} + +static void hfi1_snoop_remove(struct hfi1_devdata *dd) +{ + unsigned long flags = 0; + + spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags); + drain_snoop_list(&dd->hfi1_snoop.queue); + hfi1_cdev_cleanup(&dd->hfi1_snoop.cdev, &dd->hfi1_snoop.class_dev); + spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags); +} + +void hfi1_diag_remove(struct hfi1_devdata *dd) +{ + + hfi1_snoop_remove(dd); + if (atomic_dec_and_test(&diagpkt_count)) + hfi1_cdev_cleanup(&diagpkt_cdev, &diagpkt_device); + hfi1_cdev_cleanup(&dd->diag_cdev, &dd->diag_device); +} + + +/* + * Allocated structure shared between the credit return mechanism and + * diagpkt_send(). + */ +struct diagpkt_wait { + struct completion credits_returned; + int code; + atomic_t count; +}; + +/* + * When each side is finished with the structure, they call this. + * The last user frees the structure. + */ +static void put_diagpkt_wait(struct diagpkt_wait *wait) +{ + if (atomic_dec_and_test(&wait->count)) + kfree(wait); +} + +/* + * Callback from the credit return code. Set the complete, which + * will let diapkt_send() continue. + */ +static void diagpkt_complete(void *arg, int code) +{ + struct diagpkt_wait *wait = (struct diagpkt_wait *)arg; + + wait->code = code; + complete(&wait->credits_returned); + put_diagpkt_wait(wait); /* finished with the structure */ +} + +/** + * diagpkt_send - send a packet + * @dp: diag packet descriptor + */ +static ssize_t diagpkt_send(struct diag_pkt *dp) +{ + struct hfi1_devdata *dd; + struct send_context *sc; + struct pio_buf *pbuf; + u32 *tmpbuf = NULL; + ssize_t ret = 0; + u32 pkt_len, total_len; + pio_release_cb credit_cb = NULL; + void *credit_arg = NULL; + struct diagpkt_wait *wait = NULL; + + dd = hfi1_lookup(dp->unit); + if (!dd || !(dd->flags & HFI1_PRESENT) || !dd->kregbase) { + ret = -ENODEV; + goto bail; + } + if (!(dd->flags & HFI1_INITTED)) { + /* no hardware, freeze, etc. */ + ret = -ENODEV; + goto bail; + } + + if (dp->version != _DIAG_PKT_VERS) { + dd_dev_err(dd, "Invalid version %u for diagpkt_write\n", + dp->version); + ret = -EINVAL; + goto bail; + } + + /* send count must be an exact number of dwords */ + if (dp->len & 3) { + ret = -EINVAL; + goto bail; + } + + /* there is only port 1 */ + if (dp->port != 1) { + ret = -EINVAL; + goto bail; + } + + /* need a valid context */ + if (dp->sw_index >= dd->num_send_contexts) { + ret = -EINVAL; + goto bail; + } + /* can only use kernel contexts */ + if (dd->send_contexts[dp->sw_index].type != SC_KERNEL) { + ret = -EINVAL; + goto bail; + } + /* must be allocated */ + sc = dd->send_contexts[dp->sw_index].sc; + if (!sc) { + ret = -EINVAL; + goto bail; + } + /* must be enabled */ + if (!(sc->flags & SCF_ENABLED)) { + ret = -EINVAL; + goto bail; + } + + /* allocate a buffer and copy the data in */ + tmpbuf = vmalloc(dp->len); + if (!tmpbuf) { + ret = -ENOMEM; + goto bail; + } + + if (copy_from_user(tmpbuf, + (const void __user *) (unsigned long) dp->data, + dp->len)) { + ret = -EFAULT; + goto bail; + } + + /* + * pkt_len is how much data we have to write, includes header and data. + * total_len is length of the packet in Dwords plus the PBC should not + * include the CRC. + */ + pkt_len = dp->len >> 2; + total_len = pkt_len + 2; /* PBC + packet */ + + /* if 0, fill in a default */ + if (dp->pbc == 0) { + struct hfi1_pportdata *ppd = dd->pport; + + hfi1_cdbg(PKT, "Generating PBC"); + dp->pbc = create_pbc(ppd, 0, 0, 0, total_len); + } else { + hfi1_cdbg(PKT, "Using passed in PBC"); + } + + hfi1_cdbg(PKT, "Egress PBC content is 0x%llx", dp->pbc); + + /* + * The caller wants to wait until the packet is sent and to + * check for errors. The best we can do is wait until + * the buffer credits are returned and check if any packet + * error has occurred. If there are any late errors, this + * could miss it. If there are other senders who generate + * an error, this may find it. However, in general, it + * should catch most. + */ + if (dp->flags & F_DIAGPKT_WAIT) { + /* always force a credit return */ + dp->pbc |= PBC_CREDIT_RETURN; + /* turn on credit return interrupts */ + sc_add_credit_return_intr(sc); + wait = kmalloc(sizeof(*wait), GFP_KERNEL); + if (!wait) { + ret = -ENOMEM; + goto bail; + } + init_completion(&wait->credits_returned); + atomic_set(&wait->count, 2); + wait->code = PRC_OK; + + credit_cb = diagpkt_complete; + credit_arg = wait; + } + + pbuf = sc_buffer_alloc(sc, total_len, credit_cb, credit_arg); + if (!pbuf) { + /* + * No send buffer means no credit callback. Undo + * the wait set-up that was done above. We free wait + * because the callback will never be called. + */ + if (dp->flags & F_DIAGPKT_WAIT) { + sc_del_credit_return_intr(sc); + kfree(wait); + wait = NULL; + } + ret = -ENOSPC; + goto bail; + } + + pio_copy(dd, pbuf, dp->pbc, tmpbuf, pkt_len); + /* no flush needed as the HW knows the packet size */ + + ret = sizeof(*dp); + + if (dp->flags & F_DIAGPKT_WAIT) { + /* wait for credit return */ + ret = wait_for_completion_interruptible( + &wait->credits_returned); + /* + * If the wait returns an error, the wait was interrupted, + * e.g. with a ^C in the user program. The callback is + * still pending. This is OK as the wait structure is + * kmalloc'ed and the structure will free itself when + * all users are done with it. + * + * A context disable occurs on a send context restart, so + * include that in the list of errors below to check for. + * NOTE: PRC_FILL_ERR is at best informational and cannot + * be depended on. + */ + if (!ret && (((wait->code & PRC_STATUS_ERR) + || (wait->code & PRC_FILL_ERR) + || (wait->code & PRC_SC_DISABLE)))) + ret = -EIO; + + put_diagpkt_wait(wait); /* finished with the structure */ + sc_del_credit_return_intr(sc); + } + +bail: + vfree(tmpbuf); + return ret; +} + +static ssize_t diagpkt_write(struct file *fp, const char __user *data, + size_t count, loff_t *off) +{ + struct hfi1_devdata *dd; + struct send_context *sc; + u8 vl; + + struct diag_pkt dp; + + if (count != sizeof(dp)) + return -EINVAL; + + if (copy_from_user(&dp, data, sizeof(dp))) + return -EFAULT; + + /* + * The Send Context is derived from the PbcVL value + * if PBC is populated + */ + if (dp.pbc) { + dd = hfi1_lookup(dp.unit); + if (dd == NULL) + return -ENODEV; + vl = (dp.pbc >> PBC_VL_SHIFT) & PBC_VL_MASK; + sc = dd->vld[vl].sc; + if (sc) { + dp.sw_index = sc->sw_index; + hfi1_cdbg( + PKT, + "Packet sent over VL %d via Send Context %u(%u)", + vl, sc->sw_index, sc->hw_context); + } + } + + return diagpkt_send(&dp); +} + +static int hfi1_snoop_add(struct hfi1_devdata *dd, const char *name) +{ + int ret = 0; + + dd->hfi1_snoop.mode_flag = 0; + spin_lock_init(&dd->hfi1_snoop.snoop_lock); + INIT_LIST_HEAD(&dd->hfi1_snoop.queue); + init_waitqueue_head(&dd->hfi1_snoop.waitq); + + ret = hfi1_cdev_init(HFI1_SNOOP_CAPTURE_BASE + dd->unit, name, + &snoop_file_ops, + &dd->hfi1_snoop.cdev, &dd->hfi1_snoop.class_dev, + false); + + if (ret) { + dd_dev_err(dd, "Couldn't create %s device: %d", name, ret); + hfi1_cdev_cleanup(&dd->hfi1_snoop.cdev, + &dd->hfi1_snoop.class_dev); + } + + return ret; +} + +static struct hfi1_devdata *hfi1_dd_from_sc_inode(struct inode *in) +{ + int unit = iminor(in) - HFI1_SNOOP_CAPTURE_BASE; + struct hfi1_devdata *dd = NULL; + + dd = hfi1_lookup(unit); + return dd; + +} + +/* clear or restore send context integrity checks */ +static void adjust_integrity_checks(struct hfi1_devdata *dd) +{ + struct send_context *sc; + unsigned long sc_flags; + int i; + + spin_lock_irqsave(&dd->sc_lock, sc_flags); + for (i = 0; i < dd->num_send_contexts; i++) { + int enable; + + sc = dd->send_contexts[i].sc; + + if (!sc) + continue; /* not allocated */ + + enable = likely(!HFI1_CAP_IS_KSET(NO_INTEGRITY)) && + dd->hfi1_snoop.mode_flag != HFI1_PORT_SNOOP_MODE; + + set_pio_integrity(sc); + + if (enable) /* take HFI_CAP_* flags into account */ + hfi1_init_ctxt(sc); + } + spin_unlock_irqrestore(&dd->sc_lock, sc_flags); +} + +static int hfi1_snoop_open(struct inode *in, struct file *fp) +{ + int ret; + int mode_flag = 0; + unsigned long flags = 0; + struct hfi1_devdata *dd; + struct list_head *queue; + + mutex_lock(&hfi1_mutex); + + dd = hfi1_dd_from_sc_inode(in); + if (dd == NULL) { + ret = -ENODEV; + goto bail; + } + + /* + * File mode determines snoop or capture. Some existing user + * applications expect the capture device to be able to be opened RDWR + * because they expect a dedicated capture device. For this reason we + * support a module param to force capture mode even if the file open + * mode matches snoop. + */ + if ((fp->f_flags & O_ACCMODE) == O_RDONLY) { + snoop_dbg("Capture Enabled"); + mode_flag = HFI1_PORT_CAPTURE_MODE; + } else if ((fp->f_flags & O_ACCMODE) == O_RDWR) { + snoop_dbg("Snoop Enabled"); + mode_flag = HFI1_PORT_SNOOP_MODE; + } else { + snoop_dbg("Invalid"); + ret = -EINVAL; + goto bail; + } + queue = &dd->hfi1_snoop.queue; + + /* + * We are not supporting snoop and capture at the same time. + */ + spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags); + if (dd->hfi1_snoop.mode_flag) { + ret = -EBUSY; + spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags); + goto bail; + } + + dd->hfi1_snoop.mode_flag = mode_flag; + drain_snoop_list(queue); + + dd->hfi1_snoop.filter_callback = NULL; + dd->hfi1_snoop.filter_value = NULL; + + /* + * Send side packet integrity checks are not helpful when snooping so + * disable and re-enable when we stop snooping. + */ + if (mode_flag == HFI1_PORT_SNOOP_MODE) { + /* clear after snoop mode is on */ + adjust_integrity_checks(dd); /* clear */ + + /* + * We also do not want to be doing the DLID LMC check for + * ingressed packets. + */ + dd->hfi1_snoop.dcc_cfg = read_csr(dd, DCC_CFG_PORT_CONFIG1); + write_csr(dd, DCC_CFG_PORT_CONFIG1, + (dd->hfi1_snoop.dcc_cfg >> 32) << 32); + } + + /* + * As soon as we set these function pointers the recv and send handlers + * are active. This is a race condition so we must make sure to drain + * the queue and init filter values above. Technically we should add + * locking here but all that will happen is on recv a packet will get + * allocated and get stuck on the snoop_lock before getting added to the + * queue. Same goes for send. + */ + dd->rhf_rcv_function_map = snoop_rhf_rcv_functions; + dd->process_pio_send = snoop_send_pio_handler; + dd->process_dma_send = snoop_send_pio_handler; + dd->pio_inline_send = snoop_inline_pio_send; + + spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags); + ret = 0; + +bail: + mutex_unlock(&hfi1_mutex); + + return ret; +} + +static int hfi1_snoop_release(struct inode *in, struct file *fp) +{ + unsigned long flags = 0; + struct hfi1_devdata *dd; + int mode_flag; + + dd = hfi1_dd_from_sc_inode(in); + if (dd == NULL) + return -ENODEV; + + spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags); + + /* clear the snoop mode before re-adjusting send context CSRs */ + mode_flag = dd->hfi1_snoop.mode_flag; + dd->hfi1_snoop.mode_flag = 0; + + /* + * Drain the queue and clear the filters we are done with it. Don't + * forget to restore the packet integrity checks + */ + drain_snoop_list(&dd->hfi1_snoop.queue); + if (mode_flag == HFI1_PORT_SNOOP_MODE) { + /* restore after snoop mode is clear */ + adjust_integrity_checks(dd); /* restore */ + + /* + * Also should probably reset the DCC_CONFIG1 register for DLID + * checking on incoming packets again. Use the value saved when + * opening the snoop device. + */ + write_csr(dd, DCC_CFG_PORT_CONFIG1, dd->hfi1_snoop.dcc_cfg); + } + + dd->hfi1_snoop.filter_callback = NULL; + kfree(dd->hfi1_snoop.filter_value); + dd->hfi1_snoop.filter_value = NULL; + + /* + * User is done snooping and capturing, return control to the normal + * handler. Re-enable SDMA handling. + */ + dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions; + dd->process_pio_send = hfi1_verbs_send_pio; + dd->process_dma_send = hfi1_verbs_send_dma; + dd->pio_inline_send = pio_copy; + + spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags); + + snoop_dbg("snoop/capture device released"); + + return 0; +} + +static unsigned int hfi1_snoop_poll(struct file *fp, + struct poll_table_struct *wait) +{ + int ret = 0; + unsigned long flags = 0; + + struct hfi1_devdata *dd; + + dd = hfi1_dd_from_sc_inode(fp->f_inode); + if (dd == NULL) + return -ENODEV; + + spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags); + + poll_wait(fp, &dd->hfi1_snoop.waitq, wait); + if (!list_empty(&dd->hfi1_snoop.queue)) + ret |= POLLIN | POLLRDNORM; + + spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags); + return ret; + +} + +static ssize_t hfi1_snoop_write(struct file *fp, const char __user *data, + size_t count, loff_t *off) +{ + struct diag_pkt dpkt; + struct hfi1_devdata *dd; + size_t ret; + u8 byte_two, sl, sc5, sc4, vl, byte_one; + struct send_context *sc; + u32 len; + u64 pbc; + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + + dd = hfi1_dd_from_sc_inode(fp->f_inode); + if (dd == NULL) + return -ENODEV; + + ppd = dd->pport; + snoop_dbg("received %lu bytes from user", count); + + memset(&dpkt, 0, sizeof(struct diag_pkt)); + dpkt.version = _DIAG_PKT_VERS; + dpkt.unit = dd->unit; + dpkt.port = 1; + + if (likely(!(snoop_flags & SNOOP_USE_METADATA))) { + /* + * We need to generate the PBC and not let diagpkt_send do it, + * to do this we need the VL and the length in dwords. + * The VL can be determined by using the SL and looking up the + * SC. Then the SC can be converted into VL. The exception to + * this is those packets which are from an SMI queue pair. + * Since we can't detect anything about the QP here we have to + * rely on the SC. If its 0xF then we assume its SMI and + * do not look at the SL. + */ + if (copy_from_user(&byte_one, data, 1)) + return -EINVAL; + + if (copy_from_user(&byte_two, data+1, 1)) + return -EINVAL; + + sc4 = (byte_one >> 4) & 0xf; + if (sc4 == 0xF) { + snoop_dbg("Detected VL15 packet ignoring SL in packet"); + vl = sc4; + } else { + sl = (byte_two >> 4) & 0xf; + ibp = to_iport(&dd->verbs_dev.ibdev, 1); + sc5 = ibp->sl_to_sc[sl]; + vl = sc_to_vlt(dd, sc5); + if (vl != sc4) { + snoop_dbg("VL %d does not match SC %d of packet", + vl, sc4); + return -EINVAL; + } + } + + sc = dd->vld[vl].sc; /* Look up the context based on VL */ + if (sc) { + dpkt.sw_index = sc->sw_index; + snoop_dbg("Sending on context %u(%u)", sc->sw_index, + sc->hw_context); + } else { + snoop_dbg("Could not find context for vl %d", vl); + return -EINVAL; + } + + len = (count >> 2) + 2; /* Add in PBC */ + pbc = create_pbc(ppd, 0, 0, vl, len); + } else { + if (copy_from_user(&pbc, data, sizeof(pbc))) + return -EINVAL; + vl = (pbc >> PBC_VL_SHIFT) & PBC_VL_MASK; + sc = dd->vld[vl].sc; /* Look up the context based on VL */ + if (sc) { + dpkt.sw_index = sc->sw_index; + } else { + snoop_dbg("Could not find context for vl %d", vl); + return -EINVAL; + } + data += sizeof(pbc); + count -= sizeof(pbc); + } + dpkt.len = count; + dpkt.data = (unsigned long)data; + + snoop_dbg("PBC: vl=0x%llx Length=0x%llx", + (pbc >> 12) & 0xf, + (pbc & 0xfff)); + + dpkt.pbc = pbc; + ret = diagpkt_send(&dpkt); + /* + * diagpkt_send only returns number of bytes in the diagpkt so patch + * that up here before returning. + */ + if (ret == sizeof(dpkt)) + return count; + + return ret; +} + +static ssize_t hfi1_snoop_read(struct file *fp, char __user *data, + size_t pkt_len, loff_t *off) +{ + ssize_t ret = 0; + unsigned long flags = 0; + struct snoop_packet *packet = NULL; + struct hfi1_devdata *dd; + + dd = hfi1_dd_from_sc_inode(fp->f_inode); + if (dd == NULL) + return -ENODEV; + + spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags); + + while (list_empty(&dd->hfi1_snoop.queue)) { + spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags); + + if (fp->f_flags & O_NONBLOCK) + return -EAGAIN; + + if (wait_event_interruptible( + dd->hfi1_snoop.waitq, + !list_empty(&dd->hfi1_snoop.queue))) + return -EINTR; + + spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags); + } + + if (!list_empty(&dd->hfi1_snoop.queue)) { + packet = list_entry(dd->hfi1_snoop.queue.next, + struct snoop_packet, list); + list_del(&packet->list); + spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags); + if (pkt_len >= packet->total_len) { + if (copy_to_user(data, packet->data, + packet->total_len)) + ret = -EFAULT; + else + ret = packet->total_len; + } else + ret = -EINVAL; + + kfree(packet); + } else + spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags); + + return ret; +} + +static long hfi1_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) +{ + struct hfi1_devdata *dd; + void *filter_value = NULL; + long ret = 0; + int value = 0; + u8 physState = 0; + u8 linkState = 0; + u16 devState = 0; + unsigned long flags = 0; + unsigned long *argp = NULL; + struct hfi1_packet_filter_command filter_cmd = {0}; + int mode_flag = 0; + struct hfi1_pportdata *ppd = NULL; + unsigned int index; + struct hfi1_link_info link_info; + + dd = hfi1_dd_from_sc_inode(fp->f_inode); + if (dd == NULL) + return -ENODEV; + + spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags); + + mode_flag = dd->hfi1_snoop.mode_flag; + + if (((_IOC_DIR(cmd) & _IOC_READ) + && !access_ok(VERIFY_WRITE, (void __user *)arg, _IOC_SIZE(cmd))) + || ((_IOC_DIR(cmd) & _IOC_WRITE) + && !access_ok(VERIFY_READ, (void __user *)arg, _IOC_SIZE(cmd)))) { + ret = -EFAULT; + } else if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + } else if ((mode_flag & HFI1_PORT_CAPTURE_MODE) && + (cmd != HFI1_SNOOP_IOCCLEARQUEUE) && + (cmd != HFI1_SNOOP_IOCCLEARFILTER) && + (cmd != HFI1_SNOOP_IOCSETFILTER)) { + /* Capture devices are allowed only 3 operations + * 1.Clear capture queue + * 2.Clear capture filter + * 3.Set capture filter + * Other are invalid. + */ + ret = -EINVAL; + } else { + switch (cmd) { + case HFI1_SNOOP_IOCSETLINKSTATE: + snoop_dbg("HFI1_SNOOP_IOCSETLINKSTATE is not valid"); + ret = -EINVAL; + break; + + case HFI1_SNOOP_IOCSETLINKSTATE_EXTRA: + memset(&link_info, 0, sizeof(link_info)); + + if (copy_from_user(&link_info, + (struct hfi1_link_info __user *)arg, + sizeof(link_info))) + ret = -EFAULT; + + value = link_info.port_state; + index = link_info.port_number; + if (index > dd->num_pports - 1) { + ret = -EINVAL; + break; + } + + ppd = &dd->pport[index]; + if (!ppd) { + ret = -EINVAL; + break; + } + + /* What we want to transition to */ + physState = (value >> 4) & 0xF; + linkState = value & 0xF; + snoop_dbg("Setting link state 0x%x", value); + + switch (linkState) { + case IB_PORT_NOP: + if (physState == 0) + break; + /* fall through */ + case IB_PORT_DOWN: + switch (physState) { + case 0: + devState = HLS_DN_DOWNDEF; + break; + case 2: + devState = HLS_DN_POLL; + break; + case 3: + devState = HLS_DN_DISABLE; + break; + default: + ret = -EINVAL; + goto done; + } + ret = set_link_state(ppd, devState); + break; + case IB_PORT_ARMED: + ret = set_link_state(ppd, HLS_UP_ARMED); + if (!ret) + send_idle_sma(dd, SMA_IDLE_ARM); + break; + case IB_PORT_ACTIVE: + ret = set_link_state(ppd, HLS_UP_ACTIVE); + if (!ret) + send_idle_sma(dd, SMA_IDLE_ACTIVE); + break; + default: + ret = -EINVAL; + break; + } + + if (ret) + break; + /* fall through */ + case HFI1_SNOOP_IOCGETLINKSTATE: + case HFI1_SNOOP_IOCGETLINKSTATE_EXTRA: + if (cmd == HFI1_SNOOP_IOCGETLINKSTATE_EXTRA) { + memset(&link_info, 0, sizeof(link_info)); + if (copy_from_user(&link_info, + (struct hfi1_link_info __user *)arg, + sizeof(link_info))) + ret = -EFAULT; + index = link_info.port_number; + } else { + ret = __get_user(index, (int __user *) arg); + if (ret != 0) + break; + } + + if (index > dd->num_pports - 1) { + ret = -EINVAL; + break; + } + + ppd = &dd->pport[index]; + if (!ppd) { + ret = -EINVAL; + break; + } + value = hfi1_ibphys_portstate(ppd); + value <<= 4; + value |= driver_lstate(ppd); + + snoop_dbg("Link port | Link State: %d", value); + + if ((cmd == HFI1_SNOOP_IOCGETLINKSTATE_EXTRA) || + (cmd == HFI1_SNOOP_IOCSETLINKSTATE_EXTRA)) { + link_info.port_state = value; + link_info.node_guid = cpu_to_be64(ppd->guid); + link_info.link_speed_active = + ppd->link_speed_active; + link_info.link_width_active = + ppd->link_width_active; + if (copy_to_user( + (struct hfi1_link_info __user *)arg, + &link_info, sizeof(link_info))) + ret = -EFAULT; + } else { + ret = __put_user(value, (int __user *)arg); + } + break; + + case HFI1_SNOOP_IOCCLEARQUEUE: + snoop_dbg("Clearing snoop queue"); + drain_snoop_list(&dd->hfi1_snoop.queue); + break; + + case HFI1_SNOOP_IOCCLEARFILTER: + snoop_dbg("Clearing filter"); + if (dd->hfi1_snoop.filter_callback) { + /* Drain packets first */ + drain_snoop_list(&dd->hfi1_snoop.queue); + dd->hfi1_snoop.filter_callback = NULL; + } + kfree(dd->hfi1_snoop.filter_value); + dd->hfi1_snoop.filter_value = NULL; + break; + + case HFI1_SNOOP_IOCSETFILTER: + snoop_dbg("Setting filter"); + /* just copy command structure */ + argp = (unsigned long *)arg; + if (copy_from_user(&filter_cmd, (void __user *)argp, + sizeof(filter_cmd))) { + ret = -EFAULT; + break; + } + if (filter_cmd.opcode >= HFI1_MAX_FILTERS) { + pr_alert("Invalid opcode in request\n"); + ret = -EINVAL; + break; + } + + snoop_dbg("Opcode %d Len %d Ptr %p", + filter_cmd.opcode, filter_cmd.length, + filter_cmd.value_ptr); + + filter_value = kzalloc( + filter_cmd.length * sizeof(u8), + GFP_KERNEL); + if (!filter_value) { + pr_alert("Not enough memory\n"); + ret = -ENOMEM; + break; + } + /* copy remaining data from userspace */ + if (copy_from_user((u8 *)filter_value, + (void __user *)filter_cmd.value_ptr, + filter_cmd.length)) { + kfree(filter_value); + ret = -EFAULT; + break; + } + /* Drain packets first */ + drain_snoop_list(&dd->hfi1_snoop.queue); + dd->hfi1_snoop.filter_callback = + hfi1_filters[filter_cmd.opcode].filter; + /* just in case we see back to back sets */ + kfree(dd->hfi1_snoop.filter_value); + dd->hfi1_snoop.filter_value = filter_value; + + break; + case HFI1_SNOOP_IOCGETVERSION: + value = SNOOP_CAPTURE_VERSION; + snoop_dbg("Getting version: %d", value); + ret = __put_user(value, (int __user *)arg); + break; + case HFI1_SNOOP_IOCSET_OPTS: + snoop_flags = 0; + ret = __get_user(value, (int __user *) arg); + if (ret != 0) + break; + + snoop_dbg("Setting snoop option %d", value); + if (value & SNOOP_DROP_SEND) + snoop_flags |= SNOOP_DROP_SEND; + if (value & SNOOP_USE_METADATA) + snoop_flags |= SNOOP_USE_METADATA; + break; + default: + ret = -ENOTTY; + break; + } + } +done: + spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags); + return ret; +} + +static void snoop_list_add_tail(struct snoop_packet *packet, + struct hfi1_devdata *dd) +{ + unsigned long flags = 0; + + spin_lock_irqsave(&dd->hfi1_snoop.snoop_lock, flags); + if (likely((dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) || + (dd->hfi1_snoop.mode_flag & HFI1_PORT_CAPTURE_MODE))) { + list_add_tail(&packet->list, &dd->hfi1_snoop.queue); + snoop_dbg("Added packet to list"); + } + + /* + * Technically we can could have closed the snoop device while waiting + * on the above lock and it is gone now. The snoop mode_flag will + * prevent us from adding the packet to the queue though. + */ + + spin_unlock_irqrestore(&dd->hfi1_snoop.snoop_lock, flags); + wake_up_interruptible(&dd->hfi1_snoop.waitq); +} + +static inline int hfi1_filter_check(void *val, const char *msg) +{ + if (!val) { + snoop_dbg("Error invalid %s value for filter", msg); + return HFI1_FILTER_ERR; + } + return 0; +} + +static int hfi1_filter_lid(void *ibhdr, void *packet_data, void *value) +{ + struct hfi1_ib_header *hdr; + int ret; + + ret = hfi1_filter_check(ibhdr, "header"); + if (ret) + return ret; + ret = hfi1_filter_check(value, "user"); + if (ret) + return ret; + hdr = (struct hfi1_ib_header *)ibhdr; + + if (*((u16 *)value) == be16_to_cpu(hdr->lrh[3])) /* matches slid */ + return HFI1_FILTER_HIT; /* matched */ + + return HFI1_FILTER_MISS; /* Not matched */ +} + +static int hfi1_filter_dlid(void *ibhdr, void *packet_data, void *value) +{ + struct hfi1_ib_header *hdr; + int ret; + + ret = hfi1_filter_check(ibhdr, "header"); + if (ret) + return ret; + ret = hfi1_filter_check(value, "user"); + if (ret) + return ret; + + hdr = (struct hfi1_ib_header *)ibhdr; + + if (*((u16 *)value) == be16_to_cpu(hdr->lrh[1])) + return HFI1_FILTER_HIT; + + return HFI1_FILTER_MISS; +} + +/* Not valid for outgoing packets, send handler passes null for data*/ +static int hfi1_filter_mad_mgmt_class(void *ibhdr, void *packet_data, + void *value) +{ + struct hfi1_ib_header *hdr; + struct hfi1_other_headers *ohdr = NULL; + struct ib_smp *smp = NULL; + u32 qpn = 0; + int ret; + + ret = hfi1_filter_check(ibhdr, "header"); + if (ret) + return ret; + ret = hfi1_filter_check(packet_data, "packet_data"); + if (ret) + return ret; + ret = hfi1_filter_check(value, "user"); + if (ret) + return ret; + + hdr = (struct hfi1_ib_header *)ibhdr; + + /* Check for GRH */ + if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH) + ohdr = &hdr->u.oth; /* LRH + BTH + DETH */ + else + ohdr = &hdr->u.l.oth; /* LRH + GRH + BTH + DETH */ + + qpn = be32_to_cpu(ohdr->bth[1]) & 0x00FFFFFF; + if (qpn <= 1) { + smp = (struct ib_smp *)packet_data; + if (*((u8 *)value) == smp->mgmt_class) + return HFI1_FILTER_HIT; + else + return HFI1_FILTER_MISS; + } + return HFI1_FILTER_ERR; +} + +static int hfi1_filter_qp_number(void *ibhdr, void *packet_data, void *value) +{ + + struct hfi1_ib_header *hdr; + struct hfi1_other_headers *ohdr = NULL; + int ret; + + ret = hfi1_filter_check(ibhdr, "header"); + if (ret) + return ret; + ret = hfi1_filter_check(value, "user"); + if (ret) + return ret; + + hdr = (struct hfi1_ib_header *)ibhdr; + + /* Check for GRH */ + if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH) + ohdr = &hdr->u.oth; /* LRH + BTH + DETH */ + else + ohdr = &hdr->u.l.oth; /* LRH + GRH + BTH + DETH */ + if (*((u32 *)value) == (be32_to_cpu(ohdr->bth[1]) & 0x00FFFFFF)) + return HFI1_FILTER_HIT; + + return HFI1_FILTER_MISS; +} + +static int hfi1_filter_ibpacket_type(void *ibhdr, void *packet_data, + void *value) +{ + u32 lnh = 0; + u8 opcode = 0; + struct hfi1_ib_header *hdr; + struct hfi1_other_headers *ohdr = NULL; + int ret; + + ret = hfi1_filter_check(ibhdr, "header"); + if (ret) + return ret; + ret = hfi1_filter_check(value, "user"); + if (ret) + return ret; + + hdr = (struct hfi1_ib_header *)ibhdr; + + lnh = (be16_to_cpu(hdr->lrh[0]) & 3); + + if (lnh == HFI1_LRH_BTH) + ohdr = &hdr->u.oth; + else if (lnh == HFI1_LRH_GRH) + ohdr = &hdr->u.l.oth; + else + return HFI1_FILTER_ERR; + + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + + if (*((u8 *)value) == ((opcode >> 5) & 0x7)) + return HFI1_FILTER_HIT; + + return HFI1_FILTER_MISS; +} + +static int hfi1_filter_ib_service_level(void *ibhdr, void *packet_data, + void *value) +{ + struct hfi1_ib_header *hdr; + int ret; + + ret = hfi1_filter_check(ibhdr, "header"); + if (ret) + return ret; + ret = hfi1_filter_check(value, "user"); + if (ret) + return ret; + + hdr = (struct hfi1_ib_header *)ibhdr; + + if ((*((u8 *)value)) == ((be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF)) + return HFI1_FILTER_HIT; + + return HFI1_FILTER_MISS; +} + +static int hfi1_filter_ib_pkey(void *ibhdr, void *packet_data, void *value) +{ + + u32 lnh = 0; + struct hfi1_ib_header *hdr; + struct hfi1_other_headers *ohdr = NULL; + int ret; + + ret = hfi1_filter_check(ibhdr, "header"); + if (ret) + return ret; + ret = hfi1_filter_check(value, "user"); + if (ret) + return ret; + + hdr = (struct hfi1_ib_header *)ibhdr; + + lnh = (be16_to_cpu(hdr->lrh[0]) & 3); + if (lnh == HFI1_LRH_BTH) + ohdr = &hdr->u.oth; + else if (lnh == HFI1_LRH_GRH) + ohdr = &hdr->u.l.oth; + else + return HFI1_FILTER_ERR; + + /* P_key is 16-bit entity, however top most bit indicates + * type of membership. 0 for limited and 1 for Full. + * Limited members cannot accept information from other + * Limited members, but communication is allowed between + * every other combination of membership. + * Hence we'll omit comparing top-most bit while filtering + */ + + if ((*(u16 *)value & 0x7FFF) == + ((be32_to_cpu(ohdr->bth[0])) & 0x7FFF)) + return HFI1_FILTER_HIT; + + return HFI1_FILTER_MISS; +} + +/* + * If packet_data is NULL then this is coming from one of the send functions. + * Thus we know if its an ingressed or egressed packet. + */ +static int hfi1_filter_direction(void *ibhdr, void *packet_data, void *value) +{ + u8 user_dir = *(u8 *)value; + int ret; + + ret = hfi1_filter_check(value, "user"); + if (ret) + return ret; + + if (packet_data) { + /* Incoming packet */ + if (user_dir & HFI1_SNOOP_INGRESS) + return HFI1_FILTER_HIT; + } else { + /* Outgoing packet */ + if (user_dir & HFI1_SNOOP_EGRESS) + return HFI1_FILTER_HIT; + } + + return HFI1_FILTER_MISS; +} + +/* + * Allocate a snoop packet. The structure that is stored in the ring buffer, not + * to be confused with an hfi packet type. + */ +static struct snoop_packet *allocate_snoop_packet(u32 hdr_len, + u32 data_len, + u32 md_len) +{ + + struct snoop_packet *packet = NULL; + + packet = kzalloc(sizeof(struct snoop_packet) + hdr_len + data_len + + md_len, + GFP_ATOMIC | __GFP_NOWARN); + if (likely(packet)) + INIT_LIST_HEAD(&packet->list); + + + return packet; +} + +/* + * Instead of having snoop and capture code intermixed with the recv functions, + * both the interrupt handler and hfi1_ib_rcv() we are going to hijack the call + * and land in here for snoop/capture but if not enabled the call will go + * through as before. This gives us a single point to constrain all of the snoop + * snoop recv logic. There is nothing special that needs to happen for bypass + * packets. This routine should not try to look into the packet. It just copied + * it. There is no guarantee for filters when it comes to bypass packets as + * there is no specific support. Bottom line is this routine does now even know + * what a bypass packet is. + */ +int snoop_recv_handler(struct hfi1_packet *packet) +{ + struct hfi1_pportdata *ppd = packet->rcd->ppd; + struct hfi1_ib_header *hdr = packet->hdr; + int header_size = packet->hlen; + void *data = packet->ebuf; + u32 tlen = packet->tlen; + struct snoop_packet *s_packet = NULL; + int ret; + int snoop_mode = 0; + u32 md_len = 0; + struct capture_md md; + + snoop_dbg("PACKET IN: hdr size %d tlen %d data %p", header_size, tlen, + data); + + trace_snoop_capture(ppd->dd, header_size, hdr, tlen - header_size, + data); + + if (!ppd->dd->hfi1_snoop.filter_callback) { + snoop_dbg("filter not set"); + ret = HFI1_FILTER_HIT; + } else { + ret = ppd->dd->hfi1_snoop.filter_callback(hdr, data, + ppd->dd->hfi1_snoop.filter_value); + } + + switch (ret) { + case HFI1_FILTER_ERR: + snoop_dbg("Error in filter call"); + break; + case HFI1_FILTER_MISS: + snoop_dbg("Filter Miss"); + break; + case HFI1_FILTER_HIT: + + if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) + snoop_mode = 1; + if ((snoop_mode == 0) || + unlikely(snoop_flags & SNOOP_USE_METADATA)) + md_len = sizeof(struct capture_md); + + + s_packet = allocate_snoop_packet(header_size, + tlen - header_size, + md_len); + + if (unlikely(s_packet == NULL)) { + dd_dev_warn_ratelimited(ppd->dd, "Unable to allocate snoop/capture packet\n"); + break; + } + + if (md_len > 0) { + memset(&md, 0, sizeof(struct capture_md)); + md.port = 1; + md.dir = PKT_DIR_INGRESS; + md.u.rhf = packet->rhf; + memcpy(s_packet->data, &md, md_len); + } + + /* We should always have a header */ + if (hdr) { + memcpy(s_packet->data + md_len, hdr, header_size); + } else { + dd_dev_err(ppd->dd, "Unable to copy header to snoop/capture packet\n"); + kfree(s_packet); + break; + } + + /* + * Packets with no data are possible. If there is no data needed + * to take care of the last 4 bytes which are normally included + * with data buffers and are included in tlen. Since we kzalloc + * the buffer we do not need to set any values but if we decide + * not to use kzalloc we should zero them. + */ + if (data) + memcpy(s_packet->data + header_size + md_len, data, + tlen - header_size); + + s_packet->total_len = tlen + md_len; + snoop_list_add_tail(s_packet, ppd->dd); + + /* + * If we are snooping the packet not capturing then throw away + * after adding to the list. + */ + snoop_dbg("Capturing packet"); + if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) { + snoop_dbg("Throwing packet away"); + /* + * If we are dropping the packet we still may need to + * handle the case where error flags are set, this is + * normally done by the type specific handler but that + * won't be called in this case. + */ + if (unlikely(rhf_err_flags(packet->rhf))) + handle_eflags(packet); + + /* throw the packet on the floor */ + return RHF_RCV_CONTINUE; + } + break; + default: + break; + } + + /* + * We do not care what type of packet came in here - just pass it off + * to the normal handler. + */ + return ppd->dd->normal_rhf_rcv_functions[rhf_rcv_type(packet->rhf)] + (packet); +} + +/* + * Handle snooping and capturing packets when sdma is being used. + */ +int snoop_send_dma_handler(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr, + u32 hdrwords, struct hfi1_sge_state *ss, u32 len, + u32 plen, u32 dwords, u64 pbc) +{ + pr_alert("Snooping/Capture of Send DMA Packets Is Not Supported!\n"); + snoop_dbg("Unsupported Operation"); + return hfi1_verbs_send_dma(qp, ibhdr, hdrwords, ss, len, plen, dwords, + 0); +} + +/* + * Handle snooping and capturing packets when pio is being used. Does not handle + * bypass packets. The only way to send a bypass packet currently is to use the + * diagpkt interface. When that interface is enable snoop/capture is not. + */ +int snoop_send_pio_handler(struct hfi1_qp *qp, struct ahg_ib_header *ahdr, + u32 hdrwords, struct hfi1_sge_state *ss, u32 len, + u32 plen, u32 dwords, u64 pbc) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct snoop_packet *s_packet = NULL; + u32 *hdr = (u32 *)&ahdr->ibh; + u32 length = 0; + struct hfi1_sge_state temp_ss; + void *data = NULL; + void *data_start = NULL; + int ret; + int snoop_mode = 0; + int md_len = 0; + struct capture_md md; + u32 vl; + u32 hdr_len = hdrwords << 2; + u32 tlen = HFI1_GET_PKT_LEN(&ahdr->ibh); + + md.u.pbc = 0; + + snoop_dbg("PACKET OUT: hdrword %u len %u plen %u dwords %u tlen %u", + hdrwords, len, plen, dwords, tlen); + if (ppd->dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) + snoop_mode = 1; + if ((snoop_mode == 0) || + unlikely(snoop_flags & SNOOP_USE_METADATA)) + md_len = sizeof(struct capture_md); + + /* not using ss->total_len as arg 2 b/c that does not count CRC */ + s_packet = allocate_snoop_packet(hdr_len, tlen - hdr_len, md_len); + + if (unlikely(s_packet == NULL)) { + dd_dev_warn_ratelimited(ppd->dd, "Unable to allocate snoop/capture packet\n"); + goto out; + } + + s_packet->total_len = tlen + md_len; + + if (md_len > 0) { + memset(&md, 0, sizeof(struct capture_md)); + md.port = 1; + md.dir = PKT_DIR_EGRESS; + if (likely(pbc == 0)) { + vl = be16_to_cpu(ahdr->ibh.lrh[0]) >> 12; + md.u.pbc = create_pbc(ppd, 0, qp->s_srate, vl, plen); + } else { + md.u.pbc = 0; + } + memcpy(s_packet->data, &md, md_len); + } else { + md.u.pbc = pbc; + } + + /* Copy header */ + if (likely(hdr)) { + memcpy(s_packet->data + md_len, hdr, hdr_len); + } else { + dd_dev_err(ppd->dd, + "Unable to copy header to snoop/capture packet\n"); + kfree(s_packet); + goto out; + } + + if (ss) { + data = s_packet->data + hdr_len + md_len; + data_start = data; + + /* + * Copy SGE State + * The update_sge() function below will not modify the + * individual SGEs in the array. It will make a copy each time + * and operate on that. So we only need to copy this instance + * and it won't impact PIO. + */ + temp_ss = *ss; + length = len; + + snoop_dbg("Need to copy %d bytes", length); + while (length) { + void *addr = temp_ss.sge.vaddr; + u32 slen = temp_ss.sge.length; + + if (slen > length) { + slen = length; + snoop_dbg("slen %d > len %d", slen, length); + } + snoop_dbg("copy %d to %p", slen, addr); + memcpy(data, addr, slen); + update_sge(&temp_ss, slen); + length -= slen; + data += slen; + snoop_dbg("data is now %p bytes left %d", data, length); + } + snoop_dbg("Completed SGE copy"); + } + + /* + * Why do the filter check down here? Because the event tracing has its + * own filtering and we need to have the walked the SGE list. + */ + if (!ppd->dd->hfi1_snoop.filter_callback) { + snoop_dbg("filter not set\n"); + ret = HFI1_FILTER_HIT; + } else { + ret = ppd->dd->hfi1_snoop.filter_callback( + &ahdr->ibh, + NULL, + ppd->dd->hfi1_snoop.filter_value); + } + + switch (ret) { + case HFI1_FILTER_ERR: + snoop_dbg("Error in filter call"); + /* fall through */ + case HFI1_FILTER_MISS: + snoop_dbg("Filter Miss"); + kfree(s_packet); + break; + case HFI1_FILTER_HIT: + snoop_dbg("Capturing packet"); + snoop_list_add_tail(s_packet, ppd->dd); + + if (unlikely((snoop_flags & SNOOP_DROP_SEND) && + (ppd->dd->hfi1_snoop.mode_flag & + HFI1_PORT_SNOOP_MODE))) { + unsigned long flags; + + snoop_dbg("Dropping packet"); + if (qp->s_wqe) { + spin_lock_irqsave(&qp->s_lock, flags); + hfi1_send_complete( + qp, + qp->s_wqe, + IB_WC_SUCCESS); + spin_unlock_irqrestore(&qp->s_lock, flags); + } else if (qp->ibqp.qp_type == IB_QPT_RC) { + spin_lock_irqsave(&qp->s_lock, flags); + hfi1_rc_send_complete(qp, &ahdr->ibh); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + return 0; + } + break; + default: + kfree(s_packet); + break; + } +out: + return hfi1_verbs_send_pio(qp, ahdr, hdrwords, ss, len, plen, dwords, + md.u.pbc); +} + +/* + * Callers of this must pass a hfi1_ib_header type for the from ptr. Currently + * this can be used anywhere, but the intention is for inline ACKs for RC and + * CCA packets. We don't restrict this usage though. + */ +void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf, + u64 pbc, const void *from, size_t count) +{ + int snoop_mode = 0; + int md_len = 0; + struct capture_md md; + struct snoop_packet *s_packet = NULL; + + /* + * count is in dwords so we need to convert to bytes. + * We also need to account for CRC which would be tacked on by hardware. + */ + int packet_len = (count << 2) + 4; + int ret; + + snoop_dbg("ACK OUT: len %d", packet_len); + + if (!dd->hfi1_snoop.filter_callback) { + snoop_dbg("filter not set"); + ret = HFI1_FILTER_HIT; + } else { + ret = dd->hfi1_snoop.filter_callback( + (struct hfi1_ib_header *)from, + NULL, + dd->hfi1_snoop.filter_value); + } + + switch (ret) { + case HFI1_FILTER_ERR: + snoop_dbg("Error in filter call"); + /* fall through */ + case HFI1_FILTER_MISS: + snoop_dbg("Filter Miss"); + break; + case HFI1_FILTER_HIT: + snoop_dbg("Capturing packet"); + if (dd->hfi1_snoop.mode_flag & HFI1_PORT_SNOOP_MODE) + snoop_mode = 1; + if ((snoop_mode == 0) || + unlikely(snoop_flags & SNOOP_USE_METADATA)) + md_len = sizeof(struct capture_md); + + s_packet = allocate_snoop_packet(packet_len, 0, md_len); + + if (unlikely(s_packet == NULL)) { + dd_dev_warn_ratelimited(dd, "Unable to allocate snoop/capture packet\n"); + goto inline_pio_out; + } + + s_packet->total_len = packet_len + md_len; + + /* Fill in the metadata for the packet */ + if (md_len > 0) { + memset(&md, 0, sizeof(struct capture_md)); + md.port = 1; + md.dir = PKT_DIR_EGRESS; + md.u.pbc = pbc; + memcpy(s_packet->data, &md, md_len); + } + + /* Add the packet data which is a single buffer */ + memcpy(s_packet->data + md_len, from, packet_len); + + snoop_list_add_tail(s_packet, dd); + + if (unlikely((snoop_flags & SNOOP_DROP_SEND) && snoop_mode)) { + snoop_dbg("Dropping packet"); + return; + } + break; + default: + break; + } + +inline_pio_out: + pio_copy(dd, pbuf, pbc, from, count); + +} diff --git a/drivers/staging/rdma/hfi1/dma.c b/drivers/staging/rdma/hfi1/dma.c new file mode 100644 index 000000000..e03bd7351 --- /dev/null +++ b/drivers/staging/rdma/hfi1/dma.c @@ -0,0 +1,186 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include +#include + +#include "verbs.h" + +#define BAD_DMA_ADDRESS ((u64) 0) + +/* + * The following functions implement driver specific replacements + * for the ib_dma_*() functions. + * + * These functions return kernel virtual addresses instead of + * device bus addresses since the driver uses the CPU to copy + * data instead of using hardware DMA. + */ + +static int hfi1_mapping_error(struct ib_device *dev, u64 dma_addr) +{ + return dma_addr == BAD_DMA_ADDRESS; +} + +static u64 hfi1_dma_map_single(struct ib_device *dev, void *cpu_addr, + size_t size, enum dma_data_direction direction) +{ + if (WARN_ON(!valid_dma_direction(direction))) + return BAD_DMA_ADDRESS; + + return (u64) cpu_addr; +} + +static void hfi1_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size, + enum dma_data_direction direction) +{ + /* This is a stub, nothing to be done here */ +} + +static u64 hfi1_dma_map_page(struct ib_device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction) +{ + u64 addr; + + if (WARN_ON(!valid_dma_direction(direction))) + return BAD_DMA_ADDRESS; + + if (offset + size > PAGE_SIZE) + return BAD_DMA_ADDRESS; + + addr = (u64) page_address(page); + if (addr) + addr += offset; + + return addr; +} + +static void hfi1_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size, + enum dma_data_direction direction) +{ + /* This is a stub, nothing to be done here */ +} + +static int hfi1_map_sg(struct ib_device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction direction) +{ + struct scatterlist *sg; + u64 addr; + int i; + int ret = nents; + + if (WARN_ON(!valid_dma_direction(direction))) + return BAD_DMA_ADDRESS; + + for_each_sg(sgl, sg, nents, i) { + addr = (u64) page_address(sg_page(sg)); + if (!addr) { + ret = 0; + break; + } + sg->dma_address = addr + sg->offset; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + sg->dma_length = sg->length; +#endif + } + return ret; +} + +static void hfi1_unmap_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + /* This is a stub, nothing to be done here */ +} + +static void hfi1_sync_single_for_cpu(struct ib_device *dev, u64 addr, + size_t size, enum dma_data_direction dir) +{ +} + +static void hfi1_sync_single_for_device(struct ib_device *dev, u64 addr, + size_t size, + enum dma_data_direction dir) +{ +} + +static void *hfi1_dma_alloc_coherent(struct ib_device *dev, size_t size, + u64 *dma_handle, gfp_t flag) +{ + struct page *p; + void *addr = NULL; + + p = alloc_pages(flag, get_order(size)); + if (p) + addr = page_address(p); + if (dma_handle) + *dma_handle = (u64) addr; + return addr; +} + +static void hfi1_dma_free_coherent(struct ib_device *dev, size_t size, + void *cpu_addr, u64 dma_handle) +{ + free_pages((unsigned long) cpu_addr, get_order(size)); +} + +struct ib_dma_mapping_ops hfi1_dma_mapping_ops = { + .mapping_error = hfi1_mapping_error, + .map_single = hfi1_dma_map_single, + .unmap_single = hfi1_dma_unmap_single, + .map_page = hfi1_dma_map_page, + .unmap_page = hfi1_dma_unmap_page, + .map_sg = hfi1_map_sg, + .unmap_sg = hfi1_unmap_sg, + .sync_single_for_cpu = hfi1_sync_single_for_cpu, + .sync_single_for_device = hfi1_sync_single_for_device, + .alloc_coherent = hfi1_dma_alloc_coherent, + .free_coherent = hfi1_dma_free_coherent +}; diff --git a/drivers/staging/rdma/hfi1/driver.c b/drivers/staging/rdma/hfi1/driver.c new file mode 100644 index 000000000..c0a59001e --- /dev/null +++ b/drivers/staging/rdma/hfi1/driver.c @@ -0,0 +1,1241 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfi.h" +#include "trace.h" +#include "qp.h" +#include "sdma.h" + +#undef pr_fmt +#define pr_fmt(fmt) DRIVER_NAME ": " fmt + +/* + * The size has to be longer than this string, so we can append + * board/chip information to it in the initialization code. + */ +const char ib_hfi1_version[] = HFI1_DRIVER_VERSION "\n"; + +DEFINE_SPINLOCK(hfi1_devs_lock); +LIST_HEAD(hfi1_dev_list); +DEFINE_MUTEX(hfi1_mutex); /* general driver use */ + +unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU; +module_param_named(max_mtu, hfi1_max_mtu, uint, S_IRUGO); +MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is 8192"); + +unsigned int hfi1_cu = 1; +module_param_named(cu, hfi1_cu, uint, S_IRUGO); +MODULE_PARM_DESC(cu, "Credit return units"); + +unsigned long hfi1_cap_mask = HFI1_CAP_MASK_DEFAULT; +static int hfi1_caps_set(const char *, const struct kernel_param *); +static int hfi1_caps_get(char *, const struct kernel_param *); +static const struct kernel_param_ops cap_ops = { + .set = hfi1_caps_set, + .get = hfi1_caps_get +}; +module_param_cb(cap_mask, &cap_ops, &hfi1_cap_mask, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(cap_mask, "Bit mask of enabled/disabled HW features"); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("Intel Omni-Path Architecture driver"); +MODULE_VERSION(HFI1_DRIVER_VERSION); + +/* + * MAX_PKT_RCV is the max # if packets processed per receive interrupt. + */ +#define MAX_PKT_RECV 64 +#define EGR_HEAD_UPDATE_THRESHOLD 16 + +struct hfi1_ib_stats hfi1_stats; + +static int hfi1_caps_set(const char *val, const struct kernel_param *kp) +{ + int ret = 0; + unsigned long *cap_mask_ptr = (unsigned long *)kp->arg, + cap_mask = *cap_mask_ptr, value, diff, + write_mask = ((HFI1_CAP_WRITABLE_MASK << HFI1_CAP_USER_SHIFT) | + HFI1_CAP_WRITABLE_MASK); + + ret = kstrtoul(val, 0, &value); + if (ret) { + pr_warn("Invalid module parameter value for 'cap_mask'\n"); + goto done; + } + /* Get the changed bits (except the locked bit) */ + diff = value ^ (cap_mask & ~HFI1_CAP_LOCKED_SMASK); + + /* Remove any bits that are not allowed to change after driver load */ + if (HFI1_CAP_LOCKED() && (diff & ~write_mask)) { + pr_warn("Ignoring non-writable capability bits %#lx\n", + diff & ~write_mask); + diff &= write_mask; + } + + /* Mask off any reserved bits */ + diff &= ~HFI1_CAP_RESERVED_MASK; + /* Clear any previously set and changing bits */ + cap_mask &= ~diff; + /* Update the bits with the new capability */ + cap_mask |= (value & diff); + /* Check for any kernel/user restrictions */ + diff = (cap_mask & (HFI1_CAP_MUST_HAVE_KERN << HFI1_CAP_USER_SHIFT)) ^ + ((cap_mask & HFI1_CAP_MUST_HAVE_KERN) << HFI1_CAP_USER_SHIFT); + cap_mask &= ~diff; + /* Set the bitmask to the final set */ + *cap_mask_ptr = cap_mask; +done: + return ret; +} + +static int hfi1_caps_get(char *buffer, const struct kernel_param *kp) +{ + unsigned long cap_mask = *(unsigned long *)kp->arg; + + cap_mask &= ~HFI1_CAP_LOCKED_SMASK; + cap_mask |= ((cap_mask & HFI1_CAP_K2U) << HFI1_CAP_USER_SHIFT); + + return scnprintf(buffer, PAGE_SIZE, "0x%lx", cap_mask); +} + +const char *get_unit_name(int unit) +{ + static char iname[16]; + + snprintf(iname, sizeof(iname), DRIVER_NAME"_%u", unit); + return iname; +} + +/* + * Return count of units with at least one port ACTIVE. + */ +int hfi1_count_active_units(void) +{ + struct hfi1_devdata *dd; + struct hfi1_pportdata *ppd; + unsigned long flags; + int pidx, nunits_active = 0; + + spin_lock_irqsave(&hfi1_devs_lock, flags); + list_for_each_entry(dd, &hfi1_dev_list, list) { + if (!(dd->flags & HFI1_PRESENT) || !dd->kregbase) + continue; + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->lid && ppd->linkup) { + nunits_active++; + break; + } + } + } + spin_unlock_irqrestore(&hfi1_devs_lock, flags); + return nunits_active; +} + +/* + * Return count of all units, optionally return in arguments + * the number of usable (present) units, and the number of + * ports that are up. + */ +int hfi1_count_units(int *npresentp, int *nupp) +{ + int nunits = 0, npresent = 0, nup = 0; + struct hfi1_devdata *dd; + unsigned long flags; + int pidx; + struct hfi1_pportdata *ppd; + + spin_lock_irqsave(&hfi1_devs_lock, flags); + + list_for_each_entry(dd, &hfi1_dev_list, list) { + nunits++; + if ((dd->flags & HFI1_PRESENT) && dd->kregbase) + npresent++; + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->lid && ppd->linkup) + nup++; + } + } + + spin_unlock_irqrestore(&hfi1_devs_lock, flags); + + if (npresentp) + *npresentp = npresent; + if (nupp) + *nupp = nup; + + return nunits; +} + +/* + * Get address of eager buffer from it's index (allocated in chunks, not + * contiguous). + */ +static inline void *get_egrbuf(const struct hfi1_ctxtdata *rcd, u64 rhf, + u8 *update) +{ + u32 idx = rhf_egr_index(rhf), offset = rhf_egr_buf_offset(rhf); + + *update |= !(idx & (rcd->egrbufs.threshold - 1)) && !offset; + return (void *)(((u64)(rcd->egrbufs.rcvtids[idx].addr)) + + (offset * RCV_BUF_BLOCK_SIZE)); +} + +/* + * Validate and encode the a given RcvArray Buffer size. + * The function will check whether the given size falls within + * allowed size ranges for the respective type and, optionally, + * return the proper encoding. + */ +inline int hfi1_rcvbuf_validate(u32 size, u8 type, u16 *encoded) +{ + if (unlikely(!IS_ALIGNED(size, PAGE_SIZE))) + return 0; + if (unlikely(size < MIN_EAGER_BUFFER)) + return 0; + if (size > + (type == PT_EAGER ? MAX_EAGER_BUFFER : MAX_EXPECTED_BUFFER)) + return 0; + if (encoded) + *encoded = ilog2(size / PAGE_SIZE) + 1; + return 1; +} + +static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, + struct hfi1_packet *packet) +{ + struct hfi1_message_header *rhdr = packet->hdr; + u32 rte = rhf_rcv_type_err(packet->rhf); + int lnh = be16_to_cpu(rhdr->lrh[0]) & 3; + struct hfi1_ibport *ibp = &ppd->ibport_data; + + if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR)) + return; + + if (packet->rhf & RHF_TID_ERR) { + /* For TIDERR and RC QPs preemptively schedule a NAK */ + struct hfi1_ib_header *hdr = (struct hfi1_ib_header *)rhdr; + struct hfi1_other_headers *ohdr = NULL; + u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */ + u16 lid = be16_to_cpu(hdr->lrh[1]); + u32 qp_num; + u32 rcv_flags = 0; + + /* Sanity check packet */ + if (tlen < 24) + goto drop; + + /* Check for GRH */ + if (lnh == HFI1_LRH_BTH) + ohdr = &hdr->u.oth; + else if (lnh == HFI1_LRH_GRH) { + u32 vtf; + + ohdr = &hdr->u.l.oth; + if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR) + goto drop; + vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow); + if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) + goto drop; + rcv_flags |= HFI1_HAS_GRH; + } else + goto drop; + + /* Get the destination QP number. */ + qp_num = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK; + if (lid < HFI1_MULTICAST_LID_BASE) { + struct hfi1_qp *qp; + + rcu_read_lock(); + qp = hfi1_lookup_qpn(ibp, qp_num); + if (!qp) { + rcu_read_unlock(); + goto drop; + } + + /* + * Handle only RC QPs - for other QP types drop error + * packet. + */ + spin_lock(&qp->r_lock); + + /* Check for valid receive state. */ + if (!(ib_hfi1_state_ops[qp->state] & + HFI1_PROCESS_RECV_OK)) { + ibp->n_pkt_drops++; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + hfi1_rc_hdrerr( + rcd, + hdr, + rcv_flags, + qp); + break; + default: + /* For now don't handle any other QP types */ + break; + } + + spin_unlock(&qp->r_lock); + rcu_read_unlock(); + } /* Unicast QP */ + } /* Valid packet with TIDErr */ + + /* handle "RcvTypeErr" flags */ + switch (rte) { + case RHF_RTE_ERROR_OP_CODE_ERR: + { + u32 opcode; + void *ebuf = NULL; + __be32 *bth = NULL; + + if (rhf_use_egr_bfr(packet->rhf)) + ebuf = packet->ebuf; + + if (ebuf == NULL) + goto drop; /* this should never happen */ + + if (lnh == HFI1_LRH_BTH) + bth = (__be32 *)ebuf; + else if (lnh == HFI1_LRH_GRH) + bth = (__be32 *)((char *)ebuf + sizeof(struct ib_grh)); + else + goto drop; + + opcode = be32_to_cpu(bth[0]) >> 24; + opcode &= 0xff; + + if (opcode == IB_OPCODE_CNP) { + /* + * Only in pre-B0 h/w is the CNP_OPCODE handled + * via this code path (errata 291394). + */ + struct hfi1_qp *qp = NULL; + u32 lqpn, rqpn; + u16 rlid; + u8 svc_type, sl, sc5; + + sc5 = (be16_to_cpu(rhdr->lrh[0]) >> 12) & 0xf; + if (rhf_dc_info(packet->rhf)) + sc5 |= 0x10; + sl = ibp->sc_to_sl[sc5]; + + lqpn = be32_to_cpu(bth[1]) & HFI1_QPN_MASK; + rcu_read_lock(); + qp = hfi1_lookup_qpn(ibp, lqpn); + if (qp == NULL) { + rcu_read_unlock(); + goto drop; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_UD: + rlid = 0; + rqpn = 0; + svc_type = IB_CC_SVCTYPE_UD; + break; + case IB_QPT_UC: + rlid = be16_to_cpu(rhdr->lrh[3]); + rqpn = qp->remote_qpn; + svc_type = IB_CC_SVCTYPE_UC; + break; + default: + goto drop; + } + + process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type); + rcu_read_unlock(); + } + + packet->rhf &= ~RHF_RCV_TYPE_ERR_SMASK; + break; + } + default: + break; + } + +drop: + return; +} + +static inline void init_packet(struct hfi1_ctxtdata *rcd, + struct hfi1_packet *packet) +{ + + packet->rsize = rcd->rcvhdrqentsize; /* words */ + packet->maxcnt = rcd->rcvhdrq_cnt * packet->rsize; /* words */ + packet->rcd = rcd; + packet->updegr = 0; + packet->etail = -1; + packet->rhf_addr = (__le32 *) rcd->rcvhdrq + rcd->head + + rcd->dd->rhf_offset; + packet->rhf = rhf_to_cpu(packet->rhf_addr); + packet->rhqoff = rcd->head; + packet->numpkt = 0; + packet->rcv_flags = 0; +} + +#ifndef CONFIG_PRESCAN_RXQ +static void prescan_rxq(struct hfi1_packet *packet) {} +#else /* CONFIG_PRESCAN_RXQ */ +static int prescan_receive_queue; + +static void process_ecn(struct hfi1_qp *qp, struct hfi1_ib_header *hdr, + struct hfi1_other_headers *ohdr, + u64 rhf, struct ib_grh *grh) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + u32 bth1; + u8 sc5, svc_type; + int is_fecn, is_becn; + + switch (qp->ibqp.qp_type) { + case IB_QPT_UD: + svc_type = IB_CC_SVCTYPE_UD; + break; + case IB_QPT_UC: /* LATER */ + case IB_QPT_RC: /* LATER */ + default: + return; + } + + is_fecn = (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT) & + HFI1_FECN_MASK; + is_becn = (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT) & + HFI1_BECN_MASK; + + sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf; + if (rhf_dc_info(rhf)) + sc5 |= 0x10; + + if (is_fecn) { + u32 src_qpn = be32_to_cpu(ohdr->u.ud.deth[1]) & HFI1_QPN_MASK; + u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]); + u16 dlid = be16_to_cpu(hdr->lrh[1]); + u16 slid = be16_to_cpu(hdr->lrh[3]); + + return_cnp(ibp, qp, src_qpn, pkey, dlid, slid, sc5, grh); + } + + if (is_becn) { + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 lqpn = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK; + u8 sl = ibp->sc_to_sl[sc5]; + + process_becn(ppd, sl, 0, lqpn, 0, svc_type); + } + + /* turn off BECN, or FECN */ + bth1 = be32_to_cpu(ohdr->bth[1]); + bth1 &= ~(HFI1_FECN_MASK << HFI1_FECN_SHIFT); + bth1 &= ~(HFI1_BECN_MASK << HFI1_BECN_SHIFT); + ohdr->bth[1] = cpu_to_be32(bth1); +} + +struct ps_mdata { + struct hfi1_ctxtdata *rcd; + u32 rsize; + u32 maxcnt; + u32 ps_head; + u32 ps_tail; + u32 ps_seq; +}; + +static inline void init_ps_mdata(struct ps_mdata *mdata, + struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + + mdata->rcd = rcd; + mdata->rsize = packet->rsize; + mdata->maxcnt = packet->maxcnt; + + if (rcd->ps_state.initialized == 0) { + mdata->ps_head = packet->rhqoff; + rcd->ps_state.initialized++; + } else + mdata->ps_head = rcd->ps_state.ps_head; + + if (HFI1_CAP_IS_KSET(DMA_RTAIL)) { + mdata->ps_tail = packet->hdrqtail; + mdata->ps_seq = 0; /* not used with DMA_RTAIL */ + } else { + mdata->ps_tail = 0; /* used only with DMA_RTAIL*/ + mdata->ps_seq = rcd->seq_cnt; + } +} + +static inline int ps_done(struct ps_mdata *mdata, u64 rhf) +{ + if (HFI1_CAP_IS_KSET(DMA_RTAIL)) + return mdata->ps_head == mdata->ps_tail; + return mdata->ps_seq != rhf_rcv_seq(rhf); +} + +static inline void update_ps_mdata(struct ps_mdata *mdata) +{ + struct hfi1_ctxtdata *rcd = mdata->rcd; + + mdata->ps_head += mdata->rsize; + if (mdata->ps_head > mdata->maxcnt) + mdata->ps_head = 0; + rcd->ps_state.ps_head = mdata->ps_head; + if (!HFI1_CAP_IS_KSET(DMA_RTAIL)) { + if (++mdata->ps_seq > 13) + mdata->ps_seq = 1; + } +} + +/* + * prescan_rxq - search through the receive queue looking for packets + * containing Excplicit Congestion Notifications (FECNs, or BECNs). + * When an ECN is found, process the Congestion Notification, and toggle + * it off. + */ +static void prescan_rxq(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct ps_mdata mdata; + + if (!prescan_receive_queue) + return; + + init_ps_mdata(&mdata, packet); + + while (1) { + struct hfi1_devdata *dd = rcd->dd; + struct hfi1_ibport *ibp = &rcd->ppd->ibport_data; + __le32 *rhf_addr = (__le32 *) rcd->rcvhdrq + mdata.ps_head + + dd->rhf_offset; + struct hfi1_qp *qp; + struct hfi1_ib_header *hdr; + struct hfi1_other_headers *ohdr; + struct ib_grh *grh = NULL; + u64 rhf = rhf_to_cpu(rhf_addr); + u32 etype = rhf_rcv_type(rhf), qpn; + int is_ecn = 0; + u8 lnh; + + if (ps_done(&mdata, rhf)) + break; + + if (etype != RHF_RCV_TYPE_IB) + goto next; + + hdr = (struct hfi1_ib_header *) + hfi1_get_msgheader(dd, rhf_addr); + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + + if (lnh == HFI1_LRH_BTH) + ohdr = &hdr->u.oth; + else if (lnh == HFI1_LRH_GRH) { + ohdr = &hdr->u.l.oth; + grh = &hdr->u.l.grh; + } else + goto next; /* just in case */ + + is_ecn |= be32_to_cpu(ohdr->bth[1]) & + (HFI1_FECN_MASK << HFI1_FECN_SHIFT); + is_ecn |= be32_to_cpu(ohdr->bth[1]) & + (HFI1_BECN_MASK << HFI1_BECN_SHIFT); + + if (!is_ecn) + goto next; + + qpn = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK; + rcu_read_lock(); + qp = hfi1_lookup_qpn(ibp, qpn); + + if (qp == NULL) { + rcu_read_unlock(); + goto next; + } + + process_ecn(qp, hdr, ohdr, rhf, grh); + rcu_read_unlock(); +next: + update_ps_mdata(&mdata); + } +} +#endif /* CONFIG_PRESCAN_RXQ */ + +#define RCV_PKT_OK 0x0 +#define RCV_PKT_MAX 0x1 + +static inline int process_rcv_packet(struct hfi1_packet *packet) +{ + int ret = RCV_PKT_OK; + + packet->hdr = hfi1_get_msgheader(packet->rcd->dd, + packet->rhf_addr); + packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr; + packet->etype = rhf_rcv_type(packet->rhf); + /* total length */ + packet->tlen = rhf_pkt_len(packet->rhf); /* in bytes */ + /* retrieve eager buffer details */ + packet->ebuf = NULL; + if (rhf_use_egr_bfr(packet->rhf)) { + packet->etail = rhf_egr_index(packet->rhf); + packet->ebuf = get_egrbuf(packet->rcd, packet->rhf, + &packet->updegr); + /* + * Prefetch the contents of the eager buffer. It is + * OK to send a negative length to prefetch_range(). + * The +2 is the size of the RHF. + */ + prefetch_range(packet->ebuf, + packet->tlen - ((packet->rcd->rcvhdrqentsize - + (rhf_hdrq_offset(packet->rhf)+2)) * 4)); + } + + /* + * Call a type specific handler for the packet. We + * should be able to trust that etype won't be beyond + * the range of valid indexes. If so something is really + * wrong and we can probably just let things come + * crashing down. There is no need to eat another + * comparison in this performance critical code. + */ + packet->rcd->dd->rhf_rcv_function_map[packet->etype](packet); + packet->numpkt++; + + /* Set up for the next packet */ + packet->rhqoff += packet->rsize; + if (packet->rhqoff >= packet->maxcnt) + packet->rhqoff = 0; + + if (packet->numpkt == MAX_PKT_RECV) { + ret = RCV_PKT_MAX; + this_cpu_inc(*packet->rcd->dd->rcv_limit); + } + + packet->rhf_addr = (__le32 *) packet->rcd->rcvhdrq + packet->rhqoff + + packet->rcd->dd->rhf_offset; + packet->rhf = rhf_to_cpu(packet->rhf_addr); + + return ret; +} + +static inline void process_rcv_update(int last, struct hfi1_packet *packet) +{ + /* + * Update head regs etc., every 16 packets, if not last pkt, + * to help prevent rcvhdrq overflows, when many packets + * are processed and queue is nearly full. + * Don't request an interrupt for intermediate updates. + */ + if (!last && !(packet->numpkt & 0xf)) { + update_usrhead(packet->rcd, packet->rhqoff, packet->updegr, + packet->etail, 0, 0); + packet->updegr = 0; + } + packet->rcv_flags = 0; +} + +static inline void finish_packet(struct hfi1_packet *packet) +{ + + /* + * Nothing we need to free for the packet. + * + * The only thing we need to do is a final update and call for an + * interrupt + */ + update_usrhead(packet->rcd, packet->rcd->head, packet->updegr, + packet->etail, rcv_intr_dynamic, packet->numpkt); + +} + +static inline void process_rcv_qp_work(struct hfi1_packet *packet) +{ + + struct hfi1_ctxtdata *rcd; + struct hfi1_qp *qp, *nqp; + + rcd = packet->rcd; + rcd->head = packet->rhqoff; + + /* + * Iterate over all QPs waiting to respond. + * The list won't change since the IRQ is only run on one CPU. + */ + list_for_each_entry_safe(qp, nqp, &rcd->qp_wait_list, rspwait) { + list_del_init(&qp->rspwait); + if (qp->r_flags & HFI1_R_RSP_NAK) { + qp->r_flags &= ~HFI1_R_RSP_NAK; + hfi1_send_rc_ack(rcd, qp, 0); + } + if (qp->r_flags & HFI1_R_RSP_SEND) { + unsigned long flags; + + qp->r_flags &= ~HFI1_R_RSP_SEND; + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_hfi1_state_ops[qp->state] & + HFI1_PROCESS_OR_FLUSH_SEND) + hfi1_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } +} + +/* + * Handle receive interrupts when using the no dma rtail option. + */ +void handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd) +{ + u32 seq; + int last = 0; + struct hfi1_packet packet; + + init_packet(rcd, &packet); + seq = rhf_rcv_seq(packet.rhf); + if (seq != rcd->seq_cnt) + goto bail; + + prescan_rxq(&packet); + + while (!last) { + last = process_rcv_packet(&packet); + seq = rhf_rcv_seq(packet.rhf); + if (++rcd->seq_cnt > 13) + rcd->seq_cnt = 1; + if (seq != rcd->seq_cnt) + last = 1; + process_rcv_update(last, &packet); + } + process_rcv_qp_work(&packet); +bail: + finish_packet(&packet); +} + +void handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd) +{ + u32 hdrqtail; + int last = 0; + struct hfi1_packet packet; + + init_packet(rcd, &packet); + hdrqtail = get_rcvhdrtail(rcd); + if (packet.rhqoff == hdrqtail) + goto bail; + smp_rmb(); /* prevent speculative reads of dma'ed hdrq */ + + prescan_rxq(&packet); + + while (!last) { + last = process_rcv_packet(&packet); + if (packet.rhqoff == hdrqtail) + last = 1; + process_rcv_update(last, &packet); + } + process_rcv_qp_work(&packet); +bail: + finish_packet(&packet); + +} + +static inline void set_all_nodma_rtail(struct hfi1_devdata *dd) +{ + int i; + + for (i = 0; i < dd->first_user_ctxt; i++) + dd->rcd[i]->do_interrupt = + &handle_receive_interrupt_nodma_rtail; +} + +static inline void set_all_dma_rtail(struct hfi1_devdata *dd) +{ + int i; + + for (i = 0; i < dd->first_user_ctxt; i++) + dd->rcd[i]->do_interrupt = + &handle_receive_interrupt_dma_rtail; +} + +/* + * handle_receive_interrupt - receive a packet + * @rcd: the context + * + * Called from interrupt handler for errors or receive interrupt. + * This is the slow path interrupt handler. + */ +void handle_receive_interrupt(struct hfi1_ctxtdata *rcd) +{ + + struct hfi1_devdata *dd = rcd->dd; + u32 hdrqtail; + int last = 0, needset = 1; + struct hfi1_packet packet; + + init_packet(rcd, &packet); + + if (!HFI1_CAP_IS_KSET(DMA_RTAIL)) { + u32 seq = rhf_rcv_seq(packet.rhf); + + if (seq != rcd->seq_cnt) + goto bail; + hdrqtail = 0; + } else { + hdrqtail = get_rcvhdrtail(rcd); + if (packet.rhqoff == hdrqtail) + goto bail; + smp_rmb(); /* prevent speculative reads of dma'ed hdrq */ + } + + prescan_rxq(&packet); + + while (!last) { + + if (unlikely(dd->do_drop && atomic_xchg(&dd->drop_packet, + DROP_PACKET_OFF) == DROP_PACKET_ON)) { + dd->do_drop = 0; + + /* On to the next packet */ + packet.rhqoff += packet.rsize; + packet.rhf_addr = (__le32 *) rcd->rcvhdrq + + packet.rhqoff + + dd->rhf_offset; + packet.rhf = rhf_to_cpu(packet.rhf_addr); + + } else { + last = process_rcv_packet(&packet); + } + + if (!HFI1_CAP_IS_KSET(DMA_RTAIL)) { + u32 seq = rhf_rcv_seq(packet.rhf); + + if (++rcd->seq_cnt > 13) + rcd->seq_cnt = 1; + if (seq != rcd->seq_cnt) + last = 1; + if (needset) { + dd_dev_info(dd, + "Switching to NO_DMA_RTAIL\n"); + set_all_nodma_rtail(dd); + needset = 0; + } + } else { + if (packet.rhqoff == hdrqtail) + last = 1; + if (needset) { + dd_dev_info(dd, + "Switching to DMA_RTAIL\n"); + set_all_dma_rtail(dd); + needset = 0; + } + } + + process_rcv_update(last, &packet); + } + + process_rcv_qp_work(&packet); + +bail: + /* + * Always write head at end, and setup rcv interrupt, even + * if no packets were processed. + */ + finish_packet(&packet); +} + +/* + * Convert a given MTU size to the on-wire MAD packet enumeration. + * Return -1 if the size is invalid. + */ +int mtu_to_enum(u32 mtu, int default_if_bad) +{ + switch (mtu) { + case 0: return OPA_MTU_0; + case 256: return OPA_MTU_256; + case 512: return OPA_MTU_512; + case 1024: return OPA_MTU_1024; + case 2048: return OPA_MTU_2048; + case 4096: return OPA_MTU_4096; + case 8192: return OPA_MTU_8192; + case 10240: return OPA_MTU_10240; + } + return default_if_bad; +} + +u16 enum_to_mtu(int mtu) +{ + switch (mtu) { + case OPA_MTU_0: return 0; + case OPA_MTU_256: return 256; + case OPA_MTU_512: return 512; + case OPA_MTU_1024: return 1024; + case OPA_MTU_2048: return 2048; + case OPA_MTU_4096: return 4096; + case OPA_MTU_8192: return 8192; + case OPA_MTU_10240: return 10240; + default: return 0xffff; + } +} + +/* + * set_mtu - set the MTU + * @ppd: the per port data + * + * We can handle "any" incoming size, the issue here is whether we + * need to restrict our outgoing size. We do not deal with what happens + * to programs that are already running when the size changes. + */ +int set_mtu(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + int i, drain, ret = 0, is_up = 0; + + ppd->ibmtu = 0; + for (i = 0; i < ppd->vls_supported; i++) + if (ppd->ibmtu < dd->vld[i].mtu) + ppd->ibmtu = dd->vld[i].mtu; + ppd->ibmaxlen = ppd->ibmtu + lrh_max_header_bytes(ppd->dd); + + mutex_lock(&ppd->hls_lock); + if (ppd->host_link_state == HLS_UP_INIT + || ppd->host_link_state == HLS_UP_ARMED + || ppd->host_link_state == HLS_UP_ACTIVE) + is_up = 1; + + drain = !is_ax(dd) && is_up; + + if (drain) + /* + * MTU is specified per-VL. To ensure that no packet gets + * stuck (due, e.g., to the MTU for the packet's VL being + * reduced), empty the per-VL FIFOs before adjusting MTU. + */ + ret = stop_drain_data_vls(dd); + + if (ret) { + dd_dev_err(dd, "%s: cannot stop/drain VLs - refusing to change per-VL MTUs\n", + __func__); + goto err; + } + + hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_MTU, 0); + + if (drain) + open_fill_data_vls(dd); /* reopen all VLs */ + +err: + mutex_unlock(&ppd->hls_lock); + + return ret; +} + +int hfi1_set_lid(struct hfi1_pportdata *ppd, u32 lid, u8 lmc) +{ + struct hfi1_devdata *dd = ppd->dd; + + ppd->lid = lid; + ppd->lmc = lmc; + hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LIDLMC, 0); + + dd_dev_info(dd, "IB%u:%u got a lid: 0x%x\n", dd->unit, ppd->port, lid); + + return 0; +} + +/* + * Following deal with the "obviously simple" task of overriding the state + * of the LEDs, which normally indicate link physical and logical status. + * The complications arise in dealing with different hardware mappings + * and the board-dependent routine being called from interrupts. + * and then there's the requirement to _flash_ them. + */ +#define LED_OVER_FREQ_SHIFT 8 +#define LED_OVER_FREQ_MASK (0xFF<dd; + int timeoff; + int ph_idx; + + if (!(dd->flags & HFI1_INITTED)) + return; + + ph_idx = ppd->led_override_phase++ & 1; + ppd->led_override = ppd->led_override_vals[ph_idx]; + timeoff = ppd->led_override_timeoff; + + /* + * don't re-fire the timer if user asked for it to be off; we let + * it fire one more time after they turn it off to simplify + */ + if (ppd->led_override_vals[0] || ppd->led_override_vals[1]) + mod_timer(&ppd->led_override_timer, jiffies + timeoff); +} + +void hfi1_set_led_override(struct hfi1_pportdata *ppd, unsigned int val) +{ + struct hfi1_devdata *dd = ppd->dd; + int timeoff, freq; + + if (!(dd->flags & HFI1_INITTED)) + return; + + /* First check if we are blinking. If not, use 1HZ polling */ + timeoff = HZ; + freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT; + + if (freq) { + /* For blink, set each phase from one nybble of val */ + ppd->led_override_vals[0] = val & 0xF; + ppd->led_override_vals[1] = (val >> 4) & 0xF; + timeoff = (HZ << 4)/freq; + } else { + /* Non-blink set both phases the same. */ + ppd->led_override_vals[0] = val & 0xF; + ppd->led_override_vals[1] = val & 0xF; + } + ppd->led_override_timeoff = timeoff; + + /* + * If the timer has not already been started, do so. Use a "quick" + * timeout so the function will be called soon, to look at our request. + */ + if (atomic_inc_return(&ppd->led_override_timer_active) == 1) { + /* Need to start timer */ + init_timer(&ppd->led_override_timer); + ppd->led_override_timer.function = run_led_override; + ppd->led_override_timer.data = (unsigned long) ppd; + ppd->led_override_timer.expires = jiffies + 1; + add_timer(&ppd->led_override_timer); + } else { + if (ppd->led_override_vals[0] || ppd->led_override_vals[1]) + mod_timer(&ppd->led_override_timer, jiffies + 1); + atomic_dec(&ppd->led_override_timer_active); + } +} + +/** + * hfi1_reset_device - reset the chip if possible + * @unit: the device to reset + * + * Whether or not reset is successful, we attempt to re-initialize the chip + * (that is, much like a driver unload/reload). We clear the INITTED flag + * so that the various entry points will fail until we reinitialize. For + * now, we only allow this if no user contexts are open that use chip resources + */ +int hfi1_reset_device(int unit) +{ + int ret, i; + struct hfi1_devdata *dd = hfi1_lookup(unit); + struct hfi1_pportdata *ppd; + unsigned long flags; + int pidx; + + if (!dd) { + ret = -ENODEV; + goto bail; + } + + dd_dev_info(dd, "Reset on unit %u requested\n", unit); + + if (!dd->kregbase || !(dd->flags & HFI1_PRESENT)) { + dd_dev_info(dd, + "Invalid unit number %u or not initialized or not present\n", + unit); + ret = -ENXIO; + goto bail; + } + + spin_lock_irqsave(&dd->uctxt_lock, flags); + if (dd->rcd) + for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) { + if (!dd->rcd[i] || !dd->rcd[i]->cnt) + continue; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + ret = -EBUSY; + goto bail; + } + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (atomic_read(&ppd->led_override_timer_active)) { + /* Need to stop LED timer, _then_ shut off LEDs */ + del_timer_sync(&ppd->led_override_timer); + atomic_set(&ppd->led_override_timer_active, 0); + } + + /* Shut off LEDs after we are sure timer is not running */ + ppd->led_override = LED_OVER_BOTH_OFF; + } + if (dd->flags & HFI1_HAS_SEND_DMA) + sdma_exit(dd); + + hfi1_reset_cpu_counters(dd); + + ret = hfi1_init(dd, 1); + + if (ret) + dd_dev_err(dd, + "Reinitialize unit %u after reset failed with %d\n", + unit, ret); + else + dd_dev_info(dd, "Reinitialized unit %u after resetting\n", + unit); + +bail: + return ret; +} + +void handle_eflags(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + u32 rte = rhf_rcv_type_err(packet->rhf); + + dd_dev_err(rcd->dd, + "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n", + rcd->ctxt, packet->rhf, + packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "", + packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "", + packet->rhf & RHF_DC_ERR ? "dc " : "", + packet->rhf & RHF_TID_ERR ? "tid " : "", + packet->rhf & RHF_LEN_ERR ? "len " : "", + packet->rhf & RHF_ECC_ERR ? "ecc " : "", + packet->rhf & RHF_VCRC_ERR ? "vcrc " : "", + packet->rhf & RHF_ICRC_ERR ? "icrc " : "", + rte); + + rcv_hdrerr(rcd, rcd->ppd, packet); +} + +/* + * The following functions are called by the interrupt handler. They are type + * specific handlers for each packet type. + */ +int process_receive_ib(struct hfi1_packet *packet) +{ + trace_hfi1_rcvhdr(packet->rcd->ppd->dd, + packet->rcd->ctxt, + rhf_err_flags(packet->rhf), + RHF_RCV_TYPE_IB, + packet->hlen, + packet->tlen, + packet->updegr, + rhf_egr_index(packet->rhf)); + + if (unlikely(rhf_err_flags(packet->rhf))) { + handle_eflags(packet); + return RHF_RCV_CONTINUE; + } + + hfi1_ib_rcv(packet); + return RHF_RCV_CONTINUE; +} + +int process_receive_bypass(struct hfi1_packet *packet) +{ + if (unlikely(rhf_err_flags(packet->rhf))) + handle_eflags(packet); + + dd_dev_err(packet->rcd->dd, + "Bypass packets are not supported in normal operation. Dropping\n"); + return RHF_RCV_CONTINUE; +} + +int process_receive_error(struct hfi1_packet *packet) +{ + handle_eflags(packet); + + if (unlikely(rhf_err_flags(packet->rhf))) + dd_dev_err(packet->rcd->dd, + "Unhandled error packet received. Dropping.\n"); + + return RHF_RCV_CONTINUE; +} + +int kdeth_process_expected(struct hfi1_packet *packet) +{ + if (unlikely(rhf_err_flags(packet->rhf))) + handle_eflags(packet); + + dd_dev_err(packet->rcd->dd, + "Unhandled expected packet received. Dropping.\n"); + return RHF_RCV_CONTINUE; +} + +int kdeth_process_eager(struct hfi1_packet *packet) +{ + if (unlikely(rhf_err_flags(packet->rhf))) + handle_eflags(packet); + + dd_dev_err(packet->rcd->dd, + "Unhandled eager packet received. Dropping.\n"); + return RHF_RCV_CONTINUE; +} + +int process_receive_invalid(struct hfi1_packet *packet) +{ + dd_dev_err(packet->rcd->dd, "Invalid packet type %d. Dropping\n", + rhf_rcv_type(packet->rhf)); + return RHF_RCV_CONTINUE; +} diff --git a/drivers/staging/rdma/hfi1/eprom.c b/drivers/staging/rdma/hfi1/eprom.c new file mode 100644 index 000000000..b61d3ae93 --- /dev/null +++ b/drivers/staging/rdma/hfi1/eprom.c @@ -0,0 +1,475 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include +#include "hfi.h" +#include "common.h" +#include "eprom.h" + +/* + * The EPROM is logically divided into two partitions: + * partition 0: the first 128K, visible from PCI ROM BAR + * partition 1: the rest + */ +#define P0_SIZE (128 * 1024) +#define P1_START P0_SIZE + +/* largest erase size supported by the controller */ +#define SIZE_32KB (32 * 1024) +#define MASK_32KB (SIZE_32KB - 1) + +/* controller page size, in bytes */ +#define EP_PAGE_SIZE 256 +#define EEP_PAGE_MASK (EP_PAGE_SIZE - 1) + +/* controller commands */ +#define CMD_SHIFT 24 +#define CMD_NOP (0) +#define CMD_PAGE_PROGRAM(addr) ((0x02 << CMD_SHIFT) | addr) +#define CMD_READ_DATA(addr) ((0x03 << CMD_SHIFT) | addr) +#define CMD_READ_SR1 ((0x05 << CMD_SHIFT)) +#define CMD_WRITE_ENABLE ((0x06 << CMD_SHIFT)) +#define CMD_SECTOR_ERASE_32KB(addr) ((0x52 << CMD_SHIFT) | addr) +#define CMD_CHIP_ERASE ((0x60 << CMD_SHIFT)) +#define CMD_READ_MANUF_DEV_ID ((0x90 << CMD_SHIFT)) +#define CMD_RELEASE_POWERDOWN_NOID ((0xab << CMD_SHIFT)) + +/* controller interface speeds */ +#define EP_SPEED_FULL 0x2 /* full speed */ + +/* controller status register 1 bits */ +#define SR1_BUSY 0x1ull /* the BUSY bit in SR1 */ + +/* sleep length while waiting for controller */ +#define WAIT_SLEEP_US 100 /* must be larger than 5 (see usage) */ +#define COUNT_DELAY_SEC(n) ((n) * (1000000/WAIT_SLEEP_US)) + +/* GPIO pins */ +#define EPROM_WP_N (1ull << 14) /* EPROM write line */ + +/* + * Use the EP mutex to guard against other callers from within the driver. + * Also covers usage of eprom_available. + */ +static DEFINE_MUTEX(eprom_mutex); +static int eprom_available; /* default: not available */ + +/* + * Turn on external enable line that allows writing on the flash. + */ +static void write_enable(struct hfi1_devdata *dd) +{ + /* raise signal */ + write_csr(dd, ASIC_GPIO_OUT, + read_csr(dd, ASIC_GPIO_OUT) | EPROM_WP_N); + /* raise enable */ + write_csr(dd, ASIC_GPIO_OE, + read_csr(dd, ASIC_GPIO_OE) | EPROM_WP_N); +} + +/* + * Turn off external enable line that allows writing on the flash. + */ +static void write_disable(struct hfi1_devdata *dd) +{ + /* lower signal */ + write_csr(dd, ASIC_GPIO_OUT, + read_csr(dd, ASIC_GPIO_OUT) & ~EPROM_WP_N); + /* lower enable */ + write_csr(dd, ASIC_GPIO_OE, + read_csr(dd, ASIC_GPIO_OE) & ~EPROM_WP_N); +} + +/* + * Wait for the device to become not busy. Must be called after all + * write or erase operations. + */ +static int wait_for_not_busy(struct hfi1_devdata *dd) +{ + unsigned long count = 0; + u64 reg; + int ret = 0; + + /* starts page mode */ + write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_SR1); + while (1) { + udelay(WAIT_SLEEP_US); + usleep_range(WAIT_SLEEP_US - 5, WAIT_SLEEP_US + 5); + count++; + reg = read_csr(dd, ASIC_EEP_DATA); + if ((reg & SR1_BUSY) == 0) + break; + /* 200s is the largest time for a 128Mb device */ + if (count > COUNT_DELAY_SEC(200)) { + dd_dev_err(dd, "waited too long for SPI FLASH busy to clear - failing\n"); + ret = -ETIMEDOUT; + break; /* break, not goto - must stop page mode */ + } + } + + /* stop page mode with a NOP */ + write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_NOP); + + return ret; +} + +/* + * Read the device ID from the SPI controller. + */ +static u32 read_device_id(struct hfi1_devdata *dd) +{ + /* read the Manufacture Device ID */ + write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_MANUF_DEV_ID); + return (u32)read_csr(dd, ASIC_EEP_DATA); +} + +/* + * Erase the whole flash. + */ +static int erase_chip(struct hfi1_devdata *dd) +{ + int ret; + + write_enable(dd); + + write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE); + write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_CHIP_ERASE); + ret = wait_for_not_busy(dd); + + write_disable(dd); + + return ret; +} + +/* + * Erase a range using the 32KB erase command. + */ +static int erase_32kb_range(struct hfi1_devdata *dd, u32 start, u32 end) +{ + int ret = 0; + + if (end < start) + return -EINVAL; + + if ((start & MASK_32KB) || (end & MASK_32KB)) { + dd_dev_err(dd, + "%s: non-aligned range (0x%x,0x%x) for a 32KB erase\n", + __func__, start, end); + return -EINVAL; + } + + write_enable(dd); + + for (; start < end; start += SIZE_32KB) { + write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE); + write_csr(dd, ASIC_EEP_ADDR_CMD, + CMD_SECTOR_ERASE_32KB(start)); + ret = wait_for_not_busy(dd); + if (ret) + goto done; + } + +done: + write_disable(dd); + + return ret; +} + +/* + * Read a 256 byte (64 dword) EPROM page. + * All callers have verified the offset is at a page boundary. + */ +static void read_page(struct hfi1_devdata *dd, u32 offset, u32 *result) +{ + int i; + + write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_READ_DATA(offset)); + for (i = 0; i < EP_PAGE_SIZE/sizeof(u32); i++) + result[i] = (u32)read_csr(dd, ASIC_EEP_DATA); + write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_NOP); /* close open page */ +} + +/* + * Read length bytes starting at offset. Copy to user address addr. + */ +static int read_length(struct hfi1_devdata *dd, u32 start, u32 len, u64 addr) +{ + u32 offset; + u32 buffer[EP_PAGE_SIZE/sizeof(u32)]; + int ret = 0; + + /* reject anything not on an EPROM page boundary */ + if ((start & EEP_PAGE_MASK) || (len & EEP_PAGE_MASK)) + return -EINVAL; + + for (offset = 0; offset < len; offset += EP_PAGE_SIZE) { + read_page(dd, start + offset, buffer); + if (copy_to_user((void __user *)(addr + offset), + buffer, EP_PAGE_SIZE)) { + ret = -EFAULT; + goto done; + } + } + +done: + return ret; +} + +/* + * Write a 256 byte (64 dword) EPROM page. + * All callers have verified the offset is at a page boundary. + */ +static int write_page(struct hfi1_devdata *dd, u32 offset, u32 *data) +{ + int i; + + write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_WRITE_ENABLE); + write_csr(dd, ASIC_EEP_DATA, data[0]); + write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_PAGE_PROGRAM(offset)); + for (i = 1; i < EP_PAGE_SIZE/sizeof(u32); i++) + write_csr(dd, ASIC_EEP_DATA, data[i]); + /* will close the open page */ + return wait_for_not_busy(dd); +} + +/* + * Write length bytes starting at offset. Read from user address addr. + */ +static int write_length(struct hfi1_devdata *dd, u32 start, u32 len, u64 addr) +{ + u32 offset; + u32 buffer[EP_PAGE_SIZE/sizeof(u32)]; + int ret = 0; + + /* reject anything not on an EPROM page boundary */ + if ((start & EEP_PAGE_MASK) || (len & EEP_PAGE_MASK)) + return -EINVAL; + + write_enable(dd); + + for (offset = 0; offset < len; offset += EP_PAGE_SIZE) { + if (copy_from_user(buffer, (void __user *)(addr + offset), + EP_PAGE_SIZE)) { + ret = -EFAULT; + goto done; + } + ret = write_page(dd, start + offset, buffer); + if (ret) + goto done; + } + +done: + write_disable(dd); + return ret; +} + +/* + * Perform the given operation on the EPROM. Called from user space. The + * user credentials have already been checked. + * + * Return 0 on success, -ERRNO on error + */ +int handle_eprom_command(const struct hfi1_cmd *cmd) +{ + struct hfi1_devdata *dd; + u32 dev_id; + int ret = 0; + + /* + * The EPROM is per-device, so use unit 0 as that will always + * exist. + */ + dd = hfi1_lookup(0); + if (!dd) { + pr_err("%s: cannot find unit 0!\n", __func__); + return -EINVAL; + } + + /* lock against other callers touching the ASIC block */ + mutex_lock(&eprom_mutex); + + /* some platforms do not have an EPROM */ + if (!eprom_available) { + ret = -ENOSYS; + goto done_asic; + } + + /* lock against the other HFI on another OS */ + ret = acquire_hw_mutex(dd); + if (ret) { + dd_dev_err(dd, + "%s: unable to acquire hw mutex, no EPROM support\n", + __func__); + goto done_asic; + } + + dd_dev_info(dd, "%s: cmd: type %d, len 0x%x, addr 0x%016llx\n", + __func__, cmd->type, cmd->len, cmd->addr); + + switch (cmd->type) { + case HFI1_CMD_EP_INFO: + if (cmd->len != sizeof(u32)) { + ret = -ERANGE; + break; + } + dev_id = read_device_id(dd); + /* addr points to a u32 user buffer */ + if (copy_to_user((void __user *)cmd->addr, &dev_id, + sizeof(u32))) + ret = -EFAULT; + break; + case HFI1_CMD_EP_ERASE_CHIP: + ret = erase_chip(dd); + break; + case HFI1_CMD_EP_ERASE_P0: + if (cmd->len != P0_SIZE) { + ret = -ERANGE; + break; + } + ret = erase_32kb_range(dd, 0, cmd->len); + break; + case HFI1_CMD_EP_ERASE_P1: + /* check for overflow */ + if (P1_START + cmd->len > ASIC_EEP_ADDR_CMD_EP_ADDR_MASK) { + ret = -ERANGE; + break; + } + ret = erase_32kb_range(dd, P1_START, P1_START + cmd->len); + break; + case HFI1_CMD_EP_READ_P0: + if (cmd->len != P0_SIZE) { + ret = -ERANGE; + break; + } + ret = read_length(dd, 0, cmd->len, cmd->addr); + break; + case HFI1_CMD_EP_READ_P1: + /* check for overflow */ + if (P1_START + cmd->len > ASIC_EEP_ADDR_CMD_EP_ADDR_MASK) { + ret = -ERANGE; + break; + } + ret = read_length(dd, P1_START, cmd->len, cmd->addr); + break; + case HFI1_CMD_EP_WRITE_P0: + if (cmd->len > P0_SIZE) { + ret = -ERANGE; + break; + } + ret = write_length(dd, 0, cmd->len, cmd->addr); + break; + case HFI1_CMD_EP_WRITE_P1: + /* check for overflow */ + if (P1_START + cmd->len > ASIC_EEP_ADDR_CMD_EP_ADDR_MASK) { + ret = -ERANGE; + break; + } + ret = write_length(dd, P1_START, cmd->len, cmd->addr); + break; + default: + dd_dev_err(dd, "%s: unexpected command %d\n", + __func__, cmd->type); + ret = -EINVAL; + break; + } + + release_hw_mutex(dd); +done_asic: + mutex_unlock(&eprom_mutex); + return ret; +} + +/* + * Initialize the EPROM handler. + */ +int eprom_init(struct hfi1_devdata *dd) +{ + int ret = 0; + + /* only the discrete chip has an EPROM, nothing to do */ + if (dd->pcidev->device != PCI_DEVICE_ID_INTEL0) + return 0; + + /* lock against other callers */ + mutex_lock(&eprom_mutex); + if (eprom_available) /* already initialized */ + goto done_asic; + + /* + * Lock against the other HFI on another OS - the mutex above + * would have caught anything in this driver. It is OK if + * both OSes reset the EPROM - as long as they don't do it at + * the same time. + */ + ret = acquire_hw_mutex(dd); + if (ret) { + dd_dev_err(dd, + "%s: unable to acquire hw mutex, no EPROM support\n", + __func__); + goto done_asic; + } + + /* reset EPROM to be sure it is in a good state */ + + /* set reset */ + write_csr(dd, ASIC_EEP_CTL_STAT, + ASIC_EEP_CTL_STAT_EP_RESET_SMASK); + /* clear reset, set speed */ + write_csr(dd, ASIC_EEP_CTL_STAT, + EP_SPEED_FULL << ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT); + + /* wake the device with command "release powerdown NoID" */ + write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_RELEASE_POWERDOWN_NOID); + + eprom_available = 1; + release_hw_mutex(dd); +done_asic: + mutex_unlock(&eprom_mutex); + return ret; +} diff --git a/drivers/staging/rdma/hfi1/eprom.h b/drivers/staging/rdma/hfi1/eprom.h new file mode 100644 index 000000000..64a64276b --- /dev/null +++ b/drivers/staging/rdma/hfi1/eprom.h @@ -0,0 +1,55 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +struct hfi1_cmd; +struct hfi1_devdata; + +int eprom_init(struct hfi1_devdata *dd); +int handle_eprom_command(const struct hfi1_cmd *cmd); diff --git a/drivers/staging/rdma/hfi1/file_ops.c b/drivers/staging/rdma/hfi1/file_ops.c new file mode 100644 index 000000000..72d38500d --- /dev/null +++ b/drivers/staging/rdma/hfi1/file_ops.c @@ -0,0 +1,2144 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfi.h" +#include "pio.h" +#include "device.h" +#include "common.h" +#include "trace.h" +#include "user_sdma.h" +#include "eprom.h" + +#undef pr_fmt +#define pr_fmt(fmt) DRIVER_NAME ": " fmt + +#define SEND_CTXT_HALT_TIMEOUT 1000 /* msecs */ + +/* + * File operation functions + */ +static int hfi1_file_open(struct inode *, struct file *); +static int hfi1_file_close(struct inode *, struct file *); +static ssize_t hfi1_file_write(struct file *, const char __user *, + size_t, loff_t *); +static ssize_t hfi1_write_iter(struct kiocb *, struct iov_iter *); +static unsigned int hfi1_poll(struct file *, struct poll_table_struct *); +static int hfi1_file_mmap(struct file *, struct vm_area_struct *); + +static u64 kvirt_to_phys(void *); +static int assign_ctxt(struct file *, struct hfi1_user_info *); +static int init_subctxts(struct hfi1_ctxtdata *, const struct hfi1_user_info *); +static int user_init(struct file *); +static int get_ctxt_info(struct file *, void __user *, __u32); +static int get_base_info(struct file *, void __user *, __u32); +static int setup_ctxt(struct file *); +static int setup_subctxt(struct hfi1_ctxtdata *); +static int get_user_context(struct file *, struct hfi1_user_info *, + int, unsigned); +static int find_shared_ctxt(struct file *, const struct hfi1_user_info *); +static int allocate_ctxt(struct file *, struct hfi1_devdata *, + struct hfi1_user_info *); +static unsigned int poll_urgent(struct file *, struct poll_table_struct *); +static unsigned int poll_next(struct file *, struct poll_table_struct *); +static int user_event_ack(struct hfi1_ctxtdata *, int, unsigned long); +static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16); +static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int); +static int vma_fault(struct vm_area_struct *, struct vm_fault *); +static int exp_tid_setup(struct file *, struct hfi1_tid_info *); +static int exp_tid_free(struct file *, struct hfi1_tid_info *); +static void unlock_exp_tids(struct hfi1_ctxtdata *); + +static const struct file_operations hfi1_file_ops = { + .owner = THIS_MODULE, + .write = hfi1_file_write, + .write_iter = hfi1_write_iter, + .open = hfi1_file_open, + .release = hfi1_file_close, + .poll = hfi1_poll, + .mmap = hfi1_file_mmap, + .llseek = noop_llseek, +}; + +static struct vm_operations_struct vm_ops = { + .fault = vma_fault, +}; + +/* + * Types of memories mapped into user processes' space + */ +enum mmap_types { + PIO_BUFS = 1, + PIO_BUFS_SOP, + PIO_CRED, + RCV_HDRQ, + RCV_EGRBUF, + UREGS, + EVENTS, + STATUS, + RTAIL, + SUBCTXT_UREGS, + SUBCTXT_RCV_HDRQ, + SUBCTXT_EGRBUF, + SDMA_COMP +}; + +/* + * Masks and offsets defining the mmap tokens + */ +#define HFI1_MMAP_OFFSET_MASK 0xfffULL +#define HFI1_MMAP_OFFSET_SHIFT 0 +#define HFI1_MMAP_SUBCTXT_MASK 0xfULL +#define HFI1_MMAP_SUBCTXT_SHIFT 12 +#define HFI1_MMAP_CTXT_MASK 0xffULL +#define HFI1_MMAP_CTXT_SHIFT 16 +#define HFI1_MMAP_TYPE_MASK 0xfULL +#define HFI1_MMAP_TYPE_SHIFT 24 +#define HFI1_MMAP_MAGIC_MASK 0xffffffffULL +#define HFI1_MMAP_MAGIC_SHIFT 32 + +#define HFI1_MMAP_MAGIC 0xdabbad00 + +#define HFI1_MMAP_TOKEN_SET(field, val) \ + (((val) & HFI1_MMAP_##field##_MASK) << HFI1_MMAP_##field##_SHIFT) +#define HFI1_MMAP_TOKEN_GET(field, token) \ + (((token) >> HFI1_MMAP_##field##_SHIFT) & HFI1_MMAP_##field##_MASK) +#define HFI1_MMAP_TOKEN(type, ctxt, subctxt, addr) \ + (HFI1_MMAP_TOKEN_SET(MAGIC, HFI1_MMAP_MAGIC) | \ + HFI1_MMAP_TOKEN_SET(TYPE, type) | \ + HFI1_MMAP_TOKEN_SET(CTXT, ctxt) | \ + HFI1_MMAP_TOKEN_SET(SUBCTXT, subctxt) | \ + HFI1_MMAP_TOKEN_SET(OFFSET, ((unsigned long)addr & ~PAGE_MASK))) + +#define EXP_TID_SET(field, value) \ + (((value) & EXP_TID_TID##field##_MASK) << \ + EXP_TID_TID##field##_SHIFT) +#define EXP_TID_CLEAR(tid, field) { \ + (tid) &= ~(EXP_TID_TID##field##_MASK << \ + EXP_TID_TID##field##_SHIFT); \ + } +#define EXP_TID_RESET(tid, field, value) do { \ + EXP_TID_CLEAR(tid, field); \ + (tid) |= EXP_TID_SET(field, value); \ + } while (0) + +#define dbg(fmt, ...) \ + pr_info(fmt, ##__VA_ARGS__) + + +static inline int is_valid_mmap(u64 token) +{ + return (HFI1_MMAP_TOKEN_GET(MAGIC, token) == HFI1_MMAP_MAGIC); +} + +static int hfi1_file_open(struct inode *inode, struct file *fp) +{ + /* The real work is performed later in assign_ctxt() */ + fp->private_data = kzalloc(sizeof(struct hfi1_filedata), GFP_KERNEL); + if (fp->private_data) /* no cpu affinity by default */ + ((struct hfi1_filedata *)fp->private_data)->rec_cpu_num = -1; + return fp->private_data ? 0 : -ENOMEM; +} + +static ssize_t hfi1_file_write(struct file *fp, const char __user *data, + size_t count, loff_t *offset) +{ + const struct hfi1_cmd __user *ucmd; + struct hfi1_ctxtdata *uctxt = ctxt_fp(fp); + struct hfi1_cmd cmd; + struct hfi1_user_info uinfo; + struct hfi1_tid_info tinfo; + ssize_t consumed = 0, copy = 0, ret = 0; + void *dest = NULL; + __u64 user_val = 0; + int uctxt_required = 1; + int must_be_root = 0; + + if (count < sizeof(cmd)) { + ret = -EINVAL; + goto bail; + } + + ucmd = (const struct hfi1_cmd __user *)data; + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) { + ret = -EFAULT; + goto bail; + } + + consumed = sizeof(cmd); + + switch (cmd.type) { + case HFI1_CMD_ASSIGN_CTXT: + uctxt_required = 0; /* assigned user context not required */ + copy = sizeof(uinfo); + dest = &uinfo; + break; + case HFI1_CMD_SDMA_STATUS_UPD: + case HFI1_CMD_CREDIT_UPD: + copy = 0; + break; + case HFI1_CMD_TID_UPDATE: + case HFI1_CMD_TID_FREE: + copy = sizeof(tinfo); + dest = &tinfo; + break; + case HFI1_CMD_USER_INFO: + case HFI1_CMD_RECV_CTRL: + case HFI1_CMD_POLL_TYPE: + case HFI1_CMD_ACK_EVENT: + case HFI1_CMD_CTXT_INFO: + case HFI1_CMD_SET_PKEY: + case HFI1_CMD_CTXT_RESET: + copy = 0; + user_val = cmd.addr; + break; + case HFI1_CMD_EP_INFO: + case HFI1_CMD_EP_ERASE_CHIP: + case HFI1_CMD_EP_ERASE_P0: + case HFI1_CMD_EP_ERASE_P1: + case HFI1_CMD_EP_READ_P0: + case HFI1_CMD_EP_READ_P1: + case HFI1_CMD_EP_WRITE_P0: + case HFI1_CMD_EP_WRITE_P1: + uctxt_required = 0; /* assigned user context not required */ + must_be_root = 1; /* validate user */ + copy = 0; + break; + default: + ret = -EINVAL; + goto bail; + } + + /* If the command comes with user data, copy it. */ + if (copy) { + if (copy_from_user(dest, (void __user *)cmd.addr, copy)) { + ret = -EFAULT; + goto bail; + } + consumed += copy; + } + + /* + * Make sure there is a uctxt when needed. + */ + if (uctxt_required && !uctxt) { + ret = -EINVAL; + goto bail; + } + + /* only root can do these operations */ + if (must_be_root && !capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto bail; + } + + switch (cmd.type) { + case HFI1_CMD_ASSIGN_CTXT: + ret = assign_ctxt(fp, &uinfo); + if (ret < 0) + goto bail; + ret = setup_ctxt(fp); + if (ret) + goto bail; + ret = user_init(fp); + break; + case HFI1_CMD_CTXT_INFO: + ret = get_ctxt_info(fp, (void __user *)(unsigned long) + user_val, cmd.len); + break; + case HFI1_CMD_USER_INFO: + ret = get_base_info(fp, (void __user *)(unsigned long) + user_val, cmd.len); + break; + case HFI1_CMD_SDMA_STATUS_UPD: + break; + case HFI1_CMD_CREDIT_UPD: + if (uctxt && uctxt->sc) + sc_return_credits(uctxt->sc); + break; + case HFI1_CMD_TID_UPDATE: + ret = exp_tid_setup(fp, &tinfo); + if (!ret) { + unsigned long addr; + /* + * Copy the number of tidlist entries we used + * and the length of the buffer we registered. + * These fields are adjacent in the structure so + * we can copy them at the same time. + */ + addr = (unsigned long)cmd.addr + + offsetof(struct hfi1_tid_info, tidcnt); + if (copy_to_user((void __user *)addr, &tinfo.tidcnt, + sizeof(tinfo.tidcnt) + + sizeof(tinfo.length))) + ret = -EFAULT; + } + break; + case HFI1_CMD_TID_FREE: + ret = exp_tid_free(fp, &tinfo); + break; + case HFI1_CMD_RECV_CTRL: + ret = manage_rcvq(uctxt, subctxt_fp(fp), (int)user_val); + break; + case HFI1_CMD_POLL_TYPE: + uctxt->poll_type = (typeof(uctxt->poll_type))user_val; + break; + case HFI1_CMD_ACK_EVENT: + ret = user_event_ack(uctxt, subctxt_fp(fp), user_val); + break; + case HFI1_CMD_SET_PKEY: + if (HFI1_CAP_IS_USET(PKEY_CHECK)) + ret = set_ctxt_pkey(uctxt, subctxt_fp(fp), user_val); + else + ret = -EPERM; + break; + case HFI1_CMD_CTXT_RESET: { + struct send_context *sc; + struct hfi1_devdata *dd; + + if (!uctxt || !uctxt->dd || !uctxt->sc) { + ret = -EINVAL; + break; + } + /* + * There is no protection here. User level has to + * guarantee that no one will be writing to the send + * context while it is being re-initialized. + * If user level breaks that guarantee, it will break + * it's own context and no one else's. + */ + dd = uctxt->dd; + sc = uctxt->sc; + /* + * Wait until the interrupt handler has marked the + * context as halted or frozen. Report error if we time + * out. + */ + wait_event_interruptible_timeout( + sc->halt_wait, (sc->flags & SCF_HALTED), + msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT)); + if (!(sc->flags & SCF_HALTED)) { + ret = -ENOLCK; + break; + } + /* + * If the send context was halted due to a Freeze, + * wait until the device has been "unfrozen" before + * resetting the context. + */ + if (sc->flags & SCF_FROZEN) { + wait_event_interruptible_timeout( + dd->event_queue, + !(ACCESS_ONCE(dd->flags) & HFI1_FROZEN), + msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT)); + if (dd->flags & HFI1_FROZEN) { + ret = -ENOLCK; + break; + } + if (dd->flags & HFI1_FORCED_FREEZE) { + /* Don't allow context reset if we are into + * forced freeze */ + ret = -ENODEV; + break; + } + sc_disable(sc); + ret = sc_enable(sc); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB, + uctxt->ctxt); + } else + ret = sc_restart(sc); + if (!ret) + sc_return_credits(sc); + break; + } + case HFI1_CMD_EP_INFO: + case HFI1_CMD_EP_ERASE_CHIP: + case HFI1_CMD_EP_ERASE_P0: + case HFI1_CMD_EP_ERASE_P1: + case HFI1_CMD_EP_READ_P0: + case HFI1_CMD_EP_READ_P1: + case HFI1_CMD_EP_WRITE_P0: + case HFI1_CMD_EP_WRITE_P1: + ret = handle_eprom_command(&cmd); + break; + } + + if (ret >= 0) + ret = consumed; +bail: + return ret; +} + +static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from) +{ + struct hfi1_user_sdma_pkt_q *pq; + struct hfi1_user_sdma_comp_q *cq; + int ret = 0, done = 0, reqs = 0; + unsigned long dim = from->nr_segs; + + if (!user_sdma_comp_fp(kiocb->ki_filp) || + !user_sdma_pkt_fp(kiocb->ki_filp)) { + ret = -EIO; + goto done; + } + + if (!iter_is_iovec(from) || !dim) { + ret = -EINVAL; + goto done; + } + + hfi1_cdbg(SDMA, "SDMA request from %u:%u (%lu)", + ctxt_fp(kiocb->ki_filp)->ctxt, subctxt_fp(kiocb->ki_filp), + dim); + pq = user_sdma_pkt_fp(kiocb->ki_filp); + cq = user_sdma_comp_fp(kiocb->ki_filp); + + if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) { + ret = -ENOSPC; + goto done; + } + + while (dim) { + unsigned long count = 0; + + ret = hfi1_user_sdma_process_request( + kiocb->ki_filp, (struct iovec *)(from->iov + done), + dim, &count); + if (ret) + goto done; + dim -= count; + done += count; + reqs++; + } +done: + return ret ? ret : reqs; +} + +static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) +{ + struct hfi1_ctxtdata *uctxt; + struct hfi1_devdata *dd; + unsigned long flags, pfn; + u64 token = vma->vm_pgoff << PAGE_SHIFT, + memaddr = 0; + u8 subctxt, mapio = 0, vmf = 0, type; + ssize_t memlen = 0; + int ret = 0; + u16 ctxt; + + uctxt = ctxt_fp(fp); + if (!is_valid_mmap(token) || !uctxt || + !(vma->vm_flags & VM_SHARED)) { + ret = -EINVAL; + goto done; + } + dd = uctxt->dd; + ctxt = HFI1_MMAP_TOKEN_GET(CTXT, token); + subctxt = HFI1_MMAP_TOKEN_GET(SUBCTXT, token); + type = HFI1_MMAP_TOKEN_GET(TYPE, token); + if (ctxt != uctxt->ctxt || subctxt != subctxt_fp(fp)) { + ret = -EINVAL; + goto done; + } + + flags = vma->vm_flags; + + switch (type) { + case PIO_BUFS: + case PIO_BUFS_SOP: + memaddr = ((dd->physaddr + TXE_PIO_SEND) + + /* chip pio base */ + (uctxt->sc->hw_context * (1 << 16))) + + /* 64K PIO space / ctxt */ + (type == PIO_BUFS_SOP ? + (TXE_PIO_SIZE / 2) : 0); /* sop? */ + /* + * Map only the amount allocated to the context, not the + * entire available context's PIO space. + */ + memlen = ALIGN(uctxt->sc->credits * PIO_BLOCK_SIZE, + PAGE_SIZE); + flags &= ~VM_MAYREAD; + flags |= VM_DONTCOPY | VM_DONTEXPAND; + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + mapio = 1; + break; + case PIO_CRED: + if (flags & VM_WRITE) { + ret = -EPERM; + goto done; + } + /* + * The credit return location for this context could be on the + * second or third page allocated for credit returns (if number + * of enabled contexts > 64 and 128 respectively). + */ + memaddr = dd->cr_base[uctxt->numa_id].pa + + (((u64)uctxt->sc->hw_free - + (u64)dd->cr_base[uctxt->numa_id].va) & PAGE_MASK); + memlen = PAGE_SIZE; + flags &= ~VM_MAYWRITE; + flags |= VM_DONTCOPY | VM_DONTEXPAND; + /* + * The driver has already allocated memory for credit + * returns and programmed it into the chip. Has that + * memory been flagged as non-cached? + */ + /* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */ + mapio = 1; + break; + case RCV_HDRQ: + memaddr = uctxt->rcvhdrq_phys; + memlen = uctxt->rcvhdrq_size; + break; + case RCV_EGRBUF: { + unsigned long addr; + int i; + /* + * The RcvEgr buffer need to be handled differently + * as multiple non-contiguous pages need to be mapped + * into the user process. + */ + memlen = uctxt->egrbufs.size; + if ((vma->vm_end - vma->vm_start) != memlen) { + dd_dev_err(dd, "Eager buffer map size invalid (%lu != %lu)\n", + (vma->vm_end - vma->vm_start), memlen); + ret = -EINVAL; + goto done; + } + if (vma->vm_flags & VM_WRITE) { + ret = -EPERM; + goto done; + } + vma->vm_flags &= ~VM_MAYWRITE; + addr = vma->vm_start; + for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) { + ret = remap_pfn_range( + vma, addr, + uctxt->egrbufs.buffers[i].phys >> PAGE_SHIFT, + uctxt->egrbufs.buffers[i].len, + vma->vm_page_prot); + if (ret < 0) + goto done; + addr += uctxt->egrbufs.buffers[i].len; + } + ret = 0; + goto done; + } + case UREGS: + /* + * Map only the page that contains this context's user + * registers. + */ + memaddr = (unsigned long) + (dd->physaddr + RXE_PER_CONTEXT_USER) + + (uctxt->ctxt * RXE_PER_CONTEXT_SIZE); + /* + * TidFlow table is on the same page as the rest of the + * user registers. + */ + memlen = PAGE_SIZE; + flags |= VM_DONTCOPY | VM_DONTEXPAND; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + mapio = 1; + break; + case EVENTS: + /* + * Use the page where this context's flags are. User level + * knows where it's own bitmap is within the page. + */ + memaddr = ((unsigned long)dd->events + + ((uctxt->ctxt - dd->first_user_ctxt) * + HFI1_MAX_SHARED_CTXTS)) & PAGE_MASK; + memlen = PAGE_SIZE; + /* + * v3.7 removes VM_RESERVED but the effect is kept by + * using VM_IO. + */ + flags |= VM_IO | VM_DONTEXPAND; + vmf = 1; + break; + case STATUS: + memaddr = kvirt_to_phys((void *)dd->status); + memlen = PAGE_SIZE; + flags |= VM_IO | VM_DONTEXPAND; + break; + case RTAIL: + if (!HFI1_CAP_IS_USET(DMA_RTAIL)) { + /* + * If the memory allocation failed, the context alloc + * also would have failed, so we would never get here + */ + ret = -EINVAL; + goto done; + } + if (flags & VM_WRITE) { + ret = -EPERM; + goto done; + } + memaddr = uctxt->rcvhdrqtailaddr_phys; + memlen = PAGE_SIZE; + flags &= ~VM_MAYWRITE; + break; + case SUBCTXT_UREGS: + memaddr = (u64)uctxt->subctxt_uregbase; + memlen = PAGE_SIZE; + flags |= VM_IO | VM_DONTEXPAND; + vmf = 1; + break; + case SUBCTXT_RCV_HDRQ: + memaddr = (u64)uctxt->subctxt_rcvhdr_base; + memlen = uctxt->rcvhdrq_size * uctxt->subctxt_cnt; + flags |= VM_IO | VM_DONTEXPAND; + vmf = 1; + break; + case SUBCTXT_EGRBUF: + memaddr = (u64)uctxt->subctxt_rcvegrbuf; + memlen = uctxt->egrbufs.size * uctxt->subctxt_cnt; + flags |= VM_IO | VM_DONTEXPAND; + flags &= ~VM_MAYWRITE; + vmf = 1; + break; + case SDMA_COMP: { + struct hfi1_user_sdma_comp_q *cq; + + if (!user_sdma_comp_fp(fp)) { + ret = -EFAULT; + goto done; + } + cq = user_sdma_comp_fp(fp); + memaddr = (u64)cq->comps; + memlen = ALIGN(sizeof(*cq->comps) * cq->nentries, PAGE_SIZE); + flags |= VM_IO | VM_DONTEXPAND; + vmf = 1; + break; + } + default: + ret = -EINVAL; + break; + } + + if ((vma->vm_end - vma->vm_start) != memlen) { + hfi1_cdbg(PROC, "%u:%u Memory size mismatch %lu:%lu", + uctxt->ctxt, subctxt_fp(fp), + (vma->vm_end - vma->vm_start), memlen); + ret = -EINVAL; + goto done; + } + + vma->vm_flags = flags; + dd_dev_info(dd, + "%s: %u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n", + __func__, ctxt, subctxt, type, mapio, vmf, memaddr, memlen, + vma->vm_end - vma->vm_start, vma->vm_flags); + pfn = (unsigned long)(memaddr >> PAGE_SHIFT); + if (vmf) { + vma->vm_pgoff = pfn; + vma->vm_ops = &vm_ops; + ret = 0; + } else if (mapio) { + ret = io_remap_pfn_range(vma, vma->vm_start, pfn, memlen, + vma->vm_page_prot); + } else { + ret = remap_pfn_range(vma, vma->vm_start, pfn, memlen, + vma->vm_page_prot); + } +done: + return ret; +} + +/* + * Local (non-chip) user memory is not mapped right away but as it is + * accessed by the user-level code. + */ +static int vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page; + + page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT)); + if (!page) + return VM_FAULT_SIGBUS; + + get_page(page); + vmf->page = page; + + return 0; +} + +static unsigned int hfi1_poll(struct file *fp, struct poll_table_struct *pt) +{ + struct hfi1_ctxtdata *uctxt; + unsigned pollflag; + + uctxt = ctxt_fp(fp); + if (!uctxt) + pollflag = POLLERR; + else if (uctxt->poll_type == HFI1_POLL_TYPE_URGENT) + pollflag = poll_urgent(fp, pt); + else if (uctxt->poll_type == HFI1_POLL_TYPE_ANYRCV) + pollflag = poll_next(fp, pt); + else /* invalid */ + pollflag = POLLERR; + + return pollflag; +} + +static int hfi1_file_close(struct inode *inode, struct file *fp) +{ + struct hfi1_filedata *fdata = fp->private_data; + struct hfi1_ctxtdata *uctxt = fdata->uctxt; + struct hfi1_devdata *dd; + unsigned long flags, *ev; + + fp->private_data = NULL; + + if (!uctxt) + goto done; + + hfi1_cdbg(PROC, "freeing ctxt %u:%u", uctxt->ctxt, fdata->subctxt); + dd = uctxt->dd; + mutex_lock(&hfi1_mutex); + + flush_wc(); + /* drain user sdma queue */ + if (fdata->pq) + hfi1_user_sdma_free_queues(fdata); + + /* + * Clear any left over, unhandled events so the next process that + * gets this context doesn't get confused. + */ + ev = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) * + HFI1_MAX_SHARED_CTXTS) + fdata->subctxt; + *ev = 0; + + if (--uctxt->cnt) { + uctxt->active_slaves &= ~(1 << fdata->subctxt); + uctxt->subpid[fdata->subctxt] = 0; + mutex_unlock(&hfi1_mutex); + goto done; + } + + spin_lock_irqsave(&dd->uctxt_lock, flags); + /* + * Disable receive context and interrupt available, reset all + * RcvCtxtCtrl bits to default values. + */ + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS | + HFI1_RCVCTRL_TIDFLOW_DIS | + HFI1_RCVCTRL_INTRAVAIL_DIS | + HFI1_RCVCTRL_ONE_PKT_EGR_DIS | + HFI1_RCVCTRL_NO_RHQ_DROP_DIS | + HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt->ctxt); + /* Clear the context's J_KEY */ + hfi1_clear_ctxt_jkey(dd, uctxt->ctxt); + /* + * Reset context integrity checks to default. + * (writes to CSRs probably belong in chip.c) + */ + write_kctxt_csr(dd, uctxt->sc->hw_context, SEND_CTXT_CHECK_ENABLE, + hfi1_pkt_default_send_ctxt_mask(dd, uctxt->sc->type)); + sc_disable(uctxt->sc); + uctxt->pid = 0; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + + dd->rcd[uctxt->ctxt] = NULL; + uctxt->rcvwait_to = 0; + uctxt->piowait_to = 0; + uctxt->rcvnowait = 0; + uctxt->pionowait = 0; + uctxt->event_flags = 0; + + hfi1_clear_tids(uctxt); + hfi1_clear_ctxt_pkey(dd, uctxt->ctxt); + + if (uctxt->tid_pg_list) + unlock_exp_tids(uctxt); + + hfi1_stats.sps_ctxts--; + dd->freectxts++; + mutex_unlock(&hfi1_mutex); + hfi1_free_ctxtdata(dd, uctxt); +done: + kfree(fdata); + return 0; +} + +/* + * Convert kernel *virtual* addresses to physical addresses. + * This is used to vmalloc'ed addresses. + */ +static u64 kvirt_to_phys(void *addr) +{ + struct page *page; + u64 paddr = 0; + + page = vmalloc_to_page(addr); + if (page) + paddr = page_to_pfn(page) << PAGE_SHIFT; + + return paddr; +} + +static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo) +{ + int i_minor, ret = 0; + unsigned swmajor, swminor, alg = HFI1_ALG_ACROSS; + + swmajor = uinfo->userversion >> 16; + if (swmajor != HFI1_USER_SWMAJOR) { + ret = -ENODEV; + goto done; + } + + swminor = uinfo->userversion & 0xffff; + + if (uinfo->hfi1_alg < HFI1_ALG_COUNT) + alg = uinfo->hfi1_alg; + + mutex_lock(&hfi1_mutex); + /* First, lets check if we need to setup a shared context? */ + if (uinfo->subctxt_cnt) + ret = find_shared_ctxt(fp, uinfo); + + /* + * We execute the following block if we couldn't find a + * shared context or if context sharing is not required. + */ + if (!ret) { + i_minor = iminor(file_inode(fp)) - HFI1_USER_MINOR_BASE; + ret = get_user_context(fp, uinfo, i_minor - 1, alg); + } + mutex_unlock(&hfi1_mutex); +done: + return ret; +} + +static int get_user_context(struct file *fp, struct hfi1_user_info *uinfo, + int devno, unsigned alg) +{ + struct hfi1_devdata *dd = NULL; + int ret = 0, devmax, npresent, nup, dev; + + devmax = hfi1_count_units(&npresent, &nup); + if (!npresent) { + ret = -ENXIO; + goto done; + } + if (!nup) { + ret = -ENETDOWN; + goto done; + } + if (devno >= 0) { + dd = hfi1_lookup(devno); + if (!dd) + ret = -ENODEV; + else if (!dd->freectxts) + ret = -EBUSY; + } else { + struct hfi1_devdata *pdd; + + if (alg == HFI1_ALG_ACROSS) { + unsigned free = 0U; + + for (dev = 0; dev < devmax; dev++) { + pdd = hfi1_lookup(dev); + if (pdd && pdd->freectxts && + pdd->freectxts > free) { + dd = pdd; + free = pdd->freectxts; + } + } + } else { + for (dev = 0; dev < devmax; dev++) { + pdd = hfi1_lookup(dev); + if (pdd && pdd->freectxts) { + dd = pdd; + break; + } + } + } + if (!dd) + ret = -EBUSY; + } +done: + return ret ? ret : allocate_ctxt(fp, dd, uinfo); +} + +static int find_shared_ctxt(struct file *fp, + const struct hfi1_user_info *uinfo) +{ + int devmax, ndev, i; + int ret = 0; + + devmax = hfi1_count_units(NULL, NULL); + + for (ndev = 0; ndev < devmax; ndev++) { + struct hfi1_devdata *dd = hfi1_lookup(ndev); + + /* device portion of usable() */ + if (!(dd && (dd->flags & HFI1_PRESENT) && dd->kregbase)) + continue; + for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) { + struct hfi1_ctxtdata *uctxt = dd->rcd[i]; + + /* Skip ctxts which are not yet open */ + if (!uctxt || !uctxt->cnt) + continue; + /* Skip ctxt if it doesn't match the requested one */ + if (memcmp(uctxt->uuid, uinfo->uuid, + sizeof(uctxt->uuid)) || + uctxt->subctxt_id != uinfo->subctxt_id || + uctxt->subctxt_cnt != uinfo->subctxt_cnt) + continue; + + /* Verify the sharing process matches the master */ + if (uctxt->userversion != uinfo->userversion || + uctxt->cnt >= uctxt->subctxt_cnt) { + ret = -EINVAL; + goto done; + } + ctxt_fp(fp) = uctxt; + subctxt_fp(fp) = uctxt->cnt++; + uctxt->subpid[subctxt_fp(fp)] = current->pid; + uctxt->active_slaves |= 1 << subctxt_fp(fp); + ret = 1; + goto done; + } + } + +done: + return ret; +} + +static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd, + struct hfi1_user_info *uinfo) +{ + struct hfi1_ctxtdata *uctxt; + unsigned ctxt; + int ret; + + if (dd->flags & HFI1_FROZEN) { + /* + * Pick an error that is unique from all other errors + * that are returned so the user process knows that + * it tried to allocate while the SPC was frozen. It + * it should be able to retry with success in a short + * while. + */ + return -EIO; + } + + for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts; ctxt++) + if (!dd->rcd[ctxt]) + break; + + if (ctxt == dd->num_rcv_contexts) + return -EBUSY; + + uctxt = hfi1_create_ctxtdata(dd->pport, ctxt); + if (!uctxt) { + dd_dev_err(dd, + "Unable to allocate ctxtdata memory, failing open\n"); + return -ENOMEM; + } + /* + * Allocate and enable a PIO send context. + */ + uctxt->sc = sc_alloc(dd, SC_USER, uctxt->rcvhdrqentsize, + uctxt->numa_id); + if (!uctxt->sc) + return -ENOMEM; + + dbg("allocated send context %u(%u)\n", uctxt->sc->sw_index, + uctxt->sc->hw_context); + ret = sc_enable(uctxt->sc); + if (ret) + return ret; + /* + * Setup shared context resources if the user-level has requested + * shared contexts and this is the 'master' process. + * This has to be done here so the rest of the sub-contexts find the + * proper master. + */ + if (uinfo->subctxt_cnt && !subctxt_fp(fp)) { + ret = init_subctxts(uctxt, uinfo); + /* + * On error, we don't need to disable and de-allocate the + * send context because it will be done during file close + */ + if (ret) + return ret; + } + uctxt->userversion = uinfo->userversion; + uctxt->pid = current->pid; + uctxt->flags = HFI1_CAP_UGET(MASK); + init_waitqueue_head(&uctxt->wait); + strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm)); + memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid)); + uctxt->jkey = generate_jkey(current_uid()); + INIT_LIST_HEAD(&uctxt->sdma_queues); + spin_lock_init(&uctxt->sdma_qlock); + hfi1_stats.sps_ctxts++; + dd->freectxts--; + ctxt_fp(fp) = uctxt; + + return 0; +} + +static int init_subctxts(struct hfi1_ctxtdata *uctxt, + const struct hfi1_user_info *uinfo) +{ + int ret = 0; + unsigned num_subctxts; + + num_subctxts = uinfo->subctxt_cnt; + if (num_subctxts > HFI1_MAX_SHARED_CTXTS) { + ret = -EINVAL; + goto bail; + } + + uctxt->subctxt_cnt = uinfo->subctxt_cnt; + uctxt->subctxt_id = uinfo->subctxt_id; + uctxt->active_slaves = 1; + uctxt->redirect_seq_cnt = 1; + set_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags); +bail: + return ret; +} + +static int setup_subctxt(struct hfi1_ctxtdata *uctxt) +{ + int ret = 0; + unsigned num_subctxts = uctxt->subctxt_cnt; + + uctxt->subctxt_uregbase = vmalloc_user(PAGE_SIZE); + if (!uctxt->subctxt_uregbase) { + ret = -ENOMEM; + goto bail; + } + /* We can take the size of the RcvHdr Queue from the master */ + uctxt->subctxt_rcvhdr_base = vmalloc_user(uctxt->rcvhdrq_size * + num_subctxts); + if (!uctxt->subctxt_rcvhdr_base) { + ret = -ENOMEM; + goto bail_ureg; + } + + uctxt->subctxt_rcvegrbuf = vmalloc_user(uctxt->egrbufs.size * + num_subctxts); + if (!uctxt->subctxt_rcvegrbuf) { + ret = -ENOMEM; + goto bail_rhdr; + } + goto bail; +bail_rhdr: + vfree(uctxt->subctxt_rcvhdr_base); +bail_ureg: + vfree(uctxt->subctxt_uregbase); + uctxt->subctxt_uregbase = NULL; +bail: + return ret; +} + +static int user_init(struct file *fp) +{ + int ret; + unsigned int rcvctrl_ops = 0; + struct hfi1_ctxtdata *uctxt = ctxt_fp(fp); + + /* make sure that the context has already been setup */ + if (!test_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags)) { + ret = -EFAULT; + goto done; + } + + /* + * Subctxts don't need to initialize anything since master + * has done it. + */ + if (subctxt_fp(fp)) { + ret = wait_event_interruptible(uctxt->wait, + !test_bit(HFI1_CTXT_MASTER_UNINIT, + &uctxt->event_flags)); + goto done; + } + + /* initialize poll variables... */ + uctxt->urgent = 0; + uctxt->urgent_poll = 0; + + /* + * Now enable the ctxt for receive. + * For chips that are set to DMA the tail register to memory + * when they change (and when the update bit transitions from + * 0 to 1. So for those chips, we turn it off and then back on. + * This will (very briefly) affect any other open ctxts, but the + * duration is very short, and therefore isn't an issue. We + * explicitly set the in-memory tail copy to 0 beforehand, so we + * don't have to wait to be sure the DMA update has happened + * (chip resets head/tail to 0 on transition to enable). + */ + if (uctxt->rcvhdrtail_kvaddr) + clear_rcvhdrtail(uctxt); + + /* Setup J_KEY before enabling the context */ + hfi1_set_ctxt_jkey(uctxt->dd, uctxt->ctxt, uctxt->jkey); + + rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB; + if (HFI1_CAP_KGET_MASK(uctxt->flags, HDRSUPP)) + rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB; + /* + * Ignore the bit in the flags for now until proper + * support for multiple packet per rcv array entry is + * added. + */ + if (!HFI1_CAP_KGET_MASK(uctxt->flags, MULTI_PKT_EGR)) + rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB; + if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_EGR_FULL)) + rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; + if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL)) + rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; + if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL)) + rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB; + hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt->ctxt); + + /* Notify any waiting slaves */ + if (uctxt->subctxt_cnt) { + clear_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags); + wake_up(&uctxt->wait); + } + ret = 0; + +done: + return ret; +} + +static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len) +{ + struct hfi1_ctxt_info cinfo; + struct hfi1_ctxtdata *uctxt = ctxt_fp(fp); + struct hfi1_filedata *fd = fp->private_data; + int ret = 0; + + memset(&cinfo, 0, sizeof(cinfo)); + ret = hfi1_get_base_kinfo(uctxt, &cinfo); + if (ret < 0) + goto done; + cinfo.num_active = hfi1_count_active_units(); + cinfo.unit = uctxt->dd->unit; + cinfo.ctxt = uctxt->ctxt; + cinfo.subctxt = subctxt_fp(fp); + cinfo.rcvtids = roundup(uctxt->egrbufs.alloced, + uctxt->dd->rcv_entries.group_size) + + uctxt->expected_count; + cinfo.credits = uctxt->sc->credits; + cinfo.numa_node = uctxt->numa_id; + cinfo.rec_cpu = fd->rec_cpu_num; + cinfo.send_ctxt = uctxt->sc->hw_context; + + cinfo.egrtids = uctxt->egrbufs.alloced; + cinfo.rcvhdrq_cnt = uctxt->rcvhdrq_cnt; + cinfo.rcvhdrq_entsize = uctxt->rcvhdrqentsize << 2; + cinfo.sdma_ring_size = user_sdma_comp_fp(fp)->nentries; + cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size; + + trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, subctxt_fp(fp), cinfo); + if (copy_to_user(ubase, &cinfo, sizeof(cinfo))) + ret = -EFAULT; +done: + return ret; +} + +static int setup_ctxt(struct file *fp) +{ + struct hfi1_ctxtdata *uctxt = ctxt_fp(fp); + struct hfi1_devdata *dd = uctxt->dd; + int ret = 0; + + /* + * Context should be set up only once (including allocation and + * programming of eager buffers. This is done if context sharing + * is not requested or by the master process. + */ + if (!uctxt->subctxt_cnt || !subctxt_fp(fp)) { + ret = hfi1_init_ctxt(uctxt->sc); + if (ret) + goto done; + + /* Now allocate the RcvHdr queue and eager buffers. */ + ret = hfi1_create_rcvhdrq(dd, uctxt); + if (ret) + goto done; + ret = hfi1_setup_eagerbufs(uctxt); + if (ret) + goto done; + if (uctxt->subctxt_cnt && !subctxt_fp(fp)) { + ret = setup_subctxt(uctxt); + if (ret) + goto done; + } + /* Setup Expected Rcv memories */ + uctxt->tid_pg_list = vzalloc(uctxt->expected_count * + sizeof(struct page **)); + if (!uctxt->tid_pg_list) { + ret = -ENOMEM; + goto done; + } + uctxt->physshadow = vzalloc(uctxt->expected_count * + sizeof(*uctxt->physshadow)); + if (!uctxt->physshadow) { + ret = -ENOMEM; + goto done; + } + /* allocate expected TID map and initialize the cursor */ + atomic_set(&uctxt->tidcursor, 0); + uctxt->numtidgroups = uctxt->expected_count / + dd->rcv_entries.group_size; + uctxt->tidmapcnt = uctxt->numtidgroups / BITS_PER_LONG + + !!(uctxt->numtidgroups % BITS_PER_LONG); + uctxt->tidusemap = kzalloc_node(uctxt->tidmapcnt * + sizeof(*uctxt->tidusemap), + GFP_KERNEL, uctxt->numa_id); + if (!uctxt->tidusemap) { + ret = -ENOMEM; + goto done; + } + /* + * In case that the number of groups is not a multiple of + * 64 (the number of groups in a tidusemap element), mark + * the extra ones as used. This will effectively make them + * permanently used and should never be assigned. Otherwise, + * the code which checks how many free groups we have will + * get completely confused about the state of the bits. + */ + if (uctxt->numtidgroups % BITS_PER_LONG) + uctxt->tidusemap[uctxt->tidmapcnt - 1] = + ~((1ULL << (uctxt->numtidgroups % + BITS_PER_LONG)) - 1); + trace_hfi1_exp_tid_map(uctxt->ctxt, subctxt_fp(fp), 0, + uctxt->tidusemap, uctxt->tidmapcnt); + } + ret = hfi1_user_sdma_alloc_queues(uctxt, fp); + if (ret) + goto done; + + set_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags); +done: + return ret; +} + +static int get_base_info(struct file *fp, void __user *ubase, __u32 len) +{ + struct hfi1_base_info binfo; + struct hfi1_ctxtdata *uctxt = ctxt_fp(fp); + struct hfi1_devdata *dd = uctxt->dd; + ssize_t sz; + unsigned offset; + int ret = 0; + + trace_hfi1_uctxtdata(uctxt->dd, uctxt); + + memset(&binfo, 0, sizeof(binfo)); + binfo.hw_version = dd->revision; + binfo.sw_version = HFI1_KERN_SWVERSION; + binfo.bthqp = kdeth_qp; + binfo.jkey = uctxt->jkey; + /* + * If more than 64 contexts are enabled the allocated credit + * return will span two or three contiguous pages. Since we only + * map the page containing the context's credit return address, + * we need to calculate the offset in the proper page. + */ + offset = ((u64)uctxt->sc->hw_free - + (u64)dd->cr_base[uctxt->numa_id].va) % PAGE_SIZE; + binfo.sc_credits_addr = HFI1_MMAP_TOKEN(PIO_CRED, uctxt->ctxt, + subctxt_fp(fp), offset); + binfo.pio_bufbase = HFI1_MMAP_TOKEN(PIO_BUFS, uctxt->ctxt, + subctxt_fp(fp), + uctxt->sc->base_addr); + binfo.pio_bufbase_sop = HFI1_MMAP_TOKEN(PIO_BUFS_SOP, + uctxt->ctxt, + subctxt_fp(fp), + uctxt->sc->base_addr); + binfo.rcvhdr_bufbase = HFI1_MMAP_TOKEN(RCV_HDRQ, uctxt->ctxt, + subctxt_fp(fp), + uctxt->rcvhdrq); + binfo.rcvegr_bufbase = HFI1_MMAP_TOKEN(RCV_EGRBUF, uctxt->ctxt, + subctxt_fp(fp), + uctxt->egrbufs.rcvtids[0].phys); + binfo.sdma_comp_bufbase = HFI1_MMAP_TOKEN(SDMA_COMP, uctxt->ctxt, + subctxt_fp(fp), 0); + /* + * user regs are at + * (RXE_PER_CONTEXT_USER + (ctxt * RXE_PER_CONTEXT_SIZE)) + */ + binfo.user_regbase = HFI1_MMAP_TOKEN(UREGS, uctxt->ctxt, + subctxt_fp(fp), 0); + offset = ((((uctxt->ctxt - dd->first_user_ctxt) * + HFI1_MAX_SHARED_CTXTS) + subctxt_fp(fp)) * + sizeof(*dd->events)) & ~PAGE_MASK; + binfo.events_bufbase = HFI1_MMAP_TOKEN(EVENTS, uctxt->ctxt, + subctxt_fp(fp), + offset); + binfo.status_bufbase = HFI1_MMAP_TOKEN(STATUS, uctxt->ctxt, + subctxt_fp(fp), + dd->status); + if (HFI1_CAP_IS_USET(DMA_RTAIL)) + binfo.rcvhdrtail_base = HFI1_MMAP_TOKEN(RTAIL, uctxt->ctxt, + subctxt_fp(fp), 0); + if (uctxt->subctxt_cnt) { + binfo.subctxt_uregbase = HFI1_MMAP_TOKEN(SUBCTXT_UREGS, + uctxt->ctxt, + subctxt_fp(fp), 0); + binfo.subctxt_rcvhdrbuf = HFI1_MMAP_TOKEN(SUBCTXT_RCV_HDRQ, + uctxt->ctxt, + subctxt_fp(fp), 0); + binfo.subctxt_rcvegrbuf = HFI1_MMAP_TOKEN(SUBCTXT_EGRBUF, + uctxt->ctxt, + subctxt_fp(fp), 0); + } + sz = (len < sizeof(binfo)) ? len : sizeof(binfo); + if (copy_to_user(ubase, &binfo, sz)) + ret = -EFAULT; + return ret; +} + +static unsigned int poll_urgent(struct file *fp, + struct poll_table_struct *pt) +{ + struct hfi1_ctxtdata *uctxt = ctxt_fp(fp); + struct hfi1_devdata *dd = uctxt->dd; + unsigned pollflag; + + poll_wait(fp, &uctxt->wait, pt); + + spin_lock_irq(&dd->uctxt_lock); + if (uctxt->urgent != uctxt->urgent_poll) { + pollflag = POLLIN | POLLRDNORM; + uctxt->urgent_poll = uctxt->urgent; + } else { + pollflag = 0; + set_bit(HFI1_CTXT_WAITING_URG, &uctxt->event_flags); + } + spin_unlock_irq(&dd->uctxt_lock); + + return pollflag; +} + +static unsigned int poll_next(struct file *fp, + struct poll_table_struct *pt) +{ + struct hfi1_ctxtdata *uctxt = ctxt_fp(fp); + struct hfi1_devdata *dd = uctxt->dd; + unsigned pollflag; + + poll_wait(fp, &uctxt->wait, pt); + + spin_lock_irq(&dd->uctxt_lock); + if (hdrqempty(uctxt)) { + set_bit(HFI1_CTXT_WAITING_RCV, &uctxt->event_flags); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt->ctxt); + pollflag = 0; + } else + pollflag = POLLIN | POLLRDNORM; + spin_unlock_irq(&dd->uctxt_lock); + + return pollflag; +} + +/* + * Find all user contexts in use, and set the specified bit in their + * event mask. + * See also find_ctxt() for a similar use, that is specific to send buffers. + */ +int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit) +{ + struct hfi1_ctxtdata *uctxt; + struct hfi1_devdata *dd = ppd->dd; + unsigned ctxt; + int ret = 0; + unsigned long flags; + + if (!dd->events) { + ret = -EINVAL; + goto done; + } + + spin_lock_irqsave(&dd->uctxt_lock, flags); + for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts; + ctxt++) { + uctxt = dd->rcd[ctxt]; + if (uctxt) { + unsigned long *evs = dd->events + + (uctxt->ctxt - dd->first_user_ctxt) * + HFI1_MAX_SHARED_CTXTS; + int i; + /* + * subctxt_cnt is 0 if not shared, so do base + * separately, first, then remaining subctxt, if any + */ + set_bit(evtbit, evs); + for (i = 1; i < uctxt->subctxt_cnt; i++) + set_bit(evtbit, evs + i); + } + } + spin_unlock_irqrestore(&dd->uctxt_lock, flags); +done: + return ret; +} + +/** + * manage_rcvq - manage a context's receive queue + * @uctxt: the context + * @subctxt: the sub-context + * @start_stop: action to carry out + * + * start_stop == 0 disables receive on the context, for use in queue + * overflow conditions. start_stop==1 re-enables, to be used to + * re-init the software copy of the head register + */ +static int manage_rcvq(struct hfi1_ctxtdata *uctxt, unsigned subctxt, + int start_stop) +{ + struct hfi1_devdata *dd = uctxt->dd; + unsigned int rcvctrl_op; + + if (subctxt) + goto bail; + /* atomically clear receive enable ctxt. */ + if (start_stop) { + /* + * On enable, force in-memory copy of the tail register to + * 0, so that protocol code doesn't have to worry about + * whether or not the chip has yet updated the in-memory + * copy or not on return from the system call. The chip + * always resets it's tail register back to 0 on a + * transition from disabled to enabled. + */ + if (uctxt->rcvhdrtail_kvaddr) + clear_rcvhdrtail(uctxt); + rcvctrl_op = HFI1_RCVCTRL_CTXT_ENB; + } else + rcvctrl_op = HFI1_RCVCTRL_CTXT_DIS; + hfi1_rcvctrl(dd, rcvctrl_op, uctxt->ctxt); + /* always; new head should be equal to new tail; see above */ +bail: + return 0; +} + +/* + * clear the event notifier events for this context. + * User process then performs actions appropriate to bit having been + * set, if desired, and checks again in future. + */ +static int user_event_ack(struct hfi1_ctxtdata *uctxt, int subctxt, + unsigned long events) +{ + int i; + struct hfi1_devdata *dd = uctxt->dd; + unsigned long *evs; + + if (!dd->events) + return 0; + + evs = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) * + HFI1_MAX_SHARED_CTXTS) + subctxt; + + for (i = 0; i <= _HFI1_MAX_EVENT_BIT; i++) { + if (!test_bit(i, &events)) + continue; + clear_bit(i, evs); + } + return 0; +} + +#define num_user_pages(vaddr, len) \ + (1 + (((((unsigned long)(vaddr) + \ + (unsigned long)(len) - 1) & PAGE_MASK) - \ + ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT)) + +/** + * tzcnt - count the number of trailing zeros in a 64bit value + * @value: the value to be examined + * + * Returns the number of trailing least significant zeros in the + * the input value. If the value is zero, return the number of + * bits of the value. + */ +static inline u8 tzcnt(u64 value) +{ + return value ? __builtin_ctzl(value) : sizeof(value) * 8; +} + +static inline unsigned num_free_groups(unsigned long map, u16 *start) +{ + unsigned free; + u16 bitidx = *start; + + if (bitidx >= BITS_PER_LONG) + return 0; + /* "Turn off" any bits set before our bit index */ + map &= ~((1ULL << bitidx) - 1); + free = tzcnt(map) - bitidx; + while (!free && bitidx < BITS_PER_LONG) { + /* Zero out the last set bit so we look at the rest */ + map &= ~(1ULL << bitidx); + /* + * Account for the previously checked bits and advance + * the bit index. We don't have to check for bitidx + * getting bigger than BITS_PER_LONG here as it would + * mean extra instructions that we don't need. If it + * did happen, it would push free to a negative value + * which will break the loop. + */ + free = tzcnt(map) - ++bitidx; + } + *start = bitidx; + return free; +} + +static int exp_tid_setup(struct file *fp, struct hfi1_tid_info *tinfo) +{ + int ret = 0; + struct hfi1_ctxtdata *uctxt = ctxt_fp(fp); + struct hfi1_devdata *dd = uctxt->dd; + unsigned tid, mapped = 0, npages, ngroups, exp_groups, + tidpairs = uctxt->expected_count / 2; + struct page **pages; + unsigned long vaddr, tidmap[uctxt->tidmapcnt]; + dma_addr_t *phys; + u32 tidlist[tidpairs], pairidx = 0, tidcursor; + u16 useidx, idx, bitidx, tidcnt = 0; + + vaddr = tinfo->vaddr; + + if (vaddr & ~PAGE_MASK) { + ret = -EINVAL; + goto bail; + } + + npages = num_user_pages(vaddr, tinfo->length); + if (!npages) { + ret = -EINVAL; + goto bail; + } + if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, + npages * PAGE_SIZE)) { + dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n", + (void *)vaddr, npages); + ret = -EFAULT; + goto bail; + } + + memset(tidmap, 0, sizeof(tidmap[0]) * uctxt->tidmapcnt); + memset(tidlist, 0, sizeof(tidlist[0]) * tidpairs); + + exp_groups = uctxt->expected_count / dd->rcv_entries.group_size; + /* which group set do we look at first? */ + tidcursor = atomic_read(&uctxt->tidcursor); + useidx = (tidcursor >> 16) & 0xffff; + bitidx = tidcursor & 0xffff; + + /* + * Keep going until we've mapped all pages or we've exhausted all + * RcvArray entries. + * This iterates over the number of tidmaps + 1 + * (idx <= uctxt->tidmapcnt) so we check the bitmap which we + * started from one more time for any free bits before the + * starting point bit. + */ + for (mapped = 0, idx = 0; + mapped < npages && idx <= uctxt->tidmapcnt;) { + u64 i, offset = 0; + unsigned free, pinned, pmapped = 0, bits_used; + u16 grp; + + /* + * "Reserve" the needed group bits under lock so other + * processes can't step in the middle of it. Once + * reserved, we don't need the lock anymore since we + * are guaranteed the groups. + */ + spin_lock(&uctxt->exp_lock); + if (uctxt->tidusemap[useidx] == -1ULL || + bitidx >= BITS_PER_LONG) { + /* no free groups in the set, use the next */ + useidx = (useidx + 1) % uctxt->tidmapcnt; + idx++; + bitidx = 0; + spin_unlock(&uctxt->exp_lock); + continue; + } + ngroups = ((npages - mapped) / dd->rcv_entries.group_size) + + !!((npages - mapped) % dd->rcv_entries.group_size); + + /* + * If we've gotten here, the current set of groups does have + * one or more free groups. + */ + free = num_free_groups(uctxt->tidusemap[useidx], &bitidx); + if (!free) { + /* + * Despite the check above, free could still come back + * as 0 because we don't check the entire bitmap but + * we start from bitidx. + */ + spin_unlock(&uctxt->exp_lock); + continue; + } + bits_used = min(free, ngroups); + tidmap[useidx] |= ((1ULL << bits_used) - 1) << bitidx; + uctxt->tidusemap[useidx] |= tidmap[useidx]; + spin_unlock(&uctxt->exp_lock); + + /* + * At this point, we know where in the map we have free bits. + * properly offset into the various "shadow" arrays and compute + * the RcvArray entry index. + */ + offset = ((useidx * BITS_PER_LONG) + bitidx) * + dd->rcv_entries.group_size; + pages = uctxt->tid_pg_list + offset; + phys = uctxt->physshadow + offset; + tid = uctxt->expected_base + offset; + + /* Calculate how many pages we can pin based on free bits */ + pinned = min((bits_used * dd->rcv_entries.group_size), + (npages - mapped)); + /* + * Now that we know how many free RcvArray entries we have, + * we can pin that many user pages. + */ + ret = hfi1_get_user_pages(vaddr + (mapped * PAGE_SIZE), + pinned, pages); + if (ret) { + /* + * We can't continue because the pages array won't be + * initialized. This should never happen, + * unless perhaps the user has mpin'ed the pages + * themselves. + */ + dd_dev_info(dd, + "Failed to lock addr %p, %u pages: errno %d\n", + (void *) vaddr, pinned, -ret); + /* + * Let go of the bits that we reserved since we are not + * going to use them. + */ + spin_lock(&uctxt->exp_lock); + uctxt->tidusemap[useidx] &= + ~(((1ULL << bits_used) - 1) << bitidx); + spin_unlock(&uctxt->exp_lock); + goto done; + } + /* + * How many groups do we need based on how many pages we have + * pinned? + */ + ngroups = (pinned / dd->rcv_entries.group_size) + + !!(pinned % dd->rcv_entries.group_size); + /* + * Keep programming RcvArray entries for all the free + * groups. + */ + for (i = 0, grp = 0; grp < ngroups; i++, grp++) { + unsigned j; + u32 pair_size = 0, tidsize; + /* + * This inner loop will program an entire group or the + * array of pinned pages (which ever limit is hit + * first). + */ + for (j = 0; j < dd->rcv_entries.group_size && + pmapped < pinned; j++, pmapped++, tid++) { + tidsize = PAGE_SIZE; + phys[pmapped] = hfi1_map_page(dd->pcidev, + pages[pmapped], 0, + tidsize, PCI_DMA_FROMDEVICE); + trace_hfi1_exp_rcv_set(uctxt->ctxt, + subctxt_fp(fp), + tid, vaddr, + phys[pmapped], + pages[pmapped]); + /* + * Each RcvArray entry is programmed with one + * page * worth of memory. This will handle + * the 8K MTU as well as anything smaller + * due to the fact that both entries in the + * RcvTidPair are programmed with a page. + * PSM currently does not handle anything + * bigger than 8K MTU, so should we even worry + * about 10K here? + */ + hfi1_put_tid(dd, tid, PT_EXPECTED, + phys[pmapped], + ilog2(tidsize >> PAGE_SHIFT) + 1); + pair_size += tidsize >> PAGE_SHIFT; + EXP_TID_RESET(tidlist[pairidx], LEN, pair_size); + if (!(tid % 2)) { + tidlist[pairidx] |= + EXP_TID_SET(IDX, + (tid - uctxt->expected_base) + / 2); + tidlist[pairidx] |= + EXP_TID_SET(CTRL, 1); + tidcnt++; + } else { + tidlist[pairidx] |= + EXP_TID_SET(CTRL, 2); + pair_size = 0; + pairidx++; + } + } + /* + * We've programmed the entire group (or as much of the + * group as we'll use. Now, it's time to push it out... + */ + flush_wc(); + } + mapped += pinned; + atomic_set(&uctxt->tidcursor, + (((useidx & 0xffffff) << 16) | + ((bitidx + bits_used) & 0xffffff))); + } + trace_hfi1_exp_tid_map(uctxt->ctxt, subctxt_fp(fp), 0, uctxt->tidusemap, + uctxt->tidmapcnt); + +done: + /* If we've mapped anything, copy relevant info to user */ + if (mapped) { + if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist, + tidlist, sizeof(tidlist[0]) * tidcnt)) { + ret = -EFAULT; + goto done; + } + /* copy TID info to user */ + if (copy_to_user((void __user *)(unsigned long)tinfo->tidmap, + tidmap, sizeof(tidmap[0]) * uctxt->tidmapcnt)) + ret = -EFAULT; + } +bail: + /* + * Calculate mapped length. New Exp TID protocol does not "unwind" and + * report an error if it can't map the entire buffer. It just reports + * the length that was mapped. + */ + tinfo->length = mapped * PAGE_SIZE; + tinfo->tidcnt = tidcnt; + return ret; +} + +static int exp_tid_free(struct file *fp, struct hfi1_tid_info *tinfo) +{ + struct hfi1_ctxtdata *uctxt = ctxt_fp(fp); + struct hfi1_devdata *dd = uctxt->dd; + unsigned long tidmap[uctxt->tidmapcnt]; + struct page **pages; + dma_addr_t *phys; + u16 idx, bitidx, tid; + int ret = 0; + + if (copy_from_user(&tidmap, (void __user *)(unsigned long) + tinfo->tidmap, + sizeof(tidmap[0]) * uctxt->tidmapcnt)) { + ret = -EFAULT; + goto done; + } + for (idx = 0; idx < uctxt->tidmapcnt; idx++) { + unsigned long map; + + bitidx = 0; + if (!tidmap[idx]) + continue; + map = tidmap[idx]; + while ((bitidx = tzcnt(map)) < BITS_PER_LONG) { + int i, pcount = 0; + struct page *pshadow[dd->rcv_entries.group_size]; + unsigned offset = ((idx * BITS_PER_LONG) + bitidx) * + dd->rcv_entries.group_size; + + pages = uctxt->tid_pg_list + offset; + phys = uctxt->physshadow + offset; + tid = uctxt->expected_base + offset; + for (i = 0; i < dd->rcv_entries.group_size; + i++, tid++) { + if (pages[i]) { + hfi1_put_tid(dd, tid, PT_INVALID, + 0, 0); + trace_hfi1_exp_rcv_free(uctxt->ctxt, + subctxt_fp(fp), + tid, phys[i], + pages[i]); + pci_unmap_page(dd->pcidev, phys[i], + PAGE_SIZE, PCI_DMA_FROMDEVICE); + pshadow[pcount] = pages[i]; + pages[i] = NULL; + pcount++; + phys[i] = 0; + } + } + flush_wc(); + hfi1_release_user_pages(pshadow, pcount); + clear_bit(bitidx, &uctxt->tidusemap[idx]); + map &= ~(1ULL<ctxt, subctxt_fp(fp), 1, uctxt->tidusemap, + uctxt->tidmapcnt); +done: + return ret; +} + +static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt) +{ + struct hfi1_devdata *dd = uctxt->dd; + unsigned tid; + + dd_dev_info(dd, "ctxt %u unlocking any locked expTID pages\n", + uctxt->ctxt); + for (tid = 0; tid < uctxt->expected_count; tid++) { + struct page *p = uctxt->tid_pg_list[tid]; + dma_addr_t phys; + + if (!p) + continue; + + phys = uctxt->physshadow[tid]; + uctxt->physshadow[tid] = 0; + uctxt->tid_pg_list[tid] = NULL; + pci_unmap_page(dd->pcidev, phys, PAGE_SIZE, PCI_DMA_FROMDEVICE); + hfi1_release_user_pages(&p, 1); + } +} + +static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned subctxt, + u16 pkey) +{ + int ret = -ENOENT, i, intable = 0; + struct hfi1_pportdata *ppd = uctxt->ppd; + struct hfi1_devdata *dd = uctxt->dd; + + if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY) { + ret = -EINVAL; + goto done; + } + + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) + if (pkey == ppd->pkeys[i]) { + intable = 1; + break; + } + + if (intable) + ret = hfi1_set_ctxt_pkey(dd, uctxt->ctxt, pkey); +done: + return ret; +} + +static int ui_open(struct inode *inode, struct file *filp) +{ + struct hfi1_devdata *dd; + + dd = container_of(inode->i_cdev, struct hfi1_devdata, ui_cdev); + filp->private_data = dd; /* for other methods */ + return 0; +} + +static int ui_release(struct inode *inode, struct file *filp) +{ + /* nothing to do */ + return 0; +} + +static loff_t ui_lseek(struct file *filp, loff_t offset, int whence) +{ + struct hfi1_devdata *dd = filp->private_data; + + switch (whence) { + case SEEK_SET: + break; + case SEEK_CUR: + offset += filp->f_pos; + break; + case SEEK_END: + offset = ((dd->kregend - dd->kregbase) + DC8051_DATA_MEM_SIZE) - + offset; + break; + default: + return -EINVAL; + } + + if (offset < 0) + return -EINVAL; + + if (offset >= (dd->kregend - dd->kregbase) + DC8051_DATA_MEM_SIZE) + return -EINVAL; + + filp->f_pos = offset; + + return filp->f_pos; +} + + +/* NOTE: assumes unsigned long is 8 bytes */ +static ssize_t ui_read(struct file *filp, char __user *buf, size_t count, + loff_t *f_pos) +{ + struct hfi1_devdata *dd = filp->private_data; + void __iomem *base = dd->kregbase; + unsigned long total, csr_off, + barlen = (dd->kregend - dd->kregbase); + u64 data; + + /* only read 8 byte quantities */ + if ((count % 8) != 0) + return -EINVAL; + /* offset must be 8-byte aligned */ + if ((*f_pos % 8) != 0) + return -EINVAL; + /* destination buffer must be 8-byte aligned */ + if ((unsigned long)buf % 8 != 0) + return -EINVAL; + /* must be in range */ + if (*f_pos + count > (barlen + DC8051_DATA_MEM_SIZE)) + return -EINVAL; + /* only set the base if we are not starting past the BAR */ + if (*f_pos < barlen) + base += *f_pos; + csr_off = *f_pos; + for (total = 0; total < count; total += 8, csr_off += 8) { + /* accessing LCB CSRs requires more checks */ + if (is_lcb_offset(csr_off)) { + if (read_lcb_csr(dd, csr_off, (u64 *)&data)) + break; /* failed */ + } + /* + * Cannot read ASIC GPIO/QSFP* clear and force CSRs without a + * false parity error. Avoid the whole issue by not reading + * them. These registers are defined as having a read value + * of 0. + */ + else if (csr_off == ASIC_GPIO_CLEAR + || csr_off == ASIC_GPIO_FORCE + || csr_off == ASIC_QSFP1_CLEAR + || csr_off == ASIC_QSFP1_FORCE + || csr_off == ASIC_QSFP2_CLEAR + || csr_off == ASIC_QSFP2_FORCE) + data = 0; + else if (csr_off >= barlen) { + /* + * read_8051_data can read more than just 8 bytes at + * a time. However, folding this into the loop and + * handling the reads in 8 byte increments allows us + * to smoothly transition from chip memory to 8051 + * memory. + */ + if (read_8051_data(dd, + (u32)(csr_off - barlen), + sizeof(data), &data)) + break; /* failed */ + } else + data = readq(base + total); + if (put_user(data, (unsigned long __user *)(buf + total))) + break; + } + *f_pos += total; + return total; +} + +/* NOTE: assumes unsigned long is 8 bytes */ +static ssize_t ui_write(struct file *filp, const char __user *buf, + size_t count, loff_t *f_pos) +{ + struct hfi1_devdata *dd = filp->private_data; + void __iomem *base; + unsigned long total, data, csr_off; + int in_lcb; + + /* only write 8 byte quantities */ + if ((count % 8) != 0) + return -EINVAL; + /* offset must be 8-byte aligned */ + if ((*f_pos % 8) != 0) + return -EINVAL; + /* source buffer must be 8-byte aligned */ + if ((unsigned long)buf % 8 != 0) + return -EINVAL; + /* must be in range */ + if (*f_pos + count > dd->kregend - dd->kregbase) + return -EINVAL; + + base = (void __iomem *)dd->kregbase + *f_pos; + csr_off = *f_pos; + in_lcb = 0; + for (total = 0; total < count; total += 8, csr_off += 8) { + if (get_user(data, (unsigned long __user *)(buf + total))) + break; + /* accessing LCB CSRs requires a special procedure */ + if (is_lcb_offset(csr_off)) { + if (!in_lcb) { + int ret = acquire_lcb_access(dd, 1); + + if (ret) + break; + in_lcb = 1; + } + } else { + if (in_lcb) { + release_lcb_access(dd, 1); + in_lcb = 0; + } + } + writeq(data, base + total); + } + if (in_lcb) + release_lcb_access(dd, 1); + *f_pos += total; + return total; +} + +static const struct file_operations ui_file_ops = { + .owner = THIS_MODULE, + .llseek = ui_lseek, + .read = ui_read, + .write = ui_write, + .open = ui_open, + .release = ui_release, +}; +#define UI_OFFSET 192 /* device minor offset for UI devices */ +static int create_ui = 1; + +static struct cdev wildcard_cdev; +static struct device *wildcard_device; + +static atomic_t user_count = ATOMIC_INIT(0); + +static void user_remove(struct hfi1_devdata *dd) +{ + if (atomic_dec_return(&user_count) == 0) + hfi1_cdev_cleanup(&wildcard_cdev, &wildcard_device); + + hfi1_cdev_cleanup(&dd->user_cdev, &dd->user_device); + hfi1_cdev_cleanup(&dd->ui_cdev, &dd->ui_device); +} + +static int user_add(struct hfi1_devdata *dd) +{ + char name[10]; + int ret; + + if (atomic_inc_return(&user_count) == 1) { + ret = hfi1_cdev_init(0, class_name(), &hfi1_file_ops, + &wildcard_cdev, &wildcard_device, + true); + if (ret) + goto done; + } + + snprintf(name, sizeof(name), "%s_%d", class_name(), dd->unit); + ret = hfi1_cdev_init(dd->unit + 1, name, &hfi1_file_ops, + &dd->user_cdev, &dd->user_device, + true); + if (ret) + goto done; + + if (create_ui) { + snprintf(name, sizeof(name), + "%s_ui%d", class_name(), dd->unit); + ret = hfi1_cdev_init(dd->unit + UI_OFFSET, name, &ui_file_ops, + &dd->ui_cdev, &dd->ui_device, + false); + if (ret) + goto done; + } + + return 0; +done: + user_remove(dd); + return ret; +} + +/* + * Create per-unit files in /dev + */ +int hfi1_device_create(struct hfi1_devdata *dd) +{ + int r, ret; + + r = user_add(dd); + ret = hfi1_diag_add(dd); + if (r && !ret) + ret = r; + return ret; +} + +/* + * Remove per-unit files in /dev + * void, core kernel returns no errors for this stuff + */ +void hfi1_device_remove(struct hfi1_devdata *dd) +{ + user_remove(dd); + hfi1_diag_remove(dd); +} diff --git a/drivers/staging/rdma/hfi1/firmware.c b/drivers/staging/rdma/hfi1/firmware.c new file mode 100644 index 000000000..9a02432d3 --- /dev/null +++ b/drivers/staging/rdma/hfi1/firmware.c @@ -0,0 +1,1620 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include + +#include "hfi.h" +#include "trace.h" + +/* + * Make it easy to toggle firmware file name and if it gets loaded by + * editing the following. This may be something we do while in development + * but not necessarily something a user would ever need to use. + */ +#define DEFAULT_FW_8051_NAME_FPGA "/*(DEBLOBBED)*/" +#define DEFAULT_FW_8051_NAME_ASIC "/*(DEBLOBBED)*/" +#define DEFAULT_FW_FABRIC_NAME "/*(DEBLOBBED)*/" +#define DEFAULT_FW_SBUS_NAME "/*(DEBLOBBED)*/" +#define DEFAULT_FW_PCIE_NAME "/*(DEBLOBBED)*/" +#define DEFAULT_PLATFORM_CONFIG_NAME "/*(DEBLOBBED)*/" + +static uint fw_8051_load = 1; +static uint fw_fabric_serdes_load = 1; +static uint fw_pcie_serdes_load = 1; +static uint fw_sbus_load = 1; +static uint platform_config_load = 1; + +/* Firmware file names get set in hfi1_firmware_init() based on the above */ +static char *fw_8051_name; +static char *fw_fabric_serdes_name; +static char *fw_sbus_name; +static char *fw_pcie_serdes_name; +static char *platform_config_name; + +#define SBUS_MAX_POLL_COUNT 100 +#define SBUS_COUNTER(reg, name) \ + (((reg) >> ASIC_STS_SBUS_COUNTERS_##name##_CNT_SHIFT) & \ + ASIC_STS_SBUS_COUNTERS_##name##_CNT_MASK) + +/* + * Firmware security header. + */ +struct css_header { + u32 module_type; + u32 header_len; + u32 header_version; + u32 module_id; + u32 module_vendor; + u32 date; /* BCD yyyymmdd */ + u32 size; /* in DWORDs */ + u32 key_size; /* in DWORDs */ + u32 modulus_size; /* in DWORDs */ + u32 exponent_size; /* in DWORDs */ + u32 reserved[22]; +}; +/* expected field values */ +#define CSS_MODULE_TYPE 0x00000006 +#define CSS_HEADER_LEN 0x000000a1 +#define CSS_HEADER_VERSION 0x00010000 +#define CSS_MODULE_VENDOR 0x00008086 + +#define KEY_SIZE 256 +#define MU_SIZE 8 +#define EXPONENT_SIZE 4 + +/* the file itself */ +struct firmware_file { + struct css_header css_header; + u8 modulus[KEY_SIZE]; + u8 exponent[EXPONENT_SIZE]; + u8 signature[KEY_SIZE]; + u8 firmware[]; +}; + +struct augmented_firmware_file { + struct css_header css_header; + u8 modulus[KEY_SIZE]; + u8 exponent[EXPONENT_SIZE]; + u8 signature[KEY_SIZE]; + u8 r2[KEY_SIZE]; + u8 mu[MU_SIZE]; + u8 firmware[]; +}; + +/* augmented file size difference */ +#define AUGMENT_SIZE (sizeof(struct augmented_firmware_file) - \ + sizeof(struct firmware_file)) + +struct firmware_details { + /* Linux core piece */ + const struct firmware *fw; + + struct css_header *css_header; + u8 *firmware_ptr; /* pointer to binary data */ + u32 firmware_len; /* length in bytes */ + u8 *modulus; /* pointer to the modulus */ + u8 *exponent; /* pointer to the exponent */ + u8 *signature; /* pointer to the signature */ + u8 *r2; /* pointer to r2 */ + u8 *mu; /* pointer to mu */ + struct augmented_firmware_file dummy_header; +}; + +/* + * The mutex protects fw_state, fw_err, and all of the firmware_details + * variables. + */ +static DEFINE_MUTEX(fw_mutex); +enum fw_state { + FW_EMPTY, + FW_ACQUIRED, + FW_ERR +}; +static enum fw_state fw_state = FW_EMPTY; +static int fw_err; +static struct firmware_details fw_8051; +static struct firmware_details fw_fabric; +static struct firmware_details fw_pcie; +static struct firmware_details fw_sbus; +static const struct firmware *platform_config; + +/* flags for turn_off_spicos() */ +#define SPICO_SBUS 0x1 +#define SPICO_FABRIC 0x2 +#define ENABLE_SPICO_SMASK 0x1 + +/* security block commands */ +#define RSA_CMD_INIT 0x1 +#define RSA_CMD_START 0x2 + +/* security block status */ +#define RSA_STATUS_IDLE 0x0 +#define RSA_STATUS_ACTIVE 0x1 +#define RSA_STATUS_DONE 0x2 +#define RSA_STATUS_FAILED 0x3 + +/* RSA engine timeout, in ms */ +#define RSA_ENGINE_TIMEOUT 100 /* ms */ + +/* hardware mutex timeout, in ms */ +#define HM_TIMEOUT 4000 /* 4 s */ + +/* 8051 memory access timeout, in us */ +#define DC8051_ACCESS_TIMEOUT 100 /* us */ + +/* the number of fabric SerDes on the SBus */ +#define NUM_FABRIC_SERDES 4 + +/* SBus fabric SerDes addresses, one set per HFI */ +static const u8 fabric_serdes_addrs[2][NUM_FABRIC_SERDES] = { + { 0x01, 0x02, 0x03, 0x04 }, + { 0x28, 0x29, 0x2a, 0x2b } +}; + +/* SBus PCIe SerDes addresses, one set per HFI */ +static const u8 pcie_serdes_addrs[2][NUM_PCIE_SERDES] = { + { 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, + 0x18, 0x1a, 0x1c, 0x1e, 0x20, 0x22, 0x24, 0x26 }, + { 0x2f, 0x31, 0x33, 0x35, 0x37, 0x39, 0x3b, 0x3d, + 0x3f, 0x41, 0x43, 0x45, 0x47, 0x49, 0x4b, 0x4d } +}; + +/* SBus PCIe PCS addresses, one set per HFI */ +const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES] = { + { 0x09, 0x0b, 0x0d, 0x0f, 0x11, 0x13, 0x15, 0x17, + 0x19, 0x1b, 0x1d, 0x1f, 0x21, 0x23, 0x25, 0x27 }, + { 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, + 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e } +}; + +/* SBus fabric SerDes broadcast addresses, one per HFI */ +static const u8 fabric_serdes_broadcast[2] = { 0xe4, 0xe5 }; +static const u8 all_fabric_serdes_broadcast = 0xe1; + +/* SBus PCIe SerDes broadcast addresses, one per HFI */ +const u8 pcie_serdes_broadcast[2] = { 0xe2, 0xe3 }; +static const u8 all_pcie_serdes_broadcast = 0xe0; + +/* forwards */ +static void dispose_one_firmware(struct firmware_details *fdet); + +/* + * Read a single 64-bit value from 8051 data memory. + * + * Expects: + * o caller to have already set up data read, no auto increment + * o caller to turn off read enable when finished + * + * The address argument is a byte offset. Bits 0:2 in the address are + * ignored - i.e. the hardware will always do aligned 8-byte reads as if + * the lower bits are zero. + * + * Return 0 on success, -ENXIO on a read error (timeout). + */ +static int __read_8051_data(struct hfi1_devdata *dd, u32 addr, u64 *result) +{ + u64 reg; + int count; + + /* start the read at the given address */ + reg = ((addr & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK) + << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT) + | DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK; + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg); + + /* wait until ACCESS_COMPLETED is set */ + count = 0; + while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS) + & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK) + == 0) { + count++; + if (count > DC8051_ACCESS_TIMEOUT) { + dd_dev_err(dd, "timeout reading 8051 data\n"); + return -ENXIO; + } + ndelay(10); + } + + /* gather the data */ + *result = read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_RD_DATA); + + return 0; +} + +/* + * Read 8051 data starting at addr, for len bytes. Will read in 8-byte chunks. + * Return 0 on success, -errno on error. + */ +int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result) +{ + unsigned long flags; + u32 done; + int ret = 0; + + spin_lock_irqsave(&dd->dc8051_memlock, flags); + + /* data read set-up, no auto-increment */ + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0); + + for (done = 0; done < len; addr += 8, done += 8, result++) { + ret = __read_8051_data(dd, addr, result); + if (ret) + break; + } + + /* turn off read enable */ + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0); + + spin_unlock_irqrestore(&dd->dc8051_memlock, flags); + + return ret; +} + +/* + * Write data or code to the 8051 code or data RAM. + */ +static int write_8051(struct hfi1_devdata *dd, int code, u32 start, + const u8 *data, u32 len) +{ + u64 reg; + u32 offset; + int aligned, count; + + /* check alignment */ + aligned = ((unsigned long)data & 0x7) == 0; + + /* write set-up */ + reg = (code ? DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK : 0ull) + | DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK; + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, reg); + + reg = ((start & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK) + << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT) + | DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK; + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg); + + /* write */ + for (offset = 0; offset < len; offset += 8) { + int bytes = len - offset; + + if (bytes < 8) { + reg = 0; + memcpy(®, &data[offset], bytes); + } else if (aligned) { + reg = *(u64 *)&data[offset]; + } else { + memcpy(®, &data[offset], 8); + } + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_WR_DATA, reg); + + /* wait until ACCESS_COMPLETED is set */ + count = 0; + while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS) + & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK) + == 0) { + count++; + if (count > DC8051_ACCESS_TIMEOUT) { + dd_dev_err(dd, "timeout writing 8051 data\n"); + return -ENXIO; + } + udelay(1); + } + } + + /* turn off write access, auto increment (also sets to data access) */ + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0); + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0); + + return 0; +} + +/* return 0 if values match, non-zero and complain otherwise */ +static int invalid_header(struct hfi1_devdata *dd, const char *what, + u32 actual, u32 expected) +{ + if (actual == expected) + return 0; + + dd_dev_err(dd, + "invalid firmware header field %s: expected 0x%x, actual 0x%x\n", + what, expected, actual); + return 1; +} + +/* + * Verify that the static fields in the CSS header match. + */ +static int verify_css_header(struct hfi1_devdata *dd, struct css_header *css) +{ + /* verify CSS header fields (most sizes are in DW, so add /4) */ + if (invalid_header(dd, "module_type", css->module_type, CSS_MODULE_TYPE) + || invalid_header(dd, "header_len", css->header_len, + (sizeof(struct firmware_file)/4)) + || invalid_header(dd, "header_version", + css->header_version, CSS_HEADER_VERSION) + || invalid_header(dd, "module_vendor", + css->module_vendor, CSS_MODULE_VENDOR) + || invalid_header(dd, "key_size", + css->key_size, KEY_SIZE/4) + || invalid_header(dd, "modulus_size", + css->modulus_size, KEY_SIZE/4) + || invalid_header(dd, "exponent_size", + css->exponent_size, EXPONENT_SIZE/4)) { + return -EINVAL; + } + return 0; +} + +/* + * Make sure there are at least some bytes after the prefix. + */ +static int payload_check(struct hfi1_devdata *dd, const char *name, + long file_size, long prefix_size) +{ + /* make sure we have some payload */ + if (prefix_size >= file_size) { + dd_dev_err(dd, + "firmware \"%s\", size %ld, must be larger than %ld bytes\n", + name, file_size, prefix_size); + return -EINVAL; + } + + return 0; +} + +/* + * Request the firmware from the system. Extract the pieces and fill in + * fdet. If successful, the caller will need to call dispose_one_firmware(). + * Returns 0 on success, -ERRNO on error. + */ +static int obtain_one_firmware(struct hfi1_devdata *dd, const char *name, + struct firmware_details *fdet) +{ + struct css_header *css; + int ret; + + memset(fdet, 0, sizeof(*fdet)); + + ret = reject_firmware(&fdet->fw, name, &dd->pcidev->dev); + if (ret) { + dd_dev_err(dd, "cannot load firmware \"%s\", err %d\n", + name, ret); + return ret; + } + + /* verify the firmware */ + if (fdet->fw->size < sizeof(struct css_header)) { + dd_dev_err(dd, "firmware \"%s\" is too small\n", name); + ret = -EINVAL; + goto done; + } + css = (struct css_header *)fdet->fw->data; + + hfi1_cdbg(FIRMWARE, "Firmware %s details:", name); + hfi1_cdbg(FIRMWARE, "file size: 0x%lx bytes", fdet->fw->size); + hfi1_cdbg(FIRMWARE, "CSS structure:"); + hfi1_cdbg(FIRMWARE, " module_type 0x%x", css->module_type); + hfi1_cdbg(FIRMWARE, " header_len 0x%03x (0x%03x bytes)", + css->header_len, 4 * css->header_len); + hfi1_cdbg(FIRMWARE, " header_version 0x%x", css->header_version); + hfi1_cdbg(FIRMWARE, " module_id 0x%x", css->module_id); + hfi1_cdbg(FIRMWARE, " module_vendor 0x%x", css->module_vendor); + hfi1_cdbg(FIRMWARE, " date 0x%x", css->date); + hfi1_cdbg(FIRMWARE, " size 0x%03x (0x%03x bytes)", + css->size, 4 * css->size); + hfi1_cdbg(FIRMWARE, " key_size 0x%03x (0x%03x bytes)", + css->key_size, 4 * css->key_size); + hfi1_cdbg(FIRMWARE, " modulus_size 0x%03x (0x%03x bytes)", + css->modulus_size, 4 * css->modulus_size); + hfi1_cdbg(FIRMWARE, " exponent_size 0x%03x (0x%03x bytes)", + css->exponent_size, 4 * css->exponent_size); + hfi1_cdbg(FIRMWARE, "firmware size: 0x%lx bytes", + fdet->fw->size - sizeof(struct firmware_file)); + + /* + * If the file does not have a valid CSS header, fail. + * Otherwise, check the CSS size field for an expected size. + * The augmented file has r2 and mu inserted after the header + * was generated, so there will be a known difference between + * the CSS header size and the actual file size. Use this + * difference to identify an augmented file. + * + * Note: css->size is in DWORDs, multiply by 4 to get bytes. + */ + ret = verify_css_header(dd, css); + if (ret) { + dd_dev_info(dd, "Invalid CSS header for \"%s\"\n", name); + } else if ((css->size*4) == fdet->fw->size) { + /* non-augmented firmware file */ + struct firmware_file *ff = (struct firmware_file *) + fdet->fw->data; + + /* make sure there are bytes in the payload */ + ret = payload_check(dd, name, fdet->fw->size, + sizeof(struct firmware_file)); + if (ret == 0) { + fdet->css_header = css; + fdet->modulus = ff->modulus; + fdet->exponent = ff->exponent; + fdet->signature = ff->signature; + fdet->r2 = fdet->dummy_header.r2; /* use dummy space */ + fdet->mu = fdet->dummy_header.mu; /* use dummy space */ + fdet->firmware_ptr = ff->firmware; + fdet->firmware_len = fdet->fw->size - + sizeof(struct firmware_file); + /* + * Header does not include r2 and mu - generate here. + * For now, fail. + */ + dd_dev_err(dd, "driver is unable to validate firmware without r2 and mu (not in firmware file)\n"); + ret = -EINVAL; + } + } else if ((css->size*4) + AUGMENT_SIZE == fdet->fw->size) { + /* augmented firmware file */ + struct augmented_firmware_file *aff = + (struct augmented_firmware_file *)fdet->fw->data; + + /* make sure there are bytes in the payload */ + ret = payload_check(dd, name, fdet->fw->size, + sizeof(struct augmented_firmware_file)); + if (ret == 0) { + fdet->css_header = css; + fdet->modulus = aff->modulus; + fdet->exponent = aff->exponent; + fdet->signature = aff->signature; + fdet->r2 = aff->r2; + fdet->mu = aff->mu; + fdet->firmware_ptr = aff->firmware; + fdet->firmware_len = fdet->fw->size - + sizeof(struct augmented_firmware_file); + } + } else { + /* css->size check failed */ + dd_dev_err(dd, + "invalid firmware header field size: expected 0x%lx or 0x%lx, actual 0x%x\n", + fdet->fw->size/4, (fdet->fw->size - AUGMENT_SIZE)/4, + css->size); + + ret = -EINVAL; + } + +done: + /* if returning an error, clean up after ourselves */ + if (ret) + dispose_one_firmware(fdet); + return ret; +} + +static void dispose_one_firmware(struct firmware_details *fdet) +{ + release_firmware(fdet->fw); + fdet->fw = NULL; +} + +/* + * Called by all HFIs when loading their firmware - i.e. device probe time. + * The first one will do the actual firmware load. Use a mutex to resolve + * any possible race condition. + * + * The call to this routine cannot be moved to driver load because the kernel + * call reject_firmware() requires a device which is only available after + * the first device probe. + */ +static int obtain_firmware(struct hfi1_devdata *dd) +{ + int err = 0; + + mutex_lock(&fw_mutex); + if (fw_state == FW_ACQUIRED) { + goto done; /* already acquired */ + } else if (fw_state == FW_ERR) { + err = fw_err; + goto done; /* already tried and failed */ + } + + if (fw_8051_load) { + err = obtain_one_firmware(dd, fw_8051_name, &fw_8051); + if (err) + goto done; + } + + if (fw_fabric_serdes_load) { + err = obtain_one_firmware(dd, fw_fabric_serdes_name, + &fw_fabric); + if (err) + goto done; + } + + if (fw_sbus_load) { + err = obtain_one_firmware(dd, fw_sbus_name, &fw_sbus); + if (err) + goto done; + } + + if (fw_pcie_serdes_load) { + err = obtain_one_firmware(dd, fw_pcie_serdes_name, &fw_pcie); + if (err) + goto done; + } + + if (platform_config_load) { + platform_config = NULL; + err = reject_firmware(&platform_config, platform_config_name, + &dd->pcidev->dev); + if (err) { + err = 0; + platform_config = NULL; + } + } + + /* success */ + fw_state = FW_ACQUIRED; + +done: + if (err) { + fw_err = err; + fw_state = FW_ERR; + } + mutex_unlock(&fw_mutex); + + return err; +} + +/* + * Called when the driver unloads. The timing is asymmetric with its + * counterpart, obtain_firmware(). If called at device remove time, + * then it is conceivable that another device could probe while the + * firmware is being disposed. The mutexes can be moved to do that + * safely, but then the firmware would be requested from the OS multiple + * times. + * + * No mutex is needed as the driver is unloading and there cannot be any + * other callers. + */ +void dispose_firmware(void) +{ + dispose_one_firmware(&fw_8051); + dispose_one_firmware(&fw_fabric); + dispose_one_firmware(&fw_pcie); + dispose_one_firmware(&fw_sbus); + + release_firmware(platform_config); + platform_config = NULL; + + /* retain the error state, otherwise revert to empty */ + if (fw_state != FW_ERR) + fw_state = FW_EMPTY; +} + +/* + * Write a block of data to a given array CSR. All calls will be in + * multiples of 8 bytes. + */ +static void write_rsa_data(struct hfi1_devdata *dd, int what, + const u8 *data, int nbytes) +{ + int qw_size = nbytes/8; + int i; + + if (((unsigned long)data & 0x7) == 0) { + /* aligned */ + u64 *ptr = (u64 *)data; + + for (i = 0; i < qw_size; i++, ptr++) + write_csr(dd, what + (8*i), *ptr); + } else { + /* not aligned */ + for (i = 0; i < qw_size; i++, data += 8) { + u64 value; + + memcpy(&value, data, 8); + write_csr(dd, what + (8*i), value); + } + } +} + +/* + * Write a block of data to a given CSR as a stream of writes. All calls will + * be in multiples of 8 bytes. + */ +static void write_streamed_rsa_data(struct hfi1_devdata *dd, int what, + const u8 *data, int nbytes) +{ + u64 *ptr = (u64 *)data; + int qw_size = nbytes/8; + + for (; qw_size > 0; qw_size--, ptr++) + write_csr(dd, what, *ptr); +} + +/* + * Download the signature and start the RSA mechanism. Wait for + * RSA_ENGINE_TIMEOUT before giving up. + */ +static int run_rsa(struct hfi1_devdata *dd, const char *who, + const u8 *signature) +{ + unsigned long timeout; + u64 reg; + u32 status; + int ret = 0; + + /* write the signature */ + write_rsa_data(dd, MISC_CFG_RSA_SIGNATURE, signature, KEY_SIZE); + + /* initialize RSA */ + write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_INIT); + + /* + * Make sure the engine is idle and insert a delay between the two + * writes to MISC_CFG_RSA_CMD. + */ + status = (read_csr(dd, MISC_CFG_FW_CTRL) + & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK) + >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT; + if (status != RSA_STATUS_IDLE) { + dd_dev_err(dd, "%s security engine not idle - giving up\n", + who); + return -EBUSY; + } + + /* start RSA */ + write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_START); + + /* + * Look for the result. + * + * The RSA engine is hooked up to two MISC errors. The driver + * masks these errors as they do not respond to the standard + * error "clear down" mechanism. Look for these errors here and + * clear them when possible. This routine will exit with the + * errors of the current run still set. + * + * MISC_FW_AUTH_FAILED_ERR + * Firmware authorization failed. This can be cleared by + * re-initializing the RSA engine, then clearing the status bit. + * Do not re-init the RSA angine immediately after a successful + * run - this will reset the current authorization. + * + * MISC_KEY_MISMATCH_ERR + * Key does not match. The only way to clear this is to load + * a matching key then clear the status bit. If this error + * is raised, it will persist outside of this routine until a + * matching key is loaded. + */ + timeout = msecs_to_jiffies(RSA_ENGINE_TIMEOUT) + jiffies; + while (1) { + status = (read_csr(dd, MISC_CFG_FW_CTRL) + & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK) + >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT; + + if (status == RSA_STATUS_IDLE) { + /* should not happen */ + dd_dev_err(dd, "%s firmware security bad idle state\n", + who); + ret = -EINVAL; + break; + } else if (status == RSA_STATUS_DONE) { + /* finished successfully */ + break; + } else if (status == RSA_STATUS_FAILED) { + /* finished unsuccessfully */ + ret = -EINVAL; + break; + } + /* else still active */ + + if (time_after(jiffies, timeout)) { + /* + * Timed out while active. We can't reset the engine + * if it is stuck active, but run through the + * error code to see what error bits are set. + */ + dd_dev_err(dd, "%s firmware security time out\n", who); + ret = -ETIMEDOUT; + break; + } + + msleep(20); + } + + /* + * Arrive here on success or failure. Clear all RSA engine + * errors. All current errors will stick - the RSA logic is keeping + * error high. All previous errors will clear - the RSA logic + * is not keeping the error high. + */ + write_csr(dd, MISC_ERR_CLEAR, + MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK + | MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK); + /* + * All that is left are the current errors. Print failure details, + * if any. + */ + reg = read_csr(dd, MISC_ERR_STATUS); + if (ret) { + if (reg & MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK) + dd_dev_err(dd, "%s firmware authorization failed\n", + who); + if (reg & MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK) + dd_dev_err(dd, "%s firmware key mismatch\n", who); + } + + return ret; +} + +static void load_security_variables(struct hfi1_devdata *dd, + struct firmware_details *fdet) +{ + /* Security variables a. Write the modulus */ + write_rsa_data(dd, MISC_CFG_RSA_MODULUS, fdet->modulus, KEY_SIZE); + /* Security variables b. Write the r2 */ + write_rsa_data(dd, MISC_CFG_RSA_R2, fdet->r2, KEY_SIZE); + /* Security variables c. Write the mu */ + write_rsa_data(dd, MISC_CFG_RSA_MU, fdet->mu, MU_SIZE); + /* Security variables d. Write the header */ + write_streamed_rsa_data(dd, MISC_CFG_SHA_PRELOAD, + (u8 *)fdet->css_header, sizeof(struct css_header)); +} + +/* return the 8051 firmware state */ +static inline u32 get_firmware_state(struct hfi1_devdata *dd) +{ + u64 reg = read_csr(dd, DC_DC8051_STS_CUR_STATE); + + return (reg >> DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT) + & DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK; +} + +/* + * Wait until the firmware is up and ready to take host requests. + * Return 0 on success, -ETIMEDOUT on timeout. + */ +int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout) +{ + unsigned long timeout; + + /* in the simulator, the fake 8051 is always ready */ + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) + return 0; + + timeout = msecs_to_jiffies(mstimeout) + jiffies; + while (1) { + if (get_firmware_state(dd) == 0xa0) /* ready */ + return 0; + if (time_after(jiffies, timeout)) /* timed out */ + return -ETIMEDOUT; + usleep_range(1950, 2050); /* sleep 2ms-ish */ + } +} + +/* + * Load the 8051 firmware. + */ +static int load_8051_firmware(struct hfi1_devdata *dd, + struct firmware_details *fdet) +{ + u64 reg; + int ret; + u8 ver_a, ver_b; + + /* + * DC Reset sequence + * Load DC 8051 firmware + */ + /* + * DC reset step 1: Reset DC8051 + */ + reg = DC_DC8051_CFG_RST_M8051W_SMASK + | DC_DC8051_CFG_RST_CRAM_SMASK + | DC_DC8051_CFG_RST_DRAM_SMASK + | DC_DC8051_CFG_RST_IRAM_SMASK + | DC_DC8051_CFG_RST_SFR_SMASK; + write_csr(dd, DC_DC8051_CFG_RST, reg); + + /* + * DC reset step 2 (optional): Load 8051 data memory with link + * configuration + */ + + /* + * DC reset step 3: Load DC8051 firmware + */ + /* release all but the core reset */ + reg = DC_DC8051_CFG_RST_M8051W_SMASK; + write_csr(dd, DC_DC8051_CFG_RST, reg); + + /* Firmware load step 1 */ + load_security_variables(dd, fdet); + + /* + * Firmware load step 2. Clear MISC_CFG_FW_CTRL.FW_8051_LOADED + */ + write_csr(dd, MISC_CFG_FW_CTRL, 0); + + /* Firmware load steps 3-5 */ + ret = write_8051(dd, 1/*code*/, 0, fdet->firmware_ptr, + fdet->firmware_len); + if (ret) + return ret; + + /* + * DC reset step 4. Host starts the DC8051 firmware + */ + /* + * Firmware load step 6. Set MISC_CFG_FW_CTRL.FW_8051_LOADED + */ + write_csr(dd, MISC_CFG_FW_CTRL, MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK); + + /* Firmware load steps 7-10 */ + ret = run_rsa(dd, "8051", fdet->signature); + if (ret) + return ret; + + /* clear all reset bits, releasing the 8051 */ + write_csr(dd, DC_DC8051_CFG_RST, 0ull); + + /* + * DC reset step 5. Wait for firmware to be ready to accept host + * requests. + */ + ret = wait_fm_ready(dd, TIMEOUT_8051_START); + if (ret) { /* timed out */ + dd_dev_err(dd, "8051 start timeout, current state 0x%x\n", + get_firmware_state(dd)); + return -ETIMEDOUT; + } + + read_misc_status(dd, &ver_a, &ver_b); + dd_dev_info(dd, "8051 firmware version %d.%d\n", + (int)ver_b, (int)ver_a); + dd->dc8051_ver = dc8051_ver(ver_b, ver_a); + + return 0; +} + +/* SBus Master broadcast address */ +#define SBUS_MASTER_BROADCAST 0xfd + +/* + * Write the SBus request register + * + * No need for masking - the arguments are sized exactly. + */ +void sbus_request(struct hfi1_devdata *dd, + u8 receiver_addr, u8 data_addr, u8 command, u32 data_in) +{ + write_csr(dd, ASIC_CFG_SBUS_REQUEST, + ((u64)data_in << ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT) + | ((u64)command << ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT) + | ((u64)data_addr << ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT) + | ((u64)receiver_addr + << ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT)); +} + +/* + * Turn off the SBus and fabric serdes spicos. + * + * + Must be called with Sbus fast mode turned on. + * + Must be called after fabric serdes broadcast is set up. + * + Must be called before the 8051 is loaded - assumes 8051 is not loaded + * when using MISC_CFG_FW_CTRL. + */ +static void turn_off_spicos(struct hfi1_devdata *dd, int flags) +{ + /* only needed on A0 */ + if (!is_a0(dd)) + return; + + dd_dev_info(dd, "Turning off spicos:%s%s\n", + flags & SPICO_SBUS ? " SBus" : "", + flags & SPICO_FABRIC ? " fabric" : ""); + + write_csr(dd, MISC_CFG_FW_CTRL, ENABLE_SPICO_SMASK); + /* disable SBus spico */ + if (flags & SPICO_SBUS) + sbus_request(dd, SBUS_MASTER_BROADCAST, 0x01, + WRITE_SBUS_RECEIVER, 0x00000040); + + /* disable the fabric serdes spicos */ + if (flags & SPICO_FABRIC) + sbus_request(dd, fabric_serdes_broadcast[dd->hfi1_id], + 0x07, WRITE_SBUS_RECEIVER, 0x00000000); + write_csr(dd, MISC_CFG_FW_CTRL, 0); +} + +/* + * Reset all of the fabric serdes for our HFI. + */ +void fabric_serdes_reset(struct hfi1_devdata *dd) +{ + u8 ra; + + if (dd->icode != ICODE_RTL_SILICON) /* only for RTL */ + return; + + ra = fabric_serdes_broadcast[dd->hfi1_id]; + + acquire_hw_mutex(dd); + set_sbus_fast_mode(dd); + /* place SerDes in reset and disable SPICO */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011); + /* wait 100 refclk cycles @ 156.25MHz => 640ns */ + udelay(1); + /* remove SerDes reset */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010); + /* turn SPICO enable on */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002); + clear_sbus_fast_mode(dd); + release_hw_mutex(dd); +} + +/* Access to the SBus in this routine should probably be serialized */ +int sbus_request_slow(struct hfi1_devdata *dd, + u8 receiver_addr, u8 data_addr, u8 command, u32 data_in) +{ + u64 reg, count = 0; + + sbus_request(dd, receiver_addr, data_addr, command, data_in); + write_csr(dd, ASIC_CFG_SBUS_EXECUTE, + ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK); + /* Wait for both DONE and RCV_DATA_VALID to go high */ + reg = read_csr(dd, ASIC_STS_SBUS_RESULT); + while (!((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) && + (reg & ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK))) { + if (count++ >= SBUS_MAX_POLL_COUNT) { + u64 counts = read_csr(dd, ASIC_STS_SBUS_COUNTERS); + /* + * If the loop has timed out, we are OK if DONE bit + * is set and RCV_DATA_VALID and EXECUTE counters + * are the same. If not, we cannot proceed. + */ + if ((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) && + (SBUS_COUNTER(counts, RCV_DATA_VALID) == + SBUS_COUNTER(counts, EXECUTE))) + break; + return -ETIMEDOUT; + } + udelay(1); + reg = read_csr(dd, ASIC_STS_SBUS_RESULT); + } + count = 0; + write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0); + /* Wait for DONE to clear after EXECUTE is cleared */ + reg = read_csr(dd, ASIC_STS_SBUS_RESULT); + while (reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) { + if (count++ >= SBUS_MAX_POLL_COUNT) + return -ETIME; + udelay(1); + reg = read_csr(dd, ASIC_STS_SBUS_RESULT); + } + return 0; +} + +static int load_fabric_serdes_firmware(struct hfi1_devdata *dd, + struct firmware_details *fdet) +{ + int i, err; + const u8 ra = fabric_serdes_broadcast[dd->hfi1_id]; /* receiver addr */ + + dd_dev_info(dd, "Downloading fabric firmware\n"); + + /* step 1: load security variables */ + load_security_variables(dd, fdet); + /* step 2: place SerDes in reset and disable SPICO */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011); + /* wait 100 refclk cycles @ 156.25MHz => 640ns */ + udelay(1); + /* step 3: remove SerDes reset */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010); + /* step 4: assert IMEM override */ + sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x40000000); + /* step 5: download SerDes machine code */ + for (i = 0; i < fdet->firmware_len; i += 4) { + sbus_request(dd, ra, 0x0a, WRITE_SBUS_RECEIVER, + *(u32 *)&fdet->firmware_ptr[i]); + } + /* step 6: IMEM override off */ + sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x00000000); + /* step 7: turn ECC on */ + sbus_request(dd, ra, 0x0b, WRITE_SBUS_RECEIVER, 0x000c0000); + + /* steps 8-11: run the RSA engine */ + err = run_rsa(dd, "fabric serdes", fdet->signature); + if (err) + return err; + + /* step 12: turn SPICO enable on */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002); + /* step 13: enable core hardware interrupts */ + sbus_request(dd, ra, 0x08, WRITE_SBUS_RECEIVER, 0x00000000); + + return 0; +} + +static int load_sbus_firmware(struct hfi1_devdata *dd, + struct firmware_details *fdet) +{ + int i, err; + const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */ + + dd_dev_info(dd, "Downloading SBus firmware\n"); + + /* step 1: load security variables */ + load_security_variables(dd, fdet); + /* step 2: place SPICO into reset and enable off */ + sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x000000c0); + /* step 3: remove reset, enable off, IMEM_CNTRL_EN on */ + sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000240); + /* step 4: set starting IMEM address for burst download */ + sbus_request(dd, ra, 0x03, WRITE_SBUS_RECEIVER, 0x80000000); + /* step 5: download the SBus Master machine code */ + for (i = 0; i < fdet->firmware_len; i += 4) { + sbus_request(dd, ra, 0x14, WRITE_SBUS_RECEIVER, + *(u32 *)&fdet->firmware_ptr[i]); + } + /* step 6: set IMEM_CNTL_EN off */ + sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000040); + /* step 7: turn ECC on */ + sbus_request(dd, ra, 0x16, WRITE_SBUS_RECEIVER, 0x000c0000); + + /* steps 8-11: run the RSA engine */ + err = run_rsa(dd, "SBus", fdet->signature); + if (err) + return err; + + /* step 12: set SPICO_ENABLE on */ + sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140); + + return 0; +} + +static int load_pcie_serdes_firmware(struct hfi1_devdata *dd, + struct firmware_details *fdet) +{ + int i; + const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */ + + dd_dev_info(dd, "Downloading PCIe firmware\n"); + + /* step 1: load security variables */ + load_security_variables(dd, fdet); + /* step 2: assert single step (halts the SBus Master spico) */ + sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000001); + /* step 3: enable XDMEM access */ + sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000d40); + /* step 4: load firmware into SBus Master XDMEM */ + /* NOTE: the dmem address, write_en, and wdata are all pre-packed, + we only need to pick up the bytes and write them */ + for (i = 0; i < fdet->firmware_len; i += 4) { + sbus_request(dd, ra, 0x04, WRITE_SBUS_RECEIVER, + *(u32 *)&fdet->firmware_ptr[i]); + } + /* step 5: disable XDMEM access */ + sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140); + /* step 6: allow SBus Spico to run */ + sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000000); + + /* steps 7-11: run RSA, if it succeeds, firmware is available to + be swapped */ + return run_rsa(dd, "PCIe serdes", fdet->signature); +} + +/* + * Set the given broadcast values on the given list of devices. + */ +static void set_serdes_broadcast(struct hfi1_devdata *dd, u8 bg1, u8 bg2, + const u8 *addrs, int count) +{ + while (--count >= 0) { + /* + * Set BROADCAST_GROUP_1 and BROADCAST_GROUP_2, leave + * defaults for everything else. Do not read-modify-write, + * per instruction from the manufacturer. + * + * Register 0xfd: + * bits what + * ----- --------------------------------- + * 0 IGNORE_BROADCAST (default 0) + * 11:4 BROADCAST_GROUP_1 (default 0xff) + * 23:16 BROADCAST_GROUP_2 (default 0xff) + */ + sbus_request(dd, addrs[count], 0xfd, WRITE_SBUS_RECEIVER, + (u32)bg1 << 4 | (u32)bg2 << 16); + } +} + +int acquire_hw_mutex(struct hfi1_devdata *dd) +{ + unsigned long timeout; + int try = 0; + u8 mask = 1 << dd->hfi1_id; + u8 user; + +retry: + timeout = msecs_to_jiffies(HM_TIMEOUT) + jiffies; + while (1) { + write_csr(dd, ASIC_CFG_MUTEX, mask); + user = (u8)read_csr(dd, ASIC_CFG_MUTEX); + if (user == mask) + return 0; /* success */ + if (time_after(jiffies, timeout)) + break; /* timed out */ + msleep(20); + } + + /* timed out */ + dd_dev_err(dd, + "Unable to acquire hardware mutex, mutex mask %u, my mask %u (%s)\n", + (u32)user, (u32)mask, (try == 0) ? "retrying" : "giving up"); + + if (try == 0) { + /* break mutex and retry */ + write_csr(dd, ASIC_CFG_MUTEX, 0); + try++; + goto retry; + } + + return -EBUSY; +} + +void release_hw_mutex(struct hfi1_devdata *dd) +{ + write_csr(dd, ASIC_CFG_MUTEX, 0); +} + +void set_sbus_fast_mode(struct hfi1_devdata *dd) +{ + write_csr(dd, ASIC_CFG_SBUS_EXECUTE, + ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK); +} + +void clear_sbus_fast_mode(struct hfi1_devdata *dd) +{ + u64 reg, count = 0; + + reg = read_csr(dd, ASIC_STS_SBUS_COUNTERS); + while (SBUS_COUNTER(reg, EXECUTE) != + SBUS_COUNTER(reg, RCV_DATA_VALID)) { + if (count++ >= SBUS_MAX_POLL_COUNT) + break; + udelay(1); + reg = read_csr(dd, ASIC_STS_SBUS_COUNTERS); + } + write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0); +} + +int load_firmware(struct hfi1_devdata *dd) +{ + int ret; + + if (fw_sbus_load || fw_fabric_serdes_load) { + ret = acquire_hw_mutex(dd); + if (ret) + return ret; + + set_sbus_fast_mode(dd); + + /* + * The SBus contains part of the fabric firmware and so must + * also be downloaded. + */ + if (fw_sbus_load) { + turn_off_spicos(dd, SPICO_SBUS); + ret = load_sbus_firmware(dd, &fw_sbus); + if (ret) + goto clear; + } + + if (fw_fabric_serdes_load) { + set_serdes_broadcast(dd, all_fabric_serdes_broadcast, + fabric_serdes_broadcast[dd->hfi1_id], + fabric_serdes_addrs[dd->hfi1_id], + NUM_FABRIC_SERDES); + turn_off_spicos(dd, SPICO_FABRIC); + ret = load_fabric_serdes_firmware(dd, &fw_fabric); + } + +clear: + clear_sbus_fast_mode(dd); + release_hw_mutex(dd); + if (ret) + return ret; + } + + if (fw_8051_load) { + ret = load_8051_firmware(dd, &fw_8051); + if (ret) + return ret; + } + + return 0; +} + +int hfi1_firmware_init(struct hfi1_devdata *dd) +{ + /* only RTL can use these */ + if (dd->icode != ICODE_RTL_SILICON) { + fw_fabric_serdes_load = 0; + fw_pcie_serdes_load = 0; + fw_sbus_load = 0; + } + + /* no 8051 or QSFP on simulator */ + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { + fw_8051_load = 0; + platform_config_load = 0; + } + + if (!fw_8051_name) { + if (dd->icode == ICODE_RTL_SILICON) + fw_8051_name = DEFAULT_FW_8051_NAME_ASIC; + else + fw_8051_name = DEFAULT_FW_8051_NAME_FPGA; + } + if (!fw_fabric_serdes_name) + fw_fabric_serdes_name = DEFAULT_FW_FABRIC_NAME; + if (!fw_sbus_name) + fw_sbus_name = DEFAULT_FW_SBUS_NAME; + if (!fw_pcie_serdes_name) + fw_pcie_serdes_name = DEFAULT_FW_PCIE_NAME; + if (!platform_config_name) + platform_config_name = DEFAULT_PLATFORM_CONFIG_NAME; + + return obtain_firmware(dd); +} + +int parse_platform_config(struct hfi1_devdata *dd) +{ + struct platform_config_cache *pcfgcache = &dd->pcfg_cache; + u32 *ptr = NULL; + u32 header1 = 0, header2 = 0, magic_num = 0, crc = 0; + u32 record_idx = 0, table_type = 0, table_length_dwords = 0; + + if (platform_config == NULL) { + dd_dev_info(dd, "%s: Missing config file\n", __func__); + goto bail; + } + ptr = (u32 *)platform_config->data; + + magic_num = *ptr; + ptr++; + if (magic_num != PLATFORM_CONFIG_MAGIC_NUM) { + dd_dev_info(dd, "%s: Bad config file\n", __func__); + goto bail; + } + + while (ptr < (u32 *)(platform_config->data + platform_config->size)) { + header1 = *ptr; + header2 = *(ptr + 1); + if (header1 != ~header2) { + dd_dev_info(dd, "%s: Failed validation at offset %ld\n", + __func__, (ptr - (u32 *)platform_config->data)); + goto bail; + } + + record_idx = *ptr & + ((1 << PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS) - 1); + + table_length_dwords = (*ptr >> + PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT) & + ((1 << PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS) - 1); + + table_type = (*ptr >> PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT) & + ((1 << PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS) - 1); + + /* Done with this set of headers */ + ptr += 2; + + if (record_idx) { + /* data table */ + switch (table_type) { + case PLATFORM_CONFIG_SYSTEM_TABLE: + pcfgcache->config_tables[table_type].num_table = + 1; + break; + case PLATFORM_CONFIG_PORT_TABLE: + pcfgcache->config_tables[table_type].num_table = + 2; + break; + case PLATFORM_CONFIG_RX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_TX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_QSFP_ATTEN_TABLE: + /* fall through */ + case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE: + pcfgcache->config_tables[table_type].num_table = + table_length_dwords; + break; + default: + dd_dev_info(dd, + "%s: Unknown data table %d, offset %ld\n", + __func__, table_type, + (ptr - (u32 *)platform_config->data)); + goto bail; /* We don't trust this file now */ + } + pcfgcache->config_tables[table_type].table = ptr; + } else { + /* metadata table */ + switch (table_type) { + case PLATFORM_CONFIG_SYSTEM_TABLE: + /* fall through */ + case PLATFORM_CONFIG_PORT_TABLE: + /* fall through */ + case PLATFORM_CONFIG_RX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_TX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_QSFP_ATTEN_TABLE: + /* fall through */ + case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE: + break; + default: + dd_dev_info(dd, + "%s: Unknown metadata table %d, offset %ld\n", + __func__, table_type, + (ptr - (u32 *)platform_config->data)); + goto bail; /* We don't trust this file now */ + } + pcfgcache->config_tables[table_type].table_metadata = + ptr; + } + + /* Calculate and check table crc */ + crc = crc32_le(~(u32)0, (unsigned char const *)ptr, + (table_length_dwords * 4)); + crc ^= ~(u32)0; + + /* Jump the table */ + ptr += table_length_dwords; + if (crc != *ptr) { + dd_dev_info(dd, "%s: Failed CRC check at offset %ld\n", + __func__, (ptr - (u32 *)platform_config->data)); + goto bail; + } + /* Jump the CRC DWORD */ + ptr++; + } + + pcfgcache->cache_valid = 1; + return 0; +bail: + memset(pcfgcache, 0, sizeof(struct platform_config_cache)); + return -EINVAL; +} + +static int get_platform_fw_field_metadata(struct hfi1_devdata *dd, int table, + int field, u32 *field_len_bits, u32 *field_start_bits) +{ + struct platform_config_cache *pcfgcache = &dd->pcfg_cache; + u32 *src_ptr = NULL; + + if (!pcfgcache->cache_valid) + return -EINVAL; + + switch (table) { + case PLATFORM_CONFIG_SYSTEM_TABLE: + /* fall through */ + case PLATFORM_CONFIG_PORT_TABLE: + /* fall through */ + case PLATFORM_CONFIG_RX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_TX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_QSFP_ATTEN_TABLE: + /* fall through */ + case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE: + if (field && field < platform_config_table_limits[table]) + src_ptr = + pcfgcache->config_tables[table].table_metadata + field; + break; + default: + dd_dev_info(dd, "%s: Unknown table\n", __func__); + break; + } + + if (!src_ptr) + return -EINVAL; + + if (field_start_bits) + *field_start_bits = *src_ptr & + ((1 << METADATA_TABLE_FIELD_START_LEN_BITS) - 1); + + if (field_len_bits) + *field_len_bits = (*src_ptr >> METADATA_TABLE_FIELD_LEN_SHIFT) + & ((1 << METADATA_TABLE_FIELD_LEN_LEN_BITS) - 1); + + return 0; +} + +/* This is the central interface to getting data out of the platform config + * file. It depends on parse_platform_config() having populated the + * platform_config_cache in hfi1_devdata, and checks the cache_valid member to + * validate the sanity of the cache. + * + * The non-obvious parameters: + * @table_index: Acts as a look up key into which instance of the tables the + * relevant field is fetched from. + * + * This applies to the data tables that have multiple instances. The port table + * is an exception to this rule as each HFI only has one port and thus the + * relevant table can be distinguished by hfi_id. + * + * @data: pointer to memory that will be populated with the field requested. + * @len: length of memory pointed by @data in bytes. + */ +int get_platform_config_field(struct hfi1_devdata *dd, + enum platform_config_table_type_encoding table_type, + int table_index, int field_index, u32 *data, u32 len) +{ + int ret = 0, wlen = 0, seek = 0; + u32 field_len_bits = 0, field_start_bits = 0, *src_ptr = NULL; + struct platform_config_cache *pcfgcache = &dd->pcfg_cache; + + if (data) + memset(data, 0, len); + else + return -EINVAL; + + ret = get_platform_fw_field_metadata(dd, table_type, field_index, + &field_len_bits, &field_start_bits); + if (ret) + return -EINVAL; + + /* Convert length to bits */ + len *= 8; + + /* Our metadata function checked cache_valid and field_index for us */ + switch (table_type) { + case PLATFORM_CONFIG_SYSTEM_TABLE: + src_ptr = pcfgcache->config_tables[table_type].table; + + if (field_index != SYSTEM_TABLE_QSFP_POWER_CLASS_MAX) { + if (len < field_len_bits) + return -EINVAL; + + seek = field_start_bits/8; + wlen = field_len_bits/8; + + src_ptr = (u32 *)((u8 *)src_ptr + seek); + + /* We expect the field to be byte aligned and whole byte + * lengths if we are here */ + memcpy(data, src_ptr, wlen); + return 0; + } + break; + case PLATFORM_CONFIG_PORT_TABLE: + /* Port table is 4 DWORDS in META_VERSION 0 */ + src_ptr = dd->hfi1_id ? + pcfgcache->config_tables[table_type].table + 4 : + pcfgcache->config_tables[table_type].table; + break; + case PLATFORM_CONFIG_RX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_TX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_QSFP_ATTEN_TABLE: + /* fall through */ + case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE: + src_ptr = pcfgcache->config_tables[table_type].table; + + if (table_index < + pcfgcache->config_tables[table_type].num_table) + src_ptr += table_index; + else + src_ptr = NULL; + break; + default: + dd_dev_info(dd, "%s: Unknown table\n", __func__); + break; + } + + if (!src_ptr || len < field_len_bits) + return -EINVAL; + + src_ptr += (field_start_bits/32); + *data = (*src_ptr >> (field_start_bits % 32)) & + ((1 << field_len_bits) - 1); + + return 0; +} + +/* + * Download the firmware needed for the Gen3 PCIe SerDes. An update + * to the SBus firmware is needed before updating the PCIe firmware. + * + * Note: caller must be holding the HW mutex. + */ +int load_pcie_firmware(struct hfi1_devdata *dd) +{ + int ret = 0; + + /* both firmware loads below use the SBus */ + set_sbus_fast_mode(dd); + + if (fw_sbus_load) { + turn_off_spicos(dd, SPICO_SBUS); + ret = load_sbus_firmware(dd, &fw_sbus); + if (ret) + goto done; + } + + if (fw_pcie_serdes_load) { + dd_dev_info(dd, "Setting PCIe SerDes broadcast\n"); + set_serdes_broadcast(dd, all_pcie_serdes_broadcast, + pcie_serdes_broadcast[dd->hfi1_id], + pcie_serdes_addrs[dd->hfi1_id], + NUM_PCIE_SERDES); + ret = load_pcie_serdes_firmware(dd, &fw_pcie); + if (ret) + goto done; + } + +done: + clear_sbus_fast_mode(dd); + + return ret; +} + +/* + * Read the GUID from the hardware, store it in dd. + */ +void read_guid(struct hfi1_devdata *dd) +{ + dd->base_guid = read_csr(dd, DC_DC8051_CFG_LOCAL_GUID); + dd_dev_info(dd, "GUID %llx", + (unsigned long long)dd->base_guid); +} diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/staging/rdma/hfi1/hfi.h new file mode 100644 index 000000000..8ca171bf3 --- /dev/null +++ b/drivers/staging/rdma/hfi1/hfi.h @@ -0,0 +1,1821 @@ +#ifndef _HFI1_KERNEL_H +#define _HFI1_KERNEL_H +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "chip_registers.h" +#include "common.h" +#include "verbs.h" +#include "pio.h" +#include "chip.h" +#include "mad.h" +#include "qsfp.h" +#include "platform_config.h" + +/* bumped 1 from s/w major version of TrueScale */ +#define HFI1_CHIP_VERS_MAJ 3U + +/* don't care about this except printing */ +#define HFI1_CHIP_VERS_MIN 0U + +/* The Organization Unique Identifier (Mfg code), and its position in GUID */ +#define HFI1_OUI 0x001175 +#define HFI1_OUI_LSB 40 + +#define DROP_PACKET_OFF 0 +#define DROP_PACKET_ON 1 + +extern unsigned long hfi1_cap_mask; +#define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap) +#define HFI1_CAP_UGET_MASK(mask, cap) \ + (((mask) >> HFI1_CAP_USER_SHIFT) & HFI1_CAP_##cap) +#define HFI1_CAP_KGET(cap) (HFI1_CAP_KGET_MASK(hfi1_cap_mask, cap)) +#define HFI1_CAP_UGET(cap) (HFI1_CAP_UGET_MASK(hfi1_cap_mask, cap)) +#define HFI1_CAP_IS_KSET(cap) (!!HFI1_CAP_KGET(cap)) +#define HFI1_CAP_IS_USET(cap) (!!HFI1_CAP_UGET(cap)) +#define HFI1_MISC_GET() ((hfi1_cap_mask >> HFI1_CAP_MISC_SHIFT) & \ + HFI1_CAP_MISC_MASK) + +/* + * per driver stats, either not device nor port-specific, or + * summed over all of the devices and ports. + * They are described by name via ipathfs filesystem, so layout + * and number of elements can change without breaking compatibility. + * If members are added or deleted hfi1_statnames[] in debugfs.c must + * change to match. + */ +struct hfi1_ib_stats { + __u64 sps_ints; /* number of interrupts handled */ + __u64 sps_errints; /* number of error interrupts */ + __u64 sps_txerrs; /* tx-related packet errors */ + __u64 sps_rcverrs; /* non-crc rcv packet errors */ + __u64 sps_hwerrs; /* hardware errors reported (parity, etc.) */ + __u64 sps_nopiobufs; /* no pio bufs avail from kernel */ + __u64 sps_ctxts; /* number of contexts currently open */ + __u64 sps_lenerrs; /* number of kernel packets where RHF != LRH len */ + __u64 sps_buffull; + __u64 sps_hdrfull; +}; + +extern struct hfi1_ib_stats hfi1_stats; +extern const struct pci_error_handlers hfi1_pci_err_handler; + +/* + * First-cut criterion for "device is active" is + * two thousand dwords combined Tx, Rx traffic per + * 5-second interval. SMA packets are 64 dwords, + * and occur "a few per second", presumably each way. + */ +#define HFI1_TRAFFIC_ACTIVE_THRESHOLD (2000) + +/* + * Below contains all data related to a single context (formerly called port). + */ + +#ifdef CONFIG_DEBUG_FS +struct hfi1_opcode_stats_perctx; +#endif + +/* + * struct ps_state keeps state associated with RX queue "prescanning" + * (prescanning for FECNs, and BECNs), if prescanning is in use. + */ +struct ps_state { + u32 ps_head; + int initialized; +}; + +struct ctxt_eager_bufs { + ssize_t size; /* total size of eager buffers */ + u32 count; /* size of buffers array */ + u32 numbufs; /* number of buffers allocated */ + u32 alloced; /* number of rcvarray entries used */ + u32 rcvtid_size; /* size of each eager rcv tid */ + u32 threshold; /* head update threshold */ + struct eager_buffer { + void *addr; + dma_addr_t phys; + ssize_t len; + } *buffers; + struct { + void *addr; + dma_addr_t phys; + } *rcvtids; +}; + +struct hfi1_ctxtdata { + /* shadow the ctxt's RcvCtrl register */ + u64 rcvctrl; + /* rcvhdrq base, needs mmap before useful */ + void *rcvhdrq; + /* kernel virtual address where hdrqtail is updated */ + volatile __le64 *rcvhdrtail_kvaddr; + /* + * Shared page for kernel to signal user processes that send buffers + * need disarming. The process should call HFI1_CMD_DISARM_BUFS + * or HFI1_CMD_ACK_EVENT with IPATH_EVENT_DISARM_BUFS set. + */ + unsigned long *user_event_mask; + /* when waiting for rcv or pioavail */ + wait_queue_head_t wait; + /* rcvhdrq size (for freeing) */ + size_t rcvhdrq_size; + /* number of rcvhdrq entries */ + u16 rcvhdrq_cnt; + /* size of each of the rcvhdrq entries */ + u16 rcvhdrqentsize; + /* mmap of hdrq, must fit in 44 bits */ + dma_addr_t rcvhdrq_phys; + dma_addr_t rcvhdrqtailaddr_phys; + struct ctxt_eager_bufs egrbufs; + /* this receive context's assigned PIO ACK send context */ + struct send_context *sc; + + /* dynamic receive available interrupt timeout */ + u32 rcvavail_timeout; + /* + * number of opens (including slave sub-contexts) on this instance + * (ignoring forks, dup, etc. for now) + */ + int cnt; + /* + * how much space to leave at start of eager TID entries for + * protocol use, on each TID + */ + /* instead of calculating it */ + unsigned ctxt; + /* non-zero if ctxt is being shared. */ + u16 subctxt_cnt; + /* non-zero if ctxt is being shared. */ + u16 subctxt_id; + u8 uuid[16]; + /* job key */ + u16 jkey; + /* number of RcvArray groups for this context. */ + u32 rcv_array_groups; + /* index of first eager TID entry. */ + u32 eager_base; + /* number of expected TID entries */ + u32 expected_count; + /* index of first expected TID entry. */ + u32 expected_base; + /* cursor into the exp group sets */ + atomic_t tidcursor; + /* number of exp TID groups assigned to the ctxt */ + u16 numtidgroups; + /* size of exp TID group fields in tidusemap */ + u16 tidmapcnt; + /* exp TID group usage bitfield array */ + unsigned long *tidusemap; + /* pinned pages for exp sends, allocated at open */ + struct page **tid_pg_list; + /* dma handles for exp tid pages */ + dma_addr_t *physshadow; + /* lock protecting all Expected TID data */ + spinlock_t exp_lock; + /* number of pio bufs for this ctxt (all procs, if shared) */ + u32 piocnt; + /* first pio buffer for this ctxt */ + u32 pio_base; + /* chip offset of PIO buffers for this ctxt */ + u32 piobufs; + /* per-context configuration flags */ + u16 flags; + /* per-context event flags for fileops/intr communication */ + unsigned long event_flags; + /* WAIT_RCV that timed out, no interrupt */ + u32 rcvwait_to; + /* WAIT_PIO that timed out, no interrupt */ + u32 piowait_to; + /* WAIT_RCV already happened, no wait */ + u32 rcvnowait; + /* WAIT_PIO already happened, no wait */ + u32 pionowait; + /* total number of polled urgent packets */ + u32 urgent; + /* saved total number of polled urgent packets for poll edge trigger */ + u32 urgent_poll; + /* pid of process using this ctxt */ + pid_t pid; + pid_t subpid[HFI1_MAX_SHARED_CTXTS]; + /* same size as task_struct .comm[], command that opened context */ + char comm[16]; + /* so file ops can get at unit */ + struct hfi1_devdata *dd; + /* so functions that need physical port can get it easily */ + struct hfi1_pportdata *ppd; + /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */ + void *subctxt_uregbase; + /* An array of pages for the eager receive buffers * N */ + void *subctxt_rcvegrbuf; + /* An array of pages for the eager header queue entries * N */ + void *subctxt_rcvhdr_base; + /* The version of the library which opened this ctxt */ + u32 userversion; + /* Bitmask of active slaves */ + u32 active_slaves; + /* Type of packets or conditions we want to poll for */ + u16 poll_type; + /* receive packet sequence counter */ + u8 seq_cnt; + u8 redirect_seq_cnt; + /* ctxt rcvhdrq head offset */ + u32 head; + u32 pkt_count; + /* QPs waiting for context processing */ + struct list_head qp_wait_list; + /* interrupt handling */ + u64 imask; /* clear interrupt mask */ + int ireg; /* clear interrupt register */ + unsigned numa_id; /* numa node of this context */ + /* verbs stats per CTX */ + struct hfi1_opcode_stats_perctx *opstats; + /* + * This is the kernel thread that will keep making + * progress on the user sdma requests behind the scenes. + * There is one per context (shared contexts use the master's). + */ + struct task_struct *progress; + struct list_head sdma_queues; + spinlock_t sdma_qlock; + +#ifdef CONFIG_PRESCAN_RXQ + struct ps_state ps_state; +#endif /* CONFIG_PRESCAN_RXQ */ + + /* + * The interrupt handler for a particular receive context can vary + * throughout it's lifetime. This is not a lock protected data member so + * it must be updated atomically and the prev and new value must always + * be valid. Worst case is we process an extra interrupt and up to 64 + * packets with the wrong interrupt handler. + */ + void (*do_interrupt)(struct hfi1_ctxtdata *rcd); +}; + +/* + * Represents a single packet at a high level. Put commonly computed things in + * here so we do not have to keep doing them over and over. The rule of thumb is + * if something is used one time to derive some value, store that something in + * here. If it is used multiple times, then store the result of that derivation + * in here. + */ +struct hfi1_packet { + void *ebuf; + void *hdr; + struct hfi1_ctxtdata *rcd; + __le32 *rhf_addr; + struct hfi1_qp *qp; + struct hfi1_other_headers *ohdr; + u64 rhf; + u32 maxcnt; + u32 rhqoff; + u32 hdrqtail; + int numpkt; + u16 tlen; + u16 hlen; + s16 etail; + u16 rsize; + u8 updegr; + u8 rcv_flags; + u8 etype; +}; + +static inline bool has_sc4_bit(struct hfi1_packet *p) +{ + return !!rhf_dc_info(p->rhf); +} + +/* + * Private data for snoop/capture support. + */ +struct hfi1_snoop_data { + int mode_flag; + struct cdev cdev; + struct device *class_dev; + spinlock_t snoop_lock; + struct list_head queue; + wait_queue_head_t waitq; + void *filter_value; + int (*filter_callback)(void *hdr, void *data, void *value); + u64 dcc_cfg; /* saved value of DCC Cfg register */ +}; + +/* snoop mode_flag values */ +#define HFI1_PORT_SNOOP_MODE 1U +#define HFI1_PORT_CAPTURE_MODE 2U + +struct hfi1_sge_state; + +/* + * Get/Set IB link-level config parameters for f_get/set_ib_cfg() + * Mostly for MADs that set or query link parameters, also ipath + * config interfaces + */ +#define HFI1_IB_CFG_LIDLMC 0 /* LID (LS16b) and Mask (MS16b) */ +#define HFI1_IB_CFG_LWID_DG_ENB 1 /* allowed Link-width downgrade */ +#define HFI1_IB_CFG_LWID_ENB 2 /* allowed Link-width */ +#define HFI1_IB_CFG_LWID 3 /* currently active Link-width */ +#define HFI1_IB_CFG_SPD_ENB 4 /* allowed Link speeds */ +#define HFI1_IB_CFG_SPD 5 /* current Link spd */ +#define HFI1_IB_CFG_RXPOL_ENB 6 /* Auto-RX-polarity enable */ +#define HFI1_IB_CFG_LREV_ENB 7 /* Auto-Lane-reversal enable */ +#define HFI1_IB_CFG_LINKLATENCY 8 /* Link Latency (IB1.2 only) */ +#define HFI1_IB_CFG_HRTBT 9 /* IB heartbeat off/enable/auto; DDR/QDR only */ +#define HFI1_IB_CFG_OP_VLS 10 /* operational VLs */ +#define HFI1_IB_CFG_VL_HIGH_CAP 11 /* num of VL high priority weights */ +#define HFI1_IB_CFG_VL_LOW_CAP 12 /* num of VL low priority weights */ +#define HFI1_IB_CFG_OVERRUN_THRESH 13 /* IB overrun threshold */ +#define HFI1_IB_CFG_PHYERR_THRESH 14 /* IB PHY error threshold */ +#define HFI1_IB_CFG_LINKDEFAULT 15 /* IB link default (sleep/poll) */ +#define HFI1_IB_CFG_PKEYS 16 /* update partition keys */ +#define HFI1_IB_CFG_MTU 17 /* update MTU in IBC */ +#define HFI1_IB_CFG_VL_HIGH_LIMIT 19 +#define HFI1_IB_CFG_PMA_TICKS 20 /* PMA sample tick resolution */ +#define HFI1_IB_CFG_PORT 21 /* switch port we are connected to */ + +/* + * HFI or Host Link States + * + * These describe the states the driver thinks the logical and physical + * states are in. Used as an argument to set_link_state(). Implemented + * as bits for easy multi-state checking. The actual state can only be + * one. + */ +#define __HLS_UP_INIT_BP 0 +#define __HLS_UP_ARMED_BP 1 +#define __HLS_UP_ACTIVE_BP 2 +#define __HLS_DN_DOWNDEF_BP 3 /* link down default */ +#define __HLS_DN_POLL_BP 4 +#define __HLS_DN_DISABLE_BP 5 +#define __HLS_DN_OFFLINE_BP 6 +#define __HLS_VERIFY_CAP_BP 7 +#define __HLS_GOING_UP_BP 8 +#define __HLS_GOING_OFFLINE_BP 9 +#define __HLS_LINK_COOLDOWN_BP 10 + +#define HLS_UP_INIT (1 << __HLS_UP_INIT_BP) +#define HLS_UP_ARMED (1 << __HLS_UP_ARMED_BP) +#define HLS_UP_ACTIVE (1 << __HLS_UP_ACTIVE_BP) +#define HLS_DN_DOWNDEF (1 << __HLS_DN_DOWNDEF_BP) /* link down default */ +#define HLS_DN_POLL (1 << __HLS_DN_POLL_BP) +#define HLS_DN_DISABLE (1 << __HLS_DN_DISABLE_BP) +#define HLS_DN_OFFLINE (1 << __HLS_DN_OFFLINE_BP) +#define HLS_VERIFY_CAP (1 << __HLS_VERIFY_CAP_BP) +#define HLS_GOING_UP (1 << __HLS_GOING_UP_BP) +#define HLS_GOING_OFFLINE (1 << __HLS_GOING_OFFLINE_BP) +#define HLS_LINK_COOLDOWN (1 << __HLS_LINK_COOLDOWN_BP) + +#define HLS_UP (HLS_UP_INIT | HLS_UP_ARMED | HLS_UP_ACTIVE) + +/* use this MTU size if none other is given */ +#define HFI1_DEFAULT_ACTIVE_MTU 8192 +/* use this MTU size as the default maximum */ +#define HFI1_DEFAULT_MAX_MTU 8192 +/* default partition key */ +#define DEFAULT_PKEY 0xffff + +/* + * Possible fabric manager config parameters for fm_{get,set}_table() + */ +#define FM_TBL_VL_HIGH_ARB 1 /* Get/set VL high prio weights */ +#define FM_TBL_VL_LOW_ARB 2 /* Get/set VL low prio weights */ +#define FM_TBL_BUFFER_CONTROL 3 /* Get/set Buffer Control */ +#define FM_TBL_SC2VLNT 4 /* Get/set SC->VLnt */ +#define FM_TBL_VL_PREEMPT_ELEMS 5 /* Get (no set) VL preempt elems */ +#define FM_TBL_VL_PREEMPT_MATRIX 6 /* Get (no set) VL preempt matrix */ + +/* + * Possible "operations" for f_rcvctrl(ppd, op, ctxt) + * these are bits so they can be combined, e.g. + * HFI1_RCVCTRL_INTRAVAIL_ENB | HFI1_RCVCTRL_CTXT_ENB + */ +#define HFI1_RCVCTRL_TAILUPD_ENB 0x01 +#define HFI1_RCVCTRL_TAILUPD_DIS 0x02 +#define HFI1_RCVCTRL_CTXT_ENB 0x04 +#define HFI1_RCVCTRL_CTXT_DIS 0x08 +#define HFI1_RCVCTRL_INTRAVAIL_ENB 0x10 +#define HFI1_RCVCTRL_INTRAVAIL_DIS 0x20 +#define HFI1_RCVCTRL_PKEY_ENB 0x40 /* Note, default is enabled */ +#define HFI1_RCVCTRL_PKEY_DIS 0x80 +#define HFI1_RCVCTRL_TIDFLOW_ENB 0x0400 +#define HFI1_RCVCTRL_TIDFLOW_DIS 0x0800 +#define HFI1_RCVCTRL_ONE_PKT_EGR_ENB 0x1000 +#define HFI1_RCVCTRL_ONE_PKT_EGR_DIS 0x2000 +#define HFI1_RCVCTRL_NO_RHQ_DROP_ENB 0x4000 +#define HFI1_RCVCTRL_NO_RHQ_DROP_DIS 0x8000 +#define HFI1_RCVCTRL_NO_EGR_DROP_ENB 0x10000 +#define HFI1_RCVCTRL_NO_EGR_DROP_DIS 0x20000 + +/* partition enforcement flags */ +#define HFI1_PART_ENFORCE_IN 0x1 +#define HFI1_PART_ENFORCE_OUT 0x2 + +/* how often we check for synthetic counter wrap around */ +#define SYNTH_CNT_TIME 2 + +/* Counter flags */ +#define CNTR_NORMAL 0x0 /* Normal counters, just read register */ +#define CNTR_SYNTH 0x1 /* Synthetic counters, saturate at all 1s */ +#define CNTR_DISABLED 0x2 /* Disable this counter */ +#define CNTR_32BIT 0x4 /* Simulate 64 bits for this counter */ +#define CNTR_VL 0x8 /* Per VL counter */ +#define CNTR_INVALID_VL -1 /* Specifies invalid VL */ +#define CNTR_MODE_W 0x0 +#define CNTR_MODE_R 0x1 + +/* VLs Supported/Operational */ +#define HFI1_MIN_VLS_SUPPORTED 1 +#define HFI1_MAX_VLS_SUPPORTED 8 + +static inline void incr_cntr64(u64 *cntr) +{ + if (*cntr < (u64)-1LL) + (*cntr)++; +} + +static inline void incr_cntr32(u32 *cntr) +{ + if (*cntr < (u32)-1LL) + (*cntr)++; +} + +#define MAX_NAME_SIZE 64 +struct hfi1_msix_entry { + struct msix_entry msix; + void *arg; + char name[MAX_NAME_SIZE]; + cpumask_var_t mask; +}; + +/* per-SL CCA information */ +struct cca_timer { + struct hrtimer hrtimer; + struct hfi1_pportdata *ppd; /* read-only */ + int sl; /* read-only */ + u16 ccti; /* read/write - current value of CCTI */ +}; + +struct link_down_reason { + /* + * SMA-facing value. Should be set from .latest when + * HLS_UP_* -> HLS_DN_* transition actually occurs. + */ + u8 sma; + u8 latest; +}; + +enum { + LO_PRIO_TABLE, + HI_PRIO_TABLE, + MAX_PRIO_TABLE +}; + +struct vl_arb_cache { + spinlock_t lock; + struct ib_vl_weight_elem table[VL_ARB_TABLE_SIZE]; +}; + +/* + * The structure below encapsulates data relevant to a physical IB Port. + * Current chips support only one such port, but the separation + * clarifies things a bit. Note that to conform to IB conventions, + * port-numbers are one-based. The first or only port is port1. + */ +struct hfi1_pportdata { + struct hfi1_ibport ibport_data; + + struct hfi1_devdata *dd; + struct kobject pport_cc_kobj; + struct kobject sc2vl_kobj; + struct kobject sl2sc_kobj; + struct kobject vl2mtu_kobj; + + /* QSFP support */ + struct qsfp_data qsfp_info; + + /* GUID for this interface, in host order */ + u64 guid; + /* GUID for peer interface, in host order */ + u64 neighbor_guid; + + /* up or down physical link state */ + u32 linkup; + + /* + * this address is mapped read-only into user processes so they can + * get status cheaply, whenever they want. One qword of status per port + */ + u64 *statusp; + + /* SendDMA related entries */ + + struct workqueue_struct *hfi1_wq; + + /* move out of interrupt context */ + struct work_struct link_vc_work; + struct work_struct link_up_work; + struct work_struct link_down_work; + struct work_struct sma_message_work; + struct work_struct freeze_work; + struct work_struct link_downgrade_work; + struct work_struct link_bounce_work; + /* host link state variables */ + struct mutex hls_lock; + u32 host_link_state; + + spinlock_t sdma_alllock ____cacheline_aligned_in_smp; + + u32 lstate; /* logical link state */ + + /* these are the "32 bit" regs */ + + u32 ibmtu; /* The MTU programmed for this unit */ + /* + * Current max size IB packet (in bytes) including IB headers, that + * we can send. Changes when ibmtu changes. + */ + u32 ibmaxlen; + u32 current_egress_rate; /* units [10^6 bits/sec] */ + /* LID programmed for this instance */ + u16 lid; + /* list of pkeys programmed; 0 if not set */ + u16 pkeys[MAX_PKEY_VALUES]; + u16 link_width_supported; + u16 link_width_downgrade_supported; + u16 link_speed_supported; + u16 link_width_enabled; + u16 link_width_downgrade_enabled; + u16 link_speed_enabled; + u16 link_width_active; + u16 link_width_downgrade_tx_active; + u16 link_width_downgrade_rx_active; + u16 link_speed_active; + u8 vls_supported; + u8 vls_operational; + /* LID mask control */ + u8 lmc; + /* Rx Polarity inversion (compensate for ~tx on partner) */ + u8 rx_pol_inv; + + u8 hw_pidx; /* physical port index */ + u8 port; /* IB port number and index into dd->pports - 1 */ + /* type of neighbor node */ + u8 neighbor_type; + u8 neighbor_normal; + u8 neighbor_fm_security; /* 1 if firmware checking is disabled */ + u8 neighbor_port_number; + u8 is_sm_config_started; + u8 offline_disabled_reason; + u8 is_active_optimize_enabled; + u8 driver_link_ready; /* driver ready for active link */ + u8 link_enabled; /* link enabled? */ + u8 linkinit_reason; + u8 local_tx_rate; /* rate given to 8051 firmware */ + + /* placeholders for IB MAD packet settings */ + u8 overrun_threshold; + u8 phy_error_threshold; + + /* used to override LED behavior */ + u8 led_override; /* Substituted for normal value, if non-zero */ + u16 led_override_timeoff; /* delta to next timer event */ + u8 led_override_vals[2]; /* Alternates per blink-frame */ + u8 led_override_phase; /* Just counts, LSB picks from vals[] */ + atomic_t led_override_timer_active; + /* Used to flash LEDs in override mode */ + struct timer_list led_override_timer; + u32 sm_trap_qp; + u32 sa_qp; + + /* + * cca_timer_lock protects access to the per-SL cca_timer + * structures (specifically the ccti member). + */ + spinlock_t cca_timer_lock ____cacheline_aligned_in_smp; + struct cca_timer cca_timer[OPA_MAX_SLS]; + + /* List of congestion control table entries */ + struct ib_cc_table_entry_shadow ccti_entries[CC_TABLE_SHADOW_MAX]; + + /* congestion entries, each entry corresponding to a SL */ + struct opa_congestion_setting_entry_shadow + congestion_entries[OPA_MAX_SLS]; + + /* + * cc_state_lock protects (write) access to the per-port + * struct cc_state. + */ + spinlock_t cc_state_lock ____cacheline_aligned_in_smp; + + struct cc_state __rcu *cc_state; + + /* Total number of congestion control table entries */ + u16 total_cct_entry; + + /* Bit map identifying service level */ + u32 cc_sl_control_map; + + /* CA's max number of 64 entry units in the congestion control table */ + u8 cc_max_table_entries; + + /* begin congestion log related entries + * cc_log_lock protects all congestion log related data */ + spinlock_t cc_log_lock ____cacheline_aligned_in_smp; + u8 threshold_cong_event_map[OPA_MAX_SLS/8]; + u16 threshold_event_counter; + struct opa_hfi1_cong_log_event_internal cc_events[OPA_CONG_LOG_ELEMS]; + int cc_log_idx; /* index for logging events */ + int cc_mad_idx; /* index for reporting events */ + /* end congestion log related entries */ + + struct vl_arb_cache vl_arb_cache[MAX_PRIO_TABLE]; + + /* port relative counter buffer */ + u64 *cntrs; + /* port relative synthetic counter buffer */ + u64 *scntrs; + /* we synthesize port_xmit_discards from several egress errors */ + u64 port_xmit_discards; + u64 port_xmit_constraint_errors; + u64 port_rcv_constraint_errors; + /* count of 'link_err' interrupts from DC */ + u64 link_downed; + /* number of times link retrained successfully */ + u64 link_up; + /* port_ltp_crc_mode is returned in 'portinfo' MADs */ + u16 port_ltp_crc_mode; + /* port_crc_mode_enabled is the crc we support */ + u8 port_crc_mode_enabled; + /* mgmt_allowed is also returned in 'portinfo' MADs */ + u8 mgmt_allowed; + u8 part_enforce; /* partition enforcement flags */ + struct link_down_reason local_link_down_reason; + struct link_down_reason neigh_link_down_reason; + /* Value to be sent to link peer on LinkDown .*/ + u8 remote_link_down_reason; + /* Error events that will cause a port bounce. */ + u32 port_error_action; +}; + +typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet); + +typedef void (*opcode_handler)(struct hfi1_packet *packet); + +/* return values for the RHF receive functions */ +#define RHF_RCV_CONTINUE 0 /* keep going */ +#define RHF_RCV_DONE 1 /* stop, this packet processed */ +#define RHF_RCV_REPROCESS 2 /* stop. retain this packet */ + +struct rcv_array_data { + u8 group_size; + u16 ngroups; + u16 nctxt_extra; +}; + +struct per_vl_data { + u16 mtu; + struct send_context *sc; +}; + +/* 16 to directly index */ +#define PER_VL_SEND_CONTEXTS 16 + +struct err_info_rcvport { + u8 status_and_code; + u64 packet_flit1; + u64 packet_flit2; +}; + +struct err_info_constraint { + u8 status; + u16 pkey; + u32 slid; +}; + +struct hfi1_temp { + unsigned int curr; /* current temperature */ + unsigned int lo_lim; /* low temperature limit */ + unsigned int hi_lim; /* high temperature limit */ + unsigned int crit_lim; /* critical temperature limit */ + u8 triggers; /* temperature triggers */ +}; + +/* device data struct now contains only "general per-device" info. + * fields related to a physical IB port are in a hfi1_pportdata struct. + */ +struct sdma_engine; +struct sdma_vl_map; + +#define BOARD_VERS_MAX 96 /* how long the version string can be */ +#define SERIAL_MAX 16 /* length of the serial number */ + +struct hfi1_devdata { + struct hfi1_ibdev verbs_dev; /* must be first */ + struct list_head list; + /* pointers to related structs for this device */ + /* pci access data structure */ + struct pci_dev *pcidev; + struct cdev user_cdev; + struct cdev diag_cdev; + struct cdev ui_cdev; + struct device *user_device; + struct device *diag_device; + struct device *ui_device; + + /* mem-mapped pointer to base of chip regs */ + u8 __iomem *kregbase; + /* end of mem-mapped chip space excluding sendbuf and user regs */ + u8 __iomem *kregend; + /* physical address of chip for io_remap, etc. */ + resource_size_t physaddr; + /* receive context data */ + struct hfi1_ctxtdata **rcd; + /* send context data */ + struct send_context_info *send_contexts; + /* map hardware send contexts to software index */ + u8 *hw_to_sw; + /* spinlock for allocating and releasing send context resources */ + spinlock_t sc_lock; + /* Per VL data. Enough for all VLs but not all elements are set/used. */ + struct per_vl_data vld[PER_VL_SEND_CONTEXTS]; + /* seqlock for sc2vl */ + seqlock_t sc2vl_lock; + u64 sc2vl[4]; + /* Send Context initialization lock. */ + spinlock_t sc_init_lock; + + /* fields common to all SDMA engines */ + + /* default flags to last descriptor */ + u64 default_desc1; + volatile __le64 *sdma_heads_dma; /* DMA'ed by chip */ + dma_addr_t sdma_heads_phys; + void *sdma_pad_dma; /* DMA'ed by chip */ + dma_addr_t sdma_pad_phys; + /* for deallocation */ + size_t sdma_heads_size; + /* number from the chip */ + u32 chip_sdma_engines; + /* num used */ + u32 num_sdma; + /* lock for sdma_map */ + spinlock_t sde_map_lock; + /* array of engines sized by num_sdma */ + struct sdma_engine *per_sdma; + /* array of vl maps */ + struct sdma_vl_map __rcu *sdma_map; + /* SPC freeze waitqueue and variable */ + wait_queue_head_t sdma_unfreeze_wq; + atomic_t sdma_unfreeze_count; + + + /* hfi1_pportdata, points to array of (physical) port-specific + * data structs, indexed by pidx (0..n-1) + */ + struct hfi1_pportdata *pport; + + /* mem-mapped pointer to base of PIO buffers */ + void __iomem *piobase; + /* + * write-combining mem-mapped pointer to base of RcvArray + * memory. + */ + void __iomem *rcvarray_wc; + /* + * credit return base - a per-NUMA range of DMA address that + * the chip will use to update the per-context free counter + */ + struct credit_return_base *cr_base; + + /* send context numbers and sizes for each type */ + struct sc_config_sizes sc_sizes[SC_MAX]; + + u32 lcb_access_count; /* count of LCB users */ + + char *boardname; /* human readable board info */ + + /* device (not port) flags, basically device capabilities */ + u32 flags; + + /* reset value */ + u64 z_int_counter; + u64 z_rcv_limit; + /* percpu int_counter */ + u64 __percpu *int_counter; + u64 __percpu *rcv_limit; + + /* number of receive contexts in use by the driver */ + u32 num_rcv_contexts; + /* number of pio send contexts in use by the driver */ + u32 num_send_contexts; + /* + * number of ctxts available for PSM open + */ + u32 freectxts; + /* base receive interrupt timeout, in CSR units */ + u32 rcv_intr_timeout_csr; + + u64 __iomem *egrtidbase; + spinlock_t sendctrl_lock; /* protect changes to SendCtrl */ + spinlock_t rcvctrl_lock; /* protect changes to RcvCtrl */ + /* around rcd and (user ctxts) ctxt_cnt use (intr vs free) */ + spinlock_t uctxt_lock; /* rcd and user context changes */ + /* exclusive access to 8051 */ + spinlock_t dc8051_lock; + /* exclusive access to 8051 memory */ + spinlock_t dc8051_memlock; + int dc8051_timed_out; /* remember if the 8051 timed out */ + /* + * A page that will hold event notification bitmaps for all + * contexts. This page will be mapped into all processes. + */ + unsigned long *events; + /* + * per unit status, see also portdata statusp + * mapped read-only into user processes so they can get unit and + * IB link status cheaply + */ + struct hfi1_status *status; + u32 freezelen; /* max length of freezemsg */ + + /* revision register shadow */ + u64 revision; + /* Base GUID for device (network order) */ + u64 base_guid; + + /* these are the "32 bit" regs */ + + /* value we put in kr_rcvhdrsize */ + u32 rcvhdrsize; + /* number of receive contexts the chip supports */ + u32 chip_rcv_contexts; + /* number of receive array entries */ + u32 chip_rcv_array_count; + /* number of PIO send contexts the chip supports */ + u32 chip_send_contexts; + /* number of bytes in the PIO memory buffer */ + u32 chip_pio_mem_size; + /* number of bytes in the SDMA memory buffer */ + u32 chip_sdma_mem_size; + + /* size of each rcvegrbuffer */ + u32 rcvegrbufsize; + /* log2 of above */ + u16 rcvegrbufsize_shift; + /* both sides of the PCIe link are gen3 capable */ + u8 link_gen3_capable; + /* localbus width (1, 2,4,8,16,32) from config space */ + u32 lbus_width; + /* localbus speed in MHz */ + u32 lbus_speed; + int unit; /* unit # of this chip */ + int node; /* home node of this chip */ + + /* save these PCI fields to restore after a reset */ + u32 pcibar0; + u32 pcibar1; + u32 pci_rom; + u16 pci_command; + u16 pcie_devctl; + u16 pcie_lnkctl; + u16 pcie_devctl2; + u32 pci_msix0; + u32 pci_lnkctl3; + u32 pci_tph2; + + /* + * ASCII serial number, from flash, large enough for original + * all digit strings, and longer serial number format + */ + u8 serial[SERIAL_MAX]; + /* human readable board version */ + u8 boardversion[BOARD_VERS_MAX]; + u8 lbus_info[32]; /* human readable localbus info */ + /* chip major rev, from CceRevision */ + u8 majrev; + /* chip minor rev, from CceRevision */ + u8 minrev; + /* hardware ID */ + u8 hfi1_id; + /* implementation code */ + u8 icode; + /* default link down value (poll/sleep) */ + u8 link_default; + /* vAU of this device */ + u8 vau; + /* vCU of this device */ + u8 vcu; + /* link credits of this device */ + u16 link_credits; + /* initial vl15 credits to use */ + u16 vl15_init; + + /* Misc small ints */ + /* Number of physical ports available */ + u8 num_pports; + /* Lowest context number which can be used by user processes */ + u8 first_user_ctxt; + u8 n_krcv_queues; + u8 qos_shift; + u8 qpn_mask; + + u16 rhf_offset; /* offset of RHF within receive header entry */ + u16 irev; /* implementation revision */ + u16 dc8051_ver; /* 8051 firmware version */ + + struct platform_config_cache pcfg_cache; + /* control high-level access to qsfp */ + struct mutex qsfp_i2c_mutex; + + struct diag_client *diag_client; + spinlock_t hfi1_diag_trans_lock; /* protect diag observer ops */ + + u8 psxmitwait_supported; + /* cycle length of PS* counters in HW (in picoseconds) */ + u16 psxmitwait_check_rate; + /* high volume overflow errors deferred to tasklet */ + struct tasklet_struct error_tasklet; + /* per device cq worker */ + struct kthread_worker *worker; + + /* MSI-X information */ + struct hfi1_msix_entry *msix_entries; + u32 num_msix_entries; + + /* INTx information */ + u32 requested_intx_irq; /* did we request one? */ + char intx_name[MAX_NAME_SIZE]; /* INTx name */ + + /* general interrupt: mask of handled interrupts */ + u64 gi_mask[CCE_NUM_INT_CSRS]; + + struct rcv_array_data rcv_entries; + + /* + * 64 bit synthetic counters + */ + struct timer_list synth_stats_timer; + + /* + * device counters + */ + char *cntrnames; + size_t cntrnameslen; + size_t ndevcntrs; + u64 *cntrs; + u64 *scntrs; + + /* + * remembered values for synthetic counters + */ + u64 last_tx; + u64 last_rx; + + /* + * per-port counters + */ + size_t nportcntrs; + char *portcntrnames; + size_t portcntrnameslen; + + struct hfi1_snoop_data hfi1_snoop; + + struct err_info_rcvport err_info_rcvport; + struct err_info_constraint err_info_rcv_constraint; + struct err_info_constraint err_info_xmit_constraint; + u8 err_info_uncorrectable; + u8 err_info_fmconfig; + + atomic_t drop_packet; + u8 do_drop; + + /* receive interrupt functions */ + rhf_rcv_function_ptr *rhf_rcv_function_map; + rhf_rcv_function_ptr normal_rhf_rcv_functions[8]; + + /* + * Handlers for outgoing data so that snoop/capture does not + * have to have its hooks in the send path + */ + int (*process_pio_send)(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr, + u32 hdrwords, struct hfi1_sge_state *ss, + u32 len, u32 plen, u32 dwords, u64 pbc); + int (*process_dma_send)(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr, + u32 hdrwords, struct hfi1_sge_state *ss, + u32 len, u32 plen, u32 dwords, u64 pbc); + void (*pio_inline_send)(struct hfi1_devdata *dd, struct pio_buf *pbuf, + u64 pbc, const void *from, size_t count); + + /* OUI comes from the HW. Used everywhere as 3 separate bytes. */ + u8 oui1; + u8 oui2; + u8 oui3; + /* Timer and counter used to detect RcvBufOvflCnt changes */ + struct timer_list rcverr_timer; + u32 rcv_ovfl_cnt; + + int assigned_node_id; + wait_queue_head_t event_queue; + + /* Save the enabled LCB error bits */ + u64 lcb_err_en; + u8 dc_shutdown; +}; + +/* 8051 firmware version helper */ +#define dc8051_ver(a, b) ((a) << 8 | (b)) + +/* f_put_tid types */ +#define PT_EXPECTED 0 +#define PT_EAGER 1 +#define PT_INVALID 2 + +/* Private data for file operations */ +struct hfi1_filedata { + struct hfi1_ctxtdata *uctxt; + unsigned subctxt; + struct hfi1_user_sdma_comp_q *cq; + struct hfi1_user_sdma_pkt_q *pq; + /* for cpu affinity; -1 if none */ + int rec_cpu_num; +}; + +extern struct list_head hfi1_dev_list; +extern spinlock_t hfi1_devs_lock; +struct hfi1_devdata *hfi1_lookup(int unit); +extern u32 hfi1_cpulist_count; +extern unsigned long *hfi1_cpulist; + +extern unsigned int snoop_drop_send; +extern unsigned int snoop_force_capture; +int hfi1_init(struct hfi1_devdata *, int); +int hfi1_count_units(int *npresentp, int *nupp); +int hfi1_count_active_units(void); + +int hfi1_diag_add(struct hfi1_devdata *); +void hfi1_diag_remove(struct hfi1_devdata *); +void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup); + +void handle_user_interrupt(struct hfi1_ctxtdata *rcd); + +int hfi1_create_rcvhdrq(struct hfi1_devdata *, struct hfi1_ctxtdata *); +int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *); +int hfi1_create_ctxts(struct hfi1_devdata *dd); +struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *, u32); +void hfi1_init_pportdata(struct pci_dev *, struct hfi1_pportdata *, + struct hfi1_devdata *, u8, u8); +void hfi1_free_ctxtdata(struct hfi1_devdata *, struct hfi1_ctxtdata *); + +void handle_receive_interrupt(struct hfi1_ctxtdata *); +void handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd); +void handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd); +int hfi1_reset_device(int); + +/* return the driver's idea of the logical OPA port state */ +static inline u32 driver_lstate(struct hfi1_pportdata *ppd) +{ + return ppd->lstate; /* use the cached value */ +} + +static inline u16 generate_jkey(kuid_t uid) +{ + return from_kuid(current_user_ns(), uid) & 0xffff; +} + +/* + * active_egress_rate + * + * returns the active egress rate in units of [10^6 bits/sec] + */ +static inline u32 active_egress_rate(struct hfi1_pportdata *ppd) +{ + u16 link_speed = ppd->link_speed_active; + u16 link_width = ppd->link_width_active; + u32 egress_rate; + + if (link_speed == OPA_LINK_SPEED_25G) + egress_rate = 25000; + else /* assume OPA_LINK_SPEED_12_5G */ + egress_rate = 12500; + + switch (link_width) { + case OPA_LINK_WIDTH_4X: + egress_rate *= 4; + break; + case OPA_LINK_WIDTH_3X: + egress_rate *= 3; + break; + case OPA_LINK_WIDTH_2X: + egress_rate *= 2; + break; + default: + /* assume IB_WIDTH_1X */ + break; + } + + return egress_rate; +} + +/* + * egress_cycles + * + * Returns the number of 'fabric clock cycles' to egress a packet + * of length 'len' bytes, at 'rate' Mbit/s. Since the fabric clock + * rate is (approximately) 805 MHz, the units of the returned value + * are (1/805 MHz). + */ +static inline u32 egress_cycles(u32 len, u32 rate) +{ + u32 cycles; + + /* + * cycles is: + * + * (length) [bits] / (rate) [bits/sec] + * --------------------------------------------------- + * fabric_clock_period == 1 /(805 * 10^6) [cycles/sec] + */ + + cycles = len * 8; /* bits */ + cycles *= 805; + cycles /= rate; + + return cycles; +} + +void set_link_ipg(struct hfi1_pportdata *ppd); +void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, + u32 rqpn, u8 svc_type); +void return_cnp(struct hfi1_ibport *ibp, struct hfi1_qp *qp, u32 remote_qpn, + u32 pkey, u32 slid, u32 dlid, u8 sc5, + const struct ib_grh *old_grh); + +#define PACKET_EGRESS_TIMEOUT 350 +static inline void pause_for_credit_return(struct hfi1_devdata *dd) +{ + /* Pause at least 1us, to ensure chip returns all credits */ + u32 usec = cclock_to_ns(dd, PACKET_EGRESS_TIMEOUT) / 1000; + + udelay(usec ? usec : 1); +} + +/** + * sc_to_vlt() reverse lookup sc to vl + * @dd - devdata + * @sc5 - 5 bit sc + */ +static inline u8 sc_to_vlt(struct hfi1_devdata *dd, u8 sc5) +{ + unsigned seq; + u8 rval; + + if (sc5 >= OPA_MAX_SCS) + return (u8)(0xff); + + do { + seq = read_seqbegin(&dd->sc2vl_lock); + rval = *(((u8 *)dd->sc2vl) + sc5); + } while (read_seqretry(&dd->sc2vl_lock, seq)); + + return rval; +} + +#define PKEY_MEMBER_MASK 0x8000 +#define PKEY_LOW_15_MASK 0x7fff + +/* + * ingress_pkey_matches_entry - return 1 if the pkey matches ent (ent + * being an entry from the ingress partition key table), return 0 + * otherwise. Use the matching criteria for ingress partition keys + * specified in the OPAv1 spec., section 9.10.14. + */ +static inline int ingress_pkey_matches_entry(u16 pkey, u16 ent) +{ + u16 mkey = pkey & PKEY_LOW_15_MASK; + u16 ment = ent & PKEY_LOW_15_MASK; + + if (mkey == ment) { + /* + * If pkey[15] is clear (limited partition member), + * is bit 15 in the corresponding table element + * clear (limited member)? + */ + if (!(pkey & PKEY_MEMBER_MASK)) + return !!(ent & PKEY_MEMBER_MASK); + return 1; + } + return 0; +} + +/* + * ingress_pkey_table_search - search the entire pkey table for + * an entry which matches 'pkey'. return 0 if a match is found, + * and 1 otherwise. + */ +static int ingress_pkey_table_search(struct hfi1_pportdata *ppd, u16 pkey) +{ + int i; + + for (i = 0; i < MAX_PKEY_VALUES; i++) { + if (ingress_pkey_matches_entry(pkey, ppd->pkeys[i])) + return 0; + } + return 1; +} + +/* + * ingress_pkey_table_fail - record a failure of ingress pkey validation, + * i.e., increment port_rcv_constraint_errors for the port, and record + * the 'error info' for this failure. + */ +static void ingress_pkey_table_fail(struct hfi1_pportdata *ppd, u16 pkey, + u16 slid) +{ + struct hfi1_devdata *dd = ppd->dd; + + incr_cntr64(&ppd->port_rcv_constraint_errors); + if (!(dd->err_info_rcv_constraint.status & OPA_EI_STATUS_SMASK)) { + dd->err_info_rcv_constraint.status |= OPA_EI_STATUS_SMASK; + dd->err_info_rcv_constraint.slid = slid; + dd->err_info_rcv_constraint.pkey = pkey; + } +} + +/* + * ingress_pkey_check - Return 0 if the ingress pkey is valid, return 1 + * otherwise. Use the criteria in the OPAv1 spec, section 9.10.14. idx + * is a hint as to the best place in the partition key table to begin + * searching. This function should not be called on the data path because + * of performance reasons. On datapath pkey check is expected to be done + * by HW and rcv_pkey_check function should be called instead. + */ +static inline int ingress_pkey_check(struct hfi1_pportdata *ppd, u16 pkey, + u8 sc5, u8 idx, u16 slid) +{ + if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN)) + return 0; + + /* If SC15, pkey[0:14] must be 0x7fff */ + if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK)) + goto bad; + + /* Is the pkey = 0x0, or 0x8000? */ + if ((pkey & PKEY_LOW_15_MASK) == 0) + goto bad; + + /* The most likely matching pkey has index 'idx' */ + if (ingress_pkey_matches_entry(pkey, ppd->pkeys[idx])) + return 0; + + /* no match - try the whole table */ + if (!ingress_pkey_table_search(ppd, pkey)) + return 0; + +bad: + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; +} + +/* + * rcv_pkey_check - Return 0 if the ingress pkey is valid, return 1 + * otherwise. It only ensures pkey is vlid for QP0. This function + * should be called on the data path instead of ingress_pkey_check + * as on data path, pkey check is done by HW (except for QP0). + */ +static inline int rcv_pkey_check(struct hfi1_pportdata *ppd, u16 pkey, + u8 sc5, u16 slid) +{ + if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN)) + return 0; + + /* If SC15, pkey[0:14] must be 0x7fff */ + if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK)) + goto bad; + + return 0; +bad: + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; +} + +/* MTU handling */ + +/* MTU enumeration, 256-4k match IB */ +#define OPA_MTU_0 0 +#define OPA_MTU_256 1 +#define OPA_MTU_512 2 +#define OPA_MTU_1024 3 +#define OPA_MTU_2048 4 +#define OPA_MTU_4096 5 + +u32 lrh_max_header_bytes(struct hfi1_devdata *dd); +int mtu_to_enum(u32 mtu, int default_if_bad); +u16 enum_to_mtu(int); +static inline int valid_ib_mtu(unsigned int mtu) +{ + return mtu == 256 || mtu == 512 || + mtu == 1024 || mtu == 2048 || + mtu == 4096; +} +static inline int valid_opa_max_mtu(unsigned int mtu) +{ + return mtu >= 2048 && + (valid_ib_mtu(mtu) || mtu == 8192 || mtu == 10240); +} + +int set_mtu(struct hfi1_pportdata *); + +int hfi1_set_lid(struct hfi1_pportdata *, u32, u8); +void hfi1_disable_after_error(struct hfi1_devdata *); +int hfi1_set_uevent_bits(struct hfi1_pportdata *, const int); +int hfi1_rcvbuf_validate(u32, u8, u16 *); + +int fm_get_table(struct hfi1_pportdata *, int, void *); +int fm_set_table(struct hfi1_pportdata *, int, void *); + +void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf); +void reset_link_credits(struct hfi1_devdata *dd); +void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu); + +int snoop_recv_handler(struct hfi1_packet *packet); +int snoop_send_dma_handler(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr, + u32 hdrwords, struct hfi1_sge_state *ss, u32 len, + u32 plen, u32 dwords, u64 pbc); +int snoop_send_pio_handler(struct hfi1_qp *qp, struct ahg_ib_header *ibhdr, + u32 hdrwords, struct hfi1_sge_state *ss, u32 len, + u32 plen, u32 dwords, u64 pbc); +void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf, + u64 pbc, const void *from, size_t count); + +/* for use in system calls, where we want to know device type, etc. */ +#define ctxt_fp(fp) \ + (((struct hfi1_filedata *)(fp)->private_data)->uctxt) +#define subctxt_fp(fp) \ + (((struct hfi1_filedata *)(fp)->private_data)->subctxt) +#define tidcursor_fp(fp) \ + (((struct hfi1_filedata *)(fp)->private_data)->tidcursor) +#define user_sdma_pkt_fp(fp) \ + (((struct hfi1_filedata *)(fp)->private_data)->pq) +#define user_sdma_comp_fp(fp) \ + (((struct hfi1_filedata *)(fp)->private_data)->cq) + +static inline struct hfi1_devdata *dd_from_ppd(struct hfi1_pportdata *ppd) +{ + return ppd->dd; +} + +static inline struct hfi1_devdata *dd_from_dev(struct hfi1_ibdev *dev) +{ + return container_of(dev, struct hfi1_devdata, verbs_dev); +} + +static inline struct hfi1_devdata *dd_from_ibdev(struct ib_device *ibdev) +{ + return dd_from_dev(to_idev(ibdev)); +} + +static inline struct hfi1_pportdata *ppd_from_ibp(struct hfi1_ibport *ibp) +{ + return container_of(ibp, struct hfi1_pportdata, ibport_data); +} + +static inline struct hfi1_ibport *to_iport(struct ib_device *ibdev, u8 port) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */ + + WARN_ON(pidx >= dd->num_pports); + return &dd->pport[pidx].ibport_data; +} + +/* + * Return the indexed PKEY from the port PKEY table. + */ +static inline u16 hfi1_get_pkey(struct hfi1_ibport *ibp, unsigned index) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u16 ret; + + if (index >= ARRAY_SIZE(ppd->pkeys)) + ret = 0; + else + ret = ppd->pkeys[index]; + + return ret; +} + +/* + * Readers of cc_state must call get_cc_state() under rcu_read_lock(). + * Writers of cc_state must call get_cc_state() under cc_state_lock. + */ +static inline struct cc_state *get_cc_state(struct hfi1_pportdata *ppd) +{ + return rcu_dereference(ppd->cc_state); +} + +/* + * values for dd->flags (_device_ related flags) + */ +#define HFI1_INITTED 0x1 /* chip and driver up and initted */ +#define HFI1_PRESENT 0x2 /* chip accesses can be done */ +#define HFI1_FROZEN 0x4 /* chip in SPC freeze */ +#define HFI1_HAS_SDMA_TIMEOUT 0x8 +#define HFI1_HAS_SEND_DMA 0x10 /* Supports Send DMA */ +#define HFI1_FORCED_FREEZE 0x80 /* driver forced freeze mode */ +#define HFI1_DO_INIT_ASIC 0x100 /* This device will init the ASIC */ + +/* IB dword length mask in PBC (lower 11 bits); same for all chips */ +#define HFI1_PBC_LENGTH_MASK ((1 << 11) - 1) + + +/* ctxt_flag bit offsets */ + /* context has been setup */ +#define HFI1_CTXT_SETUP_DONE 1 + /* waiting for a packet to arrive */ +#define HFI1_CTXT_WAITING_RCV 2 + /* master has not finished initializing */ +#define HFI1_CTXT_MASTER_UNINIT 4 + /* waiting for an urgent packet to arrive */ +#define HFI1_CTXT_WAITING_URG 5 + +/* free up any allocated data at closes */ +struct hfi1_devdata *hfi1_init_dd(struct pci_dev *, + const struct pci_device_id *); +void hfi1_free_devdata(struct hfi1_devdata *); +void cc_state_reclaim(struct rcu_head *rcu); +struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra); + +/* + * Set LED override, only the two LSBs have "public" meaning, but + * any non-zero value substitutes them for the Link and LinkTrain + * LED states. + */ +#define HFI1_LED_PHYS 1 /* Physical (linktraining) GREEN LED */ +#define HFI1_LED_LOG 2 /* Logical (link) YELLOW LED */ +void hfi1_set_led_override(struct hfi1_pportdata *ppd, unsigned int val); + +#define HFI1_CREDIT_RETURN_RATE (100) + +/* + * The number of words for the KDETH protocol field. If this is + * larger then the actual field used, then part of the payload + * will be in the header. + * + * Optimally, we want this sized so that a typical case will + * use full cache lines. The typical local KDETH header would + * be: + * + * Bytes Field + * 8 LRH + * 12 BHT + * ?? KDETH + * 8 RHF + * --- + * 28 + KDETH + * + * For a 64-byte cache line, KDETH would need to be 36 bytes or 9 DWORDS + */ +#define DEFAULT_RCVHDRSIZE 9 + +/* + * Maximal header byte count: + * + * Bytes Field + * 8 LRH + * 40 GRH (optional) + * 12 BTH + * ?? KDETH + * 8 RHF + * --- + * 68 + KDETH + * + * We also want to maintain a cache line alignment to assist DMA'ing + * of the header bytes. Round up to a good size. + */ +#define DEFAULT_RCVHDR_ENTSIZE 32 + +int hfi1_get_user_pages(unsigned long, size_t, struct page **); +void hfi1_release_user_pages(struct page **, size_t); + +static inline void clear_rcvhdrtail(const struct hfi1_ctxtdata *rcd) +{ + *((u64 *) rcd->rcvhdrtail_kvaddr) = 0ULL; +} + +static inline u32 get_rcvhdrtail(const struct hfi1_ctxtdata *rcd) +{ + /* + * volatile because it's a DMA target from the chip, routine is + * inlined, and don't want register caching or reordering. + */ + return (u32) le64_to_cpu(*rcd->rcvhdrtail_kvaddr); +} + +/* + * sysfs interface. + */ + +extern const char ib_hfi1_version[]; + +int hfi1_device_create(struct hfi1_devdata *); +void hfi1_device_remove(struct hfi1_devdata *); + +int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num, + struct kobject *kobj); +int hfi1_verbs_register_sysfs(struct hfi1_devdata *); +void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *); +/* Hook for sysfs read of QSFP */ +int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len); + +int hfi1_pcie_init(struct pci_dev *, const struct pci_device_id *); +void hfi1_pcie_cleanup(struct pci_dev *); +int hfi1_pcie_ddinit(struct hfi1_devdata *, struct pci_dev *, + const struct pci_device_id *); +void hfi1_pcie_ddcleanup(struct hfi1_devdata *); +void hfi1_pcie_flr(struct hfi1_devdata *); +int pcie_speeds(struct hfi1_devdata *); +void request_msix(struct hfi1_devdata *, u32 *, struct hfi1_msix_entry *); +void hfi1_enable_intx(struct pci_dev *); +void hfi1_nomsix(struct hfi1_devdata *); +void restore_pci_variables(struct hfi1_devdata *dd); +int do_pcie_gen3_transition(struct hfi1_devdata *dd); +int parse_platform_config(struct hfi1_devdata *dd); +int get_platform_config_field(struct hfi1_devdata *dd, + enum platform_config_table_type_encoding table_type, + int table_index, int field_index, u32 *data, u32 len); + +dma_addr_t hfi1_map_page(struct pci_dev *, struct page *, unsigned long, + size_t, int); +const char *get_unit_name(int unit); + +/* + * Flush write combining store buffers (if present) and perform a write + * barrier. + */ +static inline void flush_wc(void) +{ + asm volatile("sfence" : : : "memory"); +} + +void handle_eflags(struct hfi1_packet *packet); +int process_receive_ib(struct hfi1_packet *packet); +int process_receive_bypass(struct hfi1_packet *packet); +int process_receive_error(struct hfi1_packet *packet); +int kdeth_process_expected(struct hfi1_packet *packet); +int kdeth_process_eager(struct hfi1_packet *packet); +int process_receive_invalid(struct hfi1_packet *packet); + +extern rhf_rcv_function_ptr snoop_rhf_rcv_functions[8]; + +void update_sge(struct hfi1_sge_state *ss, u32 length); + +/* global module parameter variables */ +extern unsigned int hfi1_max_mtu; +extern unsigned int hfi1_cu; +extern unsigned int user_credit_return_threshold; +extern uint num_rcv_contexts; +extern unsigned n_krcvqs; +extern u8 krcvqs[]; +extern int krcvqsset; +extern uint kdeth_qp; +extern uint loopback; +extern uint quick_linkup; +extern uint rcv_intr_timeout; +extern uint rcv_intr_count; +extern uint rcv_intr_dynamic; +extern ushort link_crc_mask; + +extern struct mutex hfi1_mutex; + +/* Number of seconds before our card status check... */ +#define STATUS_TIMEOUT 60 + +#define DRIVER_NAME "hfi1" +#define HFI1_USER_MINOR_BASE 0 +#define HFI1_TRACE_MINOR 127 +#define HFI1_DIAGPKT_MINOR 128 +#define HFI1_DIAG_MINOR_BASE 129 +#define HFI1_SNOOP_CAPTURE_BASE 200 +#define HFI1_NMINORS 255 + +#define PCI_VENDOR_ID_INTEL 0x8086 +#define PCI_DEVICE_ID_INTEL0 0x24f0 +#define PCI_DEVICE_ID_INTEL1 0x24f1 + +#define HFI1_PKT_USER_SC_INTEGRITY \ + (SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK \ + | SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK \ + | SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK) + +#define HFI1_PKT_KERNEL_SC_INTEGRITY \ + (SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK) + +static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd, + u16 ctxt_type) +{ + u64 base_sc_integrity = + SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK; + + if (ctxt_type == SC_USER) + base_sc_integrity |= HFI1_PKT_USER_SC_INTEGRITY; + else + base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY; + + if (is_a0(dd)) + /* turn off send-side job key checks - A0 erratum */ + return base_sc_integrity & + ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; + return base_sc_integrity; +} + +static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd) +{ + u64 base_sdma_integrity = + SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK; + + if (is_a0(dd)) + /* turn off send-side job key checks - A0 erratum */ + return base_sdma_integrity & + ~SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; + return base_sdma_integrity; +} + +/* + * hfi1_early_err is used (only!) to print early errors before devdata is + * allocated, or when dd->pcidev may not be valid, and at the tail end of + * cleanup when devdata may have been freed, etc. hfi1_dev_porterr is + * the same as dd_dev_err, but is used when the message really needs + * the IB port# to be definitive as to what's happening.. + */ +#define hfi1_early_err(dev, fmt, ...) \ + dev_err(dev, fmt, ##__VA_ARGS__) + +#define hfi1_early_info(dev, fmt, ...) \ + dev_info(dev, fmt, ##__VA_ARGS__) + +#define dd_dev_emerg(dd, fmt, ...) \ + dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \ + get_unit_name((dd)->unit), ##__VA_ARGS__) +#define dd_dev_err(dd, fmt, ...) \ + dev_err(&(dd)->pcidev->dev, "%s: " fmt, \ + get_unit_name((dd)->unit), ##__VA_ARGS__) +#define dd_dev_warn(dd, fmt, ...) \ + dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \ + get_unit_name((dd)->unit), ##__VA_ARGS__) + +#define dd_dev_warn_ratelimited(dd, fmt, ...) \ + dev_warn_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \ + get_unit_name((dd)->unit), ##__VA_ARGS__) + +#define dd_dev_info(dd, fmt, ...) \ + dev_info(&(dd)->pcidev->dev, "%s: " fmt, \ + get_unit_name((dd)->unit), ##__VA_ARGS__) + +#define hfi1_dev_porterr(dd, port, fmt, ...) \ + dev_err(&(dd)->pcidev->dev, "%s: IB%u:%u " fmt, \ + get_unit_name((dd)->unit), (dd)->unit, (port), \ + ##__VA_ARGS__) + +/* + * this is used for formatting hw error messages... + */ +struct hfi1_hwerror_msgs { + u64 mask; + const char *msg; + size_t sz; +}; + +/* in intr.c... */ +void hfi1_format_hwerrors(u64 hwerrs, + const struct hfi1_hwerror_msgs *hwerrmsgs, + size_t nhwerrmsgs, char *msg, size_t lmsg); + +#define USER_OPCODE_CHECK_VAL 0xC0 +#define USER_OPCODE_CHECK_MASK 0xC0 +#define OPCODE_CHECK_VAL_DISABLED 0x0 +#define OPCODE_CHECK_MASK_DISABLED 0x0 + +static inline void hfi1_reset_cpu_counters(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + int i; + + dd->z_int_counter = get_all_cpu_total(dd->int_counter); + dd->z_rcv_limit = get_all_cpu_total(dd->rcv_limit); + + ppd = (struct hfi1_pportdata *)(dd + 1); + for (i = 0; i < dd->num_pports; i++, ppd++) { + ppd->ibport_data.z_rc_acks = + get_all_cpu_total(ppd->ibport_data.rc_acks); + ppd->ibport_data.z_rc_qacks = + get_all_cpu_total(ppd->ibport_data.rc_qacks); + } +} + +/* Control LED state */ +static inline void setextled(struct hfi1_devdata *dd, u32 on) +{ + if (on) + write_csr(dd, DCC_CFG_LED_CNTRL, 0x1F); + else + write_csr(dd, DCC_CFG_LED_CNTRL, 0x10); +} + +int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp); + +#endif /* _HFI1_KERNEL_H */ diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/staging/rdma/hfi1/init.c new file mode 100644 index 000000000..a877eda8c --- /dev/null +++ b/drivers/staging/rdma/hfi1/init.c @@ -0,0 +1,1722 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfi.h" +#include "device.h" +#include "common.h" +#include "mad.h" +#include "sdma.h" +#include "debugfs.h" +#include "verbs.h" + +#undef pr_fmt +#define pr_fmt(fmt) DRIVER_NAME ": " fmt + +/* + * min buffers we want to have per context, after driver + */ +#define HFI1_MIN_USER_CTXT_BUFCNT 7 + +#define HFI1_MIN_HDRQ_EGRBUF_CNT 2 +#define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */ +#define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */ + +/* + * Number of user receive contexts we are configured to use (to allow for more + * pio buffers per ctxt, etc.) Zero means use one user context per CPU. + */ +uint num_rcv_contexts; +module_param_named(num_rcv_contexts, num_rcv_contexts, uint, S_IRUGO); +MODULE_PARM_DESC( + num_rcv_contexts, "Set max number of user receive contexts to use"); + +u8 krcvqs[RXE_NUM_DATA_VL]; +int krcvqsset; +module_param_array(krcvqs, byte, &krcvqsset, S_IRUGO); +MODULE_PARM_DESC(krcvqs, "Array of the number of kernel receive queues by VL"); + +/* computed based on above array */ +unsigned n_krcvqs; + +static unsigned hfi1_rcvarr_split = 25; +module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO); +MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers"); + +static uint eager_buffer_size = (2 << 20); /* 2MB */ +module_param(eager_buffer_size, uint, S_IRUGO); +MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 2MB"); + +static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */ +module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO); +MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)"); + +static uint hfi1_hdrq_entsize = 32; +module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, S_IRUGO); +MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B (default), 32 - 128B"); + +unsigned int user_credit_return_threshold = 33; /* default is 33% */ +module_param(user_credit_return_threshold, uint, S_IRUGO); +MODULE_PARM_DESC(user_credit_return_theshold, "Credit return threshold for user send contexts, return when unreturned credits passes this many blocks (in percent of allocated blocks, 0 is off)"); + +static inline u64 encode_rcv_header_entry_size(u16); + +static struct idr hfi1_unit_table; +u32 hfi1_cpulist_count; +unsigned long *hfi1_cpulist; + +/* + * Common code for creating the receive context array. + */ +int hfi1_create_ctxts(struct hfi1_devdata *dd) +{ + unsigned i; + int ret; + int local_node_id = pcibus_to_node(dd->pcidev->bus); + + if (local_node_id < 0) + local_node_id = numa_node_id(); + dd->assigned_node_id = local_node_id; + + dd->rcd = kcalloc(dd->num_rcv_contexts, sizeof(*dd->rcd), GFP_KERNEL); + if (!dd->rcd) { + dd_dev_err(dd, + "Unable to allocate receive context array, failing\n"); + goto nomem; + } + + /* create one or more kernel contexts */ + for (i = 0; i < dd->first_user_ctxt; ++i) { + struct hfi1_pportdata *ppd; + struct hfi1_ctxtdata *rcd; + + ppd = dd->pport + (i % dd->num_pports); + rcd = hfi1_create_ctxtdata(ppd, i); + if (!rcd) { + dd_dev_err(dd, + "Unable to allocate kernel receive context, failing\n"); + goto nomem; + } + /* + * Set up the kernel context flags here and now because they + * use default values for all receive side memories. User + * contexts will be handled as they are created. + */ + rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) | + HFI1_CAP_KGET(NODROP_RHQ_FULL) | + HFI1_CAP_KGET(NODROP_EGR_FULL) | + HFI1_CAP_KGET(DMA_RTAIL); + rcd->seq_cnt = 1; + + rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node); + if (!rcd->sc) { + dd_dev_err(dd, + "Unable to allocate kernel send context, failing\n"); + dd->rcd[rcd->ctxt] = NULL; + hfi1_free_ctxtdata(dd, rcd); + goto nomem; + } + + ret = hfi1_init_ctxt(rcd->sc); + if (ret < 0) { + dd_dev_err(dd, + "Failed to setup kernel receive context, failing\n"); + sc_free(rcd->sc); + dd->rcd[rcd->ctxt] = NULL; + hfi1_free_ctxtdata(dd, rcd); + ret = -EFAULT; + goto bail; + } + } + + return 0; +nomem: + ret = -ENOMEM; +bail: + kfree(dd->rcd); + dd->rcd = NULL; + return ret; +} + +/* + * Common code for user and kernel context setup. + */ +struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt) +{ + struct hfi1_devdata *dd = ppd->dd; + struct hfi1_ctxtdata *rcd; + unsigned kctxt_ngroups = 0; + u32 base; + + if (dd->rcv_entries.nctxt_extra > + dd->num_rcv_contexts - dd->first_user_ctxt) + kctxt_ngroups = (dd->rcv_entries.nctxt_extra - + (dd->num_rcv_contexts - dd->first_user_ctxt)); + rcd = kzalloc(sizeof(*rcd), GFP_KERNEL); + if (rcd) { + u32 rcvtids, max_entries; + + dd_dev_info(dd, "%s: setting up context %u\n", __func__, ctxt); + + INIT_LIST_HEAD(&rcd->qp_wait_list); + rcd->ppd = ppd; + rcd->dd = dd; + rcd->cnt = 1; + rcd->ctxt = ctxt; + dd->rcd[ctxt] = rcd; + rcd->numa_id = numa_node_id(); + rcd->rcv_array_groups = dd->rcv_entries.ngroups; + + spin_lock_init(&rcd->exp_lock); + + /* + * Calculate the context's RcvArray entry starting point. + * We do this here because we have to take into account all + * the RcvArray entries that previous context would have + * taken and we have to account for any extra groups + * assigned to the kernel or user contexts. + */ + if (ctxt < dd->first_user_ctxt) { + if (ctxt < kctxt_ngroups) { + base = ctxt * (dd->rcv_entries.ngroups + 1); + rcd->rcv_array_groups++; + } else + base = kctxt_ngroups + + (ctxt * dd->rcv_entries.ngroups); + } else { + u16 ct = ctxt - dd->first_user_ctxt; + + base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) + + kctxt_ngroups); + if (ct < dd->rcv_entries.nctxt_extra) { + base += ct * (dd->rcv_entries.ngroups + 1); + rcd->rcv_array_groups++; + } else + base += dd->rcv_entries.nctxt_extra + + (ct * dd->rcv_entries.ngroups); + } + rcd->eager_base = base * dd->rcv_entries.group_size; + + /* Validate and initialize Rcv Hdr Q variables */ + if (rcvhdrcnt % HDRQ_INCREMENT) { + dd_dev_err(dd, + "ctxt%u: header queue count %d must be divisible by %d\n", + rcd->ctxt, rcvhdrcnt, HDRQ_INCREMENT); + goto bail; + } + rcd->rcvhdrq_cnt = rcvhdrcnt; + rcd->rcvhdrqentsize = hfi1_hdrq_entsize; + /* + * Simple Eager buffer allocation: we have already pre-allocated + * the number of RcvArray entry groups. Each ctxtdata structure + * holds the number of groups for that context. + * + * To follow CSR requirements and maintain cacheline alignment, + * make sure all sizes and bases are multiples of group_size. + * + * The expected entry count is what is left after assigning + * eager. + */ + max_entries = rcd->rcv_array_groups * + dd->rcv_entries.group_size; + rcvtids = ((max_entries * hfi1_rcvarr_split) / 100); + rcd->egrbufs.count = round_down(rcvtids, + dd->rcv_entries.group_size); + if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) { + dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n", + rcd->ctxt); + rcd->egrbufs.count = MAX_EAGER_ENTRIES; + } + dd_dev_info(dd, "ctxt%u: max Eager buffer RcvArray entries: %u\n", + rcd->ctxt, rcd->egrbufs.count); + + /* + * Allocate array that will hold the eager buffer accounting + * data. + * This will allocate the maximum possible buffer count based + * on the value of the RcvArray split parameter. + * The resulting value will be rounded down to the closest + * multiple of dd->rcv_entries.group_size. + */ + rcd->egrbufs.buffers = kzalloc(sizeof(*rcd->egrbufs.buffers) * + rcd->egrbufs.count, GFP_KERNEL); + if (!rcd->egrbufs.buffers) + goto bail; + rcd->egrbufs.rcvtids = kzalloc(sizeof(*rcd->egrbufs.rcvtids) * + rcd->egrbufs.count, GFP_KERNEL); + if (!rcd->egrbufs.rcvtids) + goto bail; + rcd->egrbufs.size = eager_buffer_size; + /* + * The size of the buffers programmed into the RcvArray + * entries needs to be big enough to handle the highest + * MTU supported. + */ + if (rcd->egrbufs.size < hfi1_max_mtu) { + rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu); + dd_dev_info(dd, + "ctxt%u: eager bufs size too small. Adjusting to %zu\n", + rcd->ctxt, rcd->egrbufs.size); + } + rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE; + + if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */ + rcd->opstats = kzalloc(sizeof(*rcd->opstats), + GFP_KERNEL); + if (!rcd->opstats) { + dd_dev_err(dd, + "ctxt%u: Unable to allocate per ctxt stats buffer\n", + rcd->ctxt); + goto bail; + } + } + } + return rcd; +bail: + kfree(rcd->opstats); + kfree(rcd->egrbufs.rcvtids); + kfree(rcd->egrbufs.buffers); + kfree(rcd); + return NULL; +} + +/* + * Convert a receive header entry size that to the encoding used in the CSR. + * + * Return a zero if the given size is invalid. + */ +static inline u64 encode_rcv_header_entry_size(u16 size) +{ + /* there are only 3 valid receive header entry sizes */ + if (size == 2) + return 1; + if (size == 16) + return 2; + else if (size == 32) + return 4; + return 0; /* invalid */ +} + +/* + * Select the largest ccti value over all SLs to determine the intra- + * packet gap for the link. + * + * called with cca_timer_lock held (to protect access to cca_timer + * array), and rcu_read_lock() (to protect access to cc_state). + */ +void set_link_ipg(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + struct cc_state *cc_state; + int i; + u16 cce, ccti_limit, max_ccti = 0; + u16 shift, mult; + u64 src; + u32 current_egress_rate; /* Mbits /sec */ + u32 max_pkt_time; + /* + * max_pkt_time is the maximum packet egress time in units + * of the fabric clock period 1/(805 MHz). + */ + + cc_state = get_cc_state(ppd); + + if (cc_state == NULL) + /* + * This should _never_ happen - rcu_read_lock() is held, + * and set_link_ipg() should not be called if cc_state + * is NULL. + */ + return; + + for (i = 0; i < OPA_MAX_SLS; i++) { + u16 ccti = ppd->cca_timer[i].ccti; + + if (ccti > max_ccti) + max_ccti = ccti; + } + + ccti_limit = cc_state->cct.ccti_limit; + if (max_ccti > ccti_limit) + max_ccti = ccti_limit; + + cce = cc_state->cct.entries[max_ccti].entry; + shift = (cce & 0xc000) >> 14; + mult = (cce & 0x3fff); + + current_egress_rate = active_egress_rate(ppd); + + max_pkt_time = egress_cycles(ppd->ibmaxlen, current_egress_rate); + + src = (max_pkt_time >> shift) * mult; + + src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK; + src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT; + + write_csr(dd, SEND_STATIC_RATE_CONTROL, src); +} + +static enum hrtimer_restart cca_timer_fn(struct hrtimer *t) +{ + struct cca_timer *cca_timer; + struct hfi1_pportdata *ppd; + int sl; + u16 ccti, ccti_timer, ccti_min; + struct cc_state *cc_state; + + cca_timer = container_of(t, struct cca_timer, hrtimer); + ppd = cca_timer->ppd; + sl = cca_timer->sl; + + rcu_read_lock(); + + cc_state = get_cc_state(ppd); + + if (cc_state == NULL) { + rcu_read_unlock(); + return HRTIMER_NORESTART; + } + + /* + * 1) decrement ccti for SL + * 2) calculate IPG for link (set_link_ipg()) + * 3) restart timer, unless ccti is at min value + */ + + ccti_min = cc_state->cong_setting.entries[sl].ccti_min; + ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer; + + spin_lock(&ppd->cca_timer_lock); + + ccti = cca_timer->ccti; + + if (ccti > ccti_min) { + cca_timer->ccti--; + set_link_ipg(ppd); + } + + spin_unlock(&ppd->cca_timer_lock); + + rcu_read_unlock(); + + if (ccti > ccti_min) { + unsigned long nsec = 1024 * ccti_timer; + /* ccti_timer is in units of 1.024 usec */ + hrtimer_forward_now(t, ns_to_ktime(nsec)); + return HRTIMER_RESTART; + } + return HRTIMER_NORESTART; +} + +/* + * Common code for initializing the physical port structure. + */ +void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, + struct hfi1_devdata *dd, u8 hw_pidx, u8 port) +{ + int i, size; + uint default_pkey_idx; + + ppd->dd = dd; + ppd->hw_pidx = hw_pidx; + ppd->port = port; /* IB port number, not index */ + + default_pkey_idx = 1; + + ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY; + if (loopback) { + hfi1_early_err(&pdev->dev, + "Faking data partition 0x8001 in idx %u\n", + !default_pkey_idx); + ppd->pkeys[!default_pkey_idx] = 0x8001; + } + + INIT_WORK(&ppd->link_vc_work, handle_verify_cap); + INIT_WORK(&ppd->link_up_work, handle_link_up); + INIT_WORK(&ppd->link_down_work, handle_link_down); + INIT_WORK(&ppd->freeze_work, handle_freeze); + INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade); + INIT_WORK(&ppd->sma_message_work, handle_sma_message); + INIT_WORK(&ppd->link_bounce_work, handle_link_bounce); + mutex_init(&ppd->hls_lock); + spin_lock_init(&ppd->sdma_alllock); + spin_lock_init(&ppd->qsfp_info.qsfp_lock); + + ppd->sm_trap_qp = 0x0; + ppd->sa_qp = 0x1; + + ppd->hfi1_wq = NULL; + + spin_lock_init(&ppd->cca_timer_lock); + + for (i = 0; i < OPA_MAX_SLS; i++) { + hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + ppd->cca_timer[i].ppd = ppd; + ppd->cca_timer[i].sl = i; + ppd->cca_timer[i].ccti = 0; + ppd->cca_timer[i].hrtimer.function = cca_timer_fn; + } + + ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT; + + spin_lock_init(&ppd->cc_state_lock); + spin_lock_init(&ppd->cc_log_lock); + size = sizeof(struct cc_state); + RCU_INIT_POINTER(ppd->cc_state, kzalloc(size, GFP_KERNEL)); + if (!rcu_dereference(ppd->cc_state)) + goto bail; + return; + +bail: + + hfi1_early_err(&pdev->dev, + "Congestion Control Agent disabled for port %d\n", port); +} + +/* + * Do initialization for device that is only needed on + * first detect, not on resets. + */ +static int loadtime_init(struct hfi1_devdata *dd) +{ + return 0; +} + +/** + * init_after_reset - re-initialize after a reset + * @dd: the hfi1_ib device + * + * sanity check at least some of the values after reset, and + * ensure no receive or transmit (explicitly, in case reset + * failed + */ +static int init_after_reset(struct hfi1_devdata *dd) +{ + int i; + + /* + * Ensure chip does no sends or receives, tail updates, or + * pioavail updates while we re-initialize. This is mostly + * for the driver data structures, not chip registers. + */ + for (i = 0; i < dd->num_rcv_contexts; i++) + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS | + HFI1_RCVCTRL_INTRAVAIL_DIS | + HFI1_RCVCTRL_TAILUPD_DIS, i); + pio_send_control(dd, PSC_GLOBAL_DISABLE); + for (i = 0; i < dd->num_send_contexts; i++) + sc_disable(dd->send_contexts[i].sc); + + return 0; +} + +static void enable_chip(struct hfi1_devdata *dd) +{ + u32 rcvmask; + u32 i; + + /* enable PIO send */ + pio_send_control(dd, PSC_GLOBAL_ENABLE); + + /* + * Enable kernel ctxts' receive and receive interrupt. + * Other ctxts done as user opens and initializes them. + */ + rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB; + for (i = 0; i < dd->first_user_ctxt; ++i) { + rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ? + HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS; + if (!HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, MULTI_PKT_EGR)) + rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB; + if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_RHQ_FULL)) + rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; + if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_EGR_FULL)) + rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; + hfi1_rcvctrl(dd, rcvmask, i); + sc_enable(dd->rcd[i]->sc); + } +} + +/** + * create_workqueues - create per port workqueues + * @dd: the hfi1_ib device + */ +static int create_workqueues(struct hfi1_devdata *dd) +{ + int pidx; + struct hfi1_pportdata *ppd; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (!ppd->hfi1_wq) { + char wq_name[8]; /* 3 + 2 + 1 + 1 + 1 */ + + snprintf(wq_name, sizeof(wq_name), "hfi%d_%d", + dd->unit, pidx); + ppd->hfi1_wq = + create_singlethread_workqueue(wq_name); + if (!ppd->hfi1_wq) + goto wq_error; + } + } + return 0; +wq_error: + pr_err("create_singlethread_workqueue failed for port %d\n", + pidx + 1); + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->hfi1_wq) { + destroy_workqueue(ppd->hfi1_wq); + ppd->hfi1_wq = NULL; + } + } + return -ENOMEM; +} + +/** + * hfi1_init - do the actual initialization sequence on the chip + * @dd: the hfi1_ib device + * @reinit: re-initializing, so don't allocate new memory + * + * Do the actual initialization sequence on the chip. This is done + * both from the init routine called from the PCI infrastructure, and + * when we reset the chip, or detect that it was reset internally, + * or it's administratively re-enabled. + * + * Memory allocation here and in called routines is only done in + * the first case (reinit == 0). We have to be careful, because even + * without memory allocation, we need to re-write all the chip registers + * TIDs, etc. after the reset or enable has completed. + */ +int hfi1_init(struct hfi1_devdata *dd, int reinit) +{ + int ret = 0, pidx, lastfail = 0; + unsigned i, len; + struct hfi1_ctxtdata *rcd; + struct hfi1_pportdata *ppd; + + /* Set up recv low level handlers */ + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EXPECTED] = + kdeth_process_expected; + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EAGER] = + kdeth_process_eager; + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_IB] = process_receive_ib; + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_ERROR] = + process_receive_error; + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_BYPASS] = + process_receive_bypass; + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID5] = + process_receive_invalid; + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID6] = + process_receive_invalid; + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID7] = + process_receive_invalid; + dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions; + + /* Set up send low level handlers */ + dd->process_pio_send = hfi1_verbs_send_pio; + dd->process_dma_send = hfi1_verbs_send_dma; + dd->pio_inline_send = pio_copy; + + if (is_a0(dd)) { + atomic_set(&dd->drop_packet, DROP_PACKET_ON); + dd->do_drop = 1; + } else { + atomic_set(&dd->drop_packet, DROP_PACKET_OFF); + dd->do_drop = 0; + } + + /* make sure the link is not "up" */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + ppd->linkup = 0; + } + + if (reinit) + ret = init_after_reset(dd); + else + ret = loadtime_init(dd); + if (ret) + goto done; + + /* dd->rcd can be NULL if early initialization failed */ + for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) { + /* + * Set up the (kernel) rcvhdr queue and egr TIDs. If doing + * re-init, the simplest way to handle this is to free + * existing, and re-allocate. + * Need to re-create rest of ctxt 0 ctxtdata as well. + */ + rcd = dd->rcd[i]; + if (!rcd) + continue; + + rcd->do_interrupt = &handle_receive_interrupt; + + lastfail = hfi1_create_rcvhdrq(dd, rcd); + if (!lastfail) + lastfail = hfi1_setup_eagerbufs(rcd); + if (lastfail) + dd_dev_err(dd, + "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); + } + if (lastfail) + ret = lastfail; + + /* Allocate enough memory for user event notification. */ + len = ALIGN(dd->chip_rcv_contexts * HFI1_MAX_SHARED_CTXTS * + sizeof(*dd->events), PAGE_SIZE); + dd->events = vmalloc_user(len); + if (!dd->events) + dd_dev_err(dd, "Failed to allocate user events page\n"); + /* + * Allocate a page for device and port status. + * Page will be shared amongst all user processes. + */ + dd->status = vmalloc_user(PAGE_SIZE); + if (!dd->status) + dd_dev_err(dd, "Failed to allocate dev status page\n"); + else + dd->freezelen = PAGE_SIZE - (sizeof(*dd->status) - + sizeof(dd->status->freezemsg)); + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (dd->status) + /* Currently, we only have one port */ + ppd->statusp = &dd->status->port; + + set_mtu(ppd); + } + + /* enable chip even if we have an error, so we can debug cause */ + enable_chip(dd); + + ret = hfi1_cq_init(dd); +done: + /* + * Set status even if port serdes is not initialized + * so that diags will work. + */ + if (dd->status) + dd->status->dev |= HFI1_STATUS_CHIP_PRESENT | + HFI1_STATUS_INITTED; + if (!ret) { + /* enable all interrupts from the chip */ + set_intr_state(dd, 1); + + /* chip is OK for user apps; mark it as initialized */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + /* initialize the qsfp if it exists + * Requires interrupts to be enabled so we are notified + * when the QSFP completes reset, and has + * to be done before bringing up the SERDES + */ + init_qsfp(ppd); + + /* start the serdes - must be after interrupts are + enabled so we are notified when the link goes up */ + lastfail = bringup_serdes(ppd); + if (lastfail) + dd_dev_info(dd, + "Failed to bring up port %u\n", + ppd->port); + + /* + * Set status even if port serdes is not initialized + * so that diags will work. + */ + if (ppd->statusp) + *ppd->statusp |= HFI1_STATUS_CHIP_PRESENT | + HFI1_STATUS_INITTED; + if (!ppd->link_speed_enabled) + continue; + } + } + + /* if ret is non-zero, we probably should do some cleanup here... */ + return ret; +} + +static inline struct hfi1_devdata *__hfi1_lookup(int unit) +{ + return idr_find(&hfi1_unit_table, unit); +} + +struct hfi1_devdata *hfi1_lookup(int unit) +{ + struct hfi1_devdata *dd; + unsigned long flags; + + spin_lock_irqsave(&hfi1_devs_lock, flags); + dd = __hfi1_lookup(unit); + spin_unlock_irqrestore(&hfi1_devs_lock, flags); + + return dd; +} + +/* + * Stop the timers during unit shutdown, or after an error late + * in initialization. + */ +static void stop_timers(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + int pidx; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->led_override_timer.data) { + del_timer_sync(&ppd->led_override_timer); + atomic_set(&ppd->led_override_timer_active, 0); + } + } +} + +/** + * shutdown_device - shut down a device + * @dd: the hfi1_ib device + * + * This is called to make the device quiet when we are about to + * unload the driver, and also when the device is administratively + * disabled. It does not free any data structures. + * Everything it does has to be setup again by hfi1_init(dd, 1) + */ +static void shutdown_device(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + unsigned pidx; + int i; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + ppd->linkup = 0; + if (ppd->statusp) + *ppd->statusp &= ~(HFI1_STATUS_IB_CONF | + HFI1_STATUS_IB_READY); + } + dd->flags &= ~HFI1_INITTED; + + /* mask interrupts, but not errors */ + set_intr_state(dd, 0); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + for (i = 0; i < dd->num_rcv_contexts; i++) + hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS | + HFI1_RCVCTRL_CTXT_DIS | + HFI1_RCVCTRL_INTRAVAIL_DIS | + HFI1_RCVCTRL_PKEY_DIS | + HFI1_RCVCTRL_ONE_PKT_EGR_DIS, i); + /* + * Gracefully stop all sends allowing any in progress to + * trickle out first. + */ + for (i = 0; i < dd->num_send_contexts; i++) + sc_flush(dd->send_contexts[i].sc); + } + + /* + * Enough for anything that's going to trickle out to have actually + * done so. + */ + udelay(20); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + /* disable all contexts */ + for (i = 0; i < dd->num_send_contexts; i++) + sc_disable(dd->send_contexts[i].sc); + /* disable the send device */ + pio_send_control(dd, PSC_GLOBAL_DISABLE); + + /* + * Clear SerdesEnable. + * We can't count on interrupts since we are stopping. + */ + hfi1_quiet_serdes(ppd); + + if (ppd->hfi1_wq) { + destroy_workqueue(ppd->hfi1_wq); + ppd->hfi1_wq = NULL; + } + } + sdma_exit(dd); +} + +/** + * hfi1_free_ctxtdata - free a context's allocated data + * @dd: the hfi1_ib device + * @rcd: the ctxtdata structure + * + * free up any allocated data for a context + * This should not touch anything that would affect a simultaneous + * re-allocation of context data, because it is called after hfi1_mutex + * is released (and can be called from reinit as well). + * It should never change any chip state, or global driver state. + */ +void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) +{ + unsigned e; + + if (!rcd) + return; + + if (rcd->rcvhdrq) { + dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size, + rcd->rcvhdrq, rcd->rcvhdrq_phys); + rcd->rcvhdrq = NULL; + if (rcd->rcvhdrtail_kvaddr) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + (void *)rcd->rcvhdrtail_kvaddr, + rcd->rcvhdrqtailaddr_phys); + rcd->rcvhdrtail_kvaddr = NULL; + } + } + + /* all the RcvArray entries should have been cleared by now */ + kfree(rcd->egrbufs.rcvtids); + + for (e = 0; e < rcd->egrbufs.alloced; e++) { + if (rcd->egrbufs.buffers[e].phys) + dma_free_coherent(&dd->pcidev->dev, + rcd->egrbufs.buffers[e].len, + rcd->egrbufs.buffers[e].addr, + rcd->egrbufs.buffers[e].phys); + } + kfree(rcd->egrbufs.buffers); + + sc_free(rcd->sc); + vfree(rcd->physshadow); + vfree(rcd->tid_pg_list); + vfree(rcd->user_event_mask); + vfree(rcd->subctxt_uregbase); + vfree(rcd->subctxt_rcvegrbuf); + vfree(rcd->subctxt_rcvhdr_base); + kfree(rcd->tidusemap); + kfree(rcd->opstats); + kfree(rcd); +} + +void hfi1_free_devdata(struct hfi1_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&hfi1_devs_lock, flags); + idr_remove(&hfi1_unit_table, dd->unit); + list_del(&dd->list); + spin_unlock_irqrestore(&hfi1_devs_lock, flags); + hfi1_dbg_ibdev_exit(&dd->verbs_dev); + rcu_barrier(); /* wait for rcu callbacks to complete */ + free_percpu(dd->int_counter); + free_percpu(dd->rcv_limit); + ib_dealloc_device(&dd->verbs_dev.ibdev); +} + +/* + * Allocate our primary per-unit data structure. Must be done via verbs + * allocator, because the verbs cleanup process both does cleanup and + * free of the data structure. + * "extra" is for chip-specific data. + * + * Use the idr mechanism to get a unit number for this unit. + */ +struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) +{ + unsigned long flags; + struct hfi1_devdata *dd; + int ret; + + dd = (struct hfi1_devdata *)ib_alloc_device(sizeof(*dd) + extra); + if (!dd) + return ERR_PTR(-ENOMEM); + /* extra is * number of ports */ + dd->num_pports = extra / sizeof(struct hfi1_pportdata); + dd->pport = (struct hfi1_pportdata *)(dd + 1); + + INIT_LIST_HEAD(&dd->list); + dd->node = dev_to_node(&pdev->dev); + if (dd->node < 0) + dd->node = 0; + idr_preload(GFP_KERNEL); + spin_lock_irqsave(&hfi1_devs_lock, flags); + + ret = idr_alloc(&hfi1_unit_table, dd, 0, 0, GFP_NOWAIT); + if (ret >= 0) { + dd->unit = ret; + list_add(&dd->list, &hfi1_dev_list); + } + + spin_unlock_irqrestore(&hfi1_devs_lock, flags); + idr_preload_end(); + + if (ret < 0) { + hfi1_early_err(&pdev->dev, + "Could not allocate unit ID: error %d\n", -ret); + goto bail; + } + /* + * Initialize all locks for the device. This needs to be as early as + * possible so locks are usable. + */ + spin_lock_init(&dd->sc_lock); + spin_lock_init(&dd->sendctrl_lock); + spin_lock_init(&dd->rcvctrl_lock); + spin_lock_init(&dd->uctxt_lock); + spin_lock_init(&dd->hfi1_diag_trans_lock); + spin_lock_init(&dd->sc_init_lock); + spin_lock_init(&dd->dc8051_lock); + spin_lock_init(&dd->dc8051_memlock); + mutex_init(&dd->qsfp_i2c_mutex); + seqlock_init(&dd->sc2vl_lock); + spin_lock_init(&dd->sde_map_lock); + init_waitqueue_head(&dd->event_queue); + + dd->int_counter = alloc_percpu(u64); + if (!dd->int_counter) { + ret = -ENOMEM; + hfi1_early_err(&pdev->dev, + "Could not allocate per-cpu int_counter\n"); + goto bail; + } + + dd->rcv_limit = alloc_percpu(u64); + if (!dd->rcv_limit) { + ret = -ENOMEM; + hfi1_early_err(&pdev->dev, + "Could not allocate per-cpu rcv_limit\n"); + goto bail; + } + + if (!hfi1_cpulist_count) { + u32 count = num_online_cpus(); + + hfi1_cpulist = kzalloc(BITS_TO_LONGS(count) * + sizeof(long), GFP_KERNEL); + if (hfi1_cpulist) + hfi1_cpulist_count = count; + else + hfi1_early_err( + &pdev->dev, + "Could not alloc cpulist info, cpu affinity might be wrong\n"); + } + hfi1_dbg_ibdev_init(&dd->verbs_dev); + return dd; + +bail: + if (!list_empty(&dd->list)) + list_del_init(&dd->list); + ib_dealloc_device(&dd->verbs_dev.ibdev); + return ERR_PTR(ret); +} + +/* + * Called from freeze mode handlers, and from PCI error + * reporting code. Should be paranoid about state of + * system and data structures. + */ +void hfi1_disable_after_error(struct hfi1_devdata *dd) +{ + if (dd->flags & HFI1_INITTED) { + u32 pidx; + + dd->flags &= ~HFI1_INITTED; + if (dd->pport) + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + struct hfi1_pportdata *ppd; + + ppd = dd->pport + pidx; + if (dd->flags & HFI1_PRESENT) + set_link_state(ppd, HLS_DN_DISABLE); + + if (ppd->statusp) + *ppd->statusp &= ~HFI1_STATUS_IB_READY; + } + } + + /* + * Mark as having had an error for driver, and also + * for /sys and status word mapped to user programs. + * This marks unit as not usable, until reset. + */ + if (dd->status) + dd->status->dev |= HFI1_STATUS_HWERROR; +} + +static void remove_one(struct pci_dev *); +static int init_one(struct pci_dev *, const struct pci_device_id *); + +#define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: " +#define PFX DRIVER_NAME ": " + +static const struct pci_device_id hfi1_pci_tbl[] = { + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) }, + { 0, } +}; + +MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl); + +static struct pci_driver hfi1_pci_driver = { + .name = DRIVER_NAME, + .probe = init_one, + .remove = remove_one, + .id_table = hfi1_pci_tbl, + .err_handler = &hfi1_pci_err_handler, +}; + +static void __init compute_krcvqs(void) +{ + int i; + + for (i = 0; i < krcvqsset; i++) + n_krcvqs += krcvqs[i]; +} + +/* + * Do all the generic driver unit- and chip-independent memory + * allocation and initialization. + */ +static int __init hfi1_mod_init(void) +{ + int ret; + + ret = dev_init(); + if (ret) + goto bail; + + /* validate max MTU before any devices start */ + if (!valid_opa_max_mtu(hfi1_max_mtu)) { + pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n", + hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU); + hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU; + } + /* valid CUs run from 1-128 in powers of 2 */ + if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu)) + hfi1_cu = 1; + /* valid credit return threshold is 0-100, variable is unsigned */ + if (user_credit_return_threshold > 100) + user_credit_return_threshold = 100; + + compute_krcvqs(); + /* sanitize receive interrupt count, time must wait until after + the hardware type is known */ + if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK) + rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK; + /* reject invalid combinations */ + if (rcv_intr_count == 0 && rcv_intr_timeout == 0) { + pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n"); + rcv_intr_count = 1; + } + if (rcv_intr_count > 1 && rcv_intr_timeout == 0) { + /* + * Avoid indefinite packet delivery by requiring a timeout + * if count is > 1. + */ + pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n"); + rcv_intr_timeout = 1; + } + if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) { + /* + * The dynamic algorithm expects a non-zero timeout + * and a count > 1. + */ + pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n"); + rcv_intr_dynamic = 0; + } + + /* sanitize link CRC options */ + link_crc_mask &= SUPPORTED_CRCS; + + /* + * These must be called before the driver is registered with + * the PCI subsystem. + */ + idr_init(&hfi1_unit_table); + + hfi1_dbg_init(); + ret = pci_register_driver(&hfi1_pci_driver); + if (ret < 0) { + pr_err("Unable to register driver: error %d\n", -ret); + goto bail_dev; + } + goto bail; /* all OK */ + +bail_dev: + hfi1_dbg_exit(); + idr_destroy(&hfi1_unit_table); + dev_cleanup(); +bail: + return ret; +} + +module_init(hfi1_mod_init); + +/* + * Do the non-unit driver cleanup, memory free, etc. at unload. + */ +static void __exit hfi1_mod_cleanup(void) +{ + pci_unregister_driver(&hfi1_pci_driver); + hfi1_dbg_exit(); + hfi1_cpulist_count = 0; + kfree(hfi1_cpulist); + + idr_destroy(&hfi1_unit_table); + dispose_firmware(); /* asymmetric with obtain_firmware() */ + dev_cleanup(); +} + +module_exit(hfi1_mod_cleanup); + +/* this can only be called after a successful initialization */ +static void cleanup_device_data(struct hfi1_devdata *dd) +{ + int ctxt; + int pidx; + struct hfi1_ctxtdata **tmp; + unsigned long flags; + + /* users can't do anything more with chip */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + struct hfi1_pportdata *ppd = &dd->pport[pidx]; + struct cc_state *cc_state; + int i; + + if (ppd->statusp) + *ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT; + + for (i = 0; i < OPA_MAX_SLS; i++) + hrtimer_cancel(&ppd->cca_timer[i].hrtimer); + + spin_lock(&ppd->cc_state_lock); + cc_state = get_cc_state(ppd); + rcu_assign_pointer(ppd->cc_state, NULL); + spin_unlock(&ppd->cc_state_lock); + + if (cc_state) + call_rcu(&cc_state->rcu, cc_state_reclaim); + } + + free_credit_return(dd); + + /* + * Free any resources still in use (usually just kernel contexts) + * at unload; we do for ctxtcnt, because that's what we allocate. + * We acquire lock to be really paranoid that rcd isn't being + * accessed from some interrupt-related code (that should not happen, + * but best to be sure). + */ + spin_lock_irqsave(&dd->uctxt_lock, flags); + tmp = dd->rcd; + dd->rcd = NULL; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + for (ctxt = 0; tmp && ctxt < dd->num_rcv_contexts; ctxt++) { + struct hfi1_ctxtdata *rcd = tmp[ctxt]; + + tmp[ctxt] = NULL; /* debugging paranoia */ + if (rcd) { + hfi1_clear_tids(rcd); + hfi1_free_ctxtdata(dd, rcd); + } + } + kfree(tmp); + /* must follow rcv context free - need to remove rcv's hooks */ + for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++) + sc_free(dd->send_contexts[ctxt].sc); + dd->num_send_contexts = 0; + kfree(dd->send_contexts); + dd->send_contexts = NULL; + kfree(dd->boardname); + vfree(dd->events); + vfree(dd->status); + hfi1_cq_exit(dd); +} + +/* + * Clean up on unit shutdown, or error during unit load after + * successful initialization. + */ +static void postinit_cleanup(struct hfi1_devdata *dd) +{ + hfi1_start_cleanup(dd); + + hfi1_pcie_ddcleanup(dd); + hfi1_pcie_cleanup(dd->pcidev); + + cleanup_device_data(dd); + + hfi1_free_devdata(dd); +} + +static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int ret = 0, j, pidx, initfail; + struct hfi1_devdata *dd = NULL; + + /* First, lock the non-writable module parameters */ + HFI1_CAP_LOCK(); + + /* Validate some global module parameters */ + if (rcvhdrcnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) { + hfi1_early_err(&pdev->dev, "Header queue count too small\n"); + ret = -EINVAL; + goto bail; + } + /* use the encoding function as a sanitization check */ + if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) { + hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n", + hfi1_hdrq_entsize); + goto bail; + } + + /* The receive eager buffer size must be set before the receive + * contexts are created. + * + * Set the eager buffer size. Validate that it falls in a range + * allowed by the hardware - all powers of 2 between the min and + * max. The maximum valid MTU is within the eager buffer range + * so we do not need to cap the max_mtu by an eager buffer size + * setting. + */ + if (eager_buffer_size) { + if (!is_power_of_2(eager_buffer_size)) + eager_buffer_size = + roundup_pow_of_two(eager_buffer_size); + eager_buffer_size = + clamp_val(eager_buffer_size, + MIN_EAGER_BUFFER * 8, + MAX_EAGER_BUFFER_TOTAL); + hfi1_early_info(&pdev->dev, "Eager buffer size %u\n", + eager_buffer_size); + } else { + hfi1_early_err(&pdev->dev, "Invalid Eager buffer size of 0\n"); + ret = -EINVAL; + goto bail; + } + + /* restrict value of hfi1_rcvarr_split */ + hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100); + + ret = hfi1_pcie_init(pdev, ent); + if (ret) + goto bail; + + /* + * Do device-specific initialization, function table setup, dd + * allocation, etc. + */ + switch (ent->device) { + case PCI_DEVICE_ID_INTEL0: + case PCI_DEVICE_ID_INTEL1: + dd = hfi1_init_dd(pdev, ent); + break; + default: + hfi1_early_err(&pdev->dev, + "Failing on unknown Intel deviceid 0x%x\n", + ent->device); + ret = -ENODEV; + } + + if (IS_ERR(dd)) + ret = PTR_ERR(dd); + if (ret) + goto clean_bail; /* error already printed */ + + ret = create_workqueues(dd); + if (ret) + goto clean_bail; + + /* do the generic initialization */ + initfail = hfi1_init(dd, 0); + + ret = hfi1_register_ib_device(dd); + + /* + * Now ready for use. this should be cleared whenever we + * detect a reset, or initiate one. If earlier failure, + * we still create devices, so diags, etc. can be used + * to determine cause of problem. + */ + if (!initfail && !ret) + dd->flags |= HFI1_INITTED; + + j = hfi1_device_create(dd); + if (j) + dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j); + + if (initfail || ret) { + stop_timers(dd); + flush_workqueue(ib_wq); + for (pidx = 0; pidx < dd->num_pports; ++pidx) + hfi1_quiet_serdes(dd->pport + pidx); + if (!j) + hfi1_device_remove(dd); + if (!ret) + hfi1_unregister_ib_device(dd); + postinit_cleanup(dd); + if (initfail) + ret = initfail; + goto bail; /* everything already cleaned */ + } + + sdma_start(dd); + + return 0; + +clean_bail: + hfi1_pcie_cleanup(pdev); +bail: + return ret; +} + +static void remove_one(struct pci_dev *pdev) +{ + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + + /* unregister from IB core */ + hfi1_unregister_ib_device(dd); + + /* + * Disable the IB link, disable interrupts on the device, + * clear dma engines, etc. + */ + shutdown_device(dd); + + stop_timers(dd); + + /* wait until all of our (qsfp) queue_work() calls complete */ + flush_workqueue(ib_wq); + + hfi1_device_remove(dd); + + postinit_cleanup(dd); +} + +/** + * hfi1_create_rcvhdrq - create a receive header queue + * @dd: the hfi1_ib device + * @rcd: the context data + * + * This must be contiguous memory (from an i/o perspective), and must be + * DMA'able (which means for some systems, it will go through an IOMMU, + * or be forced into a low address range). + */ +int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) +{ + unsigned amt; + u64 reg; + + if (!rcd->rcvhdrq) { + dma_addr_t phys_hdrqtail; + gfp_t gfp_flags; + + /* + * rcvhdrqentsize is in DWs, so we have to convert to bytes + * (* sizeof(u32)). + */ + amt = ALIGN(rcd->rcvhdrq_cnt * rcd->rcvhdrqentsize * + sizeof(u32), PAGE_SIZE); + + gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ? + GFP_USER : GFP_KERNEL; + rcd->rcvhdrq = dma_zalloc_coherent( + &dd->pcidev->dev, amt, &rcd->rcvhdrq_phys, + gfp_flags | __GFP_COMP); + + if (!rcd->rcvhdrq) { + dd_dev_err(dd, + "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n", + amt, rcd->ctxt); + goto bail; + } + + /* Event mask is per device now and is in hfi1_devdata */ + /*if (rcd->ctxt >= dd->first_user_ctxt) { + rcd->user_event_mask = vmalloc_user(PAGE_SIZE); + if (!rcd->user_event_mask) + goto bail_free_hdrq; + }*/ + + if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) { + rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent( + &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail, + gfp_flags); + if (!rcd->rcvhdrtail_kvaddr) + goto bail_free; + rcd->rcvhdrqtailaddr_phys = phys_hdrqtail; + } + + rcd->rcvhdrq_size = amt; + } + /* + * These values are per-context: + * RcvHdrCnt + * RcvHdrEntSize + * RcvHdrSize + */ + reg = ((u64)(rcd->rcvhdrq_cnt >> HDRQ_SIZE_SHIFT) + & RCV_HDR_CNT_CNT_MASK) + << RCV_HDR_CNT_CNT_SHIFT; + write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_CNT, reg); + reg = (encode_rcv_header_entry_size(rcd->rcvhdrqentsize) + & RCV_HDR_ENT_SIZE_ENT_SIZE_MASK) + << RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT; + write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_ENT_SIZE, reg); + reg = (dd->rcvhdrsize & RCV_HDR_SIZE_HDR_SIZE_MASK) + << RCV_HDR_SIZE_HDR_SIZE_SHIFT; + write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_SIZE, reg); + return 0; + +bail_free: + dd_dev_err(dd, + "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n", + rcd->ctxt); + vfree(rcd->user_event_mask); + rcd->user_event_mask = NULL; + dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq, + rcd->rcvhdrq_phys); + rcd->rcvhdrq = NULL; +bail: + return -ENOMEM; +} + +/** + * allocate eager buffers, both kernel and user contexts. + * @rcd: the context we are setting up. + * + * Allocate the eager TID buffers and program them into hip. + * They are no longer completely contiguous, we do multiple allocation + * calls. Otherwise we get the OOM code involved, by asking for too + * much per call, with disastrous results on some kernels. + */ +int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd) +{ + struct hfi1_devdata *dd = rcd->dd; + u32 max_entries, egrtop, alloced_bytes = 0, idx = 0; + gfp_t gfp_flags; + u16 order; + int ret = 0; + u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu); + + /* + * GFP_USER, but without GFP_FS, so buffer cache can be + * coalesced (we hope); otherwise, even at order 4, + * heavy filesystem activity makes these fail, and we can + * use compound pages. + */ + gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP; + + /* + * The minimum size of the eager buffers is a groups of MTU-sized + * buffers. + * The global eager_buffer_size parameter is checked against the + * theoretical lower limit of the value. Here, we check against the + * MTU. + */ + if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size)) + rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size; + /* + * If using one-pkt-per-egr-buffer, lower the eager buffer + * size to the max MTU (page-aligned). + */ + if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) + rcd->egrbufs.rcvtid_size = round_mtu; + + /* + * Eager buffers sizes of 1MB or less require smaller TID sizes + * to satisfy the "multiple of 8 RcvArray entries" requirement. + */ + if (rcd->egrbufs.size <= (1 << 20)) + rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu, + rounddown_pow_of_two(rcd->egrbufs.size / 8)); + + while (alloced_bytes < rcd->egrbufs.size && + rcd->egrbufs.alloced < rcd->egrbufs.count) { + rcd->egrbufs.buffers[idx].addr = + dma_zalloc_coherent(&dd->pcidev->dev, + rcd->egrbufs.rcvtid_size, + &rcd->egrbufs.buffers[idx].phys, + gfp_flags); + if (rcd->egrbufs.buffers[idx].addr) { + rcd->egrbufs.buffers[idx].len = + rcd->egrbufs.rcvtid_size; + rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr = + rcd->egrbufs.buffers[idx].addr; + rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].phys = + rcd->egrbufs.buffers[idx].phys; + rcd->egrbufs.alloced++; + alloced_bytes += rcd->egrbufs.rcvtid_size; + idx++; + } else { + u32 new_size, i, j; + u64 offset = 0; + + /* + * Fail the eager buffer allocation if: + * - we are already using the lowest acceptable size + * - we are using one-pkt-per-egr-buffer (this implies + * that we are accepting only one size) + */ + if (rcd->egrbufs.rcvtid_size == round_mtu || + !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) { + dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n", + rcd->ctxt); + goto bail_rcvegrbuf_phys; + } + + new_size = rcd->egrbufs.rcvtid_size / 2; + + /* + * If the first attempt to allocate memory failed, don't + * fail everything but continue with the next lower + * size. + */ + if (idx == 0) { + rcd->egrbufs.rcvtid_size = new_size; + continue; + } + + /* + * Re-partition already allocated buffers to a smaller + * size. + */ + rcd->egrbufs.alloced = 0; + for (i = 0, j = 0, offset = 0; j < idx; i++) { + if (i >= rcd->egrbufs.count) + break; + rcd->egrbufs.rcvtids[i].phys = + rcd->egrbufs.buffers[j].phys + offset; + rcd->egrbufs.rcvtids[i].addr = + rcd->egrbufs.buffers[j].addr + offset; + rcd->egrbufs.alloced++; + if ((rcd->egrbufs.buffers[j].phys + offset + + new_size) == + (rcd->egrbufs.buffers[j].phys + + rcd->egrbufs.buffers[j].len)) { + j++; + offset = 0; + } else + offset += new_size; + } + rcd->egrbufs.rcvtid_size = new_size; + } + } + rcd->egrbufs.numbufs = idx; + rcd->egrbufs.size = alloced_bytes; + + dd_dev_info(dd, "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n", + rcd->ctxt, rcd->egrbufs.alloced, rcd->egrbufs.rcvtid_size, + rcd->egrbufs.size); + + /* + * Set the contexts rcv array head update threshold to the closest + * power of 2 (so we can use a mask instead of modulo) below half + * the allocated entries. + */ + rcd->egrbufs.threshold = + rounddown_pow_of_two(rcd->egrbufs.alloced / 2); + /* + * Compute the expected RcvArray entry base. This is done after + * allocating the eager buffers in order to maximize the + * expected RcvArray entries for the context. + */ + max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size; + egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size); + rcd->expected_count = max_entries - egrtop; + if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2) + rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2; + + rcd->expected_base = rcd->eager_base + egrtop; + dd_dev_info(dd, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u\n", + rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count, + rcd->eager_base, rcd->expected_base); + + if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) { + dd_dev_err(dd, "ctxt%u: current Eager buffer size is invalid %u\n", + rcd->ctxt, rcd->egrbufs.rcvtid_size); + ret = -EINVAL; + goto bail; + } + + for (idx = 0; idx < rcd->egrbufs.alloced; idx++) { + hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER, + rcd->egrbufs.rcvtids[idx].phys, order); + cond_resched(); + } + goto bail; + +bail_rcvegrbuf_phys: + for (idx = 0; idx < rcd->egrbufs.alloced && + rcd->egrbufs.buffers[idx].addr; + idx++) { + dma_free_coherent(&dd->pcidev->dev, + rcd->egrbufs.buffers[idx].len, + rcd->egrbufs.buffers[idx].addr, + rcd->egrbufs.buffers[idx].phys); + rcd->egrbufs.buffers[idx].addr = NULL; + rcd->egrbufs.buffers[idx].phys = 0; + rcd->egrbufs.buffers[idx].len = 0; + } +bail: + return ret; +} diff --git a/drivers/staging/rdma/hfi1/intr.c b/drivers/staging/rdma/hfi1/intr.c new file mode 100644 index 000000000..426582b9a --- /dev/null +++ b/drivers/staging/rdma/hfi1/intr.c @@ -0,0 +1,207 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include + +#include "hfi.h" +#include "common.h" +#include "sdma.h" + +/** + * format_hwmsg - format a single hwerror message + * @msg message buffer + * @msgl length of message buffer + * @hwmsg message to add to message buffer + */ +static void format_hwmsg(char *msg, size_t msgl, const char *hwmsg) +{ + strlcat(msg, "[", msgl); + strlcat(msg, hwmsg, msgl); + strlcat(msg, "]", msgl); +} + +/** + * hfi1_format_hwerrors - format hardware error messages for display + * @hwerrs hardware errors bit vector + * @hwerrmsgs hardware error descriptions + * @nhwerrmsgs number of hwerrmsgs + * @msg message buffer + * @msgl message buffer length + */ +void hfi1_format_hwerrors(u64 hwerrs, const struct hfi1_hwerror_msgs *hwerrmsgs, + size_t nhwerrmsgs, char *msg, size_t msgl) +{ + int i; + + for (i = 0; i < nhwerrmsgs; i++) + if (hwerrs & hwerrmsgs[i].mask) + format_hwmsg(msg, msgl, hwerrmsgs[i].msg); +} + +static void signal_ib_event(struct hfi1_pportdata *ppd, enum ib_event_type ev) +{ + struct ib_event event; + struct hfi1_devdata *dd = ppd->dd; + + /* + * Only call ib_dispatch_event() if the IB device has been + * registered. HFI1_INITED is set iff the driver has successfully + * registered with the IB core. + */ + if (!(dd->flags & HFI1_INITTED)) + return; + event.device = &dd->verbs_dev.ibdev; + event.element.port_num = ppd->port; + event.event = ev; + ib_dispatch_event(&event); +} + +/* + * Handle a linkup or link down notification. + * This is called outside an interrupt. + */ +void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup) +{ + struct hfi1_pportdata *ppd = &dd->pport[0]; + enum ib_event_type ev; + + if (!(ppd->linkup ^ !!linkup)) + return; /* no change, nothing to do */ + + if (linkup) { + /* + * Quick linkup and all link up on the simulator does not + * trigger or implement: + * - VerifyCap interrupt + * - VerifyCap frames + * But rather moves directly to LinkUp. + * + * Do the work of the VerifyCap interrupt handler, + * handle_verify_cap(), but do not try moving the state to + * LinkUp as we are already there. + * + * NOTE: This uses this device's vAU, vCU, and vl15_init for + * the remote values. Both sides must be using the values. + */ + if (quick_linkup + || dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { + set_up_vl15(dd, dd->vau, dd->vl15_init); + assign_remote_cm_au_table(dd, dd->vcu); + ppd->neighbor_guid = + read_csr(dd, + DC_DC8051_STS_REMOTE_GUID); + ppd->neighbor_type = + read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) & + DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK; + ppd->neighbor_port_number = + read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) & + DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK; + dd_dev_info(dd, + "Neighbor GUID: %llx Neighbor type %d\n", + ppd->neighbor_guid, + ppd->neighbor_type); + } + + /* physical link went up */ + ppd->linkup = 1; + ppd->offline_disabled_reason = OPA_LINKDOWN_REASON_NONE; + + /* link widths are not available until the link is fully up */ + get_linkup_link_widths(ppd); + + } else { + /* physical link went down */ + ppd->linkup = 0; + + /* clear HW details of the previous connection */ + reset_link_credits(dd); + + /* freeze after a link down to guarantee a clean egress */ + start_freeze_handling(ppd, FREEZE_SELF|FREEZE_LINK_DOWN); + + ev = IB_EVENT_PORT_ERR; + + hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LINKDOWN_BIT); + + /* if we are down, the neighbor is down */ + ppd->neighbor_normal = 0; + + /* notify IB of the link change */ + signal_ib_event(ppd, ev); + } + + +} + +/* + * Handle receive or urgent interrupts for user contexts. This means a user + * process was waiting for a packet to arrive, and didn't want to poll. + */ +void handle_user_interrupt(struct hfi1_ctxtdata *rcd) +{ + struct hfi1_devdata *dd = rcd->dd; + unsigned long flags; + + spin_lock_irqsave(&dd->uctxt_lock, flags); + if (!rcd->cnt) + goto done; + + if (test_and_clear_bit(HFI1_CTXT_WAITING_RCV, &rcd->event_flags)) { + wake_up_interruptible(&rcd->wait); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_DIS, rcd->ctxt); + } else if (test_and_clear_bit(HFI1_CTXT_WAITING_URG, + &rcd->event_flags)) { + rcd->urgent++; + wake_up_interruptible(&rcd->wait); + } +done: + spin_unlock_irqrestore(&dd->uctxt_lock, flags); +} diff --git a/drivers/staging/rdma/hfi1/iowait.h b/drivers/staging/rdma/hfi1/iowait.h new file mode 100644 index 000000000..fa361b405 --- /dev/null +++ b/drivers/staging/rdma/hfi1/iowait.h @@ -0,0 +1,186 @@ +#ifndef _HFI1_IOWAIT_H +#define _HFI1_IOWAIT_H +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include + +/* + * typedef (*restart_t)() - restart callback + * @work: pointer to work structure + */ +typedef void (*restart_t)(struct work_struct *work); + +struct sdma_txreq; +struct sdma_engine; +/** + * struct iowait - linkage for delayed progress/waiting + * @list: used to add/insert into QP/PQ wait lists + * @tx_head: overflow list of sdma_txreq's + * @sleep: no space callback + * @wakeup: space callback + * @iowork: workqueue overhead + * @wait_dma: wait for sdma_busy == 0 + * @sdma_busy: # of packets in flight + * @count: total number of descriptors in tx_head'ed list + * @tx_limit: limit for overflow queuing + * @tx_count: number of tx entry's in tx_head'ed list + * + * This is to be embedded in user's state structure + * (QP or PQ). + * + * The sleep and wakeup members are a + * bit misnamed. They do not strictly + * speaking sleep or wake up, but they + * are callbacks for the ULP to implement + * what ever queuing/dequeuing of + * the embedded iowait and its containing struct + * when a resource shortage like SDMA ring space is seen. + * + * Both potentially have locks help + * so sleeping is not allowed. + * + * The wait_dma member along with the iow + */ + +struct iowait { + struct list_head list; + struct list_head tx_head; + int (*sleep)( + struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *tx, + unsigned seq); + void (*wakeup)(struct iowait *wait, int reason); + struct work_struct iowork; + wait_queue_head_t wait_dma; + atomic_t sdma_busy; + u32 count; + u32 tx_limit; + u32 tx_count; +}; + +#define SDMA_AVAIL_REASON 0 + +/** + * iowait_init() - initialize wait structure + * @wait: wait struct to initialize + * @tx_limit: limit for overflow queuing + * @func: restart function for workqueue + * @sleep: sleep function for no space + * @wakeup: wakeup function for no space + * + * This function initializes the iowait + * structure embedded in the QP or PQ. + * + */ + +static inline void iowait_init( + struct iowait *wait, + u32 tx_limit, + void (*func)(struct work_struct *work), + int (*sleep)( + struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *tx, + unsigned seq), + void (*wakeup)(struct iowait *wait, int reason)) +{ + wait->count = 0; + INIT_LIST_HEAD(&wait->list); + INIT_LIST_HEAD(&wait->tx_head); + INIT_WORK(&wait->iowork, func); + init_waitqueue_head(&wait->wait_dma); + atomic_set(&wait->sdma_busy, 0); + wait->tx_limit = tx_limit; + wait->sleep = sleep; + wait->wakeup = wakeup; +} + +/** + * iowait_schedule() - initialize wait structure + * @wait: wait struct to schedule + * @wq: workqueue for schedule + */ +static inline void iowait_schedule( + struct iowait *wait, + struct workqueue_struct *wq) +{ + queue_work(wq, &wait->iowork); +} + +/** + * iowait_sdma_drain() - wait for DMAs to drain + * + * @wait: iowait structure + * + * This will delay until the iowait sdmas have + * completed. + */ +static inline void iowait_sdma_drain(struct iowait *wait) +{ + wait_event(wait->wait_dma, !atomic_read(&wait->sdma_busy)); +} + +/** + * iowait_drain_wakeup() - trigger iowait_drain() waiter + * + * @wait: iowait structure + * + * This will trigger any waiters. + */ +static inline void iowait_drain_wakeup(struct iowait *wait) +{ + wake_up(&wait->wait_dma); +} + +#endif diff --git a/drivers/staging/rdma/hfi1/keys.c b/drivers/staging/rdma/hfi1/keys.c new file mode 100644 index 000000000..f6eff177a --- /dev/null +++ b/drivers/staging/rdma/hfi1/keys.c @@ -0,0 +1,411 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "hfi.h" + +/** + * hfi1_alloc_lkey - allocate an lkey + * @mr: memory region that this lkey protects + * @dma_region: 0->normal key, 1->restricted DMA key + * + * Returns 0 if successful, otherwise returns -errno. + * + * Increments mr reference count as required. + * + * Sets the lkey field mr for non-dma regions. + * + */ + +int hfi1_alloc_lkey(struct hfi1_mregion *mr, int dma_region) +{ + unsigned long flags; + u32 r; + u32 n; + int ret = 0; + struct hfi1_ibdev *dev = to_idev(mr->pd->device); + struct hfi1_lkey_table *rkt = &dev->lk_table; + + hfi1_get_mr(mr); + spin_lock_irqsave(&rkt->lock, flags); + + /* special case for dma_mr lkey == 0 */ + if (dma_region) { + struct hfi1_mregion *tmr; + + tmr = rcu_access_pointer(dev->dma_mr); + if (!tmr) { + rcu_assign_pointer(dev->dma_mr, mr); + mr->lkey_published = 1; + } else { + hfi1_put_mr(mr); + } + goto success; + } + + /* Find the next available LKEY */ + r = rkt->next; + n = r; + for (;;) { + if (!rcu_access_pointer(rkt->table[r])) + break; + r = (r + 1) & (rkt->max - 1); + if (r == n) + goto bail; + } + rkt->next = (r + 1) & (rkt->max - 1); + /* + * Make sure lkey is never zero which is reserved to indicate an + * unrestricted LKEY. + */ + rkt->gen++; + /* + * bits are capped in verbs.c to ensure enough bits for + * generation number + */ + mr->lkey = (r << (32 - hfi1_lkey_table_size)) | + ((((1 << (24 - hfi1_lkey_table_size)) - 1) & rkt->gen) + << 8); + if (mr->lkey == 0) { + mr->lkey |= 1 << 8; + rkt->gen++; + } + rcu_assign_pointer(rkt->table[r], mr); + mr->lkey_published = 1; +success: + spin_unlock_irqrestore(&rkt->lock, flags); +out: + return ret; +bail: + hfi1_put_mr(mr); + spin_unlock_irqrestore(&rkt->lock, flags); + ret = -ENOMEM; + goto out; +} + +/** + * hfi1_free_lkey - free an lkey + * @mr: mr to free from tables + */ +void hfi1_free_lkey(struct hfi1_mregion *mr) +{ + unsigned long flags; + u32 lkey = mr->lkey; + u32 r; + struct hfi1_ibdev *dev = to_idev(mr->pd->device); + struct hfi1_lkey_table *rkt = &dev->lk_table; + int freed = 0; + + spin_lock_irqsave(&rkt->lock, flags); + if (!mr->lkey_published) + goto out; + if (lkey == 0) + RCU_INIT_POINTER(dev->dma_mr, NULL); + else { + r = lkey >> (32 - hfi1_lkey_table_size); + RCU_INIT_POINTER(rkt->table[r], NULL); + } + mr->lkey_published = 0; + freed++; +out: + spin_unlock_irqrestore(&rkt->lock, flags); + if (freed) { + synchronize_rcu(); + hfi1_put_mr(mr); + } +} + +/** + * hfi1_lkey_ok - check IB SGE for validity and initialize + * @rkt: table containing lkey to check SGE against + * @pd: protection domain + * @isge: outgoing internal SGE + * @sge: SGE to check + * @acc: access flags + * + * Return 1 if valid and successful, otherwise returns 0. + * + * increments the reference count upon success + * + * Check the IB SGE for validity and initialize our internal version + * of it. + */ +int hfi1_lkey_ok(struct hfi1_lkey_table *rkt, struct hfi1_pd *pd, + struct hfi1_sge *isge, struct ib_sge *sge, int acc) +{ + struct hfi1_mregion *mr; + unsigned n, m; + size_t off; + + /* + * We use LKEY == zero for kernel virtual addresses + * (see hfi1_get_dma_mr and dma.c). + */ + rcu_read_lock(); + if (sge->lkey == 0) { + struct hfi1_ibdev *dev = to_idev(pd->ibpd.device); + + if (pd->user) + goto bail; + mr = rcu_dereference(dev->dma_mr); + if (!mr) + goto bail; + atomic_inc(&mr->refcount); + rcu_read_unlock(); + + isge->mr = mr; + isge->vaddr = (void *) sge->addr; + isge->length = sge->length; + isge->sge_length = sge->length; + isge->m = 0; + isge->n = 0; + goto ok; + } + mr = rcu_dereference( + rkt->table[(sge->lkey >> (32 - hfi1_lkey_table_size))]); + if (unlikely(!mr || mr->lkey != sge->lkey || mr->pd != &pd->ibpd)) + goto bail; + + off = sge->addr - mr->user_base; + if (unlikely(sge->addr < mr->user_base || + off + sge->length > mr->length || + (mr->access_flags & acc) != acc)) + goto bail; + atomic_inc(&mr->refcount); + rcu_read_unlock(); + + off += mr->offset; + if (mr->page_shift) { + /* + page sizes are uniform power of 2 so no loop is necessary + entries_spanned_by_off is the number of times the loop below + would have executed. + */ + size_t entries_spanned_by_off; + + entries_spanned_by_off = off >> mr->page_shift; + off -= (entries_spanned_by_off << mr->page_shift); + m = entries_spanned_by_off / HFI1_SEGSZ; + n = entries_spanned_by_off % HFI1_SEGSZ; + } else { + m = 0; + n = 0; + while (off >= mr->map[m]->segs[n].length) { + off -= mr->map[m]->segs[n].length; + n++; + if (n >= HFI1_SEGSZ) { + m++; + n = 0; + } + } + } + isge->mr = mr; + isge->vaddr = mr->map[m]->segs[n].vaddr + off; + isge->length = mr->map[m]->segs[n].length - off; + isge->sge_length = sge->length; + isge->m = m; + isge->n = n; +ok: + return 1; +bail: + rcu_read_unlock(); + return 0; +} + +/** + * hfi1_rkey_ok - check the IB virtual address, length, and RKEY + * @qp: qp for validation + * @sge: SGE state + * @len: length of data + * @vaddr: virtual address to place data + * @rkey: rkey to check + * @acc: access flags + * + * Return 1 if successful, otherwise 0. + * + * increments the reference count upon success + */ +int hfi1_rkey_ok(struct hfi1_qp *qp, struct hfi1_sge *sge, + u32 len, u64 vaddr, u32 rkey, int acc) +{ + struct hfi1_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table; + struct hfi1_mregion *mr; + unsigned n, m; + size_t off; + + /* + * We use RKEY == zero for kernel virtual addresses + * (see hfi1_get_dma_mr and dma.c). + */ + rcu_read_lock(); + if (rkey == 0) { + struct hfi1_pd *pd = to_ipd(qp->ibqp.pd); + struct hfi1_ibdev *dev = to_idev(pd->ibpd.device); + + if (pd->user) + goto bail; + mr = rcu_dereference(dev->dma_mr); + if (!mr) + goto bail; + atomic_inc(&mr->refcount); + rcu_read_unlock(); + + sge->mr = mr; + sge->vaddr = (void *) vaddr; + sge->length = len; + sge->sge_length = len; + sge->m = 0; + sge->n = 0; + goto ok; + } + + mr = rcu_dereference( + rkt->table[(rkey >> (32 - hfi1_lkey_table_size))]); + if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd)) + goto bail; + + off = vaddr - mr->iova; + if (unlikely(vaddr < mr->iova || off + len > mr->length || + (mr->access_flags & acc) == 0)) + goto bail; + atomic_inc(&mr->refcount); + rcu_read_unlock(); + + off += mr->offset; + if (mr->page_shift) { + /* + page sizes are uniform power of 2 so no loop is necessary + entries_spanned_by_off is the number of times the loop below + would have executed. + */ + size_t entries_spanned_by_off; + + entries_spanned_by_off = off >> mr->page_shift; + off -= (entries_spanned_by_off << mr->page_shift); + m = entries_spanned_by_off / HFI1_SEGSZ; + n = entries_spanned_by_off % HFI1_SEGSZ; + } else { + m = 0; + n = 0; + while (off >= mr->map[m]->segs[n].length) { + off -= mr->map[m]->segs[n].length; + n++; + if (n >= HFI1_SEGSZ) { + m++; + n = 0; + } + } + } + sge->mr = mr; + sge->vaddr = mr->map[m]->segs[n].vaddr + off; + sge->length = mr->map[m]->segs[n].length - off; + sge->sge_length = len; + sge->m = m; + sge->n = n; +ok: + return 1; +bail: + rcu_read_unlock(); + return 0; +} + +/* + * Initialize the memory region specified by the work request. + */ +int hfi1_fast_reg_mr(struct hfi1_qp *qp, struct ib_send_wr *wr) +{ + struct hfi1_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table; + struct hfi1_pd *pd = to_ipd(qp->ibqp.pd); + struct hfi1_mregion *mr; + u32 rkey = wr->wr.fast_reg.rkey; + unsigned i, n, m; + int ret = -EINVAL; + unsigned long flags; + u64 *page_list; + size_t ps; + + spin_lock_irqsave(&rkt->lock, flags); + if (pd->user || rkey == 0) + goto bail; + + mr = rcu_dereference_protected( + rkt->table[(rkey >> (32 - hfi1_lkey_table_size))], + lockdep_is_held(&rkt->lock)); + if (unlikely(mr == NULL || qp->ibqp.pd != mr->pd)) + goto bail; + + if (wr->wr.fast_reg.page_list_len > mr->max_segs) + goto bail; + + ps = 1UL << wr->wr.fast_reg.page_shift; + if (wr->wr.fast_reg.length > ps * wr->wr.fast_reg.page_list_len) + goto bail; + + mr->user_base = wr->wr.fast_reg.iova_start; + mr->iova = wr->wr.fast_reg.iova_start; + mr->lkey = rkey; + mr->length = wr->wr.fast_reg.length; + mr->access_flags = wr->wr.fast_reg.access_flags; + page_list = wr->wr.fast_reg.page_list->page_list; + m = 0; + n = 0; + for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) { + mr->map[m]->segs[n].vaddr = (void *) page_list[i]; + mr->map[m]->segs[n].length = ps; + if (++n == HFI1_SEGSZ) { + m++; + n = 0; + } + } + + ret = 0; +bail: + spin_unlock_irqrestore(&rkt->lock, flags); + return ret; +} diff --git a/drivers/staging/rdma/hfi1/mad.c b/drivers/staging/rdma/hfi1/mad.c new file mode 100644 index 000000000..b2c1b72d3 --- /dev/null +++ b/drivers/staging/rdma/hfi1/mad.c @@ -0,0 +1,4257 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#define OPA_NUM_PKEY_BLOCKS_PER_SMP (OPA_SMP_DR_DATA_SIZE \ + / (OPA_PARTITION_TABLE_BLK_SIZE * sizeof(u16))) + +#include "hfi.h" +#include "mad.h" +#include "trace.h" + +/* the reset value from the FM is supposed to be 0xffff, handle both */ +#define OPA_LINK_WIDTH_RESET_OLD 0x0fff +#define OPA_LINK_WIDTH_RESET 0xffff + +static int reply(struct ib_mad_hdr *smp) +{ + /* + * The verbs framework will handle the directed/LID route + * packet changes. + */ + smp->method = IB_MGMT_METHOD_GET_RESP; + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + smp->status |= IB_SMP_DIRECTION; + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static inline void clear_opa_smp_data(struct opa_smp *smp) +{ + void *data = opa_get_smp_data(smp); + size_t size = opa_get_smp_data_size(smp); + + memset(data, 0, size); +} + +static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len) +{ + struct ib_mad_send_buf *send_buf; + struct ib_mad_agent *agent; + struct ib_smp *smp; + int ret; + unsigned long flags; + unsigned long timeout; + int pkey_idx; + u32 qpn = ppd_from_ibp(ibp)->sm_trap_qp; + + agent = ibp->send_agent; + if (!agent) + return; + + /* o14-3.2.1 */ + if (ppd_from_ibp(ibp)->lstate != IB_PORT_ACTIVE) + return; + + /* o14-2 */ + if (ibp->trap_timeout && time_before(jiffies, ibp->trap_timeout)) + return; + + pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY); + if (pkey_idx < 0) { + pr_warn("%s: failed to find limited mgmt pkey, defaulting 0x%x\n", + __func__, hfi1_get_pkey(ibp, 1)); + pkey_idx = 1; + } + + send_buf = ib_create_send_mad(agent, qpn, pkey_idx, 0, + IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, + GFP_ATOMIC, IB_MGMT_BASE_VERSION); + if (IS_ERR(send_buf)) + return; + + smp = send_buf->mad; + smp->base_version = IB_MGMT_BASE_VERSION; + smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + smp->class_version = 1; + smp->method = IB_MGMT_METHOD_TRAP; + ibp->tid++; + smp->tid = cpu_to_be64(ibp->tid); + smp->attr_id = IB_SMP_ATTR_NOTICE; + /* o14-1: smp->mkey = 0; */ + memcpy(smp->data, data, len); + + spin_lock_irqsave(&ibp->lock, flags); + if (!ibp->sm_ah) { + if (ibp->sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) { + struct ib_ah *ah; + + ah = hfi1_create_qp0_ah(ibp, ibp->sm_lid); + if (IS_ERR(ah)) + ret = PTR_ERR(ah); + else { + send_buf->ah = ah; + ibp->sm_ah = to_iah(ah); + ret = 0; + } + } else + ret = -EINVAL; + } else { + send_buf->ah = &ibp->sm_ah->ibah; + ret = 0; + } + spin_unlock_irqrestore(&ibp->lock, flags); + + if (!ret) + ret = ib_post_send_mad(send_buf, NULL); + if (!ret) { + /* 4.096 usec. */ + timeout = (4096 * (1UL << ibp->subnet_timeout)) / 1000; + ibp->trap_timeout = jiffies + usecs_to_jiffies(timeout); + } else { + ib_free_send_mad(send_buf); + ibp->trap_timeout = 0; + } +} + +/* + * Send a bad [PQ]_Key trap (ch. 14.3.8). + */ +void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl, + u32 qp1, u32 qp2, __be16 lid1, __be16 lid2) +{ + struct ib_mad_notice_attr data; + + if (trap_num == IB_NOTICE_TRAP_BAD_PKEY) + ibp->pkey_violations++; + else + ibp->qkey_violations++; + ibp->n_pkt_drops++; + + /* Send violation trap */ + data.generic_type = IB_NOTICE_TYPE_SECURITY; + data.prod_type_msb = 0; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = trap_num; + data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid); + data.toggle_count = 0; + memset(&data.details, 0, sizeof(data.details)); + data.details.ntc_257_258.lid1 = lid1; + data.details.ntc_257_258.lid2 = lid2; + data.details.ntc_257_258.key = cpu_to_be32(key); + data.details.ntc_257_258.sl_qp1 = cpu_to_be32((sl << 28) | qp1); + data.details.ntc_257_258.qp2 = cpu_to_be32(qp2); + + send_trap(ibp, &data, sizeof(data)); +} + +/* + * Send a bad M_Key trap (ch. 14.3.9). + */ +static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad, + __be64 mkey, __be32 dr_slid, u8 return_path[], u8 hop_cnt) +{ + struct ib_mad_notice_attr data; + + /* Send violation trap */ + data.generic_type = IB_NOTICE_TYPE_SECURITY; + data.prod_type_msb = 0; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = IB_NOTICE_TRAP_BAD_MKEY; + data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid); + data.toggle_count = 0; + memset(&data.details, 0, sizeof(data.details)); + data.details.ntc_256.lid = data.issuer_lid; + data.details.ntc_256.method = mad->method; + data.details.ntc_256.attr_id = mad->attr_id; + data.details.ntc_256.attr_mod = mad->attr_mod; + data.details.ntc_256.mkey = mkey; + if (mad->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + + data.details.ntc_256.dr_slid = (__force __be16)dr_slid; + data.details.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE; + if (hop_cnt > ARRAY_SIZE(data.details.ntc_256.dr_rtn_path)) { + data.details.ntc_256.dr_trunc_hop |= + IB_NOTICE_TRAP_DR_TRUNC; + hop_cnt = ARRAY_SIZE(data.details.ntc_256.dr_rtn_path); + } + data.details.ntc_256.dr_trunc_hop |= hop_cnt; + memcpy(data.details.ntc_256.dr_rtn_path, return_path, + hop_cnt); + } + + send_trap(ibp, &data, sizeof(data)); +} + +/* + * Send a Port Capability Mask Changed trap (ch. 14.3.11). + */ +void hfi1_cap_mask_chg(struct hfi1_ibport *ibp) +{ + struct ib_mad_notice_attr data; + + data.generic_type = IB_NOTICE_TYPE_INFO; + data.prod_type_msb = 0; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = IB_NOTICE_TRAP_CAP_MASK_CHG; + data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid); + data.toggle_count = 0; + memset(&data.details, 0, sizeof(data.details)); + data.details.ntc_144.lid = data.issuer_lid; + data.details.ntc_144.new_cap_mask = cpu_to_be32(ibp->port_cap_flags); + + send_trap(ibp, &data, sizeof(data)); +} + +/* + * Send a System Image GUID Changed trap (ch. 14.3.12). + */ +void hfi1_sys_guid_chg(struct hfi1_ibport *ibp) +{ + struct ib_mad_notice_attr data; + + data.generic_type = IB_NOTICE_TYPE_INFO; + data.prod_type_msb = 0; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = IB_NOTICE_TRAP_SYS_GUID_CHG; + data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid); + data.toggle_count = 0; + memset(&data.details, 0, sizeof(data.details)); + data.details.ntc_145.lid = data.issuer_lid; + data.details.ntc_145.new_sys_guid = ib_hfi1_sys_image_guid; + + send_trap(ibp, &data, sizeof(data)); +} + +/* + * Send a Node Description Changed trap (ch. 14.3.13). + */ +void hfi1_node_desc_chg(struct hfi1_ibport *ibp) +{ + struct ib_mad_notice_attr data; + + data.generic_type = IB_NOTICE_TYPE_INFO; + data.prod_type_msb = 0; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = IB_NOTICE_TRAP_CAP_MASK_CHG; + data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid); + data.toggle_count = 0; + memset(&data.details, 0, sizeof(data.details)); + data.details.ntc_144.lid = data.issuer_lid; + data.details.ntc_144.local_changes = 1; + data.details.ntc_144.change_flags = IB_NOTICE_TRAP_NODE_DESC_CHG; + + send_trap(ibp, &data, sizeof(data)); +} + +static int __subn_get_opa_nodedesc(struct opa_smp *smp, u32 am, + u8 *data, struct ib_device *ibdev, + u8 port, u32 *resp_len) +{ + struct opa_node_description *nd; + + if (am) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + nd = (struct opa_node_description *)data; + + memcpy(nd->data, ibdev->node_desc, sizeof(nd->data)); + + if (resp_len) + *resp_len += sizeof(*nd); + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_get_opa_nodeinfo(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct opa_node_info *ni; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + unsigned pidx = port - 1; /* IB number port from 1, hw from 0 */ + + ni = (struct opa_node_info *)data; + + /* GUID 0 is illegal */ + if (am || pidx >= dd->num_pports || dd->pport[pidx].guid == 0) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ni->port_guid = cpu_to_be64(dd->pport[pidx].guid); + ni->base_version = OPA_MGMT_BASE_VERSION; + ni->class_version = OPA_SMI_CLASS_VERSION; + ni->node_type = 1; /* channel adapter */ + ni->num_ports = ibdev->phys_port_cnt; + /* This is already in network order */ + ni->system_image_guid = ib_hfi1_sys_image_guid; + /* Use first-port GUID as node */ + ni->node_guid = cpu_to_be64(dd->pport->guid); + ni->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd)); + ni->device_id = cpu_to_be16(dd->pcidev->device); + ni->revision = cpu_to_be32(dd->minrev); + ni->local_port_num = port; + ni->vendor_id[0] = dd->oui1; + ni->vendor_id[1] = dd->oui2; + ni->vendor_id[2] = dd->oui3; + + if (resp_len) + *resp_len += sizeof(*ni); + + return reply((struct ib_mad_hdr *)smp); +} + +static int subn_get_nodeinfo(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct ib_node_info *nip = (struct ib_node_info *)&smp->data; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + unsigned pidx = port - 1; /* IB number port from 1, hw from 0 */ + + /* GUID 0 is illegal */ + if (smp->attr_mod || pidx >= dd->num_pports || + dd->pport[pidx].guid == 0) + smp->status |= IB_SMP_INVALID_FIELD; + else + nip->port_guid = cpu_to_be64(dd->pport[pidx].guid); + + nip->base_version = OPA_MGMT_BASE_VERSION; + nip->class_version = OPA_SMI_CLASS_VERSION; + nip->node_type = 1; /* channel adapter */ + nip->num_ports = ibdev->phys_port_cnt; + /* This is already in network order */ + nip->sys_guid = ib_hfi1_sys_image_guid; + /* Use first-port GUID as node */ + nip->node_guid = cpu_to_be64(dd->pport->guid); + nip->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd)); + nip->device_id = cpu_to_be16(dd->pcidev->device); + nip->revision = cpu_to_be32(dd->minrev); + nip->local_port_num = port; + nip->vendor_id[0] = dd->oui1; + nip->vendor_id[1] = dd->oui2; + nip->vendor_id[2] = dd->oui3; + + return reply((struct ib_mad_hdr *)smp); +} + +static void set_link_width_enabled(struct hfi1_pportdata *ppd, u32 w) +{ + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_ENB, w); +} + +static void set_link_width_downgrade_enabled(struct hfi1_pportdata *ppd, u32 w) +{ + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_DG_ENB, w); +} + +static void set_link_speed_enabled(struct hfi1_pportdata *ppd, u32 s) +{ + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_SPD_ENB, s); +} + +static int check_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad, + int mad_flags, __be64 mkey, __be32 dr_slid, + u8 return_path[], u8 hop_cnt) +{ + int valid_mkey = 0; + int ret = 0; + + /* Is the mkey in the process of expiring? */ + if (ibp->mkey_lease_timeout && + time_after_eq(jiffies, ibp->mkey_lease_timeout)) { + /* Clear timeout and mkey protection field. */ + ibp->mkey_lease_timeout = 0; + ibp->mkeyprot = 0; + } + + if ((mad_flags & IB_MAD_IGNORE_MKEY) || ibp->mkey == 0 || + ibp->mkey == mkey) + valid_mkey = 1; + + /* Unset lease timeout on any valid Get/Set/TrapRepress */ + if (valid_mkey && ibp->mkey_lease_timeout && + (mad->method == IB_MGMT_METHOD_GET || + mad->method == IB_MGMT_METHOD_SET || + mad->method == IB_MGMT_METHOD_TRAP_REPRESS)) + ibp->mkey_lease_timeout = 0; + + if (!valid_mkey) { + switch (mad->method) { + case IB_MGMT_METHOD_GET: + /* Bad mkey not a violation below level 2 */ + if (ibp->mkeyprot < 2) + break; + case IB_MGMT_METHOD_SET: + case IB_MGMT_METHOD_TRAP_REPRESS: + if (ibp->mkey_violations != 0xFFFF) + ++ibp->mkey_violations; + if (!ibp->mkey_lease_timeout && ibp->mkey_lease_period) + ibp->mkey_lease_timeout = jiffies + + ibp->mkey_lease_period * HZ; + /* Generate a trap notice. */ + bad_mkey(ibp, mad, mkey, dr_slid, return_path, + hop_cnt); + ret = 1; + } + } + + return ret; +} + +/* + * The SMA caches reads from LCB registers in case the LCB is unavailable. + * (The LCB is unavailable in certain link states, for example.) + */ +struct lcb_datum { + u32 off; + u64 val; +}; + +static struct lcb_datum lcb_cache[] = { + { DC_LCB_STS_ROUND_TRIP_LTP_CNT, 0 }, +}; + +static int write_lcb_cache(u32 off, u64 val) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) { + if (lcb_cache[i].off == off) { + lcb_cache[i].val = val; + return 0; + } + } + + pr_warn("%s bad offset 0x%x\n", __func__, off); + return -1; +} + +static int read_lcb_cache(u32 off, u64 *val) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) { + if (lcb_cache[i].off == off) { + *val = lcb_cache[i].val; + return 0; + } + } + + pr_warn("%s bad offset 0x%x\n", __func__, off); + return -1; +} + +void read_ltp_rtt(struct hfi1_devdata *dd) +{ + u64 reg; + + if (read_lcb_csr(dd, DC_LCB_STS_ROUND_TRIP_LTP_CNT, ®)) + dd_dev_err(dd, "%s: unable to read LTP RTT\n", __func__); + else + write_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, reg); +} + +static u8 __opa_porttype(struct hfi1_pportdata *ppd) +{ + if (qsfp_mod_present(ppd)) { + if (ppd->qsfp_info.cache_valid) + return OPA_PORT_TYPE_STANDARD; + return OPA_PORT_TYPE_DISCONNECTED; + } + return OPA_PORT_TYPE_UNKNOWN; +} + +static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + int i; + struct hfi1_devdata *dd; + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + struct opa_port_info *pi = (struct opa_port_info *)data; + u8 mtu; + u8 credit_rate; + u32 state; + u32 num_ports = OPA_AM_NPORT(am); + u32 start_of_sm_config = OPA_AM_START_SM_CFG(am); + u32 buffer_units; + u64 tmp = 0; + + if (num_ports != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + dd = dd_from_ibdev(ibdev); + /* IB numbers ports from 1, hw from 0 */ + ppd = dd->pport + (port - 1); + ibp = &ppd->ibport_data; + + if (ppd->vls_supported/2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) || + ppd->vls_supported > ARRAY_SIZE(dd->vld)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + pi->lid = cpu_to_be32(ppd->lid); + + /* Only return the mkey if the protection field allows it. */ + if (!(smp->method == IB_MGMT_METHOD_GET && + ibp->mkey != smp->mkey && + ibp->mkeyprot == 1)) + pi->mkey = ibp->mkey; + + pi->subnet_prefix = ibp->gid_prefix; + pi->sm_lid = cpu_to_be32(ibp->sm_lid); + pi->ib_cap_mask = cpu_to_be32(ibp->port_cap_flags); + pi->mkey_lease_period = cpu_to_be16(ibp->mkey_lease_period); + pi->sm_trap_qp = cpu_to_be32(ppd->sm_trap_qp); + pi->sa_qp = cpu_to_be32(ppd->sa_qp); + + pi->link_width.enabled = cpu_to_be16(ppd->link_width_enabled); + pi->link_width.supported = cpu_to_be16(ppd->link_width_supported); + pi->link_width.active = cpu_to_be16(ppd->link_width_active); + + pi->link_width_downgrade.supported = + cpu_to_be16(ppd->link_width_downgrade_supported); + pi->link_width_downgrade.enabled = + cpu_to_be16(ppd->link_width_downgrade_enabled); + pi->link_width_downgrade.tx_active = + cpu_to_be16(ppd->link_width_downgrade_tx_active); + pi->link_width_downgrade.rx_active = + cpu_to_be16(ppd->link_width_downgrade_rx_active); + + pi->link_speed.supported = cpu_to_be16(ppd->link_speed_supported); + pi->link_speed.active = cpu_to_be16(ppd->link_speed_active); + pi->link_speed.enabled = cpu_to_be16(ppd->link_speed_enabled); + + state = driver_lstate(ppd); + + if (start_of_sm_config && (state == IB_PORT_INIT)) + ppd->is_sm_config_started = 1; + + pi->port_phys_conf = __opa_porttype(ppd) & 0xf; + +#if PI_LED_ENABLE_SUP + pi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4; + pi->port_states.ledenable_offlinereason |= + ppd->is_sm_config_started << 5; + pi->port_states.ledenable_offlinereason |= + ppd->offline_disabled_reason & OPA_PI_MASK_OFFLINE_REASON; +#else + pi->port_states.offline_reason = ppd->neighbor_normal << 4; + pi->port_states.offline_reason |= ppd->is_sm_config_started << 5; + pi->port_states.offline_reason |= ppd->offline_disabled_reason & + OPA_PI_MASK_OFFLINE_REASON; +#endif /* PI_LED_ENABLE_SUP */ + + pi->port_states.portphysstate_portstate = + (hfi1_ibphys_portstate(ppd) << 4) | state; + + pi->mkeyprotect_lmc = (ibp->mkeyprot << 6) | ppd->lmc; + + memset(pi->neigh_mtu.pvlx_to_mtu, 0, sizeof(pi->neigh_mtu.pvlx_to_mtu)); + for (i = 0; i < ppd->vls_supported; i++) { + mtu = mtu_to_enum(dd->vld[i].mtu, HFI1_DEFAULT_ACTIVE_MTU); + if ((i % 2) == 0) + pi->neigh_mtu.pvlx_to_mtu[i/2] |= (mtu << 4); + else + pi->neigh_mtu.pvlx_to_mtu[i/2] |= mtu; + } + /* don't forget VL 15 */ + mtu = mtu_to_enum(dd->vld[15].mtu, 2048); + pi->neigh_mtu.pvlx_to_mtu[15/2] |= mtu; + pi->smsl = ibp->sm_sl & OPA_PI_MASK_SMSL; + pi->operational_vls = hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS); + pi->partenforce_filterraw |= + (ppd->linkinit_reason & OPA_PI_MASK_LINKINIT_REASON); + if (ppd->part_enforce & HFI1_PART_ENFORCE_IN) + pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_IN; + if (ppd->part_enforce & HFI1_PART_ENFORCE_OUT) + pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_OUT; + pi->mkey_violations = cpu_to_be16(ibp->mkey_violations); + /* P_KeyViolations are counted by hardware. */ + pi->pkey_violations = cpu_to_be16(ibp->pkey_violations); + pi->qkey_violations = cpu_to_be16(ibp->qkey_violations); + + pi->vl.cap = ppd->vls_supported; + pi->vl.high_limit = cpu_to_be16(ibp->vl_high_limit); + pi->vl.arb_high_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_CAP); + pi->vl.arb_low_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_LOW_CAP); + + pi->clientrereg_subnettimeout = ibp->subnet_timeout; + + pi->port_link_mode = cpu_to_be16(OPA_PORT_LINK_MODE_OPA << 10 | + OPA_PORT_LINK_MODE_OPA << 5 | + OPA_PORT_LINK_MODE_OPA); + + pi->port_ltp_crc_mode = cpu_to_be16(ppd->port_ltp_crc_mode); + + pi->port_mode = cpu_to_be16( + ppd->is_active_optimize_enabled ? + OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE : 0); + + pi->port_packet_format.supported = + cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B); + pi->port_packet_format.enabled = + cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B); + + /* flit_control.interleave is (OPA V1, version .76): + * bits use + * ---- --- + * 2 res + * 2 DistanceSupported + * 2 DistanceEnabled + * 5 MaxNextLevelTxEnabled + * 5 MaxNestLevelRxSupported + * + * HFI supports only "distance mode 1" (see OPA V1, version .76, + * section 9.6.2), so set DistanceSupported, DistanceEnabled + * to 0x1. + */ + pi->flit_control.interleave = cpu_to_be16(0x1400); + + pi->link_down_reason = ppd->local_link_down_reason.sma; + pi->neigh_link_down_reason = ppd->neigh_link_down_reason.sma; + pi->port_error_action = cpu_to_be32(ppd->port_error_action); + pi->mtucap = mtu_to_enum(hfi1_max_mtu, IB_MTU_4096); + + /* 32.768 usec. response time (guessing) */ + pi->resptimevalue = 3; + + pi->local_port_num = port; + + /* buffer info for FM */ + pi->overall_buffer_space = cpu_to_be16(dd->link_credits); + + pi->neigh_node_guid = cpu_to_be64(ppd->neighbor_guid); + pi->neigh_port_num = ppd->neighbor_port_number; + pi->port_neigh_mode = + (ppd->neighbor_type & OPA_PI_MASK_NEIGH_NODE_TYPE) | + (ppd->mgmt_allowed ? OPA_PI_MASK_NEIGH_MGMT_ALLOWED : 0) | + (ppd->neighbor_fm_security ? + OPA_PI_MASK_NEIGH_FW_AUTH_BYPASS : 0); + + /* HFIs shall always return VL15 credits to their + * neighbor in a timely manner, without any credit return pacing. + */ + credit_rate = 0; + buffer_units = (dd->vau) & OPA_PI_MASK_BUF_UNIT_BUF_ALLOC; + buffer_units |= (dd->vcu << 3) & OPA_PI_MASK_BUF_UNIT_CREDIT_ACK; + buffer_units |= (credit_rate << 6) & + OPA_PI_MASK_BUF_UNIT_VL15_CREDIT_RATE; + buffer_units |= (dd->vl15_init << 11) & OPA_PI_MASK_BUF_UNIT_VL15_INIT; + pi->buffer_units = cpu_to_be32(buffer_units); + + pi->opa_cap_mask = cpu_to_be16(OPA_CAP_MASK3_IsSharedSpaceSupported); + + /* HFI supports a replay buffer 128 LTPs in size */ + pi->replay_depth.buffer = 0x80; + /* read the cached value of DC_LCB_STS_ROUND_TRIP_LTP_CNT */ + read_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, &tmp); + + /* this counter is 16 bits wide, but the replay_depth.wire + * variable is only 8 bits */ + if (tmp > 0xff) + tmp = 0xff; + pi->replay_depth.wire = tmp; + + if (resp_len) + *resp_len += sizeof(struct opa_port_info); + + return reply((struct ib_mad_hdr *)smp); +} + +/** + * get_pkeys - return the PKEY table + * @dd: the hfi1_ib device + * @port: the IB port number + * @pkeys: the pkey table is placed here + */ +static int get_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys) +{ + struct hfi1_pportdata *ppd = dd->pport + port - 1; + + memcpy(pkeys, ppd->pkeys, sizeof(ppd->pkeys)); + + return 0; +} + +static int __subn_get_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u32 n_blocks_req = OPA_AM_NBLK(am); + u32 start_block = am & 0x7ff; + __be16 *p; + u16 *q; + int i; + u16 n_blocks_avail; + unsigned npkeys = hfi1_get_npkeys(dd); + size_t size; + + if (n_blocks_req == 0) { + pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n", + port, start_block, n_blocks_req); + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + n_blocks_avail = (u16) (npkeys/OPA_PARTITION_TABLE_BLK_SIZE) + 1; + + size = (n_blocks_req * OPA_PARTITION_TABLE_BLK_SIZE) * sizeof(u16); + + if (start_block + n_blocks_req > n_blocks_avail || + n_blocks_req > OPA_NUM_PKEY_BLOCKS_PER_SMP) { + pr_warn("OPA Get PKey AM Invalid : s 0x%x; req 0x%x; " + "avail 0x%x; blk/smp 0x%lx\n", + start_block, n_blocks_req, n_blocks_avail, + OPA_NUM_PKEY_BLOCKS_PER_SMP); + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + p = (__be16 *) data; + q = (u16 *)data; + /* get the real pkeys if we are requesting the first block */ + if (start_block == 0) { + get_pkeys(dd, port, q); + for (i = 0; i < npkeys; i++) + p[i] = cpu_to_be16(q[i]); + if (resp_len) + *resp_len += size; + } else + smp->status |= IB_SMP_INVALID_FIELD; + + return reply((struct ib_mad_hdr *)smp); +} + +enum { + HFI_TRANSITION_DISALLOWED, + HFI_TRANSITION_IGNORED, + HFI_TRANSITION_ALLOWED, + HFI_TRANSITION_UNDEFINED, +}; + +/* + * Use shortened names to improve readability of + * {logical,physical}_state_transitions + */ +enum { + __D = HFI_TRANSITION_DISALLOWED, + __I = HFI_TRANSITION_IGNORED, + __A = HFI_TRANSITION_ALLOWED, + __U = HFI_TRANSITION_UNDEFINED, +}; + +/* + * IB_PORTPHYSSTATE_POLLING (2) through OPA_PORTPHYSSTATE_MAX (11) are + * represented in physical_state_transitions. + */ +#define __N_PHYSTATES (OPA_PORTPHYSSTATE_MAX - IB_PORTPHYSSTATE_POLLING + 1) + +/* + * Within physical_state_transitions, rows represent "old" states, + * columns "new" states, and physical_state_transitions.allowed[old][new] + * indicates if the transition from old state to new state is legal (see + * OPAg1v1, Table 6-4). + */ +static const struct { + u8 allowed[__N_PHYSTATES][__N_PHYSTATES]; +} physical_state_transitions = { + { + /* 2 3 4 5 6 7 8 9 10 11 */ + /* 2 */ { __A, __A, __D, __D, __D, __D, __D, __D, __D, __D }, + /* 3 */ { __A, __I, __D, __D, __D, __D, __D, __D, __D, __A }, + /* 4 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U }, + /* 5 */ { __A, __A, __D, __I, __D, __D, __D, __D, __D, __D }, + /* 6 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U }, + /* 7 */ { __D, __A, __D, __D, __D, __I, __D, __D, __D, __D }, + /* 8 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U }, + /* 9 */ { __I, __A, __D, __D, __D, __D, __D, __I, __D, __D }, + /*10 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U }, + /*11 */ { __D, __A, __D, __D, __D, __D, __D, __D, __D, __I }, + } +}; + +/* + * IB_PORT_DOWN (1) through IB_PORT_ACTIVE_DEFER (5) are represented + * logical_state_transitions + */ + +#define __N_LOGICAL_STATES (IB_PORT_ACTIVE_DEFER - IB_PORT_DOWN + 1) + +/* + * Within logical_state_transitions rows represent "old" states, + * columns "new" states, and logical_state_transitions.allowed[old][new] + * indicates if the transition from old state to new state is legal (see + * OPAg1v1, Table 9-12). + */ +static const struct { + u8 allowed[__N_LOGICAL_STATES][__N_LOGICAL_STATES]; +} logical_state_transitions = { + { + /* 1 2 3 4 5 */ + /* 1 */ { __I, __D, __D, __D, __U}, + /* 2 */ { __D, __I, __A, __D, __U}, + /* 3 */ { __D, __D, __I, __A, __U}, + /* 4 */ { __D, __D, __I, __I, __U}, + /* 5 */ { __U, __U, __U, __U, __U}, + } +}; + +static int logical_transition_allowed(int old, int new) +{ + if (old < IB_PORT_NOP || old > IB_PORT_ACTIVE_DEFER || + new < IB_PORT_NOP || new > IB_PORT_ACTIVE_DEFER) { + pr_warn("invalid logical state(s) (old %d new %d)\n", + old, new); + return HFI_TRANSITION_UNDEFINED; + } + + if (new == IB_PORT_NOP) + return HFI_TRANSITION_ALLOWED; /* always allowed */ + + /* adjust states for indexing into logical_state_transitions */ + old -= IB_PORT_DOWN; + new -= IB_PORT_DOWN; + + if (old < 0 || new < 0) + return HFI_TRANSITION_UNDEFINED; + return logical_state_transitions.allowed[old][new]; +} + +static int physical_transition_allowed(int old, int new) +{ + if (old < IB_PORTPHYSSTATE_NOP || old > OPA_PORTPHYSSTATE_MAX || + new < IB_PORTPHYSSTATE_NOP || new > OPA_PORTPHYSSTATE_MAX) { + pr_warn("invalid physical state(s) (old %d new %d)\n", + old, new); + return HFI_TRANSITION_UNDEFINED; + } + + if (new == IB_PORTPHYSSTATE_NOP) + return HFI_TRANSITION_ALLOWED; /* always allowed */ + + /* adjust states for indexing into physical_state_transitions */ + old -= IB_PORTPHYSSTATE_POLLING; + new -= IB_PORTPHYSSTATE_POLLING; + + if (old < 0 || new < 0) + return HFI_TRANSITION_UNDEFINED; + return physical_state_transitions.allowed[old][new]; +} + +static int port_states_transition_allowed(struct hfi1_pportdata *ppd, + u32 logical_new, u32 physical_new) +{ + u32 physical_old = driver_physical_state(ppd); + u32 logical_old = driver_logical_state(ppd); + int ret, logical_allowed, physical_allowed; + + logical_allowed = ret = + logical_transition_allowed(logical_old, logical_new); + + if (ret == HFI_TRANSITION_DISALLOWED || + ret == HFI_TRANSITION_UNDEFINED) { + pr_warn("invalid logical state transition %s -> %s\n", + opa_lstate_name(logical_old), + opa_lstate_name(logical_new)); + return ret; + } + + physical_allowed = ret = + physical_transition_allowed(physical_old, physical_new); + + if (ret == HFI_TRANSITION_DISALLOWED || + ret == HFI_TRANSITION_UNDEFINED) { + pr_warn("invalid physical state transition %s -> %s\n", + opa_pstate_name(physical_old), + opa_pstate_name(physical_new)); + return ret; + } + + if (logical_allowed == HFI_TRANSITION_IGNORED && + physical_allowed == HFI_TRANSITION_IGNORED) + return HFI_TRANSITION_IGNORED; + + /* + * Either physical_allowed or logical_allowed is + * HFI_TRANSITION_ALLOWED. + */ + return HFI_TRANSITION_ALLOWED; +} + +static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp, + u32 logical_state, u32 phys_state, + int suppress_idle_sma) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 link_state; + int ret; + + ret = port_states_transition_allowed(ppd, logical_state, phys_state); + if (ret == HFI_TRANSITION_DISALLOWED || + ret == HFI_TRANSITION_UNDEFINED) { + /* error message emitted above */ + smp->status |= IB_SMP_INVALID_FIELD; + return 0; + } + + if (ret == HFI_TRANSITION_IGNORED) + return 0; + + if ((phys_state != IB_PORTPHYSSTATE_NOP) && + !(logical_state == IB_PORT_DOWN || + logical_state == IB_PORT_NOP)){ + pr_warn("SubnSet(OPA_PortInfo) port state invalid: logical_state 0x%x physical_state 0x%x\n", + logical_state, phys_state); + smp->status |= IB_SMP_INVALID_FIELD; + } + + /* + * Logical state changes are summarized in OPAv1g1 spec., + * Table 9-12; physical state changes are summarized in + * OPAv1g1 spec., Table 6.4. + */ + switch (logical_state) { + case IB_PORT_NOP: + if (phys_state == IB_PORTPHYSSTATE_NOP) + break; + /* FALLTHROUGH */ + case IB_PORT_DOWN: + if (phys_state == IB_PORTPHYSSTATE_NOP) + link_state = HLS_DN_DOWNDEF; + else if (phys_state == IB_PORTPHYSSTATE_POLLING) { + link_state = HLS_DN_POLL; + set_link_down_reason(ppd, + OPA_LINKDOWN_REASON_FM_BOUNCE, 0, + OPA_LINKDOWN_REASON_FM_BOUNCE); + } else if (phys_state == IB_PORTPHYSSTATE_DISABLED) + link_state = HLS_DN_DISABLE; + else { + pr_warn("SubnSet(OPA_PortInfo) invalid physical state 0x%x\n", + phys_state); + smp->status |= IB_SMP_INVALID_FIELD; + break; + } + + set_link_state(ppd, link_state); + if (link_state == HLS_DN_DISABLE && + (ppd->offline_disabled_reason > + OPA_LINKDOWN_REASON_SMA_DISABLED || + ppd->offline_disabled_reason == + OPA_LINKDOWN_REASON_NONE)) + ppd->offline_disabled_reason = + OPA_LINKDOWN_REASON_SMA_DISABLED; + /* + * Don't send a reply if the response would be sent + * through the disabled port. + */ + if (link_state == HLS_DN_DISABLE && smp->hop_cnt) + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + break; + case IB_PORT_ARMED: + ret = set_link_state(ppd, HLS_UP_ARMED); + if ((ret == 0) && (suppress_idle_sma == 0)) + send_idle_sma(dd, SMA_IDLE_ARM); + break; + case IB_PORT_ACTIVE: + if (ppd->neighbor_normal) { + ret = set_link_state(ppd, HLS_UP_ACTIVE); + if (ret == 0) + send_idle_sma(dd, SMA_IDLE_ACTIVE); + } else { + pr_warn("SubnSet(OPA_PortInfo) Cannot move to Active with NeighborNormal 0\n"); + smp->status |= IB_SMP_INVALID_FIELD; + } + break; + default: + pr_warn("SubnSet(OPA_PortInfo) invalid logical state 0x%x\n", + logical_state); + smp->status |= IB_SMP_INVALID_FIELD; + } + + return 0; +} + +/** + * subn_set_opa_portinfo - set port information + * @smp: the incoming SM packet + * @ibdev: the infiniband device + * @port: the port on the device + * + */ +static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct opa_port_info *pi = (struct opa_port_info *)data; + struct ib_event event; + struct hfi1_devdata *dd; + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + u8 clientrereg; + unsigned long flags; + u32 smlid, opa_lid; /* tmp vars to hold LID values */ + u16 lid; + u8 ls_old, ls_new, ps_new; + u8 vls; + u8 msl; + u8 crc_enabled; + u16 lse, lwe, mtu; + u32 num_ports = OPA_AM_NPORT(am); + u32 start_of_sm_config = OPA_AM_START_SM_CFG(am); + int ret, i, invalid = 0, call_set_mtu = 0; + int call_link_downgrade_policy = 0; + + if (num_ports != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + opa_lid = be32_to_cpu(pi->lid); + if (opa_lid & 0xFFFF0000) { + pr_warn("OPA_PortInfo lid out of range: %X\n", opa_lid); + smp->status |= IB_SMP_INVALID_FIELD; + goto get_only; + } + + lid = (u16)(opa_lid & 0x0000FFFF); + + smlid = be32_to_cpu(pi->sm_lid); + if (smlid & 0xFFFF0000) { + pr_warn("OPA_PortInfo SM lid out of range: %X\n", smlid); + smp->status |= IB_SMP_INVALID_FIELD; + goto get_only; + } + smlid &= 0x0000FFFF; + + clientrereg = (pi->clientrereg_subnettimeout & + OPA_PI_MASK_CLIENT_REREGISTER); + + dd = dd_from_ibdev(ibdev); + /* IB numbers ports from 1, hw from 0 */ + ppd = dd->pport + (port - 1); + ibp = &ppd->ibport_data; + event.device = ibdev; + event.element.port_num = port; + + ls_old = driver_lstate(ppd); + + ibp->mkey = pi->mkey; + ibp->gid_prefix = pi->subnet_prefix; + ibp->mkey_lease_period = be16_to_cpu(pi->mkey_lease_period); + + /* Must be a valid unicast LID address. */ + if ((lid == 0 && ls_old > IB_PORT_INIT) || + lid >= HFI1_MULTICAST_LID_BASE) { + smp->status |= IB_SMP_INVALID_FIELD; + pr_warn("SubnSet(OPA_PortInfo) lid invalid 0x%x\n", + lid); + } else if (ppd->lid != lid || + ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC)) { + if (ppd->lid != lid) + hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LID_CHANGE_BIT); + if (ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC)) + hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LMC_CHANGE_BIT); + hfi1_set_lid(ppd, lid, pi->mkeyprotect_lmc & OPA_PI_MASK_LMC); + event.event = IB_EVENT_LID_CHANGE; + ib_dispatch_event(&event); + } + + msl = pi->smsl & OPA_PI_MASK_SMSL; + if (pi->partenforce_filterraw & OPA_PI_MASK_LINKINIT_REASON) + ppd->linkinit_reason = + (pi->partenforce_filterraw & + OPA_PI_MASK_LINKINIT_REASON); + /* enable/disable SW pkey checking as per FM control */ + if (pi->partenforce_filterraw & OPA_PI_MASK_PARTITION_ENFORCE_IN) + ppd->part_enforce |= HFI1_PART_ENFORCE_IN; + else + ppd->part_enforce &= ~HFI1_PART_ENFORCE_IN; + + if (pi->partenforce_filterraw & OPA_PI_MASK_PARTITION_ENFORCE_OUT) + ppd->part_enforce |= HFI1_PART_ENFORCE_OUT; + else + ppd->part_enforce &= ~HFI1_PART_ENFORCE_OUT; + + /* Must be a valid unicast LID address. */ + if ((smlid == 0 && ls_old > IB_PORT_INIT) || + smlid >= HFI1_MULTICAST_LID_BASE) { + smp->status |= IB_SMP_INVALID_FIELD; + pr_warn("SubnSet(OPA_PortInfo) smlid invalid 0x%x\n", smlid); + } else if (smlid != ibp->sm_lid || msl != ibp->sm_sl) { + pr_warn("SubnSet(OPA_PortInfo) smlid 0x%x\n", smlid); + spin_lock_irqsave(&ibp->lock, flags); + if (ibp->sm_ah) { + if (smlid != ibp->sm_lid) + ibp->sm_ah->attr.dlid = smlid; + if (msl != ibp->sm_sl) + ibp->sm_ah->attr.sl = msl; + } + spin_unlock_irqrestore(&ibp->lock, flags); + if (smlid != ibp->sm_lid) + ibp->sm_lid = smlid; + if (msl != ibp->sm_sl) + ibp->sm_sl = msl; + event.event = IB_EVENT_SM_CHANGE; + ib_dispatch_event(&event); + } + + if (pi->link_down_reason == 0) { + ppd->local_link_down_reason.sma = 0; + ppd->local_link_down_reason.latest = 0; + } + + if (pi->neigh_link_down_reason == 0) { + ppd->neigh_link_down_reason.sma = 0; + ppd->neigh_link_down_reason.latest = 0; + } + + ppd->sm_trap_qp = be32_to_cpu(pi->sm_trap_qp); + ppd->sa_qp = be32_to_cpu(pi->sa_qp); + + ppd->port_error_action = be32_to_cpu(pi->port_error_action); + lwe = be16_to_cpu(pi->link_width.enabled); + if (lwe) { + if (lwe == OPA_LINK_WIDTH_RESET + || lwe == OPA_LINK_WIDTH_RESET_OLD) + set_link_width_enabled(ppd, ppd->link_width_supported); + else if ((lwe & ~ppd->link_width_supported) == 0) + set_link_width_enabled(ppd, lwe); + else + smp->status |= IB_SMP_INVALID_FIELD; + } + lwe = be16_to_cpu(pi->link_width_downgrade.enabled); + /* LWD.E is always applied - 0 means "disabled" */ + if (lwe == OPA_LINK_WIDTH_RESET + || lwe == OPA_LINK_WIDTH_RESET_OLD) { + set_link_width_downgrade_enabled(ppd, + ppd->link_width_downgrade_supported); + } else if ((lwe & ~ppd->link_width_downgrade_supported) == 0) { + /* only set and apply if something changed */ + if (lwe != ppd->link_width_downgrade_enabled) { + set_link_width_downgrade_enabled(ppd, lwe); + call_link_downgrade_policy = 1; + } + } else + smp->status |= IB_SMP_INVALID_FIELD; + + lse = be16_to_cpu(pi->link_speed.enabled); + if (lse) { + if (lse & be16_to_cpu(pi->link_speed.supported)) + set_link_speed_enabled(ppd, lse); + else + smp->status |= IB_SMP_INVALID_FIELD; + } + + ibp->mkeyprot = (pi->mkeyprotect_lmc & OPA_PI_MASK_MKEY_PROT_BIT) >> 6; + ibp->vl_high_limit = be16_to_cpu(pi->vl.high_limit) & 0xFF; + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_LIMIT, + ibp->vl_high_limit); + + if (ppd->vls_supported/2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) || + ppd->vls_supported > ARRAY_SIZE(dd->vld)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + for (i = 0; i < ppd->vls_supported; i++) { + if ((i % 2) == 0) + mtu = enum_to_mtu((pi->neigh_mtu.pvlx_to_mtu[i/2] >> 4) + & 0xF); + else + mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[i/2] & 0xF); + if (mtu == 0xffff) { + pr_warn("SubnSet(OPA_PortInfo) mtu invalid %d (0x%x)\n", + mtu, + (pi->neigh_mtu.pvlx_to_mtu[0] >> 4) & 0xF); + smp->status |= IB_SMP_INVALID_FIELD; + mtu = hfi1_max_mtu; /* use a valid MTU */ + } + if (dd->vld[i].mtu != mtu) { + dd_dev_info(dd, + "MTU change on vl %d from %d to %d\n", + i, dd->vld[i].mtu, mtu); + dd->vld[i].mtu = mtu; + call_set_mtu++; + } + } + /* As per OPAV1 spec: VL15 must support and be configured + * for operation with a 2048 or larger MTU. + */ + mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[15/2] & 0xF); + if (mtu < 2048 || mtu == 0xffff) + mtu = 2048; + if (dd->vld[15].mtu != mtu) { + dd_dev_info(dd, + "MTU change on vl 15 from %d to %d\n", + dd->vld[15].mtu, mtu); + dd->vld[15].mtu = mtu; + call_set_mtu++; + } + if (call_set_mtu) + set_mtu(ppd); + + /* Set operational VLs */ + vls = pi->operational_vls & OPA_PI_MASK_OPERATIONAL_VL; + if (vls) { + if (vls > ppd->vls_supported) { + pr_warn("SubnSet(OPA_PortInfo) VL's supported invalid %d\n", + pi->operational_vls); + smp->status |= IB_SMP_INVALID_FIELD; + } else { + if (hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS, + vls) == -EINVAL) + smp->status |= IB_SMP_INVALID_FIELD; + } + } + + if (pi->mkey_violations == 0) + ibp->mkey_violations = 0; + + if (pi->pkey_violations == 0) + ibp->pkey_violations = 0; + + if (pi->qkey_violations == 0) + ibp->qkey_violations = 0; + + ibp->subnet_timeout = + pi->clientrereg_subnettimeout & OPA_PI_MASK_SUBNET_TIMEOUT; + + crc_enabled = be16_to_cpu(pi->port_ltp_crc_mode); + crc_enabled >>= 4; + crc_enabled &= 0xf; + + if (crc_enabled != 0) + ppd->port_crc_mode_enabled = port_ltp_to_cap(crc_enabled); + + ppd->is_active_optimize_enabled = + !!(be16_to_cpu(pi->port_mode) + & OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE); + + ls_new = pi->port_states.portphysstate_portstate & + OPA_PI_MASK_PORT_STATE; + ps_new = (pi->port_states.portphysstate_portstate & + OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4; + + if (ls_old == IB_PORT_INIT) { + if (start_of_sm_config) { + if (ls_new == ls_old || (ls_new == IB_PORT_ARMED)) + ppd->is_sm_config_started = 1; + } else if (ls_new == IB_PORT_ARMED) { + if (ppd->is_sm_config_started == 0) + invalid = 1; + } + } + + /* Handle CLIENT_REREGISTER event b/c SM asked us for it */ + if (clientrereg) { + event.event = IB_EVENT_CLIENT_REREGISTER; + ib_dispatch_event(&event); + } + + /* + * Do the port state change now that the other link parameters + * have been set. + * Changing the port physical state only makes sense if the link + * is down or is being set to down. + */ + + ret = set_port_states(ppd, smp, ls_new, ps_new, invalid); + if (ret) + return ret; + + ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len); + + /* restore re-reg bit per o14-12.2.1 */ + pi->clientrereg_subnettimeout |= clientrereg; + + /* + * Apply the new link downgrade policy. This may result in a link + * bounce. Do this after everything else so things are settled. + * Possible problem: if setting the port state above fails, then + * the policy change is not applied. + */ + if (call_link_downgrade_policy) + apply_link_downgrade_policy(ppd, 0); + + return ret; + +get_only: + return __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len); +} + +/** + * set_pkeys - set the PKEY table for ctxt 0 + * @dd: the hfi1_ib device + * @port: the IB port number + * @pkeys: the PKEY table + */ +static int set_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys) +{ + struct hfi1_pportdata *ppd; + int i; + int changed = 0; + int update_includes_mgmt_partition = 0; + + /* + * IB port one/two always maps to context zero/one, + * always a kernel context, no locking needed + * If we get here with ppd setup, no need to check + * that rcd is valid. + */ + ppd = dd->pport + (port - 1); + /* + * If the update does not include the management pkey, don't do it. + */ + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + if (pkeys[i] == LIM_MGMT_P_KEY) { + update_includes_mgmt_partition = 1; + break; + } + } + + if (!update_includes_mgmt_partition) + return 1; + + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + u16 key = pkeys[i]; + u16 okey = ppd->pkeys[i]; + + if (key == okey) + continue; + /* + * The SM gives us the complete PKey table. We have + * to ensure that we put the PKeys in the matching + * slots. + */ + ppd->pkeys[i] = key; + changed = 1; + } + + if (changed) { + struct ib_event event; + + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0); + + event.event = IB_EVENT_PKEY_CHANGE; + event.device = &dd->verbs_dev.ibdev; + event.element.port_num = port; + ib_dispatch_event(&event); + } + return 0; +} + +static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u32 n_blocks_sent = OPA_AM_NBLK(am); + u32 start_block = am & 0x7ff; + u16 *p = (u16 *) data; + __be16 *q = (__be16 *)data; + int i; + u16 n_blocks_avail; + unsigned npkeys = hfi1_get_npkeys(dd); + + if (n_blocks_sent == 0) { + pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n", + port, start_block, n_blocks_sent); + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + n_blocks_avail = (u16)(npkeys/OPA_PARTITION_TABLE_BLK_SIZE) + 1; + + if (start_block + n_blocks_sent > n_blocks_avail || + n_blocks_sent > OPA_NUM_PKEY_BLOCKS_PER_SMP) { + pr_warn("OPA Set PKey AM Invalid : s 0x%x; req 0x%x; avail 0x%x; blk/smp 0x%lx\n", + start_block, n_blocks_sent, n_blocks_avail, + OPA_NUM_PKEY_BLOCKS_PER_SMP); + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < n_blocks_sent * OPA_PARTITION_TABLE_BLK_SIZE; i++) + p[i] = be16_to_cpu(q[i]); + + if (start_block == 0 && set_pkeys(dd, port, p) != 0) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + return __subn_get_opa_pkeytable(smp, am, data, ibdev, port, resp_len); +} + +static int get_sc2vlt_tables(struct hfi1_devdata *dd, void *data) +{ + u64 *val = (u64 *)data; + + *val++ = read_csr(dd, SEND_SC2VLT0); + *val++ = read_csr(dd, SEND_SC2VLT1); + *val++ = read_csr(dd, SEND_SC2VLT2); + *val++ = read_csr(dd, SEND_SC2VLT3); + return 0; +} + +#define ILLEGAL_VL 12 +/* + * filter_sc2vlt changes mappings to VL15 to ILLEGAL_VL (except + * for SC15, which must map to VL15). If we don't remap things this + * way it is possible for VL15 counters to increment when we try to + * send on a SC which is mapped to an invalid VL. + */ +static void filter_sc2vlt(void *data) +{ + int i; + u8 *pd = (u8 *)data; + + for (i = 0; i < OPA_MAX_SCS; i++) { + if (i == 15) + continue; + if ((pd[i] & 0x1f) == 0xf) + pd[i] = ILLEGAL_VL; + } +} + +static int set_sc2vlt_tables(struct hfi1_devdata *dd, void *data) +{ + u64 *val = (u64 *)data; + + filter_sc2vlt(data); + + write_csr(dd, SEND_SC2VLT0, *val++); + write_csr(dd, SEND_SC2VLT1, *val++); + write_csr(dd, SEND_SC2VLT2, *val++); + write_csr(dd, SEND_SC2VLT3, *val++); + write_seqlock_irq(&dd->sc2vl_lock); + memcpy(dd->sc2vl, (u64 *)data, sizeof(dd->sc2vl)); + write_sequnlock_irq(&dd->sc2vl_lock); + return 0; +} + +static int __subn_get_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, port); + u8 *p = (u8 *)data; + size_t size = ARRAY_SIZE(ibp->sl_to_sc); /* == 32 */ + unsigned i; + + if (am) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < ARRAY_SIZE(ibp->sl_to_sc); i++) + *p++ = ibp->sl_to_sc[i]; + + if (resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, port); + u8 *p = (u8 *)data; + int i; + + if (am) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < ARRAY_SIZE(ibp->sl_to_sc); i++) + ibp->sl_to_sc[i] = *p++; + + return __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, resp_len); +} + +static int __subn_get_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, port); + u8 *p = (u8 *)data; + size_t size = ARRAY_SIZE(ibp->sc_to_sl); /* == 32 */ + unsigned i; + + if (am) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++) + *p++ = ibp->sc_to_sl[i]; + + if (resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, port); + u8 *p = (u8 *)data; + int i; + + if (am) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++) + ibp->sc_to_sl[i] = *p++; + + return __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port, resp_len); +} + +static int __subn_get_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + u32 n_blocks = OPA_AM_NBLK(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + void *vp = (void *) data; + size_t size = 4 * sizeof(u64); + + if (n_blocks != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + get_sc2vlt_tables(dd, vp); + + if (resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + u32 n_blocks = OPA_AM_NBLK(am); + int async_update = OPA_AM_ASYNC(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + void *vp = (void *) data; + struct hfi1_pportdata *ppd; + int lstate; + + if (n_blocks != 1 || async_update) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + /* IB numbers ports from 1, hw from 0 */ + ppd = dd->pport + (port - 1); + lstate = driver_lstate(ppd); + /* it's known that async_update is 0 by this point, but include + * the explicit check for clarity */ + if (!async_update && + (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + set_sc2vlt_tables(dd, vp); + + return __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port, resp_len); +} + +static int __subn_get_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + u32 n_blocks = OPA_AM_NPORT(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_pportdata *ppd; + void *vp = (void *) data; + int size; + + if (n_blocks != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ppd = dd->pport + (port - 1); + + size = fm_get_table(ppd, FM_TBL_SC2VLNT, vp); + + if (resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + u32 n_blocks = OPA_AM_NPORT(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_pportdata *ppd; + void *vp = (void *) data; + int lstate; + + if (n_blocks != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + /* IB numbers ports from 1, hw from 0 */ + ppd = dd->pport + (port - 1); + lstate = driver_lstate(ppd); + if (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ppd = dd->pport + (port - 1); + + fm_set_table(ppd, FM_TBL_SC2VLNT, vp); + + return __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port, + resp_len); +} + +static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + u32 nports = OPA_AM_NPORT(am); + u32 start_of_sm_config = OPA_AM_START_SM_CFG(am); + u32 lstate; + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct opa_port_state_info *psi = (struct opa_port_state_info *) data; + + if (nports != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ibp = to_iport(ibdev, port); + ppd = ppd_from_ibp(ibp); + + lstate = driver_lstate(ppd); + + if (start_of_sm_config && (lstate == IB_PORT_INIT)) + ppd->is_sm_config_started = 1; + +#if PI_LED_ENABLE_SUP + psi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4; + psi->port_states.ledenable_offlinereason |= + ppd->is_sm_config_started << 5; + psi->port_states.ledenable_offlinereason |= + ppd->offline_disabled_reason & OPA_PI_MASK_OFFLINE_REASON; +#else + psi->port_states.offline_reason = ppd->neighbor_normal << 4; + psi->port_states.offline_reason |= ppd->is_sm_config_started << 5; + psi->port_states.offline_reason |= ppd->offline_disabled_reason & + OPA_PI_MASK_OFFLINE_REASON; +#endif /* PI_LED_ENABLE_SUP */ + + psi->port_states.portphysstate_portstate = + (hfi1_ibphys_portstate(ppd) << 4) | (lstate & 0xf); + psi->link_width_downgrade_tx_active = + cpu_to_be16(ppd->link_width_downgrade_tx_active); + psi->link_width_downgrade_rx_active = + cpu_to_be16(ppd->link_width_downgrade_rx_active); + if (resp_len) + *resp_len += sizeof(struct opa_port_state_info); + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + u32 nports = OPA_AM_NPORT(am); + u32 start_of_sm_config = OPA_AM_START_SM_CFG(am); + u32 ls_old; + u8 ls_new, ps_new; + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct opa_port_state_info *psi = (struct opa_port_state_info *) data; + int ret, invalid = 0; + + if (nports != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ibp = to_iport(ibdev, port); + ppd = ppd_from_ibp(ibp); + + ls_old = driver_lstate(ppd); + + ls_new = port_states_to_logical_state(&psi->port_states); + ps_new = port_states_to_phys_state(&psi->port_states); + + if (ls_old == IB_PORT_INIT) { + if (start_of_sm_config) { + if (ls_new == ls_old || (ls_new == IB_PORT_ARMED)) + ppd->is_sm_config_started = 1; + } else if (ls_new == IB_PORT_ARMED) { + if (ppd->is_sm_config_started == 0) + invalid = 1; + } + } + + ret = set_port_states(ppd, smp, ls_new, ps_new, invalid); + if (ret) + return ret; + + if (invalid) + smp->status |= IB_SMP_INVALID_FIELD; + + return __subn_get_opa_psi(smp, am, data, ibdev, port, resp_len); +} + +static int __subn_get_opa_cable_info(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u32 addr = OPA_AM_CI_ADDR(am); + u32 len = OPA_AM_CI_LEN(am) + 1; + int ret; + +#define __CI_PAGE_SIZE (1 << 7) /* 128 bytes */ +#define __CI_PAGE_MASK ~(__CI_PAGE_SIZE - 1) +#define __CI_PAGE_NUM(a) ((a) & __CI_PAGE_MASK) + + /* check that addr is within spec, and + * addr and (addr + len - 1) are on the same "page" */ + if (addr >= 4096 || + (__CI_PAGE_NUM(addr) != __CI_PAGE_NUM(addr + len - 1))) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ret = get_cable_info(dd, port, addr, len, data); + + if (ret == -ENODEV) { + smp->status |= IB_SMP_UNSUP_METH_ATTR; + return reply((struct ib_mad_hdr *)smp); + } + + /* The address range for the CableInfo SMA query is wider than the + * memory available on the QSFP cable. We want to return a valid + * response, albeit zeroed out, for address ranges beyond available + * memory but that are within the CableInfo query spec + */ + if (ret < 0 && ret != -ERANGE) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + if (resp_len) + *resp_len += len; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_get_opa_bct(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, u32 *resp_len) +{ + u32 num_ports = OPA_AM_NPORT(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_pportdata *ppd; + struct buffer_control *p = (struct buffer_control *) data; + int size; + + if (num_ports != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ppd = dd->pport + (port - 1); + size = fm_get_table(ppd, FM_TBL_BUFFER_CONTROL, p); + trace_bct_get(dd, p); + if (resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_bct(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, u32 *resp_len) +{ + u32 num_ports = OPA_AM_NPORT(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_pportdata *ppd; + struct buffer_control *p = (struct buffer_control *) data; + + if (num_ports != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + ppd = dd->pport + (port - 1); + trace_bct_set(dd, p); + if (fm_set_table(ppd, FM_TBL_BUFFER_CONTROL, p) < 0) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + return __subn_get_opa_bct(smp, am, data, ibdev, port, resp_len); +} + +static int __subn_get_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port)); + u32 num_ports = OPA_AM_NPORT(am); + u8 section = (am & 0x00ff0000) >> 16; + u8 *p = data; + int size = 0; + + if (num_ports != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + switch (section) { + case OPA_VLARB_LOW_ELEMENTS: + size = fm_get_table(ppd, FM_TBL_VL_LOW_ARB, p); + break; + case OPA_VLARB_HIGH_ELEMENTS: + size = fm_get_table(ppd, FM_TBL_VL_HIGH_ARB, p); + break; + case OPA_VLARB_PREEMPT_ELEMENTS: + size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_ELEMS, p); + break; + case OPA_VLARB_PREEMPT_MATRIX: + size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_MATRIX, p); + break; + default: + pr_warn("OPA SubnGet(VL Arb) AM Invalid : 0x%x\n", + be32_to_cpu(smp->attr_mod)); + smp->status |= IB_SMP_INVALID_FIELD; + break; + } + + if (size > 0 && resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port)); + u32 num_ports = OPA_AM_NPORT(am); + u8 section = (am & 0x00ff0000) >> 16; + u8 *p = data; + + if (num_ports != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + switch (section) { + case OPA_VLARB_LOW_ELEMENTS: + (void) fm_set_table(ppd, FM_TBL_VL_LOW_ARB, p); + break; + case OPA_VLARB_HIGH_ELEMENTS: + (void) fm_set_table(ppd, FM_TBL_VL_HIGH_ARB, p); + break; + /* neither OPA_VLARB_PREEMPT_ELEMENTS, or OPA_VLARB_PREEMPT_MATRIX + * can be changed from the default values */ + case OPA_VLARB_PREEMPT_ELEMENTS: + /* FALLTHROUGH */ + case OPA_VLARB_PREEMPT_MATRIX: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + break; + default: + pr_warn("OPA SubnSet(VL Arb) AM Invalid : 0x%x\n", + be32_to_cpu(smp->attr_mod)); + smp->status |= IB_SMP_INVALID_FIELD; + break; + } + + return __subn_get_opa_vl_arb(smp, am, data, ibdev, port, resp_len); +} + +struct opa_pma_mad { + struct ib_mad_hdr mad_hdr; + u8 data[2024]; +} __packed; + +struct opa_class_port_info { + u8 base_version; + u8 class_version; + __be16 cap_mask; + __be32 cap_mask2_resp_time; + + u8 redirect_gid[16]; + __be32 redirect_tc_fl; + __be32 redirect_lid; + __be32 redirect_sl_qp; + __be32 redirect_qkey; + + u8 trap_gid[16]; + __be32 trap_tc_fl; + __be32 trap_lid; + __be32 trap_hl_qp; + __be32 trap_qkey; + + __be16 trap_pkey; + __be16 redirect_pkey; + + u8 trap_sl_rsvd; + u8 reserved[3]; +} __packed; + +struct opa_port_status_req { + __u8 port_num; + __u8 reserved[3]; + __be32 vl_select_mask; +}; + +#define VL_MASK_ALL 0x000080ff + +struct opa_port_status_rsp { + __u8 port_num; + __u8 reserved[3]; + __be32 vl_select_mask; + + /* Data counters */ + __be64 port_xmit_data; + __be64 port_rcv_data; + __be64 port_xmit_pkts; + __be64 port_rcv_pkts; + __be64 port_multicast_xmit_pkts; + __be64 port_multicast_rcv_pkts; + __be64 port_xmit_wait; + __be64 sw_port_congestion; + __be64 port_rcv_fecn; + __be64 port_rcv_becn; + __be64 port_xmit_time_cong; + __be64 port_xmit_wasted_bw; + __be64 port_xmit_wait_data; + __be64 port_rcv_bubble; + __be64 port_mark_fecn; + /* Error counters */ + __be64 port_rcv_constraint_errors; + __be64 port_rcv_switch_relay_errors; + __be64 port_xmit_discards; + __be64 port_xmit_constraint_errors; + __be64 port_rcv_remote_physical_errors; + __be64 local_link_integrity_errors; + __be64 port_rcv_errors; + __be64 excessive_buffer_overruns; + __be64 fm_config_errors; + __be32 link_error_recovery; + __be32 link_downed; + u8 uncorrectable_errors; + + u8 link_quality_indicator; /* 5res, 3bit */ + u8 res2[6]; + struct _vls_pctrs { + /* per-VL Data counters */ + __be64 port_vl_xmit_data; + __be64 port_vl_rcv_data; + __be64 port_vl_xmit_pkts; + __be64 port_vl_rcv_pkts; + __be64 port_vl_xmit_wait; + __be64 sw_port_vl_congestion; + __be64 port_vl_rcv_fecn; + __be64 port_vl_rcv_becn; + __be64 port_xmit_time_cong; + __be64 port_vl_xmit_wasted_bw; + __be64 port_vl_xmit_wait_data; + __be64 port_vl_rcv_bubble; + __be64 port_vl_mark_fecn; + __be64 port_vl_xmit_discards; + } vls[0]; /* real array size defined by # bits set in vl_select_mask */ +}; + +enum counter_selects { + CS_PORT_XMIT_DATA = (1 << 31), + CS_PORT_RCV_DATA = (1 << 30), + CS_PORT_XMIT_PKTS = (1 << 29), + CS_PORT_RCV_PKTS = (1 << 28), + CS_PORT_MCAST_XMIT_PKTS = (1 << 27), + CS_PORT_MCAST_RCV_PKTS = (1 << 26), + CS_PORT_XMIT_WAIT = (1 << 25), + CS_SW_PORT_CONGESTION = (1 << 24), + CS_PORT_RCV_FECN = (1 << 23), + CS_PORT_RCV_BECN = (1 << 22), + CS_PORT_XMIT_TIME_CONG = (1 << 21), + CS_PORT_XMIT_WASTED_BW = (1 << 20), + CS_PORT_XMIT_WAIT_DATA = (1 << 19), + CS_PORT_RCV_BUBBLE = (1 << 18), + CS_PORT_MARK_FECN = (1 << 17), + CS_PORT_RCV_CONSTRAINT_ERRORS = (1 << 16), + CS_PORT_RCV_SWITCH_RELAY_ERRORS = (1 << 15), + CS_PORT_XMIT_DISCARDS = (1 << 14), + CS_PORT_XMIT_CONSTRAINT_ERRORS = (1 << 13), + CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS = (1 << 12), + CS_LOCAL_LINK_INTEGRITY_ERRORS = (1 << 11), + CS_PORT_RCV_ERRORS = (1 << 10), + CS_EXCESSIVE_BUFFER_OVERRUNS = (1 << 9), + CS_FM_CONFIG_ERRORS = (1 << 8), + CS_LINK_ERROR_RECOVERY = (1 << 7), + CS_LINK_DOWNED = (1 << 6), + CS_UNCORRECTABLE_ERRORS = (1 << 5), +}; + +struct opa_clear_port_status { + __be64 port_select_mask[4]; + __be32 counter_select_mask; +}; + +struct opa_aggregate { + __be16 attr_id; + __be16 err_reqlength; /* 1 bit, 8 res, 7 bit */ + __be32 attr_mod; + u8 data[0]; +}; + +/* Request contains first two fields, response contains those plus the rest */ +struct opa_port_data_counters_msg { + __be64 port_select_mask[4]; + __be32 vl_select_mask; + + /* Response fields follow */ + __be32 reserved1; + struct _port_dctrs { + u8 port_number; + u8 reserved2[3]; + __be32 link_quality_indicator; /* 29res, 3bit */ + + /* Data counters */ + __be64 port_xmit_data; + __be64 port_rcv_data; + __be64 port_xmit_pkts; + __be64 port_rcv_pkts; + __be64 port_multicast_xmit_pkts; + __be64 port_multicast_rcv_pkts; + __be64 port_xmit_wait; + __be64 sw_port_congestion; + __be64 port_rcv_fecn; + __be64 port_rcv_becn; + __be64 port_xmit_time_cong; + __be64 port_xmit_wasted_bw; + __be64 port_xmit_wait_data; + __be64 port_rcv_bubble; + __be64 port_mark_fecn; + + __be64 port_error_counter_summary; + /* Sum of error counts/port */ + + struct _vls_dctrs { + /* per-VL Data counters */ + __be64 port_vl_xmit_data; + __be64 port_vl_rcv_data; + __be64 port_vl_xmit_pkts; + __be64 port_vl_rcv_pkts; + __be64 port_vl_xmit_wait; + __be64 sw_port_vl_congestion; + __be64 port_vl_rcv_fecn; + __be64 port_vl_rcv_becn; + __be64 port_xmit_time_cong; + __be64 port_vl_xmit_wasted_bw; + __be64 port_vl_xmit_wait_data; + __be64 port_vl_rcv_bubble; + __be64 port_vl_mark_fecn; + } vls[0]; + /* array size defined by #bits set in vl_select_mask*/ + } port[1]; /* array size defined by #ports in attribute modifier */ +}; + +struct opa_port_error_counters64_msg { + /* Request contains first two fields, response contains the + * whole magilla */ + __be64 port_select_mask[4]; + __be32 vl_select_mask; + + /* Response-only fields follow */ + __be32 reserved1; + struct _port_ectrs { + u8 port_number; + u8 reserved2[7]; + __be64 port_rcv_constraint_errors; + __be64 port_rcv_switch_relay_errors; + __be64 port_xmit_discards; + __be64 port_xmit_constraint_errors; + __be64 port_rcv_remote_physical_errors; + __be64 local_link_integrity_errors; + __be64 port_rcv_errors; + __be64 excessive_buffer_overruns; + __be64 fm_config_errors; + __be32 link_error_recovery; + __be32 link_downed; + u8 uncorrectable_errors; + u8 reserved3[7]; + struct _vls_ectrs { + __be64 port_vl_xmit_discards; + } vls[0]; + /* array size defined by #bits set in vl_select_mask */ + } port[1]; /* array size defined by #ports in attribute modifier */ +}; + +struct opa_port_error_info_msg { + __be64 port_select_mask[4]; + __be32 error_info_select_mask; + __be32 reserved1; + struct _port_ei { + + u8 port_number; + u8 reserved2[7]; + + /* PortRcvErrorInfo */ + struct { + u8 status_and_code; + union { + u8 raw[17]; + struct { + /* EI1to12 format */ + u8 packet_flit1[8]; + u8 packet_flit2[8]; + u8 remaining_flit_bits12; + } ei1to12; + struct { + u8 packet_bytes[8]; + u8 remaining_flit_bits; + } ei13; + } ei; + u8 reserved3[6]; + } __packed port_rcv_ei; + + /* ExcessiveBufferOverrunInfo */ + struct { + u8 status_and_sc; + u8 reserved4[7]; + } __packed excessive_buffer_overrun_ei; + + /* PortXmitConstraintErrorInfo */ + struct { + u8 status; + u8 reserved5; + __be16 pkey; + __be32 slid; + } __packed port_xmit_constraint_ei; + + /* PortRcvConstraintErrorInfo */ + struct { + u8 status; + u8 reserved6; + __be16 pkey; + __be32 slid; + } __packed port_rcv_constraint_ei; + + /* PortRcvSwitchRelayErrorInfo */ + struct { + u8 status_and_code; + u8 reserved7[3]; + __u32 error_info; + } __packed port_rcv_switch_relay_ei; + + /* UncorrectableErrorInfo */ + struct { + u8 status_and_code; + u8 reserved8; + } __packed uncorrectable_ei; + + /* FMConfigErrorInfo */ + struct { + u8 status_and_code; + u8 error_info; + } __packed fm_config_ei; + __u32 reserved9; + } port[1]; /* actual array size defined by #ports in attr modifier */ +}; + +/* opa_port_error_info_msg error_info_select_mask bit definitions */ +enum error_info_selects { + ES_PORT_RCV_ERROR_INFO = (1 << 31), + ES_EXCESSIVE_BUFFER_OVERRUN_INFO = (1 << 30), + ES_PORT_XMIT_CONSTRAINT_ERROR_INFO = (1 << 29), + ES_PORT_RCV_CONSTRAINT_ERROR_INFO = (1 << 28), + ES_PORT_RCV_SWITCH_RELAY_ERROR_INFO = (1 << 27), + ES_UNCORRECTABLE_ERROR_INFO = (1 << 26), + ES_FM_CONFIG_ERROR_INFO = (1 << 25) +}; + +static int pma_get_opa_classportinfo(struct opa_pma_mad *pmp, + struct ib_device *ibdev, u32 *resp_len) +{ + struct opa_class_port_info *p = + (struct opa_class_port_info *)pmp->data; + + memset(pmp->data, 0, sizeof(pmp->data)); + + if (pmp->mad_hdr.attr_mod != 0) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + + p->base_version = OPA_MGMT_BASE_VERSION; + p->class_version = OPA_SMI_CLASS_VERSION; + /* + * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec. + */ + p->cap_mask2_resp_time = cpu_to_be32(18); + + if (resp_len) + *resp_len += sizeof(*p); + + return reply((struct ib_mad_hdr *)pmp); +} + +static void a0_portstatus(struct hfi1_pportdata *ppd, + struct opa_port_status_rsp *rsp, u32 vl_select_mask) +{ + if (!is_bx(ppd->dd)) { + unsigned long vl; + int vfi = 0; + u64 max_vl_xmit_wait = 0, tmp; + u32 vl_all_mask = VL_MASK_ALL; + u64 rcv_data, rcv_bubble; + + rcv_data = be64_to_cpu(rsp->port_rcv_data); + rcv_bubble = be64_to_cpu(rsp->port_rcv_bubble); + /* In the measured time period, calculate the total number + * of flits that were received. Subtract out one false + * rcv_bubble increment for every 32 received flits but + * don't let the number go negative. + */ + if (rcv_bubble >= (rcv_data>>5)) { + rcv_bubble -= (rcv_data>>5); + rsp->port_rcv_bubble = cpu_to_be64(rcv_bubble); + } + for_each_set_bit(vl, (unsigned long *)&(vl_select_mask), + 8 * sizeof(vl_select_mask)) { + rcv_data = be64_to_cpu(rsp->vls[vfi].port_vl_rcv_data); + rcv_bubble = + be64_to_cpu(rsp->vls[vfi].port_vl_rcv_bubble); + if (rcv_bubble >= (rcv_data>>5)) { + rcv_bubble -= (rcv_data>>5); + rsp->vls[vfi].port_vl_rcv_bubble = + cpu_to_be64(rcv_bubble); + } + vfi++; + } + + for_each_set_bit(vl, (unsigned long *)&(vl_all_mask), + 8 * sizeof(vl_all_mask)) { + tmp = read_port_cntr(ppd, C_TX_WAIT_VL, + idx_from_vl(vl)); + if (tmp > max_vl_xmit_wait) + max_vl_xmit_wait = tmp; + } + rsp->port_xmit_wait = cpu_to_be64(max_vl_xmit_wait); + } +} + + +static int pma_get_opa_portstatus(struct opa_pma_mad *pmp, + struct ib_device *ibdev, u8 port, u32 *resp_len) +{ + struct opa_port_status_req *req = + (struct opa_port_status_req *)pmp->data; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct opa_port_status_rsp *rsp; + u32 vl_select_mask = be32_to_cpu(req->vl_select_mask); + unsigned long vl; + size_t response_data_size; + u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24; + u8 port_num = req->port_num; + u8 num_vls = hweight32(vl_select_mask); + struct _vls_pctrs *vlinfo; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + int vfi; + u64 tmp, tmp2; + + response_data_size = sizeof(struct opa_port_status_rsp) + + num_vls * sizeof(struct _vls_pctrs); + if (response_data_size > sizeof(pmp->data)) { + pmp->mad_hdr.status |= OPA_PM_STATUS_REQUEST_TOO_LARGE; + return reply((struct ib_mad_hdr *)pmp); + } + + if (nports != 1 || (port_num && port_num != port) + || num_vls > OPA_MAX_VLS || (vl_select_mask & ~VL_MASK_ALL)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + memset(pmp->data, 0, sizeof(pmp->data)); + + rsp = (struct opa_port_status_rsp *)pmp->data; + if (port_num) + rsp->port_num = port_num; + else + rsp->port_num = port; + + rsp->port_rcv_constraint_errors = + cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR, + CNTR_INVALID_VL)); + + hfi1_read_link_quality(dd, &rsp->link_quality_indicator); + + rsp->vl_select_mask = cpu_to_be32(vl_select_mask); + rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS, + CNTR_INVALID_VL)); + rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS, + CNTR_INVALID_VL)); + rsp->port_rcv_bubble = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL)); + rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS, + CNTR_INVALID_VL)); + rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS, + CNTR_INVALID_VL)); + rsp->port_multicast_xmit_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS, + CNTR_INVALID_VL)); + rsp->port_multicast_rcv_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS, + CNTR_INVALID_VL)); + rsp->port_xmit_wait = + cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL)); + rsp->port_rcv_fecn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL)); + rsp->port_rcv_becn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL)); + rsp->port_xmit_discards = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD, + CNTR_INVALID_VL)); + rsp->port_xmit_constraint_errors = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, + CNTR_INVALID_VL)); + rsp->port_rcv_remote_physical_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR, + CNTR_INVALID_VL)); + tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL); + tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL); + if (tmp2 < tmp) { + /* overflow/wrapped */ + rsp->local_link_integrity_errors = cpu_to_be64(~0); + } else { + rsp->local_link_integrity_errors = cpu_to_be64(tmp2); + } + tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL); + tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, + CNTR_INVALID_VL); + if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) { + /* overflow/wrapped */ + rsp->link_error_recovery = cpu_to_be32(~0); + } else { + rsp->link_error_recovery = cpu_to_be32(tmp2); + } + rsp->port_rcv_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL)); + rsp->excessive_buffer_overruns = + cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL)); + rsp->fm_config_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR, + CNTR_INVALID_VL)); + rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN, + CNTR_INVALID_VL)); + + /* rsp->uncorrectable_errors is 8 bits wide, and it pegs at 0xff */ + tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL); + rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff; + + vlinfo = &(rsp->vls[0]); + vfi = 0; + /* The vl_select_mask has been checked above, and we know + * that it contains only entries which represent valid VLs. + * So in the for_each_set_bit() loop below, we don't need + * any additional checks for vl. + */ + for_each_set_bit(vl, (unsigned long *)&(vl_select_mask), + 8 * sizeof(vl_select_mask)) { + memset(vlinfo, 0, sizeof(*vlinfo)); + + tmp = read_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl)); + rsp->vls[vfi].port_vl_rcv_data = cpu_to_be64(tmp); + rsp->vls[vfi].port_vl_rcv_bubble = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BBL_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_rcv_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_xmit_data = + cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_xmit_pkts = + cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_xmit_wait = + cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_rcv_fecn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_rcv_becn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL, + idx_from_vl(vl))); + + vlinfo++; + vfi++; + } + + a0_portstatus(ppd, rsp, vl_select_mask); + + if (resp_len) + *resp_len += response_data_size; + + return reply((struct ib_mad_hdr *)pmp); +} + +static u64 get_error_counter_summary(struct ib_device *ibdev, u8 port) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u64 error_counter_summary = 0, tmp; + + error_counter_summary += read_port_cntr(ppd, C_SW_RCV_CSTR_ERR, + CNTR_INVALID_VL); + /* port_rcv_switch_relay_errors is 0 for HFIs */ + error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_DSCD, + CNTR_INVALID_VL); + error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, + CNTR_INVALID_VL); + error_counter_summary += read_dev_cntr(dd, C_DC_RMT_PHY_ERR, + CNTR_INVALID_VL); + error_counter_summary += read_dev_cntr(dd, C_DC_TX_REPLAY, + CNTR_INVALID_VL); + error_counter_summary += read_dev_cntr(dd, C_DC_RX_REPLAY, + CNTR_INVALID_VL); + error_counter_summary += read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, + CNTR_INVALID_VL); + error_counter_summary += read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, + CNTR_INVALID_VL); + error_counter_summary += read_dev_cntr(dd, C_DC_RCV_ERR, + CNTR_INVALID_VL); + error_counter_summary += read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL); + error_counter_summary += read_dev_cntr(dd, C_DC_FM_CFG_ERR, + CNTR_INVALID_VL); + /* ppd->link_downed is a 32-bit value */ + error_counter_summary += read_port_cntr(ppd, C_SW_LINK_DOWN, + CNTR_INVALID_VL); + tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL); + /* this is an 8-bit quantity */ + error_counter_summary += tmp < 0x100 ? (tmp & 0xff) : 0xff; + + return error_counter_summary; +} + +static void a0_datacounters(struct hfi1_devdata *dd, struct _port_dctrs *rsp, + u32 vl_select_mask) +{ + if (!is_bx(dd)) { + unsigned long vl; + int vfi = 0; + u64 rcv_data, rcv_bubble, sum_vl_xmit_wait = 0; + + rcv_data = be64_to_cpu(rsp->port_rcv_data); + rcv_bubble = be64_to_cpu(rsp->port_rcv_bubble); + /* In the measured time period, calculate the total number + * of flits that were received. Subtract out one false + * rcv_bubble increment for every 32 received flits but + * don't let the number go negative. + */ + if (rcv_bubble >= (rcv_data>>5)) { + rcv_bubble -= (rcv_data>>5); + rsp->port_rcv_bubble = cpu_to_be64(rcv_bubble); + } + for_each_set_bit(vl, (unsigned long *)&(vl_select_mask), + 8 * sizeof(vl_select_mask)) { + rcv_data = be64_to_cpu(rsp->vls[vfi].port_vl_rcv_data); + rcv_bubble = + be64_to_cpu(rsp->vls[vfi].port_vl_rcv_bubble); + if (rcv_bubble >= (rcv_data>>5)) { + rcv_bubble -= (rcv_data>>5); + rsp->vls[vfi].port_vl_rcv_bubble = + cpu_to_be64(rcv_bubble); + } + vfi++; + } + vfi = 0; + for_each_set_bit(vl, (unsigned long *)&(vl_select_mask), + 8 * sizeof(vl_select_mask)) { + u64 tmp = sum_vl_xmit_wait + + be64_to_cpu(rsp->vls[vfi++].port_vl_xmit_wait); + if (tmp < sum_vl_xmit_wait) { + /* we wrapped */ + sum_vl_xmit_wait = (u64) ~0; + break; + } + sum_vl_xmit_wait = tmp; + } + if (be64_to_cpu(rsp->port_xmit_wait) > sum_vl_xmit_wait) + rsp->port_xmit_wait = cpu_to_be64(sum_vl_xmit_wait); + } +} + +static int pma_get_opa_datacounters(struct opa_pma_mad *pmp, + struct ib_device *ibdev, u8 port, u32 *resp_len) +{ + struct opa_port_data_counters_msg *req = + (struct opa_port_data_counters_msg *)pmp->data; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct _port_dctrs *rsp; + struct _vls_dctrs *vlinfo; + size_t response_data_size; + u32 num_ports; + u8 num_pslm; + u8 lq, num_vls; + u64 port_mask; + unsigned long port_num; + unsigned long vl; + u32 vl_select_mask; + int vfi; + + num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24; + num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3])); + num_vls = hweight32(be32_to_cpu(req->vl_select_mask)); + vl_select_mask = be32_to_cpu(req->vl_select_mask); + + if (num_ports != 1 || (vl_select_mask & ~VL_MASK_ALL)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + /* Sanity check */ + response_data_size = sizeof(struct opa_port_data_counters_msg) + + num_vls * sizeof(struct _vls_dctrs); + + if (response_data_size > sizeof(pmp->data)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + /* + * The bit set in the mask needs to be consistent with the + * port the request came in on. + */ + port_mask = be64_to_cpu(req->port_select_mask[3]); + port_num = find_first_bit((unsigned long *)&port_mask, + sizeof(port_mask)); + + if ((u8)port_num != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + rsp = (struct _port_dctrs *)&(req->port[0]); + memset(rsp, 0, sizeof(*rsp)); + + rsp->port_number = port; + /* + * Note that link_quality_indicator is a 32 bit quantity in + * 'datacounters' queries (as opposed to 'portinfo' queries, + * where it's a byte). + */ + hfi1_read_link_quality(dd, &lq); + rsp->link_quality_indicator = cpu_to_be32((u32)lq); + + /* rsp->sw_port_congestion is 0 for HFIs */ + /* rsp->port_xmit_time_cong is 0 for HFIs */ + /* rsp->port_xmit_wasted_bw ??? */ + /* rsp->port_xmit_wait_data ??? */ + /* rsp->port_mark_fecn is 0 for HFIs */ + + rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS, + CNTR_INVALID_VL)); + rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS, + CNTR_INVALID_VL)); + rsp->port_rcv_bubble = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL)); + rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS, + CNTR_INVALID_VL)); + rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS, + CNTR_INVALID_VL)); + rsp->port_multicast_xmit_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS, + CNTR_INVALID_VL)); + rsp->port_multicast_rcv_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS, + CNTR_INVALID_VL)); + rsp->port_xmit_wait = + cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL)); + rsp->port_rcv_fecn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL)); + rsp->port_rcv_becn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL)); + + rsp->port_error_counter_summary = + cpu_to_be64(get_error_counter_summary(ibdev, port)); + + vlinfo = &(rsp->vls[0]); + vfi = 0; + /* The vl_select_mask has been checked above, and we know + * that it contains only entries which represent valid VLs. + * So in the for_each_set_bit() loop below, we don't need + * any additional checks for vl. + */ + for_each_set_bit(vl, (unsigned long *)&(vl_select_mask), + 8 * sizeof(req->vl_select_mask)) { + memset(vlinfo, 0, sizeof(*vlinfo)); + + rsp->vls[vfi].port_vl_xmit_data = + cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_rcv_data = + cpu_to_be64(read_dev_cntr(dd, C_DC_RX_FLIT_VL, + idx_from_vl(vl))); + rsp->vls[vfi].port_vl_rcv_bubble = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BBL_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_xmit_pkts = + cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_rcv_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_xmit_wait = + cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_rcv_fecn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL, + idx_from_vl(vl))); + rsp->vls[vfi].port_vl_rcv_becn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL, + idx_from_vl(vl))); + + /* rsp->port_vl_xmit_time_cong is 0 for HFIs */ + /* rsp->port_vl_xmit_wasted_bw ??? */ + /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ??? + * does this differ from rsp->vls[vfi].port_vl_xmit_wait */ + /*rsp->vls[vfi].port_vl_mark_fecn = + cpu_to_be64(read_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + + offset)); + */ + vlinfo++; + vfi++; + } + + a0_datacounters(dd, rsp, vl_select_mask); + + if (resp_len) + *resp_len += response_data_size; + + return reply((struct ib_mad_hdr *)pmp); +} + +static int pma_get_opa_porterrors(struct opa_pma_mad *pmp, + struct ib_device *ibdev, u8 port, u32 *resp_len) +{ + size_t response_data_size; + struct _port_ectrs *rsp; + unsigned long port_num; + struct opa_port_error_counters64_msg *req; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u32 num_ports; + u8 num_pslm; + u8 num_vls; + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct _vls_ectrs *vlinfo; + unsigned long vl; + u64 port_mask, tmp, tmp2; + u32 vl_select_mask; + int vfi; + + req = (struct opa_port_error_counters64_msg *)pmp->data; + + num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24; + + num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3])); + num_vls = hweight32(be32_to_cpu(req->vl_select_mask)); + + if (num_ports != 1 || num_ports != num_pslm) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + response_data_size = sizeof(struct opa_port_error_counters64_msg) + + num_vls * sizeof(struct _vls_ectrs); + + if (response_data_size > sizeof(pmp->data)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + /* + * The bit set in the mask needs to be consistent with the + * port the request came in on. + */ + port_mask = be64_to_cpu(req->port_select_mask[3]); + port_num = find_first_bit((unsigned long *)&port_mask, + sizeof(port_mask)); + + if ((u8)port_num != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + rsp = (struct _port_ectrs *)&(req->port[0]); + + ibp = to_iport(ibdev, port_num); + ppd = ppd_from_ibp(ibp); + + memset(rsp, 0, sizeof(*rsp)); + rsp->port_number = (u8)port_num; + + rsp->port_rcv_constraint_errors = + cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR, + CNTR_INVALID_VL)); + /* port_rcv_switch_relay_errors is 0 for HFIs */ + rsp->port_xmit_discards = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD, + CNTR_INVALID_VL)); + rsp->port_rcv_remote_physical_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR, + CNTR_INVALID_VL)); + tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL); + tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL); + if (tmp2 < tmp) { + /* overflow/wrapped */ + rsp->local_link_integrity_errors = cpu_to_be64(~0); + } else { + rsp->local_link_integrity_errors = cpu_to_be64(tmp2); + } + tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL); + tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, + CNTR_INVALID_VL); + if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) { + /* overflow/wrapped */ + rsp->link_error_recovery = cpu_to_be32(~0); + } else { + rsp->link_error_recovery = cpu_to_be32(tmp2); + } + rsp->port_xmit_constraint_errors = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, + CNTR_INVALID_VL)); + rsp->excessive_buffer_overruns = + cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL)); + rsp->fm_config_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR, + CNTR_INVALID_VL)); + rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN, + CNTR_INVALID_VL)); + tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL); + rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff; + + vlinfo = (struct _vls_ectrs *)&(rsp->vls[0]); + vfi = 0; + vl_select_mask = be32_to_cpu(req->vl_select_mask); + for_each_set_bit(vl, (unsigned long *)&(vl_select_mask), + 8 * sizeof(req->vl_select_mask)) { + memset(vlinfo, 0, sizeof(*vlinfo)); + /* vlinfo->vls[vfi].port_vl_xmit_discards ??? */ + vlinfo += 1; + vfi++; + } + + if (resp_len) + *resp_len += response_data_size; + + return reply((struct ib_mad_hdr *)pmp); +} + +static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp, + struct ib_device *ibdev, u8 port, u32 *resp_len) +{ + size_t response_data_size; + struct _port_ei *rsp; + struct opa_port_error_info_msg *req; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u64 port_mask; + u32 num_ports; + unsigned long port_num; + u8 num_pslm; + u64 reg; + + req = (struct opa_port_error_info_msg *)pmp->data; + rsp = (struct _port_ei *)&(req->port[0]); + + num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod)); + num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3])); + + memset(rsp, 0, sizeof(*rsp)); + + if (num_ports != 1 || num_ports != num_pslm) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + /* Sanity check */ + response_data_size = sizeof(struct opa_port_error_info_msg); + + if (response_data_size > sizeof(pmp->data)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + /* + * The bit set in the mask needs to be consistent with the port + * the request came in on. + */ + port_mask = be64_to_cpu(req->port_select_mask[3]); + port_num = find_first_bit((unsigned long *)&port_mask, + sizeof(port_mask)); + + if ((u8)port_num != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + /* PortRcvErrorInfo */ + rsp->port_rcv_ei.status_and_code = + dd->err_info_rcvport.status_and_code; + memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit1, + &dd->err_info_rcvport.packet_flit1, sizeof(u64)); + memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit2, + &dd->err_info_rcvport.packet_flit2, sizeof(u64)); + + /* ExcessiverBufferOverrunInfo */ + reg = read_csr(dd, RCV_ERR_INFO); + if (reg & RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK) { + /* if the RcvExcessBufferOverrun bit is set, save SC of + * first pkt that encountered an excess buffer overrun */ + u8 tmp = (u8)reg; + + tmp &= RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK; + tmp <<= 2; + rsp->excessive_buffer_overrun_ei.status_and_sc = tmp; + /* set the status bit */ + rsp->excessive_buffer_overrun_ei.status_and_sc |= 0x80; + } + + rsp->port_xmit_constraint_ei.status = + dd->err_info_xmit_constraint.status; + rsp->port_xmit_constraint_ei.pkey = + cpu_to_be16(dd->err_info_xmit_constraint.pkey); + rsp->port_xmit_constraint_ei.slid = + cpu_to_be32(dd->err_info_xmit_constraint.slid); + + rsp->port_rcv_constraint_ei.status = + dd->err_info_rcv_constraint.status; + rsp->port_rcv_constraint_ei.pkey = + cpu_to_be16(dd->err_info_rcv_constraint.pkey); + rsp->port_rcv_constraint_ei.slid = + cpu_to_be32(dd->err_info_rcv_constraint.slid); + + /* UncorrectableErrorInfo */ + rsp->uncorrectable_ei.status_and_code = dd->err_info_uncorrectable; + + /* FMConfigErrorInfo */ + rsp->fm_config_ei.status_and_code = dd->err_info_fmconfig; + + if (resp_len) + *resp_len += response_data_size; + + return reply((struct ib_mad_hdr *)pmp); +} + +static int pma_set_opa_portstatus(struct opa_pma_mad *pmp, + struct ib_device *ibdev, u8 port, u32 *resp_len) +{ + struct opa_clear_port_status *req = + (struct opa_clear_port_status *)pmp->data; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24; + u64 portn = be64_to_cpu(req->port_select_mask[3]); + u32 counter_select = be32_to_cpu(req->counter_select_mask); + u32 vl_select_mask = VL_MASK_ALL; /* clear all per-vl cnts */ + unsigned long vl; + + if ((nports != 1) || (portn != 1 << port)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + /* + * only counters returned by pma_get_opa_portstatus() are + * handled, so when pma_get_opa_portstatus() gets a fix, + * the corresponding change should be made here as well. + */ + + if (counter_select & CS_PORT_XMIT_DATA) + write_dev_cntr(dd, C_DC_XMIT_FLITS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_RCV_DATA) + write_dev_cntr(dd, C_DC_RCV_FLITS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_XMIT_PKTS) + write_dev_cntr(dd, C_DC_XMIT_PKTS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_RCV_PKTS) + write_dev_cntr(dd, C_DC_RCV_PKTS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_MCAST_XMIT_PKTS) + write_dev_cntr(dd, C_DC_MC_XMIT_PKTS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_MCAST_RCV_PKTS) + write_dev_cntr(dd, C_DC_MC_RCV_PKTS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_XMIT_WAIT) + write_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL, 0); + + /* ignore cs_sw_portCongestion for HFIs */ + + if (counter_select & CS_PORT_RCV_FECN) + write_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_RCV_BECN) + write_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL, 0); + + /* ignore cs_port_xmit_time_cong for HFIs */ + /* ignore cs_port_xmit_wasted_bw for now */ + /* ignore cs_port_xmit_wait_data for now */ + if (counter_select & CS_PORT_RCV_BUBBLE) + write_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL, 0); + + /* Only applicable for switch */ + /*if (counter_select & CS_PORT_MARK_FECN) + write_csr(dd, DCC_PRF_PORT_MARK_FECN_CNT, 0);*/ + + if (counter_select & CS_PORT_RCV_CONSTRAINT_ERRORS) + write_port_cntr(ppd, C_SW_RCV_CSTR_ERR, CNTR_INVALID_VL, 0); + + /* ignore cs_port_rcv_switch_relay_errors for HFIs */ + if (counter_select & CS_PORT_XMIT_DISCARDS) + write_port_cntr(ppd, C_SW_XMIT_DSCD, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_XMIT_CONSTRAINT_ERRORS) + write_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS) + write_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL, 0); + + if (counter_select & CS_LOCAL_LINK_INTEGRITY_ERRORS) { + write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0); + write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0); + } + + if (counter_select & CS_LINK_ERROR_RECOVERY) { + write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0); + write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, + CNTR_INVALID_VL, 0); + } + + if (counter_select & CS_PORT_RCV_ERRORS) + write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0); + + if (counter_select & CS_EXCESSIVE_BUFFER_OVERRUNS) { + write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0); + dd->rcv_ovfl_cnt = 0; + } + + if (counter_select & CS_FM_CONFIG_ERRORS) + write_dev_cntr(dd, C_DC_FM_CFG_ERR, CNTR_INVALID_VL, 0); + + if (counter_select & CS_LINK_DOWNED) + write_port_cntr(ppd, C_SW_LINK_DOWN, CNTR_INVALID_VL, 0); + + if (counter_select & CS_UNCORRECTABLE_ERRORS) + write_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL, 0); + + for_each_set_bit(vl, (unsigned long *)&(vl_select_mask), + 8 * sizeof(vl_select_mask)) { + + if (counter_select & CS_PORT_XMIT_DATA) + write_port_cntr(ppd, C_TX_FLIT_VL, idx_from_vl(vl), 0); + + if (counter_select & CS_PORT_RCV_DATA) + write_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl), 0); + + if (counter_select & CS_PORT_XMIT_PKTS) + write_port_cntr(ppd, C_TX_PKT_VL, idx_from_vl(vl), 0); + + if (counter_select & CS_PORT_RCV_PKTS) + write_dev_cntr(dd, C_DC_RX_PKT_VL, idx_from_vl(vl), 0); + + if (counter_select & CS_PORT_XMIT_WAIT) + write_port_cntr(ppd, C_TX_WAIT_VL, idx_from_vl(vl), 0); + + /* sw_port_vl_congestion is 0 for HFIs */ + if (counter_select & CS_PORT_RCV_FECN) + write_dev_cntr(dd, C_DC_RCV_FCN_VL, idx_from_vl(vl), 0); + + if (counter_select & CS_PORT_RCV_BECN) + write_dev_cntr(dd, C_DC_RCV_BCN_VL, idx_from_vl(vl), 0); + + /* port_vl_xmit_time_cong is 0 for HFIs */ + /* port_vl_xmit_wasted_bw ??? */ + /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ??? */ + if (counter_select & CS_PORT_RCV_BUBBLE) + write_dev_cntr(dd, C_DC_RCV_BBL_VL, idx_from_vl(vl), 0); + + /*if (counter_select & CS_PORT_MARK_FECN) + write_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + offset, 0); + */ + /* port_vl_xmit_discards ??? */ + } + + if (resp_len) + *resp_len += sizeof(*req); + + return reply((struct ib_mad_hdr *)pmp); +} + +static int pma_set_opa_errorinfo(struct opa_pma_mad *pmp, + struct ib_device *ibdev, u8 port, u32 *resp_len) +{ + struct _port_ei *rsp; + struct opa_port_error_info_msg *req; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u64 port_mask; + u32 num_ports; + unsigned long port_num; + u8 num_pslm; + u32 error_info_select; + + req = (struct opa_port_error_info_msg *)pmp->data; + rsp = (struct _port_ei *)&(req->port[0]); + + num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod)); + num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3])); + + memset(rsp, 0, sizeof(*rsp)); + + if (num_ports != 1 || num_ports != num_pslm) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + /* + * The bit set in the mask needs to be consistent with the port + * the request came in on. + */ + port_mask = be64_to_cpu(req->port_select_mask[3]); + port_num = find_first_bit((unsigned long *)&port_mask, + sizeof(port_mask)); + + if ((u8)port_num != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + error_info_select = be32_to_cpu(req->error_info_select_mask); + + /* PortRcvErrorInfo */ + if (error_info_select & ES_PORT_RCV_ERROR_INFO) + /* turn off status bit */ + dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK; + + /* ExcessiverBufferOverrunInfo */ + if (error_info_select & ES_EXCESSIVE_BUFFER_OVERRUN_INFO) + /* status bit is essentially kept in the h/w - bit 5 of + * RCV_ERR_INFO */ + write_csr(dd, RCV_ERR_INFO, + RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK); + + if (error_info_select & ES_PORT_XMIT_CONSTRAINT_ERROR_INFO) + dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK; + + if (error_info_select & ES_PORT_RCV_CONSTRAINT_ERROR_INFO) + dd->err_info_rcv_constraint.status &= ~OPA_EI_STATUS_SMASK; + + /* UncorrectableErrorInfo */ + if (error_info_select & ES_UNCORRECTABLE_ERROR_INFO) + /* turn off status bit */ + dd->err_info_uncorrectable &= ~OPA_EI_STATUS_SMASK; + + /* FMConfigErrorInfo */ + if (error_info_select & ES_FM_CONFIG_ERROR_INFO) + /* turn off status bit */ + dd->err_info_fmconfig &= ~OPA_EI_STATUS_SMASK; + + if (resp_len) + *resp_len += sizeof(*req); + + return reply((struct ib_mad_hdr *)pmp); +} + +struct opa_congestion_info_attr { + __be16 congestion_info; + u8 control_table_cap; /* Multiple of 64 entry unit CCTs */ + u8 congestion_log_length; +} __packed; + +static int __subn_get_opa_cong_info(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct opa_congestion_info_attr *p = + (struct opa_congestion_info_attr *)data; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + p->congestion_info = 0; + p->control_table_cap = ppd->cc_max_table_entries; + p->congestion_log_length = OPA_CONG_LOG_ELEMS; + + if (resp_len) + *resp_len += sizeof(*p); + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_get_opa_cong_setting(struct opa_smp *smp, u32 am, + u8 *data, + struct ib_device *ibdev, + u8 port, u32 *resp_len) +{ + int i; + struct opa_congestion_setting_attr *p = + (struct opa_congestion_setting_attr *) data; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct opa_congestion_setting_entry_shadow *entries; + struct cc_state *cc_state; + + rcu_read_lock(); + + cc_state = get_cc_state(ppd); + + if (cc_state == NULL) { + rcu_read_unlock(); + return reply((struct ib_mad_hdr *)smp); + } + + entries = cc_state->cong_setting.entries; + p->port_control = cpu_to_be16(cc_state->cong_setting.port_control); + p->control_map = cpu_to_be32(cc_state->cong_setting.control_map); + for (i = 0; i < OPA_MAX_SLS; i++) { + p->entries[i].ccti_increase = entries[i].ccti_increase; + p->entries[i].ccti_timer = cpu_to_be16(entries[i].ccti_timer); + p->entries[i].trigger_threshold = + entries[i].trigger_threshold; + p->entries[i].ccti_min = entries[i].ccti_min; + } + + rcu_read_unlock(); + + if (resp_len) + *resp_len += sizeof(*p); + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct opa_congestion_setting_attr *p = + (struct opa_congestion_setting_attr *) data; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct opa_congestion_setting_entry_shadow *entries; + int i; + + ppd->cc_sl_control_map = be32_to_cpu(p->control_map); + + entries = ppd->congestion_entries; + for (i = 0; i < OPA_MAX_SLS; i++) { + entries[i].ccti_increase = p->entries[i].ccti_increase; + entries[i].ccti_timer = be16_to_cpu(p->entries[i].ccti_timer); + entries[i].trigger_threshold = + p->entries[i].trigger_threshold; + entries[i].ccti_min = p->entries[i].ccti_min; + } + + return __subn_get_opa_cong_setting(smp, am, data, ibdev, port, + resp_len); +} + +static int __subn_get_opa_hfi1_cong_log(struct opa_smp *smp, u32 am, + u8 *data, struct ib_device *ibdev, + u8 port, u32 *resp_len) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct opa_hfi1_cong_log *cong_log = (struct opa_hfi1_cong_log *)data; + s64 ts; + int i; + + if (am != 0) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + spin_lock(&ppd->cc_log_lock); + + cong_log->log_type = OPA_CC_LOG_TYPE_HFI; + cong_log->congestion_flags = 0; + cong_log->threshold_event_counter = + cpu_to_be16(ppd->threshold_event_counter); + memcpy(cong_log->threshold_cong_event_map, + ppd->threshold_cong_event_map, + sizeof(cong_log->threshold_cong_event_map)); + /* keep timestamp in units of 1.024 usec */ + ts = ktime_to_ns(ktime_get()) / 1024; + cong_log->current_time_stamp = cpu_to_be32(ts); + for (i = 0; i < OPA_CONG_LOG_ELEMS; i++) { + struct opa_hfi1_cong_log_event_internal *cce = + &ppd->cc_events[ppd->cc_mad_idx++]; + if (ppd->cc_mad_idx == OPA_CONG_LOG_ELEMS) + ppd->cc_mad_idx = 0; + /* + * Entries which are older than twice the time + * required to wrap the counter are supposed to + * be zeroed (CA10-49 IBTA, release 1.2.1, V1). + */ + if ((u64)(ts - cce->timestamp) > (2 * UINT_MAX)) + continue; + memcpy(cong_log->events[i].local_qp_cn_entry, &cce->lqpn, 3); + memcpy(cong_log->events[i].remote_qp_number_cn_entry, + &cce->rqpn, 3); + cong_log->events[i].sl_svc_type_cn_entry = + ((cce->sl & 0x1f) << 3) | (cce->svc_type & 0x7); + cong_log->events[i].remote_lid_cn_entry = + cpu_to_be32(cce->rlid); + cong_log->events[i].timestamp_cn_entry = + cpu_to_be32(cce->timestamp); + } + + /* + * Reset threshold_cong_event_map, and threshold_event_counter + * to 0 when log is read. + */ + memset(ppd->threshold_cong_event_map, 0x0, + sizeof(ppd->threshold_cong_event_map)); + ppd->threshold_event_counter = 0; + + spin_unlock(&ppd->cc_log_lock); + + if (resp_len) + *resp_len += sizeof(struct opa_hfi1_cong_log); + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct ib_cc_table_attr *cc_table_attr = + (struct ib_cc_table_attr *) data; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 start_block = OPA_AM_START_BLK(am); + u32 n_blocks = OPA_AM_NBLK(am); + struct ib_cc_table_entry_shadow *entries; + int i, j; + u32 sentry, eentry; + struct cc_state *cc_state; + + /* sanity check n_blocks, start_block */ + if (n_blocks == 0 || + start_block + n_blocks > ppd->cc_max_table_entries) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + rcu_read_lock(); + + cc_state = get_cc_state(ppd); + + if (cc_state == NULL) { + rcu_read_unlock(); + return reply((struct ib_mad_hdr *)smp); + } + + sentry = start_block * IB_CCT_ENTRIES; + eentry = sentry + (IB_CCT_ENTRIES * n_blocks); + + cc_table_attr->ccti_limit = cpu_to_be16(cc_state->cct.ccti_limit); + + entries = cc_state->cct.entries; + + /* return n_blocks, though the last block may not be full */ + for (j = 0, i = sentry; i < eentry; j++, i++) + cc_table_attr->ccti_entries[j].entry = + cpu_to_be16(entries[i].entry); + + rcu_read_unlock(); + + if (resp_len) + *resp_len += sizeof(u16)*(IB_CCT_ENTRIES * n_blocks + 1); + + return reply((struct ib_mad_hdr *)smp); +} + +void cc_state_reclaim(struct rcu_head *rcu) +{ + struct cc_state *cc_state = container_of(rcu, struct cc_state, rcu); + + kfree(cc_state); +} + +static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct ib_cc_table_attr *p = (struct ib_cc_table_attr *) data; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 start_block = OPA_AM_START_BLK(am); + u32 n_blocks = OPA_AM_NBLK(am); + struct ib_cc_table_entry_shadow *entries; + int i, j; + u32 sentry, eentry; + u16 ccti_limit; + struct cc_state *old_cc_state, *new_cc_state; + + /* sanity check n_blocks, start_block */ + if (n_blocks == 0 || + start_block + n_blocks > ppd->cc_max_table_entries) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + sentry = start_block * IB_CCT_ENTRIES; + eentry = sentry + ((n_blocks - 1) * IB_CCT_ENTRIES) + + (be16_to_cpu(p->ccti_limit)) % IB_CCT_ENTRIES + 1; + + /* sanity check ccti_limit */ + ccti_limit = be16_to_cpu(p->ccti_limit); + if (ccti_limit + 1 > eentry) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + new_cc_state = kzalloc(sizeof(*new_cc_state), GFP_KERNEL); + if (new_cc_state == NULL) + goto getit; + + spin_lock(&ppd->cc_state_lock); + + old_cc_state = get_cc_state(ppd); + + if (old_cc_state == NULL) { + spin_unlock(&ppd->cc_state_lock); + kfree(new_cc_state); + return reply((struct ib_mad_hdr *)smp); + } + + *new_cc_state = *old_cc_state; + + new_cc_state->cct.ccti_limit = ccti_limit; + + entries = ppd->ccti_entries; + ppd->total_cct_entry = ccti_limit + 1; + + for (j = 0, i = sentry; i < eentry; j++, i++) + entries[i].entry = be16_to_cpu(p->ccti_entries[j].entry); + + memcpy(new_cc_state->cct.entries, entries, + eentry * sizeof(struct ib_cc_table_entry)); + + new_cc_state->cong_setting.port_control = IB_CC_CCS_PC_SL_BASED; + new_cc_state->cong_setting.control_map = ppd->cc_sl_control_map; + memcpy(new_cc_state->cong_setting.entries, ppd->congestion_entries, + OPA_MAX_SLS * sizeof(struct opa_congestion_setting_entry)); + + rcu_assign_pointer(ppd->cc_state, new_cc_state); + + spin_unlock(&ppd->cc_state_lock); + + call_rcu(&old_cc_state->rcu, cc_state_reclaim); + +getit: + return __subn_get_opa_cc_table(smp, am, data, ibdev, port, resp_len); +} + +struct opa_led_info { + __be32 rsvd_led_mask; + __be32 rsvd; +}; + +#define OPA_LED_SHIFT 31 +#define OPA_LED_MASK (1 << OPA_LED_SHIFT) + +static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct opa_led_info *p = (struct opa_led_info *) data; + u32 nport = OPA_AM_NPORT(am); + u64 reg; + + if (nport != 1 || OPA_AM_PORTNUM(am)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + reg = read_csr(dd, DCC_CFG_LED_CNTRL); + if ((reg & DCC_CFG_LED_CNTRL_LED_CNTRL_SMASK) && + ((reg & DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SMASK) == 0xf)) + p->rsvd_led_mask = cpu_to_be32(OPA_LED_MASK); + + if (resp_len) + *resp_len += sizeof(struct opa_led_info); + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_led_info(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct opa_led_info *p = (struct opa_led_info *) data; + u32 nport = OPA_AM_NPORT(am); + int on = !!(be32_to_cpu(p->rsvd_led_mask) & OPA_LED_MASK); + + if (nport != 1 || OPA_AM_PORTNUM(am)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + setextled(dd, on); + + return __subn_get_opa_led_info(smp, am, data, ibdev, port, resp_len); +} + +static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, + u8 *data, struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + int ret; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + + switch (attr_id) { + case IB_SMP_ATTR_NODE_DESC: + ret = __subn_get_opa_nodedesc(smp, am, data, ibdev, port, + resp_len); + break; + case IB_SMP_ATTR_NODE_INFO: + ret = __subn_get_opa_nodeinfo(smp, am, data, ibdev, port, + resp_len); + break; + case IB_SMP_ATTR_PORT_INFO: + ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port, + resp_len); + break; + case IB_SMP_ATTR_PKEY_TABLE: + ret = __subn_get_opa_pkeytable(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_SL_TO_SC_MAP: + ret = __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_SC_TO_SL_MAP: + ret = __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_SC_TO_VLT_MAP: + ret = __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_SC_TO_VLNT_MAP: + ret = __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_PORT_STATE_INFO: + ret = __subn_get_opa_psi(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE: + ret = __subn_get_opa_bct(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_CABLE_INFO: + ret = __subn_get_opa_cable_info(smp, am, data, ibdev, port, + resp_len); + break; + case IB_SMP_ATTR_VL_ARB_TABLE: + ret = __subn_get_opa_vl_arb(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_CONGESTION_INFO: + ret = __subn_get_opa_cong_info(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING: + ret = __subn_get_opa_cong_setting(smp, am, data, ibdev, + port, resp_len); + break; + case OPA_ATTRIB_ID_HFI_CONGESTION_LOG: + ret = __subn_get_opa_hfi1_cong_log(smp, am, data, ibdev, + port, resp_len); + break; + case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE: + ret = __subn_get_opa_cc_table(smp, am, data, ibdev, port, + resp_len); + break; + case IB_SMP_ATTR_LED_INFO: + ret = __subn_get_opa_led_info(smp, am, data, ibdev, port, + resp_len); + break; + case IB_SMP_ATTR_SM_INFO: + if (ibp->port_cap_flags & IB_PORT_SM_DISABLED) + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + if (ibp->port_cap_flags & IB_PORT_SM) + return IB_MAD_RESULT_SUCCESS; + /* FALLTHROUGH */ + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)smp); + break; + } + return ret; +} + +static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, + u8 *data, struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + int ret; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + + switch (attr_id) { + case IB_SMP_ATTR_PORT_INFO: + ret = __subn_set_opa_portinfo(smp, am, data, ibdev, port, + resp_len); + break; + case IB_SMP_ATTR_PKEY_TABLE: + ret = __subn_set_opa_pkeytable(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_SL_TO_SC_MAP: + ret = __subn_set_opa_sl_to_sc(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_SC_TO_SL_MAP: + ret = __subn_set_opa_sc_to_sl(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_SC_TO_VLT_MAP: + ret = __subn_set_opa_sc_to_vlt(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_SC_TO_VLNT_MAP: + ret = __subn_set_opa_sc_to_vlnt(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_PORT_STATE_INFO: + ret = __subn_set_opa_psi(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE: + ret = __subn_set_opa_bct(smp, am, data, ibdev, port, + resp_len); + break; + case IB_SMP_ATTR_VL_ARB_TABLE: + ret = __subn_set_opa_vl_arb(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING: + ret = __subn_set_opa_cong_setting(smp, am, data, ibdev, + port, resp_len); + break; + case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE: + ret = __subn_set_opa_cc_table(smp, am, data, ibdev, port, + resp_len); + break; + case IB_SMP_ATTR_LED_INFO: + ret = __subn_set_opa_led_info(smp, am, data, ibdev, port, + resp_len); + break; + case IB_SMP_ATTR_SM_INFO: + if (ibp->port_cap_flags & IB_PORT_SM_DISABLED) + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + if (ibp->port_cap_flags & IB_PORT_SM) + return IB_MAD_RESULT_SUCCESS; + /* FALLTHROUGH */ + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)smp); + break; + } + return ret; +} + +static inline void set_aggr_error(struct opa_aggregate *ag) +{ + ag->err_reqlength |= cpu_to_be16(0x8000); +} + +static int subn_get_opa_aggregate(struct opa_smp *smp, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + int i; + u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff; + u8 *next_smp = opa_get_smp_data(smp); + + if (num_attr < 1 || num_attr > 117) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < num_attr; i++) { + struct opa_aggregate *agg; + size_t agg_data_len; + size_t agg_size; + u32 am; + + agg = (struct opa_aggregate *)next_smp; + agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8; + agg_size = sizeof(*agg) + agg_data_len; + am = be32_to_cpu(agg->attr_mod); + + *resp_len += agg_size; + + if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + /* zero the payload for this segment */ + memset(next_smp + sizeof(*agg), 0, agg_data_len); + + (void) subn_get_opa_sma(agg->attr_id, smp, am, agg->data, + ibdev, port, NULL); + if (smp->status & ~IB_SMP_DIRECTION) { + set_aggr_error(agg); + return reply((struct ib_mad_hdr *)smp); + } + next_smp += agg_size; + + } + + return reply((struct ib_mad_hdr *)smp); +} + +static int subn_set_opa_aggregate(struct opa_smp *smp, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + int i; + u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff; + u8 *next_smp = opa_get_smp_data(smp); + + if (num_attr < 1 || num_attr > 117) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < num_attr; i++) { + struct opa_aggregate *agg; + size_t agg_data_len; + size_t agg_size; + u32 am; + + agg = (struct opa_aggregate *)next_smp; + agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8; + agg_size = sizeof(*agg) + agg_data_len; + am = be32_to_cpu(agg->attr_mod); + + *resp_len += agg_size; + + if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + (void) subn_set_opa_sma(agg->attr_id, smp, am, agg->data, + ibdev, port, NULL); + if (smp->status & ~IB_SMP_DIRECTION) { + set_aggr_error(agg); + return reply((struct ib_mad_hdr *)smp); + } + next_smp += agg_size; + + } + + return reply((struct ib_mad_hdr *)smp); +} + +/* + * OPAv1 specifies that, on the transition to link up, these counters + * are cleared: + * PortRcvErrors [*] + * LinkErrorRecovery + * LocalLinkIntegrityErrors + * ExcessiveBufferOverruns [*] + * + * [*] Error info associated with these counters is retained, but the + * error info status is reset to 0. + */ +void clear_linkup_counters(struct hfi1_devdata *dd) +{ + /* PortRcvErrors */ + write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0); + dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK; + /* LinkErrorRecovery */ + write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0); + write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL, 0); + /* LocalLinkIntegrityErrors */ + write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0); + write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0); + /* ExcessiveBufferOverruns */ + write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0); + dd->rcv_ovfl_cnt = 0; + dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK; +} + +/* + * is_local_mad() returns 1 if 'mad' is sent from, and destined to the + * local node, 0 otherwise. + */ +static int is_local_mad(struct hfi1_ibport *ibp, const struct opa_mad *mad, + const struct ib_wc *in_wc) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + const struct opa_smp *smp = (const struct opa_smp *)mad; + + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + return (smp->hop_cnt == 0 && + smp->route.dr.dr_slid == OPA_LID_PERMISSIVE && + smp->route.dr.dr_dlid == OPA_LID_PERMISSIVE); + } + + return (in_wc->slid == ppd->lid); +} + +/* + * opa_local_smp_check() should only be called on MADs for which + * is_local_mad() returns true. It applies the SMP checks that are + * specific to SMPs which are sent from, and destined to this node. + * opa_local_smp_check() returns 0 if the SMP passes its checks, 1 + * otherwise. + * + * SMPs which arrive from other nodes are instead checked by + * opa_smp_check(). + */ +static int opa_local_smp_check(struct hfi1_ibport *ibp, + const struct ib_wc *in_wc) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u16 slid = in_wc->slid; + u16 pkey; + + if (in_wc->pkey_index >= ARRAY_SIZE(ppd->pkeys)) + return 1; + + pkey = ppd->pkeys[in_wc->pkey_index]; + /* + * We need to do the "node-local" checks specified in OPAv1, + * rev 0.90, section 9.10.26, which are: + * - pkey is 0x7fff, or 0xffff + * - Source QPN == 0 || Destination QPN == 0 + * - the MAD header's management class is either + * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE or + * IB_MGMT_CLASS_SUBN_LID_ROUTED + * - SLID != 0 + * + * However, we know (and so don't need to check again) that, + * for local SMPs, the MAD stack passes MADs with: + * - Source QPN of 0 + * - MAD mgmt_class is IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE + * - SLID is either: OPA_LID_PERMISSIVE (0xFFFFFFFF), or + * our own port's lid + * + */ + if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY) + return 0; + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; +} + +static int process_subn_opa(struct ib_device *ibdev, int mad_flags, + u8 port, const struct opa_mad *in_mad, + struct opa_mad *out_mad, + u32 *resp_len) +{ + struct opa_smp *smp = (struct opa_smp *)out_mad; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + u8 *data; + u32 am; + __be16 attr_id; + int ret; + + *out_mad = *in_mad; + data = opa_get_smp_data(smp); + + am = be32_to_cpu(smp->attr_mod); + attr_id = smp->attr_id; + if (smp->class_version != OPA_SMI_CLASS_VERSION) { + smp->status |= IB_SMP_UNSUP_VERSION; + ret = reply((struct ib_mad_hdr *)smp); + goto bail; + } + ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags, smp->mkey, + smp->route.dr.dr_slid, smp->route.dr.return_path, + smp->hop_cnt); + if (ret) { + u32 port_num = be32_to_cpu(smp->attr_mod); + + /* + * If this is a get/set portinfo, we already check the + * M_Key if the MAD is for another port and the M_Key + * is OK on the receiving port. This check is needed + * to increment the error counters when the M_Key + * fails to match on *both* ports. + */ + if (attr_id == IB_SMP_ATTR_PORT_INFO && + (smp->method == IB_MGMT_METHOD_GET || + smp->method == IB_MGMT_METHOD_SET) && + port_num && port_num <= ibdev->phys_port_cnt && + port != port_num) + (void) check_mkey(to_iport(ibdev, port_num), + (struct ib_mad_hdr *)smp, 0, + smp->mkey, smp->route.dr.dr_slid, + smp->route.dr.return_path, + smp->hop_cnt); + ret = IB_MAD_RESULT_FAILURE; + goto bail; + } + + *resp_len = opa_get_smp_header_size(smp); + + switch (smp->method) { + case IB_MGMT_METHOD_GET: + switch (attr_id) { + default: + clear_opa_smp_data(smp); + ret = subn_get_opa_sma(attr_id, smp, am, data, + ibdev, port, resp_len); + goto bail; + case OPA_ATTRIB_ID_AGGREGATE: + ret = subn_get_opa_aggregate(smp, ibdev, port, + resp_len); + goto bail; + } + case IB_MGMT_METHOD_SET: + switch (attr_id) { + default: + ret = subn_set_opa_sma(attr_id, smp, am, data, + ibdev, port, resp_len); + goto bail; + case OPA_ATTRIB_ID_AGGREGATE: + ret = subn_set_opa_aggregate(smp, ibdev, port, + resp_len); + goto bail; + } + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_REPORT: + case IB_MGMT_METHOD_REPORT_RESP: + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. + */ + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + default: + smp->status |= IB_SMP_UNSUP_METHOD; + ret = reply((struct ib_mad_hdr *)smp); + } + +bail: + return ret; +} + +static int process_subn(struct ib_device *ibdev, int mad_flags, + u8 port, const struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_smp *smp = (struct ib_smp *)out_mad; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + int ret; + + *out_mad = *in_mad; + if (smp->class_version != 1) { + smp->status |= IB_SMP_UNSUP_VERSION; + ret = reply((struct ib_mad_hdr *)smp); + goto bail; + } + + ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags, + smp->mkey, (__force __be32)smp->dr_slid, + smp->return_path, smp->hop_cnt); + if (ret) { + u32 port_num = be32_to_cpu(smp->attr_mod); + + /* + * If this is a get/set portinfo, we already check the + * M_Key if the MAD is for another port and the M_Key + * is OK on the receiving port. This check is needed + * to increment the error counters when the M_Key + * fails to match on *both* ports. + */ + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && + (smp->method == IB_MGMT_METHOD_GET || + smp->method == IB_MGMT_METHOD_SET) && + port_num && port_num <= ibdev->phys_port_cnt && + port != port_num) + (void) check_mkey(to_iport(ibdev, port_num), + (struct ib_mad_hdr *)smp, 0, + smp->mkey, + (__force __be32)smp->dr_slid, + smp->return_path, smp->hop_cnt); + ret = IB_MAD_RESULT_FAILURE; + goto bail; + } + + switch (smp->method) { + case IB_MGMT_METHOD_GET: + switch (smp->attr_id) { + case IB_SMP_ATTR_NODE_INFO: + ret = subn_get_nodeinfo(smp, ibdev, port); + goto bail; + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)smp); + goto bail; + } + } + +bail: + return ret; +} + +static int process_perf_opa(struct ib_device *ibdev, u8 port, + const struct opa_mad *in_mad, + struct opa_mad *out_mad, u32 *resp_len) +{ + struct opa_pma_mad *pmp = (struct opa_pma_mad *)out_mad; + int ret; + + *out_mad = *in_mad; + + if (pmp->mad_hdr.class_version != OPA_SMI_CLASS_VERSION) { + pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION; + return reply((struct ib_mad_hdr *)pmp); + } + + *resp_len = sizeof(pmp->mad_hdr); + + switch (pmp->mad_hdr.method) { + case IB_MGMT_METHOD_GET: + switch (pmp->mad_hdr.attr_id) { + case IB_PMA_CLASS_PORT_INFO: + ret = pma_get_opa_classportinfo(pmp, ibdev, resp_len); + goto bail; + case OPA_PM_ATTRIB_ID_PORT_STATUS: + ret = pma_get_opa_portstatus(pmp, ibdev, port, + resp_len); + goto bail; + case OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS: + ret = pma_get_opa_datacounters(pmp, ibdev, port, + resp_len); + goto bail; + case OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS: + ret = pma_get_opa_porterrors(pmp, ibdev, port, + resp_len); + goto bail; + case OPA_PM_ATTRIB_ID_ERROR_INFO: + ret = pma_get_opa_errorinfo(pmp, ibdev, port, + resp_len); + goto bail; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)pmp); + goto bail; + } + + case IB_MGMT_METHOD_SET: + switch (pmp->mad_hdr.attr_id) { + case OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS: + ret = pma_set_opa_portstatus(pmp, ibdev, port, + resp_len); + goto bail; + case OPA_PM_ATTRIB_ID_ERROR_INFO: + ret = pma_set_opa_errorinfo(pmp, ibdev, port, + resp_len); + goto bail; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)pmp); + goto bail; + } + + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. + */ + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD; + ret = reply((struct ib_mad_hdr *)pmp); + } + +bail: + return ret; +} + +static int hfi1_process_opa_mad(struct ib_device *ibdev, int mad_flags, + u8 port, const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct opa_mad *in_mad, + struct opa_mad *out_mad, size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + int ret; + int pkey_idx; + u32 resp_len = 0; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + + pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY); + if (pkey_idx < 0) { + pr_warn("failed to find limited mgmt pkey, defaulting 0x%x\n", + hfi1_get_pkey(ibp, 1)); + pkey_idx = 1; + } + *out_mad_pkey_index = (u16)pkey_idx; + + switch (in_mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + if (is_local_mad(ibp, in_mad, in_wc)) { + ret = opa_local_smp_check(ibp, in_wc); + if (ret) + return IB_MAD_RESULT_FAILURE; + } + ret = process_subn_opa(ibdev, mad_flags, port, in_mad, + out_mad, &resp_len); + goto bail; + case IB_MGMT_CLASS_PERF_MGMT: + ret = process_perf_opa(ibdev, port, in_mad, out_mad, + &resp_len); + goto bail; + + default: + ret = IB_MAD_RESULT_SUCCESS; + } + +bail: + if (ret & IB_MAD_RESULT_REPLY) + *out_mad_size = round_up(resp_len, 8); + else if (ret & IB_MAD_RESULT_SUCCESS) + *out_mad_size = in_wc->byte_len - sizeof(struct ib_grh); + + return ret; +} + +static int hfi1_process_ib_mad(struct ib_device *ibdev, int mad_flags, u8 port, + const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + int ret; + + switch (in_mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + ret = process_subn(ibdev, mad_flags, port, in_mad, out_mad); + goto bail; + default: + ret = IB_MAD_RESULT_SUCCESS; + } + +bail: + return ret; +} + +/** + * hfi1_process_mad - process an incoming MAD packet + * @ibdev: the infiniband device this packet came in on + * @mad_flags: MAD flags + * @port: the port number this packet came in on + * @in_wc: the work completion entry for this packet + * @in_grh: the global route header for this packet + * @in_mad: the incoming MAD + * @out_mad: any outgoing MAD reply + * + * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not + * interested in processing. + * + * Note that the verbs framework has already done the MAD sanity checks, + * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE + * MADs. + * + * This is called by the ib_mad module. + */ +int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in_mad, size_t in_mad_size, + struct ib_mad_hdr *out_mad, size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + switch (in_mad->base_version) { + case OPA_MGMT_BASE_VERSION: + if (unlikely(in_mad_size != sizeof(struct opa_mad))) { + dev_err(ibdev->dma_device, "invalid in_mad_size\n"); + return IB_MAD_RESULT_FAILURE; + } + return hfi1_process_opa_mad(ibdev, mad_flags, port, + in_wc, in_grh, + (struct opa_mad *)in_mad, + (struct opa_mad *)out_mad, + out_mad_size, + out_mad_pkey_index); + case IB_MGMT_BASE_VERSION: + return hfi1_process_ib_mad(ibdev, mad_flags, port, + in_wc, in_grh, + (const struct ib_mad *)in_mad, + (struct ib_mad *)out_mad); + default: + break; + } + + return IB_MAD_RESULT_FAILURE; +} + +static void send_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *mad_send_wc) +{ + ib_free_send_mad(mad_send_wc->send_buf); +} + +int hfi1_create_agents(struct hfi1_ibdev *dev) +{ + struct hfi1_devdata *dd = dd_from_dev(dev); + struct ib_mad_agent *agent; + struct hfi1_ibport *ibp; + int p; + int ret; + + for (p = 0; p < dd->num_pports; p++) { + ibp = &dd->pport[p].ibport_data; + agent = ib_register_mad_agent(&dev->ibdev, p + 1, IB_QPT_SMI, + NULL, 0, send_handler, + NULL, NULL, 0); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + goto err; + } + + ibp->send_agent = agent; + } + + return 0; + +err: + for (p = 0; p < dd->num_pports; p++) { + ibp = &dd->pport[p].ibport_data; + if (ibp->send_agent) { + agent = ibp->send_agent; + ibp->send_agent = NULL; + ib_unregister_mad_agent(agent); + } + } + + return ret; +} + +void hfi1_free_agents(struct hfi1_ibdev *dev) +{ + struct hfi1_devdata *dd = dd_from_dev(dev); + struct ib_mad_agent *agent; + struct hfi1_ibport *ibp; + int p; + + for (p = 0; p < dd->num_pports; p++) { + ibp = &dd->pport[p].ibport_data; + if (ibp->send_agent) { + agent = ibp->send_agent; + ibp->send_agent = NULL; + ib_unregister_mad_agent(agent); + } + if (ibp->sm_ah) { + ib_destroy_ah(&ibp->sm_ah->ibah); + ibp->sm_ah = NULL; + } + } +} diff --git a/drivers/staging/rdma/hfi1/mad.h b/drivers/staging/rdma/hfi1/mad.h new file mode 100644 index 000000000..47457501c --- /dev/null +++ b/drivers/staging/rdma/hfi1/mad.h @@ -0,0 +1,325 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#ifndef _HFI1_MAD_H +#define _HFI1_MAD_H + +#include +#define USE_PI_LED_ENABLE 1 /* use led enabled bit in struct + * opa_port_states, if available */ +#include +#include +#ifndef PI_LED_ENABLE_SUP +#define PI_LED_ENABLE_SUP 0 +#endif +#include "opa_compat.h" + + + +#define IB_VLARB_LOWPRI_0_31 1 +#define IB_VLARB_LOWPRI_32_63 2 +#define IB_VLARB_HIGHPRI_0_31 3 +#define IB_VLARB_HIGHPRI_32_63 4 + +#define OPA_MAX_PREEMPT_CAP 32 +#define OPA_VLARB_LOW_ELEMENTS 0 +#define OPA_VLARB_HIGH_ELEMENTS 1 +#define OPA_VLARB_PREEMPT_ELEMENTS 2 +#define OPA_VLARB_PREEMPT_MATRIX 3 + +#define IB_PMA_PORT_COUNTERS_CONG cpu_to_be16(0xFF00) + +struct ib_pma_portcounters_cong { + u8 reserved; + u8 reserved1; + __be16 port_check_rate; + __be16 symbol_error_counter; + u8 link_error_recovery_counter; + u8 link_downed_counter; + __be16 port_rcv_errors; + __be16 port_rcv_remphys_errors; + __be16 port_rcv_switch_relay_errors; + __be16 port_xmit_discards; + u8 port_xmit_constraint_errors; + u8 port_rcv_constraint_errors; + u8 reserved2; + u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */ + __be16 reserved3; + __be16 vl15_dropped; + __be64 port_xmit_data; + __be64 port_rcv_data; + __be64 port_xmit_packets; + __be64 port_rcv_packets; + __be64 port_xmit_wait; + __be64 port_adr_events; +} __packed; + +#define IB_SMP_UNSUP_VERSION cpu_to_be16(0x0004) +#define IB_SMP_UNSUP_METHOD cpu_to_be16(0x0008) +#define IB_SMP_UNSUP_METH_ATTR cpu_to_be16(0x000C) +#define IB_SMP_INVALID_FIELD cpu_to_be16(0x001C) + +#define OPA_MAX_PREEMPT_CAP 32 +#define OPA_VLARB_LOW_ELEMENTS 0 +#define OPA_VLARB_HIGH_ELEMENTS 1 +#define OPA_VLARB_PREEMPT_ELEMENTS 2 +#define OPA_VLARB_PREEMPT_MATRIX 3 + +#define HFI1_XMIT_RATE_UNSUPPORTED 0x0 +#define HFI1_XMIT_RATE_PICO 0x7 +/* number of 4nsec cycles equaling 2secs */ +#define HFI1_CONG_TIMER_PSINTERVAL 0x1DCD64EC + +#define IB_CC_SVCTYPE_RC 0x0 +#define IB_CC_SVCTYPE_UC 0x1 +#define IB_CC_SVCTYPE_RD 0x2 +#define IB_CC_SVCTYPE_UD 0x3 + + +/* + * There should be an equivalent IB #define for the following, but + * I cannot find it. + */ +#define OPA_CC_LOG_TYPE_HFI 2 + +struct opa_hfi1_cong_log_event_internal { + u32 lqpn; + u32 rqpn; + u8 sl; + u8 svc_type; + u32 rlid; + s64 timestamp; /* wider than 32 bits to detect 32 bit rollover */ +}; + +struct opa_hfi1_cong_log_event { + u8 local_qp_cn_entry[3]; + u8 remote_qp_number_cn_entry[3]; + u8 sl_svc_type_cn_entry; /* 5 bits SL, 3 bits svc type */ + u8 reserved; + __be32 remote_lid_cn_entry; + __be32 timestamp_cn_entry; +} __packed; + +#define OPA_CONG_LOG_ELEMS 96 + +struct opa_hfi1_cong_log { + u8 log_type; + u8 congestion_flags; + __be16 threshold_event_counter; + __be32 current_time_stamp; + u8 threshold_cong_event_map[OPA_MAX_SLS/8]; + struct opa_hfi1_cong_log_event events[OPA_CONG_LOG_ELEMS]; +} __packed; + +#define IB_CC_TABLE_CAP_DEFAULT 31 + +/* Port control flags */ +#define IB_CC_CCS_PC_SL_BASED 0x01 + +struct opa_congestion_setting_entry { + u8 ccti_increase; + u8 reserved; + __be16 ccti_timer; + u8 trigger_threshold; + u8 ccti_min; /* min CCTI for cc table */ +} __packed; + +struct opa_congestion_setting_entry_shadow { + u8 ccti_increase; + u8 reserved; + u16 ccti_timer; + u8 trigger_threshold; + u8 ccti_min; /* min CCTI for cc table */ +} __packed; + +struct opa_congestion_setting_attr { + __be32 control_map; + __be16 port_control; + struct opa_congestion_setting_entry entries[OPA_MAX_SLS]; +} __packed; + +struct opa_congestion_setting_attr_shadow { + u32 control_map; + u16 port_control; + struct opa_congestion_setting_entry_shadow entries[OPA_MAX_SLS]; +} __packed; + +#define IB_CC_TABLE_ENTRY_INCREASE_DEFAULT 1 +#define IB_CC_TABLE_ENTRY_TIMER_DEFAULT 1 + +/* 64 Congestion Control table entries in a single MAD */ +#define IB_CCT_ENTRIES 64 +#define IB_CCT_MIN_ENTRIES (IB_CCT_ENTRIES * 2) + +struct ib_cc_table_entry { + __be16 entry; /* shift:2, multiplier:14 */ +}; + +struct ib_cc_table_entry_shadow { + u16 entry; /* shift:2, multiplier:14 */ +}; + +struct ib_cc_table_attr { + __be16 ccti_limit; /* max CCTI for cc table */ + struct ib_cc_table_entry ccti_entries[IB_CCT_ENTRIES]; +} __packed; + +struct ib_cc_table_attr_shadow { + u16 ccti_limit; /* max CCTI for cc table */ + struct ib_cc_table_entry_shadow ccti_entries[IB_CCT_ENTRIES]; +} __packed; + +#define CC_TABLE_SHADOW_MAX \ + (IB_CC_TABLE_CAP_DEFAULT * IB_CCT_ENTRIES) + +struct cc_table_shadow { + u16 ccti_limit; /* max CCTI for cc table */ + struct ib_cc_table_entry_shadow entries[CC_TABLE_SHADOW_MAX]; +} __packed; + +/* + * struct cc_state combines the (active) per-port congestion control + * table, and the (active) per-SL congestion settings. cc_state data + * may need to be read in code paths that we want to be fast, so it + * is an RCU protected structure. + */ +struct cc_state { + struct rcu_head rcu; + struct cc_table_shadow cct; + struct opa_congestion_setting_attr_shadow cong_setting; +}; + +/* + * OPA BufferControl MAD + */ + +/* attribute modifier macros */ +#define OPA_AM_NPORT_SHIFT 24 +#define OPA_AM_NPORT_MASK 0xff +#define OPA_AM_NPORT_SMASK (OPA_AM_NPORT_MASK << OPA_AM_NPORT_SHIFT) +#define OPA_AM_NPORT(am) (((am) >> OPA_AM_NPORT_SHIFT) & \ + OPA_AM_NPORT_MASK) + +#define OPA_AM_NBLK_SHIFT 24 +#define OPA_AM_NBLK_MASK 0xff +#define OPA_AM_NBLK_SMASK (OPA_AM_NBLK_MASK << OPA_AM_NBLK_SHIFT) +#define OPA_AM_NBLK(am) (((am) >> OPA_AM_NBLK_SHIFT) & \ + OPA_AM_NBLK_MASK) + +#define OPA_AM_START_BLK_SHIFT 0 +#define OPA_AM_START_BLK_MASK 0xff +#define OPA_AM_START_BLK_SMASK (OPA_AM_START_BLK_MASK << \ + OPA_AM_START_BLK_SHIFT) +#define OPA_AM_START_BLK(am) (((am) >> OPA_AM_START_BLK_SHIFT) & \ + OPA_AM_START_BLK_MASK) + +#define OPA_AM_PORTNUM_SHIFT 0 +#define OPA_AM_PORTNUM_MASK 0xff +#define OPA_AM_PORTNUM_SMASK (OPA_AM_PORTNUM_MASK << OPA_AM_PORTNUM_SHIFT) +#define OPA_AM_PORTNUM(am) (((am) >> OPA_AM_PORTNUM_SHIFT) & \ + OPA_AM_PORTNUM_MASK) + +#define OPA_AM_ASYNC_SHIFT 12 +#define OPA_AM_ASYNC_MASK 0x1 +#define OPA_AM_ASYNC_SMASK (OPA_AM_ASYNC_MASK << OPA_AM_ASYNC_SHIFT) +#define OPA_AM_ASYNC(am) (((am) >> OPA_AM_ASYNC_SHIFT) & \ + OPA_AM_ASYNC_MASK) + +#define OPA_AM_START_SM_CFG_SHIFT 9 +#define OPA_AM_START_SM_CFG_MASK 0x1 +#define OPA_AM_START_SM_CFG_SMASK (OPA_AM_START_SM_CFG_MASK << \ + OPA_AM_START_SM_CFG_SHIFT) +#define OPA_AM_START_SM_CFG(am) (((am) >> OPA_AM_START_SM_CFG_SHIFT) \ + & OPA_AM_START_SM_CFG_MASK) + +#define OPA_AM_CI_ADDR_SHIFT 19 +#define OPA_AM_CI_ADDR_MASK 0xfff +#define OPA_AM_CI_ADDR_SMASK (OPA_AM_CI_ADDR_MASK << OPA_CI_ADDR_SHIFT) +#define OPA_AM_CI_ADDR(am) (((am) >> OPA_AM_CI_ADDR_SHIFT) & \ + OPA_AM_CI_ADDR_MASK) + +#define OPA_AM_CI_LEN_SHIFT 13 +#define OPA_AM_CI_LEN_MASK 0x3f +#define OPA_AM_CI_LEN_SMASK (OPA_AM_CI_LEN_MASK << OPA_CI_LEN_SHIFT) +#define OPA_AM_CI_LEN(am) (((am) >> OPA_AM_CI_LEN_SHIFT) & \ + OPA_AM_CI_LEN_MASK) + +/* error info macros */ +#define OPA_EI_STATUS_SMASK 0x80 +#define OPA_EI_CODE_SMASK 0x0f + +struct vl_limit { + __be16 dedicated; + __be16 shared; +}; + +struct buffer_control { + __be16 reserved; + __be16 overall_shared_limit; + struct vl_limit vl[OPA_MAX_VLS]; +}; + +struct sc2vlnt { + u8 vlnt[32]; /* 5 bit VL, 3 bits reserved */ +}; + +/* + * The PortSamplesControl.CounterMasks field is an array of 3 bit fields + * which specify the N'th counter's capabilities. See ch. 16.1.3.2. + * We support 5 counters which only count the mandatory quantities. + */ +#define COUNTER_MASK(q, n) (q << ((9 - n) * 3)) +#define COUNTER_MASK0_9 \ + cpu_to_be32(COUNTER_MASK(1, 0) | \ + COUNTER_MASK(1, 1) | \ + COUNTER_MASK(1, 2) | \ + COUNTER_MASK(1, 3) | \ + COUNTER_MASK(1, 4)) + +#endif /* _HFI1_MAD_H */ diff --git a/drivers/staging/rdma/hfi1/mmap.c b/drivers/staging/rdma/hfi1/mmap.c new file mode 100644 index 000000000..5173b1c60 --- /dev/null +++ b/drivers/staging/rdma/hfi1/mmap.c @@ -0,0 +1,192 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "verbs.h" + +/** + * hfi1_release_mmap_info - free mmap info structure + * @ref: a pointer to the kref within struct hfi1_mmap_info + */ +void hfi1_release_mmap_info(struct kref *ref) +{ + struct hfi1_mmap_info *ip = + container_of(ref, struct hfi1_mmap_info, ref); + struct hfi1_ibdev *dev = to_idev(ip->context->device); + + spin_lock_irq(&dev->pending_lock); + list_del(&ip->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + + vfree(ip->obj); + kfree(ip); +} + +/* + * open and close keep track of how many times the CQ is mapped, + * to avoid releasing it. + */ +static void hfi1_vma_open(struct vm_area_struct *vma) +{ + struct hfi1_mmap_info *ip = vma->vm_private_data; + + kref_get(&ip->ref); +} + +static void hfi1_vma_close(struct vm_area_struct *vma) +{ + struct hfi1_mmap_info *ip = vma->vm_private_data; + + kref_put(&ip->ref, hfi1_release_mmap_info); +} + +static struct vm_operations_struct hfi1_vm_ops = { + .open = hfi1_vma_open, + .close = hfi1_vma_close, +}; + +/** + * hfi1_mmap - create a new mmap region + * @context: the IB user context of the process making the mmap() call + * @vma: the VMA to be initialized + * Return zero if the mmap is OK. Otherwise, return an errno. + */ +int hfi1_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct hfi1_ibdev *dev = to_idev(context->device); + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long size = vma->vm_end - vma->vm_start; + struct hfi1_mmap_info *ip, *pp; + int ret = -EINVAL; + + /* + * Search the device's list of objects waiting for a mmap call. + * Normally, this list is very short since a call to create a + * CQ, QP, or SRQ is soon followed by a call to mmap(). + */ + spin_lock_irq(&dev->pending_lock); + list_for_each_entry_safe(ip, pp, &dev->pending_mmaps, + pending_mmaps) { + /* Only the creator is allowed to mmap the object */ + if (context != ip->context || (__u64) offset != ip->offset) + continue; + /* Don't allow a mmap larger than the object. */ + if (size > ip->size) + break; + + list_del_init(&ip->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + + ret = remap_vmalloc_range(vma, ip->obj, 0); + if (ret) + goto done; + vma->vm_ops = &hfi1_vm_ops; + vma->vm_private_data = ip; + hfi1_vma_open(vma); + goto done; + } + spin_unlock_irq(&dev->pending_lock); +done: + return ret; +} + +/* + * Allocate information for hfi1_mmap + */ +struct hfi1_mmap_info *hfi1_create_mmap_info(struct hfi1_ibdev *dev, + u32 size, + struct ib_ucontext *context, + void *obj) { + struct hfi1_mmap_info *ip; + + ip = kmalloc(sizeof(*ip), GFP_KERNEL); + if (!ip) + goto bail; + + size = PAGE_ALIGN(size); + + spin_lock_irq(&dev->mmap_offset_lock); + if (dev->mmap_offset == 0) + dev->mmap_offset = PAGE_SIZE; + ip->offset = dev->mmap_offset; + dev->mmap_offset += size; + spin_unlock_irq(&dev->mmap_offset_lock); + + INIT_LIST_HEAD(&ip->pending_mmaps); + ip->size = size; + ip->context = context; + ip->obj = obj; + kref_init(&ip->ref); + +bail: + return ip; +} + +void hfi1_update_mmap_info(struct hfi1_ibdev *dev, struct hfi1_mmap_info *ip, + u32 size, void *obj) +{ + size = PAGE_ALIGN(size); + + spin_lock_irq(&dev->mmap_offset_lock); + if (dev->mmap_offset == 0) + dev->mmap_offset = PAGE_SIZE; + ip->offset = dev->mmap_offset; + dev->mmap_offset += size; + spin_unlock_irq(&dev->mmap_offset_lock); + + ip->size = size; + ip->obj = obj; +} diff --git a/drivers/staging/rdma/hfi1/mr.c b/drivers/staging/rdma/hfi1/mr.c new file mode 100644 index 000000000..bd64e4f98 --- /dev/null +++ b/drivers/staging/rdma/hfi1/mr.c @@ -0,0 +1,551 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include + +#include "hfi.h" + +/* Fast memory region */ +struct hfi1_fmr { + struct ib_fmr ibfmr; + struct hfi1_mregion mr; /* must be last */ +}; + +static inline struct hfi1_fmr *to_ifmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct hfi1_fmr, ibfmr); +} + +static int init_mregion(struct hfi1_mregion *mr, struct ib_pd *pd, + int count) +{ + int m, i = 0; + int rval = 0; + + m = (count + HFI1_SEGSZ - 1) / HFI1_SEGSZ; + for (; i < m; i++) { + mr->map[i] = kzalloc(sizeof(*mr->map[0]), GFP_KERNEL); + if (!mr->map[i]) + goto bail; + } + mr->mapsz = m; + init_completion(&mr->comp); + /* count returning the ptr to user */ + atomic_set(&mr->refcount, 1); + mr->pd = pd; + mr->max_segs = count; +out: + return rval; +bail: + while (i) + kfree(mr->map[--i]); + rval = -ENOMEM; + goto out; +} + +static void deinit_mregion(struct hfi1_mregion *mr) +{ + int i = mr->mapsz; + + mr->mapsz = 0; + while (i) + kfree(mr->map[--i]); +} + + +/** + * hfi1_get_dma_mr - get a DMA memory region + * @pd: protection domain for this memory region + * @acc: access flags + * + * Returns the memory region on success, otherwise returns an errno. + * Note that all DMA addresses should be created via the + * struct ib_dma_mapping_ops functions (see dma.c). + */ +struct ib_mr *hfi1_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct hfi1_mr *mr = NULL; + struct ib_mr *ret; + int rval; + + if (to_ipd(pd)->user) { + ret = ERR_PTR(-EPERM); + goto bail; + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + rval = init_mregion(&mr->mr, pd, 0); + if (rval) { + ret = ERR_PTR(rval); + goto bail; + } + + + rval = hfi1_alloc_lkey(&mr->mr, 1); + if (rval) { + ret = ERR_PTR(rval); + goto bail_mregion; + } + + mr->mr.access_flags = acc; + ret = &mr->ibmr; +done: + return ret; + +bail_mregion: + deinit_mregion(&mr->mr); +bail: + kfree(mr); + goto done; +} + +static struct hfi1_mr *alloc_mr(int count, struct ib_pd *pd) +{ + struct hfi1_mr *mr; + int rval = -ENOMEM; + int m; + + /* Allocate struct plus pointers to first level page tables. */ + m = (count + HFI1_SEGSZ - 1) / HFI1_SEGSZ; + mr = kzalloc(sizeof(*mr) + m * sizeof(mr->mr.map[0]), GFP_KERNEL); + if (!mr) + goto bail; + + rval = init_mregion(&mr->mr, pd, count); + if (rval) + goto bail; + /* + * ib_reg_phys_mr() will initialize mr->ibmr except for + * lkey and rkey. + */ + rval = hfi1_alloc_lkey(&mr->mr, 0); + if (rval) + goto bail_mregion; + mr->ibmr.lkey = mr->mr.lkey; + mr->ibmr.rkey = mr->mr.lkey; +done: + return mr; + +bail_mregion: + deinit_mregion(&mr->mr); +bail: + kfree(mr); + mr = ERR_PTR(rval); + goto done; +} + +/** + * hfi1_reg_phys_mr - register a physical memory region + * @pd: protection domain for this memory region + * @buffer_list: pointer to the list of physical buffers to register + * @num_phys_buf: the number of physical buffers to register + * @iova_start: the starting address passed over IB which maps to this MR + * + * Returns the memory region on success, otherwise returns an errno. + */ +struct ib_mr *hfi1_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, int acc, u64 *iova_start) +{ + struct hfi1_mr *mr; + int n, m, i; + struct ib_mr *ret; + + mr = alloc_mr(num_phys_buf, pd); + if (IS_ERR(mr)) { + ret = (struct ib_mr *)mr; + goto bail; + } + + mr->mr.user_base = *iova_start; + mr->mr.iova = *iova_start; + mr->mr.access_flags = acc; + + m = 0; + n = 0; + for (i = 0; i < num_phys_buf; i++) { + mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr; + mr->mr.map[m]->segs[n].length = buffer_list[i].size; + mr->mr.length += buffer_list[i].size; + n++; + if (n == HFI1_SEGSZ) { + m++; + n = 0; + } + } + + ret = &mr->ibmr; + +bail: + return ret; +} + +/** + * hfi1_reg_user_mr - register a userspace memory region + * @pd: protection domain for this memory region + * @start: starting userspace address + * @length: length of region to register + * @mr_access_flags: access flags for this memory region + * @udata: unused by the driver + * + * Returns the memory region on success, otherwise returns an errno. + */ +struct ib_mr *hfi1_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata) +{ + struct hfi1_mr *mr; + struct ib_umem *umem; + struct scatterlist *sg; + int n, m, entry; + struct ib_mr *ret; + + if (length == 0) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + umem = ib_umem_get(pd->uobject->context, start, length, + mr_access_flags, 0); + if (IS_ERR(umem)) + return (void *) umem; + + n = umem->nmap; + + mr = alloc_mr(n, pd); + if (IS_ERR(mr)) { + ret = (struct ib_mr *)mr; + ib_umem_release(umem); + goto bail; + } + + mr->mr.user_base = start; + mr->mr.iova = virt_addr; + mr->mr.length = length; + mr->mr.offset = ib_umem_offset(umem); + mr->mr.access_flags = mr_access_flags; + mr->umem = umem; + + if (is_power_of_2(umem->page_size)) + mr->mr.page_shift = ilog2(umem->page_size); + m = 0; + n = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + void *vaddr; + + vaddr = page_address(sg_page(sg)); + if (!vaddr) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + mr->mr.map[m]->segs[n].vaddr = vaddr; + mr->mr.map[m]->segs[n].length = umem->page_size; + n++; + if (n == HFI1_SEGSZ) { + m++; + n = 0; + } + } + ret = &mr->ibmr; + +bail: + return ret; +} + +/** + * hfi1_dereg_mr - unregister and free a memory region + * @ibmr: the memory region to free + * + * Returns 0 on success. + * + * Note that this is called to free MRs created by hfi1_get_dma_mr() + * or hfi1_reg_user_mr(). + */ +int hfi1_dereg_mr(struct ib_mr *ibmr) +{ + struct hfi1_mr *mr = to_imr(ibmr); + int ret = 0; + unsigned long timeout; + + hfi1_free_lkey(&mr->mr); + + hfi1_put_mr(&mr->mr); /* will set completion if last */ + timeout = wait_for_completion_timeout(&mr->mr.comp, + 5 * HZ); + if (!timeout) { + dd_dev_err( + dd_from_ibdev(mr->mr.pd->device), + "hfi1_dereg_mr timeout mr %p pd %p refcount %u\n", + mr, mr->mr.pd, atomic_read(&mr->mr.refcount)); + hfi1_get_mr(&mr->mr); + ret = -EBUSY; + goto out; + } + deinit_mregion(&mr->mr); + if (mr->umem) + ib_umem_release(mr->umem); + kfree(mr); +out: + return ret; +} + +/* + * Allocate a memory region usable with the + * IB_WR_FAST_REG_MR send work request. + * + * Return the memory region on success, otherwise return an errno. + */ +struct ib_mr *hfi1_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct hfi1_mr *mr; + + if (mr_type != IB_MR_TYPE_MEM_REG) + return ERR_PTR(-EINVAL); + + mr = alloc_mr(max_num_sg, pd); + if (IS_ERR(mr)) + return (struct ib_mr *)mr; + + return &mr->ibmr; +} + +struct ib_fast_reg_page_list * +hfi1_alloc_fast_reg_page_list(struct ib_device *ibdev, int page_list_len) +{ + unsigned size = page_list_len * sizeof(u64); + struct ib_fast_reg_page_list *pl; + + if (size > PAGE_SIZE) + return ERR_PTR(-EINVAL); + + pl = kzalloc(sizeof(*pl), GFP_KERNEL); + if (!pl) + return ERR_PTR(-ENOMEM); + + pl->page_list = kzalloc(size, GFP_KERNEL); + if (!pl->page_list) + goto err_free; + + return pl; + +err_free: + kfree(pl); + return ERR_PTR(-ENOMEM); +} + +void hfi1_free_fast_reg_page_list(struct ib_fast_reg_page_list *pl) +{ + kfree(pl->page_list); + kfree(pl); +} + +/** + * hfi1_alloc_fmr - allocate a fast memory region + * @pd: the protection domain for this memory region + * @mr_access_flags: access flags for this memory region + * @fmr_attr: fast memory region attributes + * + * Returns the memory region on success, otherwise returns an errno. + */ +struct ib_fmr *hfi1_alloc_fmr(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr) +{ + struct hfi1_fmr *fmr; + int m; + struct ib_fmr *ret; + int rval = -ENOMEM; + + /* Allocate struct plus pointers to first level page tables. */ + m = (fmr_attr->max_pages + HFI1_SEGSZ - 1) / HFI1_SEGSZ; + fmr = kzalloc(sizeof(*fmr) + m * sizeof(fmr->mr.map[0]), GFP_KERNEL); + if (!fmr) + goto bail; + + rval = init_mregion(&fmr->mr, pd, fmr_attr->max_pages); + if (rval) + goto bail; + + /* + * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey & + * rkey. + */ + rval = hfi1_alloc_lkey(&fmr->mr, 0); + if (rval) + goto bail_mregion; + fmr->ibfmr.rkey = fmr->mr.lkey; + fmr->ibfmr.lkey = fmr->mr.lkey; + /* + * Resources are allocated but no valid mapping (RKEY can't be + * used). + */ + fmr->mr.access_flags = mr_access_flags; + fmr->mr.max_segs = fmr_attr->max_pages; + fmr->mr.page_shift = fmr_attr->page_shift; + + ret = &fmr->ibfmr; +done: + return ret; + +bail_mregion: + deinit_mregion(&fmr->mr); +bail: + kfree(fmr); + ret = ERR_PTR(rval); + goto done; +} + +/** + * hfi1_map_phys_fmr - set up a fast memory region + * @ibmfr: the fast memory region to set up + * @page_list: the list of pages to associate with the fast memory region + * @list_len: the number of pages to associate with the fast memory region + * @iova: the virtual address of the start of the fast memory region + * + * This may be called from interrupt context. + */ + +int hfi1_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int list_len, u64 iova) +{ + struct hfi1_fmr *fmr = to_ifmr(ibfmr); + struct hfi1_lkey_table *rkt; + unsigned long flags; + int m, n, i; + u32 ps; + int ret; + + i = atomic_read(&fmr->mr.refcount); + if (i > 2) + return -EBUSY; + + if (list_len > fmr->mr.max_segs) { + ret = -EINVAL; + goto bail; + } + rkt = &to_idev(ibfmr->device)->lk_table; + spin_lock_irqsave(&rkt->lock, flags); + fmr->mr.user_base = iova; + fmr->mr.iova = iova; + ps = 1 << fmr->mr.page_shift; + fmr->mr.length = list_len * ps; + m = 0; + n = 0; + for (i = 0; i < list_len; i++) { + fmr->mr.map[m]->segs[n].vaddr = (void *) page_list[i]; + fmr->mr.map[m]->segs[n].length = ps; + if (++n == HFI1_SEGSZ) { + m++; + n = 0; + } + } + spin_unlock_irqrestore(&rkt->lock, flags); + ret = 0; + +bail: + return ret; +} + +/** + * hfi1_unmap_fmr - unmap fast memory regions + * @fmr_list: the list of fast memory regions to unmap + * + * Returns 0 on success. + */ +int hfi1_unmap_fmr(struct list_head *fmr_list) +{ + struct hfi1_fmr *fmr; + struct hfi1_lkey_table *rkt; + unsigned long flags; + + list_for_each_entry(fmr, fmr_list, ibfmr.list) { + rkt = &to_idev(fmr->ibfmr.device)->lk_table; + spin_lock_irqsave(&rkt->lock, flags); + fmr->mr.user_base = 0; + fmr->mr.iova = 0; + fmr->mr.length = 0; + spin_unlock_irqrestore(&rkt->lock, flags); + } + return 0; +} + +/** + * hfi1_dealloc_fmr - deallocate a fast memory region + * @ibfmr: the fast memory region to deallocate + * + * Returns 0 on success. + */ +int hfi1_dealloc_fmr(struct ib_fmr *ibfmr) +{ + struct hfi1_fmr *fmr = to_ifmr(ibfmr); + int ret = 0; + unsigned long timeout; + + hfi1_free_lkey(&fmr->mr); + hfi1_put_mr(&fmr->mr); /* will set completion if last */ + timeout = wait_for_completion_timeout(&fmr->mr.comp, + 5 * HZ); + if (!timeout) { + hfi1_get_mr(&fmr->mr); + ret = -EBUSY; + goto out; + } + deinit_mregion(&fmr->mr); + kfree(fmr); +out: + return ret; +} diff --git a/drivers/staging/rdma/hfi1/opa_compat.h b/drivers/staging/rdma/hfi1/opa_compat.h new file mode 100644 index 000000000..f64eec1c2 --- /dev/null +++ b/drivers/staging/rdma/hfi1/opa_compat.h @@ -0,0 +1,129 @@ +#ifndef _LINUX_H +#define _LINUX_H +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This header file is for OPA-specific definitions which are + * required by the HFI driver, and which aren't yet in the Linux + * IB core. We'll collect these all here, then merge them into + * the kernel when that's convenient. + */ + +/* OPA SMA attribute IDs */ +#define OPA_ATTRIB_ID_CONGESTION_INFO cpu_to_be16(0x008b) +#define OPA_ATTRIB_ID_HFI_CONGESTION_LOG cpu_to_be16(0x008f) +#define OPA_ATTRIB_ID_HFI_CONGESTION_SETTING cpu_to_be16(0x0090) +#define OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE cpu_to_be16(0x0091) + +/* OPA PMA attribute IDs */ +#define OPA_PM_ATTRIB_ID_PORT_STATUS cpu_to_be16(0x0040) +#define OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS cpu_to_be16(0x0041) +#define OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS cpu_to_be16(0x0042) +#define OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS cpu_to_be16(0x0043) +#define OPA_PM_ATTRIB_ID_ERROR_INFO cpu_to_be16(0x0044) + +/* OPA status codes */ +#define OPA_PM_STATUS_REQUEST_TOO_LARGE cpu_to_be16(0x100) + +static inline u8 port_states_to_logical_state(struct opa_port_states *ps) +{ + return ps->portphysstate_portstate & OPA_PI_MASK_PORT_STATE; +} + +static inline u8 port_states_to_phys_state(struct opa_port_states *ps) +{ + return ((ps->portphysstate_portstate & + OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4) & 0xf; +} + +/* + * OPA port physical states + * IB Volume 1, Table 146 PortInfo/IB Volume 2 Section 5.4.2(1) PortPhysState + * values. + * + * When writing, only values 0-3 are valid, other values are ignored. + * When reading, 0 is reserved. + * + * Returned by the ibphys_portstate() routine. + */ +enum opa_port_phys_state { + IB_PORTPHYSSTATE_NOP = 0, + /* 1 is reserved */ + IB_PORTPHYSSTATE_POLLING = 2, + IB_PORTPHYSSTATE_DISABLED = 3, + IB_PORTPHYSSTATE_TRAINING = 4, + IB_PORTPHYSSTATE_LINKUP = 5, + IB_PORTPHYSSTATE_LINK_ERROR_RECOVERY = 6, + IB_PORTPHYSSTATE_PHY_TEST = 7, + /* 8 is reserved */ + OPA_PORTPHYSSTATE_OFFLINE = 9, + OPA_PORTPHYSSTATE_GANGED = 10, + OPA_PORTPHYSSTATE_TEST = 11, + OPA_PORTPHYSSTATE_MAX = 11, + /* values 12-15 are reserved/ignored */ +}; + +/* OPA_PORT_TYPE_* definitions - these belong in opa_port_info.h */ +#define OPA_PORT_TYPE_UNKNOWN 0 +#define OPA_PORT_TYPE_DISCONNECTED 1 +/* port is not currently usable, CableInfo not available */ +#define OPA_PORT_TYPE_FIXED 2 +/* A fixed backplane port in a director class switch. All OPA ASICS */ +#define OPA_PORT_TYPE_VARIABLE 3 +/* A backplane port in a blade system, possibly mixed configuration */ +#define OPA_PORT_TYPE_STANDARD 4 +/* implies a SFF-8636 defined format for CableInfo (QSFP) */ +#define OPA_PORT_TYPE_SI_PHOTONICS 5 +/* A silicon photonics module implies TBD defined format for CableInfo + * as defined by Intel SFO group */ +/* 6 - 15 are reserved */ + +#endif /* _LINUX_H */ diff --git a/drivers/staging/rdma/hfi1/pcie.c b/drivers/staging/rdma/hfi1/pcie.c new file mode 100644 index 000000000..ac5653c0f --- /dev/null +++ b/drivers/staging/rdma/hfi1/pcie.c @@ -0,0 +1,1253 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "hfi.h" +#include "chip_registers.h" + +/* link speed vector for Gen3 speed - not in Linux headers */ +#define GEN1_SPEED_VECTOR 0x1 +#define GEN2_SPEED_VECTOR 0x2 +#define GEN3_SPEED_VECTOR 0x3 + +/* + * This file contains PCIe utility routines. + */ + +/* + * Code to adjust PCIe capabilities. + */ +static void tune_pcie_caps(struct hfi1_devdata *); + +/* + * Do all the common PCIe setup and initialization. + * devdata is not yet allocated, and is not allocated until after this + * routine returns success. Therefore dd_dev_err() can't be used for error + * printing. + */ +int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int ret; + + ret = pci_enable_device(pdev); + if (ret) { + /* + * This can happen (in theory) iff: + * We did a chip reset, and then failed to reprogram the + * BAR, or the chip reset due to an internal error. We then + * unloaded the driver and reloaded it. + * + * Both reset cases set the BAR back to initial state. For + * the latter case, the AER sticky error bit at offset 0x718 + * should be set, but the Linux kernel doesn't yet know + * about that, it appears. If the original BAR was retained + * in the kernel data structures, this may be OK. + */ + hfi1_early_err(&pdev->dev, "pci enable failed: error %d\n", + -ret); + goto done; + } + + ret = pci_request_regions(pdev, DRIVER_NAME); + if (ret) { + hfi1_early_err(&pdev->dev, + "pci_request_regions fails: err %d\n", -ret); + goto bail; + } + + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (ret) { + /* + * If the 64 bit setup fails, try 32 bit. Some systems + * do not setup 64 bit maps on systems with 2GB or less + * memory installed. + */ + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (ret) { + hfi1_early_err(&pdev->dev, + "Unable to set DMA mask: %d\n", ret); + goto bail; + } + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + } else + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (ret) { + hfi1_early_err(&pdev->dev, + "Unable to set DMA consistent mask: %d\n", ret); + goto bail; + } + + pci_set_master(pdev); + ret = pci_enable_pcie_error_reporting(pdev); + if (ret) { + hfi1_early_err(&pdev->dev, + "Unable to enable pcie error reporting: %d\n", + ret); + ret = 0; + } + goto done; + +bail: + hfi1_pcie_cleanup(pdev); +done: + return ret; +} + +/* + * Clean what was done in hfi1_pcie_init() + */ +void hfi1_pcie_cleanup(struct pci_dev *pdev) +{ + pci_disable_device(pdev); + /* + * Release regions should be called after the disable. OK to + * call if request regions has not been called or failed. + */ + pci_release_regions(pdev); +} + +/* + * Do remaining PCIe setup, once dd is allocated, and save away + * fields required to re-initialize after a chip reset, or for + * various other purposes + */ +int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + unsigned long len; + resource_size_t addr; + + dd->pcidev = pdev; + pci_set_drvdata(pdev, dd); + + addr = pci_resource_start(pdev, 0); + len = pci_resource_len(pdev, 0); + + /* + * The TXE PIO buffers are at the tail end of the chip space. + * Cut them off and map them separately. + */ + + /* sanity check vs expectations */ + if (len != TXE_PIO_SEND + TXE_PIO_SIZE) { + dd_dev_err(dd, "chip PIO range does not match\n"); + return -EINVAL; + } + + dd->kregbase = ioremap_nocache(addr, TXE_PIO_SEND); + if (!dd->kregbase) + return -ENOMEM; + + dd->piobase = ioremap_wc(addr + TXE_PIO_SEND, TXE_PIO_SIZE); + if (!dd->piobase) { + iounmap(dd->kregbase); + return -ENOMEM; + } + + dd->flags |= HFI1_PRESENT; /* now register routines work */ + + dd->kregend = dd->kregbase + TXE_PIO_SEND; + dd->physaddr = addr; /* used for io_remap, etc. */ + + /* + * Re-map the chip's RcvArray as write-combining to allow us + * to write an entire cacheline worth of entries in one shot. + * If this re-map fails, just continue - the RcvArray programming + * function will handle both cases. + */ + dd->chip_rcv_array_count = read_csr(dd, RCV_ARRAY_CNT); + dd->rcvarray_wc = ioremap_wc(addr + RCV_ARRAY, + dd->chip_rcv_array_count * 8); + dd_dev_info(dd, "WC Remapped RcvArray: %p\n", dd->rcvarray_wc); + /* + * Save BARs and command to rewrite after device reset. + */ + dd->pcibar0 = addr; + dd->pcibar1 = addr >> 32; + pci_read_config_dword(dd->pcidev, PCI_ROM_ADDRESS, &dd->pci_rom); + pci_read_config_word(dd->pcidev, PCI_COMMAND, &dd->pci_command); + pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &dd->pcie_devctl); + pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, &dd->pcie_lnkctl); + pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL2, + &dd->pcie_devctl2); + pci_read_config_dword(dd->pcidev, PCI_CFG_MSIX0, &dd->pci_msix0); + pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, + &dd->pci_lnkctl3); + pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2); + + return 0; +} + +/* + * Do PCIe cleanup related to dd, after chip-specific cleanup, etc. Just prior + * to releasing the dd memory. + * Void because all of the core pcie cleanup functions are void. + */ +void hfi1_pcie_ddcleanup(struct hfi1_devdata *dd) +{ + u64 __iomem *base = (void __iomem *) dd->kregbase; + + dd->flags &= ~HFI1_PRESENT; + dd->kregbase = NULL; + iounmap(base); + if (dd->rcvarray_wc) + iounmap(dd->rcvarray_wc); + if (dd->piobase) + iounmap(dd->piobase); + + pci_set_drvdata(dd->pcidev, NULL); +} + +/* + * Do a Function Level Reset (FLR) on the device. + * Based on static function drivers/pci/pci.c:pcie_flr(). + */ +void hfi1_pcie_flr(struct hfi1_devdata *dd) +{ + int i; + u16 status; + + /* no need to check for the capability - we know the device has it */ + + /* wait for Transaction Pending bit to clear, at most a few ms */ + for (i = 0; i < 4; i++) { + if (i) + msleep((1 << (i - 1)) * 100); + + pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVSTA, &status); + if (!(status & PCI_EXP_DEVSTA_TRPND)) + goto clear; + } + + dd_dev_err(dd, "Transaction Pending bit is not clearing, proceeding with reset anyway\n"); + +clear: + pcie_capability_set_word(dd->pcidev, PCI_EXP_DEVCTL, + PCI_EXP_DEVCTL_BCR_FLR); + /* PCIe spec requires the function to be back within 100ms */ + msleep(100); +} + +static void msix_setup(struct hfi1_devdata *dd, int pos, u32 *msixcnt, + struct hfi1_msix_entry *hfi1_msix_entry) +{ + int ret; + int nvec = *msixcnt; + struct msix_entry *msix_entry; + int i; + + /* We can't pass hfi1_msix_entry array to msix_setup + * so use a dummy msix_entry array and copy the allocated + * irq back to the hfi1_msix_entry array. */ + msix_entry = kmalloc_array(nvec, sizeof(*msix_entry), GFP_KERNEL); + if (!msix_entry) { + ret = -ENOMEM; + goto do_intx; + } + + for (i = 0; i < nvec; i++) + msix_entry[i] = hfi1_msix_entry[i].msix; + + ret = pci_enable_msix_range(dd->pcidev, msix_entry, 1, nvec); + if (ret < 0) + goto free_msix_entry; + nvec = ret; + + for (i = 0; i < nvec; i++) + hfi1_msix_entry[i].msix = msix_entry[i]; + + kfree(msix_entry); + *msixcnt = nvec; + return; + +free_msix_entry: + kfree(msix_entry); + +do_intx: + dd_dev_err(dd, "pci_enable_msix_range %d vectors failed: %d, falling back to INTx\n", + nvec, ret); + *msixcnt = 0; + hfi1_enable_intx(dd->pcidev); + +} + +/* return the PCIe link speed from the given link status */ +static u32 extract_speed(u16 linkstat) +{ + u32 speed; + + switch (linkstat & PCI_EXP_LNKSTA_CLS) { + default: /* not defined, assume Gen1 */ + case PCI_EXP_LNKSTA_CLS_2_5GB: + speed = 2500; /* Gen 1, 2.5GHz */ + break; + case PCI_EXP_LNKSTA_CLS_5_0GB: + speed = 5000; /* Gen 2, 5GHz */ + break; + case GEN3_SPEED_VECTOR: + speed = 8000; /* Gen 3, 8GHz */ + break; + } + return speed; +} + +/* return the PCIe link speed from the given link status */ +static u32 extract_width(u16 linkstat) +{ + return (linkstat & PCI_EXP_LNKSTA_NLW) >> PCI_EXP_LNKSTA_NLW_SHIFT; +} + +/* read the link status and set dd->{lbus_width,lbus_speed,lbus_info} */ +static void update_lbus_info(struct hfi1_devdata *dd) +{ + u16 linkstat; + + pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKSTA, &linkstat); + dd->lbus_width = extract_width(linkstat); + dd->lbus_speed = extract_speed(linkstat); + snprintf(dd->lbus_info, sizeof(dd->lbus_info), + "PCIe,%uMHz,x%u", dd->lbus_speed, dd->lbus_width); +} + +/* + * Read in the current PCIe link width and speed. Find if the link is + * Gen3 capable. + */ +int pcie_speeds(struct hfi1_devdata *dd) +{ + u32 linkcap; + + if (!pci_is_pcie(dd->pcidev)) { + dd_dev_err(dd, "Can't find PCI Express capability!\n"); + return -EINVAL; + } + + /* find if our max speed is Gen3 and parent supports Gen3 speeds */ + dd->link_gen3_capable = 1; + + pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &linkcap); + if ((linkcap & PCI_EXP_LNKCAP_SLS) != GEN3_SPEED_VECTOR) { + dd_dev_info(dd, + "This HFI is not Gen3 capable, max speed 0x%x, need 0x3\n", + linkcap & PCI_EXP_LNKCAP_SLS); + dd->link_gen3_capable = 0; + } + + /* + * bus->max_bus_speed is set from the bridge's linkcap Max Link Speed + */ + if (dd->pcidev->bus->max_bus_speed != PCIE_SPEED_8_0GT) { + dd_dev_info(dd, "Parent PCIe bridge does not support Gen3\n"); + dd->link_gen3_capable = 0; + } + + /* obtain the link width and current speed */ + update_lbus_info(dd); + + /* check against expected pcie width and complain if "wrong" */ + if (dd->lbus_width < 16) + dd_dev_err(dd, "PCIe width %u (x16 HFI)\n", dd->lbus_width); + + return 0; +} + +/* + * Returns in *nent: + * - actual number of interrupts allocated + * - 0 if fell back to INTx. + */ +void request_msix(struct hfi1_devdata *dd, u32 *nent, + struct hfi1_msix_entry *entry) +{ + int pos; + + pos = dd->pcidev->msix_cap; + if (*nent && pos) { + msix_setup(dd, pos, nent, entry); + /* did it, either MSI-X or INTx */ + } else { + *nent = 0; + hfi1_enable_intx(dd->pcidev); + } + + tune_pcie_caps(dd); +} + +/* + * Disable MSI-X. + */ +void hfi1_nomsix(struct hfi1_devdata *dd) +{ + pci_disable_msix(dd->pcidev); +} + +void hfi1_enable_intx(struct pci_dev *pdev) +{ + /* first, turn on INTx */ + pci_intx(pdev, 1); + /* then turn off MSI-X */ + pci_disable_msix(pdev); +} + +/* restore command and BARs after a reset has wiped them out */ +void restore_pci_variables(struct hfi1_devdata *dd) +{ + pci_write_config_word(dd->pcidev, PCI_COMMAND, dd->pci_command); + pci_write_config_dword(dd->pcidev, + PCI_BASE_ADDRESS_0, dd->pcibar0); + pci_write_config_dword(dd->pcidev, + PCI_BASE_ADDRESS_1, dd->pcibar1); + pci_write_config_dword(dd->pcidev, + PCI_ROM_ADDRESS, dd->pci_rom); + pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, dd->pcie_devctl); + pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, dd->pcie_lnkctl); + pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL2, + dd->pcie_devctl2); + pci_write_config_dword(dd->pcidev, PCI_CFG_MSIX0, dd->pci_msix0); + pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, + dd->pci_lnkctl3); + pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2); +} + + +/* + * BIOS may not set PCIe bus-utilization parameters for best performance. + * Check and optionally adjust them to maximize our throughput. + */ +static int hfi1_pcie_caps; +module_param_named(pcie_caps, hfi1_pcie_caps, int, S_IRUGO); +MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)"); + +static void tune_pcie_caps(struct hfi1_devdata *dd) +{ + struct pci_dev *parent; + u16 rc_mpss, rc_mps, ep_mpss, ep_mps; + u16 rc_mrrs, ep_mrrs, max_mrrs; + + /* Find out supported and configured values for parent (root) */ + parent = dd->pcidev->bus->self; + if (!pci_is_root_bus(parent->bus)) { + dd_dev_info(dd, "Parent not root\n"); + return; + } + + if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev)) + return; + rc_mpss = parent->pcie_mpss; + rc_mps = ffs(pcie_get_mps(parent)) - 8; + /* Find out supported and configured values for endpoint (us) */ + ep_mpss = dd->pcidev->pcie_mpss; + ep_mps = ffs(pcie_get_mps(dd->pcidev)) - 8; + + /* Find max payload supported by root, endpoint */ + if (rc_mpss > ep_mpss) + rc_mpss = ep_mpss; + + /* If Supported greater than limit in module param, limit it */ + if (rc_mpss > (hfi1_pcie_caps & 7)) + rc_mpss = hfi1_pcie_caps & 7; + /* If less than (allowed, supported), bump root payload */ + if (rc_mpss > rc_mps) { + rc_mps = rc_mpss; + pcie_set_mps(parent, 128 << rc_mps); + } + /* If less than (allowed, supported), bump endpoint payload */ + if (rc_mpss > ep_mps) { + ep_mps = rc_mpss; + pcie_set_mps(dd->pcidev, 128 << ep_mps); + } + + /* + * Now the Read Request size. + * No field for max supported, but PCIe spec limits it to 4096, + * which is code '5' (log2(4096) - 7) + */ + max_mrrs = 5; + if (max_mrrs > ((hfi1_pcie_caps >> 4) & 7)) + max_mrrs = (hfi1_pcie_caps >> 4) & 7; + + max_mrrs = 128 << max_mrrs; + rc_mrrs = pcie_get_readrq(parent); + ep_mrrs = pcie_get_readrq(dd->pcidev); + + if (max_mrrs > rc_mrrs) { + rc_mrrs = max_mrrs; + pcie_set_readrq(parent, rc_mrrs); + } + if (max_mrrs > ep_mrrs) { + ep_mrrs = max_mrrs; + pcie_set_readrq(dd->pcidev, ep_mrrs); + } +} +/* End of PCIe capability tuning */ + +/* + * From here through hfi1_pci_err_handler definition is invoked via + * PCI error infrastructure, registered via pci + */ +static pci_ers_result_t +pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) +{ + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED; + + switch (state) { + case pci_channel_io_normal: + dd_dev_info(dd, "State Normal, ignoring\n"); + break; + + case pci_channel_io_frozen: + dd_dev_info(dd, "State Frozen, requesting reset\n"); + pci_disable_device(pdev); + ret = PCI_ERS_RESULT_NEED_RESET; + break; + + case pci_channel_io_perm_failure: + if (dd) { + dd_dev_info(dd, "State Permanent Failure, disabling\n"); + /* no more register accesses! */ + dd->flags &= ~HFI1_PRESENT; + hfi1_disable_after_error(dd); + } + /* else early, or other problem */ + ret = PCI_ERS_RESULT_DISCONNECT; + break; + + default: /* shouldn't happen */ + dd_dev_info(dd, "HFI1 PCI errors detected (state %d)\n", + state); + break; + } + return ret; +} + +static pci_ers_result_t +pci_mmio_enabled(struct pci_dev *pdev) +{ + u64 words = 0U; + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED; + + if (dd && dd->pport) { + words = read_port_cntr(dd->pport, C_RX_WORDS, CNTR_INVALID_VL); + if (words == ~0ULL) + ret = PCI_ERS_RESULT_NEED_RESET; + dd_dev_info(dd, + "HFI1 mmio_enabled function called, read wordscntr %Lx, returning %d\n", + words, ret); + } + return ret; +} + +static pci_ers_result_t +pci_slot_reset(struct pci_dev *pdev) +{ + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + + dd_dev_info(dd, "HFI1 slot_reset function called, ignored\n"); + return PCI_ERS_RESULT_CAN_RECOVER; +} + +static pci_ers_result_t +pci_link_reset(struct pci_dev *pdev) +{ + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + + dd_dev_info(dd, "HFI1 link_reset function called, ignored\n"); + return PCI_ERS_RESULT_CAN_RECOVER; +} + +static void +pci_resume(struct pci_dev *pdev) +{ + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + + dd_dev_info(dd, "HFI1 resume function called\n"); + pci_cleanup_aer_uncorrect_error_status(pdev); + /* + * Running jobs will fail, since it's asynchronous + * unlike sysfs-requested reset. Better than + * doing nothing. + */ + hfi1_init(dd, 1); /* same as re-init after reset */ +} + +const struct pci_error_handlers hfi1_pci_err_handler = { + .error_detected = pci_error_detected, + .mmio_enabled = pci_mmio_enabled, + .link_reset = pci_link_reset, + .slot_reset = pci_slot_reset, + .resume = pci_resume, +}; + +/*============================================================================*/ +/* PCIe Gen3 support */ + +/* + * This code is separated out because it is expected to be removed in the + * final shipping product. If not, then it will be revisited and items + * will be moved to more standard locations. + */ + +/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_STS field values */ +#define DL_STATUS_HFI0 0x1 /* hfi0 firmware download complete */ +#define DL_STATUS_HFI1 0x2 /* hfi1 firmware download complete */ +#define DL_STATUS_BOTH 0x3 /* hfi0 and hfi1 firmware download complete */ + +/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_ERR field values */ +#define DL_ERR_NONE 0x0 /* no error */ +#define DL_ERR_SWAP_PARITY 0x1 /* parity error in SerDes interrupt */ + /* or response data */ +#define DL_ERR_DISABLED 0x2 /* hfi disabled */ +#define DL_ERR_SECURITY 0x3 /* security check failed */ +#define DL_ERR_SBUS 0x4 /* SBus status error */ +#define DL_ERR_XFR_PARITY 0x5 /* parity error during ROM transfer*/ + +/* gasket block secondary bus reset delay */ +#define SBR_DELAY_US 200000 /* 200ms */ + +/* mask for PCIe capability register lnkctl2 target link speed */ +#define LNKCTL2_TARGET_LINK_SPEED_MASK 0xf + +static uint pcie_target = 3; +module_param(pcie_target, uint, S_IRUGO); +MODULE_PARM_DESC(pcie_target, "PCIe target speed (0 skip, 1-3 Gen1-3)"); + +static uint pcie_force; +module_param(pcie_force, uint, S_IRUGO); +MODULE_PARM_DESC(pcie_force, "Force driver to do a PCIe firmware download even if already at target speed"); + +static uint pcie_retry = 5; +module_param(pcie_retry, uint, S_IRUGO); +MODULE_PARM_DESC(pcie_retry, "Driver will try this many times to reach requested speed"); + +#define UNSET_PSET 255 +#define DEFAULT_DISCRETE_PSET 2 /* discrete HFI */ +#define DEFAULT_MCP_PSET 4 /* MCP HFI */ +static uint pcie_pset = UNSET_PSET; +module_param(pcie_pset, uint, S_IRUGO); +MODULE_PARM_DESC(pcie_pset, "PCIe Eq Pset value to use, range is 0-10"); + +/* equalization columns */ +#define PREC 0 +#define ATTN 1 +#define POST 2 + +/* discrete silicon preliminary equalization values */ +static const u8 discrete_preliminary_eq[11][3] = { + /* prec attn post */ + { 0x00, 0x00, 0x12 }, /* p0 */ + { 0x00, 0x00, 0x0c }, /* p1 */ + { 0x00, 0x00, 0x0f }, /* p2 */ + { 0x00, 0x00, 0x09 }, /* p3 */ + { 0x00, 0x00, 0x00 }, /* p4 */ + { 0x06, 0x00, 0x00 }, /* p5 */ + { 0x09, 0x00, 0x00 }, /* p6 */ + { 0x06, 0x00, 0x0f }, /* p7 */ + { 0x09, 0x00, 0x09 }, /* p8 */ + { 0x0c, 0x00, 0x00 }, /* p9 */ + { 0x00, 0x00, 0x18 }, /* p10 */ +}; + +/* integrated silicon preliminary equalization values */ +static const u8 integrated_preliminary_eq[11][3] = { + /* prec attn post */ + { 0x00, 0x1e, 0x07 }, /* p0 */ + { 0x00, 0x1e, 0x05 }, /* p1 */ + { 0x00, 0x1e, 0x06 }, /* p2 */ + { 0x00, 0x1e, 0x04 }, /* p3 */ + { 0x00, 0x1e, 0x00 }, /* p4 */ + { 0x03, 0x1e, 0x00 }, /* p5 */ + { 0x04, 0x1e, 0x00 }, /* p6 */ + { 0x03, 0x1e, 0x06 }, /* p7 */ + { 0x03, 0x1e, 0x04 }, /* p8 */ + { 0x05, 0x1e, 0x00 }, /* p9 */ + { 0x00, 0x1e, 0x0a }, /* p10 */ +}; + +/* helper to format the value to write to hardware */ +#define eq_value(pre, curr, post) \ + ((((u32)(pre)) << \ + PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT) \ + | (((u32)(curr)) << PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT) \ + | (((u32)(post)) << \ + PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT)) + +/* + * Load the given EQ preset table into the PCIe hardware. + */ +static int load_eq_table(struct hfi1_devdata *dd, const u8 eq[11][3], u8 fs, + u8 div) +{ + struct pci_dev *pdev = dd->pcidev; + u32 hit_error = 0; + u32 violation; + u32 i; + u8 c_minus1, c0, c_plus1; + + for (i = 0; i < 11; i++) { + /* set index */ + pci_write_config_dword(pdev, PCIE_CFG_REG_PL103, i); + /* write the value */ + c_minus1 = eq[i][PREC] / div; + c0 = fs - (eq[i][PREC] / div) - (eq[i][POST] / div); + c_plus1 = eq[i][POST] / div; + pci_write_config_dword(pdev, PCIE_CFG_REG_PL102, + eq_value(c_minus1, c0, c_plus1)); + /* check if these coefficients violate EQ rules */ + pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL105, + &violation); + if (violation + & PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK){ + if (hit_error == 0) { + dd_dev_err(dd, + "Gen3 EQ Table Coefficient rule violations\n"); + dd_dev_err(dd, " prec attn post\n"); + } + dd_dev_err(dd, " p%02d: %02x %02x %02x\n", + i, (u32)eq[i][0], (u32)eq[i][1], (u32)eq[i][2]); + dd_dev_err(dd, " %02x %02x %02x\n", + (u32)c_minus1, (u32)c0, (u32)c_plus1); + hit_error = 1; + } + } + if (hit_error) + return -EINVAL; + return 0; +} + +/* + * Steps to be done after the PCIe firmware is downloaded and + * before the SBR for the Pcie Gen3. + * The hardware mutex is already being held. + */ +static void pcie_post_steps(struct hfi1_devdata *dd) +{ + int i; + + set_sbus_fast_mode(dd); + /* + * Write to the PCIe PCSes to set the G3_LOCKED_NEXT bits to 1. + * This avoids a spurious framing error that can otherwise be + * generated by the MAC layer. + * + * Use individual addresses since no broadcast is set up. + */ + for (i = 0; i < NUM_PCIE_SERDES; i++) { + sbus_request(dd, pcie_pcs_addrs[dd->hfi1_id][i], + 0x03, WRITE_SBUS_RECEIVER, 0x00022132); + } + + clear_sbus_fast_mode(dd); +} + +/* + * Trigger a secondary bus reset (SBR) on ourselves using our parent. + * + * Based on pci_parent_bus_reset() which is not exported by the + * kernel core. + */ +static int trigger_sbr(struct hfi1_devdata *dd) +{ + struct pci_dev *dev = dd->pcidev; + struct pci_dev *pdev; + + /* need a parent */ + if (!dev->bus->self) { + dd_dev_err(dd, "%s: no parent device\n", __func__); + return -ENOTTY; + } + + /* should not be anyone else on the bus */ + list_for_each_entry(pdev, &dev->bus->devices, bus_list) + if (pdev != dev) { + dd_dev_err(dd, + "%s: another device is on the same bus\n", + __func__); + return -ENOTTY; + } + + /* + * A secondary bus reset (SBR) issues a hot reset to our device. + * The following routine does a 1s wait after the reset is dropped + * per PCI Trhfa (recovery time). PCIe 3.0 section 6.6.1 - + * Conventional Reset, paragraph 3, line 35 also says that a 1s + * delay after a reset is required. Per spec requirements, + * the link is either working or not after that point. + */ + pci_reset_bridge_secondary_bus(dev->bus->self); + + return 0; +} + +/* + * Write the given gasket interrupt register. + */ +static void write_gasket_interrupt(struct hfi1_devdata *dd, int index, + u16 code, u16 data) +{ + write_csr(dd, ASIC_PCIE_SD_INTRPT_LIST + (index * 8), + (((u64)code << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT) + |((u64)data << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT))); +} + +/* + * Tell the gasket logic how to react to the reset. + */ +static void arm_gasket_logic(struct hfi1_devdata *dd) +{ + u64 reg; + + reg = (((u64)1 << dd->hfi1_id) + << ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT) + | ((u64)pcie_serdes_broadcast[dd->hfi1_id] + << ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT + | ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK + | ((u64)SBR_DELAY_US & ASIC_PCIE_SD_HOST_CMD_TIMER_MASK) + << ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT + ); + write_csr(dd, ASIC_PCIE_SD_HOST_CMD, reg); + /* read back to push the write */ + read_csr(dd, ASIC_PCIE_SD_HOST_CMD); +} + +/* + * Do all the steps needed to transition the PCIe link to Gen3 speed. + */ +int do_pcie_gen3_transition(struct hfi1_devdata *dd) +{ + struct pci_dev *parent; + u64 fw_ctrl; + u64 reg, therm; + u32 reg32, fs, lf; + u32 status, err; + int ret; + int do_retry, retry_count = 0; + uint default_pset; + u16 target_vector, target_speed; + u16 lnkctl, lnkctl2, vendor; + u8 nsbr = 1; + u8 div; + const u8 (*eq)[3]; + int return_error = 0; + + /* PCIe Gen3 is for the ASIC only */ + if (dd->icode != ICODE_RTL_SILICON) + return 0; + + if (pcie_target == 1) { /* target Gen1 */ + target_vector = GEN1_SPEED_VECTOR; + target_speed = 2500; + } else if (pcie_target == 2) { /* target Gen2 */ + target_vector = GEN2_SPEED_VECTOR; + target_speed = 5000; + } else if (pcie_target == 3) { /* target Gen3 */ + target_vector = GEN3_SPEED_VECTOR; + target_speed = 8000; + } else { + /* off or invalid target - skip */ + dd_dev_info(dd, "%s: Skipping PCIe transition\n", __func__); + return 0; + } + + /* if already at target speed, done (unless forced) */ + if (dd->lbus_speed == target_speed) { + dd_dev_info(dd, "%s: PCIe already at gen%d, %s\n", __func__, + pcie_target, + pcie_force ? "re-doing anyway" : "skipping"); + if (!pcie_force) + return 0; + } + + /* + * A0 needs an additional SBR + */ + if (is_a0(dd)) + nsbr++; + + /* + * Do the Gen3 transition. Steps are those of the PCIe Gen3 + * recipe. + */ + + /* step 1: pcie link working in gen1/gen2 */ + + /* step 2: if either side is not capable of Gen3, done */ + if (pcie_target == 3 && !dd->link_gen3_capable) { + dd_dev_err(dd, "The PCIe link is not Gen3 capable\n"); + ret = -ENOSYS; + goto done_no_mutex; + } + + /* hold the HW mutex across the firmware download and SBR */ + ret = acquire_hw_mutex(dd); + if (ret) + return ret; + + /* make sure thermal polling is not causing interrupts */ + therm = read_csr(dd, ASIC_CFG_THERM_POLL_EN); + if (therm) { + write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0); + msleep(100); + dd_dev_info(dd, "%s: Disabled therm polling\n", + __func__); + } + + /* step 3: download SBus Master firmware */ + /* step 4: download PCIe Gen3 SerDes firmware */ +retry: + dd_dev_info(dd, "%s: downloading firmware\n", __func__); + ret = load_pcie_firmware(dd); + if (ret) + goto done; + + /* step 5: set up device parameter settings */ + dd_dev_info(dd, "%s: setting PCIe registers\n", __func__); + + /* + * PcieCfgSpcie1 - Link Control 3 + * Leave at reset value. No need to set PerfEq - link equalization + * will be performed automatically after the SBR when the target + * speed is 8GT/s. + */ + + /* clear all 16 per-lane error bits (PCIe: Lane Error Status) */ + pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, 0xffff); + + /* step 5a: Set Synopsys Port Logic registers */ + + /* + * PcieCfgRegPl2 - Port Force Link + * + * Set the low power field to 0x10 to avoid unnecessary power + * management messages. All other fields are zero. + */ + reg32 = 0x10ul << PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT; + pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL2, reg32); + + /* + * PcieCfgRegPl100 - Gen3 Control + * + * turn off PcieCfgRegPl100.Gen3ZRxDcNonCompl + * turn on PcieCfgRegPl100.EqEieosCnt (erratum) + * Everything else zero. + */ + reg32 = PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK; + pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL100, reg32); + + /* + * PcieCfgRegPl101 - Gen3 EQ FS and LF + * PcieCfgRegPl102 - Gen3 EQ Presets to Coefficients Mapping + * PcieCfgRegPl103 - Gen3 EQ Preset Index + * PcieCfgRegPl105 - Gen3 EQ Status + * + * Give initial EQ settings. + */ + if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0) { /* discrete */ + /* 1000mV, FS=24, LF = 8 */ + fs = 24; + lf = 8; + div = 3; + eq = discrete_preliminary_eq; + default_pset = DEFAULT_DISCRETE_PSET; + } else { + /* 400mV, FS=29, LF = 9 */ + fs = 29; + lf = 9; + div = 1; + eq = integrated_preliminary_eq; + default_pset = DEFAULT_MCP_PSET; + } + pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL101, + (fs << PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT) + | (lf << PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT)); + ret = load_eq_table(dd, eq, fs, div); + if (ret) + goto done; + + /* + * PcieCfgRegPl106 - Gen3 EQ Control + * + * Set Gen3EqPsetReqVec, leave other fields 0. + */ + if (pcie_pset == UNSET_PSET) + pcie_pset = default_pset; + if (pcie_pset > 10) { /* valid range is 0-10, inclusive */ + dd_dev_err(dd, "%s: Invalid Eq Pset %u, setting to %d\n", + __func__, pcie_pset, default_pset); + pcie_pset = default_pset; + } + dd_dev_info(dd, "%s: using EQ Pset %u\n", __func__, pcie_pset); + pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL106, + ((1 << pcie_pset) + << PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT) + | PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK + | PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK); + + /* + * step 5b: Do post firmware download steps via SBus + */ + dd_dev_info(dd, "%s: doing pcie post steps\n", __func__); + pcie_post_steps(dd); + + /* + * step 5c: Program gasket interrupts + */ + /* set the Rx Bit Rate to REFCLK ratio */ + write_gasket_interrupt(dd, 0, 0x0006, 0x0050); + /* disable pCal for PCIe Gen3 RX equalization */ + write_gasket_interrupt(dd, 1, 0x0026, 0x5b01); + /* + * Enable iCal for PCIe Gen3 RX equalization, and set which + * evaluation of RX_EQ_EVAL will launch the iCal procedure. + */ + write_gasket_interrupt(dd, 2, 0x0026, 0x5202); + /* terminate list */ + write_gasket_interrupt(dd, 3, 0x0000, 0x0000); + + /* + * step 5d: program XMT margin + * Right now, leave the default alone. To change, do a + * read-modify-write of: + * CcePcieCtrl.XmtMargin + * CcePcieCtrl.XmitMarginOverwriteEnable + */ + + /* step 5e: disable active state power management (ASPM) */ + dd_dev_info(dd, "%s: clearing ASPM\n", __func__); + pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, &lnkctl); + lnkctl &= ~PCI_EXP_LNKCTL_ASPMC; + pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, lnkctl); + + /* + * step 5f: clear DirectSpeedChange + * PcieCfgRegPl67.DirectSpeedChange must be zero to prevent the + * change in the speed target from starting before we are ready. + * This field defaults to 0 and we are not changing it, so nothing + * needs to be done. + */ + + /* step 5g: Set target link speed */ + /* + * Set target link speed to be target on both device and parent. + * On setting the parent: Some system BIOSs "helpfully" set the + * parent target speed to Gen2 to match the ASIC's initial speed. + * We can set the target Gen3 because we have already checked + * that it is Gen3 capable earlier. + */ + dd_dev_info(dd, "%s: setting parent target link speed\n", __func__); + parent = dd->pcidev->bus->self; + pcie_capability_read_word(parent, PCI_EXP_LNKCTL2, &lnkctl2); + dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__, + (u32)lnkctl2); + /* only write to parent if target is not as high as ours */ + if ((lnkctl2 & LNKCTL2_TARGET_LINK_SPEED_MASK) < target_vector) { + lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK; + lnkctl2 |= target_vector; + dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__, + (u32)lnkctl2); + pcie_capability_write_word(parent, PCI_EXP_LNKCTL2, lnkctl2); + } else { + dd_dev_info(dd, "%s: ..target speed is OK\n", __func__); + } + + dd_dev_info(dd, "%s: setting target link speed\n", __func__); + pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL2, &lnkctl2); + dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__, + (u32)lnkctl2); + lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK; + lnkctl2 |= target_vector; + dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__, + (u32)lnkctl2); + pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL2, lnkctl2); + + /* step 5h: arm gasket logic */ + /* hold DC in reset across the SBR */ + write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK); + (void) read_csr(dd, CCE_DC_CTRL); /* DC reset hold */ + /* save firmware control across the SBR */ + fw_ctrl = read_csr(dd, MISC_CFG_FW_CTRL); + + dd_dev_info(dd, "%s: arming gasket logic\n", __func__); + arm_gasket_logic(dd); + + /* + * step 6: quiesce PCIe link + * The chip has already been reset, so there will be no traffic + * from the chip. Linux has no easy way to enforce that it will + * not try to access the device, so we just need to hope it doesn't + * do it while we are doing the reset. + */ + + /* + * step 7: initiate the secondary bus reset (SBR) + * step 8: hardware brings the links back up + * step 9: wait for link speed transition to be complete + */ + dd_dev_info(dd, "%s: calling trigger_sbr\n", __func__); + ret = trigger_sbr(dd); + if (ret) + goto done; + + /* step 10: decide what to do next */ + + /* check if we can read PCI space */ + ret = pci_read_config_word(dd->pcidev, PCI_VENDOR_ID, &vendor); + if (ret) { + dd_dev_info(dd, + "%s: read of VendorID failed after SBR, err %d\n", + __func__, ret); + return_error = 1; + goto done; + } + if (vendor == 0xffff) { + dd_dev_info(dd, "%s: VendorID is all 1s after SBR\n", __func__); + return_error = 1; + ret = -EIO; + goto done; + } + + /* restore PCI space registers we know were reset */ + dd_dev_info(dd, "%s: calling restore_pci_variables\n", __func__); + restore_pci_variables(dd); + /* restore firmware control */ + write_csr(dd, MISC_CFG_FW_CTRL, fw_ctrl); + + /* + * Check the gasket block status. + * + * This is the first CSR read after the SBR. If the read returns + * all 1s (fails), the link did not make it back. + * + * Once we're sure we can read and write, clear the DC reset after + * the SBR. Then check for any per-lane errors. Then look over + * the status. + */ + reg = read_csr(dd, ASIC_PCIE_SD_HOST_STATUS); + dd_dev_info(dd, "%s: gasket block status: 0x%llx\n", __func__, reg); + if (reg == ~0ull) { /* PCIe read failed/timeout */ + dd_dev_err(dd, "SBR failed - unable to read from device\n"); + return_error = 1; + ret = -ENOSYS; + goto done; + } + + /* clear the DC reset */ + write_csr(dd, CCE_DC_CTRL, 0); + /* Set the LED off */ + if (is_a0(dd)) + setextled(dd, 0); + + /* check for any per-lane errors */ + pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, ®32); + dd_dev_info(dd, "%s: per-lane errors: 0x%x\n", __func__, reg32); + + /* extract status, look for our HFI */ + status = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT) + & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK; + if ((status & (1 << dd->hfi1_id)) == 0) { + dd_dev_err(dd, + "%s: gasket status 0x%x, expecting 0x%x\n", + __func__, status, 1 << dd->hfi1_id); + ret = -EIO; + goto done; + } + + /* extract error */ + err = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT) + & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK; + if (err) { + dd_dev_err(dd, "%s: gasket error %d\n", __func__, err); + ret = -EIO; + goto done; + } + + /* update our link information cache */ + update_lbus_info(dd); + dd_dev_info(dd, "%s: new speed and width: %s\n", __func__, + dd->lbus_info); + + if (dd->lbus_speed != target_speed) { /* not target */ + /* maybe retry */ + do_retry = retry_count < pcie_retry; + dd_dev_err(dd, "PCIe link speed did not switch to Gen%d%s\n", + pcie_target, do_retry ? ", retrying" : ""); + retry_count++; + if (do_retry) { + msleep(100); /* allow time to settle */ + goto retry; + } + ret = -EIO; + } + +done: + if (therm) { + write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1); + msleep(100); + dd_dev_info(dd, "%s: Re-enable therm polling\n", + __func__); + } + release_hw_mutex(dd); +done_no_mutex: + /* return no error if it is OK to be at current speed */ + if (ret && !return_error) { + dd_dev_err(dd, "Proceeding at current speed PCIe speed\n"); + ret = 0; + } + + dd_dev_info(dd, "%s: done\n", __func__); + return ret; +} diff --git a/drivers/staging/rdma/hfi1/pio.c b/drivers/staging/rdma/hfi1/pio.c new file mode 100644 index 000000000..9991814a8 --- /dev/null +++ b/drivers/staging/rdma/hfi1/pio.c @@ -0,0 +1,1771 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include "hfi.h" +#include "qp.h" +#include "trace.h" + +#define SC_CTXT_PACKET_EGRESS_TIMEOUT 350 /* in chip cycles */ + +#define SC(name) SEND_CTXT_##name +/* + * Send Context functions + */ +static void sc_wait_for_packet_egress(struct send_context *sc, int pause); + +/* + * Set the CM reset bit and wait for it to clear. Use the provided + * sendctrl register. This routine has no locking. + */ +void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl) +{ + write_csr(dd, SEND_CTRL, sendctrl | SEND_CTRL_CM_RESET_SMASK); + while (1) { + udelay(1); + sendctrl = read_csr(dd, SEND_CTRL); + if ((sendctrl & SEND_CTRL_CM_RESET_SMASK) == 0) + break; + } +} + +/* defined in header release 48 and higher */ +#ifndef SEND_CTRL_UNSUPPORTED_VL_SHIFT +#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3 +#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xffull +#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \ + << SEND_CTRL_UNSUPPORTED_VL_SHIFT) +#endif + +/* global control of PIO send */ +void pio_send_control(struct hfi1_devdata *dd, int op) +{ + u64 reg, mask; + unsigned long flags; + int write = 1; /* write sendctrl back */ + int flush = 0; /* re-read sendctrl to make sure it is flushed */ + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + + reg = read_csr(dd, SEND_CTRL); + switch (op) { + case PSC_GLOBAL_ENABLE: + reg |= SEND_CTRL_SEND_ENABLE_SMASK; + /* Fall through */ + case PSC_DATA_VL_ENABLE: + /* Disallow sending on VLs not enabled */ + mask = (((~0ull)<sendctrl_lock, flags); +} + +/* number of send context memory pools */ +#define NUM_SC_POOLS 2 + +/* Send Context Size (SCS) wildcards */ +#define SCS_POOL_0 -1 +#define SCS_POOL_1 -2 +/* Send Context Count (SCC) wildcards */ +#define SCC_PER_VL -1 +#define SCC_PER_CPU -2 + +#define SCC_PER_KRCVQ -3 +#define SCC_ACK_CREDITS 32 + +#define PIO_WAIT_BATCH_SIZE 5 + +/* default send context sizes */ +static struct sc_config_sizes sc_config_sizes[SC_MAX] = { + [SC_KERNEL] = { .size = SCS_POOL_0, /* even divide, pool 0 */ + .count = SCC_PER_VL },/* one per NUMA */ + [SC_ACK] = { .size = SCC_ACK_CREDITS, + .count = SCC_PER_KRCVQ }, + [SC_USER] = { .size = SCS_POOL_0, /* even divide, pool 0 */ + .count = SCC_PER_CPU }, /* one per CPU */ + +}; + +/* send context memory pool configuration */ +struct mem_pool_config { + int centipercent; /* % of memory, in 100ths of 1% */ + int absolute_blocks; /* absolute block count */ +}; + +/* default memory pool configuration: 100% in pool 0 */ +static struct mem_pool_config sc_mem_pool_config[NUM_SC_POOLS] = { + /* centi%, abs blocks */ + { 10000, -1 }, /* pool 0 */ + { 0, -1 }, /* pool 1 */ +}; + +/* memory pool information, used when calculating final sizes */ +struct mem_pool_info { + int centipercent; /* 100th of 1% of memory to use, -1 if blocks + already set */ + int count; /* count of contexts in the pool */ + int blocks; /* block size of the pool */ + int size; /* context size, in blocks */ +}; + +/* + * Convert a pool wildcard to a valid pool index. The wildcards + * start at -1 and increase negatively. Map them as: + * -1 => 0 + * -2 => 1 + * etc. + * + * Return -1 on non-wildcard input, otherwise convert to a pool number. + */ +static int wildcard_to_pool(int wc) +{ + if (wc >= 0) + return -1; /* non-wildcard */ + return -wc - 1; +} + +static const char *sc_type_names[SC_MAX] = { + "kernel", + "ack", + "user" +}; + +static const char *sc_type_name(int index) +{ + if (index < 0 || index >= SC_MAX) + return "unknown"; + return sc_type_names[index]; +} + +/* + * Read the send context memory pool configuration and send context + * size configuration. Replace any wildcards and come up with final + * counts and sizes for the send context types. + */ +int init_sc_pools_and_sizes(struct hfi1_devdata *dd) +{ + struct mem_pool_info mem_pool_info[NUM_SC_POOLS] = { { 0 } }; + int total_blocks = (dd->chip_pio_mem_size / PIO_BLOCK_SIZE) - 1; + int total_contexts = 0; + int fixed_blocks; + int pool_blocks; + int used_blocks; + int cp_total; /* centipercent total */ + int ab_total; /* absolute block total */ + int extra; + int i; + + /* + * Step 0: + * - copy the centipercents/absolute sizes from the pool config + * - sanity check these values + * - add up centipercents, then later check for full value + * - add up absolute blocks, then later check for over-commit + */ + cp_total = 0; + ab_total = 0; + for (i = 0; i < NUM_SC_POOLS; i++) { + int cp = sc_mem_pool_config[i].centipercent; + int ab = sc_mem_pool_config[i].absolute_blocks; + + /* + * A negative value is "unused" or "invalid". Both *can* + * be valid, but centipercent wins, so check that first + */ + if (cp >= 0) { /* centipercent valid */ + cp_total += cp; + } else if (ab >= 0) { /* absolute blocks valid */ + ab_total += ab; + } else { /* neither valid */ + dd_dev_err( + dd, + "Send context memory pool %d: both the block count and centipercent are invalid\n", + i); + return -EINVAL; + } + + mem_pool_info[i].centipercent = cp; + mem_pool_info[i].blocks = ab; + } + + /* do not use both % and absolute blocks for different pools */ + if (cp_total != 0 && ab_total != 0) { + dd_dev_err( + dd, + "All send context memory pools must be described as either centipercent or blocks, no mixing between pools\n"); + return -EINVAL; + } + + /* if any percentages are present, they must add up to 100% x 100 */ + if (cp_total != 0 && cp_total != 10000) { + dd_dev_err( + dd, + "Send context memory pool centipercent is %d, expecting 10000\n", + cp_total); + return -EINVAL; + } + + /* the absolute pool total cannot be more than the mem total */ + if (ab_total > total_blocks) { + dd_dev_err( + dd, + "Send context memory pool absolute block count %d is larger than the memory size %d\n", + ab_total, total_blocks); + return -EINVAL; + } + + /* + * Step 2: + * - copy from the context size config + * - replace context type wildcard counts with real values + * - add up non-memory pool block sizes + * - add up memory pool user counts + */ + fixed_blocks = 0; + for (i = 0; i < SC_MAX; i++) { + int count = sc_config_sizes[i].count; + int size = sc_config_sizes[i].size; + int pool; + + /* + * Sanity check count: Either a positive value or + * one of the expected wildcards is valid. The positive + * value is checked later when we compare against total + * memory available. + */ + if (i == SC_ACK) { + count = dd->n_krcv_queues; + } else if (i == SC_KERNEL) { + count = num_vls + 1 /* VL15 */; + } else if (count == SCC_PER_CPU) { + count = dd->num_rcv_contexts - dd->n_krcv_queues; + } else if (count < 0) { + dd_dev_err( + dd, + "%s send context invalid count wildcard %d\n", + sc_type_name(i), count); + return -EINVAL; + } + if (total_contexts + count > dd->chip_send_contexts) + count = dd->chip_send_contexts - total_contexts; + + total_contexts += count; + + /* + * Sanity check pool: The conversion will return a pool + * number or -1 if a fixed (non-negative) value. The fixed + * value is checked later when we compare against + * total memory available. + */ + pool = wildcard_to_pool(size); + if (pool == -1) { /* non-wildcard */ + fixed_blocks += size * count; + } else if (pool < NUM_SC_POOLS) { /* valid wildcard */ + mem_pool_info[pool].count += count; + } else { /* invalid wildcard */ + dd_dev_err( + dd, + "%s send context invalid pool wildcard %d\n", + sc_type_name(i), size); + return -EINVAL; + } + + dd->sc_sizes[i].count = count; + dd->sc_sizes[i].size = size; + } + if (fixed_blocks > total_blocks) { + dd_dev_err( + dd, + "Send context fixed block count, %u, larger than total block count %u\n", + fixed_blocks, total_blocks); + return -EINVAL; + } + + /* step 3: calculate the blocks in the pools, and pool context sizes */ + pool_blocks = total_blocks - fixed_blocks; + if (ab_total > pool_blocks) { + dd_dev_err( + dd, + "Send context fixed pool sizes, %u, larger than pool block count %u\n", + ab_total, pool_blocks); + return -EINVAL; + } + /* subtract off the fixed pool blocks */ + pool_blocks -= ab_total; + + for (i = 0; i < NUM_SC_POOLS; i++) { + struct mem_pool_info *pi = &mem_pool_info[i]; + + /* % beats absolute blocks */ + if (pi->centipercent >= 0) + pi->blocks = (pool_blocks * pi->centipercent) / 10000; + + if (pi->blocks == 0 && pi->count != 0) { + dd_dev_err( + dd, + "Send context memory pool %d has %u contexts, but no blocks\n", + i, pi->count); + return -EINVAL; + } + if (pi->count == 0) { + /* warn about wasted blocks */ + if (pi->blocks != 0) + dd_dev_err( + dd, + "Send context memory pool %d has %u blocks, but zero contexts\n", + i, pi->blocks); + pi->size = 0; + } else { + pi->size = pi->blocks / pi->count; + } + } + + /* step 4: fill in the context type sizes from the pool sizes */ + used_blocks = 0; + for (i = 0; i < SC_MAX; i++) { + if (dd->sc_sizes[i].size < 0) { + unsigned pool = wildcard_to_pool(dd->sc_sizes[i].size); + + WARN_ON_ONCE(pool >= NUM_SC_POOLS); + dd->sc_sizes[i].size = mem_pool_info[pool].size; + } + /* make sure we are not larger than what is allowed by the HW */ +#define PIO_MAX_BLOCKS 1024 + if (dd->sc_sizes[i].size > PIO_MAX_BLOCKS) + dd->sc_sizes[i].size = PIO_MAX_BLOCKS; + + /* calculate our total usage */ + used_blocks += dd->sc_sizes[i].size * dd->sc_sizes[i].count; + } + extra = total_blocks - used_blocks; + if (extra != 0) + dd_dev_info(dd, "unused send context blocks: %d\n", extra); + + return total_contexts; +} + +int init_send_contexts(struct hfi1_devdata *dd) +{ + u16 base; + int ret, i, j, context; + + ret = init_credit_return(dd); + if (ret) + return ret; + + dd->hw_to_sw = kmalloc_array(TXE_NUM_CONTEXTS, sizeof(u8), + GFP_KERNEL); + dd->send_contexts = kcalloc(dd->num_send_contexts, + sizeof(struct send_context_info), + GFP_KERNEL); + if (!dd->send_contexts || !dd->hw_to_sw) { + dd_dev_err(dd, "Unable to allocate send context arrays\n"); + kfree(dd->hw_to_sw); + kfree(dd->send_contexts); + free_credit_return(dd); + return -ENOMEM; + } + + /* hardware context map starts with invalid send context indices */ + for (i = 0; i < TXE_NUM_CONTEXTS; i++) + dd->hw_to_sw[i] = INVALID_SCI; + + /* + * All send contexts have their credit sizes. Allocate credits + * for each context one after another from the global space. + */ + context = 0; + base = 1; + for (i = 0; i < SC_MAX; i++) { + struct sc_config_sizes *scs = &dd->sc_sizes[i]; + + for (j = 0; j < scs->count; j++) { + struct send_context_info *sci = + &dd->send_contexts[context]; + sci->type = i; + sci->base = base; + sci->credits = scs->size; + + context++; + base += scs->size; + } + } + + return 0; +} + +/* + * Allocate a software index and hardware context of the given type. + * + * Must be called with dd->sc_lock held. + */ +static int sc_hw_alloc(struct hfi1_devdata *dd, int type, u32 *sw_index, + u32 *hw_context) +{ + struct send_context_info *sci; + u32 index; + u32 context; + + for (index = 0, sci = &dd->send_contexts[0]; + index < dd->num_send_contexts; index++, sci++) { + if (sci->type == type && sci->allocated == 0) { + sci->allocated = 1; + /* use a 1:1 mapping, but make them non-equal */ + context = dd->chip_send_contexts - index - 1; + dd->hw_to_sw[context] = index; + *sw_index = index; + *hw_context = context; + return 0; /* success */ + } + } + dd_dev_err(dd, "Unable to locate a free type %d send context\n", type); + return -ENOSPC; +} + +/* + * Free the send context given by its software index. + * + * Must be called with dd->sc_lock held. + */ +static void sc_hw_free(struct hfi1_devdata *dd, u32 sw_index, u32 hw_context) +{ + struct send_context_info *sci; + + sci = &dd->send_contexts[sw_index]; + if (!sci->allocated) { + dd_dev_err(dd, "%s: sw_index %u not allocated? hw_context %u\n", + __func__, sw_index, hw_context); + } + sci->allocated = 0; + dd->hw_to_sw[hw_context] = INVALID_SCI; +} + +/* return the base context of a context in a group */ +static inline u32 group_context(u32 context, u32 group) +{ + return (context >> group) << group; +} + +/* return the size of a group */ +static inline u32 group_size(u32 group) +{ + return 1 << group; +} + +/* + * Obtain the credit return addresses, kernel virtual and physical, for the + * given sc. + * + * To understand this routine: + * o va and pa are arrays of struct credit_return. One for each physical + * send context, per NUMA. + * o Each send context always looks in its relative location in a struct + * credit_return for its credit return. + * o Each send context in a group must have its return address CSR programmed + * with the same value. Use the address of the first send context in the + * group. + */ +static void cr_group_addresses(struct send_context *sc, dma_addr_t *pa) +{ + u32 gc = group_context(sc->hw_context, sc->group); + u32 index = sc->hw_context & 0x7; + + sc->hw_free = &sc->dd->cr_base[sc->node].va[gc].cr[index]; + *pa = (unsigned long) + &((struct credit_return *)sc->dd->cr_base[sc->node].pa)[gc]; +} + +/* + * Work queue function triggered in error interrupt routine for + * kernel contexts. + */ +static void sc_halted(struct work_struct *work) +{ + struct send_context *sc; + + sc = container_of(work, struct send_context, halt_work); + sc_restart(sc); +} + +/* + * Calculate PIO block threshold for this send context using the given MTU. + * Trigger a return when one MTU plus optional header of credits remain. + * + * Parameter mtu is in bytes. + * Parameter hdrqentsize is in DWORDs. + * + * Return value is what to write into the CSR: trigger return when + * unreturned credits pass this count. + */ +u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize) +{ + u32 release_credits; + u32 threshold; + + /* add in the header size, then divide by the PIO block size */ + mtu += hdrqentsize << 2; + release_credits = DIV_ROUND_UP(mtu, PIO_BLOCK_SIZE); + + /* check against this context's credits */ + if (sc->credits <= release_credits) + threshold = 1; + else + threshold = sc->credits - release_credits; + + return threshold; +} + +/* + * Calculate credit threshold in terms of percent of the allocated credits. + * Trigger when unreturned credits equal or exceed the percentage of the whole. + * + * Return value is what to write into the CSR: trigger return when + * unreturned credits pass this count. + */ +static u32 sc_percent_to_threshold(struct send_context *sc, u32 percent) +{ + return (sc->credits * percent) / 100; +} + +/* + * Set the credit return threshold. + */ +void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold) +{ + unsigned long flags; + u32 old_threshold; + int force_return = 0; + + spin_lock_irqsave(&sc->credit_ctrl_lock, flags); + + old_threshold = (sc->credit_ctrl >> + SC(CREDIT_CTRL_THRESHOLD_SHIFT)) + & SC(CREDIT_CTRL_THRESHOLD_MASK); + + if (new_threshold != old_threshold) { + sc->credit_ctrl = + (sc->credit_ctrl + & ~SC(CREDIT_CTRL_THRESHOLD_SMASK)) + | ((new_threshold + & SC(CREDIT_CTRL_THRESHOLD_MASK)) + << SC(CREDIT_CTRL_THRESHOLD_SHIFT)); + write_kctxt_csr(sc->dd, sc->hw_context, + SC(CREDIT_CTRL), sc->credit_ctrl); + + /* force a credit return on change to avoid a possible stall */ + force_return = 1; + } + + spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags); + + if (force_return) + sc_return_credits(sc); +} + +/* + * set_pio_integrity + * + * Set the CHECK_ENABLE register for the send context 'sc'. + */ +void set_pio_integrity(struct send_context *sc) +{ + struct hfi1_devdata *dd = sc->dd; + u64 reg = 0; + u32 hw_context = sc->hw_context; + int type = sc->type; + + /* + * No integrity checks if HFI1_CAP_NO_INTEGRITY is set, or if + * we're snooping. + */ + if (likely(!HFI1_CAP_IS_KSET(NO_INTEGRITY)) && + dd->hfi1_snoop.mode_flag != HFI1_PORT_SNOOP_MODE) + reg = hfi1_pkt_default_send_ctxt_mask(dd, type); + + write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), reg); +} + +/* + * Allocate a NUMA relative send context structure of the given type along + * with a HW context. + */ +struct send_context *sc_alloc(struct hfi1_devdata *dd, int type, + uint hdrqentsize, int numa) +{ + struct send_context_info *sci; + struct send_context *sc; + dma_addr_t pa; + unsigned long flags; + u64 reg; + u32 thresh; + u32 sw_index; + u32 hw_context; + int ret; + u8 opval, opmask; + + /* do not allocate while frozen */ + if (dd->flags & HFI1_FROZEN) + return NULL; + + sc = kzalloc_node(sizeof(struct send_context), GFP_KERNEL, numa); + if (!sc) { + dd_dev_err(dd, "Cannot allocate send context structure\n"); + return NULL; + } + + spin_lock_irqsave(&dd->sc_lock, flags); + ret = sc_hw_alloc(dd, type, &sw_index, &hw_context); + if (ret) { + spin_unlock_irqrestore(&dd->sc_lock, flags); + kfree(sc); + return NULL; + } + + sci = &dd->send_contexts[sw_index]; + sci->sc = sc; + + sc->dd = dd; + sc->node = numa; + sc->type = type; + spin_lock_init(&sc->alloc_lock); + spin_lock_init(&sc->release_lock); + spin_lock_init(&sc->credit_ctrl_lock); + INIT_LIST_HEAD(&sc->piowait); + INIT_WORK(&sc->halt_work, sc_halted); + atomic_set(&sc->buffers_allocated, 0); + init_waitqueue_head(&sc->halt_wait); + + /* grouping is always single context for now */ + sc->group = 0; + + sc->sw_index = sw_index; + sc->hw_context = hw_context; + cr_group_addresses(sc, &pa); + sc->credits = sci->credits; + +/* PIO Send Memory Address details */ +#define PIO_ADDR_CONTEXT_MASK 0xfful +#define PIO_ADDR_CONTEXT_SHIFT 16 + sc->base_addr = dd->piobase + ((hw_context & PIO_ADDR_CONTEXT_MASK) + << PIO_ADDR_CONTEXT_SHIFT); + + /* set base and credits */ + reg = ((sci->credits & SC(CTRL_CTXT_DEPTH_MASK)) + << SC(CTRL_CTXT_DEPTH_SHIFT)) + | ((sci->base & SC(CTRL_CTXT_BASE_MASK)) + << SC(CTRL_CTXT_BASE_SHIFT)); + write_kctxt_csr(dd, hw_context, SC(CTRL), reg); + + set_pio_integrity(sc); + + /* unmask all errors */ + write_kctxt_csr(dd, hw_context, SC(ERR_MASK), (u64)-1); + + /* set the default partition key */ + write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY), + (DEFAULT_PKEY & + SC(CHECK_PARTITION_KEY_VALUE_MASK)) + << SC(CHECK_PARTITION_KEY_VALUE_SHIFT)); + + /* per context type checks */ + if (type == SC_USER) { + opval = USER_OPCODE_CHECK_VAL; + opmask = USER_OPCODE_CHECK_MASK; + } else { + opval = OPCODE_CHECK_VAL_DISABLED; + opmask = OPCODE_CHECK_MASK_DISABLED; + } + + /* set the send context check opcode mask and value */ + write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE), + ((u64)opmask << SC(CHECK_OPCODE_MASK_SHIFT)) | + ((u64)opval << SC(CHECK_OPCODE_VALUE_SHIFT))); + + /* set up credit return */ + reg = pa & SC(CREDIT_RETURN_ADDR_ADDRESS_SMASK); + write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), reg); + + /* + * Calculate the initial credit return threshold. + * + * For Ack contexts, set a threshold for half the credits. + * For User contexts use the given percentage. This has been + * sanitized on driver start-up. + * For Kernel contexts, use the default MTU plus a header. + */ + if (type == SC_ACK) { + thresh = sc_percent_to_threshold(sc, 50); + } else if (type == SC_USER) { + thresh = sc_percent_to_threshold(sc, + user_credit_return_threshold); + } else { /* kernel */ + thresh = sc_mtu_to_threshold(sc, hfi1_max_mtu, hdrqentsize); + } + reg = thresh << SC(CREDIT_CTRL_THRESHOLD_SHIFT); + /* add in early return */ + if (type == SC_USER && HFI1_CAP_IS_USET(EARLY_CREDIT_RETURN)) + reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK); + else if (HFI1_CAP_IS_KSET(EARLY_CREDIT_RETURN)) /* kernel, ack */ + reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK); + + /* set up write-through credit_ctrl */ + sc->credit_ctrl = reg; + write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), reg); + + /* User send contexts should not allow sending on VL15 */ + if (type == SC_USER) { + reg = 1ULL << 15; + write_kctxt_csr(dd, hw_context, SC(CHECK_VL), reg); + } + + spin_unlock_irqrestore(&dd->sc_lock, flags); + + /* + * Allocate shadow ring to track outstanding PIO buffers _after_ + * unlocking. We don't know the size until the lock is held and + * we can't allocate while the lock is held. No one is using + * the context yet, so allocate it now. + * + * User contexts do not get a shadow ring. + */ + if (type != SC_USER) { + /* + * Size the shadow ring 1 larger than the number of credits + * so head == tail can mean empty. + */ + sc->sr_size = sci->credits + 1; + sc->sr = kzalloc_node(sizeof(union pio_shadow_ring) * + sc->sr_size, GFP_KERNEL, numa); + if (!sc->sr) { + dd_dev_err(dd, + "Cannot allocate send context shadow ring structure\n"); + sc_free(sc); + return NULL; + } + } + + dd_dev_info(dd, + "Send context %u(%u) %s group %u credits %u credit_ctrl 0x%llx threshold %u\n", + sw_index, + hw_context, + sc_type_name(type), + sc->group, + sc->credits, + sc->credit_ctrl, + thresh); + + return sc; +} + +/* free a per-NUMA send context structure */ +void sc_free(struct send_context *sc) +{ + struct hfi1_devdata *dd; + unsigned long flags; + u32 sw_index; + u32 hw_context; + + if (!sc) + return; + + sc->flags |= SCF_IN_FREE; /* ensure no restarts */ + dd = sc->dd; + if (!list_empty(&sc->piowait)) + dd_dev_err(dd, "piowait list not empty!\n"); + sw_index = sc->sw_index; + hw_context = sc->hw_context; + sc_disable(sc); /* make sure the HW is disabled */ + flush_work(&sc->halt_work); + + spin_lock_irqsave(&dd->sc_lock, flags); + dd->send_contexts[sw_index].sc = NULL; + + /* clear/disable all registers set in sc_alloc */ + write_kctxt_csr(dd, hw_context, SC(CTRL), 0); + write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), 0); + write_kctxt_csr(dd, hw_context, SC(ERR_MASK), 0); + write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY), 0); + write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE), 0); + write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), 0); + write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), 0); + + /* release the index and context for re-use */ + sc_hw_free(dd, sw_index, hw_context); + spin_unlock_irqrestore(&dd->sc_lock, flags); + + kfree(sc->sr); + kfree(sc); +} + +/* disable the context */ +void sc_disable(struct send_context *sc) +{ + u64 reg; + unsigned long flags; + struct pio_buf *pbuf; + + if (!sc) + return; + + /* do all steps, even if already disabled */ + spin_lock_irqsave(&sc->alloc_lock, flags); + reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL)); + reg &= ~SC(CTRL_CTXT_ENABLE_SMASK); + sc->flags &= ~SCF_ENABLED; + sc_wait_for_packet_egress(sc, 1); + write_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL), reg); + spin_unlock_irqrestore(&sc->alloc_lock, flags); + + /* + * Flush any waiters. Once the context is disabled, + * credit return interrupts are stopped (although there + * could be one in-process when the context is disabled). + * Wait one microsecond for any lingering interrupts, then + * proceed with the flush. + */ + udelay(1); + spin_lock_irqsave(&sc->release_lock, flags); + if (sc->sr) { /* this context has a shadow ring */ + while (sc->sr_tail != sc->sr_head) { + pbuf = &sc->sr[sc->sr_tail].pbuf; + if (pbuf->cb) + (*pbuf->cb)(pbuf->arg, PRC_SC_DISABLE); + sc->sr_tail++; + if (sc->sr_tail >= sc->sr_size) + sc->sr_tail = 0; + } + } + spin_unlock_irqrestore(&sc->release_lock, flags); +} + +/* return SendEgressCtxtStatus.PacketOccupancy */ +#define packet_occupancy(r) \ + (((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK)\ + >> SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT) + +/* is egress halted on the context? */ +#define egress_halted(r) \ + ((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK) + +/* wait for packet egress, optionally pause for credit return */ +static void sc_wait_for_packet_egress(struct send_context *sc, int pause) +{ + struct hfi1_devdata *dd = sc->dd; + u64 reg; + u32 loop = 0; + + while (1) { + reg = read_csr(dd, sc->hw_context * 8 + + SEND_EGRESS_CTXT_STATUS); + /* done if egress is stopped */ + if (egress_halted(reg)) + break; + reg = packet_occupancy(reg); + if (reg == 0) + break; + if (loop > 100) { + dd_dev_err(dd, + "%s: context %u(%u) timeout waiting for packets to egress, remaining count %u\n", + __func__, sc->sw_index, + sc->hw_context, (u32)reg); + break; + } + loop++; + udelay(1); + } + + if (pause) + /* Add additional delay to ensure chip returns all credits */ + pause_for_credit_return(dd); +} + +void sc_wait(struct hfi1_devdata *dd) +{ + int i; + + for (i = 0; i < dd->num_send_contexts; i++) { + struct send_context *sc = dd->send_contexts[i].sc; + + if (!sc) + continue; + sc_wait_for_packet_egress(sc, 0); + } +} + +/* + * Restart a context after it has been halted due to error. + * + * If the first step fails - wait for the halt to be asserted, return early. + * Otherwise complain about timeouts but keep going. + * + * It is expected that allocations (enabled flag bit) have been shut off + * already (only applies to kernel contexts). + */ +int sc_restart(struct send_context *sc) +{ + struct hfi1_devdata *dd = sc->dd; + u64 reg; + u32 loop; + int count; + + /* bounce off if not halted, or being free'd */ + if (!(sc->flags & SCF_HALTED) || (sc->flags & SCF_IN_FREE)) + return -EINVAL; + + dd_dev_info(dd, "restarting send context %u(%u)\n", sc->sw_index, + sc->hw_context); + + /* + * Step 1: Wait for the context to actually halt. + * + * The error interrupt is asynchronous to actually setting halt + * on the context. + */ + loop = 0; + while (1) { + reg = read_kctxt_csr(dd, sc->hw_context, SC(STATUS)); + if (reg & SC(STATUS_CTXT_HALTED_SMASK)) + break; + if (loop > 100) { + dd_dev_err(dd, "%s: context %u(%u) not halting, skipping\n", + __func__, sc->sw_index, sc->hw_context); + return -ETIME; + } + loop++; + udelay(1); + } + + /* + * Step 2: Ensure no users are still trying to write to PIO. + * + * For kernel contexts, we have already turned off buffer allocation. + * Now wait for the buffer count to go to zero. + * + * For user contexts, the user handling code has cut off write access + * to the context's PIO pages before calling this routine and will + * restore write access after this routine returns. + */ + if (sc->type != SC_USER) { + /* kernel context */ + loop = 0; + while (1) { + count = atomic_read(&sc->buffers_allocated); + if (count == 0) + break; + if (loop > 100) { + dd_dev_err(dd, + "%s: context %u(%u) timeout waiting for PIO buffers to zero, remaining %d\n", + __func__, sc->sw_index, + sc->hw_context, count); + } + loop++; + udelay(1); + } + } + + /* + * Step 3: Wait for all packets to egress. + * This is done while disabling the send context + * + * Step 4: Disable the context + * + * This is a superset of the halt. After the disable, the + * errors can be cleared. + */ + sc_disable(sc); + + /* + * Step 5: Enable the context + * + * This enable will clear the halted flag and per-send context + * error flags. + */ + return sc_enable(sc); +} + +/* + * PIO freeze processing. To be called after the TXE block is fully frozen. + * Go through all frozen send contexts and disable them. The contexts are + * already stopped by the freeze. + */ +void pio_freeze(struct hfi1_devdata *dd) +{ + struct send_context *sc; + int i; + + for (i = 0; i < dd->num_send_contexts; i++) { + sc = dd->send_contexts[i].sc; + /* + * Don't disable unallocated, unfrozen, or user send contexts. + * User send contexts will be disabled when the process + * calls into the driver to reset its context. + */ + if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER) + continue; + + /* only need to disable, the context is already stopped */ + sc_disable(sc); + } +} + +/* + * Unfreeze PIO for kernel send contexts. The precondition for calling this + * is that all PIO send contexts have been disabled and the SPC freeze has + * been cleared. Now perform the last step and re-enable each kernel context. + * User (PSM) processing will occur when PSM calls into the kernel to + * acknowledge the freeze. + */ +void pio_kernel_unfreeze(struct hfi1_devdata *dd) +{ + struct send_context *sc; + int i; + + for (i = 0; i < dd->num_send_contexts; i++) { + sc = dd->send_contexts[i].sc; + if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER) + continue; + + sc_enable(sc); /* will clear the sc frozen flag */ + } +} + +/* + * Wait for the SendPioInitCtxt.PioInitInProgress bit to clear. + * Returns: + * -ETIMEDOUT - if we wait too long + * -EIO - if there was an error + */ +static int pio_init_wait_progress(struct hfi1_devdata *dd) +{ + u64 reg; + int max, count = 0; + + /* max is the longest possible HW init time / delay */ + max = (dd->icode == ICODE_FPGA_EMULATION) ? 120 : 5; + while (1) { + reg = read_csr(dd, SEND_PIO_INIT_CTXT); + if (!(reg & SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK)) + break; + if (count >= max) + return -ETIMEDOUT; + udelay(5); + count++; + } + + return reg & SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK ? -EIO : 0; +} + +/* + * Reset all of the send contexts to their power-on state. Used + * only during manual init - no lock against sc_enable needed. + */ +void pio_reset_all(struct hfi1_devdata *dd) +{ + int ret; + + /* make sure the init engine is not busy */ + ret = pio_init_wait_progress(dd); + /* ignore any timeout */ + if (ret == -EIO) { + /* clear the error */ + write_csr(dd, SEND_PIO_ERR_CLEAR, + SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK); + } + + /* reset init all */ + write_csr(dd, SEND_PIO_INIT_CTXT, + SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK); + udelay(2); + ret = pio_init_wait_progress(dd); + if (ret < 0) { + dd_dev_err(dd, + "PIO send context init %s while initializing all PIO blocks\n", + ret == -ETIMEDOUT ? "is stuck" : "had an error"); + } +} + +/* enable the context */ +int sc_enable(struct send_context *sc) +{ + u64 sc_ctrl, reg, pio; + struct hfi1_devdata *dd; + unsigned long flags; + int ret = 0; + + if (!sc) + return -EINVAL; + dd = sc->dd; + + /* + * Obtain the allocator lock to guard against any allocation + * attempts (which should not happen prior to context being + * enabled). On the release/disable side we don't need to + * worry about locking since the releaser will not do anything + * if the context accounting values have not changed. + */ + spin_lock_irqsave(&sc->alloc_lock, flags); + sc_ctrl = read_kctxt_csr(dd, sc->hw_context, SC(CTRL)); + if ((sc_ctrl & SC(CTRL_CTXT_ENABLE_SMASK))) + goto unlock; /* already enabled */ + + /* IMPORTANT: only clear free and fill if transitioning 0 -> 1 */ + + *sc->hw_free = 0; + sc->free = 0; + sc->alloc_free = 0; + sc->fill = 0; + sc->sr_head = 0; + sc->sr_tail = 0; + sc->flags = 0; + atomic_set(&sc->buffers_allocated, 0); + + /* + * Clear all per-context errors. Some of these will be set when + * we are re-enabling after a context halt. Now that the context + * is disabled, the halt will not clear until after the PIO init + * engine runs below. + */ + reg = read_kctxt_csr(dd, sc->hw_context, SC(ERR_STATUS)); + if (reg) + write_kctxt_csr(dd, sc->hw_context, SC(ERR_CLEAR), + reg); + + /* + * The HW PIO initialization engine can handle only one init + * request at a time. Serialize access to each device's engine. + */ + spin_lock(&dd->sc_init_lock); + /* + * Since access to this code block is serialized and + * each access waits for the initialization to complete + * before releasing the lock, the PIO initialization engine + * should not be in use, so we don't have to wait for the + * InProgress bit to go down. + */ + pio = ((sc->hw_context & SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK) << + SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT) | + SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK; + write_csr(dd, SEND_PIO_INIT_CTXT, pio); + /* + * Wait until the engine is done. Give the chip the required time + * so, hopefully, we read the register just once. + */ + udelay(2); + ret = pio_init_wait_progress(dd); + spin_unlock(&dd->sc_init_lock); + if (ret) { + dd_dev_err(dd, + "sctxt%u(%u): Context not enabled due to init failure %d\n", + sc->sw_index, sc->hw_context, ret); + goto unlock; + } + + /* + * All is well. Enable the context. + */ + sc_ctrl |= SC(CTRL_CTXT_ENABLE_SMASK); + write_kctxt_csr(dd, sc->hw_context, SC(CTRL), sc_ctrl); + /* + * Read SendCtxtCtrl to force the write out and prevent a timing + * hazard where a PIO write may reach the context before the enable. + */ + read_kctxt_csr(dd, sc->hw_context, SC(CTRL)); + sc->flags |= SCF_ENABLED; + +unlock: + spin_unlock_irqrestore(&sc->alloc_lock, flags); + + return ret; +} + +/* force a credit return on the context */ +void sc_return_credits(struct send_context *sc) +{ + if (!sc) + return; + + /* a 0->1 transition schedules a credit return */ + write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE), + SC(CREDIT_FORCE_FORCE_RETURN_SMASK)); + /* + * Ensure that the write is flushed and the credit return is + * scheduled. We care more about the 0 -> 1 transition. + */ + read_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE)); + /* set back to 0 for next time */ + write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE), 0); +} + +/* allow all in-flight packets to drain on the context */ +void sc_flush(struct send_context *sc) +{ + if (!sc) + return; + + sc_wait_for_packet_egress(sc, 1); +} + +/* drop all packets on the context, no waiting until they are sent */ +void sc_drop(struct send_context *sc) +{ + if (!sc) + return; + + dd_dev_info(sc->dd, "%s: context %u(%u) - not implemented\n", + __func__, sc->sw_index, sc->hw_context); +} + +/* + * Start the software reaction to a context halt or SPC freeze: + * - mark the context as halted or frozen + * - stop buffer allocations + * + * Called from the error interrupt. Other work is deferred until + * out of the interrupt. + */ +void sc_stop(struct send_context *sc, int flag) +{ + unsigned long flags; + + /* mark the context */ + sc->flags |= flag; + + /* stop buffer allocations */ + spin_lock_irqsave(&sc->alloc_lock, flags); + sc->flags &= ~SCF_ENABLED; + spin_unlock_irqrestore(&sc->alloc_lock, flags); + wake_up(&sc->halt_wait); +} + +#define BLOCK_DWORDS (PIO_BLOCK_SIZE/sizeof(u32)) +#define dwords_to_blocks(x) DIV_ROUND_UP(x, BLOCK_DWORDS) + +/* + * The send context buffer "allocator". + * + * @sc: the PIO send context we are allocating from + * @len: length of whole packet - including PBC - in dwords + * @cb: optional callback to call when the buffer is finished sending + * @arg: argument for cb + * + * Return a pointer to a PIO buffer if successful, NULL if not enough room. + */ +struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len, + pio_release_cb cb, void *arg) +{ + struct pio_buf *pbuf = NULL; + unsigned long flags; + unsigned long avail; + unsigned long blocks = dwords_to_blocks(dw_len); + unsigned long start_fill; + int trycount = 0; + u32 head, next; + + spin_lock_irqsave(&sc->alloc_lock, flags); + if (!(sc->flags & SCF_ENABLED)) { + spin_unlock_irqrestore(&sc->alloc_lock, flags); + goto done; + } + +retry: + avail = (unsigned long)sc->credits - (sc->fill - sc->alloc_free); + if (blocks > avail) { + /* not enough room */ + if (unlikely(trycount)) { /* already tried to get more room */ + spin_unlock_irqrestore(&sc->alloc_lock, flags); + goto done; + } + /* copy from receiver cache line and recalculate */ + sc->alloc_free = ACCESS_ONCE(sc->free); + avail = + (unsigned long)sc->credits - + (sc->fill - sc->alloc_free); + if (blocks > avail) { + /* still no room, actively update */ + spin_unlock_irqrestore(&sc->alloc_lock, flags); + sc_release_update(sc); + spin_lock_irqsave(&sc->alloc_lock, flags); + sc->alloc_free = ACCESS_ONCE(sc->free); + trycount++; + goto retry; + } + } + + /* there is enough room */ + + atomic_inc(&sc->buffers_allocated); + + /* read this once */ + head = sc->sr_head; + + /* "allocate" the buffer */ + start_fill = sc->fill; + sc->fill += blocks; + + /* + * Fill the parts that the releaser looks at before moving the head. + * The only necessary piece is the sent_at field. The credits + * we have just allocated cannot have been returned yet, so the + * cb and arg will not be looked at for a "while". Put them + * on this side of the memory barrier anyway. + */ + pbuf = &sc->sr[head].pbuf; + pbuf->sent_at = sc->fill; + pbuf->cb = cb; + pbuf->arg = arg; + pbuf->sc = sc; /* could be filled in at sc->sr init time */ + /* make sure this is in memory before updating the head */ + + /* calculate next head index, do not store */ + next = head + 1; + if (next >= sc->sr_size) + next = 0; + /* update the head - must be last! - the releaser can look at fields + in pbuf once we move the head */ + smp_wmb(); + sc->sr_head = next; + spin_unlock_irqrestore(&sc->alloc_lock, flags); + + /* finish filling in the buffer outside the lock */ + pbuf->start = sc->base_addr + ((start_fill % sc->credits) + * PIO_BLOCK_SIZE); + pbuf->size = sc->credits * PIO_BLOCK_SIZE; + pbuf->end = sc->base_addr + pbuf->size; + pbuf->block_count = blocks; + pbuf->qw_written = 0; + pbuf->carry_bytes = 0; + pbuf->carry.val64 = 0; +done: + return pbuf; +} + +/* + * There are at least two entities that can turn on credit return + * interrupts and they can overlap. Avoid problems by implementing + * a count scheme that is enforced by a lock. The lock is needed because + * the count and CSR write must be paired. + */ + +/* + * Start credit return interrupts. This is managed by a count. If already + * on, just increment the count. + */ +void sc_add_credit_return_intr(struct send_context *sc) +{ + unsigned long flags; + + /* lock must surround both the count change and the CSR update */ + spin_lock_irqsave(&sc->credit_ctrl_lock, flags); + if (sc->credit_intr_count == 0) { + sc->credit_ctrl |= SC(CREDIT_CTRL_CREDIT_INTR_SMASK); + write_kctxt_csr(sc->dd, sc->hw_context, + SC(CREDIT_CTRL), sc->credit_ctrl); + } + sc->credit_intr_count++; + spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags); +} + +/* + * Stop credit return interrupts. This is managed by a count. Decrement the + * count, if the last user, then turn the credit interrupts off. + */ +void sc_del_credit_return_intr(struct send_context *sc) +{ + unsigned long flags; + + WARN_ON(sc->credit_intr_count == 0); + + /* lock must surround both the count change and the CSR update */ + spin_lock_irqsave(&sc->credit_ctrl_lock, flags); + sc->credit_intr_count--; + if (sc->credit_intr_count == 0) { + sc->credit_ctrl &= ~SC(CREDIT_CTRL_CREDIT_INTR_SMASK); + write_kctxt_csr(sc->dd, sc->hw_context, + SC(CREDIT_CTRL), sc->credit_ctrl); + } + spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags); +} + +/* + * The caller must be careful when calling this. All needint calls + * must be paired with !needint. + */ +void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint) +{ + if (needint) + sc_add_credit_return_intr(sc); + else + sc_del_credit_return_intr(sc); + trace_hfi1_wantpiointr(sc, needint, sc->credit_ctrl); + if (needint) { + mmiowb(); + sc_return_credits(sc); + } +} + +/** + * sc_piobufavail - callback when a PIO buffer is available + * @sc: the send context + * + * This is called from the interrupt handler when a PIO buffer is + * available after hfi1_verbs_send() returned an error that no buffers were + * available. Disable the interrupt if there are no more QPs waiting. + */ +static void sc_piobufavail(struct send_context *sc) +{ + struct hfi1_devdata *dd = sc->dd; + struct hfi1_ibdev *dev = &dd->verbs_dev; + struct list_head *list; + struct hfi1_qp *qps[PIO_WAIT_BATCH_SIZE]; + struct hfi1_qp *qp; + unsigned long flags; + unsigned i, n = 0; + + if (dd->send_contexts[sc->sw_index].type != SC_KERNEL) + return; + list = &sc->piowait; + /* + * Note: checking that the piowait list is empty and clearing + * the buffer available interrupt needs to be atomic or we + * could end up with QPs on the wait list with the interrupt + * disabled. + */ + write_seqlock_irqsave(&dev->iowait_lock, flags); + while (!list_empty(list)) { + struct iowait *wait; + + if (n == ARRAY_SIZE(qps)) + goto full; + wait = list_first_entry(list, struct iowait, list); + qp = container_of(wait, struct hfi1_qp, s_iowait); + list_del_init(&qp->s_iowait.list); + /* refcount held until actual wake up */ + qps[n++] = qp; + } + /* + * Counting: only call wantpiobuf_intr() if there were waiters and they + * are now all gone. + */ + if (n) + hfi1_sc_wantpiobuf_intr(sc, 0); +full: + write_sequnlock_irqrestore(&dev->iowait_lock, flags); + + for (i = 0; i < n; i++) + hfi1_qp_wakeup(qps[i], HFI1_S_WAIT_PIO); +} + +/* translate a send credit update to a bit code of reasons */ +static inline int fill_code(u64 hw_free) +{ + int code = 0; + + if (hw_free & CR_STATUS_SMASK) + code |= PRC_STATUS_ERR; + if (hw_free & CR_CREDIT_RETURN_DUE_TO_PBC_SMASK) + code |= PRC_PBC; + if (hw_free & CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK) + code |= PRC_THRESHOLD; + if (hw_free & CR_CREDIT_RETURN_DUE_TO_ERR_SMASK) + code |= PRC_FILL_ERR; + if (hw_free & CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK) + code |= PRC_SC_DISABLE; + return code; +} + +/* use the jiffies compare to get the wrap right */ +#define sent_before(a, b) time_before(a, b) /* a < b */ + +/* + * The send context buffer "releaser". + */ +void sc_release_update(struct send_context *sc) +{ + struct pio_buf *pbuf; + u64 hw_free; + u32 head, tail; + unsigned long old_free; + unsigned long extra; + unsigned long flags; + int code; + + if (!sc) + return; + + spin_lock_irqsave(&sc->release_lock, flags); + /* update free */ + hw_free = le64_to_cpu(*sc->hw_free); /* volatile read */ + old_free = sc->free; + extra = (((hw_free & CR_COUNTER_SMASK) >> CR_COUNTER_SHIFT) + - (old_free & CR_COUNTER_MASK)) + & CR_COUNTER_MASK; + sc->free = old_free + extra; + trace_hfi1_piofree(sc, extra); + + /* call sent buffer callbacks */ + code = -1; /* code not yet set */ + head = ACCESS_ONCE(sc->sr_head); /* snapshot the head */ + tail = sc->sr_tail; + while (head != tail) { + pbuf = &sc->sr[tail].pbuf; + + if (sent_before(sc->free, pbuf->sent_at)) { + /* not sent yet */ + break; + } + if (pbuf->cb) { + if (code < 0) /* fill in code on first user */ + code = fill_code(hw_free); + (*pbuf->cb)(pbuf->arg, code); + } + + tail++; + if (tail >= sc->sr_size) + tail = 0; + } + /* update tail, in case we moved it */ + sc->sr_tail = tail; + spin_unlock_irqrestore(&sc->release_lock, flags); + sc_piobufavail(sc); +} + +/* + * Send context group releaser. Argument is the send context that caused + * the interrupt. Called from the send context interrupt handler. + * + * Call release on all contexts in the group. + * + * This routine takes the sc_lock without an irqsave because it is only + * called from an interrupt handler. Adjust if that changes. + */ +void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context) +{ + struct send_context *sc; + u32 sw_index; + u32 gc, gc_end; + + spin_lock(&dd->sc_lock); + sw_index = dd->hw_to_sw[hw_context]; + if (unlikely(sw_index >= dd->num_send_contexts)) { + dd_dev_err(dd, "%s: invalid hw (%u) to sw (%u) mapping\n", + __func__, hw_context, sw_index); + goto done; + } + sc = dd->send_contexts[sw_index].sc; + if (unlikely(!sc)) + goto done; + + gc = group_context(hw_context, sc->group); + gc_end = gc + group_size(sc->group); + for (; gc < gc_end; gc++) { + sw_index = dd->hw_to_sw[gc]; + if (unlikely(sw_index >= dd->num_send_contexts)) { + dd_dev_err(dd, + "%s: invalid hw (%u) to sw (%u) mapping\n", + __func__, hw_context, sw_index); + continue; + } + sc_release_update(dd->send_contexts[sw_index].sc); + } +done: + spin_unlock(&dd->sc_lock); +} + +int init_pervl_scs(struct hfi1_devdata *dd) +{ + int i; + u64 mask, all_vl_mask = (u64) 0x80ff; /* VLs 0-7, 15 */ + u32 ctxt; + + dd->vld[15].sc = sc_alloc(dd, SC_KERNEL, + dd->rcd[0]->rcvhdrqentsize, dd->node); + if (!dd->vld[15].sc) + goto nomem; + hfi1_init_ctxt(dd->vld[15].sc); + dd->vld[15].mtu = enum_to_mtu(OPA_MTU_2048); + for (i = 0; i < num_vls; i++) { + /* + * Since this function does not deal with a specific + * receive context but we need the RcvHdrQ entry size, + * use the size from rcd[0]. It is guaranteed to be + * valid at this point and will remain the same for all + * receive contexts. + */ + dd->vld[i].sc = sc_alloc(dd, SC_KERNEL, + dd->rcd[0]->rcvhdrqentsize, dd->node); + if (!dd->vld[i].sc) + goto nomem; + + hfi1_init_ctxt(dd->vld[i].sc); + + /* non VL15 start with the max MTU */ + dd->vld[i].mtu = hfi1_max_mtu; + } + sc_enable(dd->vld[15].sc); + ctxt = dd->vld[15].sc->hw_context; + mask = all_vl_mask & ~(1LL << 15); + write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask); + dd_dev_info(dd, + "Using send context %u(%u) for VL15\n", + dd->vld[15].sc->sw_index, ctxt); + for (i = 0; i < num_vls; i++) { + sc_enable(dd->vld[i].sc); + ctxt = dd->vld[i].sc->hw_context; + mask = all_vl_mask & ~(1LL << i); + write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask); + } + return 0; +nomem: + sc_free(dd->vld[15].sc); + for (i = 0; i < num_vls; i++) + sc_free(dd->vld[i].sc); + return -ENOMEM; +} + +int init_credit_return(struct hfi1_devdata *dd) +{ + int ret; + int num_numa; + int i; + + num_numa = num_online_nodes(); + /* enforce the expectation that the numas are compact */ + for (i = 0; i < num_numa; i++) { + if (!node_online(i)) { + dd_dev_err(dd, "NUMA nodes are not compact\n"); + ret = -EINVAL; + goto done; + } + } + + dd->cr_base = kcalloc( + num_numa, + sizeof(struct credit_return_base), + GFP_KERNEL); + if (!dd->cr_base) { + dd_dev_err(dd, "Unable to allocate credit return base\n"); + ret = -ENOMEM; + goto done; + } + for (i = 0; i < num_numa; i++) { + int bytes = TXE_NUM_CONTEXTS * sizeof(struct credit_return); + + set_dev_node(&dd->pcidev->dev, i); + dd->cr_base[i].va = dma_zalloc_coherent( + &dd->pcidev->dev, + bytes, + &dd->cr_base[i].pa, + GFP_KERNEL); + if (dd->cr_base[i].va == NULL) { + set_dev_node(&dd->pcidev->dev, dd->node); + dd_dev_err(dd, + "Unable to allocate credit return DMA range for NUMA %d\n", + i); + ret = -ENOMEM; + goto done; + } + } + set_dev_node(&dd->pcidev->dev, dd->node); + + ret = 0; +done: + return ret; +} + +void free_credit_return(struct hfi1_devdata *dd) +{ + int num_numa; + int i; + + if (!dd->cr_base) + return; + + num_numa = num_online_nodes(); + for (i = 0; i < num_numa; i++) { + if (dd->cr_base[i].va) { + dma_free_coherent(&dd->pcidev->dev, + TXE_NUM_CONTEXTS + * sizeof(struct credit_return), + dd->cr_base[i].va, + dd->cr_base[i].pa); + } + } + kfree(dd->cr_base); + dd->cr_base = NULL; +} diff --git a/drivers/staging/rdma/hfi1/pio.h b/drivers/staging/rdma/hfi1/pio.h new file mode 100644 index 000000000..0bb885ca3 --- /dev/null +++ b/drivers/staging/rdma/hfi1/pio.h @@ -0,0 +1,224 @@ +#ifndef _PIO_H +#define _PIO_H +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + + +/* send context types */ +#define SC_KERNEL 0 +#define SC_ACK 1 +#define SC_USER 2 +#define SC_MAX 3 + +/* invalid send context index */ +#define INVALID_SCI 0xff + +/* PIO buffer release callback function */ +typedef void (*pio_release_cb)(void *arg, int code); + +/* PIO release codes - in bits, as there could more than one that apply */ +#define PRC_OK 0 /* no known error */ +#define PRC_STATUS_ERR 0x01 /* credit return due to status error */ +#define PRC_PBC 0x02 /* credit return due to PBC */ +#define PRC_THRESHOLD 0x04 /* credit return due to threshold */ +#define PRC_FILL_ERR 0x08 /* credit return due fill error */ +#define PRC_FORCE 0x10 /* credit return due credit force */ +#define PRC_SC_DISABLE 0x20 /* clean-up after a context disable */ + +/* byte helper */ +union mix { + u64 val64; + u32 val32[2]; + u8 val8[8]; +}; + +/* an allocated PIO buffer */ +struct pio_buf { + struct send_context *sc;/* back pointer to owning send context */ + pio_release_cb cb; /* called when the buffer is released */ + void *arg; /* argument for cb */ + void __iomem *start; /* buffer start address */ + void __iomem *end; /* context end address */ + unsigned long size; /* context size, in bytes */ + unsigned long sent_at; /* buffer is sent when <= free */ + u32 block_count; /* size of buffer, in blocks */ + u32 qw_written; /* QW written so far */ + u32 carry_bytes; /* number of valid bytes in carry */ + union mix carry; /* pending unwritten bytes */ +}; + +/* cache line aligned pio buffer array */ +union pio_shadow_ring { + struct pio_buf pbuf; + u64 unused[16]; /* cache line spacer */ +} ____cacheline_aligned; + +/* per-NUMA send context */ +struct send_context { + /* read-only after init */ + struct hfi1_devdata *dd; /* device */ + void __iomem *base_addr; /* start of PIO memory */ + union pio_shadow_ring *sr; /* shadow ring */ + volatile __le64 *hw_free; /* HW free counter */ + struct work_struct halt_work; /* halted context work queue entry */ + unsigned long flags; /* flags */ + int node; /* context home node */ + int type; /* context type */ + u32 sw_index; /* software index number */ + u32 hw_context; /* hardware context number */ + u32 credits; /* number of blocks in context */ + u32 sr_size; /* size of the shadow ring */ + u32 group; /* credit return group */ + /* allocator fields */ + spinlock_t alloc_lock ____cacheline_aligned_in_smp; + unsigned long fill; /* official alloc count */ + unsigned long alloc_free; /* copy of free (less cache thrash) */ + u32 sr_head; /* shadow ring head */ + /* releaser fields */ + spinlock_t release_lock ____cacheline_aligned_in_smp; + unsigned long free; /* official free count */ + u32 sr_tail; /* shadow ring tail */ + /* list for PIO waiters */ + struct list_head piowait ____cacheline_aligned_in_smp; + spinlock_t credit_ctrl_lock ____cacheline_aligned_in_smp; + u64 credit_ctrl; /* cache for credit control */ + u32 credit_intr_count; /* count of credit intr users */ + atomic_t buffers_allocated; /* count of buffers allocated */ + wait_queue_head_t halt_wait; /* wait until kernel sees interrupt */ +}; + +/* send context flags */ +#define SCF_ENABLED 0x01 +#define SCF_IN_FREE 0x02 +#define SCF_HALTED 0x04 +#define SCF_FROZEN 0x08 + +struct send_context_info { + struct send_context *sc; /* allocated working context */ + u16 allocated; /* has this been allocated? */ + u16 type; /* context type */ + u16 base; /* base in PIO array */ + u16 credits; /* size in PIO array */ +}; + +/* DMA credit return, index is always (context & 0x7) */ +struct credit_return { + volatile __le64 cr[8]; +}; + +/* NUMA indexed credit return array */ +struct credit_return_base { + struct credit_return *va; + dma_addr_t pa; +}; + +/* send context configuration sizes (one per type) */ +struct sc_config_sizes { + short int size; + short int count; +}; + +/* send context functions */ +int init_credit_return(struct hfi1_devdata *dd); +void free_credit_return(struct hfi1_devdata *dd); +int init_sc_pools_and_sizes(struct hfi1_devdata *dd); +int init_send_contexts(struct hfi1_devdata *dd); +int init_credit_return(struct hfi1_devdata *dd); +int init_pervl_scs(struct hfi1_devdata *dd); +struct send_context *sc_alloc(struct hfi1_devdata *dd, int type, + uint hdrqentsize, int numa); +void sc_free(struct send_context *sc); +int sc_enable(struct send_context *sc); +void sc_disable(struct send_context *sc); +int sc_restart(struct send_context *sc); +void sc_return_credits(struct send_context *sc); +void sc_flush(struct send_context *sc); +void sc_drop(struct send_context *sc); +void sc_stop(struct send_context *sc, int bit); +struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len, + pio_release_cb cb, void *arg); +void sc_release_update(struct send_context *sc); +void sc_return_credits(struct send_context *sc); +void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context); +void sc_add_credit_return_intr(struct send_context *sc); +void sc_del_credit_return_intr(struct send_context *sc); +void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold); +u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize); +void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint); +void sc_wait(struct hfi1_devdata *dd); +void set_pio_integrity(struct send_context *sc); + +/* support functions */ +void pio_reset_all(struct hfi1_devdata *dd); +void pio_freeze(struct hfi1_devdata *dd); +void pio_kernel_unfreeze(struct hfi1_devdata *dd); + +/* global PIO send control operations */ +#define PSC_GLOBAL_ENABLE 0 +#define PSC_GLOBAL_DISABLE 1 +#define PSC_GLOBAL_VLARB_ENABLE 2 +#define PSC_GLOBAL_VLARB_DISABLE 3 +#define PSC_CM_RESET 4 +#define PSC_DATA_VL_ENABLE 5 +#define PSC_DATA_VL_DISABLE 6 + +void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl); +void pio_send_control(struct hfi1_devdata *dd, int op); + + +/* PIO copy routines */ +void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc, + const void *from, size_t count); +void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc, + const void *from, size_t nbytes); +void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes); +void seg_pio_copy_end(struct pio_buf *pbuf); + +#endif /* _PIO_H */ diff --git a/drivers/staging/rdma/hfi1/pio_copy.c b/drivers/staging/rdma/hfi1/pio_copy.c new file mode 100644 index 000000000..8972bbc02 --- /dev/null +++ b/drivers/staging/rdma/hfi1/pio_copy.c @@ -0,0 +1,858 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "hfi.h" + +/* additive distance between non-SOP and SOP space */ +#define SOP_DISTANCE (TXE_PIO_SIZE / 2) +#define PIO_BLOCK_MASK (PIO_BLOCK_SIZE-1) +/* number of QUADWORDs in a block */ +#define PIO_BLOCK_QWS (PIO_BLOCK_SIZE/sizeof(u64)) + +/** + * pio_copy - copy data block to MMIO space + * @pbuf: a number of blocks allocated within a PIO send context + * @pbc: PBC to send + * @from: source, must be 8 byte aligned + * @count: number of DWORD (32-bit) quantities to copy from source + * + * Copy data from source to PIO Send Buffer memory, 8 bytes at a time. + * Must always write full BLOCK_SIZE bytes blocks. The first block must + * be written to the corresponding SOP=1 address. + * + * Known: + * o pbuf->start always starts on a block boundary + * o pbuf can wrap only at a block boundary + */ +void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc, + const void *from, size_t count) +{ + void __iomem *dest = pbuf->start + SOP_DISTANCE; + void __iomem *send = dest + PIO_BLOCK_SIZE; + void __iomem *dend; /* 8-byte data end */ + + /* write the PBC */ + writeq(pbc, dest); + dest += sizeof(u64); + + /* calculate where the QWORD data ends - in SOP=1 space */ + dend = dest + ((count>>1) * sizeof(u64)); + + if (dend < send) { + /* all QWORD data is within the SOP block, does *not* + reach the end of the SOP block */ + + while (dest < dend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + /* + * No boundary checks are needed here: + * 0. We're not on the SOP block boundary + * 1. The possible DWORD dangle will still be within + * the SOP block + * 2. We cannot wrap except on a block boundary. + */ + } else { + /* QWORD data extends _to_ or beyond the SOP block */ + + /* write 8-byte SOP chunk data */ + while (dest < send) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + /* drop out of the SOP range */ + dest -= SOP_DISTANCE; + dend -= SOP_DISTANCE; + + /* + * If the wrap comes before or matches the data end, + * copy until until the wrap, then wrap. + * + * If the data ends at the end of the SOP above and + * the buffer wraps, then pbuf->end == dend == dest + * and nothing will get written, but we will wrap in + * case there is a dangling DWORD. + */ + if (pbuf->end <= dend) { + while (dest < pbuf->end) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + + dest -= pbuf->size; + dend -= pbuf->size; + } + + /* write 8-byte non-SOP, non-wrap chunk data */ + while (dest < dend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + } + /* at this point we have wrapped if we are going to wrap */ + + /* write dangling u32, if any */ + if (count & 1) { + union mix val; + + val.val64 = 0; + val.val32[0] = *(u32 *)from; + writeq(val.val64, dest); + dest += sizeof(u64); + } + /* fill in rest of block, no need to check pbuf->end + as we only wrap on a block boundary */ + while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) { + writeq(0, dest); + dest += sizeof(u64); + } + + /* finished with this buffer */ + atomic_dec(&pbuf->sc->buffers_allocated); +} + +/* USE_SHIFTS is faster in user-space tests on a Xeon X5570 @ 2.93GHz */ +#define USE_SHIFTS 1 +#ifdef USE_SHIFTS +/* + * Handle carry bytes using shifts and masks. + * + * NOTE: the value the unused portion of carry is expected to always be zero. + */ + +/* + * "zero" shift - bit shift used to zero out upper bytes. Input is + * the count of LSB bytes to preserve. + */ +#define zshift(x) (8 * (8-(x))) + +/* + * "merge" shift - bit shift used to merge with carry bytes. Input is + * the LSB byte count to move beyond. + */ +#define mshift(x) (8 * (x)) + +/* + * Read nbytes bytes from "from" and return them in the LSB bytes + * of pbuf->carry. Other bytes are zeroed. Any previous value + * pbuf->carry is lost. + * + * NOTES: + * o do not read from from if nbytes is zero + * o from may _not_ be u64 aligned + * o nbytes must not span a QW boundary + */ +static inline void read_low_bytes(struct pio_buf *pbuf, const void *from, + unsigned int nbytes) +{ + unsigned long off; + + if (nbytes == 0) { + pbuf->carry.val64 = 0; + } else { + /* align our pointer */ + off = (unsigned long)from & 0x7; + from = (void *)((unsigned long)from & ~0x7l); + pbuf->carry.val64 = ((*(u64 *)from) + << zshift(nbytes + off))/* zero upper bytes */ + >> zshift(nbytes); /* place at bottom */ + } + pbuf->carry_bytes = nbytes; +} + +/* + * Read nbytes bytes from "from" and put them at the next significant bytes + * of pbuf->carry. Unused bytes are zeroed. It is expected that the extra + * read does not overfill carry. + * + * NOTES: + * o from may _not_ be u64 aligned + * o nbytes may span a QW boundary + */ +static inline void read_extra_bytes(struct pio_buf *pbuf, + const void *from, unsigned int nbytes) +{ + unsigned long off = (unsigned long)from & 0x7; + unsigned int room, xbytes; + + /* align our pointer */ + from = (void *)((unsigned long)from & ~0x7l); + + /* check count first - don't read anything if count is zero */ + while (nbytes) { + /* find the number of bytes in this u64 */ + room = 8 - off; /* this u64 has room for this many bytes */ + xbytes = nbytes > room ? room : nbytes; + + /* + * shift down to zero lower bytes, shift up to zero upper + * bytes, shift back down to move into place + */ + pbuf->carry.val64 |= (((*(u64 *)from) + >> mshift(off)) + << zshift(xbytes)) + >> zshift(xbytes+pbuf->carry_bytes); + off = 0; + pbuf->carry_bytes += xbytes; + nbytes -= xbytes; + from += sizeof(u64); + } +} + +/* + * Zero extra bytes from the end of pbuf->carry. + * + * NOTES: + * o zbytes <= old_bytes + */ +static inline void zero_extra_bytes(struct pio_buf *pbuf, unsigned int zbytes) +{ + unsigned int remaining; + + if (zbytes == 0) /* nothing to do */ + return; + + remaining = pbuf->carry_bytes - zbytes; /* remaining bytes */ + + /* NOTE: zshift only guaranteed to work if remaining != 0 */ + if (remaining) + pbuf->carry.val64 = (pbuf->carry.val64 << zshift(remaining)) + >> zshift(remaining); + else + pbuf->carry.val64 = 0; + pbuf->carry_bytes = remaining; +} + +/* + * Write a quad word using parts of pbuf->carry and the next 8 bytes of src. + * Put the unused part of the next 8 bytes of src into the LSB bytes of + * pbuf->carry with the upper bytes zeroed.. + * + * NOTES: + * o result must keep unused bytes zeroed + * o src must be u64 aligned + */ +static inline void merge_write8( + struct pio_buf *pbuf, + void __iomem *dest, + const void *src) +{ + u64 new, temp; + + new = *(u64 *)src; + temp = pbuf->carry.val64 | (new << mshift(pbuf->carry_bytes)); + writeq(temp, dest); + pbuf->carry.val64 = new >> zshift(pbuf->carry_bytes); +} + +/* + * Write a quad word using all bytes of carry. + */ +static inline void carry8_write8(union mix carry, void __iomem *dest) +{ + writeq(carry.val64, dest); +} + +/* + * Write a quad word using all the valid bytes of carry. If carry + * has zero valid bytes, nothing is written. + * Returns 0 on nothing written, non-zero on quad word written. + */ +static inline int carry_write8(struct pio_buf *pbuf, void __iomem *dest) +{ + if (pbuf->carry_bytes) { + /* unused bytes are always kept zeroed, so just write */ + writeq(pbuf->carry.val64, dest); + return 1; + } + + return 0; +} + +#else /* USE_SHIFTS */ +/* + * Handle carry bytes using byte copies. + * + * NOTE: the value the unused portion of carry is left uninitialized. + */ + +/* + * Jump copy - no-loop copy for < 8 bytes. + */ +static inline void jcopy(u8 *dest, const u8 *src, u32 n) +{ + switch (n) { + case 7: + *dest++ = *src++; + case 6: + *dest++ = *src++; + case 5: + *dest++ = *src++; + case 4: + *dest++ = *src++; + case 3: + *dest++ = *src++; + case 2: + *dest++ = *src++; + case 1: + *dest++ = *src++; + } +} + +/* + * Read nbytes from "from" and and place them in the low bytes + * of pbuf->carry. Other bytes are left as-is. Any previous + * value in pbuf->carry is lost. + * + * NOTES: + * o do not read from from if nbytes is zero + * o from may _not_ be u64 aligned. + */ +static inline void read_low_bytes(struct pio_buf *pbuf, const void *from, + unsigned int nbytes) +{ + jcopy(&pbuf->carry.val8[0], from, nbytes); + pbuf->carry_bytes = nbytes; +} + +/* + * Read nbytes bytes from "from" and put them at the end of pbuf->carry. + * It is expected that the extra read does not overfill carry. + * + * NOTES: + * o from may _not_ be u64 aligned + * o nbytes may span a QW boundary + */ +static inline void read_extra_bytes(struct pio_buf *pbuf, + const void *from, unsigned int nbytes) +{ + jcopy(&pbuf->carry.val8[pbuf->carry_bytes], from, nbytes); + pbuf->carry_bytes += nbytes; +} + +/* + * Zero extra bytes from the end of pbuf->carry. + * + * We do not care about the value of unused bytes in carry, so just + * reduce the byte count. + * + * NOTES: + * o zbytes <= old_bytes + */ +static inline void zero_extra_bytes(struct pio_buf *pbuf, unsigned int zbytes) +{ + pbuf->carry_bytes -= zbytes; +} + +/* + * Write a quad word using parts of pbuf->carry and the next 8 bytes of src. + * Put the unused part of the next 8 bytes of src into the low bytes of + * pbuf->carry. + */ +static inline void merge_write8( + struct pio_buf *pbuf, + void *dest, + const void *src) +{ + u32 remainder = 8 - pbuf->carry_bytes; + + jcopy(&pbuf->carry.val8[pbuf->carry_bytes], src, remainder); + writeq(pbuf->carry.val64, dest); + jcopy(&pbuf->carry.val8[0], src+remainder, pbuf->carry_bytes); +} + +/* + * Write a quad word using all bytes of carry. + */ +static inline void carry8_write8(union mix carry, void *dest) +{ + writeq(carry.val64, dest); +} + +/* + * Write a quad word using all the valid bytes of carry. If carry + * has zero valid bytes, nothing is written. + * Returns 0 on nothing written, non-zero on quad word written. + */ +static inline int carry_write8(struct pio_buf *pbuf, void *dest) +{ + if (pbuf->carry_bytes) { + u64 zero = 0; + + jcopy(&pbuf->carry.val8[pbuf->carry_bytes], (u8 *)&zero, + 8 - pbuf->carry_bytes); + writeq(pbuf->carry.val64, dest); + return 1; + } + + return 0; +} +#endif /* USE_SHIFTS */ + +/* + * Segmented PIO Copy - start + * + * Start a PIO copy. + * + * @pbuf: destination buffer + * @pbc: the PBC for the PIO buffer + * @from: data source, QWORD aligned + * @nbytes: bytes to copy + */ +void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc, + const void *from, size_t nbytes) +{ + void __iomem *dest = pbuf->start + SOP_DISTANCE; + void __iomem *send = dest + PIO_BLOCK_SIZE; + void __iomem *dend; /* 8-byte data end */ + + writeq(pbc, dest); + dest += sizeof(u64); + + /* calculate where the QWORD data ends - in SOP=1 space */ + dend = dest + ((nbytes>>3) * sizeof(u64)); + + if (dend < send) { + /* all QWORD data is within the SOP block, does *not* + reach the end of the SOP block */ + + while (dest < dend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + /* + * No boundary checks are needed here: + * 0. We're not on the SOP block boundary + * 1. The possible DWORD dangle will still be within + * the SOP block + * 2. We cannot wrap except on a block boundary. + */ + } else { + /* QWORD data extends _to_ or beyond the SOP block */ + + /* write 8-byte SOP chunk data */ + while (dest < send) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + /* drop out of the SOP range */ + dest -= SOP_DISTANCE; + dend -= SOP_DISTANCE; + + /* + * If the wrap comes before or matches the data end, + * copy until until the wrap, then wrap. + * + * If the data ends at the end of the SOP above and + * the buffer wraps, then pbuf->end == dend == dest + * and nothing will get written, but we will wrap in + * case there is a dangling DWORD. + */ + if (pbuf->end <= dend) { + while (dest < pbuf->end) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + + dest -= pbuf->size; + dend -= pbuf->size; + } + + /* write 8-byte non-SOP, non-wrap chunk data */ + while (dest < dend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + } + /* at this point we have wrapped if we are going to wrap */ + + /* ...but it doesn't matter as we're done writing */ + + /* save dangling bytes, if any */ + read_low_bytes(pbuf, from, nbytes & 0x7); + + pbuf->qw_written = 1 /*PBC*/ + (nbytes >> 3); +} + +/* + * Mid copy helper, "mixed case" - source is 64-bit aligned but carry + * bytes are non-zero. + * + * Whole u64s must be written to the chip, so bytes must be manually merged. + * + * @pbuf: destination buffer + * @from: data source, is QWORD aligned. + * @nbytes: bytes to copy + * + * Must handle nbytes < 8. + */ +static void mid_copy_mix(struct pio_buf *pbuf, const void *from, size_t nbytes) +{ + void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64)); + void __iomem *dend; /* 8-byte data end */ + unsigned long qw_to_write = (pbuf->carry_bytes + nbytes) >> 3; + unsigned long bytes_left = (pbuf->carry_bytes + nbytes) & 0x7; + + /* calculate 8-byte data end */ + dend = dest + (qw_to_write * sizeof(u64)); + + if (pbuf->qw_written < PIO_BLOCK_QWS) { + /* + * Still within SOP block. We don't need to check for + * wrap because we are still in the first block and + * can only wrap on block boundaries. + */ + void __iomem *send; /* SOP end */ + void __iomem *xend; + + /* calculate the end of data or end of block, whichever + comes first */ + send = pbuf->start + PIO_BLOCK_SIZE; + xend = send < dend ? send : dend; + + /* shift up to SOP=1 space */ + dest += SOP_DISTANCE; + xend += SOP_DISTANCE; + + /* write 8-byte chunk data */ + while (dest < xend) { + merge_write8(pbuf, dest, from); + from += sizeof(u64); + dest += sizeof(u64); + } + + /* shift down to SOP=0 space */ + dest -= SOP_DISTANCE; + } + /* + * At this point dest could be (either, both, or neither): + * - at dend + * - at the wrap + */ + + /* + * If the wrap comes before or matches the data end, + * copy until until the wrap, then wrap. + * + * If dest is at the wrap, we will fall into the if, + * not do the loop, when wrap. + * + * If the data ends at the end of the SOP above and + * the buffer wraps, then pbuf->end == dend == dest + * and nothing will get written. + */ + if (pbuf->end <= dend) { + while (dest < pbuf->end) { + merge_write8(pbuf, dest, from); + from += sizeof(u64); + dest += sizeof(u64); + } + + dest -= pbuf->size; + dend -= pbuf->size; + } + + /* write 8-byte non-SOP, non-wrap chunk data */ + while (dest < dend) { + merge_write8(pbuf, dest, from); + from += sizeof(u64); + dest += sizeof(u64); + } + + /* adjust carry */ + if (pbuf->carry_bytes < bytes_left) { + /* need to read more */ + read_extra_bytes(pbuf, from, bytes_left - pbuf->carry_bytes); + } else { + /* remove invalid bytes */ + zero_extra_bytes(pbuf, pbuf->carry_bytes - bytes_left); + } + + pbuf->qw_written += qw_to_write; +} + +/* + * Mid copy helper, "straight case" - source pointer is 64-bit aligned + * with no carry bytes. + * + * @pbuf: destination buffer + * @from: data source, is QWORD aligned + * @nbytes: bytes to copy + * + * Must handle nbytes < 8. + */ +static void mid_copy_straight(struct pio_buf *pbuf, + const void *from, size_t nbytes) +{ + void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64)); + void __iomem *dend; /* 8-byte data end */ + + /* calculate 8-byte data end */ + dend = dest + ((nbytes>>3) * sizeof(u64)); + + if (pbuf->qw_written < PIO_BLOCK_QWS) { + /* + * Still within SOP block. We don't need to check for + * wrap because we are still in the first block and + * can only wrap on block boundaries. + */ + void __iomem *send; /* SOP end */ + void __iomem *xend; + + /* calculate the end of data or end of block, whichever + comes first */ + send = pbuf->start + PIO_BLOCK_SIZE; + xend = send < dend ? send : dend; + + /* shift up to SOP=1 space */ + dest += SOP_DISTANCE; + xend += SOP_DISTANCE; + + /* write 8-byte chunk data */ + while (dest < xend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + + /* shift down to SOP=0 space */ + dest -= SOP_DISTANCE; + } + /* + * At this point dest could be (either, both, or neither): + * - at dend + * - at the wrap + */ + + /* + * If the wrap comes before or matches the data end, + * copy until until the wrap, then wrap. + * + * If dest is at the wrap, we will fall into the if, + * not do the loop, when wrap. + * + * If the data ends at the end of the SOP above and + * the buffer wraps, then pbuf->end == dend == dest + * and nothing will get written. + */ + if (pbuf->end <= dend) { + while (dest < pbuf->end) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + + dest -= pbuf->size; + dend -= pbuf->size; + } + + /* write 8-byte non-SOP, non-wrap chunk data */ + while (dest < dend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + + /* we know carry_bytes was zero on entry to this routine */ + read_low_bytes(pbuf, from, nbytes & 0x7); + + pbuf->qw_written += nbytes>>3; +} + +/* + * Segmented PIO Copy - middle + * + * Must handle any aligned tail and any aligned source with any byte count. + * + * @pbuf: a number of blocks allocated within a PIO send context + * @from: data source + * @nbytes: number of bytes to copy + */ +void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes) +{ + unsigned long from_align = (unsigned long)from & 0x7; + + if (pbuf->carry_bytes + nbytes < 8) { + /* not enough bytes to fill a QW */ + read_extra_bytes(pbuf, from, nbytes); + return; + } + + if (from_align) { + /* misaligned source pointer - align it */ + unsigned long to_align; + + /* bytes to read to align "from" */ + to_align = 8 - from_align; + + /* + * In the advance-to-alignment logic below, we do not need + * to check if we are using more than nbytes. This is because + * if we are here, we already know that carry+nbytes will + * fill at least one QW. + */ + if (pbuf->carry_bytes + to_align < 8) { + /* not enough align bytes to fill a QW */ + read_extra_bytes(pbuf, from, to_align); + from += to_align; + nbytes -= to_align; + } else { + /* bytes to fill carry */ + unsigned long to_fill = 8 - pbuf->carry_bytes; + /* bytes left over to be read */ + unsigned long extra = to_align - to_fill; + void __iomem *dest; + + /* fill carry... */ + read_extra_bytes(pbuf, from, to_fill); + from += to_fill; + nbytes -= to_fill; + + /* ...now write carry */ + dest = pbuf->start + (pbuf->qw_written * sizeof(u64)); + + /* + * The two checks immediately below cannot both be + * true, hence the else. If we have wrapped, we + * cannot still be within the first block. + * Conversely, if we are still in the first block, we + * cannot have wrapped. We do the wrap check first + * as that is more likely. + */ + /* adjust if we've wrapped */ + if (dest >= pbuf->end) + dest -= pbuf->size; + /* jump to SOP range if within the first block */ + else if (pbuf->qw_written < PIO_BLOCK_QWS) + dest += SOP_DISTANCE; + + carry8_write8(pbuf->carry, dest); + pbuf->qw_written++; + + /* read any extra bytes to do final alignment */ + /* this will overwrite anything in pbuf->carry */ + read_low_bytes(pbuf, from, extra); + from += extra; + nbytes -= extra; + } + + /* at this point, from is QW aligned */ + } + + if (pbuf->carry_bytes) + mid_copy_mix(pbuf, from, nbytes); + else + mid_copy_straight(pbuf, from, nbytes); +} + +/* + * Segmented PIO Copy - end + * + * Write any remainder (in pbuf->carry) and finish writing the whole block. + * + * @pbuf: a number of blocks allocated within a PIO send context + */ +void seg_pio_copy_end(struct pio_buf *pbuf) +{ + void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64)); + + /* + * The two checks immediately below cannot both be true, hence the + * else. If we have wrapped, we cannot still be within the first + * block. Conversely, if we are still in the first block, we + * cannot have wrapped. We do the wrap check first as that is + * more likely. + */ + /* adjust if we have wrapped */ + if (dest >= pbuf->end) + dest -= pbuf->size; + /* jump to the SOP range if within the first block */ + else if (pbuf->qw_written < PIO_BLOCK_QWS) + dest += SOP_DISTANCE; + + /* write final bytes, if any */ + if (carry_write8(pbuf, dest)) { + dest += sizeof(u64); + /* + * NOTE: We do not need to recalculate whether dest needs + * SOP_DISTANCE or not. + * + * If we are in the first block and the dangle write + * keeps us in the same block, dest will need + * to retain SOP_DISTANCE in the loop below. + * + * If we are in the first block and the dangle write pushes + * us to the next block, then loop below will not run + * and dest is not used. Hence we do not need to update + * it. + * + * If we are past the first block, then SOP_DISTANCE + * was never added, so there is nothing to do. + */ + } + + /* fill in rest of block */ + while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) { + writeq(0, dest); + dest += sizeof(u64); + } + + /* finished with this buffer */ + atomic_dec(&pbuf->sc->buffers_allocated); +} diff --git a/drivers/staging/rdma/hfi1/platform_config.h b/drivers/staging/rdma/hfi1/platform_config.h new file mode 100644 index 000000000..8a94a8342 --- /dev/null +++ b/drivers/staging/rdma/hfi1/platform_config.h @@ -0,0 +1,286 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#ifndef __PLATFORM_CONFIG_H +#define __PLATFORM_CONFIG_H + +#define METADATA_TABLE_FIELD_START_SHIFT 0 +#define METADATA_TABLE_FIELD_START_LEN_BITS 15 +#define METADATA_TABLE_FIELD_LEN_SHIFT 16 +#define METADATA_TABLE_FIELD_LEN_LEN_BITS 16 + +/* Header structure */ +#define PLATFORM_CONFIG_HEADER_RECORD_IDX_SHIFT 0 +#define PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS 6 +#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT 16 +#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS 12 +#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT 28 +#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS 4 + +enum platform_config_table_type_encoding { + PLATFORM_CONFIG_TABLE_RESERVED, + PLATFORM_CONFIG_SYSTEM_TABLE, + PLATFORM_CONFIG_PORT_TABLE, + PLATFORM_CONFIG_RX_PRESET_TABLE, + PLATFORM_CONFIG_TX_PRESET_TABLE, + PLATFORM_CONFIG_QSFP_ATTEN_TABLE, + PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE, + PLATFORM_CONFIG_TABLE_MAX +}; + +enum platform_config_system_table_fields { + SYSTEM_TABLE_RESERVED, + SYSTEM_TABLE_NODE_STRING, + SYSTEM_TABLE_SYSTEM_IMAGE_GUID, + SYSTEM_TABLE_NODE_GUID, + SYSTEM_TABLE_REVISION, + SYSTEM_TABLE_VENDOR_OUI, + SYSTEM_TABLE_META_VERSION, + SYSTEM_TABLE_DEVICE_ID, + SYSTEM_TABLE_PARTITION_ENFORCEMENT_CAP, + SYSTEM_TABLE_QSFP_POWER_CLASS_MAX, + SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_12G, + SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G, + SYSTEM_TABLE_VARIABLE_TABLE_ENTRIES_PER_PORT, + SYSTEM_TABLE_MAX +}; + +enum platform_config_port_table_fields { + PORT_TABLE_RESERVED, + PORT_TABLE_PORT_TYPE, + PORT_TABLE_ATTENUATION_12G, + PORT_TABLE_ATTENUATION_25G, + PORT_TABLE_LINK_SPEED_SUPPORTED, + PORT_TABLE_LINK_WIDTH_SUPPORTED, + PORT_TABLE_VL_CAP, + PORT_TABLE_MTU_CAP, + PORT_TABLE_TX_LANE_ENABLE_MASK, + PORT_TABLE_LOCAL_MAX_TIMEOUT, + PORT_TABLE_AUTO_LANE_SHEDDING_ENABLED, + PORT_TABLE_EXTERNAL_LOOPBACK_ALLOWED, + PORT_TABLE_TX_PRESET_IDX_PASSIVE_CU, + PORT_TABLE_TX_PRESET_IDX_ACTIVE_NO_EQ, + PORT_TABLE_TX_PRESET_IDX_ACTIVE_EQ, + PORT_TABLE_RX_PRESET_IDX, + PORT_TABLE_CABLE_REACH_CLASS, + PORT_TABLE_MAX +}; + +enum platform_config_rx_preset_table_fields { + RX_PRESET_TABLE_RESERVED, + RX_PRESET_TABLE_QSFP_RX_CDR_APPLY, + RX_PRESET_TABLE_QSFP_RX_EQ_APPLY, + RX_PRESET_TABLE_QSFP_RX_AMP_APPLY, + RX_PRESET_TABLE_QSFP_RX_CDR, + RX_PRESET_TABLE_QSFP_RX_EQ, + RX_PRESET_TABLE_QSFP_RX_AMP, + RX_PRESET_TABLE_MAX +}; + +enum platform_config_tx_preset_table_fields { + TX_PRESET_TABLE_RESERVED, + TX_PRESET_TABLE_PRECUR, + TX_PRESET_TABLE_ATTN, + TX_PRESET_TABLE_POSTCUR, + TX_PRESET_TABLE_QSFP_TX_CDR_APPLY, + TX_PRESET_TABLE_QSFP_TX_EQ_APPLY, + TX_PRESET_TABLE_QSFP_TX_CDR, + TX_PRESET_TABLE_QSFP_TX_EQ, + TX_PRESET_TABLE_MAX +}; + +enum platform_config_qsfp_attn_table_fields { + QSFP_ATTEN_TABLE_RESERVED, + QSFP_ATTEN_TABLE_TX_PRESET_IDX, + QSFP_ATTEN_TABLE_RX_PRESET_IDX, + QSFP_ATTEN_TABLE_MAX +}; + +enum platform_config_variable_settings_table_fields { + VARIABLE_SETTINGS_TABLE_RESERVED, + VARIABLE_SETTINGS_TABLE_TX_PRESET_IDX, + VARIABLE_SETTINGS_TABLE_RX_PRESET_IDX, + VARIABLE_SETTINGS_TABLE_MAX +}; + +struct platform_config_data { + u32 *table; + u32 *table_metadata; + u32 num_table; +}; + +/* + * This struct acts as a quick reference into the platform_data binary image + * and is populated by parse_platform_config(...) depending on the specific + * META_VERSION + */ +struct platform_config_cache { + u8 cache_valid; + struct platform_config_data config_tables[PLATFORM_CONFIG_TABLE_MAX]; +}; + +static const u32 platform_config_table_limits[PLATFORM_CONFIG_TABLE_MAX] = { + 0, + SYSTEM_TABLE_MAX, + PORT_TABLE_MAX, + RX_PRESET_TABLE_MAX, + TX_PRESET_TABLE_MAX, + QSFP_ATTEN_TABLE_MAX, + VARIABLE_SETTINGS_TABLE_MAX +}; + +/* This section defines default values and encodings for the + * fields defined for each table above + */ + +/*===================================================== + * System table encodings + *====================================================*/ +#define PLATFORM_CONFIG_MAGIC_NUM 0x3d4f5041 +#define PLATFORM_CONFIG_MAGIC_NUMBER_LEN 4 + +/* + * These power classes are the same as defined in SFF 8636 spec rev 2.4 + * describing byte 129 in table 6-16, except enumerated in a different order + */ +enum platform_config_qsfp_power_class_encoding { + QSFP_POWER_CLASS_1 = 1, + QSFP_POWER_CLASS_2, + QSFP_POWER_CLASS_3, + QSFP_POWER_CLASS_4, + QSFP_POWER_CLASS_5, + QSFP_POWER_CLASS_6, + QSFP_POWER_CLASS_7 +}; + + +/*===================================================== + * Port table encodings + *==================================================== */ +enum platform_config_port_type_encoding { + PORT_TYPE_RESERVED, + PORT_TYPE_DISCONNECTED, + PORT_TYPE_FIXED, + PORT_TYPE_VARIABLE, + PORT_TYPE_QSFP, + PORT_TYPE_MAX +}; + +enum platform_config_link_speed_supported_encoding { + LINK_SPEED_SUPP_12G = 1, + LINK_SPEED_SUPP_25G, + LINK_SPEED_SUPP_12G_25G, + LINK_SPEED_SUPP_MAX +}; + +/* + * This is a subset (not strict) of the link downgrades + * supported. The link downgrades supported are expected + * to be supplied to the driver by another entity such as + * the fabric manager + */ +enum platform_config_link_width_supported_encoding { + LINK_WIDTH_SUPP_1X = 1, + LINK_WIDTH_SUPP_2X, + LINK_WIDTH_SUPP_2X_1X, + LINK_WIDTH_SUPP_3X, + LINK_WIDTH_SUPP_3X_1X, + LINK_WIDTH_SUPP_3X_2X, + LINK_WIDTH_SUPP_3X_2X_1X, + LINK_WIDTH_SUPP_4X, + LINK_WIDTH_SUPP_4X_1X, + LINK_WIDTH_SUPP_4X_2X, + LINK_WIDTH_SUPP_4X_2X_1X, + LINK_WIDTH_SUPP_4X_3X, + LINK_WIDTH_SUPP_4X_3X_1X, + LINK_WIDTH_SUPP_4X_3X_2X, + LINK_WIDTH_SUPP_4X_3X_2X_1X, + LINK_WIDTH_SUPP_MAX +}; + +enum platform_config_virtual_lane_capability_encoding { + VL_CAP_VL0 = 1, + VL_CAP_VL0_1, + VL_CAP_VL0_2, + VL_CAP_VL0_3, + VL_CAP_VL0_4, + VL_CAP_VL0_5, + VL_CAP_VL0_6, + VL_CAP_VL0_7, + VL_CAP_VL0_8, + VL_CAP_VL0_9, + VL_CAP_VL0_10, + VL_CAP_VL0_11, + VL_CAP_VL0_12, + VL_CAP_VL0_13, + VL_CAP_VL0_14, + VL_CAP_MAX +}; + +/* Max MTU */ +enum platform_config_mtu_capability_encoding { + MTU_CAP_256 = 1, + MTU_CAP_512 = 2, + MTU_CAP_1024 = 3, + MTU_CAP_2048 = 4, + MTU_CAP_4096 = 5, + MTU_CAP_8192 = 6, + MTU_CAP_10240 = 7 +}; + +enum platform_config_local_max_timeout_encoding { + LOCAL_MAX_TIMEOUT_10_MS = 1, + LOCAL_MAX_TIMEOUT_100_MS, + LOCAL_MAX_TIMEOUT_1_S, + LOCAL_MAX_TIMEOUT_10_S, + LOCAL_MAX_TIMEOUT_100_S, + LOCAL_MAX_TIMEOUT_1000_S +}; + +#endif /*__PLATFORM_CONFIG_H*/ diff --git a/drivers/staging/rdma/hfi1/qp.c b/drivers/staging/rdma/hfi1/qp.c new file mode 100644 index 000000000..df1fa56ea --- /dev/null +++ b/drivers/staging/rdma/hfi1/qp.c @@ -0,0 +1,1687 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include + +#include "hfi.h" +#include "qp.h" +#include "trace.h" +#include "sdma.h" + +#define BITS_PER_PAGE (PAGE_SIZE*BITS_PER_BYTE) +#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) + +static unsigned int hfi1_qp_table_size = 256; +module_param_named(qp_table_size, hfi1_qp_table_size, uint, S_IRUGO); +MODULE_PARM_DESC(qp_table_size, "QP table size"); + +static void flush_tx_list(struct hfi1_qp *qp); +static int iowait_sleep( + struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *stx, + unsigned seq); +static void iowait_wakeup(struct iowait *wait, int reason); + +static inline unsigned mk_qpn(struct hfi1_qpn_table *qpt, + struct qpn_map *map, unsigned off) +{ + return (map - qpt->map) * BITS_PER_PAGE + off; +} + +/* + * Convert the AETH credit code into the number of credits. + */ +static const u16 credit_table[31] = { + 0, /* 0 */ + 1, /* 1 */ + 2, /* 2 */ + 3, /* 3 */ + 4, /* 4 */ + 6, /* 5 */ + 8, /* 6 */ + 12, /* 7 */ + 16, /* 8 */ + 24, /* 9 */ + 32, /* A */ + 48, /* B */ + 64, /* C */ + 96, /* D */ + 128, /* E */ + 192, /* F */ + 256, /* 10 */ + 384, /* 11 */ + 512, /* 12 */ + 768, /* 13 */ + 1024, /* 14 */ + 1536, /* 15 */ + 2048, /* 16 */ + 3072, /* 17 */ + 4096, /* 18 */ + 6144, /* 19 */ + 8192, /* 1A */ + 12288, /* 1B */ + 16384, /* 1C */ + 24576, /* 1D */ + 32768 /* 1E */ +}; + +static void get_map_page(struct hfi1_qpn_table *qpt, struct qpn_map *map) +{ + unsigned long page = get_zeroed_page(GFP_KERNEL); + + /* + * Free the page if someone raced with us installing it. + */ + + spin_lock(&qpt->lock); + if (map->page) + free_page(page); + else + map->page = (void *)page; + spin_unlock(&qpt->lock); +} + +/* + * Allocate the next available QPN or + * zero/one for QP type IB_QPT_SMI/IB_QPT_GSI. + */ +static int alloc_qpn(struct hfi1_devdata *dd, struct hfi1_qpn_table *qpt, + enum ib_qp_type type, u8 port) +{ + u32 i, offset, max_scan, qpn; + struct qpn_map *map; + u32 ret; + + if (type == IB_QPT_SMI || type == IB_QPT_GSI) { + unsigned n; + + ret = type == IB_QPT_GSI; + n = 1 << (ret + 2 * (port - 1)); + spin_lock(&qpt->lock); + if (qpt->flags & n) + ret = -EINVAL; + else + qpt->flags |= n; + spin_unlock(&qpt->lock); + goto bail; + } + + qpn = qpt->last + qpt->incr; + if (qpn >= QPN_MAX) + qpn = qpt->incr | ((qpt->last & 1) ^ 1); + /* offset carries bit 0 */ + offset = qpn & BITS_PER_PAGE_MASK; + map = &qpt->map[qpn / BITS_PER_PAGE]; + max_scan = qpt->nmaps - !offset; + for (i = 0;;) { + if (unlikely(!map->page)) { + get_map_page(qpt, map); + if (unlikely(!map->page)) + break; + } + do { + if (!test_and_set_bit(offset, map->page)) { + qpt->last = qpn; + ret = qpn; + goto bail; + } + offset += qpt->incr; + /* + * This qpn might be bogus if offset >= BITS_PER_PAGE. + * That is OK. It gets re-assigned below + */ + qpn = mk_qpn(qpt, map, offset); + } while (offset < BITS_PER_PAGE && qpn < QPN_MAX); + /* + * In order to keep the number of pages allocated to a + * minimum, we scan the all existing pages before increasing + * the size of the bitmap table. + */ + if (++i > max_scan) { + if (qpt->nmaps == QPNMAP_ENTRIES) + break; + map = &qpt->map[qpt->nmaps++]; + /* start at incr with current bit 0 */ + offset = qpt->incr | (offset & 1); + } else if (map < &qpt->map[qpt->nmaps]) { + ++map; + /* start at incr with current bit 0 */ + offset = qpt->incr | (offset & 1); + } else { + map = &qpt->map[0]; + /* wrap to first map page, invert bit 0 */ + offset = qpt->incr | ((offset & 1) ^ 1); + } + /* there can be no bits at shift and below */ + WARN_ON(offset & (dd->qos_shift - 1)); + qpn = mk_qpn(qpt, map, offset); + } + + ret = -ENOMEM; + +bail: + return ret; +} + +static void free_qpn(struct hfi1_qpn_table *qpt, u32 qpn) +{ + struct qpn_map *map; + + map = qpt->map + qpn / BITS_PER_PAGE; + if (map->page) + clear_bit(qpn & BITS_PER_PAGE_MASK, map->page); +} + +/* + * Put the QP into the hash table. + * The hash table holds a reference to the QP. + */ +static void insert_qp(struct hfi1_ibdev *dev, struct hfi1_qp *qp) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + unsigned long flags; + + atomic_inc(&qp->refcount); + spin_lock_irqsave(&dev->qp_dev->qpt_lock, flags); + + if (qp->ibqp.qp_num <= 1) { + rcu_assign_pointer(ibp->qp[qp->ibqp.qp_num], qp); + } else { + u32 n = qpn_hash(dev->qp_dev, qp->ibqp.qp_num); + + qp->next = dev->qp_dev->qp_table[n]; + rcu_assign_pointer(dev->qp_dev->qp_table[n], qp); + trace_hfi1_qpinsert(qp, n); + } + + spin_unlock_irqrestore(&dev->qp_dev->qpt_lock, flags); +} + +/* + * Remove the QP from the table so it can't be found asynchronously by + * the receive interrupt routine. + */ +static void remove_qp(struct hfi1_ibdev *dev, struct hfi1_qp *qp) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + u32 n = qpn_hash(dev->qp_dev, qp->ibqp.qp_num); + unsigned long flags; + int removed = 1; + + spin_lock_irqsave(&dev->qp_dev->qpt_lock, flags); + + if (rcu_dereference_protected(ibp->qp[0], + lockdep_is_held(&dev->qp_dev->qpt_lock)) == qp) { + RCU_INIT_POINTER(ibp->qp[0], NULL); + } else if (rcu_dereference_protected(ibp->qp[1], + lockdep_is_held(&dev->qp_dev->qpt_lock)) == qp) { + RCU_INIT_POINTER(ibp->qp[1], NULL); + } else { + struct hfi1_qp *q; + struct hfi1_qp __rcu **qpp; + + removed = 0; + qpp = &dev->qp_dev->qp_table[n]; + for (; (q = rcu_dereference_protected(*qpp, + lockdep_is_held(&dev->qp_dev->qpt_lock))) + != NULL; + qpp = &q->next) + if (q == qp) { + RCU_INIT_POINTER(*qpp, + rcu_dereference_protected(qp->next, + lockdep_is_held(&dev->qp_dev->qpt_lock))); + removed = 1; + trace_hfi1_qpremove(qp, n); + break; + } + } + + spin_unlock_irqrestore(&dev->qp_dev->qpt_lock, flags); + if (removed) { + synchronize_rcu(); + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } +} + +/** + * free_all_qps - check for QPs still in use + * @qpt: the QP table to empty + * + * There should not be any QPs still in use. + * Free memory for table. + */ +static unsigned free_all_qps(struct hfi1_devdata *dd) +{ + struct hfi1_ibdev *dev = &dd->verbs_dev; + unsigned long flags; + struct hfi1_qp *qp; + unsigned n, qp_inuse = 0; + + for (n = 0; n < dd->num_pports; n++) { + struct hfi1_ibport *ibp = &dd->pport[n].ibport_data; + + if (!hfi1_mcast_tree_empty(ibp)) + qp_inuse++; + rcu_read_lock(); + if (rcu_dereference(ibp->qp[0])) + qp_inuse++; + if (rcu_dereference(ibp->qp[1])) + qp_inuse++; + rcu_read_unlock(); + } + + if (!dev->qp_dev) + goto bail; + spin_lock_irqsave(&dev->qp_dev->qpt_lock, flags); + for (n = 0; n < dev->qp_dev->qp_table_size; n++) { + qp = rcu_dereference_protected(dev->qp_dev->qp_table[n], + lockdep_is_held(&dev->qp_dev->qpt_lock)); + RCU_INIT_POINTER(dev->qp_dev->qp_table[n], NULL); + + for (; qp; qp = rcu_dereference_protected(qp->next, + lockdep_is_held(&dev->qp_dev->qpt_lock))) + qp_inuse++; + } + spin_unlock_irqrestore(&dev->qp_dev->qpt_lock, flags); + synchronize_rcu(); +bail: + return qp_inuse; +} + +/** + * reset_qp - initialize the QP state to the reset state + * @qp: the QP to reset + * @type: the QP type + */ +static void reset_qp(struct hfi1_qp *qp, enum ib_qp_type type) +{ + qp->remote_qpn = 0; + qp->qkey = 0; + qp->qp_access_flags = 0; + iowait_init( + &qp->s_iowait, + 1, + hfi1_do_send, + iowait_sleep, + iowait_wakeup); + qp->s_flags &= HFI1_S_SIGNAL_REQ_WR; + qp->s_hdrwords = 0; + qp->s_wqe = NULL; + qp->s_draining = 0; + qp->s_next_psn = 0; + qp->s_last_psn = 0; + qp->s_sending_psn = 0; + qp->s_sending_hpsn = 0; + qp->s_psn = 0; + qp->r_psn = 0; + qp->r_msn = 0; + if (type == IB_QPT_RC) { + qp->s_state = IB_OPCODE_RC_SEND_LAST; + qp->r_state = IB_OPCODE_RC_SEND_LAST; + } else { + qp->s_state = IB_OPCODE_UC_SEND_LAST; + qp->r_state = IB_OPCODE_UC_SEND_LAST; + } + qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; + qp->r_nak_state = 0; + qp->r_aflags = 0; + qp->r_flags = 0; + qp->s_head = 0; + qp->s_tail = 0; + qp->s_cur = 0; + qp->s_acked = 0; + qp->s_last = 0; + qp->s_ssn = 1; + qp->s_lsn = 0; + clear_ahg(qp); + qp->s_mig_state = IB_MIG_MIGRATED; + memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue)); + qp->r_head_ack_queue = 0; + qp->s_tail_ack_queue = 0; + qp->s_num_rd_atomic = 0; + if (qp->r_rq.wq) { + qp->r_rq.wq->head = 0; + qp->r_rq.wq->tail = 0; + } + qp->r_sge.num_sge = 0; +} + +static void clear_mr_refs(struct hfi1_qp *qp, int clr_sends) +{ + unsigned n; + + if (test_and_clear_bit(HFI1_R_REWIND_SGE, &qp->r_aflags)) + hfi1_put_ss(&qp->s_rdma_read_sge); + + hfi1_put_ss(&qp->r_sge); + + if (clr_sends) { + while (qp->s_last != qp->s_head) { + struct hfi1_swqe *wqe = get_swqe_ptr(qp, qp->s_last); + unsigned i; + + for (i = 0; i < wqe->wr.num_sge; i++) { + struct hfi1_sge *sge = &wqe->sg_list[i]; + + hfi1_put_mr(sge->mr); + } + if (qp->ibqp.qp_type == IB_QPT_UD || + qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI) + atomic_dec(&to_iah(wqe->wr.wr.ud.ah)->refcount); + if (++qp->s_last >= qp->s_size) + qp->s_last = 0; + } + if (qp->s_rdma_mr) { + hfi1_put_mr(qp->s_rdma_mr); + qp->s_rdma_mr = NULL; + } + } + + if (qp->ibqp.qp_type != IB_QPT_RC) + return; + + for (n = 0; n < ARRAY_SIZE(qp->s_ack_queue); n++) { + struct hfi1_ack_entry *e = &qp->s_ack_queue[n]; + + if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST && + e->rdma_sge.mr) { + hfi1_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + } +} + +/** + * hfi1_error_qp - put a QP into the error state + * @qp: the QP to put into the error state + * @err: the receive completion error to signal if a RWQE is active + * + * Flushes both send and receive work queues. + * Returns true if last WQE event should be generated. + * The QP r_lock and s_lock should be held and interrupts disabled. + * If we are already in error state, just return. + */ +int hfi1_error_qp(struct hfi1_qp *qp, enum ib_wc_status err) +{ + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + struct ib_wc wc; + int ret = 0; + + if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET) + goto bail; + + qp->state = IB_QPS_ERR; + + if (qp->s_flags & (HFI1_S_TIMER | HFI1_S_WAIT_RNR)) { + qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_WAIT_RNR); + del_timer(&qp->s_timer); + } + + if (qp->s_flags & HFI1_S_ANY_WAIT_SEND) + qp->s_flags &= ~HFI1_S_ANY_WAIT_SEND; + + write_seqlock(&dev->iowait_lock); + if (!list_empty(&qp->s_iowait.list) && !(qp->s_flags & HFI1_S_BUSY)) { + qp->s_flags &= ~HFI1_S_ANY_WAIT_IO; + list_del_init(&qp->s_iowait.list); + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } + write_sequnlock(&dev->iowait_lock); + + if (!(qp->s_flags & HFI1_S_BUSY)) { + qp->s_hdrwords = 0; + if (qp->s_rdma_mr) { + hfi1_put_mr(qp->s_rdma_mr); + qp->s_rdma_mr = NULL; + } + flush_tx_list(qp); + } + + /* Schedule the sending tasklet to drain the send work queue. */ + if (qp->s_last != qp->s_head) + hfi1_schedule_send(qp); + + clear_mr_refs(qp, 0); + + memset(&wc, 0, sizeof(wc)); + wc.qp = &qp->ibqp; + wc.opcode = IB_WC_RECV; + + if (test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags)) { + wc.wr_id = qp->r_wr_id; + wc.status = err; + hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); + } + wc.status = IB_WC_WR_FLUSH_ERR; + + if (qp->r_rq.wq) { + struct hfi1_rwq *wq; + u32 head; + u32 tail; + + spin_lock(&qp->r_rq.lock); + + /* sanity check pointers before trusting them */ + wq = qp->r_rq.wq; + head = wq->head; + if (head >= qp->r_rq.size) + head = 0; + tail = wq->tail; + if (tail >= qp->r_rq.size) + tail = 0; + while (tail != head) { + wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id; + if (++tail >= qp->r_rq.size) + tail = 0; + hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); + } + wq->tail = tail; + + spin_unlock(&qp->r_rq.lock); + } else if (qp->ibqp.event_handler) + ret = 1; + +bail: + return ret; +} + +static void flush_tx_list(struct hfi1_qp *qp) +{ + while (!list_empty(&qp->s_iowait.tx_head)) { + struct sdma_txreq *tx; + + tx = list_first_entry( + &qp->s_iowait.tx_head, + struct sdma_txreq, + list); + list_del_init(&tx->list); + hfi1_put_txreq( + container_of(tx, struct verbs_txreq, txreq)); + } +} + +static void flush_iowait(struct hfi1_qp *qp) +{ + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + unsigned long flags; + + write_seqlock_irqsave(&dev->iowait_lock, flags); + if (!list_empty(&qp->s_iowait.list)) { + list_del_init(&qp->s_iowait.list); + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } + write_sequnlock_irqrestore(&dev->iowait_lock, flags); +} + +static inline int opa_mtu_enum_to_int(int mtu) +{ + switch (mtu) { + case OPA_MTU_8192: return 8192; + case OPA_MTU_10240: return 10240; + default: return -1; + } +} + +/** + * This function is what we would push to the core layer if we wanted to be a + * "first class citizen". Instead we hide this here and rely on Verbs ULPs + * to blindly pass the MTU enum value from the PathRecord to us. + * + * The actual flag used to determine "8k MTU" will change and is currently + * unknown. + */ +static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu) +{ + int val = opa_mtu_enum_to_int((int)mtu); + + if (val > 0) + return val; + return ib_mtu_enum_to_int(mtu); +} + + +/** + * hfi1_modify_qp - modify the attributes of a queue pair + * @ibqp: the queue pair who's attributes we're modifying + * @attr: the new attributes + * @attr_mask: the mask of attributes to modify + * @udata: user data for libibverbs.so + * + * Returns 0 on success, otherwise returns an errno. + */ +int hfi1_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct hfi1_ibdev *dev = to_idev(ibqp->device); + struct hfi1_qp *qp = to_iqp(ibqp); + enum ib_qp_state cur_state, new_state; + struct ib_event ev; + int lastwqe = 0; + int mig = 0; + int ret; + u32 pmtu = 0; /* for gcc warning only */ + struct hfi1_devdata *dd; + + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_lock); + + cur_state = attr_mask & IB_QP_CUR_STATE ? + attr->cur_qp_state : qp->state; + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask, IB_LINK_LAYER_UNSPECIFIED)) + goto inval; + + if (attr_mask & IB_QP_AV) { + if (attr->ah_attr.dlid >= HFI1_MULTICAST_LID_BASE) + goto inval; + if (hfi1_check_ah(qp->ibqp.device, &attr->ah_attr)) + goto inval; + } + + if (attr_mask & IB_QP_ALT_PATH) { + if (attr->alt_ah_attr.dlid >= HFI1_MULTICAST_LID_BASE) + goto inval; + if (hfi1_check_ah(qp->ibqp.device, &attr->alt_ah_attr)) + goto inval; + if (attr->alt_pkey_index >= hfi1_get_npkeys(dd_from_dev(dev))) + goto inval; + } + + if (attr_mask & IB_QP_PKEY_INDEX) + if (attr->pkey_index >= hfi1_get_npkeys(dd_from_dev(dev))) + goto inval; + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + if (attr->min_rnr_timer > 31) + goto inval; + + if (attr_mask & IB_QP_PORT) + if (qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI || + attr->port_num == 0 || + attr->port_num > ibqp->device->phys_port_cnt) + goto inval; + + if (attr_mask & IB_QP_DEST_QPN) + if (attr->dest_qp_num > HFI1_QPN_MASK) + goto inval; + + if (attr_mask & IB_QP_RETRY_CNT) + if (attr->retry_cnt > 7) + goto inval; + + if (attr_mask & IB_QP_RNR_RETRY) + if (attr->rnr_retry > 7) + goto inval; + + /* + * Don't allow invalid path_mtu values. OK to set greater + * than the active mtu (or even the max_cap, if we have tuned + * that to a small mtu. We'll set qp->path_mtu + * to the lesser of requested attribute mtu and active, + * for packetizing messages. + * Note that the QP port has to be set in INIT and MTU in RTR. + */ + if (attr_mask & IB_QP_PATH_MTU) { + int mtu, pidx = qp->port_num - 1; + + dd = dd_from_dev(dev); + mtu = verbs_mtu_enum_to_int(ibqp->device, attr->path_mtu); + if (mtu == -1) + goto inval; + + if (mtu > dd->pport[pidx].ibmtu) + pmtu = mtu_to_enum(dd->pport[pidx].ibmtu, IB_MTU_2048); + else + pmtu = attr->path_mtu; + } + + if (attr_mask & IB_QP_PATH_MIG_STATE) { + if (attr->path_mig_state == IB_MIG_REARM) { + if (qp->s_mig_state == IB_MIG_ARMED) + goto inval; + if (new_state != IB_QPS_RTS) + goto inval; + } else if (attr->path_mig_state == IB_MIG_MIGRATED) { + if (qp->s_mig_state == IB_MIG_REARM) + goto inval; + if (new_state != IB_QPS_RTS && new_state != IB_QPS_SQD) + goto inval; + if (qp->s_mig_state == IB_MIG_ARMED) + mig = 1; + } else + goto inval; + } + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + if (attr->max_dest_rd_atomic > HFI1_MAX_RDMA_ATOMIC) + goto inval; + + switch (new_state) { + case IB_QPS_RESET: + if (qp->state != IB_QPS_RESET) { + qp->state = IB_QPS_RESET; + flush_iowait(qp); + qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_ANY_WAIT); + spin_unlock(&qp->s_lock); + spin_unlock_irq(&qp->r_lock); + /* Stop the sending work queue and retry timer */ + cancel_work_sync(&qp->s_iowait.iowork); + del_timer_sync(&qp->s_timer); + iowait_sdma_drain(&qp->s_iowait); + flush_tx_list(qp); + remove_qp(dev, qp); + wait_event(qp->wait, !atomic_read(&qp->refcount)); + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_lock); + clear_mr_refs(qp, 1); + clear_ahg(qp); + reset_qp(qp, ibqp->qp_type); + } + break; + + case IB_QPS_RTR: + /* Allow event to re-trigger if QP set to RTR more than once */ + qp->r_flags &= ~HFI1_R_COMM_EST; + qp->state = new_state; + break; + + case IB_QPS_SQD: + qp->s_draining = qp->s_last != qp->s_cur; + qp->state = new_state; + break; + + case IB_QPS_SQE: + if (qp->ibqp.qp_type == IB_QPT_RC) + goto inval; + qp->state = new_state; + break; + + case IB_QPS_ERR: + lastwqe = hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR); + break; + + default: + qp->state = new_state; + break; + } + + if (attr_mask & IB_QP_PKEY_INDEX) + qp->s_pkey_index = attr->pkey_index; + + if (attr_mask & IB_QP_PORT) + qp->port_num = attr->port_num; + + if (attr_mask & IB_QP_DEST_QPN) + qp->remote_qpn = attr->dest_qp_num; + + if (attr_mask & IB_QP_SQ_PSN) { + qp->s_next_psn = attr->sq_psn & PSN_MODIFY_MASK; + qp->s_psn = qp->s_next_psn; + qp->s_sending_psn = qp->s_next_psn; + qp->s_last_psn = qp->s_next_psn - 1; + qp->s_sending_hpsn = qp->s_last_psn; + } + + if (attr_mask & IB_QP_RQ_PSN) + qp->r_psn = attr->rq_psn & PSN_MODIFY_MASK; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->qp_access_flags = attr->qp_access_flags; + + if (attr_mask & IB_QP_AV) { + qp->remote_ah_attr = attr->ah_attr; + qp->s_srate = attr->ah_attr.static_rate; + qp->srate_mbps = ib_rate_to_mbps(qp->s_srate); + } + + if (attr_mask & IB_QP_ALT_PATH) { + qp->alt_ah_attr = attr->alt_ah_attr; + qp->s_alt_pkey_index = attr->alt_pkey_index; + } + + if (attr_mask & IB_QP_PATH_MIG_STATE) { + qp->s_mig_state = attr->path_mig_state; + if (mig) { + qp->remote_ah_attr = qp->alt_ah_attr; + qp->port_num = qp->alt_ah_attr.port_num; + qp->s_pkey_index = qp->s_alt_pkey_index; + qp->s_flags |= HFI1_S_AHG_CLEAR; + } + } + + if (attr_mask & IB_QP_PATH_MTU) { + struct hfi1_ibport *ibp; + u8 sc, vl; + u32 mtu; + + dd = dd_from_dev(dev); + ibp = &dd->pport[qp->port_num - 1].ibport_data; + + sc = ibp->sl_to_sc[qp->remote_ah_attr.sl]; + vl = sc_to_vlt(dd, sc); + + mtu = verbs_mtu_enum_to_int(ibqp->device, pmtu); + if (vl < PER_VL_SEND_CONTEXTS) + mtu = min_t(u32, mtu, dd->vld[vl].mtu); + pmtu = mtu_to_enum(mtu, OPA_MTU_8192); + + qp->path_mtu = pmtu; + qp->pmtu = mtu; + } + + if (attr_mask & IB_QP_RETRY_CNT) { + qp->s_retry_cnt = attr->retry_cnt; + qp->s_retry = attr->retry_cnt; + } + + if (attr_mask & IB_QP_RNR_RETRY) { + qp->s_rnr_retry_cnt = attr->rnr_retry; + qp->s_rnr_retry = attr->rnr_retry; + } + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + qp->r_min_rnr_timer = attr->min_rnr_timer; + + if (attr_mask & IB_QP_TIMEOUT) { + qp->timeout = attr->timeout; + qp->timeout_jiffies = + usecs_to_jiffies((4096UL * (1UL << qp->timeout)) / + 1000UL); + } + + if (attr_mask & IB_QP_QKEY) + qp->qkey = attr->qkey; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->r_max_rd_atomic = attr->max_dest_rd_atomic; + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) + qp->s_max_rd_atomic = attr->max_rd_atomic; + + spin_unlock(&qp->s_lock); + spin_unlock_irq(&qp->r_lock); + + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + insert_qp(dev, qp); + + if (lastwqe) { + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + if (mig) { + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_PATH_MIG; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + ret = 0; + goto bail; + +inval: + spin_unlock(&qp->s_lock); + spin_unlock_irq(&qp->r_lock); + ret = -EINVAL; + +bail: + return ret; +} + +int hfi1_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr) +{ + struct hfi1_qp *qp = to_iqp(ibqp); + + attr->qp_state = qp->state; + attr->cur_qp_state = attr->qp_state; + attr->path_mtu = qp->path_mtu; + attr->path_mig_state = qp->s_mig_state; + attr->qkey = qp->qkey; + attr->rq_psn = mask_psn(qp->r_psn); + attr->sq_psn = mask_psn(qp->s_next_psn); + attr->dest_qp_num = qp->remote_qpn; + attr->qp_access_flags = qp->qp_access_flags; + attr->cap.max_send_wr = qp->s_size - 1; + attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1; + attr->cap.max_send_sge = qp->s_max_sge; + attr->cap.max_recv_sge = qp->r_rq.max_sge; + attr->cap.max_inline_data = 0; + attr->ah_attr = qp->remote_ah_attr; + attr->alt_ah_attr = qp->alt_ah_attr; + attr->pkey_index = qp->s_pkey_index; + attr->alt_pkey_index = qp->s_alt_pkey_index; + attr->en_sqd_async_notify = 0; + attr->sq_draining = qp->s_draining; + attr->max_rd_atomic = qp->s_max_rd_atomic; + attr->max_dest_rd_atomic = qp->r_max_rd_atomic; + attr->min_rnr_timer = qp->r_min_rnr_timer; + attr->port_num = qp->port_num; + attr->timeout = qp->timeout; + attr->retry_cnt = qp->s_retry_cnt; + attr->rnr_retry = qp->s_rnr_retry_cnt; + attr->alt_port_num = qp->alt_ah_attr.port_num; + attr->alt_timeout = qp->alt_timeout; + + init_attr->event_handler = qp->ibqp.event_handler; + init_attr->qp_context = qp->ibqp.qp_context; + init_attr->send_cq = qp->ibqp.send_cq; + init_attr->recv_cq = qp->ibqp.recv_cq; + init_attr->srq = qp->ibqp.srq; + init_attr->cap = attr->cap; + if (qp->s_flags & HFI1_S_SIGNAL_REQ_WR) + init_attr->sq_sig_type = IB_SIGNAL_REQ_WR; + else + init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr->qp_type = qp->ibqp.qp_type; + init_attr->port_num = qp->port_num; + return 0; +} + +/** + * hfi1_compute_aeth - compute the AETH (syndrome + MSN) + * @qp: the queue pair to compute the AETH for + * + * Returns the AETH. + */ +__be32 hfi1_compute_aeth(struct hfi1_qp *qp) +{ + u32 aeth = qp->r_msn & HFI1_MSN_MASK; + + if (qp->ibqp.srq) { + /* + * Shared receive queues don't generate credits. + * Set the credit field to the invalid value. + */ + aeth |= HFI1_AETH_CREDIT_INVAL << HFI1_AETH_CREDIT_SHIFT; + } else { + u32 min, max, x; + u32 credits; + struct hfi1_rwq *wq = qp->r_rq.wq; + u32 head; + u32 tail; + + /* sanity check pointers before trusting them */ + head = wq->head; + if (head >= qp->r_rq.size) + head = 0; + tail = wq->tail; + if (tail >= qp->r_rq.size) + tail = 0; + /* + * Compute the number of credits available (RWQEs). + * There is a small chance that the pair of reads are + * not atomic, which is OK, since the fuzziness is + * resolved as further ACKs go out. + */ + credits = head - tail; + if ((int)credits < 0) + credits += qp->r_rq.size; + /* + * Binary search the credit table to find the code to + * use. + */ + min = 0; + max = 31; + for (;;) { + x = (min + max) / 2; + if (credit_table[x] == credits) + break; + if (credit_table[x] > credits) + max = x; + else if (min == x) + break; + else + min = x; + } + aeth |= x << HFI1_AETH_CREDIT_SHIFT; + } + return cpu_to_be32(aeth); +} + +/** + * hfi1_create_qp - create a queue pair for a device + * @ibpd: the protection domain who's device we create the queue pair for + * @init_attr: the attributes of the queue pair + * @udata: user data for libibverbs.so + * + * Returns the queue pair on success, otherwise returns an errno. + * + * Called by the ib_create_qp() core verbs function. + */ +struct ib_qp *hfi1_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct hfi1_qp *qp; + int err; + struct hfi1_swqe *swq = NULL; + struct hfi1_ibdev *dev; + struct hfi1_devdata *dd; + size_t sz; + size_t sg_list_sz; + struct ib_qp *ret; + + if (init_attr->cap.max_send_sge > hfi1_max_sges || + init_attr->cap.max_send_wr > hfi1_max_qp_wrs || + init_attr->create_flags) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + /* Check receive queue parameters if no SRQ is specified. */ + if (!init_attr->srq) { + if (init_attr->cap.max_recv_sge > hfi1_max_sges || + init_attr->cap.max_recv_wr > hfi1_max_qp_wrs) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + if (init_attr->cap.max_send_sge + + init_attr->cap.max_send_wr + + init_attr->cap.max_recv_sge + + init_attr->cap.max_recv_wr == 0) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + } + + switch (init_attr->qp_type) { + case IB_QPT_SMI: + case IB_QPT_GSI: + if (init_attr->port_num == 0 || + init_attr->port_num > ibpd->device->phys_port_cnt) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + case IB_QPT_UC: + case IB_QPT_RC: + case IB_QPT_UD: + sz = sizeof(struct hfi1_sge) * + init_attr->cap.max_send_sge + + sizeof(struct hfi1_swqe); + swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz); + if (swq == NULL) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + sz = sizeof(*qp); + sg_list_sz = 0; + if (init_attr->srq) { + struct hfi1_srq *srq = to_isrq(init_attr->srq); + + if (srq->rq.max_sge > 1) + sg_list_sz = sizeof(*qp->r_sg_list) * + (srq->rq.max_sge - 1); + } else if (init_attr->cap.max_recv_sge > 1) + sg_list_sz = sizeof(*qp->r_sg_list) * + (init_attr->cap.max_recv_sge - 1); + qp = kzalloc(sz + sg_list_sz, GFP_KERNEL); + if (!qp) { + ret = ERR_PTR(-ENOMEM); + goto bail_swq; + } + RCU_INIT_POINTER(qp->next, NULL); + qp->s_hdr = kzalloc(sizeof(*qp->s_hdr), GFP_KERNEL); + if (!qp->s_hdr) { + ret = ERR_PTR(-ENOMEM); + goto bail_qp; + } + qp->timeout_jiffies = + usecs_to_jiffies((4096UL * (1UL << qp->timeout)) / + 1000UL); + if (init_attr->srq) + sz = 0; + else { + qp->r_rq.size = init_attr->cap.max_recv_wr + 1; + qp->r_rq.max_sge = init_attr->cap.max_recv_sge; + sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) + + sizeof(struct hfi1_rwqe); + qp->r_rq.wq = vmalloc_user(sizeof(struct hfi1_rwq) + + qp->r_rq.size * sz); + if (!qp->r_rq.wq) { + ret = ERR_PTR(-ENOMEM); + goto bail_qp; + } + } + + /* + * ib_create_qp() will initialize qp->ibqp + * except for qp->ibqp.qp_num. + */ + spin_lock_init(&qp->r_lock); + spin_lock_init(&qp->s_lock); + spin_lock_init(&qp->r_rq.lock); + atomic_set(&qp->refcount, 0); + init_waitqueue_head(&qp->wait); + init_timer(&qp->s_timer); + qp->s_timer.data = (unsigned long)qp; + INIT_LIST_HEAD(&qp->rspwait); + qp->state = IB_QPS_RESET; + qp->s_wq = swq; + qp->s_size = init_attr->cap.max_send_wr + 1; + qp->s_max_sge = init_attr->cap.max_send_sge; + if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR) + qp->s_flags = HFI1_S_SIGNAL_REQ_WR; + dev = to_idev(ibpd->device); + dd = dd_from_dev(dev); + err = alloc_qpn(dd, &dev->qp_dev->qpn_table, init_attr->qp_type, + init_attr->port_num); + if (err < 0) { + ret = ERR_PTR(err); + vfree(qp->r_rq.wq); + goto bail_qp; + } + qp->ibqp.qp_num = err; + qp->port_num = init_attr->port_num; + reset_qp(qp, init_attr->qp_type); + + break; + + default: + /* Don't support raw QPs */ + ret = ERR_PTR(-ENOSYS); + goto bail; + } + + init_attr->cap.max_inline_data = 0; + + /* + * Return the address of the RWQ as the offset to mmap. + * See hfi1_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + if (!qp->r_rq.wq) { + __u64 offset = 0; + + err = ib_copy_to_udata(udata, &offset, + sizeof(offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } else { + u32 s = sizeof(struct hfi1_rwq) + qp->r_rq.size * sz; + + qp->ip = hfi1_create_mmap_info(dev, s, + ibpd->uobject->context, + qp->r_rq.wq); + if (!qp->ip) { + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + err = ib_copy_to_udata(udata, &(qp->ip->offset), + sizeof(qp->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } + } + + spin_lock(&dev->n_qps_lock); + if (dev->n_qps_allocated == hfi1_max_qps) { + spin_unlock(&dev->n_qps_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + dev->n_qps_allocated++; + spin_unlock(&dev->n_qps_lock); + + if (qp->ip) { + spin_lock_irq(&dev->pending_lock); + list_add(&qp->ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + ret = &qp->ibqp; + + /* + * We have our QP and its good, now keep track of what types of opcodes + * can be processed on this QP. We do this by keeping track of what the + * 3 high order bits of the opcode are. + */ + switch (init_attr->qp_type) { + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + qp->allowed_ops = IB_OPCODE_UD_SEND_ONLY & OPCODE_QP_MASK; + break; + case IB_QPT_RC: + qp->allowed_ops = IB_OPCODE_RC_SEND_ONLY & OPCODE_QP_MASK; + break; + case IB_QPT_UC: + qp->allowed_ops = IB_OPCODE_UC_SEND_ONLY & OPCODE_QP_MASK; + break; + default: + ret = ERR_PTR(-EINVAL); + goto bail_ip; + } + + goto bail; + +bail_ip: + if (qp->ip) + kref_put(&qp->ip->ref, hfi1_release_mmap_info); + else + vfree(qp->r_rq.wq); + free_qpn(&dev->qp_dev->qpn_table, qp->ibqp.qp_num); +bail_qp: + kfree(qp->s_hdr); + kfree(qp); +bail_swq: + vfree(swq); +bail: + return ret; +} + +/** + * hfi1_destroy_qp - destroy a queue pair + * @ibqp: the queue pair to destroy + * + * Returns 0 on success. + * + * Note that this can be called while the QP is actively sending or + * receiving! + */ +int hfi1_destroy_qp(struct ib_qp *ibqp) +{ + struct hfi1_qp *qp = to_iqp(ibqp); + struct hfi1_ibdev *dev = to_idev(ibqp->device); + + /* Make sure HW and driver activity is stopped. */ + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_lock); + if (qp->state != IB_QPS_RESET) { + qp->state = IB_QPS_RESET; + flush_iowait(qp); + qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_ANY_WAIT); + spin_unlock(&qp->s_lock); + spin_unlock_irq(&qp->r_lock); + cancel_work_sync(&qp->s_iowait.iowork); + del_timer_sync(&qp->s_timer); + iowait_sdma_drain(&qp->s_iowait); + flush_tx_list(qp); + remove_qp(dev, qp); + wait_event(qp->wait, !atomic_read(&qp->refcount)); + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_lock); + clear_mr_refs(qp, 1); + clear_ahg(qp); + } + spin_unlock(&qp->s_lock); + spin_unlock_irq(&qp->r_lock); + + /* all user's cleaned up, mark it available */ + free_qpn(&dev->qp_dev->qpn_table, qp->ibqp.qp_num); + spin_lock(&dev->n_qps_lock); + dev->n_qps_allocated--; + spin_unlock(&dev->n_qps_lock); + + if (qp->ip) + kref_put(&qp->ip->ref, hfi1_release_mmap_info); + else + vfree(qp->r_rq.wq); + vfree(qp->s_wq); + kfree(qp->s_hdr); + kfree(qp); + return 0; +} + +/** + * init_qpn_table - initialize the QP number table for a device + * @qpt: the QPN table + */ +static int init_qpn_table(struct hfi1_devdata *dd, struct hfi1_qpn_table *qpt) +{ + u32 offset, qpn, i; + struct qpn_map *map; + int ret = 0; + + spin_lock_init(&qpt->lock); + + qpt->last = 0; + qpt->incr = 1 << dd->qos_shift; + + /* insure we don't assign QPs from KDETH 64K window */ + qpn = kdeth_qp << 16; + qpt->nmaps = qpn / BITS_PER_PAGE; + /* This should always be zero */ + offset = qpn & BITS_PER_PAGE_MASK; + map = &qpt->map[qpt->nmaps]; + dd_dev_info(dd, "Reserving QPNs for KDETH window from 0x%x to 0x%x\n", + qpn, qpn + 65535); + for (i = 0; i < 65536; i++) { + if (!map->page) { + get_map_page(qpt, map); + if (!map->page) { + ret = -ENOMEM; + break; + } + } + set_bit(offset, map->page); + offset++; + if (offset == BITS_PER_PAGE) { + /* next page */ + qpt->nmaps++; + map++; + offset = 0; + } + } + return ret; +} + +/** + * free_qpn_table - free the QP number table for a device + * @qpt: the QPN table + */ +static void free_qpn_table(struct hfi1_qpn_table *qpt) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(qpt->map); i++) + free_page((unsigned long) qpt->map[i].page); +} + +/** + * hfi1_get_credit - flush the send work queue of a QP + * @qp: the qp who's send work queue to flush + * @aeth: the Acknowledge Extended Transport Header + * + * The QP s_lock should be held. + */ +void hfi1_get_credit(struct hfi1_qp *qp, u32 aeth) +{ + u32 credit = (aeth >> HFI1_AETH_CREDIT_SHIFT) & HFI1_AETH_CREDIT_MASK; + + /* + * If the credit is invalid, we can send + * as many packets as we like. Otherwise, we have to + * honor the credit field. + */ + if (credit == HFI1_AETH_CREDIT_INVAL) { + if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT)) { + qp->s_flags |= HFI1_S_UNLIMITED_CREDIT; + if (qp->s_flags & HFI1_S_WAIT_SSN_CREDIT) { + qp->s_flags &= ~HFI1_S_WAIT_SSN_CREDIT; + hfi1_schedule_send(qp); + } + } + } else if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT)) { + /* Compute new LSN (i.e., MSN + credit) */ + credit = (aeth + credit_table[credit]) & HFI1_MSN_MASK; + if (cmp_msn(credit, qp->s_lsn) > 0) { + qp->s_lsn = credit; + if (qp->s_flags & HFI1_S_WAIT_SSN_CREDIT) { + qp->s_flags &= ~HFI1_S_WAIT_SSN_CREDIT; + hfi1_schedule_send(qp); + } + } + } +} + +void hfi1_qp_wakeup(struct hfi1_qp *qp, u32 flag) +{ + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_flags & flag) { + qp->s_flags &= ~flag; + trace_hfi1_qpwakeup(qp, flag); + hfi1_schedule_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, flags); + /* Notify hfi1_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +} + +static int iowait_sleep( + struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *stx, + unsigned seq) +{ + struct verbs_txreq *tx = container_of(stx, struct verbs_txreq, txreq); + struct hfi1_qp *qp; + unsigned long flags; + int ret = 0; + struct hfi1_ibdev *dev; + + qp = tx->qp; + + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) { + + /* + * If we couldn't queue the DMA request, save the info + * and try again later rather than destroying the + * buffer and undoing the side effects of the copy. + */ + /* Make a common routine? */ + dev = &sde->dd->verbs_dev; + list_add_tail(&stx->list, &wait->tx_head); + write_seqlock(&dev->iowait_lock); + if (sdma_progress(sde, seq, stx)) + goto eagain; + if (list_empty(&qp->s_iowait.list)) { + struct hfi1_ibport *ibp = + to_iport(qp->ibqp.device, qp->port_num); + + ibp->n_dmawait++; + qp->s_flags |= HFI1_S_WAIT_DMA_DESC; + list_add_tail(&qp->s_iowait.list, &sde->dmawait); + trace_hfi1_qpsleep(qp, HFI1_S_WAIT_DMA_DESC); + atomic_inc(&qp->refcount); + } + write_sequnlock(&dev->iowait_lock); + qp->s_flags &= ~HFI1_S_BUSY; + spin_unlock_irqrestore(&qp->s_lock, flags); + ret = -EBUSY; + } else { + spin_unlock_irqrestore(&qp->s_lock, flags); + hfi1_put_txreq(tx); + } + return ret; +eagain: + write_sequnlock(&dev->iowait_lock); + spin_unlock_irqrestore(&qp->s_lock, flags); + list_del_init(&stx->list); + return -EAGAIN; +} + +static void iowait_wakeup(struct iowait *wait, int reason) +{ + struct hfi1_qp *qp = container_of(wait, struct hfi1_qp, s_iowait); + + WARN_ON(reason != SDMA_AVAIL_REASON); + hfi1_qp_wakeup(qp, HFI1_S_WAIT_DMA_DESC); +} + +int hfi1_qp_init(struct hfi1_ibdev *dev) +{ + struct hfi1_devdata *dd = dd_from_dev(dev); + int i; + int ret = -ENOMEM; + + /* allocate parent object */ + dev->qp_dev = kzalloc(sizeof(*dev->qp_dev), GFP_KERNEL); + if (!dev->qp_dev) + goto nomem; + /* allocate hash table */ + dev->qp_dev->qp_table_size = hfi1_qp_table_size; + dev->qp_dev->qp_table_bits = ilog2(hfi1_qp_table_size); + dev->qp_dev->qp_table = + kmalloc(dev->qp_dev->qp_table_size * + sizeof(*dev->qp_dev->qp_table), + GFP_KERNEL); + if (!dev->qp_dev->qp_table) + goto nomem; + for (i = 0; i < dev->qp_dev->qp_table_size; i++) + RCU_INIT_POINTER(dev->qp_dev->qp_table[i], NULL); + spin_lock_init(&dev->qp_dev->qpt_lock); + /* initialize qpn map */ + ret = init_qpn_table(dd, &dev->qp_dev->qpn_table); + if (ret) + goto nomem; + return ret; +nomem: + if (dev->qp_dev) { + kfree(dev->qp_dev->qp_table); + free_qpn_table(&dev->qp_dev->qpn_table); + kfree(dev->qp_dev); + } + return ret; +} + +void hfi1_qp_exit(struct hfi1_ibdev *dev) +{ + struct hfi1_devdata *dd = dd_from_dev(dev); + u32 qps_inuse; + + qps_inuse = free_all_qps(dd); + if (qps_inuse) + dd_dev_err(dd, "QP memory leak! %u still in use\n", + qps_inuse); + if (dev->qp_dev) { + kfree(dev->qp_dev->qp_table); + free_qpn_table(&dev->qp_dev->qpn_table); + kfree(dev->qp_dev); + } +} + +/** + * + * qp_to_sdma_engine - map a qp to a send engine + * @qp: the QP + * @sc5: the 5 bit sc + * + * Return: + * A send engine for the qp or NULL for SMI type qp. + */ +struct sdma_engine *qp_to_sdma_engine(struct hfi1_qp *qp, u8 sc5) +{ + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct sdma_engine *sde; + + if (!(dd->flags & HFI1_HAS_SEND_DMA)) + return NULL; + switch (qp->ibqp.qp_type) { + case IB_QPT_UC: + case IB_QPT_RC: + break; + case IB_QPT_SMI: + return NULL; + default: + break; + } + sde = sdma_select_engine_sc(dd, qp->ibqp.qp_num >> dd->qos_shift, sc5); + return sde; +} + +struct qp_iter { + struct hfi1_ibdev *dev; + struct hfi1_qp *qp; + int specials; + int n; +}; + +struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev) +{ + struct qp_iter *iter; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return NULL; + + iter->dev = dev; + iter->specials = dev->ibdev.phys_port_cnt * 2; + if (qp_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +int qp_iter_next(struct qp_iter *iter) +{ + struct hfi1_ibdev *dev = iter->dev; + int n = iter->n; + int ret = 1; + struct hfi1_qp *pqp = iter->qp; + struct hfi1_qp *qp; + + /* + * The approach is to consider the special qps + * as an additional table entries before the + * real hash table. Since the qp code sets + * the qp->next hash link to NULL, this works just fine. + * + * iter->specials is 2 * # ports + * + * n = 0..iter->specials is the special qp indices + * + * n = iter->specials..dev->qp_dev->qp_table_size+iter->specials are + * the potential hash bucket entries + * + */ + for (; n < dev->qp_dev->qp_table_size + iter->specials; n++) { + if (pqp) { + qp = rcu_dereference(pqp->next); + } else { + if (n < iter->specials) { + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + int pidx; + + pidx = n % dev->ibdev.phys_port_cnt; + ppd = &dd_from_dev(dev)->pport[pidx]; + ibp = &ppd->ibport_data; + + if (!(n & 1)) + qp = rcu_dereference(ibp->qp[0]); + else + qp = rcu_dereference(ibp->qp[1]); + } else { + qp = rcu_dereference( + dev->qp_dev->qp_table[ + (n - iter->specials)]); + } + } + pqp = qp; + if (qp) { + iter->qp = qp; + iter->n = n; + return 0; + } + } + return ret; +} + +static const char * const qp_type_str[] = { + "SMI", "GSI", "RC", "UC", "UD", +}; + +static int qp_idle(struct hfi1_qp *qp) +{ + return + qp->s_last == qp->s_acked && + qp->s_acked == qp->s_cur && + qp->s_cur == qp->s_tail && + qp->s_tail == qp->s_head; +} + +void qp_iter_print(struct seq_file *s, struct qp_iter *iter) +{ + struct hfi1_swqe *wqe; + struct hfi1_qp *qp = iter->qp; + struct sdma_engine *sde; + + sde = qp_to_sdma_engine(qp, qp->s_sc); + wqe = get_swqe_ptr(qp, qp->s_last); + seq_printf(s, + "N %d %s QP%u R %u %s %u %u %u f=%x %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u) QP%u LID %x SL %u MTU %d %u %u %u SDE %p,%u\n", + iter->n, + qp_idle(qp) ? "I" : "B", + qp->ibqp.qp_num, + atomic_read(&qp->refcount), + qp_type_str[qp->ibqp.qp_type], + qp->state, + wqe ? wqe->wr.opcode : 0, + qp->s_hdrwords, + qp->s_flags, + atomic_read(&qp->s_iowait.sdma_busy), + !list_empty(&qp->s_iowait.list), + qp->timeout, + wqe ? wqe->ssn : 0, + qp->s_lsn, + qp->s_last_psn, + qp->s_psn, qp->s_next_psn, + qp->s_sending_psn, qp->s_sending_hpsn, + qp->s_last, qp->s_acked, qp->s_cur, + qp->s_tail, qp->s_head, qp->s_size, + qp->remote_qpn, + qp->remote_ah_attr.dlid, + qp->remote_ah_attr.sl, + qp->pmtu, + qp->s_retry_cnt, + qp->timeout, + qp->s_rnr_retry_cnt, + sde, + sde ? sde->this_idx : 0); +} + +void qp_comm_est(struct hfi1_qp *qp) +{ + qp->r_flags |= HFI1_R_COMM_EST; + if (qp->ibqp.event_handler) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_COMM_EST; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } +} diff --git a/drivers/staging/rdma/hfi1/qp.h b/drivers/staging/rdma/hfi1/qp.h new file mode 100644 index 000000000..6b505859b --- /dev/null +++ b/drivers/staging/rdma/hfi1/qp.h @@ -0,0 +1,235 @@ +#ifndef _QP_H +#define _QP_H +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include "verbs.h" + +#define QPN_MAX (1 << 24) +#define QPNMAP_ENTRIES (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE) + +/* + * QPN-map pages start out as NULL, they get allocated upon + * first use and are never deallocated. This way, + * large bitmaps are not allocated unless large numbers of QPs are used. + */ +struct qpn_map { + void *page; +}; + +struct hfi1_qpn_table { + spinlock_t lock; /* protect changes in this struct */ + unsigned flags; /* flags for QP0/1 allocated for each port */ + u32 last; /* last QP number allocated */ + u32 nmaps; /* size of the map table */ + u16 limit; + u8 incr; + /* bit map of free QP numbers other than 0/1 */ + struct qpn_map map[QPNMAP_ENTRIES]; +}; + +struct hfi1_qp_ibdev { + u32 qp_table_size; + u32 qp_table_bits; + struct hfi1_qp __rcu **qp_table; + spinlock_t qpt_lock; + struct hfi1_qpn_table qpn_table; +}; + +static inline u32 qpn_hash(struct hfi1_qp_ibdev *dev, u32 qpn) +{ + return hash_32(qpn, dev->qp_table_bits); +} + +/** + * hfi1_lookup_qpn - return the QP with the given QPN + * @ibp: the ibport + * @qpn: the QP number to look up + * + * The caller must hold the rcu_read_lock(), and keep the lock until + * the returned qp is no longer in use. + */ +static inline struct hfi1_qp *hfi1_lookup_qpn(struct hfi1_ibport *ibp, + u32 qpn) __must_hold(RCU) +{ + struct hfi1_qp *qp = NULL; + + if (unlikely(qpn <= 1)) { + qp = rcu_dereference(ibp->qp[qpn]); + } else { + struct hfi1_ibdev *dev = &ppd_from_ibp(ibp)->dd->verbs_dev; + u32 n = qpn_hash(dev->qp_dev, qpn); + + for (qp = rcu_dereference(dev->qp_dev->qp_table[n]); qp; + qp = rcu_dereference(qp->next)) + if (qp->ibqp.qp_num == qpn) + break; + } + return qp; +} + +/** + * hfi1_error_qp - put a QP into the error state + * @qp: the QP to put into the error state + * @err: the receive completion error to signal if a RWQE is active + * + * Flushes both send and receive work queues. + * Returns true if last WQE event should be generated. + * The QP r_lock and s_lock should be held and interrupts disabled. + * If we are already in error state, just return. + */ +int hfi1_error_qp(struct hfi1_qp *qp, enum ib_wc_status err); + +/** + * hfi1_modify_qp - modify the attributes of a queue pair + * @ibqp: the queue pair who's attributes we're modifying + * @attr: the new attributes + * @attr_mask: the mask of attributes to modify + * @udata: user data for libibverbs.so + * + * Returns 0 on success, otherwise returns an errno. + */ +int hfi1_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); + +int hfi1_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr); + +/** + * hfi1_compute_aeth - compute the AETH (syndrome + MSN) + * @qp: the queue pair to compute the AETH for + * + * Returns the AETH. + */ +__be32 hfi1_compute_aeth(struct hfi1_qp *qp); + +/** + * hfi1_create_qp - create a queue pair for a device + * @ibpd: the protection domain who's device we create the queue pair for + * @init_attr: the attributes of the queue pair + * @udata: user data for libibverbs.so + * + * Returns the queue pair on success, otherwise returns an errno. + * + * Called by the ib_create_qp() core verbs function. + */ +struct ib_qp *hfi1_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +/** + * hfi1_destroy_qp - destroy a queue pair + * @ibqp: the queue pair to destroy + * + * Returns 0 on success. + * + * Note that this can be called while the QP is actively sending or + * receiving! + */ +int hfi1_destroy_qp(struct ib_qp *ibqp); + +/** + * hfi1_get_credit - flush the send work queue of a QP + * @qp: the qp who's send work queue to flush + * @aeth: the Acknowledge Extended Transport Header + * + * The QP s_lock should be held. + */ +void hfi1_get_credit(struct hfi1_qp *qp, u32 aeth); + +/** + * hfi1_qp_init - allocate QP tables + * @dev: a pointer to the hfi1_ibdev + */ +int hfi1_qp_init(struct hfi1_ibdev *dev); + +/** + * hfi1_qp_exit - free the QP related structures + * @dev: a pointer to the hfi1_ibdev + */ +void hfi1_qp_exit(struct hfi1_ibdev *dev); + +/** + * hfi1_qp_waitup - wake up on the indicated event + * @qp: the QP + * @flag: flag the qp on which the qp is stalled + */ +void hfi1_qp_wakeup(struct hfi1_qp *qp, u32 flag); + +struct sdma_engine *qp_to_sdma_engine(struct hfi1_qp *qp, u8 sc5); + +struct qp_iter; + +/** + * qp_iter_init - wake up on the indicated event + * @dev: the hfi1_ibdev + */ +struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev); + +/** + * qp_iter_next - wakeup on the indicated event + * @iter: the iterator for the qp hash list + */ +int qp_iter_next(struct qp_iter *iter); + +/** + * qp_iter_next - wake up on the indicated event + * @s: the seq_file to emit the qp information on + * @iter: the iterator for the qp hash list + */ +void qp_iter_print(struct seq_file *s, struct qp_iter *iter); + +/** + * qp_comm_est - handle trap with QP established + * @qp: the QP + */ +void qp_comm_est(struct hfi1_qp *qp); + +#endif /* _QP_H */ diff --git a/drivers/staging/rdma/hfi1/qsfp.c b/drivers/staging/rdma/hfi1/qsfp.c new file mode 100644 index 000000000..313893615 --- /dev/null +++ b/drivers/staging/rdma/hfi1/qsfp.c @@ -0,0 +1,546 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include + +#include "hfi.h" +#include "twsi.h" + +/* + * QSFP support for hfi driver, using "Two Wire Serial Interface" driver + * in twsi.c + */ +#define I2C_MAX_RETRY 4 + +/* + * Unlocked i2c write. Must hold dd->qsfp_i2c_mutex. + */ +static int __i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, + int offset, void *bp, int len) +{ + struct hfi1_devdata *dd = ppd->dd; + int ret, cnt; + u8 *buff = bp; + + /* Make sure TWSI bus is in sane state. */ + ret = hfi1_twsi_reset(dd, target); + if (ret) { + hfi1_dev_porterr(dd, ppd->port, + "I2C interface Reset for write failed\n"); + return -EIO; + } + + cnt = 0; + while (cnt < len) { + int wlen = len - cnt; + + ret = hfi1_twsi_blk_wr(dd, target, i2c_addr, offset, + buff + cnt, wlen); + if (ret) { + /* hfi1_twsi_blk_wr() 1 for error, else 0 */ + return -EIO; + } + offset += wlen; + cnt += wlen; + } + + /* Must wait min 20us between qsfp i2c transactions */ + udelay(20); + + return cnt; +} + +int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, + void *bp, int len) +{ + struct hfi1_devdata *dd = ppd->dd; + int ret; + + ret = mutex_lock_interruptible(&dd->qsfp_i2c_mutex); + if (!ret) { + ret = __i2c_write(ppd, target, i2c_addr, offset, bp, len); + mutex_unlock(&dd->qsfp_i2c_mutex); + } + + return ret; +} + +/* + * Unlocked i2c read. Must hold dd->qsfp_i2c_mutex. + */ +static int __i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, + int offset, void *bp, int len) +{ + struct hfi1_devdata *dd = ppd->dd; + int ret, cnt, pass = 0; + int stuck = 0; + u8 *buff = bp; + + /* Make sure TWSI bus is in sane state. */ + ret = hfi1_twsi_reset(dd, target); + if (ret) { + hfi1_dev_porterr(dd, ppd->port, + "I2C interface Reset for read failed\n"); + ret = -EIO; + stuck = 1; + goto exit; + } + + cnt = 0; + while (cnt < len) { + int rlen = len - cnt; + + ret = hfi1_twsi_blk_rd(dd, target, i2c_addr, offset, + buff + cnt, rlen); + /* Some QSFP's fail first try. Retry as experiment */ + if (ret && cnt == 0 && ++pass < I2C_MAX_RETRY) + continue; + if (ret) { + /* hfi1_twsi_blk_rd() 1 for error, else 0 */ + ret = -EIO; + goto exit; + } + offset += rlen; + cnt += rlen; + } + + ret = cnt; + +exit: + if (stuck) + dd_dev_err(dd, "I2C interface bus stuck non-idle\n"); + + if (pass >= I2C_MAX_RETRY && ret) + hfi1_dev_porterr(dd, ppd->port, + "I2C failed even retrying\n"); + else if (pass) + hfi1_dev_porterr(dd, ppd->port, "I2C retries: %d\n", pass); + + /* Must wait min 20us between qsfp i2c transactions */ + udelay(20); + + return ret; +} + +int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, + void *bp, int len) +{ + struct hfi1_devdata *dd = ppd->dd; + int ret; + + ret = mutex_lock_interruptible(&dd->qsfp_i2c_mutex); + if (!ret) { + ret = __i2c_read(ppd, target, i2c_addr, offset, bp, len); + mutex_unlock(&dd->qsfp_i2c_mutex); + } + + return ret; +} + +int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len) +{ + int count = 0; + int offset; + int nwrite; + int ret; + u8 page; + + ret = mutex_lock_interruptible(&ppd->dd->qsfp_i2c_mutex); + if (ret) + return ret; + + while (count < len) { + /* + * Set the qsfp page based on a zero-based addresss + * and a page size of QSFP_PAGESIZE bytes. + */ + page = (u8)(addr / QSFP_PAGESIZE); + + ret = __i2c_write(ppd, target, QSFP_DEV, + QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1); + if (ret != 1) { + hfi1_dev_porterr( + ppd->dd, + ppd->port, + "can't write QSFP_PAGE_SELECT_BYTE: %d\n", ret); + ret = -EIO; + break; + } + + /* truncate write to end of page if crossing page boundary */ + offset = addr % QSFP_PAGESIZE; + nwrite = len - count; + if ((offset + nwrite) > QSFP_PAGESIZE) + nwrite = QSFP_PAGESIZE - offset; + + ret = __i2c_write(ppd, target, QSFP_DEV, offset, bp + count, + nwrite); + if (ret <= 0) /* stop on error or nothing read */ + break; + + count += ret; + addr += ret; + } + + mutex_unlock(&ppd->dd->qsfp_i2c_mutex); + + if (ret < 0) + return ret; + return count; +} + +int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len) +{ + int count = 0; + int offset; + int nread; + int ret; + u8 page; + + ret = mutex_lock_interruptible(&ppd->dd->qsfp_i2c_mutex); + if (ret) + return ret; + + while (count < len) { + /* + * Set the qsfp page based on a zero-based address + * and a page size of QSFP_PAGESIZE bytes. + */ + page = (u8)(addr / QSFP_PAGESIZE); + ret = __i2c_write(ppd, target, QSFP_DEV, + QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1); + if (ret != 1) { + hfi1_dev_porterr( + ppd->dd, + ppd->port, + "can't write QSFP_PAGE_SELECT_BYTE: %d\n", ret); + ret = -EIO; + break; + } + + /* truncate read to end of page if crossing page boundary */ + offset = addr % QSFP_PAGESIZE; + nread = len - count; + if ((offset + nread) > QSFP_PAGESIZE) + nread = QSFP_PAGESIZE - offset; + + ret = __i2c_read(ppd, target, QSFP_DEV, offset, bp + count, + nread); + if (ret <= 0) /* stop on error or nothing read */ + break; + + count += ret; + addr += ret; + } + + mutex_unlock(&ppd->dd->qsfp_i2c_mutex); + + if (ret < 0) + return ret; + return count; +} + +/* + * This function caches the QSFP memory range in 128 byte chunks. + * As an example, the next byte after address 255 is byte 128 from + * upper page 01H (if existing) rather than byte 0 from lower page 00H. + */ +int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp) +{ + u32 target = ppd->dd->hfi1_id; + int ret; + unsigned long flags; + u8 *cache = &cp->cache[0]; + + /* ensure sane contents on invalid reads, for cable swaps */ + memset(cache, 0, (QSFP_MAX_NUM_PAGES*128)); + dd_dev_info(ppd->dd, "%s: called\n", __func__); + if (!qsfp_mod_present(ppd)) { + ret = -ENODEV; + goto bail; + } + + ret = qsfp_read(ppd, target, 0, cache, 256); + if (ret != 256) { + dd_dev_info(ppd->dd, + "%s: Read of pages 00H failed, expected 256, got %d\n", + __func__, ret); + goto bail; + } + + if (cache[0] != 0x0C && cache[0] != 0x0D) + goto bail; + + /* Is paging enabled? */ + if (!(cache[2] & 4)) { + + /* Paging enabled, page 03 required */ + if ((cache[195] & 0xC0) == 0xC0) { + /* all */ + ret = qsfp_read(ppd, target, 384, cache + 256, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s: failed\n", __func__); + goto bail; + } + ret = qsfp_read(ppd, target, 640, cache + 384, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s: failed\n", __func__); + goto bail; + } + ret = qsfp_read(ppd, target, 896, cache + 512, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s: failed\n", __func__); + goto bail; + } + } else if ((cache[195] & 0x80) == 0x80) { + /* only page 2 and 3 */ + ret = qsfp_read(ppd, target, 640, cache + 384, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s: failed\n", __func__); + goto bail; + } + ret = qsfp_read(ppd, target, 896, cache + 512, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s: failed\n", __func__); + goto bail; + } + } else if ((cache[195] & 0x40) == 0x40) { + /* only page 1 and 3 */ + ret = qsfp_read(ppd, target, 384, cache + 256, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s: failed\n", __func__); + goto bail; + } + ret = qsfp_read(ppd, target, 896, cache + 512, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s: failed\n", __func__); + goto bail; + } + } else { + /* only page 3 */ + ret = qsfp_read(ppd, target, 896, cache + 512, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s: failed\n", __func__); + goto bail; + } + } + } + + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + ppd->qsfp_info.cache_valid = 1; + ppd->qsfp_info.cache_refresh_required = 0; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags); + + return 0; + +bail: + memset(cache, 0, (QSFP_MAX_NUM_PAGES*128)); + return ret; +} + +const char * const hfi1_qsfp_devtech[16] = { + "850nm VCSEL", "1310nm VCSEL", "1550nm VCSEL", "1310nm FP", + "1310nm DFB", "1550nm DFB", "1310nm EML", "1550nm EML", + "Cu Misc", "1490nm DFB", "Cu NoEq", "Cu Eq", + "Undef", "Cu Active BothEq", "Cu FarEq", "Cu NearEq" +}; + +#define QSFP_DUMP_CHUNK 16 /* Holds longest string */ +#define QSFP_DEFAULT_HDR_CNT 224 + +static const char *pwr_codes = "1.5W2.0W2.5W3.5W"; + +int qsfp_mod_present(struct hfi1_pportdata *ppd) +{ + if (HFI1_CAP_IS_KSET(QSFP_ENABLED)) { + struct hfi1_devdata *dd = ppd->dd; + u64 reg; + + reg = read_csr(dd, + dd->hfi1_id ? ASIC_QSFP2_IN : ASIC_QSFP1_IN); + return !(reg & QSFP_HFI0_MODPRST_N); + } + /* always return cable present */ + return 1; +} + +/* + * This function maps QSFP memory addresses in 128 byte chunks in the following + * fashion per the CableInfo SMA query definition in the IBA 1.3 spec/OPA Gen 1 + * spec + * For addr 000-127, lower page 00h + * For addr 128-255, upper page 00h + * For addr 256-383, upper page 01h + * For addr 384-511, upper page 02h + * For addr 512-639, upper page 03h + * + * For addresses beyond this range, it returns the invalid range of data buffer + * set to 0. + * For upper pages that are optional, if they are not valid, returns the + * particular range of bytes in the data buffer set to 0. + */ +int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr, u32 len, + u8 *data) +{ + struct hfi1_pportdata *ppd; + u32 excess_len = 0; + int ret = 0; + + if (port_num > dd->num_pports || port_num < 1) { + dd_dev_info(dd, "%s: Invalid port number %d\n", + __func__, port_num); + ret = -EINVAL; + goto set_zeroes; + } + + ppd = dd->pport + (port_num - 1); + if (!qsfp_mod_present(ppd)) { + ret = -ENODEV; + goto set_zeroes; + } + + if (!ppd->qsfp_info.cache_valid) { + ret = -EINVAL; + goto set_zeroes; + } + + if (addr >= (QSFP_MAX_NUM_PAGES * 128)) { + ret = -ERANGE; + goto set_zeroes; + } + + if ((addr + len) > (QSFP_MAX_NUM_PAGES * 128)) { + excess_len = (addr + len) - (QSFP_MAX_NUM_PAGES * 128); + memcpy(data, &ppd->qsfp_info.cache[addr], (len - excess_len)); + data += (len - excess_len); + goto set_zeroes; + } + + memcpy(data, &ppd->qsfp_info.cache[addr], len); + return 0; + +set_zeroes: + memset(data, 0, excess_len); + return ret; +} + +int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len) +{ + u8 *cache = &ppd->qsfp_info.cache[0]; + u8 bin_buff[QSFP_DUMP_CHUNK]; + char lenstr[6]; + int sofar, ret; + int bidx = 0; + u8 *atten = &cache[QSFP_ATTEN_OFFS]; + u8 *vendor_oui = &cache[QSFP_VOUI_OFFS]; + + sofar = 0; + lenstr[0] = ' '; + lenstr[1] = '\0'; + + if (ppd->qsfp_info.cache_valid) { + + if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS])) + sprintf(lenstr, "%dM ", cache[QSFP_MOD_LEN_OFFS]); + + sofar += scnprintf(buf + sofar, len - sofar, "PWR:%.3sW\n", + pwr_codes + + (QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]) * 4)); + + sofar += scnprintf(buf + sofar, len - sofar, "TECH:%s%s\n", + lenstr, + hfi1_qsfp_devtech[(cache[QSFP_MOD_TECH_OFFS]) >> 4]); + + sofar += scnprintf(buf + sofar, len - sofar, "Vendor:%.*s\n", + QSFP_VEND_LEN, &cache[QSFP_VEND_OFFS]); + + sofar += scnprintf(buf + sofar, len - sofar, "OUI:%06X\n", + QSFP_OUI(vendor_oui)); + + sofar += scnprintf(buf + sofar, len - sofar, "Part#:%.*s\n", + QSFP_PN_LEN, &cache[QSFP_PN_OFFS]); + + sofar += scnprintf(buf + sofar, len - sofar, "Rev:%.*s\n", + QSFP_REV_LEN, &cache[QSFP_REV_OFFS]); + + if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS])) + sofar += scnprintf(buf + sofar, len - sofar, + "Atten:%d, %d\n", + QSFP_ATTEN_SDR(atten), + QSFP_ATTEN_DDR(atten)); + + sofar += scnprintf(buf + sofar, len - sofar, "Serial:%.*s\n", + QSFP_SN_LEN, &cache[QSFP_SN_OFFS]); + + sofar += scnprintf(buf + sofar, len - sofar, "Date:%.*s\n", + QSFP_DATE_LEN, &cache[QSFP_DATE_OFFS]); + + sofar += scnprintf(buf + sofar, len - sofar, "Lot:%.*s\n", + QSFP_LOT_LEN, &cache[QSFP_LOT_OFFS]); + + while (bidx < QSFP_DEFAULT_HDR_CNT) { + int iidx; + + memcpy(bin_buff, &cache[bidx], QSFP_DUMP_CHUNK); + for (iidx = 0; iidx < QSFP_DUMP_CHUNK; ++iidx) { + sofar += scnprintf(buf + sofar, len-sofar, + " %02X", bin_buff[iidx]); + } + sofar += scnprintf(buf + sofar, len - sofar, "\n"); + bidx += QSFP_DUMP_CHUNK; + } + } + ret = sofar; + return ret; +} diff --git a/drivers/staging/rdma/hfi1/qsfp.h b/drivers/staging/rdma/hfi1/qsfp.h new file mode 100644 index 000000000..d30c2a6ba --- /dev/null +++ b/drivers/staging/rdma/hfi1/qsfp.h @@ -0,0 +1,222 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +/* QSFP support common definitions, for hfi driver */ + +#define QSFP_DEV 0xA0 +#define QSFP_PWR_LAG_MSEC 2000 +#define QSFP_MODPRS_LAG_MSEC 20 +/* 128 byte pages, per SFF 8636 rev 2.4 */ +#define QSFP_MAX_NUM_PAGES 5 + +/* + * Below are masks for QSFP pins. Pins are the same for HFI0 and HFI1. + * _N means asserted low + */ +#define QSFP_HFI0_I2CCLK (1 << 0) +#define QSFP_HFI0_I2CDAT (1 << 1) +#define QSFP_HFI0_RESET_N (1 << 2) +#define QSFP_HFI0_INT_N (1 << 3) +#define QSFP_HFI0_MODPRST_N (1 << 4) + +/* QSFP is paged at 256 bytes */ +#define QSFP_PAGESIZE 256 + +/* Defined fields that Intel requires of qualified cables */ +/* Byte 0 is Identifier, not checked */ +/* Byte 1 is reserved "status MSB" */ +/* Byte 2 is "status LSB" We only care that D2 "Flat Mem" is set. */ +/* + * Rest of first 128 not used, although 127 is reserved for page select + * if module is not "Flat memory". + */ +#define QSFP_PAGE_SELECT_BYTE_OFFS 127 +/* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */ +#define QSFP_MOD_ID_OFFS 128 +/* + * Byte 129 is "Extended Identifier". We only care about D7,D6: Power class + * 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W + */ +#define QSFP_MOD_PWR_OFFS 129 +/* Byte 130 is Connector type. Not Intel req'd */ +/* Bytes 131..138 are Transceiver types, bit maps for various tech, none IB */ +/* Byte 139 is encoding. code 0x01 is 8b10b. Not Intel req'd */ +/* byte 140 is nominal bit-rate, in units of 100Mbits/sec Not Intel req'd */ +/* Byte 141 is Extended Rate Select. Not Intel req'd */ +/* Bytes 142..145 are lengths for various fiber types. Not Intel req'd */ +/* Byte 146 is length for Copper. Units of 1 meter */ +#define QSFP_MOD_LEN_OFFS 146 +/* + * Byte 147 is Device technology. D0..3 not Intel req'd + * D4..7 select from 15 choices, translated by table: + */ +#define QSFP_MOD_TECH_OFFS 147 +extern const char *const hfi1_qsfp_devtech[16]; +/* Active Equalization includes fiber, copper full EQ, and copper near Eq */ +#define QSFP_IS_ACTIVE(tech) ((0xA2FF >> ((tech) >> 4)) & 1) +/* Active Equalization includes fiber, copper full EQ, and copper far Eq */ +#define QSFP_IS_ACTIVE_FAR(tech) ((0x32FF >> ((tech) >> 4)) & 1) +/* Attenuation should be valid for copper other than full/near Eq */ +#define QSFP_HAS_ATTEN(tech) ((0x4D00 >> ((tech) >> 4)) & 1) +/* Length is only valid if technology is "copper" */ +#define QSFP_IS_CU(tech) ((0xED00 >> ((tech) >> 4)) & 1) +#define QSFP_TECH_1490 9 + +#define QSFP_OUI(oui) (((unsigned)oui[0] << 16) | ((unsigned)oui[1] << 8) | \ + oui[2]) +#define QSFP_OUI_AMPHENOL 0x415048 +#define QSFP_OUI_FINISAR 0x009065 +#define QSFP_OUI_GORE 0x002177 + +/* Bytes 148..163 are Vendor Name, Left-justified Blank-filled */ +#define QSFP_VEND_OFFS 148 +#define QSFP_VEND_LEN 16 +/* Byte 164 is IB Extended transceiver codes Bits D0..3 are SDR,DDR,QDR,EDR */ +#define QSFP_IBXCV_OFFS 164 +/* Bytes 165..167 are Vendor OUI number */ +#define QSFP_VOUI_OFFS 165 +#define QSFP_VOUI_LEN 3 +/* Bytes 168..183 are Vendor Part Number, string */ +#define QSFP_PN_OFFS 168 +#define QSFP_PN_LEN 16 +/* Bytes 184,185 are Vendor Rev. Left Justified, Blank-filled */ +#define QSFP_REV_OFFS 184 +#define QSFP_REV_LEN 2 +/* + * Bytes 186,187 are Wavelength, if Optical. Not Intel req'd + * If copper, they are attenuation in dB: + * Byte 186 is at 2.5Gb/sec (SDR), Byte 187 at 5.0Gb/sec (DDR) + */ +#define QSFP_ATTEN_OFFS 186 +#define QSFP_ATTEN_LEN 2 +/* Bytes 188,189 are Wavelength tolerance, not Intel req'd */ +/* Byte 190 is Max Case Temp. Not Intel req'd */ +/* Byte 191 is LSB of sum of bytes 128..190. Not Intel req'd */ +#define QSFP_CC_OFFS 191 +/* Bytes 192..195 are Options implemented in qsfp. Not Intel req'd */ +/* Bytes 196..211 are Serial Number, String */ +#define QSFP_SN_OFFS 196 +#define QSFP_SN_LEN 16 +/* Bytes 212..219 are date-code YYMMDD (MM==1 for Jan) */ +#define QSFP_DATE_OFFS 212 +#define QSFP_DATE_LEN 6 +/* Bytes 218,219 are optional lot-code, string */ +#define QSFP_LOT_OFFS 218 +#define QSFP_LOT_LEN 2 +/* Bytes 220, 221 indicate monitoring options, Not Intel req'd */ +/* Byte 223 is LSB of sum of bytes 192..222 */ +#define QSFP_CC_EXT_OFFS 223 + +/* + * Interrupt flag masks + */ +#define QSFP_DATA_NOT_READY 0x01 + +#define QSFP_HIGH_TEMP_ALARM 0x80 +#define QSFP_LOW_TEMP_ALARM 0x40 +#define QSFP_HIGH_TEMP_WARNING 0x20 +#define QSFP_LOW_TEMP_WARNING 0x10 + +#define QSFP_HIGH_VCC_ALARM 0x80 +#define QSFP_LOW_VCC_ALARM 0x40 +#define QSFP_HIGH_VCC_WARNING 0x20 +#define QSFP_LOW_VCC_WARNING 0x10 + +#define QSFP_HIGH_POWER_ALARM 0x88 +#define QSFP_LOW_POWER_ALARM 0x44 +#define QSFP_HIGH_POWER_WARNING 0x22 +#define QSFP_LOW_POWER_WARNING 0x11 + +#define QSFP_HIGH_BIAS_ALARM 0x88 +#define QSFP_LOW_BIAS_ALARM 0x44 +#define QSFP_HIGH_BIAS_WARNING 0x22 +#define QSFP_LOW_BIAS_WARNING 0x11 + +/* + * struct qsfp_data encapsulates state of QSFP device for one port. + * it will be part of port-specific data if a board supports QSFP. + * + * Since multiple board-types use QSFP, and their pport_data structs + * differ (in the chip-specific section), we need a pointer to its head. + * + * Avoiding premature optimization, we will have one work_struct per port, + * and let the qsfp_lock arbitrate access to common resources. + * + */ + +#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3) +#define QSFP_ATTEN_SDR(attenarray) (attenarray[0]) +#define QSFP_ATTEN_DDR(attenarray) (attenarray[1]) + +struct qsfp_data { + /* Helps to find our way */ + struct hfi1_pportdata *ppd; + struct work_struct qsfp_work; + u8 cache[QSFP_MAX_NUM_PAGES*128]; + spinlock_t qsfp_lock; + u8 check_interrupt_flags; + u8 qsfp_interrupt_functional; + u8 cache_valid; + u8 cache_refresh_required; +}; + +int refresh_qsfp_cache(struct hfi1_pportdata *ppd, + struct qsfp_data *cp); +int qsfp_mod_present(struct hfi1_pportdata *ppd); +int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr, + u32 len, u8 *data); + +int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, + int offset, void *bp, int len); +int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, + int offset, void *bp, int len); +int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len); +int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len); diff --git a/drivers/staging/rdma/hfi1/rc.c b/drivers/staging/rdma/hfi1/rc.c new file mode 100644 index 000000000..632dd5ba7 --- /dev/null +++ b/drivers/staging/rdma/hfi1/rc.c @@ -0,0 +1,2426 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include + +#include "hfi.h" +#include "qp.h" +#include "sdma.h" +#include "trace.h" + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_RC_##x + +static void rc_timeout(unsigned long arg); + +static u32 restart_sge(struct hfi1_sge_state *ss, struct hfi1_swqe *wqe, + u32 psn, u32 pmtu) +{ + u32 len; + + len = delta_psn(psn, wqe->psn) * pmtu; + ss->sge = wqe->sg_list[0]; + ss->sg_list = wqe->sg_list + 1; + ss->num_sge = wqe->wr.num_sge; + ss->total_len = wqe->length; + hfi1_skip_sge(ss, len, 0); + return wqe->length - len; +} + +static void start_timer(struct hfi1_qp *qp) +{ + qp->s_flags |= HFI1_S_TIMER; + qp->s_timer.function = rc_timeout; + /* 4.096 usec. * (1 << qp->timeout) */ + qp->s_timer.expires = jiffies + qp->timeout_jiffies; + add_timer(&qp->s_timer); +} + +/** + * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read) + * @dev: the device for this QP + * @qp: a pointer to the QP + * @ohdr: a pointer to the IB header being constructed + * @pmtu: the path MTU + * + * Return 1 if constructed; otherwise, return 0. + * Note that we are in the responder's side of the QP context. + * Note the QP s_lock must be held. + */ +static int make_rc_ack(struct hfi1_ibdev *dev, struct hfi1_qp *qp, + struct hfi1_other_headers *ohdr, u32 pmtu) +{ + struct hfi1_ack_entry *e; + u32 hwords; + u32 len; + u32 bth0; + u32 bth2; + int middle = 0; + + /* Don't send an ACK if we aren't supposed to. */ + if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK)) + goto bail; + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + + switch (qp->s_ack_state) { + case OP(RDMA_READ_RESPONSE_LAST): + case OP(RDMA_READ_RESPONSE_ONLY): + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + if (e->rdma_sge.mr) { + hfi1_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + /* FALLTHROUGH */ + case OP(ATOMIC_ACKNOWLEDGE): + /* + * We can increment the tail pointer now that the last + * response has been sent instead of only being + * constructed. + */ + if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC) + qp->s_tail_ack_queue = 0; + /* FALLTHROUGH */ + case OP(SEND_ONLY): + case OP(ACKNOWLEDGE): + /* Check for no next entry in the queue. */ + if (qp->r_head_ack_queue == qp->s_tail_ack_queue) { + if (qp->s_flags & HFI1_S_ACK_PENDING) + goto normal; + goto bail; + } + + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + if (e->opcode == OP(RDMA_READ_REQUEST)) { + /* + * If a RDMA read response is being resent and + * we haven't seen the duplicate request yet, + * then stop sending the remaining responses the + * responder has seen until the requester re-sends it. + */ + len = e->rdma_sge.sge_length; + if (len && !e->rdma_sge.mr) { + qp->s_tail_ack_queue = qp->r_head_ack_queue; + goto bail; + } + /* Copy SGE state in case we need to resend */ + qp->s_rdma_mr = e->rdma_sge.mr; + if (qp->s_rdma_mr) + hfi1_get_mr(qp->s_rdma_mr); + qp->s_ack_rdma_sge.sge = e->rdma_sge; + qp->s_ack_rdma_sge.num_sge = 1; + qp->s_cur_sge = &qp->s_ack_rdma_sge; + if (len > pmtu) { + len = pmtu; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST); + } else { + qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY); + e->sent = 1; + } + ohdr->u.aeth = hfi1_compute_aeth(qp); + hwords++; + qp->s_ack_rdma_psn = e->psn; + bth2 = mask_psn(qp->s_ack_rdma_psn++); + } else { + /* COMPARE_SWAP or FETCH_ADD */ + qp->s_cur_sge = NULL; + len = 0; + qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE); + ohdr->u.at.aeth = hfi1_compute_aeth(qp); + ohdr->u.at.atomic_ack_eth[0] = + cpu_to_be32(e->atomic_data >> 32); + ohdr->u.at.atomic_ack_eth[1] = + cpu_to_be32(e->atomic_data); + hwords += sizeof(ohdr->u.at) / sizeof(u32); + bth2 = mask_psn(e->psn); + e->sent = 1; + } + bth0 = qp->s_ack_state << 24; + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_READ_RESPONSE_MIDDLE): + qp->s_cur_sge = &qp->s_ack_rdma_sge; + qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr; + if (qp->s_rdma_mr) + hfi1_get_mr(qp->s_rdma_mr); + len = qp->s_ack_rdma_sge.sge.sge_length; + if (len > pmtu) { + len = pmtu; + middle = HFI1_CAP_IS_KSET(SDMA_AHG); + } else { + ohdr->u.aeth = hfi1_compute_aeth(qp); + hwords++; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + e->sent = 1; + } + bth0 = qp->s_ack_state << 24; + bth2 = mask_psn(qp->s_ack_rdma_psn++); + break; + + default: +normal: + /* + * Send a regular ACK. + * Set the s_ack_state so we wait until after sending + * the ACK before setting s_ack_state to ACKNOWLEDGE + * (see above). + */ + qp->s_ack_state = OP(SEND_ONLY); + qp->s_flags &= ~HFI1_S_ACK_PENDING; + qp->s_cur_sge = NULL; + if (qp->s_nak_state) + ohdr->u.aeth = + cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) | + (qp->s_nak_state << + HFI1_AETH_CREDIT_SHIFT)); + else + ohdr->u.aeth = hfi1_compute_aeth(qp); + hwords++; + len = 0; + bth0 = OP(ACKNOWLEDGE) << 24; + bth2 = mask_psn(qp->s_ack_psn); + } + qp->s_rdma_ack_cnt++; + qp->s_hdrwords = hwords; + qp->s_cur_size = len; + hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle); + return 1; + +bail: + qp->s_ack_state = OP(ACKNOWLEDGE); + /* + * Ensure s_rdma_ack_cnt changes are committed prior to resetting + * HFI1_S_RESP_PENDING + */ + smp_wmb(); + qp->s_flags &= ~(HFI1_S_RESP_PENDING + | HFI1_S_ACK_PENDING + | HFI1_S_AHG_VALID); + return 0; +} + +/** + * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC) + * @qp: a pointer to the QP + * + * Return 1 if constructed; otherwise, return 0. + */ +int hfi1_make_rc_req(struct hfi1_qp *qp) +{ + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + struct hfi1_other_headers *ohdr; + struct hfi1_sge_state *ss; + struct hfi1_swqe *wqe; + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + u32 hwords = 5; + u32 len; + u32 bth0 = 0; + u32 bth2; + u32 pmtu = qp->pmtu; + char newreq; + unsigned long flags; + int ret = 0; + int middle = 0; + int delta; + + ohdr = &qp->s_hdr->ibh.u.oth; + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + ohdr = &qp->s_hdr->ibh.u.l.oth; + + /* + * The lock is needed to synchronize between the sending tasklet, + * the receive interrupt handler, and timeout re-sends. + */ + spin_lock_irqsave(&qp->s_lock, flags); + + /* Sending responses has higher priority over sending requests. */ + if ((qp->s_flags & HFI1_S_RESP_PENDING) && + make_rc_ack(dev, qp, ohdr, pmtu)) + goto done; + + if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_SEND_OK)) { + if (!(ib_hfi1_state_ops[qp->state] & HFI1_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_iowait.sdma_busy)) { + qp->s_flags |= HFI1_S_WAIT_DMA; + goto bail; + } + clear_ahg(qp); + wqe = get_swqe_ptr(qp, qp->s_last); + hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ? + IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); + /* will get called again */ + goto done; + } + + if (qp->s_flags & (HFI1_S_WAIT_RNR | HFI1_S_WAIT_ACK)) + goto bail; + + if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) { + if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) { + qp->s_flags |= HFI1_S_WAIT_PSN; + goto bail; + } + qp->s_sending_psn = qp->s_psn; + qp->s_sending_hpsn = qp->s_psn - 1; + } + + /* Send a request. */ + wqe = get_swqe_ptr(qp, qp->s_cur); + switch (qp->s_state) { + default: + if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_NEXT_SEND_OK)) + goto bail; + /* + * Resend an old request or start a new one. + * + * We keep track of the current SWQE so that + * we don't reset the "furthest progress" state + * if we need to back up. + */ + newreq = 0; + if (qp->s_cur == qp->s_tail) { + /* Check if send work queue is empty. */ + if (qp->s_tail == qp->s_head) { + clear_ahg(qp); + goto bail; + } + /* + * If a fence is requested, wait for previous + * RDMA read and atomic operations to finish. + */ + if ((wqe->wr.send_flags & IB_SEND_FENCE) && + qp->s_num_rd_atomic) { + qp->s_flags |= HFI1_S_WAIT_FENCE; + goto bail; + } + wqe->psn = qp->s_next_psn; + newreq = 1; + } + /* + * Note that we have to be careful not to modify the + * original work request since we may need to resend + * it. + */ + len = wqe->length; + ss = &qp->s_sge; + bth2 = mask_psn(qp->s_psn); + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + /* If no credit, return. */ + if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT) && + cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) { + qp->s_flags |= HFI1_S_WAIT_SSN_CREDIT; + goto bail; + } + wqe->lpsn = wqe->psn; + if (len > pmtu) { + wqe->lpsn += (len - 1) / pmtu; + qp->s_state = OP(SEND_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_ONLY); + else { + qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_WRITE: + if (newreq && !(qp->s_flags & HFI1_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + /* FALLTHROUGH */ + case IB_WR_RDMA_WRITE_WITH_IMM: + /* If no credit, return. */ + if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT) && + cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) { + qp->s_flags |= HFI1_S_WAIT_SSN_CREDIT; + goto bail; + } + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + hwords += sizeof(struct ib_reth) / sizeof(u32); + wqe->lpsn = wqe->psn; + if (len > pmtu) { + wqe->lpsn += (len - 1) / pmtu; + qp->s_state = OP(RDMA_WRITE_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_ONLY); + else { + qp->s_state = + OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after RETH */ + ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_READ: + /* + * Don't allow more operations to be started + * than the QP limits allow. + */ + if (newreq) { + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= HFI1_S_WAIT_RDMAR; + goto bail; + } + qp->s_num_rd_atomic++; + if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + /* + * Adjust s_next_psn to count the + * expected number of responses. + */ + if (len > pmtu) + qp->s_next_psn += (len - 1) / pmtu; + wqe->lpsn = qp->s_next_psn++; + } + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); + ss = NULL; + len = 0; + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + /* + * Don't allow more operations to be started + * than the QP limits allow. + */ + if (newreq) { + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= HFI1_S_WAIT_RDMAR; + goto bail; + } + qp->s_num_rd_atomic++; + if (!(qp->s_flags & HFI1_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + wqe->lpsn = wqe->psn; + } + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + qp->s_state = OP(COMPARE_SWAP); + ohdr->u.atomic_eth.swap_data = cpu_to_be64( + wqe->wr.wr.atomic.swap); + ohdr->u.atomic_eth.compare_data = cpu_to_be64( + wqe->wr.wr.atomic.compare_add); + } else { + qp->s_state = OP(FETCH_ADD); + ohdr->u.atomic_eth.swap_data = cpu_to_be64( + wqe->wr.wr.atomic.compare_add); + ohdr->u.atomic_eth.compare_data = 0; + } + ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32( + wqe->wr.wr.atomic.remote_addr >> 32); + ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32( + wqe->wr.wr.atomic.remote_addr); + ohdr->u.atomic_eth.rkey = cpu_to_be32( + wqe->wr.wr.atomic.rkey); + hwords += sizeof(struct ib_atomic_eth) / sizeof(u32); + ss = NULL; + len = 0; + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + default: + goto bail; + } + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + qp->s_len = wqe->length; + if (newreq) { + qp->s_tail++; + if (qp->s_tail >= qp->s_size) + qp->s_tail = 0; + } + if (wqe->wr.opcode == IB_WR_RDMA_READ) + qp->s_psn = wqe->lpsn + 1; + else { + qp->s_psn++; + if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + } + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + /* + * qp->s_state is normally set to the opcode of the + * last packet constructed for new requests and therefore + * is never set to RDMA read response. + * RDMA_READ_RESPONSE_FIRST is used by the ACK processing + * thread to indicate a SEND needs to be restarted from an + * earlier PSN without interfering with the sending thread. + * See restart_rc(). + */ + qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu); + /* FALLTHROUGH */ + case OP(SEND_FIRST): + qp->s_state = OP(SEND_MIDDLE); + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + bth2 = mask_psn(qp->s_psn++); + if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + middle = HFI1_CAP_IS_KSET(SDMA_AHG); + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_LAST); + else { + qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth2 |= IB_BTH_REQ_ACK; + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_LAST): + /* + * qp->s_state is normally set to the opcode of the + * last packet constructed for new requests and therefore + * is never set to RDMA read response. + * RDMA_READ_RESPONSE_LAST is used by the ACK processing + * thread to indicate a RDMA write needs to be restarted from + * an earlier PSN without interfering with the sending thread. + * See restart_rc(). + */ + qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_FIRST): + qp->s_state = OP(RDMA_WRITE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + bth2 = mask_psn(qp->s_psn++); + if (cmp_psn(qp->s_psn, qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + middle = HFI1_CAP_IS_KSET(SDMA_AHG); + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_LAST); + else { + qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + bth2 |= IB_BTH_REQ_ACK; + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_MIDDLE): + /* + * qp->s_state is normally set to the opcode of the + * last packet constructed for new requests and therefore + * is never set to RDMA read response. + * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing + * thread to indicate a RDMA read needs to be restarted from + * an earlier PSN without interfering with the sending thread. + * See restart_rc(). + */ + len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu; + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); + bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK; + qp->s_psn = wqe->lpsn + 1; + ss = NULL; + len = 0; + qp->s_cur++; + if (qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + } + qp->s_sending_hpsn = bth2; + delta = delta_psn(bth2, wqe->psn); + if (delta && delta % HFI1_PSN_CREDIT == 0) + bth2 |= IB_BTH_REQ_ACK; + if (qp->s_flags & HFI1_S_SEND_ONE) { + qp->s_flags &= ~HFI1_S_SEND_ONE; + qp->s_flags |= HFI1_S_WAIT_ACK; + bth2 |= IB_BTH_REQ_ACK; + } + qp->s_len -= len; + qp->s_hdrwords = hwords; + qp->s_cur_sge = ss; + qp->s_cur_size = len; + hfi1_make_ruc_header( + qp, + ohdr, + bth0 | (qp->s_state << 24), + bth2, + middle); +done: + ret = 1; + goto unlock; + +bail: + qp->s_flags &= ~HFI1_S_BUSY; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +/** + * hfi1_send_rc_ack - Construct an ACK packet and send it + * @qp: a pointer to the QP + * + * This is called from hfi1_rc_rcv() and handle_receive_interrupt(). + * Note that RDMA reads and atomics are handled in the + * send side QP state and tasklet. + */ +void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct hfi1_qp *qp, + int is_fecn) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u64 pbc, pbc_flags = 0; + u16 lrh0; + u16 sc5; + u32 bth0; + u32 hwords; + u32 vl, plen; + struct send_context *sc; + struct pio_buf *pbuf; + struct hfi1_ib_header hdr; + struct hfi1_other_headers *ohdr; + + /* Don't send ACK or NAK if a RDMA read or atomic is pending. */ + if (qp->s_flags & HFI1_S_RESP_PENDING) + goto queue_ack; + + /* Ensure s_rdma_ack_cnt changes are committed */ + smp_read_barrier_depends(); + if (qp->s_rdma_ack_cnt) + goto queue_ack; + + /* Construct the header */ + /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */ + hwords = 6; + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { + hwords += hfi1_make_grh(ibp, &hdr.u.l.grh, + &qp->remote_ah_attr.grh, hwords, 0); + ohdr = &hdr.u.l.oth; + lrh0 = HFI1_LRH_GRH; + } else { + ohdr = &hdr.u.oth; + lrh0 = HFI1_LRH_BTH; + } + /* read pkey_index w/o lock (its atomic) */ + bth0 = hfi1_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24); + if (qp->s_mig_state == IB_MIG_MIGRATED) + bth0 |= IB_BTH_MIG_REQ; + if (qp->r_nak_state) + ohdr->u.aeth = cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) | + (qp->r_nak_state << + HFI1_AETH_CREDIT_SHIFT)); + else + ohdr->u.aeth = hfi1_compute_aeth(qp); + sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl]; + /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */ + pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT); + lrh0 |= (sc5 & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4; + hdr.lrh[0] = cpu_to_be16(lrh0); + hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC); + hdr.lrh[3] = cpu_to_be16(ppd->lid | qp->remote_ah_attr.src_path_bits); + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); + ohdr->bth[1] |= cpu_to_be32((!!is_fecn) << HFI1_BECN_SHIFT); + ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn)); + + /* Don't try to send ACKs if the link isn't ACTIVE */ + if (driver_lstate(ppd) != IB_PORT_ACTIVE) + return; + + sc = rcd->sc; + plen = 2 /* PBC */ + hwords; + vl = sc_to_vlt(ppd->dd, sc5); + pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen); + + pbuf = sc_buffer_alloc(sc, plen, NULL, NULL); + if (!pbuf) { + /* + * We have no room to send at the moment. Pass + * responsibility for sending the ACK to the send tasklet + * so that when enough buffer space becomes available, + * the ACK is sent ahead of other outgoing packets. + */ + goto queue_ack; + } + + trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &hdr); + + /* write the pbc and data */ + ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, &hdr, hwords); + + return; + +queue_ack: + this_cpu_inc(*ibp->rc_qacks); + spin_lock(&qp->s_lock); + qp->s_flags |= HFI1_S_ACK_PENDING | HFI1_S_RESP_PENDING; + qp->s_nak_state = qp->r_nak_state; + qp->s_ack_psn = qp->r_ack_psn; + if (is_fecn) + qp->s_flags |= HFI1_S_ECN; + + /* Schedule the send tasklet. */ + hfi1_schedule_send(qp); + spin_unlock(&qp->s_lock); +} + +/** + * reset_psn - reset the QP state to send starting from PSN + * @qp: the QP + * @psn: the packet sequence number to restart at + * + * This is called from hfi1_rc_rcv() to process an incoming RC ACK + * for the given QP. + * Called at interrupt level with the QP s_lock held. + */ +static void reset_psn(struct hfi1_qp *qp, u32 psn) +{ + u32 n = qp->s_acked; + struct hfi1_swqe *wqe = get_swqe_ptr(qp, n); + u32 opcode; + + qp->s_cur = n; + + /* + * If we are starting the request from the beginning, + * let the normal send code handle initialization. + */ + if (cmp_psn(psn, wqe->psn) <= 0) { + qp->s_state = OP(SEND_LAST); + goto done; + } + + /* Find the work request opcode corresponding to the given PSN. */ + opcode = wqe->wr.opcode; + for (;;) { + int diff; + + if (++n == qp->s_size) + n = 0; + if (n == qp->s_tail) + break; + wqe = get_swqe_ptr(qp, n); + diff = cmp_psn(psn, wqe->psn); + if (diff < 0) + break; + qp->s_cur = n; + /* + * If we are starting the request from the beginning, + * let the normal send code handle initialization. + */ + if (diff == 0) { + qp->s_state = OP(SEND_LAST); + goto done; + } + opcode = wqe->wr.opcode; + } + + /* + * Set the state to restart in the middle of a request. + * Don't change the s_sge, s_cur_sge, or s_cur_size. + * See hfi1_make_rc_req(). + */ + switch (opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_FIRST); + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_LAST); + break; + + case IB_WR_RDMA_READ: + qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); + break; + + default: + /* + * This case shouldn't happen since its only + * one PSN per req. + */ + qp->s_state = OP(SEND_LAST); + } +done: + qp->s_psn = psn; + /* + * Set HFI1_S_WAIT_PSN as rc_complete() may start the timer + * asynchronously before the send tasklet can get scheduled. + * Doing it in hfi1_make_rc_req() is too late. + */ + if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) && + (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) + qp->s_flags |= HFI1_S_WAIT_PSN; + qp->s_flags &= ~HFI1_S_AHG_VALID; +} + +/* + * Back up requester to resend the last un-ACKed request. + * The QP r_lock and s_lock should be held and interrupts disabled. + */ +static void restart_rc(struct hfi1_qp *qp, u32 psn, int wait) +{ + struct hfi1_swqe *wqe = get_swqe_ptr(qp, qp->s_acked); + struct hfi1_ibport *ibp; + + if (qp->s_retry == 0) { + if (qp->s_mig_state == IB_MIG_ARMED) { + hfi1_migrate_qp(qp); + qp->s_retry = qp->s_retry_cnt; + } else if (qp->s_last == qp->s_acked) { + hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); + hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR); + return; + } else /* need to handle delayed completion */ + return; + } else + qp->s_retry--; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + if (wqe->wr.opcode == IB_WR_RDMA_READ) + ibp->n_rc_resends++; + else + ibp->n_rc_resends += delta_psn(qp->s_psn, psn); + + qp->s_flags &= ~(HFI1_S_WAIT_FENCE | HFI1_S_WAIT_RDMAR | + HFI1_S_WAIT_SSN_CREDIT | HFI1_S_WAIT_PSN | + HFI1_S_WAIT_ACK); + if (wait) + qp->s_flags |= HFI1_S_SEND_ONE; + reset_psn(qp, psn); +} + +/* + * This is called from s_timer for missing responses. + */ +static void rc_timeout(unsigned long arg) +{ + struct hfi1_qp *qp = (struct hfi1_qp *)arg; + struct hfi1_ibport *ibp; + unsigned long flags; + + spin_lock_irqsave(&qp->r_lock, flags); + spin_lock(&qp->s_lock); + if (qp->s_flags & HFI1_S_TIMER) { + ibp = to_iport(qp->ibqp.device, qp->port_num); + ibp->n_rc_timeouts++; + qp->s_flags &= ~HFI1_S_TIMER; + del_timer(&qp->s_timer); + restart_rc(qp, qp->s_last_psn + 1, 1); + hfi1_schedule_send(qp); + } + spin_unlock(&qp->s_lock); + spin_unlock_irqrestore(&qp->r_lock, flags); +} + +/* + * This is called from s_timer for RNR timeouts. + */ +void hfi1_rc_rnr_retry(unsigned long arg) +{ + struct hfi1_qp *qp = (struct hfi1_qp *)arg; + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_flags & HFI1_S_WAIT_RNR) { + qp->s_flags &= ~HFI1_S_WAIT_RNR; + del_timer(&qp->s_timer); + hfi1_schedule_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, flags); +} + +/* + * Set qp->s_sending_psn to the next PSN after the given one. + * This would be psn+1 except when RDMA reads are present. + */ +static void reset_sending_psn(struct hfi1_qp *qp, u32 psn) +{ + struct hfi1_swqe *wqe; + u32 n = qp->s_last; + + /* Find the work request corresponding to the given PSN. */ + for (;;) { + wqe = get_swqe_ptr(qp, n); + if (cmp_psn(psn, wqe->lpsn) <= 0) { + if (wqe->wr.opcode == IB_WR_RDMA_READ) + qp->s_sending_psn = wqe->lpsn + 1; + else + qp->s_sending_psn = psn + 1; + break; + } + if (++n == qp->s_size) + n = 0; + if (n == qp->s_tail) + break; + } +} + +/* + * This should be called with the QP s_lock held and interrupts disabled. + */ +void hfi1_rc_send_complete(struct hfi1_qp *qp, struct hfi1_ib_header *hdr) +{ + struct hfi1_other_headers *ohdr; + struct hfi1_swqe *wqe; + struct ib_wc wc; + unsigned i; + u32 opcode; + u32 psn; + + if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_OR_FLUSH_SEND)) + return; + + /* Find out where the BTH is */ + if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH) + ohdr = &hdr->u.oth; + else + ohdr = &hdr->u.l.oth; + + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && + opcode <= OP(ATOMIC_ACKNOWLEDGE)) { + WARN_ON(!qp->s_rdma_ack_cnt); + qp->s_rdma_ack_cnt--; + return; + } + + psn = be32_to_cpu(ohdr->bth[2]); + reset_sending_psn(qp, psn); + + /* + * Start timer after a packet requesting an ACK has been sent and + * there are still requests that haven't been acked. + */ + if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail && + !(qp->s_flags & + (HFI1_S_TIMER | HFI1_S_WAIT_RNR | HFI1_S_WAIT_PSN)) && + (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK)) + start_timer(qp); + + while (qp->s_last != qp->s_acked) { + wqe = get_swqe_ptr(qp, qp->s_last); + if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 && + cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) + break; + for (i = 0; i < wqe->wr.num_sge; i++) { + struct hfi1_sge *sge = &wqe->sg_list[i]; + + hfi1_put_mr(sge->mr); + } + /* Post a send completion queue entry if requested. */ + if (!(qp->s_flags & HFI1_S_SIGNAL_REQ_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED)) { + memset(&wc, 0, sizeof(wc)); + wc.wr_id = wqe->wr.wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode]; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + hfi1_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0); + } + if (++qp->s_last >= qp->s_size) + qp->s_last = 0; + } + /* + * If we were waiting for sends to complete before re-sending, + * and they are now complete, restart sending. + */ + trace_hfi1_rc_sendcomplete(qp, psn); + if (qp->s_flags & HFI1_S_WAIT_PSN && + cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { + qp->s_flags &= ~HFI1_S_WAIT_PSN; + qp->s_sending_psn = qp->s_psn; + qp->s_sending_hpsn = qp->s_psn - 1; + hfi1_schedule_send(qp); + } +} + +static inline void update_last_psn(struct hfi1_qp *qp, u32 psn) +{ + qp->s_last_psn = psn; +} + +/* + * Generate a SWQE completion. + * This is similar to hfi1_send_complete but has to check to be sure + * that the SGEs are not being referenced if the SWQE is being resent. + */ +static struct hfi1_swqe *do_rc_completion(struct hfi1_qp *qp, + struct hfi1_swqe *wqe, + struct hfi1_ibport *ibp) +{ + struct ib_wc wc; + unsigned i; + + /* + * Don't decrement refcount and don't generate a + * completion if the SWQE is being resent until the send + * is finished. + */ + if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 || + cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { + for (i = 0; i < wqe->wr.num_sge; i++) { + struct hfi1_sge *sge = &wqe->sg_list[i]; + + hfi1_put_mr(sge->mr); + } + /* Post a send completion queue entry if requested. */ + if (!(qp->s_flags & HFI1_S_SIGNAL_REQ_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED)) { + memset(&wc, 0, sizeof(wc)); + wc.wr_id = wqe->wr.wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode]; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + hfi1_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0); + } + if (++qp->s_last >= qp->s_size) + qp->s_last = 0; + } else { + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + this_cpu_inc(*ibp->rc_delayed_comp); + /* + * If send progress not running attempt to progress + * SDMA queue. + */ + if (ppd->dd->flags & HFI1_HAS_SEND_DMA) { + struct sdma_engine *engine; + u8 sc5; + + /* For now use sc to find engine */ + sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl]; + engine = qp_to_sdma_engine(qp, sc5); + sdma_engine_progress_schedule(engine); + } + } + + qp->s_retry = qp->s_retry_cnt; + update_last_psn(qp, wqe->lpsn); + + /* + * If we are completing a request which is in the process of + * being resent, we can stop re-sending it since we know the + * responder has already seen it. + */ + if (qp->s_acked == qp->s_cur) { + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + qp->s_acked = qp->s_cur; + wqe = get_swqe_ptr(qp, qp->s_cur); + if (qp->s_acked != qp->s_tail) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = wqe->psn; + } + } else { + if (++qp->s_acked >= qp->s_size) + qp->s_acked = 0; + if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur) + qp->s_draining = 0; + wqe = get_swqe_ptr(qp, qp->s_acked); + } + return wqe; +} + +/** + * do_rc_ack - process an incoming RC ACK + * @qp: the QP the ACK came in on + * @psn: the packet sequence number of the ACK + * @opcode: the opcode of the request that resulted in the ACK + * + * This is called from rc_rcv_resp() to process an incoming RC ACK + * for the given QP. + * Called at interrupt level with the QP s_lock held. + * Returns 1 if OK, 0 if current operation should be aborted (NAK). + */ +static int do_rc_ack(struct hfi1_qp *qp, u32 aeth, u32 psn, int opcode, + u64 val, struct hfi1_ctxtdata *rcd) +{ + struct hfi1_ibport *ibp; + enum ib_wc_status status; + struct hfi1_swqe *wqe; + int ret = 0; + u32 ack_psn; + int diff; + + /* Remove QP from retry timer */ + if (qp->s_flags & (HFI1_S_TIMER | HFI1_S_WAIT_RNR)) { + qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_WAIT_RNR); + del_timer(&qp->s_timer); + } + + /* + * Note that NAKs implicitly ACK outstanding SEND and RDMA write + * requests and implicitly NAK RDMA read and atomic requests issued + * before the NAK'ed request. The MSN won't include the NAK'ed + * request but will include an ACK'ed request(s). + */ + ack_psn = psn; + if (aeth >> 29) + ack_psn--; + wqe = get_swqe_ptr(qp, qp->s_acked); + ibp = to_iport(qp->ibqp.device, qp->port_num); + + /* + * The MSN might be for a later WQE than the PSN indicates so + * only complete WQEs that the PSN finishes. + */ + while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) { + /* + * RDMA_READ_RESPONSE_ONLY is a special case since + * we want to generate completion events for everything + * before the RDMA read, copy the data, then generate + * the completion for the read. + */ + if (wqe->wr.opcode == IB_WR_RDMA_READ && + opcode == OP(RDMA_READ_RESPONSE_ONLY) && + diff == 0) { + ret = 1; + goto bail; + } + /* + * If this request is a RDMA read or atomic, and the ACK is + * for a later operation, this ACK NAKs the RDMA read or + * atomic. In other words, only a RDMA_READ_LAST or ONLY + * can ACK a RDMA read and likewise for atomic ops. Note + * that the NAK case can only happen if relaxed ordering is + * used and requests are sent after an RDMA read or atomic + * is sent but before the response is received. + */ + if ((wqe->wr.opcode == IB_WR_RDMA_READ && + (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) || + ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && + (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) { + /* Retry this request. */ + if (!(qp->r_flags & HFI1_R_RDMAR_SEQ)) { + qp->r_flags |= HFI1_R_RDMAR_SEQ; + restart_rc(qp, qp->s_last_psn + 1, 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= HFI1_R_RSP_SEND; + atomic_inc(&qp->refcount); + list_add_tail(&qp->rspwait, + &rcd->qp_wait_list); + } + } + /* + * No need to process the ACK/NAK since we are + * restarting an earlier request. + */ + goto bail; + } + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + u64 *vaddr = wqe->sg_list[0].vaddr; + *vaddr = val; + } + if (qp->s_num_rd_atomic && + (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) { + qp->s_num_rd_atomic--; + /* Restart sending task if fence is complete */ + if ((qp->s_flags & HFI1_S_WAIT_FENCE) && + !qp->s_num_rd_atomic) { + qp->s_flags &= ~(HFI1_S_WAIT_FENCE | + HFI1_S_WAIT_ACK); + hfi1_schedule_send(qp); + } else if (qp->s_flags & HFI1_S_WAIT_RDMAR) { + qp->s_flags &= ~(HFI1_S_WAIT_RDMAR | + HFI1_S_WAIT_ACK); + hfi1_schedule_send(qp); + } + } + wqe = do_rc_completion(qp, wqe, ibp); + if (qp->s_acked == qp->s_tail) + break; + } + + switch (aeth >> 29) { + case 0: /* ACK */ + this_cpu_inc(*ibp->rc_acks); + if (qp->s_acked != qp->s_tail) { + /* + * We are expecting more ACKs so + * reset the re-transmit timer. + */ + start_timer(qp); + /* + * We can stop re-sending the earlier packets and + * continue with the next packet the receiver wants. + */ + if (cmp_psn(qp->s_psn, psn) <= 0) + reset_psn(qp, psn + 1); + } else if (cmp_psn(qp->s_psn, psn) <= 0) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = psn + 1; + } + if (qp->s_flags & HFI1_S_WAIT_ACK) { + qp->s_flags &= ~HFI1_S_WAIT_ACK; + hfi1_schedule_send(qp); + } + hfi1_get_credit(qp, aeth); + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + qp->s_retry = qp->s_retry_cnt; + update_last_psn(qp, psn); + ret = 1; + goto bail; + + case 1: /* RNR NAK */ + ibp->n_rnr_naks++; + if (qp->s_acked == qp->s_tail) + goto bail; + if (qp->s_flags & HFI1_S_WAIT_RNR) + goto bail; + if (qp->s_rnr_retry == 0) { + status = IB_WC_RNR_RETRY_EXC_ERR; + goto class_b; + } + if (qp->s_rnr_retry_cnt < 7) + qp->s_rnr_retry--; + + /* The last valid PSN is the previous PSN. */ + update_last_psn(qp, psn - 1); + + ibp->n_rc_resends += delta_psn(qp->s_psn, psn); + + reset_psn(qp, psn); + + qp->s_flags &= ~(HFI1_S_WAIT_SSN_CREDIT | HFI1_S_WAIT_ACK); + qp->s_flags |= HFI1_S_WAIT_RNR; + qp->s_timer.function = hfi1_rc_rnr_retry; + qp->s_timer.expires = jiffies + usecs_to_jiffies( + ib_hfi1_rnr_table[(aeth >> HFI1_AETH_CREDIT_SHIFT) & + HFI1_AETH_CREDIT_MASK]); + add_timer(&qp->s_timer); + goto bail; + + case 3: /* NAK */ + if (qp->s_acked == qp->s_tail) + goto bail; + /* The last valid PSN is the previous PSN. */ + update_last_psn(qp, psn - 1); + switch ((aeth >> HFI1_AETH_CREDIT_SHIFT) & + HFI1_AETH_CREDIT_MASK) { + case 0: /* PSN sequence error */ + ibp->n_seq_naks++; + /* + * Back up to the responder's expected PSN. + * Note that we might get a NAK in the middle of an + * RDMA READ response which terminates the RDMA + * READ. + */ + restart_rc(qp, psn, 0); + hfi1_schedule_send(qp); + break; + + case 1: /* Invalid Request */ + status = IB_WC_REM_INV_REQ_ERR; + ibp->n_other_naks++; + goto class_b; + + case 2: /* Remote Access Error */ + status = IB_WC_REM_ACCESS_ERR; + ibp->n_other_naks++; + goto class_b; + + case 3: /* Remote Operation Error */ + status = IB_WC_REM_OP_ERR; + ibp->n_other_naks++; +class_b: + if (qp->s_last == qp->s_acked) { + hfi1_send_complete(qp, wqe, status); + hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR); + } + break; + + default: + /* Ignore other reserved NAK error codes */ + goto reserved; + } + qp->s_retry = qp->s_retry_cnt; + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + goto bail; + + default: /* 2: reserved */ +reserved: + /* Ignore reserved NAK codes. */ + goto bail; + } + +bail: + return ret; +} + +/* + * We have seen an out of sequence RDMA read middle or last packet. + * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE. + */ +static void rdma_seq_err(struct hfi1_qp *qp, struct hfi1_ibport *ibp, u32 psn, + struct hfi1_ctxtdata *rcd) +{ + struct hfi1_swqe *wqe; + + /* Remove QP from retry timer */ + if (qp->s_flags & (HFI1_S_TIMER | HFI1_S_WAIT_RNR)) { + qp->s_flags &= ~(HFI1_S_TIMER | HFI1_S_WAIT_RNR); + del_timer(&qp->s_timer); + } + + wqe = get_swqe_ptr(qp, qp->s_acked); + + while (cmp_psn(psn, wqe->lpsn) > 0) { + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) + break; + wqe = do_rc_completion(qp, wqe, ibp); + } + + ibp->n_rdma_seq++; + qp->r_flags |= HFI1_R_RDMAR_SEQ; + restart_rc(qp, qp->s_last_psn + 1, 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= HFI1_R_RSP_SEND; + atomic_inc(&qp->refcount); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } +} + +/** + * rc_rcv_resp - process an incoming RC response packet + * @ibp: the port this packet came in on + * @ohdr: the other headers for this packet + * @data: the packet data + * @tlen: the packet length + * @qp: the QP for this packet + * @opcode: the opcode for this packet + * @psn: the packet sequence number for this packet + * @hdrsize: the header length + * @pmtu: the path MTU + * + * This is called from hfi1_rc_rcv() to process an incoming RC response + * packet for the given QP. + * Called at interrupt level. + */ +static void rc_rcv_resp(struct hfi1_ibport *ibp, + struct hfi1_other_headers *ohdr, + void *data, u32 tlen, struct hfi1_qp *qp, + u32 opcode, u32 psn, u32 hdrsize, u32 pmtu, + struct hfi1_ctxtdata *rcd) +{ + struct hfi1_swqe *wqe; + enum ib_wc_status status; + unsigned long flags; + int diff; + u32 pad; + u32 aeth; + u64 val; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Ignore invalid responses. */ + if (cmp_psn(psn, qp->s_next_psn) >= 0) + goto ack_done; + + /* Ignore duplicate responses. */ + diff = cmp_psn(psn, qp->s_last_psn); + if (unlikely(diff <= 0)) { + /* Update credits for "ghost" ACKs */ + if (diff == 0 && opcode == OP(ACKNOWLEDGE)) { + aeth = be32_to_cpu(ohdr->u.aeth); + if ((aeth >> 29) == 0) + hfi1_get_credit(qp, aeth); + } + goto ack_done; + } + + /* + * Skip everything other than the PSN we expect, if we are waiting + * for a reply to a restarted RDMA read or atomic op. + */ + if (qp->r_flags & HFI1_R_RDMAR_SEQ) { + if (cmp_psn(psn, qp->s_last_psn + 1) != 0) + goto ack_done; + qp->r_flags &= ~HFI1_R_RDMAR_SEQ; + } + + if (unlikely(qp->s_acked == qp->s_tail)) + goto ack_done; + wqe = get_swqe_ptr(qp, qp->s_acked); + status = IB_WC_SUCCESS; + + switch (opcode) { + case OP(ACKNOWLEDGE): + case OP(ATOMIC_ACKNOWLEDGE): + case OP(RDMA_READ_RESPONSE_FIRST): + aeth = be32_to_cpu(ohdr->u.aeth); + if (opcode == OP(ATOMIC_ACKNOWLEDGE)) { + __be32 *p = ohdr->u.at.atomic_ack_eth; + + val = ((u64) be32_to_cpu(p[0]) << 32) | + be32_to_cpu(p[1]); + } else + val = 0; + if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) || + opcode != OP(RDMA_READ_RESPONSE_FIRST)) + goto ack_done; + wqe = get_swqe_ptr(qp, qp->s_acked); + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + /* + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. + */ + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_middle; + + case OP(RDMA_READ_RESPONSE_MIDDLE): + /* no AETH, no ACK */ + if (unlikely(cmp_psn(psn, qp->s_last_psn + 1))) + goto ack_seq_err; + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; +read_middle: + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto ack_len_err; + if (unlikely(pmtu >= qp->s_rdma_read_len)) + goto ack_len_err; + + /* + * We got a response so update the timeout. + * 4.096 usec. * (1 << qp->timeout) + */ + qp->s_flags |= HFI1_S_TIMER; + mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies); + if (qp->s_flags & HFI1_S_WAIT_ACK) { + qp->s_flags &= ~HFI1_S_WAIT_ACK; + hfi1_schedule_send(qp); + } + + if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE)) + qp->s_retry = qp->s_retry_cnt; + + /* + * Update the RDMA receive state but do the copy w/o + * holding the locks and blocking interrupts. + */ + qp->s_rdma_read_len -= pmtu; + update_last_psn(qp, psn); + spin_unlock_irqrestore(&qp->s_lock, flags); + hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0); + goto bail; + + case OP(RDMA_READ_RESPONSE_ONLY): + aeth = be32_to_cpu(ohdr->u.aeth); + if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd)) + goto ack_done; + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* + * Check that the data size is >= 0 && <= pmtu. + * Remember to account for ICRC (4). + */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto ack_len_err; + /* + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. + */ + wqe = get_swqe_ptr(qp, qp->s_acked); + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_last; + + case OP(RDMA_READ_RESPONSE_LAST): + /* ACKs READ req. */ + if (unlikely(cmp_psn(psn, qp->s_last_psn + 1))) + goto ack_seq_err; + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* + * Check that the data size is >= 1 && <= pmtu. + * Remember to account for ICRC (4). + */ + if (unlikely(tlen <= (hdrsize + pad + 4))) + goto ack_len_err; +read_last: + tlen -= hdrsize + pad + 4; + if (unlikely(tlen != qp->s_rdma_read_len)) + goto ack_len_err; + aeth = be32_to_cpu(ohdr->u.aeth); + hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0); + WARN_ON(qp->s_rdma_read_sge.num_sge); + (void) do_rc_ack(qp, aeth, psn, + OP(RDMA_READ_RESPONSE_LAST), 0, rcd); + goto ack_done; + } + +ack_op_err: + status = IB_WC_LOC_QP_OP_ERR; + goto ack_err; + +ack_seq_err: + rdma_seq_err(qp, ibp, psn, rcd); + goto ack_done; + +ack_len_err: + status = IB_WC_LOC_LEN_ERR; +ack_err: + if (qp->s_last == qp->s_acked) { + hfi1_send_complete(qp, wqe, status); + hfi1_error_qp(qp, IB_WC_WR_FLUSH_ERR); + } +ack_done: + spin_unlock_irqrestore(&qp->s_lock, flags); +bail: + return; +} + +/** + * rc_rcv_error - process an incoming duplicate or error RC packet + * @ohdr: the other headers for this packet + * @data: the packet data + * @qp: the QP for this packet + * @opcode: the opcode for this packet + * @psn: the packet sequence number for this packet + * @diff: the difference between the PSN and the expected PSN + * + * This is called from hfi1_rc_rcv() to process an unexpected + * incoming RC packet for the given QP. + * Called at interrupt level. + * Return 1 if no more processing is needed; otherwise return 0 to + * schedule a response to be sent. + */ +static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data, + struct hfi1_qp *qp, u32 opcode, u32 psn, int diff, + struct hfi1_ctxtdata *rcd) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_ack_entry *e; + unsigned long flags; + u8 i, prev; + int old_req; + + if (diff > 0) { + /* + * Packet sequence error. + * A NAK will ACK earlier sends and RDMA writes. + * Don't queue the NAK if we already sent one. + */ + if (!qp->r_nak_state) { + ibp->n_rc_seqnak++; + qp->r_nak_state = IB_NAK_PSN_ERROR; + /* Use the expected PSN. */ + qp->r_ack_psn = qp->r_psn; + /* + * Wait to send the sequence NAK until all packets + * in the receive queue have been processed. + * Otherwise, we end up propagating congestion. + */ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= HFI1_R_RSP_NAK; + atomic_inc(&qp->refcount); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } + } + goto done; + } + + /* + * Handle a duplicate request. Don't re-execute SEND, RDMA + * write or atomic op. Don't NAK errors, just silently drop + * the duplicate request. Note that r_sge, r_len, and + * r_rcv_len may be in use so don't modify them. + * + * We are supposed to ACK the earliest duplicate PSN but we + * can coalesce an outstanding duplicate ACK. We have to + * send the earliest so that RDMA reads can be restarted at + * the requester's expected PSN. + * + * First, find where this duplicate PSN falls within the + * ACKs previously sent. + * old_req is true if there is an older response that is scheduled + * to be sent before sending this one. + */ + e = NULL; + old_req = 1; + ibp->n_rc_dupreq++; + + spin_lock_irqsave(&qp->s_lock, flags); + + for (i = qp->r_head_ack_queue; ; i = prev) { + if (i == qp->s_tail_ack_queue) + old_req = 0; + if (i) + prev = i - 1; + else + prev = HFI1_MAX_RDMA_ATOMIC; + if (prev == qp->r_head_ack_queue) { + e = NULL; + break; + } + e = &qp->s_ack_queue[prev]; + if (!e->opcode) { + e = NULL; + break; + } + if (cmp_psn(psn, e->psn) >= 0) { + if (prev == qp->s_tail_ack_queue && + cmp_psn(psn, e->lpsn) <= 0) + old_req = 0; + break; + } + } + switch (opcode) { + case OP(RDMA_READ_REQUEST): { + struct ib_reth *reth; + u32 offset; + u32 len; + + /* + * If we didn't find the RDMA read request in the ack queue, + * we can ignore this request. + */ + if (!e || e->opcode != OP(RDMA_READ_REQUEST)) + goto unlock_done; + /* RETH comes after BTH */ + reth = &ohdr->u.rc.reth; + /* + * Address range must be a subset of the original + * request and start on pmtu boundaries. + * We reuse the old ack_queue slot since the requester + * should not back up and request an earlier PSN for the + * same request. + */ + offset = delta_psn(psn, e->psn) * qp->pmtu; + len = be32_to_cpu(reth->length); + if (unlikely(offset + len != e->rdma_sge.sge_length)) + goto unlock_done; + if (e->rdma_sge.mr) { + hfi1_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + if (len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + ok = hfi1_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey, + IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto unlock_done; + } else { + e->rdma_sge.vaddr = NULL; + e->rdma_sge.length = 0; + e->rdma_sge.sge_length = 0; + } + e->psn = psn; + if (old_req) + goto unlock_done; + qp->s_tail_ack_queue = prev; + break; + } + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): { + /* + * If we didn't find the atomic request in the ack queue + * or the send tasklet is already backed up to send an + * earlier entry, we can ignore this request. + */ + if (!e || e->opcode != (u8) opcode || old_req) + goto unlock_done; + qp->s_tail_ack_queue = prev; + break; + } + + default: + /* + * Ignore this operation if it doesn't request an ACK + * or an earlier RDMA read or atomic is going to be resent. + */ + if (!(psn & IB_BTH_REQ_ACK) || old_req) + goto unlock_done; + /* + * Resend the most recent ACK if this request is + * after all the previous RDMA reads and atomics. + */ + if (i == qp->r_head_ack_queue) { + spin_unlock_irqrestore(&qp->s_lock, flags); + qp->r_nak_state = 0; + qp->r_ack_psn = qp->r_psn - 1; + goto send_ack; + } + + /* + * Resend the RDMA read or atomic op which + * ACKs this duplicate request. + */ + qp->s_tail_ack_queue = i; + break; + } + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_flags |= HFI1_S_RESP_PENDING; + qp->r_nak_state = 0; + hfi1_schedule_send(qp); + +unlock_done: + spin_unlock_irqrestore(&qp->s_lock, flags); +done: + return 1; + +send_ack: + return 0; +} + +void hfi1_rc_error(struct hfi1_qp *qp, enum ib_wc_status err) +{ + unsigned long flags; + int lastwqe; + + spin_lock_irqsave(&qp->s_lock, flags); + lastwqe = hfi1_error_qp(qp, err); + spin_unlock_irqrestore(&qp->s_lock, flags); + + if (lastwqe) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } +} + +static inline void update_ack_queue(struct hfi1_qp *qp, unsigned n) +{ + unsigned next; + + next = n + 1; + if (next > HFI1_MAX_RDMA_ATOMIC) + next = 0; + qp->s_tail_ack_queue = next; + qp->s_ack_state = OP(ACKNOWLEDGE); +} + +static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, + u32 lqpn, u32 rqpn, u8 svc_type) +{ + struct opa_hfi1_cong_log_event_internal *cc_event; + + if (sl >= OPA_MAX_SLS) + return; + + spin_lock(&ppd->cc_log_lock); + + ppd->threshold_cong_event_map[sl/8] |= 1 << (sl % 8); + ppd->threshold_event_counter++; + + cc_event = &ppd->cc_events[ppd->cc_log_idx++]; + if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS) + ppd->cc_log_idx = 0; + cc_event->lqpn = lqpn & HFI1_QPN_MASK; + cc_event->rqpn = rqpn & HFI1_QPN_MASK; + cc_event->sl = sl; + cc_event->svc_type = svc_type; + cc_event->rlid = rlid; + /* keep timestamp in units of 1.024 usec */ + cc_event->timestamp = ktime_to_ns(ktime_get()) / 1024; + + spin_unlock(&ppd->cc_log_lock); +} + +void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, + u32 rqpn, u8 svc_type) +{ + struct cca_timer *cca_timer; + u16 ccti, ccti_incr, ccti_timer, ccti_limit; + u8 trigger_threshold; + struct cc_state *cc_state; + + if (sl >= OPA_MAX_SLS) + return; + + cca_timer = &ppd->cca_timer[sl]; + + cc_state = get_cc_state(ppd); + + if (cc_state == NULL) + return; + + /* + * 1) increase CCTI (for this SL) + * 2) select IPG (i.e., call set_link_ipg()) + * 3) start timer + */ + ccti_limit = cc_state->cct.ccti_limit; + ccti_incr = cc_state->cong_setting.entries[sl].ccti_increase; + ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer; + trigger_threshold = + cc_state->cong_setting.entries[sl].trigger_threshold; + + spin_lock(&ppd->cca_timer_lock); + + if (cca_timer->ccti < ccti_limit) { + if (cca_timer->ccti + ccti_incr <= ccti_limit) + cca_timer->ccti += ccti_incr; + else + cca_timer->ccti = ccti_limit; + set_link_ipg(ppd); + } + + spin_unlock(&ppd->cca_timer_lock); + + ccti = cca_timer->ccti; + + if (!hrtimer_active(&cca_timer->hrtimer)) { + /* ccti_timer is in units of 1.024 usec */ + unsigned long nsec = 1024 * ccti_timer; + + hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec), + HRTIMER_MODE_REL); + } + + if ((trigger_threshold != 0) && (ccti >= trigger_threshold)) + log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type); +} + +/** + * hfi1_rc_rcv - process an incoming RC packet + * @rcd: the context pointer + * @hdr: the header of this packet + * @rcv_flags: flags relevant to rcv processing + * @data: the packet data + * @tlen: the packet length + * @qp: the QP for this packet + * + * This is called from qp_rcv() to process an incoming RC packet + * for the given QP. + * Called at interrupt level. + */ +void hfi1_rc_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct hfi1_ib_header *hdr = packet->hdr; + u32 rcv_flags = packet->rcv_flags; + void *data = packet->ebuf; + u32 tlen = packet->tlen; + struct hfi1_qp *qp = packet->qp; + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_other_headers *ohdr = packet->ohdr; + u32 bth0, opcode; + u32 hdrsize = packet->hlen; + u32 psn; + u32 pad; + struct ib_wc wc; + u32 pmtu = qp->pmtu; + int diff; + struct ib_reth *reth; + unsigned long flags; + u32 bth1; + int ret, is_fecn = 0; + + bth0 = be32_to_cpu(ohdr->bth[0]); + if (hfi1_ruc_check_hdr(ibp, hdr, rcv_flags & HFI1_HAS_GRH, qp, bth0)) + return; + + bth1 = be32_to_cpu(ohdr->bth[1]); + if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) { + if (bth1 & HFI1_BECN_SMASK) { + u16 rlid = qp->remote_ah_attr.dlid; + u32 lqpn, rqpn; + + lqpn = qp->ibqp.qp_num; + rqpn = qp->remote_qpn; + process_becn( + ppd, + qp->remote_ah_attr.sl, + rlid, lqpn, rqpn, + IB_CC_SVCTYPE_RC); + } + is_fecn = bth1 & HFI1_FECN_SMASK; + } + + psn = be32_to_cpu(ohdr->bth[2]); + opcode = bth0 >> 24; + + /* + * Process responses (ACKs) before anything else. Note that the + * packet sequence number will be for something in the send work + * queue rather than the expected receive packet sequence number. + * In other words, this QP is the requester. + */ + if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && + opcode <= OP(ATOMIC_ACKNOWLEDGE)) { + rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn, + hdrsize, pmtu, rcd); + if (is_fecn) + goto send_ack; + return; + } + + /* Compute 24 bits worth of difference. */ + diff = delta_psn(psn, qp->r_psn); + if (unlikely(diff)) { + if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd)) + return; + goto send_ack; + } + + /* Check for opcode sequence errors. */ + switch (qp->r_state) { + case OP(SEND_FIRST): + case OP(SEND_MIDDLE): + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE)) + break; + goto nack_inv; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_MIDDLE): + if (opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + break; + goto nack_inv; + + default: + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + goto nack_inv; + /* + * Note that it is up to the requester to not send a new + * RDMA read or atomic operation before receiving an ACK + * for the previous operation. + */ + break; + } + + if (qp->state == IB_QPS_RTR && !(qp->r_flags & HFI1_R_COMM_EST)) + qp_comm_est(qp); + + /* OK, process the packet. */ + switch (opcode) { + case OP(SEND_FIRST): + ret = hfi1_get_rwqe(qp, 0); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + qp->r_rcv_len = 0; + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + case OP(RDMA_WRITE_MIDDLE): +send_middle: + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto nack_inv; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto nack_inv; + hfi1_copy_sge(&qp->r_sge, data, pmtu, 1); + break; + + case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): + /* consume RWQE */ + ret = hfi1_get_rwqe(qp, 1); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + goto send_last_imm; + + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + ret = hfi1_get_rwqe(qp, 0); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + qp->r_rcv_len = 0; + if (opcode == OP(SEND_ONLY)) + goto no_immediate_data; + /* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */ + case OP(SEND_LAST_WITH_IMMEDIATE): +send_last_imm: + wc.ex.imm_data = ohdr->u.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + goto send_last; + case OP(SEND_LAST): + case OP(RDMA_WRITE_LAST): +no_immediate_data: + wc.wc_flags = 0; + wc.ex.imm_data = 0; +send_last: + /* Get the number of bytes the message was padded by. */ + pad = (bth0 >> 20) & 3; + /* Check for invalid length. */ + /* LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto nack_inv; + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + wc.byte_len = tlen + qp->r_rcv_len; + if (unlikely(wc.byte_len > qp->r_len)) + goto nack_inv; + hfi1_copy_sge(&qp->r_sge, data, tlen, 1); + hfi1_put_ss(&qp->r_sge); + qp->r_msn++; + if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags)) + break; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + /* + * It seems that IB mandates the presence of an SL in a + * work completion only for the UD transport (see section + * 11.4.2 of IBTA Vol. 1). + * + * However, the way the SL is chosen below is consistent + * with the way that IB/qib works and is trying avoid + * introducing incompatibilities. + * + * See also OPA Vol. 1, section 9.7.6, and table 9-17. + */ + wc.sl = qp->remote_ah_attr.sl; + /* zero fields that are N/A */ + wc.vendor_err = 0; + wc.pkey_index = 0; + wc.dlid_path_bits = 0; + wc.port_num = 0; + /* Signal completion event if the solicited bit is set. */ + hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + (bth0 & IB_BTH_SOLICITED) != 0); + break; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto nack_inv; + /* consume RWQE */ + reth = &ohdr->u.rc.reth; + qp->r_len = be32_to_cpu(reth->length); + qp->r_rcv_len = 0; + qp->r_sge.sg_list = NULL; + if (qp->r_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey & NAK */ + ok = hfi1_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr, + rkey, IB_ACCESS_REMOTE_WRITE); + if (unlikely(!ok)) + goto nack_acc; + qp->r_sge.num_sge = 1; + } else { + qp->r_sge.num_sge = 0; + qp->r_sge.sge.mr = NULL; + qp->r_sge.sge.vaddr = NULL; + qp->r_sge.sge.length = 0; + qp->r_sge.sge.sge_length = 0; + } + if (opcode == OP(RDMA_WRITE_FIRST)) + goto send_middle; + else if (opcode == OP(RDMA_WRITE_ONLY)) + goto no_immediate_data; + ret = hfi1_get_rwqe(qp, 1); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + wc.ex.imm_data = ohdr->u.rc.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + goto send_last; + + case OP(RDMA_READ_REQUEST): { + struct hfi1_ack_entry *e; + u32 len; + u8 next; + + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto nack_inv; + next = qp->r_head_ack_queue + 1; + /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */ + if (next > HFI1_MAX_RDMA_ATOMIC) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + if (unlikely(next == qp->s_tail_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlck; + update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) { + hfi1_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + reth = &ohdr->u.rc.reth; + len = be32_to_cpu(reth->length); + if (len) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey & NAK */ + ok = hfi1_rkey_ok(qp, &e->rdma_sge, len, vaddr, + rkey, IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto nack_acc_unlck; + /* + * Update the next expected PSN. We add 1 later + * below, so only add the remainder here. + */ + if (len > pmtu) + qp->r_psn += (len - 1) / pmtu; + } else { + e->rdma_sge.mr = NULL; + e->rdma_sge.vaddr = NULL; + e->rdma_sge.length = 0; + e->rdma_sge.sge_length = 0; + } + e->opcode = opcode; + e->sent = 0; + e->psn = psn; + e->lpsn = qp->r_psn; + /* + * We need to increment the MSN here instead of when we + * finish sending the result since a duplicate request would + * increment it more than once. + */ + qp->r_msn++; + qp->r_psn++; + qp->r_state = opcode; + qp->r_nak_state = 0; + qp->r_head_ack_queue = next; + + /* Schedule the send tasklet. */ + qp->s_flags |= HFI1_S_RESP_PENDING; + hfi1_schedule_send(qp); + + spin_unlock_irqrestore(&qp->s_lock, flags); + if (is_fecn) + goto send_ack; + return; + } + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): { + struct ib_atomic_eth *ateth; + struct hfi1_ack_entry *e; + u64 vaddr; + atomic64_t *maddr; + u64 sdata; + u32 rkey; + u8 next; + + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + goto nack_inv; + next = qp->r_head_ack_queue + 1; + if (next > HFI1_MAX_RDMA_ATOMIC) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + if (unlikely(next == qp->s_tail_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlck; + update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) { + hfi1_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + ateth = &ohdr->u.atomic_eth; + vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) | + be32_to_cpu(ateth->vaddr[1]); + if (unlikely(vaddr & (sizeof(u64) - 1))) + goto nack_inv_unlck; + rkey = be32_to_cpu(ateth->rkey); + /* Check rkey & NAK */ + if (unlikely(!hfi1_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), + vaddr, rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto nack_acc_unlck; + /* Perform atomic OP and save result. */ + maddr = (atomic64_t *) qp->r_sge.sge.vaddr; + sdata = be64_to_cpu(ateth->swap_data); + e->atomic_data = (opcode == OP(FETCH_ADD)) ? + (u64) atomic64_add_return(sdata, maddr) - sdata : + (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, + be64_to_cpu(ateth->compare_data), + sdata); + hfi1_put_mr(qp->r_sge.sge.mr); + qp->r_sge.num_sge = 0; + e->opcode = opcode; + e->sent = 0; + e->psn = psn; + e->lpsn = psn; + qp->r_msn++; + qp->r_psn++; + qp->r_state = opcode; + qp->r_nak_state = 0; + qp->r_head_ack_queue = next; + + /* Schedule the send tasklet. */ + qp->s_flags |= HFI1_S_RESP_PENDING; + hfi1_schedule_send(qp); + + spin_unlock_irqrestore(&qp->s_lock, flags); + if (is_fecn) + goto send_ack; + return; + } + + default: + /* NAK unknown opcodes. */ + goto nack_inv; + } + qp->r_psn++; + qp->r_state = opcode; + qp->r_ack_psn = psn; + qp->r_nak_state = 0; + /* Send an ACK if requested or required. */ + if (psn & (1 << 31)) + goto send_ack; + return; + +rnr_nak: + qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer; + qp->r_ack_psn = qp->r_psn; + /* Queue RNR NAK for later */ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= HFI1_R_RSP_NAK; + atomic_inc(&qp->refcount); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } + return; + +nack_op_err: + hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR; + qp->r_ack_psn = qp->r_psn; + /* Queue NAK for later */ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= HFI1_R_RSP_NAK; + atomic_inc(&qp->refcount); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } + return; + +nack_inv_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_inv: + hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = IB_NAK_INVALID_REQUEST; + qp->r_ack_psn = qp->r_psn; + /* Queue NAK for later */ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= HFI1_R_RSP_NAK; + atomic_inc(&qp->refcount); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } + return; + +nack_acc_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_acc: + hfi1_rc_error(qp, IB_WC_LOC_PROT_ERR); + qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; + qp->r_ack_psn = qp->r_psn; +send_ack: + hfi1_send_rc_ack(rcd, qp, is_fecn); +} + +void hfi1_rc_hdrerr( + struct hfi1_ctxtdata *rcd, + struct hfi1_ib_header *hdr, + u32 rcv_flags, + struct hfi1_qp *qp) +{ + int has_grh = rcv_flags & HFI1_HAS_GRH; + struct hfi1_other_headers *ohdr; + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + int diff; + u8 opcode; + u32 psn; + + /* Check for GRH */ + ohdr = &hdr->u.oth; + if (has_grh) + ohdr = &hdr->u.l.oth; + + opcode = be32_to_cpu(ohdr->bth[0]); + if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode)) + return; + + psn = be32_to_cpu(ohdr->bth[2]); + opcode >>= 24; + + /* Only deal with RDMA Writes for now */ + if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) { + diff = delta_psn(psn, qp->r_psn); + if (!qp->r_nak_state && diff >= 0) { + ibp->n_rc_seqnak++; + qp->r_nak_state = IB_NAK_PSN_ERROR; + /* Use the expected PSN. */ + qp->r_ack_psn = qp->r_psn; + /* + * Wait to send the sequence + * NAK until all packets + * in the receive queue have + * been processed. + * Otherwise, we end up + * propagating congestion. + */ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= HFI1_R_RSP_NAK; + atomic_inc(&qp->refcount); + list_add_tail( + &qp->rspwait, + &rcd->qp_wait_list); + } + } /* Out of sequence NAK */ + } /* QP Request NAKs */ +} diff --git a/drivers/staging/rdma/hfi1/ruc.c b/drivers/staging/rdma/hfi1/ruc.c new file mode 100644 index 000000000..a4115288d --- /dev/null +++ b/drivers/staging/rdma/hfi1/ruc.c @@ -0,0 +1,948 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include + +#include "hfi.h" +#include "mad.h" +#include "qp.h" +#include "sdma.h" + +/* + * Convert the AETH RNR timeout code into the number of microseconds. + */ +const u32 ib_hfi1_rnr_table[32] = { + 655360, /* 00: 655.36 */ + 10, /* 01: .01 */ + 20, /* 02 .02 */ + 30, /* 03: .03 */ + 40, /* 04: .04 */ + 60, /* 05: .06 */ + 80, /* 06: .08 */ + 120, /* 07: .12 */ + 160, /* 08: .16 */ + 240, /* 09: .24 */ + 320, /* 0A: .32 */ + 480, /* 0B: .48 */ + 640, /* 0C: .64 */ + 960, /* 0D: .96 */ + 1280, /* 0E: 1.28 */ + 1920, /* 0F: 1.92 */ + 2560, /* 10: 2.56 */ + 3840, /* 11: 3.84 */ + 5120, /* 12: 5.12 */ + 7680, /* 13: 7.68 */ + 10240, /* 14: 10.24 */ + 15360, /* 15: 15.36 */ + 20480, /* 16: 20.48 */ + 30720, /* 17: 30.72 */ + 40960, /* 18: 40.96 */ + 61440, /* 19: 61.44 */ + 81920, /* 1A: 81.92 */ + 122880, /* 1B: 122.88 */ + 163840, /* 1C: 163.84 */ + 245760, /* 1D: 245.76 */ + 327680, /* 1E: 327.68 */ + 491520 /* 1F: 491.52 */ +}; + +/* + * Validate a RWQE and fill in the SGE state. + * Return 1 if OK. + */ +static int init_sge(struct hfi1_qp *qp, struct hfi1_rwqe *wqe) +{ + int i, j, ret; + struct ib_wc wc; + struct hfi1_lkey_table *rkt; + struct hfi1_pd *pd; + struct hfi1_sge_state *ss; + + rkt = &to_idev(qp->ibqp.device)->lk_table; + pd = to_ipd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd); + ss = &qp->r_sge; + ss->sg_list = qp->r_sg_list; + qp->r_len = 0; + for (i = j = 0; i < wqe->num_sge; i++) { + if (wqe->sg_list[i].length == 0) + continue; + /* Check LKEY */ + if (!hfi1_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, + &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) + goto bad_lkey; + qp->r_len += wqe->sg_list[i].length; + j++; + } + ss->num_sge = j; + ss->total_len = qp->r_len; + ret = 1; + goto bail; + +bad_lkey: + while (j) { + struct hfi1_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge; + + hfi1_put_mr(sge->mr); + } + ss->num_sge = 0; + memset(&wc, 0, sizeof(wc)); + wc.wr_id = wqe->wr_id; + wc.status = IB_WC_LOC_PROT_ERR; + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + /* Signal solicited completion event. */ + hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); + ret = 0; +bail: + return ret; +} + +/** + * hfi1_get_rwqe - copy the next RWQE into the QP's RWQE + * @qp: the QP + * @wr_id_only: update qp->r_wr_id only, not qp->r_sge + * + * Return -1 if there is a local error, 0 if no RWQE is available, + * otherwise return 1. + * + * Can be called from interrupt level. + */ +int hfi1_get_rwqe(struct hfi1_qp *qp, int wr_id_only) +{ + unsigned long flags; + struct hfi1_rq *rq; + struct hfi1_rwq *wq; + struct hfi1_srq *srq; + struct hfi1_rwqe *wqe; + void (*handler)(struct ib_event *, void *); + u32 tail; + int ret; + + if (qp->ibqp.srq) { + srq = to_isrq(qp->ibqp.srq); + handler = srq->ibsrq.event_handler; + rq = &srq->rq; + } else { + srq = NULL; + handler = NULL; + rq = &qp->r_rq; + } + + spin_lock_irqsave(&rq->lock, flags); + if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK)) { + ret = 0; + goto unlock; + } + + wq = rq->wq; + tail = wq->tail; + /* Validate tail before using it since it is user writable. */ + if (tail >= rq->size) + tail = 0; + if (unlikely(tail == wq->head)) { + ret = 0; + goto unlock; + } + /* Make sure entry is read after head index is read. */ + smp_rmb(); + wqe = get_rwqe_ptr(rq, tail); + /* + * Even though we update the tail index in memory, the verbs + * consumer is not supposed to post more entries until a + * completion is generated. + */ + if (++tail >= rq->size) + tail = 0; + wq->tail = tail; + if (!wr_id_only && !init_sge(qp, wqe)) { + ret = -1; + goto unlock; + } + qp->r_wr_id = wqe->wr_id; + + ret = 1; + set_bit(HFI1_R_WRID_VALID, &qp->r_aflags); + if (handler) { + u32 n; + + /* + * Validate head pointer value and compute + * the number of remaining WQEs. + */ + n = wq->head; + if (n >= rq->size) + n = 0; + if (n < tail) + n += rq->size - tail; + else + n -= tail; + if (n < srq->limit) { + struct ib_event ev; + + srq->limit = 0; + spin_unlock_irqrestore(&rq->lock, flags); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + handler(&ev, srq->ibsrq.srq_context); + goto bail; + } + } +unlock: + spin_unlock_irqrestore(&rq->lock, flags); +bail: + return ret; +} + +/* + * Switch to alternate path. + * The QP s_lock should be held and interrupts disabled. + */ +void hfi1_migrate_qp(struct hfi1_qp *qp) +{ + struct ib_event ev; + + qp->s_mig_state = IB_MIG_MIGRATED; + qp->remote_ah_attr = qp->alt_ah_attr; + qp->port_num = qp->alt_ah_attr.port_num; + qp->s_pkey_index = qp->s_alt_pkey_index; + qp->s_flags |= HFI1_S_AHG_CLEAR; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_PATH_MIG; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); +} + +static __be64 get_sguid(struct hfi1_ibport *ibp, unsigned index) +{ + if (!index) { + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + return cpu_to_be64(ppd->guid); + } + return ibp->guids[index - 1]; +} + +static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id) +{ + return (gid->global.interface_id == id && + (gid->global.subnet_prefix == gid_prefix || + gid->global.subnet_prefix == IB_DEFAULT_GID_PREFIX)); +} + +/* + * + * This should be called with the QP r_lock held. + * + * The s_lock will be acquired around the hfi1_migrate_qp() call. + */ +int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr, + int has_grh, struct hfi1_qp *qp, u32 bth0) +{ + __be64 guid; + unsigned long flags; + u8 sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl]; + + if (qp->s_mig_state == IB_MIG_ARMED && (bth0 & IB_BTH_MIG_REQ)) { + if (!has_grh) { + if (qp->alt_ah_attr.ah_flags & IB_AH_GRH) + goto err; + } else { + if (!(qp->alt_ah_attr.ah_flags & IB_AH_GRH)) + goto err; + guid = get_sguid(ibp, qp->alt_ah_attr.grh.sgid_index); + if (!gid_ok(&hdr->u.l.grh.dgid, ibp->gid_prefix, guid)) + goto err; + if (!gid_ok(&hdr->u.l.grh.sgid, + qp->alt_ah_attr.grh.dgid.global.subnet_prefix, + qp->alt_ah_attr.grh.dgid.global.interface_id)) + goto err; + } + if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0, + sc5, be16_to_cpu(hdr->lrh[3])))) { + hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY, + (u16)bth0, + (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF, + 0, qp->ibqp.qp_num, + hdr->lrh[3], hdr->lrh[1]); + goto err; + } + /* Validate the SLID. See Ch. 9.6.1.5 and 17.2.8 */ + if (be16_to_cpu(hdr->lrh[3]) != qp->alt_ah_attr.dlid || + ppd_from_ibp(ibp)->port != qp->alt_ah_attr.port_num) + goto err; + spin_lock_irqsave(&qp->s_lock, flags); + hfi1_migrate_qp(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + } else { + if (!has_grh) { + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + goto err; + } else { + if (!(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) + goto err; + guid = get_sguid(ibp, + qp->remote_ah_attr.grh.sgid_index); + if (!gid_ok(&hdr->u.l.grh.dgid, ibp->gid_prefix, guid)) + goto err; + if (!gid_ok(&hdr->u.l.grh.sgid, + qp->remote_ah_attr.grh.dgid.global.subnet_prefix, + qp->remote_ah_attr.grh.dgid.global.interface_id)) + goto err; + } + if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0, + sc5, be16_to_cpu(hdr->lrh[3])))) { + hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY, + (u16)bth0, + (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF, + 0, qp->ibqp.qp_num, + hdr->lrh[3], hdr->lrh[1]); + goto err; + } + /* Validate the SLID. See Ch. 9.6.1.5 */ + if (be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid || + ppd_from_ibp(ibp)->port != qp->port_num) + goto err; + if (qp->s_mig_state == IB_MIG_REARM && + !(bth0 & IB_BTH_MIG_REQ)) + qp->s_mig_state = IB_MIG_ARMED; + } + + return 0; + +err: + return 1; +} + +/** + * ruc_loopback - handle UC and RC loopback requests + * @sqp: the sending QP + * + * This is called from hfi1_do_send() to + * forward a WQE addressed to the same HFI. + * Note that although we are single threaded due to the tasklet, we still + * have to protect against post_send(). We don't have to worry about + * receive interrupts since this is a connected protocol and all packets + * will pass through here. + */ +static void ruc_loopback(struct hfi1_qp *sqp) +{ + struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); + struct hfi1_qp *qp; + struct hfi1_swqe *wqe; + struct hfi1_sge *sge; + unsigned long flags; + struct ib_wc wc; + u64 sdata; + atomic64_t *maddr; + enum ib_wc_status send_status; + int release; + int ret; + + rcu_read_lock(); + + /* + * Note that we check the responder QP state after + * checking the requester's state. + */ + qp = hfi1_lookup_qpn(ibp, sqp->remote_qpn); + + spin_lock_irqsave(&sqp->s_lock, flags); + + /* Return if we are already busy processing a work request. */ + if ((sqp->s_flags & (HFI1_S_BUSY | HFI1_S_ANY_WAIT)) || + !(ib_hfi1_state_ops[sqp->state] & HFI1_PROCESS_OR_FLUSH_SEND)) + goto unlock; + + sqp->s_flags |= HFI1_S_BUSY; + +again: + if (sqp->s_last == sqp->s_head) + goto clr_busy; + wqe = get_swqe_ptr(sqp, sqp->s_last); + + /* Return if it is not OK to start a new work request. */ + if (!(ib_hfi1_state_ops[sqp->state] & HFI1_PROCESS_NEXT_SEND_OK)) { + if (!(ib_hfi1_state_ops[sqp->state] & HFI1_FLUSH_SEND)) + goto clr_busy; + /* We are in the error state, flush the work request. */ + send_status = IB_WC_WR_FLUSH_ERR; + goto flush_send; + } + + /* + * We can rely on the entry not changing without the s_lock + * being held until we update s_last. + * We increment s_cur to indicate s_last is in progress. + */ + if (sqp->s_last == sqp->s_cur) { + if (++sqp->s_cur >= sqp->s_size) + sqp->s_cur = 0; + } + spin_unlock_irqrestore(&sqp->s_lock, flags); + + if (!qp || !(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) || + qp->ibqp.qp_type != sqp->ibqp.qp_type) { + ibp->n_pkt_drops++; + /* + * For RC, the requester would timeout and retry so + * shortcut the timeouts and just signal too many retries. + */ + if (sqp->ibqp.qp_type == IB_QPT_RC) + send_status = IB_WC_RETRY_EXC_ERR; + else + send_status = IB_WC_SUCCESS; + goto serr; + } + + memset(&wc, 0, sizeof(wc)); + send_status = IB_WC_SUCCESS; + + release = 1; + sqp->s_sge.sge = wqe->sg_list[0]; + sqp->s_sge.sg_list = wqe->sg_list + 1; + sqp->s_sge.num_sge = wqe->wr.num_sge; + sqp->s_len = wqe->length; + switch (wqe->wr.opcode) { + case IB_WR_SEND_WITH_IMM: + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + /* FALLTHROUGH */ + case IB_WR_SEND: + ret = hfi1_get_rwqe(qp, 0); + if (ret < 0) + goto op_err; + if (!ret) + goto rnr_nak; + break; + + case IB_WR_RDMA_WRITE_WITH_IMM: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + ret = hfi1_get_rwqe(qp, 1); + if (ret < 0) + goto op_err; + if (!ret) + goto rnr_nak; + /* FALLTHROUGH */ + case IB_WR_RDMA_WRITE: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; + if (wqe->length == 0) + break; + if (unlikely(!hfi1_rkey_ok(qp, &qp->r_sge.sge, wqe->length, + wqe->wr.wr.rdma.remote_addr, + wqe->wr.wr.rdma.rkey, + IB_ACCESS_REMOTE_WRITE))) + goto acc_err; + qp->r_sge.sg_list = NULL; + qp->r_sge.num_sge = 1; + qp->r_sge.total_len = wqe->length; + break; + + case IB_WR_RDMA_READ: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto inv_err; + if (unlikely(!hfi1_rkey_ok(qp, &sqp->s_sge.sge, wqe->length, + wqe->wr.wr.rdma.remote_addr, + wqe->wr.wr.rdma.rkey, + IB_ACCESS_REMOTE_READ))) + goto acc_err; + release = 0; + sqp->s_sge.sg_list = NULL; + sqp->s_sge.num_sge = 1; + qp->r_sge.sge = wqe->sg_list[0]; + qp->r_sge.sg_list = wqe->sg_list + 1; + qp->r_sge.num_sge = wqe->wr.num_sge; + qp->r_sge.total_len = wqe->length; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + goto inv_err; + if (unlikely(!hfi1_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), + wqe->wr.wr.atomic.remote_addr, + wqe->wr.wr.atomic.rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto acc_err; + /* Perform atomic OP and save result. */ + maddr = (atomic64_t *) qp->r_sge.sge.vaddr; + sdata = wqe->wr.wr.atomic.compare_add; + *(u64 *) sqp->s_sge.sge.vaddr = + (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ? + (u64) atomic64_add_return(sdata, maddr) - sdata : + (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, + sdata, wqe->wr.wr.atomic.swap); + hfi1_put_mr(qp->r_sge.sge.mr); + qp->r_sge.num_sge = 0; + goto send_comp; + + default: + send_status = IB_WC_LOC_QP_OP_ERR; + goto serr; + } + + sge = &sqp->s_sge.sge; + while (sqp->s_len) { + u32 len = sqp->s_len; + + if (len > sge->length) + len = sge->length; + if (len > sge->sge_length) + len = sge->sge_length; + WARN_ON_ONCE(len == 0); + hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (!release) + hfi1_put_mr(sge->mr); + if (--sqp->s_sge.num_sge) + *sge = *sqp->s_sge.sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= HFI1_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + sqp->s_len -= len; + } + if (release) + hfi1_put_ss(&qp->r_sge); + + if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags)) + goto send_comp; + + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + wc.port_num = 1; + /* Signal completion event if the solicited bit is set. */ + hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + wqe->wr.send_flags & IB_SEND_SOLICITED); + +send_comp: + spin_lock_irqsave(&sqp->s_lock, flags); + ibp->n_loop_pkts++; +flush_send: + sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; + hfi1_send_complete(sqp, wqe, send_status); + goto again; + +rnr_nak: + /* Handle RNR NAK */ + if (qp->ibqp.qp_type == IB_QPT_UC) + goto send_comp; + ibp->n_rnr_naks++; + /* + * Note: we don't need the s_lock held since the BUSY flag + * makes this single threaded. + */ + if (sqp->s_rnr_retry == 0) { + send_status = IB_WC_RNR_RETRY_EXC_ERR; + goto serr; + } + if (sqp->s_rnr_retry_cnt < 7) + sqp->s_rnr_retry--; + spin_lock_irqsave(&sqp->s_lock, flags); + if (!(ib_hfi1_state_ops[sqp->state] & HFI1_PROCESS_RECV_OK)) + goto clr_busy; + sqp->s_flags |= HFI1_S_WAIT_RNR; + sqp->s_timer.function = hfi1_rc_rnr_retry; + sqp->s_timer.expires = jiffies + + usecs_to_jiffies(ib_hfi1_rnr_table[qp->r_min_rnr_timer]); + add_timer(&sqp->s_timer); + goto clr_busy; + +op_err: + send_status = IB_WC_REM_OP_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +inv_err: + send_status = IB_WC_REM_INV_REQ_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +acc_err: + send_status = IB_WC_REM_ACCESS_ERR; + wc.status = IB_WC_LOC_PROT_ERR; +err: + /* responder goes to error state */ + hfi1_rc_error(qp, wc.status); + +serr: + spin_lock_irqsave(&sqp->s_lock, flags); + hfi1_send_complete(sqp, wqe, send_status); + if (sqp->ibqp.qp_type == IB_QPT_RC) { + int lastwqe = hfi1_error_qp(sqp, IB_WC_WR_FLUSH_ERR); + + sqp->s_flags &= ~HFI1_S_BUSY; + spin_unlock_irqrestore(&sqp->s_lock, flags); + if (lastwqe) { + struct ib_event ev; + + ev.device = sqp->ibqp.device; + ev.element.qp = &sqp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context); + } + goto done; + } +clr_busy: + sqp->s_flags &= ~HFI1_S_BUSY; +unlock: + spin_unlock_irqrestore(&sqp->s_lock, flags); +done: + rcu_read_unlock(); +} + +/** + * hfi1_make_grh - construct a GRH header + * @ibp: a pointer to the IB port + * @hdr: a pointer to the GRH header being constructed + * @grh: the global route address to send to + * @hwords: the number of 32 bit words of header being sent + * @nwords: the number of 32 bit words of data being sent + * + * Return the size of the header in 32 bit words. + */ +u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr, + struct ib_global_route *grh, u32 hwords, u32 nwords) +{ + hdr->version_tclass_flow = + cpu_to_be32((IB_GRH_VERSION << IB_GRH_VERSION_SHIFT) | + (grh->traffic_class << IB_GRH_TCLASS_SHIFT) | + (grh->flow_label << IB_GRH_FLOW_SHIFT)); + hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2); + /* next_hdr is defined by C8-7 in ch. 8.4.1 */ + hdr->next_hdr = IB_GRH_NEXT_HDR; + hdr->hop_limit = grh->hop_limit; + /* The SGID is 32-bit aligned. */ + hdr->sgid.global.subnet_prefix = ibp->gid_prefix; + hdr->sgid.global.interface_id = + grh->sgid_index && grh->sgid_index < ARRAY_SIZE(ibp->guids) ? + ibp->guids[grh->sgid_index - 1] : + cpu_to_be64(ppd_from_ibp(ibp)->guid); + hdr->dgid = grh->dgid; + + /* GRH header size in 32-bit words. */ + return sizeof(struct ib_grh) / sizeof(u32); +} + +/* + * free_ahg - clear ahg from QP + */ +void clear_ahg(struct hfi1_qp *qp) +{ + qp->s_hdr->ahgcount = 0; + qp->s_flags &= ~(HFI1_S_AHG_VALID | HFI1_S_AHG_CLEAR); + if (qp->s_sde) + sdma_ahg_free(qp->s_sde, qp->s_ahgidx); + qp->s_ahgidx = -1; + qp->s_sde = NULL; +} + +#define BTH2_OFFSET (offsetof(struct hfi1_pio_header, hdr.u.oth.bth[2]) / 4) + +/** + * build_ahg - create ahg in s_hdr + * @qp: a pointer to QP + * @npsn: the next PSN for the request/response + * + * This routine handles the AHG by allocating an ahg entry and causing the + * copy of the first middle. + * + * Subsequent middles use the copied entry, editing the + * PSN with 1 or 2 edits. + */ +static inline void build_ahg(struct hfi1_qp *qp, u32 npsn) +{ + if (unlikely(qp->s_flags & HFI1_S_AHG_CLEAR)) + clear_ahg(qp); + if (!(qp->s_flags & HFI1_S_AHG_VALID)) { + /* first middle that needs copy */ + if (qp->s_ahgidx < 0) { + if (!qp->s_sde) + qp->s_sde = qp_to_sdma_engine(qp, qp->s_sc); + qp->s_ahgidx = sdma_ahg_alloc(qp->s_sde); + } + if (qp->s_ahgidx >= 0) { + qp->s_ahgpsn = npsn; + qp->s_hdr->tx_flags |= SDMA_TXREQ_F_AHG_COPY; + /* save to protect a change in another thread */ + qp->s_hdr->sde = qp->s_sde; + qp->s_hdr->ahgidx = qp->s_ahgidx; + qp->s_flags |= HFI1_S_AHG_VALID; + } + } else { + /* subsequent middle after valid */ + if (qp->s_ahgidx >= 0) { + qp->s_hdr->tx_flags |= SDMA_TXREQ_F_USE_AHG; + qp->s_hdr->ahgidx = qp->s_ahgidx; + qp->s_hdr->ahgcount++; + qp->s_hdr->ahgdesc[0] = + sdma_build_ahg_descriptor( + (__force u16)cpu_to_be16((u16)npsn), + BTH2_OFFSET, + 16, + 16); + if ((npsn & 0xffff0000) != + (qp->s_ahgpsn & 0xffff0000)) { + qp->s_hdr->ahgcount++; + qp->s_hdr->ahgdesc[1] = + sdma_build_ahg_descriptor( + (__force u16)cpu_to_be16( + (u16)(npsn >> 16)), + BTH2_OFFSET, + 0, + 16); + } + } + } +} + +void hfi1_make_ruc_header(struct hfi1_qp *qp, struct hfi1_other_headers *ohdr, + u32 bth0, u32 bth2, int middle) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + u16 lrh0; + u32 nwords; + u32 extra_bytes; + u8 sc5; + u32 bth1; + + /* Construct the header. */ + extra_bytes = -qp->s_cur_size & 3; + nwords = (qp->s_cur_size + extra_bytes) >> 2; + lrh0 = HFI1_LRH_BTH; + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { + qp->s_hdrwords += hfi1_make_grh(ibp, &qp->s_hdr->ibh.u.l.grh, + &qp->remote_ah_attr.grh, + qp->s_hdrwords, nwords); + lrh0 = HFI1_LRH_GRH; + middle = 0; + } + sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl]; + lrh0 |= (sc5 & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4; + qp->s_sc = sc5; + /* + * reset s_hdr/AHG fields + * + * This insures that the ahgentry/ahgcount + * are at a non-AHG default to protect + * build_verbs_tx_desc() from using + * an include ahgidx. + * + * build_ahg() will modify as appropriate + * to use the AHG feature. + */ + qp->s_hdr->tx_flags = 0; + qp->s_hdr->ahgcount = 0; + qp->s_hdr->ahgidx = 0; + qp->s_hdr->sde = NULL; + if (qp->s_mig_state == IB_MIG_MIGRATED) + bth0 |= IB_BTH_MIG_REQ; + else + middle = 0; + if (middle) + build_ahg(qp, bth2); + else + qp->s_flags &= ~HFI1_S_AHG_VALID; + qp->s_hdr->ibh.lrh[0] = cpu_to_be16(lrh0); + qp->s_hdr->ibh.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + qp->s_hdr->ibh.lrh[2] = + cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); + qp->s_hdr->ibh.lrh[3] = cpu_to_be16(ppd_from_ibp(ibp)->lid | + qp->remote_ah_attr.src_path_bits); + bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index); + bth0 |= extra_bytes << 20; + ohdr->bth[0] = cpu_to_be32(bth0); + bth1 = qp->remote_qpn; + if (qp->s_flags & HFI1_S_ECN) { + qp->s_flags &= ~HFI1_S_ECN; + /* we recently received a FECN, so return a BECN */ + bth1 |= (HFI1_BECN_MASK << HFI1_BECN_SHIFT); + } + ohdr->bth[1] = cpu_to_be32(bth1); + ohdr->bth[2] = cpu_to_be32(bth2); +} + +/** + * hfi1_do_send - perform a send on a QP + * @work: contains a pointer to the QP + * + * Process entries in the send work queue until credit or queue is + * exhausted. Only allow one CPU to send a packet per QP (tasklet). + * Otherwise, two threads could send packets out of order. + */ +void hfi1_do_send(struct work_struct *work) +{ + struct iowait *wait = container_of(work, struct iowait, iowork); + struct hfi1_qp *qp = container_of(wait, struct hfi1_qp, s_iowait); + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + int (*make_req)(struct hfi1_qp *qp); + unsigned long flags; + + if ((qp->ibqp.qp_type == IB_QPT_RC || + qp->ibqp.qp_type == IB_QPT_UC) && + !loopback && + (qp->remote_ah_attr.dlid & ~((1 << ppd->lmc) - 1)) == ppd->lid) { + ruc_loopback(qp); + return; + } + + if (qp->ibqp.qp_type == IB_QPT_RC) + make_req = hfi1_make_rc_req; + else if (qp->ibqp.qp_type == IB_QPT_UC) + make_req = hfi1_make_uc_req; + else + make_req = hfi1_make_ud_req; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Return if we are already busy processing a work request. */ + if (!hfi1_send_ok(qp)) { + spin_unlock_irqrestore(&qp->s_lock, flags); + return; + } + + qp->s_flags |= HFI1_S_BUSY; + + spin_unlock_irqrestore(&qp->s_lock, flags); + + do { + /* Check for a constructed packet to be sent. */ + if (qp->s_hdrwords != 0) { + /* + * If the packet cannot be sent now, return and + * the send tasklet will be woken up later. + */ + if (hfi1_verbs_send(qp, qp->s_hdr, qp->s_hdrwords, + qp->s_cur_sge, qp->s_cur_size)) + break; + /* Record that s_hdr is empty. */ + qp->s_hdrwords = 0; + } + } while (make_req(qp)); +} + +/* + * This should be called with s_lock held. + */ +void hfi1_send_complete(struct hfi1_qp *qp, struct hfi1_swqe *wqe, + enum ib_wc_status status) +{ + u32 old_last, last; + unsigned i; + + if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_OR_FLUSH_SEND)) + return; + + for (i = 0; i < wqe->wr.num_sge; i++) { + struct hfi1_sge *sge = &wqe->sg_list[i]; + + hfi1_put_mr(sge->mr); + } + if (qp->ibqp.qp_type == IB_QPT_UD || + qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI) + atomic_dec(&to_iah(wqe->wr.wr.ud.ah)->refcount); + + /* See ch. 11.2.4.1 and 10.7.3.1 */ + if (!(qp->s_flags & HFI1_S_SIGNAL_REQ_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED) || + status != IB_WC_SUCCESS) { + struct ib_wc wc; + + memset(&wc, 0, sizeof(wc)); + wc.wr_id = wqe->wr.wr_id; + wc.status = status; + wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode]; + wc.qp = &qp->ibqp; + if (status == IB_WC_SUCCESS) + wc.byte_len = wqe->length; + hfi1_cq_enter(to_icq(qp->ibqp.send_cq), &wc, + status != IB_WC_SUCCESS); + } + + last = qp->s_last; + old_last = last; + if (++last >= qp->s_size) + last = 0; + qp->s_last = last; + if (qp->s_acked == old_last) + qp->s_acked = last; + if (qp->s_cur == old_last) + qp->s_cur = last; + if (qp->s_tail == old_last) + qp->s_tail = last; + if (qp->state == IB_QPS_SQD && last == qp->s_cur) + qp->s_draining = 0; +} diff --git a/drivers/staging/rdma/hfi1/sdma.c b/drivers/staging/rdma/hfi1/sdma.c new file mode 100644 index 000000000..aecd1a747 --- /dev/null +++ b/drivers/staging/rdma/hfi1/sdma.c @@ -0,0 +1,2962 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "hfi.h" +#include "common.h" +#include "qp.h" +#include "sdma.h" +#include "iowait.h" +#include "trace.h" + +/* must be a power of 2 >= 64 <= 32768 */ +#define SDMA_DESCQ_CNT 1024 +#define INVALID_TAIL 0xffff + +static uint sdma_descq_cnt = SDMA_DESCQ_CNT; +module_param(sdma_descq_cnt, uint, S_IRUGO); +MODULE_PARM_DESC(sdma_descq_cnt, "Number of SDMA descq entries"); + +static uint sdma_idle_cnt = 250; +module_param(sdma_idle_cnt, uint, S_IRUGO); +MODULE_PARM_DESC(sdma_idle_cnt, "sdma interrupt idle delay (ns,default 250)"); + +uint mod_num_sdma; +module_param_named(num_sdma, mod_num_sdma, uint, S_IRUGO); +MODULE_PARM_DESC(num_sdma, "Set max number SDMA engines to use"); + +#define SDMA_WAIT_BATCH_SIZE 20 +/* max wait time for a SDMA engine to indicate it has halted */ +#define SDMA_ERR_HALT_TIMEOUT 10 /* ms */ +/* all SDMA engine errors that cause a halt */ + +#define SD(name) SEND_DMA_##name +#define ALL_SDMA_ENG_HALT_ERRS \ + (SD(ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK)) + +/* sdma_sendctrl operations */ +#define SDMA_SENDCTRL_OP_ENABLE (1U << 0) +#define SDMA_SENDCTRL_OP_INTENABLE (1U << 1) +#define SDMA_SENDCTRL_OP_HALT (1U << 2) +#define SDMA_SENDCTRL_OP_CLEANUP (1U << 3) + +/* handle long defines */ +#define SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \ +SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK +#define SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT \ +SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT + +static const char * const sdma_state_names[] = { + [sdma_state_s00_hw_down] = "s00_HwDown", + [sdma_state_s10_hw_start_up_halt_wait] = "s10_HwStartUpHaltWait", + [sdma_state_s15_hw_start_up_clean_wait] = "s15_HwStartUpCleanWait", + [sdma_state_s20_idle] = "s20_Idle", + [sdma_state_s30_sw_clean_up_wait] = "s30_SwCleanUpWait", + [sdma_state_s40_hw_clean_up_wait] = "s40_HwCleanUpWait", + [sdma_state_s50_hw_halt_wait] = "s50_HwHaltWait", + [sdma_state_s60_idle_halt_wait] = "s60_IdleHaltWait", + [sdma_state_s80_hw_freeze] = "s80_HwFreeze", + [sdma_state_s82_freeze_sw_clean] = "s82_FreezeSwClean", + [sdma_state_s99_running] = "s99_Running", +}; + +static const char * const sdma_event_names[] = { + [sdma_event_e00_go_hw_down] = "e00_GoHwDown", + [sdma_event_e10_go_hw_start] = "e10_GoHwStart", + [sdma_event_e15_hw_halt_done] = "e15_HwHaltDone", + [sdma_event_e25_hw_clean_up_done] = "e25_HwCleanUpDone", + [sdma_event_e30_go_running] = "e30_GoRunning", + [sdma_event_e40_sw_cleaned] = "e40_SwCleaned", + [sdma_event_e50_hw_cleaned] = "e50_HwCleaned", + [sdma_event_e60_hw_halted] = "e60_HwHalted", + [sdma_event_e70_go_idle] = "e70_GoIdle", + [sdma_event_e80_hw_freeze] = "e80_HwFreeze", + [sdma_event_e81_hw_frozen] = "e81_HwFrozen", + [sdma_event_e82_hw_unfreeze] = "e82_HwUnfreeze", + [sdma_event_e85_link_down] = "e85_LinkDown", + [sdma_event_e90_sw_halted] = "e90_SwHalted", +}; + +static const struct sdma_set_state_action sdma_action_table[] = { + [sdma_state_s00_hw_down] = { + .go_s99_running_tofalse = 1, + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s10_hw_start_up_halt_wait] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 1, + .op_cleanup = 0, + }, + [sdma_state_s15_hw_start_up_clean_wait] = { + .op_enable = 0, + .op_intenable = 1, + .op_halt = 0, + .op_cleanup = 1, + }, + [sdma_state_s20_idle] = { + .op_enable = 0, + .op_intenable = 1, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s30_sw_clean_up_wait] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s40_hw_clean_up_wait] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 1, + }, + [sdma_state_s50_hw_halt_wait] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s60_idle_halt_wait] = { + .go_s99_running_tofalse = 1, + .op_enable = 0, + .op_intenable = 0, + .op_halt = 1, + .op_cleanup = 0, + }, + [sdma_state_s80_hw_freeze] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s82_freeze_sw_clean] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s99_running] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 0, + .op_cleanup = 0, + .go_s99_running_totrue = 1, + }, +}; + +#define SDMA_TAIL_UPDATE_THRESH 0x1F + +/* declare all statics here rather than keep sorting */ +static void sdma_complete(struct kref *); +static void sdma_finalput(struct sdma_state *); +static void sdma_get(struct sdma_state *); +static void sdma_hw_clean_up_task(unsigned long); +static void sdma_put(struct sdma_state *); +static void sdma_set_state(struct sdma_engine *, enum sdma_states); +static void sdma_start_hw_clean_up(struct sdma_engine *); +static void sdma_start_sw_clean_up(struct sdma_engine *); +static void sdma_sw_clean_up_task(unsigned long); +static void sdma_sendctrl(struct sdma_engine *, unsigned); +static void init_sdma_regs(struct sdma_engine *, u32, uint); +static void sdma_process_event( + struct sdma_engine *sde, + enum sdma_events event); +static void __sdma_process_event( + struct sdma_engine *sde, + enum sdma_events event); +static void dump_sdma_state(struct sdma_engine *sde); +static void sdma_make_progress(struct sdma_engine *sde, u64 status); +static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail); +static void sdma_flush_descq(struct sdma_engine *sde); + +/** + * sdma_state_name() - return state string from enum + * @state: state + */ +static const char *sdma_state_name(enum sdma_states state) +{ + return sdma_state_names[state]; +} + +static void sdma_get(struct sdma_state *ss) +{ + kref_get(&ss->kref); +} + +static void sdma_complete(struct kref *kref) +{ + struct sdma_state *ss = + container_of(kref, struct sdma_state, kref); + + complete(&ss->comp); +} + +static void sdma_put(struct sdma_state *ss) +{ + kref_put(&ss->kref, sdma_complete); +} + +static void sdma_finalput(struct sdma_state *ss) +{ + sdma_put(ss); + wait_for_completion(&ss->comp); +} + +static inline void write_sde_csr( + struct sdma_engine *sde, + u32 offset0, + u64 value) +{ + write_kctxt_csr(sde->dd, sde->this_idx, offset0, value); +} + +static inline u64 read_sde_csr( + struct sdma_engine *sde, + u32 offset0) +{ + return read_kctxt_csr(sde->dd, sde->this_idx, offset0); +} + +/* + * sdma_wait_for_packet_egress() - wait for the VL FIFO occupancy for + * sdma engine 'sde' to drop to 0. + */ +static void sdma_wait_for_packet_egress(struct sdma_engine *sde, + int pause) +{ + u64 off = 8 * sde->this_idx; + struct hfi1_devdata *dd = sde->dd; + int lcnt = 0; + + while (1) { + u64 reg = read_csr(dd, off + SEND_EGRESS_SEND_DMA_STATUS); + + reg &= SDMA_EGRESS_PACKET_OCCUPANCY_SMASK; + reg >>= SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT; + if (reg == 0) + break; + if (lcnt++ > 100) { + dd_dev_err(dd, "%s: engine %u timeout waiting for packets to egress, remaining count %u\n", + __func__, sde->this_idx, (u32)reg); + break; + } + udelay(1); + } +} + +/* + * sdma_wait() - wait for packet egress to complete for all SDMA engines, + * and pause for credit return. + */ +void sdma_wait(struct hfi1_devdata *dd) +{ + int i; + + for (i = 0; i < dd->num_sdma; i++) { + struct sdma_engine *sde = &dd->per_sdma[i]; + + sdma_wait_for_packet_egress(sde, 0); + } +} + +static inline void sdma_set_desc_cnt(struct sdma_engine *sde, unsigned cnt) +{ + u64 reg; + + if (!(sde->dd->flags & HFI1_HAS_SDMA_TIMEOUT)) + return; + reg = cnt; + reg &= SD(DESC_CNT_CNT_MASK); + reg <<= SD(DESC_CNT_CNT_SHIFT); + write_sde_csr(sde, SD(DESC_CNT), reg); +} + +/* + * Complete all the sdma requests with a SDMA_TXREQ_S_ABORTED status + * + * Depending on timing there can be txreqs in two places: + * - in the descq ring + * - in the flush list + * + * To avoid ordering issues the descq ring needs to be flushed + * first followed by the flush list. + * + * This routine is called from two places + * - From a work queue item + * - Directly from the state machine just before setting the + * state to running + * + * Must be called with head_lock held + * + */ +static void sdma_flush(struct sdma_engine *sde) +{ + struct sdma_txreq *txp, *txp_next; + LIST_HEAD(flushlist); + + /* flush from head to tail */ + sdma_flush_descq(sde); + spin_lock(&sde->flushlist_lock); + /* copy flush list */ + list_for_each_entry_safe(txp, txp_next, &sde->flushlist, list) { + list_del_init(&txp->list); + list_add_tail(&txp->list, &flushlist); + } + spin_unlock(&sde->flushlist_lock); + /* flush from flush list */ + list_for_each_entry_safe(txp, txp_next, &flushlist, list) { + int drained = 0; + /* protect against complete modifying */ + struct iowait *wait = txp->wait; + + list_del_init(&txp->list); +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + trace_hfi1_sdma_out_sn(sde, txp->sn); + if (WARN_ON_ONCE(sde->head_sn != txp->sn)) + dd_dev_err(sde->dd, "expected %llu got %llu\n", + sde->head_sn, txp->sn); + sde->head_sn++; +#endif + sdma_txclean(sde->dd, txp); + if (wait) + drained = atomic_dec_and_test(&wait->sdma_busy); + if (txp->complete) + (*txp->complete)(txp, SDMA_TXREQ_S_ABORTED, drained); + if (wait && drained) + iowait_drain_wakeup(wait); + } +} + +/* + * Fields a work request for flushing the descq ring + * and the flush list + * + * If the engine has been brought to running during + * the scheduling delay, the flush is ignored, assuming + * that the process of bringing the engine to running + * would have done this flush prior to going to running. + * + */ +static void sdma_field_flush(struct work_struct *work) +{ + unsigned long flags; + struct sdma_engine *sde = + container_of(work, struct sdma_engine, flush_worker); + + write_seqlock_irqsave(&sde->head_lock, flags); + if (!__sdma_running(sde)) + sdma_flush(sde); + write_sequnlock_irqrestore(&sde->head_lock, flags); +} + +static void sdma_err_halt_wait(struct work_struct *work) +{ + struct sdma_engine *sde = container_of(work, struct sdma_engine, + err_halt_worker); + u64 statuscsr; + unsigned long timeout; + + timeout = jiffies + msecs_to_jiffies(SDMA_ERR_HALT_TIMEOUT); + while (1) { + statuscsr = read_sde_csr(sde, SD(STATUS)); + statuscsr &= SD(STATUS_ENG_HALTED_SMASK); + if (statuscsr) + break; + if (time_after(jiffies, timeout)) { + dd_dev_err(sde->dd, + "SDMA engine %d - timeout waiting for engine to halt\n", + sde->this_idx); + /* + * Continue anyway. This could happen if there was + * an uncorrectable error in the wrong spot. + */ + break; + } + usleep_range(80, 120); + } + + sdma_process_event(sde, sdma_event_e15_hw_halt_done); +} + +static void sdma_start_err_halt_wait(struct sdma_engine *sde) +{ + schedule_work(&sde->err_halt_worker); +} + + +static void sdma_err_progress_check_schedule(struct sdma_engine *sde) +{ + if (!is_bx(sde->dd) && HFI1_CAP_IS_KSET(SDMA_AHG)) { + + unsigned index; + struct hfi1_devdata *dd = sde->dd; + + for (index = 0; index < dd->num_sdma; index++) { + struct sdma_engine *curr_sdma = &dd->per_sdma[index]; + + if (curr_sdma != sde) + curr_sdma->progress_check_head = + curr_sdma->descq_head; + } + dd_dev_err(sde->dd, + "SDMA engine %d - check scheduled\n", + sde->this_idx); + mod_timer(&sde->err_progress_check_timer, jiffies + 10); + } +} + +static void sdma_err_progress_check(unsigned long data) +{ + unsigned index; + struct sdma_engine *sde = (struct sdma_engine *)data; + + dd_dev_err(sde->dd, "SDE progress check event\n"); + for (index = 0; index < sde->dd->num_sdma; index++) { + struct sdma_engine *curr_sde = &sde->dd->per_sdma[index]; + unsigned long flags; + + /* check progress on each engine except the current one */ + if (curr_sde == sde) + continue; + /* + * We must lock interrupts when acquiring sde->lock, + * to avoid a deadlock if interrupt triggers and spins on + * the same lock on same CPU + */ + spin_lock_irqsave(&curr_sde->tail_lock, flags); + write_seqlock(&curr_sde->head_lock); + + /* skip non-running queues */ + if (curr_sde->state.current_state != sdma_state_s99_running) { + write_sequnlock(&curr_sde->head_lock); + spin_unlock_irqrestore(&curr_sde->tail_lock, flags); + continue; + } + + if ((curr_sde->descq_head != curr_sde->descq_tail) && + (curr_sde->descq_head == + curr_sde->progress_check_head)) + __sdma_process_event(curr_sde, + sdma_event_e90_sw_halted); + write_sequnlock(&curr_sde->head_lock); + spin_unlock_irqrestore(&curr_sde->tail_lock, flags); + } + schedule_work(&sde->err_halt_worker); +} + +static void sdma_hw_clean_up_task(unsigned long opaque) +{ + struct sdma_engine *sde = (struct sdma_engine *) opaque; + u64 statuscsr; + + while (1) { +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", + sde->this_idx, slashstrip(__FILE__), __LINE__, + __func__); +#endif + statuscsr = read_sde_csr(sde, SD(STATUS)); + statuscsr &= SD(STATUS_ENG_CLEANED_UP_SMASK); + if (statuscsr) + break; + udelay(10); + } + + sdma_process_event(sde, sdma_event_e25_hw_clean_up_done); +} + +static inline struct sdma_txreq *get_txhead(struct sdma_engine *sde) +{ + smp_read_barrier_depends(); /* see sdma_update_tail() */ + return sde->tx_ring[sde->tx_head & sde->sdma_mask]; +} + +/* + * flush ring for recovery + */ +static void sdma_flush_descq(struct sdma_engine *sde) +{ + u16 head, tail; + int progress = 0; + struct sdma_txreq *txp = get_txhead(sde); + + /* The reason for some of the complexity of this code is that + * not all descriptors have corresponding txps. So, we have to + * be able to skip over descs until we wander into the range of + * the next txp on the list. + */ + head = sde->descq_head & sde->sdma_mask; + tail = sde->descq_tail & sde->sdma_mask; + while (head != tail) { + /* advance head, wrap if needed */ + head = ++sde->descq_head & sde->sdma_mask; + /* if now past this txp's descs, do the callback */ + if (txp && txp->next_descq_idx == head) { + int drained = 0; + /* protect against complete modifying */ + struct iowait *wait = txp->wait; + + /* remove from list */ + sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL; + if (wait) + drained = atomic_dec_and_test(&wait->sdma_busy); +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + trace_hfi1_sdma_out_sn(sde, txp->sn); + if (WARN_ON_ONCE(sde->head_sn != txp->sn)) + dd_dev_err(sde->dd, "expected %llu got %llu\n", + sde->head_sn, txp->sn); + sde->head_sn++; +#endif + sdma_txclean(sde->dd, txp); + trace_hfi1_sdma_progress(sde, head, tail, txp); + if (txp->complete) + (*txp->complete)( + txp, + SDMA_TXREQ_S_ABORTED, + drained); + if (wait && drained) + iowait_drain_wakeup(wait); + /* see if there is another txp */ + txp = get_txhead(sde); + } + progress++; + } + if (progress) + sdma_desc_avail(sde, sdma_descq_freecnt(sde)); +} + +static void sdma_sw_clean_up_task(unsigned long opaque) +{ + struct sdma_engine *sde = (struct sdma_engine *) opaque; + unsigned long flags; + + spin_lock_irqsave(&sde->tail_lock, flags); + write_seqlock(&sde->head_lock); + + /* + * At this point, the following should always be true: + * - We are halted, so no more descriptors are getting retired. + * - We are not running, so no one is submitting new work. + * - Only we can send the e40_sw_cleaned, so we can't start + * running again until we say so. So, the active list and + * descq are ours to play with. + */ + + + /* + * In the error clean up sequence, software clean must be called + * before the hardware clean so we can use the hardware head in + * the progress routine. A hardware clean or SPC unfreeze will + * reset the hardware head. + * + * Process all retired requests. The progress routine will use the + * latest physical hardware head - we are not running so speed does + * not matter. + */ + sdma_make_progress(sde, 0); + + sdma_flush(sde); + + /* + * Reset our notion of head and tail. + * Note that the HW registers have been reset via an earlier + * clean up. + */ + sde->descq_tail = 0; + sde->descq_head = 0; + sde->desc_avail = sdma_descq_freecnt(sde); + *sde->head_dma = 0; + + __sdma_process_event(sde, sdma_event_e40_sw_cleaned); + + write_sequnlock(&sde->head_lock); + spin_unlock_irqrestore(&sde->tail_lock, flags); +} + +static void sdma_sw_tear_down(struct sdma_engine *sde) +{ + struct sdma_state *ss = &sde->state; + + /* Releasing this reference means the state machine has stopped. */ + sdma_put(ss); + + /* stop waiting for all unfreeze events to complete */ + atomic_set(&sde->dd->sdma_unfreeze_count, -1); + wake_up_interruptible(&sde->dd->sdma_unfreeze_wq); +} + +static void sdma_start_hw_clean_up(struct sdma_engine *sde) +{ + tasklet_hi_schedule(&sde->sdma_hw_clean_up_task); +} + +static void sdma_start_sw_clean_up(struct sdma_engine *sde) +{ + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); +} + +static void sdma_set_state(struct sdma_engine *sde, + enum sdma_states next_state) +{ + struct sdma_state *ss = &sde->state; + const struct sdma_set_state_action *action = sdma_action_table; + unsigned op = 0; + + trace_hfi1_sdma_state( + sde, + sdma_state_names[ss->current_state], + sdma_state_names[next_state]); + + /* debugging bookkeeping */ + ss->previous_state = ss->current_state; + ss->previous_op = ss->current_op; + ss->current_state = next_state; + + if (ss->previous_state != sdma_state_s99_running + && next_state == sdma_state_s99_running) + sdma_flush(sde); + + if (action[next_state].op_enable) + op |= SDMA_SENDCTRL_OP_ENABLE; + + if (action[next_state].op_intenable) + op |= SDMA_SENDCTRL_OP_INTENABLE; + + if (action[next_state].op_halt) + op |= SDMA_SENDCTRL_OP_HALT; + + if (action[next_state].op_cleanup) + op |= SDMA_SENDCTRL_OP_CLEANUP; + + if (action[next_state].go_s99_running_tofalse) + ss->go_s99_running = 0; + + if (action[next_state].go_s99_running_totrue) + ss->go_s99_running = 1; + + ss->current_op = op; + sdma_sendctrl(sde, ss->current_op); +} + +/** + * sdma_get_descq_cnt() - called when device probed + * + * Return a validated descq count. + * + * This is currently only used in the verbs initialization to build the tx + * list. + * + * This will probably be deleted in favor of a more scalable approach to + * alloc tx's. + * + */ +u16 sdma_get_descq_cnt(void) +{ + u16 count = sdma_descq_cnt; + + if (!count) + return SDMA_DESCQ_CNT; + /* count must be a power of 2 greater than 64 and less than + * 32768. Otherwise return default. + */ + if (!is_power_of_2(count)) + return SDMA_DESCQ_CNT; + if (count < 64 || count > 32768) + return SDMA_DESCQ_CNT; + return count; +} +/** + * sdma_select_engine_vl() - select sdma engine + * @dd: devdata + * @selector: a spreading factor + * @vl: this vl + * + * + * This function returns an engine based on the selector and a vl. The + * mapping fields are protected by RCU. + */ +struct sdma_engine *sdma_select_engine_vl( + struct hfi1_devdata *dd, + u32 selector, + u8 vl) +{ + struct sdma_vl_map *m; + struct sdma_map_elem *e; + struct sdma_engine *rval; + + if (WARN_ON(vl > 8)) + return NULL; + + rcu_read_lock(); + m = rcu_dereference(dd->sdma_map); + if (unlikely(!m)) { + rcu_read_unlock(); + return NULL; + } + e = m->map[vl & m->mask]; + rval = e->sde[selector & e->mask]; + rcu_read_unlock(); + + trace_hfi1_sdma_engine_select(dd, selector, vl, rval->this_idx); + return rval; +} + +/** + * sdma_select_engine_sc() - select sdma engine + * @dd: devdata + * @selector: a spreading factor + * @sc5: the 5 bit sc + * + * + * This function returns an engine based on the selector and an sc. + */ +struct sdma_engine *sdma_select_engine_sc( + struct hfi1_devdata *dd, + u32 selector, + u8 sc5) +{ + u8 vl = sc_to_vlt(dd, sc5); + + return sdma_select_engine_vl(dd, selector, vl); +} + +/* + * Free the indicated map struct + */ +static void sdma_map_free(struct sdma_vl_map *m) +{ + int i; + + for (i = 0; m && i < m->actual_vls; i++) + kfree(m->map[i]); + kfree(m); +} + +/* + * Handle RCU callback + */ +static void sdma_map_rcu_callback(struct rcu_head *list) +{ + struct sdma_vl_map *m = container_of(list, struct sdma_vl_map, list); + + sdma_map_free(m); +} + +/** + * sdma_map_init - called when # vls change + * @dd: hfi1_devdata + * @port: port number + * @num_vls: number of vls + * @vl_engines: per vl engine mapping (optional) + * + * This routine changes the mapping based on the number of vls. + * + * vl_engines is used to specify a non-uniform vl/engine loading. NULL + * implies auto computing the loading and giving each VLs a uniform + * distribution of engines per VL. + * + * The auto algorithm computes the sde_per_vl and the number of extra + * engines. Any extra engines are added from the last VL on down. + * + * rcu locking is used here to control access to the mapping fields. + * + * If either the num_vls or num_sdma are non-power of 2, the array sizes + * in the struct sdma_vl_map and the struct sdma_map_elem are rounded + * up to the next highest power of 2 and the first entry is reused + * in a round robin fashion. + * + * If an error occurs the map change is not done and the mapping is + * not changed. + * + */ +int sdma_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_engines) +{ + int i, j; + int extra, sde_per_vl; + int engine = 0; + u8 lvl_engines[OPA_MAX_VLS]; + struct sdma_vl_map *oldmap, *newmap; + + if (!(dd->flags & HFI1_HAS_SEND_DMA)) + return 0; + + if (!vl_engines) { + /* truncate divide */ + sde_per_vl = dd->num_sdma / num_vls; + /* extras */ + extra = dd->num_sdma % num_vls; + vl_engines = lvl_engines; + /* add extras from last vl down */ + for (i = num_vls - 1; i >= 0; i--, extra--) + vl_engines[i] = sde_per_vl + (extra > 0 ? 1 : 0); + } + /* build new map */ + newmap = kzalloc( + sizeof(struct sdma_vl_map) + + roundup_pow_of_two(num_vls) * + sizeof(struct sdma_map_elem *), + GFP_KERNEL); + if (!newmap) + goto bail; + newmap->actual_vls = num_vls; + newmap->vls = roundup_pow_of_two(num_vls); + newmap->mask = (1 << ilog2(newmap->vls)) - 1; + for (i = 0; i < newmap->vls; i++) { + /* save for wrap around */ + int first_engine = engine; + + if (i < newmap->actual_vls) { + int sz = roundup_pow_of_two(vl_engines[i]); + + /* only allocate once */ + newmap->map[i] = kzalloc( + sizeof(struct sdma_map_elem) + + sz * sizeof(struct sdma_engine *), + GFP_KERNEL); + if (!newmap->map[i]) + goto bail; + newmap->map[i]->mask = (1 << ilog2(sz)) - 1; + /* assign engines */ + for (j = 0; j < sz; j++) { + newmap->map[i]->sde[j] = + &dd->per_sdma[engine]; + if (++engine >= first_engine + vl_engines[i]) + /* wrap back to first engine */ + engine = first_engine; + } + } else { + /* just re-use entry without allocating */ + newmap->map[i] = newmap->map[i % num_vls]; + } + engine = first_engine + vl_engines[i]; + } + /* newmap in hand, save old map */ + spin_lock_irq(&dd->sde_map_lock); + oldmap = rcu_dereference_protected(dd->sdma_map, + lockdep_is_held(&dd->sde_map_lock)); + + /* publish newmap */ + rcu_assign_pointer(dd->sdma_map, newmap); + + spin_unlock_irq(&dd->sde_map_lock); + /* success, free any old map after grace period */ + if (oldmap) + call_rcu(&oldmap->list, sdma_map_rcu_callback); + return 0; +bail: + /* free any partial allocation */ + sdma_map_free(newmap); + return -ENOMEM; +} + +/* + * Clean up allocated memory. + * + * This routine is can be called regardless of the success of sdma_init() + * + */ +static void sdma_clean(struct hfi1_devdata *dd, size_t num_engines) +{ + size_t i; + struct sdma_engine *sde; + + if (dd->sdma_pad_dma) { + dma_free_coherent(&dd->pcidev->dev, 4, + (void *)dd->sdma_pad_dma, + dd->sdma_pad_phys); + dd->sdma_pad_dma = NULL; + dd->sdma_pad_phys = 0; + } + if (dd->sdma_heads_dma) { + dma_free_coherent(&dd->pcidev->dev, dd->sdma_heads_size, + (void *)dd->sdma_heads_dma, + dd->sdma_heads_phys); + dd->sdma_heads_dma = NULL; + dd->sdma_heads_phys = 0; + } + for (i = 0; dd->per_sdma && i < num_engines; ++i) { + sde = &dd->per_sdma[i]; + + sde->head_dma = NULL; + sde->head_phys = 0; + + if (sde->descq) { + dma_free_coherent( + &dd->pcidev->dev, + sde->descq_cnt * sizeof(u64[2]), + sde->descq, + sde->descq_phys + ); + sde->descq = NULL; + sde->descq_phys = 0; + } + if (is_vmalloc_addr(sde->tx_ring)) + vfree(sde->tx_ring); + else + kfree(sde->tx_ring); + sde->tx_ring = NULL; + } + spin_lock_irq(&dd->sde_map_lock); + kfree(rcu_access_pointer(dd->sdma_map)); + RCU_INIT_POINTER(dd->sdma_map, NULL); + spin_unlock_irq(&dd->sde_map_lock); + synchronize_rcu(); + kfree(dd->per_sdma); + dd->per_sdma = NULL; +} + +/** + * sdma_init() - called when device probed + * @dd: hfi1_devdata + * @port: port number (currently only zero) + * + * sdma_init initializes the specified number of engines. + * + * The code initializes each sde, its csrs. Interrupts + * are not required to be enabled. + * + * Returns: + * 0 - success, -errno on failure + */ +int sdma_init(struct hfi1_devdata *dd, u8 port) +{ + unsigned this_idx; + struct sdma_engine *sde; + u16 descq_cnt; + void *curr_head; + struct hfi1_pportdata *ppd = dd->pport + port; + u32 per_sdma_credits; + uint idle_cnt = sdma_idle_cnt; + size_t num_engines = dd->chip_sdma_engines; + + if (!HFI1_CAP_IS_KSET(SDMA)) { + HFI1_CAP_CLEAR(SDMA_AHG); + return 0; + } + if (mod_num_sdma && + /* can't exceed chip support */ + mod_num_sdma <= dd->chip_sdma_engines && + /* count must be >= vls */ + mod_num_sdma >= num_vls) + num_engines = mod_num_sdma; + + dd_dev_info(dd, "SDMA mod_num_sdma: %u\n", mod_num_sdma); + dd_dev_info(dd, "SDMA chip_sdma_engines: %u\n", dd->chip_sdma_engines); + dd_dev_info(dd, "SDMA chip_sdma_mem_size: %u\n", + dd->chip_sdma_mem_size); + + per_sdma_credits = + dd->chip_sdma_mem_size/(num_engines * SDMA_BLOCK_SIZE); + + /* set up freeze waitqueue */ + init_waitqueue_head(&dd->sdma_unfreeze_wq); + atomic_set(&dd->sdma_unfreeze_count, 0); + + descq_cnt = sdma_get_descq_cnt(); + dd_dev_info(dd, "SDMA engines %zu descq_cnt %u\n", + num_engines, descq_cnt); + + /* alloc memory for array of send engines */ + dd->per_sdma = kcalloc(num_engines, sizeof(*dd->per_sdma), GFP_KERNEL); + if (!dd->per_sdma) + return -ENOMEM; + + idle_cnt = ns_to_cclock(dd, idle_cnt); + /* Allocate memory for SendDMA descriptor FIFOs */ + for (this_idx = 0; this_idx < num_engines; ++this_idx) { + sde = &dd->per_sdma[this_idx]; + sde->dd = dd; + sde->ppd = ppd; + sde->this_idx = this_idx; + sde->descq_cnt = descq_cnt; + sde->desc_avail = sdma_descq_freecnt(sde); + sde->sdma_shift = ilog2(descq_cnt); + sde->sdma_mask = (1 << sde->sdma_shift) - 1; + sde->descq_full_count = 0; + + /* Create a mask for all 3 chip interrupt sources */ + sde->imask = (u64)1 << (0*TXE_NUM_SDMA_ENGINES + this_idx) + | (u64)1 << (1*TXE_NUM_SDMA_ENGINES + this_idx) + | (u64)1 << (2*TXE_NUM_SDMA_ENGINES + this_idx); + /* Create a mask specifically for sdma_idle */ + sde->idle_mask = + (u64)1 << (2*TXE_NUM_SDMA_ENGINES + this_idx); + /* Create a mask specifically for sdma_progress */ + sde->progress_mask = + (u64)1 << (TXE_NUM_SDMA_ENGINES + this_idx); + spin_lock_init(&sde->tail_lock); + seqlock_init(&sde->head_lock); + spin_lock_init(&sde->senddmactrl_lock); + spin_lock_init(&sde->flushlist_lock); + /* insure there is always a zero bit */ + sde->ahg_bits = 0xfffffffe00000000ULL; + + sdma_set_state(sde, sdma_state_s00_hw_down); + + /* set up reference counting */ + kref_init(&sde->state.kref); + init_completion(&sde->state.comp); + + INIT_LIST_HEAD(&sde->flushlist); + INIT_LIST_HEAD(&sde->dmawait); + + sde->tail_csr = + get_kctxt_csr_addr(dd, this_idx, SD(TAIL)); + + if (idle_cnt) + dd->default_desc1 = + SDMA_DESC1_HEAD_TO_HOST_FLAG; + else + dd->default_desc1 = + SDMA_DESC1_INT_REQ_FLAG; + + tasklet_init(&sde->sdma_hw_clean_up_task, sdma_hw_clean_up_task, + (unsigned long)sde); + + tasklet_init(&sde->sdma_sw_clean_up_task, sdma_sw_clean_up_task, + (unsigned long)sde); + INIT_WORK(&sde->err_halt_worker, sdma_err_halt_wait); + INIT_WORK(&sde->flush_worker, sdma_field_flush); + + sde->progress_check_head = 0; + + init_timer(&sde->err_progress_check_timer); + sde->err_progress_check_timer.function = + sdma_err_progress_check; + sde->err_progress_check_timer.data = (unsigned long)sde; + + sde->descq = dma_zalloc_coherent( + &dd->pcidev->dev, + descq_cnt * sizeof(u64[2]), + &sde->descq_phys, + GFP_KERNEL + ); + if (!sde->descq) + goto bail; + sde->tx_ring = + kcalloc(descq_cnt, sizeof(struct sdma_txreq *), + GFP_KERNEL); + if (!sde->tx_ring) + sde->tx_ring = + vzalloc( + sizeof(struct sdma_txreq *) * + descq_cnt); + if (!sde->tx_ring) + goto bail; + } + + dd->sdma_heads_size = L1_CACHE_BYTES * num_engines; + /* Allocate memory for DMA of head registers to memory */ + dd->sdma_heads_dma = dma_zalloc_coherent( + &dd->pcidev->dev, + dd->sdma_heads_size, + &dd->sdma_heads_phys, + GFP_KERNEL + ); + if (!dd->sdma_heads_dma) { + dd_dev_err(dd, "failed to allocate SendDMA head memory\n"); + goto bail; + } + + /* Allocate memory for pad */ + dd->sdma_pad_dma = dma_zalloc_coherent( + &dd->pcidev->dev, + sizeof(u32), + &dd->sdma_pad_phys, + GFP_KERNEL + ); + if (!dd->sdma_pad_dma) { + dd_dev_err(dd, "failed to allocate SendDMA pad memory\n"); + goto bail; + } + + /* assign each engine to different cacheline and init registers */ + curr_head = (void *)dd->sdma_heads_dma; + for (this_idx = 0; this_idx < num_engines; ++this_idx) { + unsigned long phys_offset; + + sde = &dd->per_sdma[this_idx]; + + sde->head_dma = curr_head; + curr_head += L1_CACHE_BYTES; + phys_offset = (unsigned long)sde->head_dma - + (unsigned long)dd->sdma_heads_dma; + sde->head_phys = dd->sdma_heads_phys + phys_offset; + init_sdma_regs(sde, per_sdma_credits, idle_cnt); + } + dd->flags |= HFI1_HAS_SEND_DMA; + dd->flags |= idle_cnt ? HFI1_HAS_SDMA_TIMEOUT : 0; + dd->num_sdma = num_engines; + if (sdma_map_init(dd, port, ppd->vls_operational, NULL)) + goto bail; + dd_dev_info(dd, "SDMA num_sdma: %u\n", dd->num_sdma); + return 0; + +bail: + sdma_clean(dd, num_engines); + return -ENOMEM; +} + +/** + * sdma_all_running() - called when the link goes up + * @dd: hfi1_devdata + * + * This routine moves all engines to the running state. + */ +void sdma_all_running(struct hfi1_devdata *dd) +{ + struct sdma_engine *sde; + unsigned int i; + + /* move all engines to running */ + for (i = 0; i < dd->num_sdma; ++i) { + sde = &dd->per_sdma[i]; + sdma_process_event(sde, sdma_event_e30_go_running); + } +} + +/** + * sdma_all_idle() - called when the link goes down + * @dd: hfi1_devdata + * + * This routine moves all engines to the idle state. + */ +void sdma_all_idle(struct hfi1_devdata *dd) +{ + struct sdma_engine *sde; + unsigned int i; + + /* idle all engines */ + for (i = 0; i < dd->num_sdma; ++i) { + sde = &dd->per_sdma[i]; + sdma_process_event(sde, sdma_event_e70_go_idle); + } +} + +/** + * sdma_start() - called to kick off state processing for all engines + * @dd: hfi1_devdata + * + * This routine is for kicking off the state processing for all required + * sdma engines. Interrupts need to be working at this point. + * + */ +void sdma_start(struct hfi1_devdata *dd) +{ + unsigned i; + struct sdma_engine *sde; + + /* kick off the engines state processing */ + for (i = 0; i < dd->num_sdma; ++i) { + sde = &dd->per_sdma[i]; + sdma_process_event(sde, sdma_event_e10_go_hw_start); + } +} + +/** + * sdma_exit() - used when module is removed + * @dd: hfi1_devdata + */ +void sdma_exit(struct hfi1_devdata *dd) +{ + unsigned this_idx; + struct sdma_engine *sde; + + for (this_idx = 0; dd->per_sdma && this_idx < dd->num_sdma; + ++this_idx) { + + sde = &dd->per_sdma[this_idx]; + if (!list_empty(&sde->dmawait)) + dd_dev_err(dd, "sde %u: dmawait list not empty!\n", + sde->this_idx); + sdma_process_event(sde, sdma_event_e00_go_hw_down); + + del_timer_sync(&sde->err_progress_check_timer); + + /* + * This waits for the state machine to exit so it is not + * necessary to kill the sdma_sw_clean_up_task to make sure + * it is not running. + */ + sdma_finalput(&sde->state); + } + sdma_clean(dd, dd->num_sdma); +} + +/* + * unmap the indicated descriptor + */ +static inline void sdma_unmap_desc( + struct hfi1_devdata *dd, + struct sdma_desc *descp) +{ + switch (sdma_mapping_type(descp)) { + case SDMA_MAP_SINGLE: + dma_unmap_single( + &dd->pcidev->dev, + sdma_mapping_addr(descp), + sdma_mapping_len(descp), + DMA_TO_DEVICE); + break; + case SDMA_MAP_PAGE: + dma_unmap_page( + &dd->pcidev->dev, + sdma_mapping_addr(descp), + sdma_mapping_len(descp), + DMA_TO_DEVICE); + break; + } +} + +/* + * return the mode as indicated by the first + * descriptor in the tx. + */ +static inline u8 ahg_mode(struct sdma_txreq *tx) +{ + return (tx->descp[0].qw[1] & SDMA_DESC1_HEADER_MODE_SMASK) + >> SDMA_DESC1_HEADER_MODE_SHIFT; +} + +/** + * sdma_txclean() - clean tx of mappings, descp *kmalloc's + * @dd: hfi1_devdata for unmapping + * @tx: tx request to clean + * + * This is used in the progress routine to clean the tx or + * by the ULP to toss an in-process tx build. + * + * The code can be called multiple times without issue. + * + */ +void sdma_txclean( + struct hfi1_devdata *dd, + struct sdma_txreq *tx) +{ + u16 i; + + if (tx->num_desc) { + u8 skip = 0, mode = ahg_mode(tx); + + /* unmap first */ + sdma_unmap_desc(dd, &tx->descp[0]); + /* determine number of AHG descriptors to skip */ + if (mode > SDMA_AHG_APPLY_UPDATE1) + skip = mode >> 1; + for (i = 1 + skip; i < tx->num_desc; i++) + sdma_unmap_desc(dd, &tx->descp[i]); + tx->num_desc = 0; + } + kfree(tx->coalesce_buf); + tx->coalesce_buf = NULL; + /* kmalloc'ed descp */ + if (unlikely(tx->desc_limit > ARRAY_SIZE(tx->descs))) { + tx->desc_limit = ARRAY_SIZE(tx->descs); + kfree(tx->descp); + } +} + +static inline u16 sdma_gethead(struct sdma_engine *sde) +{ + struct hfi1_devdata *dd = sde->dd; + int use_dmahead; + u16 hwhead; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", + sde->this_idx, slashstrip(__FILE__), __LINE__, __func__); +#endif + +retry: + use_dmahead = HFI1_CAP_IS_KSET(USE_SDMA_HEAD) && __sdma_running(sde) && + (dd->flags & HFI1_HAS_SDMA_TIMEOUT); + hwhead = use_dmahead ? + (u16) le64_to_cpu(*sde->head_dma) : + (u16) read_sde_csr(sde, SD(HEAD)); + + if (unlikely(HFI1_CAP_IS_KSET(SDMA_HEAD_CHECK))) { + u16 cnt; + u16 swtail; + u16 swhead; + int sane; + + swhead = sde->descq_head & sde->sdma_mask; + /* this code is really bad for cache line trading */ + swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask; + cnt = sde->descq_cnt; + + if (swhead < swtail) + /* not wrapped */ + sane = (hwhead >= swhead) & (hwhead <= swtail); + else if (swhead > swtail) + /* wrapped around */ + sane = ((hwhead >= swhead) && (hwhead < cnt)) || + (hwhead <= swtail); + else + /* empty */ + sane = (hwhead == swhead); + + if (unlikely(!sane)) { + dd_dev_err(dd, "SDMA(%u) bad head (%s) hwhd=%hu swhd=%hu swtl=%hu cnt=%hu\n", + sde->this_idx, + use_dmahead ? "dma" : "kreg", + hwhead, swhead, swtail, cnt); + if (use_dmahead) { + /* try one more time, using csr */ + use_dmahead = 0; + goto retry; + } + /* proceed as if no progress */ + hwhead = swhead; + } + } + return hwhead; +} + +/* + * This is called when there are send DMA descriptors that might be + * available. + * + * This is called with head_lock held. + */ +static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail) +{ + struct iowait *wait, *nw; + struct iowait *waits[SDMA_WAIT_BATCH_SIZE]; + unsigned i, n = 0, seq; + struct sdma_txreq *stx; + struct hfi1_ibdev *dev = &sde->dd->verbs_dev; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx, + slashstrip(__FILE__), __LINE__, __func__); + dd_dev_err(sde->dd, "avail: %u\n", avail); +#endif + + do { + seq = read_seqbegin(&dev->iowait_lock); + if (!list_empty(&sde->dmawait)) { + /* at least one item */ + write_seqlock(&dev->iowait_lock); + /* Harvest waiters wanting DMA descriptors */ + list_for_each_entry_safe( + wait, + nw, + &sde->dmawait, + list) { + u16 num_desc = 0; + + if (!wait->wakeup) + continue; + if (n == ARRAY_SIZE(waits)) + break; + if (!list_empty(&wait->tx_head)) { + stx = list_first_entry( + &wait->tx_head, + struct sdma_txreq, + list); + num_desc = stx->num_desc; + } + if (num_desc > avail) + break; + avail -= num_desc; + list_del_init(&wait->list); + waits[n++] = wait; + } + write_sequnlock(&dev->iowait_lock); + break; + } + } while (read_seqretry(&dev->iowait_lock, seq)); + + for (i = 0; i < n; i++) + waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON); +} + +/* head_lock must be held */ +static void sdma_make_progress(struct sdma_engine *sde, u64 status) +{ + struct sdma_txreq *txp = NULL; + int progress = 0; + u16 hwhead, swhead, swtail; + int idle_check_done = 0; + + hwhead = sdma_gethead(sde); + + /* The reason for some of the complexity of this code is that + * not all descriptors have corresponding txps. So, we have to + * be able to skip over descs until we wander into the range of + * the next txp on the list. + */ + +retry: + txp = get_txhead(sde); + swhead = sde->descq_head & sde->sdma_mask; + trace_hfi1_sdma_progress(sde, hwhead, swhead, txp); + while (swhead != hwhead) { + /* advance head, wrap if needed */ + swhead = ++sde->descq_head & sde->sdma_mask; + + /* if now past this txp's descs, do the callback */ + if (txp && txp->next_descq_idx == swhead) { + int drained = 0; + /* protect against complete modifying */ + struct iowait *wait = txp->wait; + + /* remove from list */ + sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL; + if (wait) + drained = atomic_dec_and_test(&wait->sdma_busy); +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + trace_hfi1_sdma_out_sn(sde, txp->sn); + if (WARN_ON_ONCE(sde->head_sn != txp->sn)) + dd_dev_err(sde->dd, "expected %llu got %llu\n", + sde->head_sn, txp->sn); + sde->head_sn++; +#endif + sdma_txclean(sde->dd, txp); + if (txp->complete) + (*txp->complete)( + txp, + SDMA_TXREQ_S_OK, + drained); + if (wait && drained) + iowait_drain_wakeup(wait); + /* see if there is another txp */ + txp = get_txhead(sde); + } + trace_hfi1_sdma_progress(sde, hwhead, swhead, txp); + progress++; + } + + /* + * The SDMA idle interrupt is not guaranteed to be ordered with respect + * to updates to the the dma_head location in host memory. The head + * value read might not be fully up to date. If there are pending + * descriptors and the SDMA idle interrupt fired then read from the + * CSR SDMA head instead to get the latest value from the hardware. + * The hardware SDMA head should be read at most once in this invocation + * of sdma_make_progress(..) which is ensured by idle_check_done flag + */ + if ((status & sde->idle_mask) && !idle_check_done) { + swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask; + if (swtail != hwhead) { + hwhead = (u16)read_sde_csr(sde, SD(HEAD)); + idle_check_done = 1; + goto retry; + } + } + + sde->last_status = status; + if (progress) + sdma_desc_avail(sde, sdma_descq_freecnt(sde)); +} + +/* + * sdma_engine_interrupt() - interrupt handler for engine + * @sde: sdma engine + * @status: sdma interrupt reason + * + * Status is a mask of the 3 possible interrupts for this engine. It will + * contain bits _only_ for this SDMA engine. It will contain at least one + * bit, it may contain more. + */ +void sdma_engine_interrupt(struct sdma_engine *sde, u64 status) +{ + trace_hfi1_sdma_engine_interrupt(sde, status); + write_seqlock(&sde->head_lock); + sdma_set_desc_cnt(sde, sde->descq_cnt / 2); + sdma_make_progress(sde, status); + write_sequnlock(&sde->head_lock); +} + +/** + * sdma_engine_error() - error handler for engine + * @sde: sdma engine + * @status: sdma interrupt reason + */ +void sdma_engine_error(struct sdma_engine *sde, u64 status) +{ + unsigned long flags; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) error status 0x%llx state %s\n", + sde->this_idx, + (unsigned long long)status, + sdma_state_names[sde->state.current_state]); +#endif + spin_lock_irqsave(&sde->tail_lock, flags); + write_seqlock(&sde->head_lock); + if (status & ALL_SDMA_ENG_HALT_ERRS) + __sdma_process_event(sde, sdma_event_e60_hw_halted); + if (status & ~SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK)) { + dd_dev_err(sde->dd, + "SDMA (%u) engine error: 0x%llx state %s\n", + sde->this_idx, + (unsigned long long)status, + sdma_state_names[sde->state.current_state]); + dump_sdma_state(sde); + } + write_sequnlock(&sde->head_lock); + spin_unlock_irqrestore(&sde->tail_lock, flags); +} + +static void sdma_sendctrl(struct sdma_engine *sde, unsigned op) +{ + u64 set_senddmactrl = 0; + u64 clr_senddmactrl = 0; + unsigned long flags; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) senddmactrl E=%d I=%d H=%d C=%d\n", + sde->this_idx, + (op & SDMA_SENDCTRL_OP_ENABLE) ? 1 : 0, + (op & SDMA_SENDCTRL_OP_INTENABLE) ? 1 : 0, + (op & SDMA_SENDCTRL_OP_HALT) ? 1 : 0, + (op & SDMA_SENDCTRL_OP_CLEANUP) ? 1 : 0); +#endif + + if (op & SDMA_SENDCTRL_OP_ENABLE) + set_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK); + else + clr_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK); + + if (op & SDMA_SENDCTRL_OP_INTENABLE) + set_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK); + else + clr_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK); + + if (op & SDMA_SENDCTRL_OP_HALT) + set_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK); + else + clr_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK); + + spin_lock_irqsave(&sde->senddmactrl_lock, flags); + + sde->p_senddmactrl |= set_senddmactrl; + sde->p_senddmactrl &= ~clr_senddmactrl; + + if (op & SDMA_SENDCTRL_OP_CLEANUP) + write_sde_csr(sde, SD(CTRL), + sde->p_senddmactrl | + SD(CTRL_SDMA_CLEANUP_SMASK)); + else + write_sde_csr(sde, SD(CTRL), sde->p_senddmactrl); + + spin_unlock_irqrestore(&sde->senddmactrl_lock, flags); + +#ifdef CONFIG_SDMA_VERBOSITY + sdma_dumpstate(sde); +#endif +} + +static void sdma_setlengen(struct sdma_engine *sde) +{ +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", + sde->this_idx, slashstrip(__FILE__), __LINE__, __func__); +#endif + + /* + * Set SendDmaLenGen and clear-then-set the MSB of the generation + * count to enable generation checking and load the internal + * generation counter. + */ + write_sde_csr(sde, SD(LEN_GEN), + (sde->descq_cnt/64) << SD(LEN_GEN_LENGTH_SHIFT) + ); + write_sde_csr(sde, SD(LEN_GEN), + ((sde->descq_cnt/64) << SD(LEN_GEN_LENGTH_SHIFT)) + | (4ULL << SD(LEN_GEN_GENERATION_SHIFT)) + ); +} + +static inline void sdma_update_tail(struct sdma_engine *sde, u16 tail) +{ + /* Commit writes to memory and advance the tail on the chip */ + smp_wmb(); /* see get_txhead() */ + writeq(tail, sde->tail_csr); +} + +/* + * This is called when changing to state s10_hw_start_up_halt_wait as + * a result of send buffer errors or send DMA descriptor errors. + */ +static void sdma_hw_start_up(struct sdma_engine *sde) +{ + u64 reg; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", + sde->this_idx, slashstrip(__FILE__), __LINE__, __func__); +#endif + + sdma_setlengen(sde); + sdma_update_tail(sde, 0); /* Set SendDmaTail */ + *sde->head_dma = 0; + + reg = SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK) << + SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT); + write_sde_csr(sde, SD(ENG_ERR_CLEAR), reg); +} + +#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \ +(r &= ~SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK) + +#define SET_STATIC_RATE_CONTROL_SMASK(r) \ +(r |= SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK) +/* + * set_sdma_integrity + * + * Set the SEND_DMA_CHECK_ENABLE register for send DMA engine 'sde'. + */ +static void set_sdma_integrity(struct sdma_engine *sde) +{ + struct hfi1_devdata *dd = sde->dd; + u64 reg; + + if (unlikely(HFI1_CAP_IS_KSET(NO_INTEGRITY))) + return; + + reg = hfi1_pkt_base_sdma_integrity(dd); + + if (HFI1_CAP_IS_KSET(STATIC_RATE_CTRL)) + CLEAR_STATIC_RATE_CONTROL_SMASK(reg); + else + SET_STATIC_RATE_CONTROL_SMASK(reg); + + write_sde_csr(sde, SD(CHECK_ENABLE), reg); +} + + +static void init_sdma_regs( + struct sdma_engine *sde, + u32 credits, + uint idle_cnt) +{ + u8 opval, opmask; +#ifdef CONFIG_SDMA_VERBOSITY + struct hfi1_devdata *dd = sde->dd; + + dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", + sde->this_idx, slashstrip(__FILE__), __LINE__, __func__); +#endif + + write_sde_csr(sde, SD(BASE_ADDR), sde->descq_phys); + sdma_setlengen(sde); + sdma_update_tail(sde, 0); /* Set SendDmaTail */ + write_sde_csr(sde, SD(RELOAD_CNT), idle_cnt); + write_sde_csr(sde, SD(DESC_CNT), 0); + write_sde_csr(sde, SD(HEAD_ADDR), sde->head_phys); + write_sde_csr(sde, SD(MEMORY), + ((u64)credits << + SD(MEMORY_SDMA_MEMORY_CNT_SHIFT)) | + ((u64)(credits * sde->this_idx) << + SD(MEMORY_SDMA_MEMORY_INDEX_SHIFT))); + write_sde_csr(sde, SD(ENG_ERR_MASK), ~0ull); + set_sdma_integrity(sde); + opmask = OPCODE_CHECK_MASK_DISABLED; + opval = OPCODE_CHECK_VAL_DISABLED; + write_sde_csr(sde, SD(CHECK_OPCODE), + (opmask << SEND_CTXT_CHECK_OPCODE_MASK_SHIFT) | + (opval << SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT)); +} + +#ifdef CONFIG_SDMA_VERBOSITY + +#define sdma_dumpstate_helper0(reg) do { \ + csr = read_csr(sde->dd, reg); \ + dd_dev_err(sde->dd, "%36s 0x%016llx\n", #reg, csr); \ + } while (0) + +#define sdma_dumpstate_helper(reg) do { \ + csr = read_sde_csr(sde, reg); \ + dd_dev_err(sde->dd, "%36s[%02u] 0x%016llx\n", \ + #reg, sde->this_idx, csr); \ + } while (0) + +#define sdma_dumpstate_helper2(reg) do { \ + csr = read_csr(sde->dd, reg + (8 * i)); \ + dd_dev_err(sde->dd, "%33s_%02u 0x%016llx\n", \ + #reg, i, csr); \ + } while (0) + +void sdma_dumpstate(struct sdma_engine *sde) +{ + u64 csr; + unsigned i; + + sdma_dumpstate_helper(SD(CTRL)); + sdma_dumpstate_helper(SD(STATUS)); + sdma_dumpstate_helper0(SD(ERR_STATUS)); + sdma_dumpstate_helper0(SD(ERR_MASK)); + sdma_dumpstate_helper(SD(ENG_ERR_STATUS)); + sdma_dumpstate_helper(SD(ENG_ERR_MASK)); + + for (i = 0; i < CCE_NUM_INT_CSRS; ++i) { + sdma_dumpstate_helper2(CCE_INT_STATUS); + sdma_dumpstate_helper2(CCE_INT_MASK); + sdma_dumpstate_helper2(CCE_INT_BLOCKED); + } + + sdma_dumpstate_helper(SD(TAIL)); + sdma_dumpstate_helper(SD(HEAD)); + sdma_dumpstate_helper(SD(PRIORITY_THLD)); + sdma_dumpstate_helper(SD(IDLE_CNT)); + sdma_dumpstate_helper(SD(RELOAD_CNT)); + sdma_dumpstate_helper(SD(DESC_CNT)); + sdma_dumpstate_helper(SD(DESC_FETCHED_CNT)); + sdma_dumpstate_helper(SD(MEMORY)); + sdma_dumpstate_helper0(SD(ENGINES)); + sdma_dumpstate_helper0(SD(MEM_SIZE)); + /* sdma_dumpstate_helper(SEND_EGRESS_SEND_DMA_STATUS); */ + sdma_dumpstate_helper(SD(BASE_ADDR)); + sdma_dumpstate_helper(SD(LEN_GEN)); + sdma_dumpstate_helper(SD(HEAD_ADDR)); + sdma_dumpstate_helper(SD(CHECK_ENABLE)); + sdma_dumpstate_helper(SD(CHECK_VL)); + sdma_dumpstate_helper(SD(CHECK_JOB_KEY)); + sdma_dumpstate_helper(SD(CHECK_PARTITION_KEY)); + sdma_dumpstate_helper(SD(CHECK_SLID)); + sdma_dumpstate_helper(SD(CHECK_OPCODE)); +} +#endif + +static void dump_sdma_state(struct sdma_engine *sde) +{ + struct hw_sdma_desc *descq; + struct hw_sdma_desc *descqp; + u64 desc[2]; + u64 addr; + u8 gen; + u16 len; + u16 head, tail, cnt; + + head = sde->descq_head & sde->sdma_mask; + tail = sde->descq_tail & sde->sdma_mask; + cnt = sdma_descq_freecnt(sde); + descq = sde->descq; + + dd_dev_err(sde->dd, + "SDMA (%u) descq_head: %u descq_tail: %u freecnt: %u FLE %d\n", + sde->this_idx, + head, + tail, + cnt, + !list_empty(&sde->flushlist)); + + /* print info for each entry in the descriptor queue */ + while (head != tail) { + char flags[6] = { 'x', 'x', 'x', 'x', 0 }; + + descqp = &sde->descq[head]; + desc[0] = le64_to_cpu(descqp->qw[0]); + desc[1] = le64_to_cpu(descqp->qw[1]); + flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-'; + flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ? + 'H' : '-'; + flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-'; + flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-'; + addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT) + & SDMA_DESC0_PHY_ADDR_MASK; + gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT) + & SDMA_DESC1_GENERATION_MASK; + len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT) + & SDMA_DESC0_BYTE_COUNT_MASK; + dd_dev_err(sde->dd, + "SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n", + head, flags, addr, gen, len); + dd_dev_err(sde->dd, + "\tdesc0:0x%016llx desc1 0x%016llx\n", + desc[0], desc[1]); + if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) + dd_dev_err(sde->dd, + "\taidx: %u amode: %u alen: %u\n", + (u8)((desc[1] & SDMA_DESC1_HEADER_INDEX_SMASK) + >> SDMA_DESC1_HEADER_INDEX_SHIFT), + (u8)((desc[1] & SDMA_DESC1_HEADER_MODE_SMASK) + >> SDMA_DESC1_HEADER_MODE_SHIFT), + (u8)((desc[1] & SDMA_DESC1_HEADER_DWS_SMASK) + >> SDMA_DESC1_HEADER_DWS_SHIFT)); + head++; + head &= sde->sdma_mask; + } +} + +#define SDE_FMT \ + "SDE %u STE %s C 0x%llx S 0x%016llx E 0x%llx T(HW) 0x%llx T(SW) 0x%x H(HW) 0x%llx H(SW) 0x%x H(D) 0x%llx DM 0x%llx GL 0x%llx R 0x%llx LIS 0x%llx AHGI 0x%llx TXT %u TXH %u DT %u DH %u FLNE %d DQF %u SLC 0x%llx\n" +/** + * sdma_seqfile_dump_sde() - debugfs dump of sde + * @s: seq file + * @sde: send dma engine to dump + * + * This routine dumps the sde to the indicated seq file. + */ +void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *sde) +{ + u16 head, tail; + struct hw_sdma_desc *descqp; + u64 desc[2]; + u64 addr; + u8 gen; + u16 len; + + head = sde->descq_head & sde->sdma_mask; + tail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask; + seq_printf(s, SDE_FMT, sde->this_idx, + sdma_state_name(sde->state.current_state), + (unsigned long long)read_sde_csr(sde, SD(CTRL)), + (unsigned long long)read_sde_csr(sde, SD(STATUS)), + (unsigned long long)read_sde_csr(sde, + SD(ENG_ERR_STATUS)), + (unsigned long long)read_sde_csr(sde, SD(TAIL)), + tail, + (unsigned long long)read_sde_csr(sde, SD(HEAD)), + head, + (unsigned long long)le64_to_cpu(*sde->head_dma), + (unsigned long long)read_sde_csr(sde, SD(MEMORY)), + (unsigned long long)read_sde_csr(sde, SD(LEN_GEN)), + (unsigned long long)read_sde_csr(sde, SD(RELOAD_CNT)), + (unsigned long long)sde->last_status, + (unsigned long long)sde->ahg_bits, + sde->tx_tail, + sde->tx_head, + sde->descq_tail, + sde->descq_head, + !list_empty(&sde->flushlist), + sde->descq_full_count, + (unsigned long long)read_sde_csr(sde, SEND_DMA_CHECK_SLID)); + + /* print info for each entry in the descriptor queue */ + while (head != tail) { + char flags[6] = { 'x', 'x', 'x', 'x', 0 }; + + descqp = &sde->descq[head]; + desc[0] = le64_to_cpu(descqp->qw[0]); + desc[1] = le64_to_cpu(descqp->qw[1]); + flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-'; + flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ? + 'H' : '-'; + flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-'; + flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-'; + addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT) + & SDMA_DESC0_PHY_ADDR_MASK; + gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT) + & SDMA_DESC1_GENERATION_MASK; + len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT) + & SDMA_DESC0_BYTE_COUNT_MASK; + seq_printf(s, + "\tdesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n", + head, flags, addr, gen, len); + if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) + seq_printf(s, "\t\tahgidx: %u ahgmode: %u\n", + (u8)((desc[1] & SDMA_DESC1_HEADER_INDEX_SMASK) + >> SDMA_DESC1_HEADER_INDEX_SHIFT), + (u8)((desc[1] & SDMA_DESC1_HEADER_MODE_SMASK) + >> SDMA_DESC1_HEADER_MODE_SHIFT)); + head = (head + 1) & sde->sdma_mask; + } +} + +/* + * add the generation number into + * the qw1 and return + */ +static inline u64 add_gen(struct sdma_engine *sde, u64 qw1) +{ + u8 generation = (sde->descq_tail >> sde->sdma_shift) & 3; + + qw1 &= ~SDMA_DESC1_GENERATION_SMASK; + qw1 |= ((u64)generation & SDMA_DESC1_GENERATION_MASK) + << SDMA_DESC1_GENERATION_SHIFT; + return qw1; +} + +/* + * This routine submits the indicated tx + * + * Space has already been guaranteed and + * tail side of ring is locked. + * + * The hardware tail update is done + * in the caller and that is facilitated + * by returning the new tail. + * + * There is special case logic for ahg + * to not add the generation number for + * up to 2 descriptors that follow the + * first descriptor. + * + */ +static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx) +{ + int i; + u16 tail; + struct sdma_desc *descp = tx->descp; + u8 skip = 0, mode = ahg_mode(tx); + + tail = sde->descq_tail & sde->sdma_mask; + sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]); + sde->descq[tail].qw[1] = cpu_to_le64(add_gen(sde, descp->qw[1])); + trace_hfi1_sdma_descriptor(sde, descp->qw[0], descp->qw[1], + tail, &sde->descq[tail]); + tail = ++sde->descq_tail & sde->sdma_mask; + descp++; + if (mode > SDMA_AHG_APPLY_UPDATE1) + skip = mode >> 1; + for (i = 1; i < tx->num_desc; i++, descp++) { + u64 qw1; + + sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]); + if (skip) { + /* edits don't have generation */ + qw1 = descp->qw[1]; + skip--; + } else { + /* replace generation with real one for non-edits */ + qw1 = add_gen(sde, descp->qw[1]); + } + sde->descq[tail].qw[1] = cpu_to_le64(qw1); + trace_hfi1_sdma_descriptor(sde, descp->qw[0], qw1, + tail, &sde->descq[tail]); + tail = ++sde->descq_tail & sde->sdma_mask; + } + tx->next_descq_idx = tail; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + tx->sn = sde->tail_sn++; + trace_hfi1_sdma_in_sn(sde, tx->sn); + WARN_ON_ONCE(sde->tx_ring[sde->tx_tail & sde->sdma_mask]); +#endif + sde->tx_ring[sde->tx_tail++ & sde->sdma_mask] = tx; + sde->desc_avail -= tx->num_desc; + return tail; +} + +/* + * Check for progress + */ +static int sdma_check_progress( + struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *tx) +{ + int ret; + + sde->desc_avail = sdma_descq_freecnt(sde); + if (tx->num_desc <= sde->desc_avail) + return -EAGAIN; + /* pulse the head_lock */ + if (wait && wait->sleep) { + unsigned seq; + + seq = raw_seqcount_begin( + (const seqcount_t *)&sde->head_lock.seqcount); + ret = wait->sleep(sde, wait, tx, seq); + if (ret == -EAGAIN) + sde->desc_avail = sdma_descq_freecnt(sde); + } else + ret = -EBUSY; + return ret; +} + +/** + * sdma_send_txreq() - submit a tx req to ring + * @sde: sdma engine to use + * @wait: wait structure to use when full (may be NULL) + * @tx: sdma_txreq to submit + * + * The call submits the tx into the ring. If a iowait structure is non-NULL + * the packet will be queued to the list in wait. + * + * Return: + * 0 - Success, -EINVAL - sdma_txreq incomplete, -EBUSY - no space in + * ring (wait == NULL) + * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state + */ +int sdma_send_txreq(struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *tx) +{ + int ret = 0; + u16 tail; + unsigned long flags; + + /* user should have supplied entire packet */ + if (unlikely(tx->tlen)) + return -EINVAL; + tx->wait = wait; + spin_lock_irqsave(&sde->tail_lock, flags); +retry: + if (unlikely(!__sdma_running(sde))) + goto unlock_noconn; + if (unlikely(tx->num_desc > sde->desc_avail)) + goto nodesc; + tail = submit_tx(sde, tx); + if (wait) + atomic_inc(&wait->sdma_busy); + sdma_update_tail(sde, tail); +unlock: + spin_unlock_irqrestore(&sde->tail_lock, flags); + return ret; +unlock_noconn: + if (wait) + atomic_inc(&wait->sdma_busy); + tx->next_descq_idx = 0; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + tx->sn = sde->tail_sn++; + trace_hfi1_sdma_in_sn(sde, tx->sn); +#endif + spin_lock(&sde->flushlist_lock); + list_add_tail(&tx->list, &sde->flushlist); + spin_unlock(&sde->flushlist_lock); + if (wait) { + wait->tx_count++; + wait->count += tx->num_desc; + } + schedule_work(&sde->flush_worker); + ret = -ECOMM; + goto unlock; +nodesc: + ret = sdma_check_progress(sde, wait, tx); + if (ret == -EAGAIN) { + ret = 0; + goto retry; + } + sde->descq_full_count++; + goto unlock; +} + +/** + * sdma_send_txlist() - submit a list of tx req to ring + * @sde: sdma engine to use + * @wait: wait structure to use when full (may be NULL) + * @tx_list: list of sdma_txreqs to submit + * + * The call submits the list into the ring. + * + * If the iowait structure is non-NULL and not equal to the iowait list + * the unprocessed part of the list will be appended to the list in wait. + * + * In all cases, the tx_list will be updated so the head of the tx_list is + * the list of descriptors that have yet to be transmitted. + * + * The intent of this call is to provide a more efficient + * way of submitting multiple packets to SDMA while holding the tail + * side locking. + * + * Return: + * 0 - Success, -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring + * (wait == NULL) + * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state + */ +int sdma_send_txlist(struct sdma_engine *sde, + struct iowait *wait, + struct list_head *tx_list) +{ + struct sdma_txreq *tx, *tx_next; + int ret = 0; + unsigned long flags; + u16 tail = INVALID_TAIL; + int count = 0; + + spin_lock_irqsave(&sde->tail_lock, flags); +retry: + list_for_each_entry_safe(tx, tx_next, tx_list, list) { + tx->wait = wait; + if (unlikely(!__sdma_running(sde))) + goto unlock_noconn; + if (unlikely(tx->num_desc > sde->desc_avail)) + goto nodesc; + if (unlikely(tx->tlen)) { + ret = -EINVAL; + goto update_tail; + } + list_del_init(&tx->list); + tail = submit_tx(sde, tx); + count++; + if (tail != INVALID_TAIL && + (count & SDMA_TAIL_UPDATE_THRESH) == 0) { + sdma_update_tail(sde, tail); + tail = INVALID_TAIL; + } + } +update_tail: + if (wait) + atomic_add(count, &wait->sdma_busy); + if (tail != INVALID_TAIL) + sdma_update_tail(sde, tail); + spin_unlock_irqrestore(&sde->tail_lock, flags); + return ret; +unlock_noconn: + spin_lock(&sde->flushlist_lock); + list_for_each_entry_safe(tx, tx_next, tx_list, list) { + tx->wait = wait; + list_del_init(&tx->list); + if (wait) + atomic_inc(&wait->sdma_busy); + tx->next_descq_idx = 0; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + tx->sn = sde->tail_sn++; + trace_hfi1_sdma_in_sn(sde, tx->sn); +#endif + list_add_tail(&tx->list, &sde->flushlist); + if (wait) { + wait->tx_count++; + wait->count += tx->num_desc; + } + } + spin_unlock(&sde->flushlist_lock); + schedule_work(&sde->flush_worker); + ret = -ECOMM; + goto update_tail; +nodesc: + ret = sdma_check_progress(sde, wait, tx); + if (ret == -EAGAIN) { + ret = 0; + goto retry; + } + sde->descq_full_count++; + goto update_tail; +} + +static void sdma_process_event(struct sdma_engine *sde, + enum sdma_events event) +{ + unsigned long flags; + + spin_lock_irqsave(&sde->tail_lock, flags); + write_seqlock(&sde->head_lock); + + __sdma_process_event(sde, event); + + if (sde->state.current_state == sdma_state_s99_running) + sdma_desc_avail(sde, sdma_descq_freecnt(sde)); + + write_sequnlock(&sde->head_lock); + spin_unlock_irqrestore(&sde->tail_lock, flags); +} + +static void __sdma_process_event(struct sdma_engine *sde, + enum sdma_events event) +{ + struct sdma_state *ss = &sde->state; + int need_progress = 0; + + /* CONFIG SDMA temporary */ +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) [%s] %s\n", sde->this_idx, + sdma_state_names[ss->current_state], + sdma_event_names[event]); +#endif + + switch (ss->current_state) { + case sdma_state_s00_hw_down: + switch (event) { + case sdma_event_e00_go_hw_down: + break; + case sdma_event_e30_go_running: + /* + * If down, but running requested (usually result + * of link up, then we need to start up. + * This can happen when hw down is requested while + * bringing the link up with traffic active on + * 7220, e.g. */ + ss->go_s99_running = 1; + /* fall through and start dma engine */ + case sdma_event_e10_go_hw_start: + /* This reference means the state machine is started */ + sdma_get(&sde->state); + sdma_set_state(sde, + sdma_state_s10_hw_start_up_halt_wait); + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e40_sw_cleaned: + sdma_sw_tear_down(sde); + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s10_hw_start_up_halt_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + sdma_sw_tear_down(sde); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + sdma_set_state(sde, + sdma_state_s15_hw_start_up_clean_wait); + sdma_start_hw_clean_up(sde); + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + sdma_start_err_halt_wait(sde); + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s15_hw_start_up_clean_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + sdma_sw_tear_down(sde); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + sdma_hw_start_up(sde); + sdma_set_state(sde, ss->go_s99_running ? + sdma_state_s99_running : + sdma_state_s20_idle); + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s20_idle: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + sdma_sw_tear_down(sde); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + sdma_set_state(sde, sdma_state_s99_running); + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + sdma_set_state(sde, sdma_state_s50_hw_halt_wait); + sdma_start_err_halt_wait(sde); + break; + case sdma_event_e70_go_idle: + break; + case sdma_event_e85_link_down: + /* fall through */ + case sdma_event_e80_hw_freeze: + sdma_set_state(sde, sdma_state_s80_hw_freeze); + atomic_dec(&sde->dd->sdma_unfreeze_count); + wake_up_interruptible(&sde->dd->sdma_unfreeze_wq); + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s30_sw_clean_up_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + sdma_set_state(sde, sdma_state_s40_hw_clean_up_wait); + sdma_start_hw_clean_up(sde); + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + ss->go_s99_running = 0; + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s40_hw_clean_up_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + sdma_start_sw_clean_up(sde); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + sdma_hw_start_up(sde); + sdma_set_state(sde, ss->go_s99_running ? + sdma_state_s99_running : + sdma_state_s20_idle); + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + ss->go_s99_running = 0; + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s50_hw_halt_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + sdma_start_sw_clean_up(sde); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait); + sdma_start_sw_clean_up(sde); + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + sdma_start_err_halt_wait(sde); + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + ss->go_s99_running = 0; + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s60_idle_halt_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + sdma_start_sw_clean_up(sde); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait); + sdma_start_sw_clean_up(sde); + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + sdma_start_err_halt_wait(sde); + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s80_hw_freeze: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + sdma_start_sw_clean_up(sde); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + sdma_set_state(sde, sdma_state_s82_freeze_sw_clean); + sdma_start_sw_clean_up(sde); + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s82_freeze_sw_clean: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + sdma_start_sw_clean_up(sde); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + /* notify caller this engine is done cleaning */ + atomic_dec(&sde->dd->sdma_unfreeze_count); + wake_up_interruptible(&sde->dd->sdma_unfreeze_wq); + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + sdma_hw_start_up(sde); + sdma_set_state(sde, ss->go_s99_running ? + sdma_state_s99_running : + sdma_state_s20_idle); + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s99_running: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + sdma_start_sw_clean_up(sde); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + need_progress = 1; + sdma_err_progress_check_schedule(sde); + case sdma_event_e90_sw_halted: + /* + * SW initiated halt does not perform engines + * progress check + */ + sdma_set_state(sde, sdma_state_s50_hw_halt_wait); + sdma_start_err_halt_wait(sde); + break; + case sdma_event_e70_go_idle: + sdma_set_state(sde, sdma_state_s60_idle_halt_wait); + break; + case sdma_event_e85_link_down: + ss->go_s99_running = 0; + /* fall through */ + case sdma_event_e80_hw_freeze: + sdma_set_state(sde, sdma_state_s80_hw_freeze); + atomic_dec(&sde->dd->sdma_unfreeze_count); + wake_up_interruptible(&sde->dd->sdma_unfreeze_wq); + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + } + break; + } + + ss->last_event = event; + if (need_progress) + sdma_make_progress(sde, 0); +} + +/* + * _extend_sdma_tx_descs() - helper to extend txreq + * + * This is called once the initial nominal allocation + * of descriptors in the sdma_txreq is exhausted. + * + * The code will bump the allocation up to the max + * of MAX_DESC (64) descriptors. There doesn't seem + * much point in an interim step. + * + */ +int _extend_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx) +{ + int i; + + tx->descp = kmalloc_array( + MAX_DESC, + sizeof(struct sdma_desc), + GFP_ATOMIC); + if (!tx->descp) + return -ENOMEM; + tx->desc_limit = MAX_DESC; + /* copy ones already built */ + for (i = 0; i < tx->num_desc; i++) + tx->descp[i] = tx->descs[i]; + return 0; +} + +/* Update sdes when the lmc changes */ +void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid) +{ + struct sdma_engine *sde; + int i; + u64 sreg; + + sreg = ((mask & SD(CHECK_SLID_MASK_MASK)) << + SD(CHECK_SLID_MASK_SHIFT)) | + (((lid & mask) & SD(CHECK_SLID_VALUE_MASK)) << + SD(CHECK_SLID_VALUE_SHIFT)); + + for (i = 0; i < dd->num_sdma; i++) { + hfi1_cdbg(LINKVERB, "SendDmaEngine[%d].SLID_CHECK = 0x%x", + i, (u32)sreg); + sde = &dd->per_sdma[i]; + write_sde_csr(sde, SD(CHECK_SLID), sreg); + } +} + +/* tx not dword sized - pad */ +int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx) +{ + int rval = 0; + + if ((unlikely(tx->num_desc == tx->desc_limit))) { + rval = _extend_sdma_tx_descs(dd, tx); + if (rval) + return rval; + } + /* finish the one just added */ + tx->num_desc++; + make_tx_sdma_desc( + tx, + SDMA_MAP_NONE, + dd->sdma_pad_phys, + sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1))); + _sdma_close_tx(dd, tx); + return rval; +} + +/* + * Add ahg to the sdma_txreq + * + * The logic will consume up to 3 + * descriptors at the beginning of + * sdma_txreq. + */ +void _sdma_txreq_ahgadd( + struct sdma_txreq *tx, + u8 num_ahg, + u8 ahg_entry, + u32 *ahg, + u8 ahg_hlen) +{ + u32 i, shift = 0, desc = 0; + u8 mode; + + WARN_ON_ONCE(num_ahg > 9 || (ahg_hlen & 3) || ahg_hlen == 4); + /* compute mode */ + if (num_ahg == 1) + mode = SDMA_AHG_APPLY_UPDATE1; + else if (num_ahg <= 5) + mode = SDMA_AHG_APPLY_UPDATE2; + else + mode = SDMA_AHG_APPLY_UPDATE3; + tx->num_desc++; + /* initialize to consumed descriptors to zero */ + switch (mode) { + case SDMA_AHG_APPLY_UPDATE3: + tx->num_desc++; + tx->descs[2].qw[0] = 0; + tx->descs[2].qw[1] = 0; + /* FALLTHROUGH */ + case SDMA_AHG_APPLY_UPDATE2: + tx->num_desc++; + tx->descs[1].qw[0] = 0; + tx->descs[1].qw[1] = 0; + break; + } + ahg_hlen >>= 2; + tx->descs[0].qw[1] |= + (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK) + << SDMA_DESC1_HEADER_INDEX_SHIFT) | + (((u64)ahg_hlen & SDMA_DESC1_HEADER_DWS_MASK) + << SDMA_DESC1_HEADER_DWS_SHIFT) | + (((u64)mode & SDMA_DESC1_HEADER_MODE_MASK) + << SDMA_DESC1_HEADER_MODE_SHIFT) | + (((u64)ahg[0] & SDMA_DESC1_HEADER_UPDATE1_MASK) + << SDMA_DESC1_HEADER_UPDATE1_SHIFT); + for (i = 0; i < (num_ahg - 1); i++) { + if (!shift && !(i & 2)) + desc++; + tx->descs[desc].qw[!!(i & 2)] |= + (((u64)ahg[i + 1]) + << shift); + shift = (shift + 32) & 63; + } +} + +/** + * sdma_ahg_alloc - allocate an AHG entry + * @sde: engine to allocate from + * + * Return: + * 0-31 when successful, -EOPNOTSUPP if AHG is not enabled, + * -ENOSPC if an entry is not available + */ +int sdma_ahg_alloc(struct sdma_engine *sde) +{ + int nr; + int oldbit; + + if (!sde) { + trace_hfi1_ahg_allocate(sde, -EINVAL); + return -EINVAL; + } + while (1) { + nr = ffz(ACCESS_ONCE(sde->ahg_bits)); + if (nr > 31) { + trace_hfi1_ahg_allocate(sde, -ENOSPC); + return -ENOSPC; + } + oldbit = test_and_set_bit(nr, &sde->ahg_bits); + if (!oldbit) + break; + cpu_relax(); + } + trace_hfi1_ahg_allocate(sde, nr); + return nr; +} + +/** + * sdma_ahg_free - free an AHG entry + * @sde: engine to return AHG entry + * @ahg_index: index to free + * + * This routine frees the indicate AHG entry. + */ +void sdma_ahg_free(struct sdma_engine *sde, int ahg_index) +{ + if (!sde) + return; + trace_hfi1_ahg_deallocate(sde, ahg_index); + if (ahg_index < 0 || ahg_index > 31) + return; + clear_bit(ahg_index, &sde->ahg_bits); +} + +/* + * SPC freeze handling for SDMA engines. Called when the driver knows + * the SPC is going into a freeze but before the freeze is fully + * settled. Generally an error interrupt. + * + * This event will pull the engine out of running so no more entries can be + * added to the engine's queue. + */ +void sdma_freeze_notify(struct hfi1_devdata *dd, int link_down) +{ + int i; + enum sdma_events event = link_down ? sdma_event_e85_link_down : + sdma_event_e80_hw_freeze; + + /* set up the wait but do not wait here */ + atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma); + + /* tell all engines to stop running and wait */ + for (i = 0; i < dd->num_sdma; i++) + sdma_process_event(&dd->per_sdma[i], event); + + /* sdma_freeze() will wait for all engines to have stopped */ +} + +/* + * SPC freeze handling for SDMA engines. Called when the driver knows + * the SPC is fully frozen. + */ +void sdma_freeze(struct hfi1_devdata *dd) +{ + int i; + int ret; + + /* + * Make sure all engines have moved out of the running state before + * continuing. + */ + ret = wait_event_interruptible(dd->sdma_unfreeze_wq, + atomic_read(&dd->sdma_unfreeze_count) <= 0); + /* interrupted or count is negative, then unloading - just exit */ + if (ret || atomic_read(&dd->sdma_unfreeze_count) < 0) + return; + + /* set up the count for the next wait */ + atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma); + + /* tell all engines that the SPC is frozen, they can start cleaning */ + for (i = 0; i < dd->num_sdma; i++) + sdma_process_event(&dd->per_sdma[i], sdma_event_e81_hw_frozen); + + /* + * Wait for everyone to finish software clean before exiting. The + * software clean will read engine CSRs, so must be completed before + * the next step, which will clear the engine CSRs. + */ + (void) wait_event_interruptible(dd->sdma_unfreeze_wq, + atomic_read(&dd->sdma_unfreeze_count) <= 0); + /* no need to check results - done no matter what */ +} + +/* + * SPC freeze handling for the SDMA engines. Called after the SPC is unfrozen. + * + * The SPC freeze acts like a SDMA halt and a hardware clean combined. All + * that is left is a software clean. We could do it after the SPC is fully + * frozen, but then we'd have to add another state to wait for the unfreeze. + * Instead, just defer the software clean until the unfreeze step. + */ +void sdma_unfreeze(struct hfi1_devdata *dd) +{ + int i; + + /* tell all engines start freeze clean up */ + for (i = 0; i < dd->num_sdma; i++) + sdma_process_event(&dd->per_sdma[i], + sdma_event_e82_hw_unfreeze); +} + +/** + * _sdma_engine_progress_schedule() - schedule progress on engine + * @sde: sdma_engine to schedule progress + * + */ +void _sdma_engine_progress_schedule( + struct sdma_engine *sde) +{ + trace_hfi1_sdma_engine_progress(sde, sde->progress_mask); + /* assume we have selected a good cpu */ + write_csr(sde->dd, + CCE_INT_FORCE + (8*(IS_SDMA_START/64)), sde->progress_mask); +} diff --git a/drivers/staging/rdma/hfi1/sdma.h b/drivers/staging/rdma/hfi1/sdma.h new file mode 100644 index 000000000..496086903 --- /dev/null +++ b/drivers/staging/rdma/hfi1/sdma.h @@ -0,0 +1,1123 @@ +#ifndef _HFI1_SDMA_H +#define _HFI1_SDMA_H +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include + +#include "hfi.h" +#include "verbs.h" + +/* increased for AHG */ +#define NUM_DESC 6 +/* Hardware limit */ +#define MAX_DESC 64 +/* Hardware limit for SDMA packet size */ +#define MAX_SDMA_PKT_SIZE ((16 * 1024) - 1) + + +#define SDMA_TXREQ_S_OK 0 +#define SDMA_TXREQ_S_SENDERROR 1 +#define SDMA_TXREQ_S_ABORTED 2 +#define SDMA_TXREQ_S_SHUTDOWN 3 + +/* flags bits */ +#define SDMA_TXREQ_F_URGENT 0x0001 +#define SDMA_TXREQ_F_AHG_COPY 0x0002 +#define SDMA_TXREQ_F_USE_AHG 0x0004 + +#define SDMA_MAP_NONE 0 +#define SDMA_MAP_SINGLE 1 +#define SDMA_MAP_PAGE 2 + +#define SDMA_AHG_VALUE_MASK 0xffff +#define SDMA_AHG_VALUE_SHIFT 0 +#define SDMA_AHG_INDEX_MASK 0xf +#define SDMA_AHG_INDEX_SHIFT 16 +#define SDMA_AHG_FIELD_LEN_MASK 0xf +#define SDMA_AHG_FIELD_LEN_SHIFT 20 +#define SDMA_AHG_FIELD_START_MASK 0x1f +#define SDMA_AHG_FIELD_START_SHIFT 24 +#define SDMA_AHG_UPDATE_ENABLE_MASK 0x1 +#define SDMA_AHG_UPDATE_ENABLE_SHIFT 31 + +/* AHG modes */ + +/* + * Be aware the ordering and values + * for SDMA_AHG_APPLY_UPDATE[123] + * are assumed in generating a skip + * count in submit_tx() in sdma.c + */ +#define SDMA_AHG_NO_AHG 0 +#define SDMA_AHG_COPY 1 +#define SDMA_AHG_APPLY_UPDATE1 2 +#define SDMA_AHG_APPLY_UPDATE2 3 +#define SDMA_AHG_APPLY_UPDATE3 4 + +/* + * Bits defined in the send DMA descriptor. + */ +#define SDMA_DESC0_FIRST_DESC_FLAG (1ULL << 63) +#define SDMA_DESC0_LAST_DESC_FLAG (1ULL << 62) +#define SDMA_DESC0_BYTE_COUNT_SHIFT 48 +#define SDMA_DESC0_BYTE_COUNT_WIDTH 14 +#define SDMA_DESC0_BYTE_COUNT_MASK \ + ((1ULL << SDMA_DESC0_BYTE_COUNT_WIDTH) - 1) +#define SDMA_DESC0_BYTE_COUNT_SMASK \ + (SDMA_DESC0_BYTE_COUNT_MASK << SDMA_DESC0_BYTE_COUNT_SHIFT) +#define SDMA_DESC0_PHY_ADDR_SHIFT 0 +#define SDMA_DESC0_PHY_ADDR_WIDTH 48 +#define SDMA_DESC0_PHY_ADDR_MASK \ + ((1ULL << SDMA_DESC0_PHY_ADDR_WIDTH) - 1) +#define SDMA_DESC0_PHY_ADDR_SMASK \ + (SDMA_DESC0_PHY_ADDR_MASK << SDMA_DESC0_PHY_ADDR_SHIFT) + +#define SDMA_DESC1_HEADER_UPDATE1_SHIFT 32 +#define SDMA_DESC1_HEADER_UPDATE1_WIDTH 32 +#define SDMA_DESC1_HEADER_UPDATE1_MASK \ + ((1ULL << SDMA_DESC1_HEADER_UPDATE1_WIDTH) - 1) +#define SDMA_DESC1_HEADER_UPDATE1_SMASK \ + (SDMA_DESC1_HEADER_UPDATE1_MASK << SDMA_DESC1_HEADER_UPDATE1_SHIFT) +#define SDMA_DESC1_HEADER_MODE_SHIFT 13 +#define SDMA_DESC1_HEADER_MODE_WIDTH 3 +#define SDMA_DESC1_HEADER_MODE_MASK \ + ((1ULL << SDMA_DESC1_HEADER_MODE_WIDTH) - 1) +#define SDMA_DESC1_HEADER_MODE_SMASK \ + (SDMA_DESC1_HEADER_MODE_MASK << SDMA_DESC1_HEADER_MODE_SHIFT) +#define SDMA_DESC1_HEADER_INDEX_SHIFT 8 +#define SDMA_DESC1_HEADER_INDEX_WIDTH 5 +#define SDMA_DESC1_HEADER_INDEX_MASK \ + ((1ULL << SDMA_DESC1_HEADER_INDEX_WIDTH) - 1) +#define SDMA_DESC1_HEADER_INDEX_SMASK \ + (SDMA_DESC1_HEADER_INDEX_MASK << SDMA_DESC1_HEADER_INDEX_SHIFT) +#define SDMA_DESC1_HEADER_DWS_SHIFT 4 +#define SDMA_DESC1_HEADER_DWS_WIDTH 4 +#define SDMA_DESC1_HEADER_DWS_MASK \ + ((1ULL << SDMA_DESC1_HEADER_DWS_WIDTH) - 1) +#define SDMA_DESC1_HEADER_DWS_SMASK \ + (SDMA_DESC1_HEADER_DWS_MASK << SDMA_DESC1_HEADER_DWS_SHIFT) +#define SDMA_DESC1_GENERATION_SHIFT 2 +#define SDMA_DESC1_GENERATION_WIDTH 2 +#define SDMA_DESC1_GENERATION_MASK \ + ((1ULL << SDMA_DESC1_GENERATION_WIDTH) - 1) +#define SDMA_DESC1_GENERATION_SMASK \ + (SDMA_DESC1_GENERATION_MASK << SDMA_DESC1_GENERATION_SHIFT) +#define SDMA_DESC1_INT_REQ_FLAG (1ULL << 1) +#define SDMA_DESC1_HEAD_TO_HOST_FLAG (1ULL << 0) + +enum sdma_states { + sdma_state_s00_hw_down, + sdma_state_s10_hw_start_up_halt_wait, + sdma_state_s15_hw_start_up_clean_wait, + sdma_state_s20_idle, + sdma_state_s30_sw_clean_up_wait, + sdma_state_s40_hw_clean_up_wait, + sdma_state_s50_hw_halt_wait, + sdma_state_s60_idle_halt_wait, + sdma_state_s80_hw_freeze, + sdma_state_s82_freeze_sw_clean, + sdma_state_s99_running, +}; + +enum sdma_events { + sdma_event_e00_go_hw_down, + sdma_event_e10_go_hw_start, + sdma_event_e15_hw_halt_done, + sdma_event_e25_hw_clean_up_done, + sdma_event_e30_go_running, + sdma_event_e40_sw_cleaned, + sdma_event_e50_hw_cleaned, + sdma_event_e60_hw_halted, + sdma_event_e70_go_idle, + sdma_event_e80_hw_freeze, + sdma_event_e81_hw_frozen, + sdma_event_e82_hw_unfreeze, + sdma_event_e85_link_down, + sdma_event_e90_sw_halted, +}; + +struct sdma_set_state_action { + unsigned op_enable:1; + unsigned op_intenable:1; + unsigned op_halt:1; + unsigned op_cleanup:1; + unsigned go_s99_running_tofalse:1; + unsigned go_s99_running_totrue:1; +}; + +struct sdma_state { + struct kref kref; + struct completion comp; + enum sdma_states current_state; + unsigned current_op; + unsigned go_s99_running; + /* debugging/development */ + enum sdma_states previous_state; + unsigned previous_op; + enum sdma_events last_event; +}; + +/** + * DOC: sdma exported routines + * + * These sdma routines fit into three categories: + * - The SDMA API for building and submitting packets + * to the ring + * + * - Initialization and tear down routines to buildup + * and tear down SDMA + * + * - ISR entrances to handle interrupts, state changes + * and errors + */ + +/** + * DOC: sdma PSM/verbs API + * + * The sdma API is designed to be used by both PSM + * and verbs to supply packets to the SDMA ring. + * + * The usage of the API is as follows: + * + * Embed a struct iowait in the QP or + * PQ. The iowait should be initialized with a + * call to iowait_init(). + * + * The user of the API should create an allocation method + * for their version of the txreq. slabs, pre-allocated lists, + * and dma pools can be used. Once the user's overload of + * the sdma_txreq has been allocated, the sdma_txreq member + * must be initialized with sdma_txinit() or sdma_txinit_ahg(). + * + * The txreq must be declared with the sdma_txreq first. + * + * The tx request, once initialized, is manipulated with calls to + * sdma_txadd_daddr(), sdma_txadd_page(), or sdma_txadd_kvaddr() + * for each disjoint memory location. It is the user's responsibility + * to understand the packet boundaries and page boundaries to do the + * appropriate number of sdma_txadd_* calls.. The user + * must be prepared to deal with failures from these routines due to + * either memory allocation or dma_mapping failures. + * + * The mapping specifics for each memory location are recorded + * in the tx. Memory locations added with sdma_txadd_page() + * and sdma_txadd_kvaddr() are automatically mapped when added + * to the tx and nmapped as part of the progress processing in the + * SDMA interrupt handling. + * + * sdma_txadd_daddr() is used to add an dma_addr_t memory to the + * tx. An example of a use case would be a pre-allocated + * set of headers allocated via dma_pool_alloc() or + * dma_alloc_coherent(). For these memory locations, it + * is the responsibility of the user to handle that unmapping. + * (This would usually be at an unload or job termination.) + * + * The routine sdma_send_txreq() is used to submit + * a tx to the ring after the appropriate number of + * sdma_txadd_* have been done. + * + * If it is desired to send a burst of sdma_txreqs, sdma_send_txlist() + * can be used to submit a list of packets. + * + * The user is free to use the link overhead in the struct sdma_txreq as + * long as the tx isn't in flight. + * + * The extreme degenerate case of the number of descriptors + * exceeding the ring size is automatically handled as + * memory locations are added. An overflow of the descriptor + * array that is part of the sdma_txreq is also automatically + * handled. + * + */ + +/** + * DOC: Infrastructure calls + * + * sdma_init() is used to initialize data structures and + * CSRs for the desired number of SDMA engines. + * + * sdma_start() is used to kick the SDMA engines initialized + * with sdma_init(). Interrupts must be enabled at this + * point since aspects of the state machine are interrupt + * driven. + * + * sdma_engine_error() and sdma_engine_interrupt() are + * entrances for interrupts. + * + * sdma_map_init() is for the management of the mapping + * table when the number of vls is changed. + * + */ + +/* + * struct hw_sdma_desc - raw 128 bit SDMA descriptor + * + * This is the raw descriptor in the SDMA ring + */ +struct hw_sdma_desc { + /* private: don't use directly */ + __le64 qw[2]; +}; + +/* + * struct sdma_desc - canonical fragment descriptor + * + * This is the descriptor carried in the tx request + * corresponding to each fragment. + * + */ +struct sdma_desc { + /* private: don't use directly */ + u64 qw[2]; +}; + +struct sdma_txreq; +typedef void (*callback_t)(struct sdma_txreq *, int, int); + +/** + * struct sdma_txreq - the sdma_txreq structure (one per packet) + * @list: for use by user and by queuing for wait + * + * This is the representation of a packet which consists of some + * number of fragments. Storage is provided to within the structure. + * for all fragments. + * + * The storage for the descriptors are automatically extended as needed + * when the currently allocation is exceeded. + * + * The user (Verbs or PSM) may overload this structure with fields + * specific to their use by putting this struct first in their struct. + * The method of allocation of the overloaded structure is user dependent + * + * The list is the only public field in the structure. + * + */ + +struct sdma_txreq { + struct list_head list; + /* private: */ + struct sdma_desc *descp; + /* private: */ + void *coalesce_buf; + /* private: */ + struct iowait *wait; + /* private: */ + callback_t complete; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + u64 sn; +#endif + /* private: - used in coalesce/pad processing */ + u16 packet_len; + /* private: - down-counted to trigger last */ + u16 tlen; + /* private: flags */ + u16 flags; + /* private: */ + u16 num_desc; + /* private: */ + u16 desc_limit; + /* private: */ + u16 next_descq_idx; + /* private: */ + struct sdma_desc descs[NUM_DESC]; +}; + +struct verbs_txreq { + struct hfi1_pio_header phdr; + struct sdma_txreq txreq; + struct hfi1_qp *qp; + struct hfi1_swqe *wqe; + struct hfi1_mregion *mr; + struct hfi1_sge_state *ss; + struct sdma_engine *sde; + u16 hdr_dwords; + u16 hdr_inx; +}; + +/** + * struct sdma_engine - Data pertaining to each SDMA engine. + * @dd: a back-pointer to the device data + * @ppd: per port back-pointer + * @imask: mask for irq manipulation + * @idle_mask: mask for determining if an interrupt is due to sdma_idle + * + * This structure has the state for each sdma_engine. + * + * Accessing to non public fields are not supported + * since the private members are subject to change. + */ +struct sdma_engine { + /* read mostly */ + struct hfi1_devdata *dd; + struct hfi1_pportdata *ppd; + /* private: */ + void __iomem *tail_csr; + u64 imask; /* clear interrupt mask */ + u64 idle_mask; + u64 progress_mask; + /* private: */ + struct workqueue_struct *wq; + /* private: */ + volatile __le64 *head_dma; /* DMA'ed by chip */ + /* private: */ + dma_addr_t head_phys; + /* private: */ + struct hw_sdma_desc *descq; + /* private: */ + unsigned descq_full_count; + struct sdma_txreq **tx_ring; + /* private: */ + dma_addr_t descq_phys; + /* private */ + u32 sdma_mask; + /* private */ + struct sdma_state state; + /* private: */ + u8 sdma_shift; + /* private: */ + u8 this_idx; /* zero relative engine */ + /* protect changes to senddmactrl shadow */ + spinlock_t senddmactrl_lock; + /* private: */ + u64 p_senddmactrl; /* shadow per-engine SendDmaCtrl */ + + /* read/write using tail_lock */ + spinlock_t tail_lock ____cacheline_aligned_in_smp; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + /* private: */ + u64 tail_sn; +#endif + /* private: */ + u32 descq_tail; + /* private: */ + unsigned long ahg_bits; + /* private: */ + u16 desc_avail; + /* private: */ + u16 tx_tail; + /* private: */ + u16 descq_cnt; + + /* read/write using head_lock */ + /* private: */ + seqlock_t head_lock ____cacheline_aligned_in_smp; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + /* private: */ + u64 head_sn; +#endif + /* private: */ + u32 descq_head; + /* private: */ + u16 tx_head; + /* private: */ + u64 last_status; + + /* private: */ + struct list_head dmawait; + + /* CONFIG SDMA for now, just blindly duplicate */ + /* private: */ + struct tasklet_struct sdma_hw_clean_up_task + ____cacheline_aligned_in_smp; + + /* private: */ + struct tasklet_struct sdma_sw_clean_up_task + ____cacheline_aligned_in_smp; + /* private: */ + struct work_struct err_halt_worker; + /* private */ + struct timer_list err_progress_check_timer; + u32 progress_check_head; + /* private: */ + struct work_struct flush_worker; + spinlock_t flushlist_lock; + /* private: */ + struct list_head flushlist; +}; + + +int sdma_init(struct hfi1_devdata *dd, u8 port); +void sdma_start(struct hfi1_devdata *dd); +void sdma_exit(struct hfi1_devdata *dd); +void sdma_all_running(struct hfi1_devdata *dd); +void sdma_all_idle(struct hfi1_devdata *dd); +void sdma_freeze_notify(struct hfi1_devdata *dd, int go_idle); +void sdma_freeze(struct hfi1_devdata *dd); +void sdma_unfreeze(struct hfi1_devdata *dd); +void sdma_wait(struct hfi1_devdata *dd); + +/** + * sdma_empty() - idle engine test + * @engine: sdma engine + * + * Currently used by verbs as a latency optimization. + * + * Return: + * 1 - empty, 0 - non-empty + */ +static inline int sdma_empty(struct sdma_engine *sde) +{ + return sde->descq_tail == sde->descq_head; +} + +static inline u16 sdma_descq_freecnt(struct sdma_engine *sde) +{ + return sde->descq_cnt - + (sde->descq_tail - + ACCESS_ONCE(sde->descq_head)) - 1; +} + +static inline u16 sdma_descq_inprocess(struct sdma_engine *sde) +{ + return sde->descq_cnt - sdma_descq_freecnt(sde); +} + +/* + * Either head_lock or tail lock required to see + * a steady state. + */ +static inline int __sdma_running(struct sdma_engine *engine) +{ + return engine->state.current_state == sdma_state_s99_running; +} + + +/** + * sdma_running() - state suitability test + * @engine: sdma engine + * + * sdma_running probes the internal state to determine if it is suitable + * for submitting packets. + * + * Return: + * 1 - ok to submit, 0 - not ok to submit + * + */ +static inline int sdma_running(struct sdma_engine *engine) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&engine->tail_lock, flags); + ret = __sdma_running(engine); + spin_unlock_irqrestore(&engine->tail_lock, flags); + return ret; +} + +void _sdma_txreq_ahgadd( + struct sdma_txreq *tx, + u8 num_ahg, + u8 ahg_entry, + u32 *ahg, + u8 ahg_hlen); + + +/** + * sdma_txinit_ahg() - initialize an sdma_txreq struct with AHG + * @tx: tx request to initialize + * @flags: flags to key last descriptor additions + * @tlen: total packet length (pbc + headers + data) + * @ahg_entry: ahg entry to use (0 - 31) + * @num_ahg: ahg descriptor for first descriptor (0 - 9) + * @ahg: array of AHG descriptors (up to 9 entries) + * @ahg_hlen: number of bytes from ASIC entry to use + * @cb: callback + * + * The allocation of the sdma_txreq and it enclosing structure is user + * dependent. This routine must be called to initialize the user independent + * fields. + * + * The currently supported flags are SDMA_TXREQ_F_URGENT, + * SDMA_TXREQ_F_AHG_COPY, and SDMA_TXREQ_F_USE_AHG. + * + * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the + * completion is desired as soon as possible. + * + * SDMA_TXREQ_F_AHG_COPY causes the header in the first descriptor to be + * copied to chip entry. SDMA_TXREQ_F_USE_AHG causes the code to add in + * the AHG descriptors into the first 1 to 3 descriptors. + * + * Completions of submitted requests can be gotten on selected + * txreqs by giving a completion routine callback to sdma_txinit() or + * sdma_txinit_ahg(). The environment in which the callback runs + * can be from an ISR, a tasklet, or a thread, so no sleeping + * kernel routines can be used. Aspects of the sdma ring may + * be locked so care should be taken with locking. + * + * The callback pointer can be NULL to avoid any callback for the packet + * being submitted. The callback will be provided this tx, a status, and a flag. + * + * The status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR, + * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN. + * + * The flag, if the is the iowait had been used, indicates the iowait + * sdma_busy count has reached zero. + * + * user data portion of tlen should be precise. The sdma_txadd_* entrances + * will pad with a descriptor references 1 - 3 bytes when the number of bytes + * specified in tlen have been supplied to the sdma_txreq. + * + * ahg_hlen is used to determine the number of on-chip entry bytes to + * use as the header. This is for cases where the stored header is + * larger than the header to be used in a packet. This is typical + * for verbs where an RDMA_WRITE_FIRST is larger than the packet in + * and RDMA_WRITE_MIDDLE. + * + */ +static inline int sdma_txinit_ahg( + struct sdma_txreq *tx, + u16 flags, + u16 tlen, + u8 ahg_entry, + u8 num_ahg, + u32 *ahg, + u8 ahg_hlen, + void (*cb)(struct sdma_txreq *, int, int)) +{ + if (tlen == 0) + return -ENODATA; + if (tlen > MAX_SDMA_PKT_SIZE) + return -EMSGSIZE; + tx->desc_limit = ARRAY_SIZE(tx->descs); + tx->descp = &tx->descs[0]; + INIT_LIST_HEAD(&tx->list); + tx->num_desc = 0; + tx->flags = flags; + tx->complete = cb; + tx->coalesce_buf = NULL; + tx->wait = NULL; + tx->tlen = tx->packet_len = tlen; + tx->descs[0].qw[0] = SDMA_DESC0_FIRST_DESC_FLAG; + tx->descs[0].qw[1] = 0; + if (flags & SDMA_TXREQ_F_AHG_COPY) + tx->descs[0].qw[1] |= + (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK) + << SDMA_DESC1_HEADER_INDEX_SHIFT) | + (((u64)SDMA_AHG_COPY & SDMA_DESC1_HEADER_MODE_MASK) + << SDMA_DESC1_HEADER_MODE_SHIFT); + else if (flags & SDMA_TXREQ_F_USE_AHG && num_ahg) + _sdma_txreq_ahgadd(tx, num_ahg, ahg_entry, ahg, ahg_hlen); + return 0; +} + +/** + * sdma_txinit() - initialize an sdma_txreq struct (no AHG) + * @tx: tx request to initialize + * @flags: flags to key last descriptor additions + * @tlen: total packet length (pbc + headers + data) + * @cb: callback pointer + * + * The allocation of the sdma_txreq and it enclosing structure is user + * dependent. This routine must be called to initialize the user + * independent fields. + * + * The currently supported flags is SDMA_TXREQ_F_URGENT. + * + * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the + * completion is desired as soon as possible. + * + * Completions of submitted requests can be gotten on selected + * txreqs by giving a completion routine callback to sdma_txinit() or + * sdma_txinit_ahg(). The environment in which the callback runs + * can be from an ISR, a tasklet, or a thread, so no sleeping + * kernel routines can be used. The head size of the sdma ring may + * be locked so care should be taken with locking. + * + * The callback pointer can be NULL to avoid any callback for the packet + * being submitted. + * + * The callback, if non-NULL, will be provided this tx and a status. The + * status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR, + * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN. + * + */ +static inline int sdma_txinit( + struct sdma_txreq *tx, + u16 flags, + u16 tlen, + void (*cb)(struct sdma_txreq *, int, int)) +{ + return sdma_txinit_ahg(tx, flags, tlen, 0, 0, NULL, 0, cb); +} + +/* helpers - don't use */ +static inline int sdma_mapping_type(struct sdma_desc *d) +{ + return (d->qw[1] & SDMA_DESC1_GENERATION_SMASK) + >> SDMA_DESC1_GENERATION_SHIFT; +} + +static inline size_t sdma_mapping_len(struct sdma_desc *d) +{ + return (d->qw[0] & SDMA_DESC0_BYTE_COUNT_SMASK) + >> SDMA_DESC0_BYTE_COUNT_SHIFT; +} + +static inline dma_addr_t sdma_mapping_addr(struct sdma_desc *d) +{ + return (d->qw[0] & SDMA_DESC0_PHY_ADDR_SMASK) + >> SDMA_DESC0_PHY_ADDR_SHIFT; +} + +static inline void make_tx_sdma_desc( + struct sdma_txreq *tx, + int type, + dma_addr_t addr, + size_t len) +{ + struct sdma_desc *desc = &tx->descp[tx->num_desc]; + + if (!tx->num_desc) { + /* qw[0] zero; qw[1] first, ahg mode already in from init */ + desc->qw[1] |= ((u64)type & SDMA_DESC1_GENERATION_MASK) + << SDMA_DESC1_GENERATION_SHIFT; + } else { + desc->qw[0] = 0; + desc->qw[1] = ((u64)type & SDMA_DESC1_GENERATION_MASK) + << SDMA_DESC1_GENERATION_SHIFT; + } + desc->qw[0] |= (((u64)addr & SDMA_DESC0_PHY_ADDR_MASK) + << SDMA_DESC0_PHY_ADDR_SHIFT) | + (((u64)len & SDMA_DESC0_BYTE_COUNT_MASK) + << SDMA_DESC0_BYTE_COUNT_SHIFT); +} + +/* helper to extend txreq */ +int _extend_sdma_tx_descs(struct hfi1_devdata *, struct sdma_txreq *); +int _pad_sdma_tx_descs(struct hfi1_devdata *, struct sdma_txreq *); +void sdma_txclean(struct hfi1_devdata *, struct sdma_txreq *); + +/* helpers used by public routines */ +static inline void _sdma_close_tx(struct hfi1_devdata *dd, + struct sdma_txreq *tx) +{ + tx->descp[tx->num_desc].qw[0] |= + SDMA_DESC0_LAST_DESC_FLAG; + tx->descp[tx->num_desc].qw[1] |= + dd->default_desc1; + if (tx->flags & SDMA_TXREQ_F_URGENT) + tx->descp[tx->num_desc].qw[1] |= + (SDMA_DESC1_HEAD_TO_HOST_FLAG| + SDMA_DESC1_INT_REQ_FLAG); +} + +static inline int _sdma_txadd_daddr( + struct hfi1_devdata *dd, + int type, + struct sdma_txreq *tx, + dma_addr_t addr, + u16 len) +{ + int rval = 0; + + if ((unlikely(tx->num_desc == tx->desc_limit))) { + rval = _extend_sdma_tx_descs(dd, tx); + if (rval) + return rval; + } + make_tx_sdma_desc( + tx, + type, + addr, len); + WARN_ON(len > tx->tlen); + tx->tlen -= len; + /* special cases for last */ + if (!tx->tlen) { + if (tx->packet_len & (sizeof(u32) - 1)) + rval = _pad_sdma_tx_descs(dd, tx); + else + _sdma_close_tx(dd, tx); + } + tx->num_desc++; + return rval; +} + +/** + * sdma_txadd_page() - add a page to the sdma_txreq + * @dd: the device to use for mapping + * @tx: tx request to which the page is added + * @page: page to map + * @offset: offset within the page + * @len: length in bytes + * + * This is used to add a page/offset/length descriptor. + * + * The mapping/unmapping of the page/offset/len is automatically handled. + * + * Return: + * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't + * extend descriptor array or couldn't allocate coalesce + * buffer. + * + */ +static inline int sdma_txadd_page( + struct hfi1_devdata *dd, + struct sdma_txreq *tx, + struct page *page, + unsigned long offset, + u16 len) +{ + dma_addr_t addr = + dma_map_page( + &dd->pcidev->dev, + page, + offset, + len, + DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) { + sdma_txclean(dd, tx); + return -ENOSPC; + } + return _sdma_txadd_daddr( + dd, SDMA_MAP_PAGE, tx, addr, len); +} + +/** + * sdma_txadd_daddr() - add a dma address to the sdma_txreq + * @dd: the device to use for mapping + * @tx: sdma_txreq to which the page is added + * @addr: dma address mapped by caller + * @len: length in bytes + * + * This is used to add a descriptor for memory that is already dma mapped. + * + * In this case, there is no unmapping as part of the progress processing for + * this memory location. + * + * Return: + * 0 - success, -ENOMEM - couldn't extend descriptor array + */ + +static inline int sdma_txadd_daddr( + struct hfi1_devdata *dd, + struct sdma_txreq *tx, + dma_addr_t addr, + u16 len) +{ + return _sdma_txadd_daddr(dd, SDMA_MAP_NONE, tx, addr, len); +} + +/** + * sdma_txadd_kvaddr() - add a kernel virtual address to sdma_txreq + * @dd: the device to use for mapping + * @tx: sdma_txreq to which the page is added + * @kvaddr: the kernel virtual address + * @len: length in bytes + * + * This is used to add a descriptor referenced by the indicated kvaddr and + * len. + * + * The mapping/unmapping of the kvaddr and len is automatically handled. + * + * Return: + * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't extend + * descriptor array + */ +static inline int sdma_txadd_kvaddr( + struct hfi1_devdata *dd, + struct sdma_txreq *tx, + void *kvaddr, + u16 len) +{ + dma_addr_t addr = + dma_map_single( + &dd->pcidev->dev, + kvaddr, + len, + DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) { + sdma_txclean(dd, tx); + return -ENOSPC; + } + return _sdma_txadd_daddr( + dd, SDMA_MAP_SINGLE, tx, addr, len); +} + +struct iowait; + +int sdma_send_txreq(struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *tx); +int sdma_send_txlist(struct sdma_engine *sde, + struct iowait *wait, + struct list_head *tx_list); + +int sdma_ahg_alloc(struct sdma_engine *sde); +void sdma_ahg_free(struct sdma_engine *sde, int ahg_index); + +/** + * sdma_build_ahg - build ahg descriptor + * @data + * @dwindex + * @startbit + * @bits + * + * Build and return a 32 bit descriptor. + */ +static inline u32 sdma_build_ahg_descriptor( + u16 data, + u8 dwindex, + u8 startbit, + u8 bits) +{ + return (u32)(1UL << SDMA_AHG_UPDATE_ENABLE_SHIFT | + ((startbit & SDMA_AHG_FIELD_START_MASK) << + SDMA_AHG_FIELD_START_SHIFT) | + ((bits & SDMA_AHG_FIELD_LEN_MASK) << + SDMA_AHG_FIELD_LEN_SHIFT) | + ((dwindex & SDMA_AHG_INDEX_MASK) << + SDMA_AHG_INDEX_SHIFT) | + ((data & SDMA_AHG_VALUE_MASK) << + SDMA_AHG_VALUE_SHIFT)); +} + +/** + * sdma_progress - use seq number of detect head progress + * @sde: sdma_engine to check + * @seq: base seq count + * @tx: txreq for which we need to check descriptor availability + * + * This is used in the appropriate spot in the sleep routine + * to check for potential ring progress. This routine gets the + * seqcount before queuing the iowait structure for progress. + * + * If the seqcount indicates that progress needs to be checked, + * re-submission is detected by checking whether the descriptor + * queue has enough descriptor for the txreq. + */ +static inline unsigned sdma_progress(struct sdma_engine *sde, unsigned seq, + struct sdma_txreq *tx) +{ + if (read_seqretry(&sde->head_lock, seq)) { + sde->desc_avail = sdma_descq_freecnt(sde); + if (tx->num_desc > sde->desc_avail) + return 0; + return 1; + } + return 0; +} + +/** + * sdma_iowait_schedule() - initialize wait structure + * @sde: sdma_engine to schedule + * @wait: wait struct to schedule + * + * This function initializes the iowait + * structure embedded in the QP or PQ. + * + */ +static inline void sdma_iowait_schedule( + struct sdma_engine *sde, + struct iowait *wait) +{ + iowait_schedule(wait, sde->wq); +} + +/* for use by interrupt handling */ +void sdma_engine_error(struct sdma_engine *sde, u64 status); +void sdma_engine_interrupt(struct sdma_engine *sde, u64 status); + +/* + * + * The diagram below details the relationship of the mapping structures + * + * Since the mapping now allows for non-uniform engines per vl, the + * number of engines for a vl is either the vl_engines[vl] or + * a computation based on num_sdma/num_vls: + * + * For example: + * nactual = vl_engines ? vl_engines[vl] : num_sdma/num_vls + * + * n = roundup to next highest power of 2 using nactual + * + * In the case where there are num_sdma/num_vls doesn't divide + * evenly, the extras are added from the last vl downward. + * + * For the case where n > nactual, the engines are assigned + * in a round robin fashion wrapping back to the first engine + * for a particular vl. + * + * dd->sdma_map + * | sdma_map_elem[0] + * | +--------------------+ + * v | mask | + * sdma_vl_map |--------------------| + * +--------------------------+ | sde[0] -> eng 1 | + * | list (RCU) | |--------------------| + * |--------------------------| ->| sde[1] -> eng 2 | + * | mask | --/ |--------------------| + * |--------------------------| -/ | * | + * | actual_vls (max 8) | -/ |--------------------| + * |--------------------------| --/ | sde[n] -> eng n | + * | vls (max 8) | -/ +--------------------+ + * |--------------------------| --/ + * | map[0] |-/ + * |--------------------------| +--------------------+ + * | map[1] |--- | mask | + * |--------------------------| \---- |--------------------| + * | * | \-- | sde[0] -> eng 1+n | + * | * | \---- |--------------------| + * | * | \->| sde[1] -> eng 2+n | + * |--------------------------| |--------------------| + * | map[vls - 1] |- | * | + * +--------------------------+ \- |--------------------| + * \- | sde[m] -> eng m+n | + * \ +--------------------+ + * \- + * \ + * \- +--------------------+ + * \- | mask | + * \ |--------------------| + * \- | sde[0] -> eng 1+m+n| + * \- |--------------------| + * >| sde[1] -> eng 2+m+n| + * |--------------------| + * | * | + * |--------------------| + * | sde[o] -> eng o+m+n| + * +--------------------+ + * + */ + +/** + * struct sdma_map_elem - mapping for a vl + * @mask - selector mask + * @sde - array of engines for this vl + * + * The mask is used to "mod" the selector + * to produce index into the trailing + * array of sdes. + */ +struct sdma_map_elem { + u32 mask; + struct sdma_engine *sde[0]; +}; + +/** + * struct sdma_map_el - mapping for a vl + * @list - rcu head for free callback + * @mask - vl mask to "mod" the vl to produce an index to map array + * @actual_vls - number of vls + * @vls - number of vls rounded to next power of 2 + * @map - array of sdma_map_elem entries + * + * This is the parent mapping structure. The trailing + * members of the struct point to sdma_map_elem entries, which + * in turn point to an array of sde's for that vl. + */ +struct sdma_vl_map { + struct rcu_head list; + u32 mask; + u8 actual_vls; + u8 vls; + struct sdma_map_elem *map[0]; +}; + +int sdma_map_init( + struct hfi1_devdata *dd, + u8 port, + u8 num_vls, + u8 *vl_engines); + +/* slow path */ +void _sdma_engine_progress_schedule(struct sdma_engine *sde); + +/** + * sdma_engine_progress_schedule() - schedule progress on engine + * @sde: sdma_engine to schedule progress + * + * This is the fast path. + * + */ +static inline void sdma_engine_progress_schedule( + struct sdma_engine *sde) +{ + if (!sde || sdma_descq_inprocess(sde) < (sde->descq_cnt / 8)) + return; + _sdma_engine_progress_schedule(sde); +} + +struct sdma_engine *sdma_select_engine_sc( + struct hfi1_devdata *dd, + u32 selector, + u8 sc5); + +struct sdma_engine *sdma_select_engine_vl( + struct hfi1_devdata *dd, + u32 selector, + u8 vl); + +void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *); + +#ifdef CONFIG_SDMA_VERBOSITY +void sdma_dumpstate(struct sdma_engine *); +#endif +static inline char *slashstrip(char *s) +{ + char *r = s; + + while (*s) + if (*s++ == '/') + r = s; + return r; +} + +u16 sdma_get_descq_cnt(void); + +extern uint mod_num_sdma; + +void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid); + +#endif diff --git a/drivers/staging/rdma/hfi1/srq.c b/drivers/staging/rdma/hfi1/srq.c new file mode 100644 index 000000000..67786d417 --- /dev/null +++ b/drivers/staging/rdma/hfi1/srq.c @@ -0,0 +1,397 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include + +#include "verbs.h" + +/** + * hfi1_post_srq_receive - post a receive on a shared receive queue + * @ibsrq: the SRQ to post the receive on + * @wr: the list of work requests to post + * @bad_wr: A pointer to the first WR to cause a problem is put here + * + * This may be called from interrupt context. + */ +int hfi1_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct hfi1_srq *srq = to_isrq(ibsrq); + struct hfi1_rwq *wq; + unsigned long flags; + int ret; + + for (; wr; wr = wr->next) { + struct hfi1_rwqe *wqe; + u32 next; + int i; + + if ((unsigned) wr->num_sge > srq->rq.max_sge) { + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + + spin_lock_irqsave(&srq->rq.lock, flags); + wq = srq->rq.wq; + next = wq->head + 1; + if (next >= srq->rq.size) + next = 0; + if (next == wq->tail) { + spin_unlock_irqrestore(&srq->rq.lock, flags); + *bad_wr = wr; + ret = -ENOMEM; + goto bail; + } + + wqe = get_rwqe_ptr(&srq->rq, wq->head); + wqe->wr_id = wr->wr_id; + wqe->num_sge = wr->num_sge; + for (i = 0; i < wr->num_sge; i++) + wqe->sg_list[i] = wr->sg_list[i]; + /* Make sure queue entry is written before the head index. */ + smp_wmb(); + wq->head = next; + spin_unlock_irqrestore(&srq->rq.lock, flags); + } + ret = 0; + +bail: + return ret; +} + +/** + * hfi1_create_srq - create a shared receive queue + * @ibpd: the protection domain of the SRQ to create + * @srq_init_attr: the attributes of the SRQ + * @udata: data from libibverbs when creating a user SRQ + */ +struct ib_srq *hfi1_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata) +{ + struct hfi1_ibdev *dev = to_idev(ibpd->device); + struct hfi1_srq *srq; + u32 sz; + struct ib_srq *ret; + + if (srq_init_attr->srq_type != IB_SRQT_BASIC) { + ret = ERR_PTR(-ENOSYS); + goto done; + } + + if (srq_init_attr->attr.max_sge == 0 || + srq_init_attr->attr.max_sge > hfi1_max_srq_sges || + srq_init_attr->attr.max_wr == 0 || + srq_init_attr->attr.max_wr > hfi1_max_srq_wrs) { + ret = ERR_PTR(-EINVAL); + goto done; + } + + srq = kmalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) { + ret = ERR_PTR(-ENOMEM); + goto done; + } + + /* + * Need to use vmalloc() if we want to support large #s of entries. + */ + srq->rq.size = srq_init_attr->attr.max_wr + 1; + srq->rq.max_sge = srq_init_attr->attr.max_sge; + sz = sizeof(struct ib_sge) * srq->rq.max_sge + + sizeof(struct hfi1_rwqe); + srq->rq.wq = vmalloc_user(sizeof(struct hfi1_rwq) + srq->rq.size * sz); + if (!srq->rq.wq) { + ret = ERR_PTR(-ENOMEM); + goto bail_srq; + } + + /* + * Return the address of the RWQ as the offset to mmap. + * See hfi1_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + int err; + u32 s = sizeof(struct hfi1_rwq) + srq->rq.size * sz; + + srq->ip = + hfi1_create_mmap_info(dev, s, ibpd->uobject->context, + srq->rq.wq); + if (!srq->ip) { + ret = ERR_PTR(-ENOMEM); + goto bail_wq; + } + + err = ib_copy_to_udata(udata, &srq->ip->offset, + sizeof(srq->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } else + srq->ip = NULL; + + /* + * ib_create_srq() will initialize srq->ibsrq. + */ + spin_lock_init(&srq->rq.lock); + srq->rq.wq->head = 0; + srq->rq.wq->tail = 0; + srq->limit = srq_init_attr->attr.srq_limit; + + spin_lock(&dev->n_srqs_lock); + if (dev->n_srqs_allocated == hfi1_max_srqs) { + spin_unlock(&dev->n_srqs_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + dev->n_srqs_allocated++; + spin_unlock(&dev->n_srqs_lock); + + if (srq->ip) { + spin_lock_irq(&dev->pending_lock); + list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + ret = &srq->ibsrq; + goto done; + +bail_ip: + kfree(srq->ip); +bail_wq: + vfree(srq->rq.wq); +bail_srq: + kfree(srq); +done: + return ret; +} + +/** + * hfi1_modify_srq - modify a shared receive queue + * @ibsrq: the SRQ to modify + * @attr: the new attributes of the SRQ + * @attr_mask: indicates which attributes to modify + * @udata: user data for libibverbs.so + */ +int hfi1_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, + struct ib_udata *udata) +{ + struct hfi1_srq *srq = to_isrq(ibsrq); + struct hfi1_rwq *wq; + int ret = 0; + + if (attr_mask & IB_SRQ_MAX_WR) { + struct hfi1_rwq *owq; + struct hfi1_rwqe *p; + u32 sz, size, n, head, tail; + + /* Check that the requested sizes are below the limits. */ + if ((attr->max_wr > hfi1_max_srq_wrs) || + ((attr_mask & IB_SRQ_LIMIT) ? + attr->srq_limit : srq->limit) > attr->max_wr) { + ret = -EINVAL; + goto bail; + } + + sz = sizeof(struct hfi1_rwqe) + + srq->rq.max_sge * sizeof(struct ib_sge); + size = attr->max_wr + 1; + wq = vmalloc_user(sizeof(struct hfi1_rwq) + size * sz); + if (!wq) { + ret = -ENOMEM; + goto bail; + } + + /* Check that we can write the offset to mmap. */ + if (udata && udata->inlen >= sizeof(__u64)) { + __u64 offset_addr; + __u64 offset = 0; + + ret = ib_copy_from_udata(&offset_addr, udata, + sizeof(offset_addr)); + if (ret) + goto bail_free; + udata->outbuf = + (void __user *) (unsigned long) offset_addr; + ret = ib_copy_to_udata(udata, &offset, + sizeof(offset)); + if (ret) + goto bail_free; + } + + spin_lock_irq(&srq->rq.lock); + /* + * validate head and tail pointer values and compute + * the number of remaining WQEs. + */ + owq = srq->rq.wq; + head = owq->head; + tail = owq->tail; + if (head >= srq->rq.size || tail >= srq->rq.size) { + ret = -EINVAL; + goto bail_unlock; + } + n = head; + if (n < tail) + n += srq->rq.size - tail; + else + n -= tail; + if (size <= n) { + ret = -EINVAL; + goto bail_unlock; + } + n = 0; + p = wq->wq; + while (tail != head) { + struct hfi1_rwqe *wqe; + int i; + + wqe = get_rwqe_ptr(&srq->rq, tail); + p->wr_id = wqe->wr_id; + p->num_sge = wqe->num_sge; + for (i = 0; i < wqe->num_sge; i++) + p->sg_list[i] = wqe->sg_list[i]; + n++; + p = (struct hfi1_rwqe *)((char *)p + sz); + if (++tail >= srq->rq.size) + tail = 0; + } + srq->rq.wq = wq; + srq->rq.size = size; + wq->head = n; + wq->tail = 0; + if (attr_mask & IB_SRQ_LIMIT) + srq->limit = attr->srq_limit; + spin_unlock_irq(&srq->rq.lock); + + vfree(owq); + + if (srq->ip) { + struct hfi1_mmap_info *ip = srq->ip; + struct hfi1_ibdev *dev = to_idev(srq->ibsrq.device); + u32 s = sizeof(struct hfi1_rwq) + size * sz; + + hfi1_update_mmap_info(dev, ip, s, wq); + + /* + * Return the offset to mmap. + * See hfi1_mmap() for details. + */ + if (udata && udata->inlen >= sizeof(__u64)) { + ret = ib_copy_to_udata(udata, &ip->offset, + sizeof(ip->offset)); + if (ret) + goto bail; + } + + /* + * Put user mapping info onto the pending list + * unless it already is on the list. + */ + spin_lock_irq(&dev->pending_lock); + if (list_empty(&ip->pending_mmaps)) + list_add(&ip->pending_mmaps, + &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + } else if (attr_mask & IB_SRQ_LIMIT) { + spin_lock_irq(&srq->rq.lock); + if (attr->srq_limit >= srq->rq.size) + ret = -EINVAL; + else + srq->limit = attr->srq_limit; + spin_unlock_irq(&srq->rq.lock); + } + goto bail; + +bail_unlock: + spin_unlock_irq(&srq->rq.lock); +bail_free: + vfree(wq); +bail: + return ret; +} + +int hfi1_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) +{ + struct hfi1_srq *srq = to_isrq(ibsrq); + + attr->max_wr = srq->rq.size - 1; + attr->max_sge = srq->rq.max_sge; + attr->srq_limit = srq->limit; + return 0; +} + +/** + * hfi1_destroy_srq - destroy a shared receive queue + * @ibsrq: the SRQ to destroy + */ +int hfi1_destroy_srq(struct ib_srq *ibsrq) +{ + struct hfi1_srq *srq = to_isrq(ibsrq); + struct hfi1_ibdev *dev = to_idev(ibsrq->device); + + spin_lock(&dev->n_srqs_lock); + dev->n_srqs_allocated--; + spin_unlock(&dev->n_srqs_lock); + if (srq->ip) + kref_put(&srq->ip->ref, hfi1_release_mmap_info); + else + vfree(srq->rq.wq); + kfree(srq); + + return 0; +} diff --git a/drivers/staging/rdma/hfi1/sysfs.c b/drivers/staging/rdma/hfi1/sysfs.c new file mode 100644 index 000000000..b78c72861 --- /dev/null +++ b/drivers/staging/rdma/hfi1/sysfs.c @@ -0,0 +1,739 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include + +#include "hfi.h" +#include "mad.h" +#include "trace.h" + + +/* + * Start of per-port congestion control structures and support code + */ + +/* + * Congestion control table size followed by table entries + */ +static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) +{ + int ret; + struct hfi1_pportdata *ppd = + container_of(kobj, struct hfi1_pportdata, pport_cc_kobj); + struct cc_state *cc_state; + + ret = ppd->total_cct_entry * sizeof(struct ib_cc_table_entry_shadow) + + sizeof(__be16); + + if (pos > ret) + return -EINVAL; + + if (count > ret - pos) + count = ret - pos; + + if (!count) + return count; + + rcu_read_lock(); + cc_state = get_cc_state(ppd); + if (cc_state == NULL) { + rcu_read_unlock(); + return -EINVAL; + } + memcpy(buf, &cc_state->cct, count); + rcu_read_unlock(); + + return count; +} + +static void port_release(struct kobject *kobj) +{ + /* nothing to do since memory is freed by hfi1_free_devdata() */ +} + +static struct kobj_type port_cc_ktype = { + .release = port_release, +}; + +static struct bin_attribute cc_table_bin_attr = { + .attr = {.name = "cc_table_bin", .mode = 0444}, + .read = read_cc_table_bin, + .size = PAGE_SIZE, +}; + +/* + * Congestion settings: port control, control map and an array of 16 + * entries for the congestion entries - increase, timer, event log + * trigger threshold and the minimum injection rate delay. + */ +static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) +{ + int ret; + struct hfi1_pportdata *ppd = + container_of(kobj, struct hfi1_pportdata, pport_cc_kobj); + struct cc_state *cc_state; + + ret = sizeof(struct opa_congestion_setting_attr_shadow); + + if (pos > ret) + return -EINVAL; + if (count > ret - pos) + count = ret - pos; + + if (!count) + return count; + + rcu_read_lock(); + cc_state = get_cc_state(ppd); + if (cc_state == NULL) { + rcu_read_unlock(); + return -EINVAL; + } + memcpy(buf, &cc_state->cong_setting, count); + rcu_read_unlock(); + + return count; +} + +static struct bin_attribute cc_setting_bin_attr = { + .attr = {.name = "cc_settings_bin", .mode = 0444}, + .read = read_cc_setting_bin, + .size = PAGE_SIZE, +}; + +/* Start sc2vl */ +#define HFI1_SC2VL_ATTR(N) \ + static struct hfi1_sc2vl_attr hfi1_sc2vl_attr_##N = { \ + .attr = { .name = __stringify(N), .mode = 0444 }, \ + .sc = N \ + } + +struct hfi1_sc2vl_attr { + struct attribute attr; + int sc; +}; + +HFI1_SC2VL_ATTR(0); +HFI1_SC2VL_ATTR(1); +HFI1_SC2VL_ATTR(2); +HFI1_SC2VL_ATTR(3); +HFI1_SC2VL_ATTR(4); +HFI1_SC2VL_ATTR(5); +HFI1_SC2VL_ATTR(6); +HFI1_SC2VL_ATTR(7); +HFI1_SC2VL_ATTR(8); +HFI1_SC2VL_ATTR(9); +HFI1_SC2VL_ATTR(10); +HFI1_SC2VL_ATTR(11); +HFI1_SC2VL_ATTR(12); +HFI1_SC2VL_ATTR(13); +HFI1_SC2VL_ATTR(14); +HFI1_SC2VL_ATTR(15); +HFI1_SC2VL_ATTR(16); +HFI1_SC2VL_ATTR(17); +HFI1_SC2VL_ATTR(18); +HFI1_SC2VL_ATTR(19); +HFI1_SC2VL_ATTR(20); +HFI1_SC2VL_ATTR(21); +HFI1_SC2VL_ATTR(22); +HFI1_SC2VL_ATTR(23); +HFI1_SC2VL_ATTR(24); +HFI1_SC2VL_ATTR(25); +HFI1_SC2VL_ATTR(26); +HFI1_SC2VL_ATTR(27); +HFI1_SC2VL_ATTR(28); +HFI1_SC2VL_ATTR(29); +HFI1_SC2VL_ATTR(30); +HFI1_SC2VL_ATTR(31); + + +static struct attribute *sc2vl_default_attributes[] = { + &hfi1_sc2vl_attr_0.attr, + &hfi1_sc2vl_attr_1.attr, + &hfi1_sc2vl_attr_2.attr, + &hfi1_sc2vl_attr_3.attr, + &hfi1_sc2vl_attr_4.attr, + &hfi1_sc2vl_attr_5.attr, + &hfi1_sc2vl_attr_6.attr, + &hfi1_sc2vl_attr_7.attr, + &hfi1_sc2vl_attr_8.attr, + &hfi1_sc2vl_attr_9.attr, + &hfi1_sc2vl_attr_10.attr, + &hfi1_sc2vl_attr_11.attr, + &hfi1_sc2vl_attr_12.attr, + &hfi1_sc2vl_attr_13.attr, + &hfi1_sc2vl_attr_14.attr, + &hfi1_sc2vl_attr_15.attr, + &hfi1_sc2vl_attr_16.attr, + &hfi1_sc2vl_attr_17.attr, + &hfi1_sc2vl_attr_18.attr, + &hfi1_sc2vl_attr_19.attr, + &hfi1_sc2vl_attr_20.attr, + &hfi1_sc2vl_attr_21.attr, + &hfi1_sc2vl_attr_22.attr, + &hfi1_sc2vl_attr_23.attr, + &hfi1_sc2vl_attr_24.attr, + &hfi1_sc2vl_attr_25.attr, + &hfi1_sc2vl_attr_26.attr, + &hfi1_sc2vl_attr_27.attr, + &hfi1_sc2vl_attr_28.attr, + &hfi1_sc2vl_attr_29.attr, + &hfi1_sc2vl_attr_30.attr, + &hfi1_sc2vl_attr_31.attr, + NULL +}; + +static ssize_t sc2vl_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct hfi1_sc2vl_attr *sattr = + container_of(attr, struct hfi1_sc2vl_attr, attr); + struct hfi1_pportdata *ppd = + container_of(kobj, struct hfi1_pportdata, sc2vl_kobj); + struct hfi1_devdata *dd = ppd->dd; + + return sprintf(buf, "%u\n", *((u8 *)dd->sc2vl + sattr->sc)); +} + +static const struct sysfs_ops hfi1_sc2vl_ops = { + .show = sc2vl_attr_show, +}; + +static struct kobj_type hfi1_sc2vl_ktype = { + .release = port_release, + .sysfs_ops = &hfi1_sc2vl_ops, + .default_attrs = sc2vl_default_attributes +}; + +/* End sc2vl */ + +/* Start sl2sc */ +#define HFI1_SL2SC_ATTR(N) \ + static struct hfi1_sl2sc_attr hfi1_sl2sc_attr_##N = { \ + .attr = { .name = __stringify(N), .mode = 0444 }, \ + .sl = N \ + } + +struct hfi1_sl2sc_attr { + struct attribute attr; + int sl; +}; + +HFI1_SL2SC_ATTR(0); +HFI1_SL2SC_ATTR(1); +HFI1_SL2SC_ATTR(2); +HFI1_SL2SC_ATTR(3); +HFI1_SL2SC_ATTR(4); +HFI1_SL2SC_ATTR(5); +HFI1_SL2SC_ATTR(6); +HFI1_SL2SC_ATTR(7); +HFI1_SL2SC_ATTR(8); +HFI1_SL2SC_ATTR(9); +HFI1_SL2SC_ATTR(10); +HFI1_SL2SC_ATTR(11); +HFI1_SL2SC_ATTR(12); +HFI1_SL2SC_ATTR(13); +HFI1_SL2SC_ATTR(14); +HFI1_SL2SC_ATTR(15); +HFI1_SL2SC_ATTR(16); +HFI1_SL2SC_ATTR(17); +HFI1_SL2SC_ATTR(18); +HFI1_SL2SC_ATTR(19); +HFI1_SL2SC_ATTR(20); +HFI1_SL2SC_ATTR(21); +HFI1_SL2SC_ATTR(22); +HFI1_SL2SC_ATTR(23); +HFI1_SL2SC_ATTR(24); +HFI1_SL2SC_ATTR(25); +HFI1_SL2SC_ATTR(26); +HFI1_SL2SC_ATTR(27); +HFI1_SL2SC_ATTR(28); +HFI1_SL2SC_ATTR(29); +HFI1_SL2SC_ATTR(30); +HFI1_SL2SC_ATTR(31); + + +static struct attribute *sl2sc_default_attributes[] = { + &hfi1_sl2sc_attr_0.attr, + &hfi1_sl2sc_attr_1.attr, + &hfi1_sl2sc_attr_2.attr, + &hfi1_sl2sc_attr_3.attr, + &hfi1_sl2sc_attr_4.attr, + &hfi1_sl2sc_attr_5.attr, + &hfi1_sl2sc_attr_6.attr, + &hfi1_sl2sc_attr_7.attr, + &hfi1_sl2sc_attr_8.attr, + &hfi1_sl2sc_attr_9.attr, + &hfi1_sl2sc_attr_10.attr, + &hfi1_sl2sc_attr_11.attr, + &hfi1_sl2sc_attr_12.attr, + &hfi1_sl2sc_attr_13.attr, + &hfi1_sl2sc_attr_14.attr, + &hfi1_sl2sc_attr_15.attr, + &hfi1_sl2sc_attr_16.attr, + &hfi1_sl2sc_attr_17.attr, + &hfi1_sl2sc_attr_18.attr, + &hfi1_sl2sc_attr_19.attr, + &hfi1_sl2sc_attr_20.attr, + &hfi1_sl2sc_attr_21.attr, + &hfi1_sl2sc_attr_22.attr, + &hfi1_sl2sc_attr_23.attr, + &hfi1_sl2sc_attr_24.attr, + &hfi1_sl2sc_attr_25.attr, + &hfi1_sl2sc_attr_26.attr, + &hfi1_sl2sc_attr_27.attr, + &hfi1_sl2sc_attr_28.attr, + &hfi1_sl2sc_attr_29.attr, + &hfi1_sl2sc_attr_30.attr, + &hfi1_sl2sc_attr_31.attr, + NULL +}; + +static ssize_t sl2sc_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct hfi1_sl2sc_attr *sattr = + container_of(attr, struct hfi1_sl2sc_attr, attr); + struct hfi1_pportdata *ppd = + container_of(kobj, struct hfi1_pportdata, sl2sc_kobj); + struct hfi1_ibport *ibp = &ppd->ibport_data; + + return sprintf(buf, "%u\n", ibp->sl_to_sc[sattr->sl]); +} + +static const struct sysfs_ops hfi1_sl2sc_ops = { + .show = sl2sc_attr_show, +}; + +static struct kobj_type hfi1_sl2sc_ktype = { + .release = port_release, + .sysfs_ops = &hfi1_sl2sc_ops, + .default_attrs = sl2sc_default_attributes +}; + +/* End sl2sc */ + +/* Start vl2mtu */ + +#define HFI1_VL2MTU_ATTR(N) \ + static struct hfi1_vl2mtu_attr hfi1_vl2mtu_attr_##N = { \ + .attr = { .name = __stringify(N), .mode = 0444 }, \ + .vl = N \ + } + +struct hfi1_vl2mtu_attr { + struct attribute attr; + int vl; +}; + +HFI1_VL2MTU_ATTR(0); +HFI1_VL2MTU_ATTR(1); +HFI1_VL2MTU_ATTR(2); +HFI1_VL2MTU_ATTR(3); +HFI1_VL2MTU_ATTR(4); +HFI1_VL2MTU_ATTR(5); +HFI1_VL2MTU_ATTR(6); +HFI1_VL2MTU_ATTR(7); +HFI1_VL2MTU_ATTR(8); +HFI1_VL2MTU_ATTR(9); +HFI1_VL2MTU_ATTR(10); +HFI1_VL2MTU_ATTR(11); +HFI1_VL2MTU_ATTR(12); +HFI1_VL2MTU_ATTR(13); +HFI1_VL2MTU_ATTR(14); +HFI1_VL2MTU_ATTR(15); + +static struct attribute *vl2mtu_default_attributes[] = { + &hfi1_vl2mtu_attr_0.attr, + &hfi1_vl2mtu_attr_1.attr, + &hfi1_vl2mtu_attr_2.attr, + &hfi1_vl2mtu_attr_3.attr, + &hfi1_vl2mtu_attr_4.attr, + &hfi1_vl2mtu_attr_5.attr, + &hfi1_vl2mtu_attr_6.attr, + &hfi1_vl2mtu_attr_7.attr, + &hfi1_vl2mtu_attr_8.attr, + &hfi1_vl2mtu_attr_9.attr, + &hfi1_vl2mtu_attr_10.attr, + &hfi1_vl2mtu_attr_11.attr, + &hfi1_vl2mtu_attr_12.attr, + &hfi1_vl2mtu_attr_13.attr, + &hfi1_vl2mtu_attr_14.attr, + &hfi1_vl2mtu_attr_15.attr, + NULL +}; + +static ssize_t vl2mtu_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct hfi1_vl2mtu_attr *vlattr = + container_of(attr, struct hfi1_vl2mtu_attr, attr); + struct hfi1_pportdata *ppd = + container_of(kobj, struct hfi1_pportdata, vl2mtu_kobj); + struct hfi1_devdata *dd = ppd->dd; + + return sprintf(buf, "%u\n", dd->vld[vlattr->vl].mtu); +} + +static const struct sysfs_ops hfi1_vl2mtu_ops = { + .show = vl2mtu_attr_show, +}; + +static struct kobj_type hfi1_vl2mtu_ktype = { + .release = port_release, + .sysfs_ops = &hfi1_vl2mtu_ops, + .default_attrs = vl2mtu_default_attributes +}; + + +/* end of per-port file structures and support code */ + +/* + * Start of per-unit (or driver, in some cases, but replicated + * per unit) functions (these get a device *) + */ +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, ibdev.dev); + + return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev); +} + +static ssize_t show_hfi(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + int ret; + + if (!dd->boardname) + ret = -EINVAL; + else + ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname); + return ret; +} + +static ssize_t show_boardversion(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + + /* The string printed here is already newline-terminated. */ + return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion); +} + + +static ssize_t show_nctxts(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + + /* + * Return the smaller of send and receive contexts. + * Normally, user level applications would require both a send + * and a receive context, so returning the smaller of the two counts + * give a more accurate picture of total contexts available. + */ + return scnprintf(buf, PAGE_SIZE, "%u\n", + min(dd->num_rcv_contexts - dd->first_user_ctxt, + (u32)dd->sc_sizes[SC_USER].count)); +} + +static ssize_t show_nfreectxts(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + + /* Return the number of free user ports (contexts) available. */ + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts); +} + +static ssize_t show_serial(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + + return scnprintf(buf, PAGE_SIZE, "%s", dd->serial); + +} + +static ssize_t store_chip_reset(struct device *device, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + int ret; + + if (count < 5 || memcmp(buf, "reset", 5) || !dd->diag_client) { + ret = -EINVAL; + goto bail; + } + + ret = hfi1_reset_device(dd->unit); +bail: + return ret < 0 ? ret : count; +} + +/* + * Convert the reported temperature from an integer (reported in + * units of 0.25C) to a floating point number. + */ +#define temp2str(temp, buf, size, idx) \ + scnprintf((buf) + (idx), (size) - (idx), "%u.%02u ", \ + ((temp) >> 2), ((temp) & 0x3) * 25) + +/* + * Dump tempsense values, in decimal, to ease shell-scripts. + */ +static ssize_t show_tempsense(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + struct hfi1_temp temp; + int ret = -ENXIO; + + ret = hfi1_tempsense_rd(dd, &temp); + if (!ret) { + int idx = 0; + + idx += temp2str(temp.curr, buf, PAGE_SIZE, idx); + idx += temp2str(temp.lo_lim, buf, PAGE_SIZE, idx); + idx += temp2str(temp.hi_lim, buf, PAGE_SIZE, idx); + idx += temp2str(temp.crit_lim, buf, PAGE_SIZE, idx); + idx += scnprintf(buf + idx, PAGE_SIZE - idx, + "%u %u %u\n", temp.triggers & 0x1, + temp.triggers & 0x2, temp.triggers & 0x4); + ret = idx; + } + return ret; +} + +/* + * end of per-unit (or driver, in some cases, but replicated + * per unit) functions + */ + +/* start of per-unit file structures and support code */ +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_hfi, NULL); +static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL); +static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL); +static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL); +static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL); +static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL); +static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset); + +static struct device_attribute *hfi1_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_board_id, + &dev_attr_nctxts, + &dev_attr_nfreectxts, + &dev_attr_serial, + &dev_attr_boardversion, + &dev_attr_tempsense, + &dev_attr_chip_reset, +}; + +int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num, + struct kobject *kobj) +{ + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + int ret; + + if (!port_num || port_num > dd->num_pports) { + dd_dev_err(dd, + "Skipping infiniband class with invalid port %u\n", + port_num); + return -ENODEV; + } + ppd = &dd->pport[port_num - 1]; + + ret = kobject_init_and_add(&ppd->sc2vl_kobj, &hfi1_sc2vl_ktype, kobj, + "sc2vl"); + if (ret) { + dd_dev_err(dd, + "Skipping sc2vl sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail; + } + kobject_uevent(&ppd->sc2vl_kobj, KOBJ_ADD); + + ret = kobject_init_and_add(&ppd->sl2sc_kobj, &hfi1_sl2sc_ktype, kobj, + "sl2sc"); + if (ret) { + dd_dev_err(dd, + "Skipping sl2sc sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_sc2vl; + } + kobject_uevent(&ppd->sl2sc_kobj, KOBJ_ADD); + + ret = kobject_init_and_add(&ppd->vl2mtu_kobj, &hfi1_vl2mtu_ktype, kobj, + "vl2mtu"); + if (ret) { + dd_dev_err(dd, + "Skipping vl2mtu sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_sl2sc; + } + kobject_uevent(&ppd->vl2mtu_kobj, KOBJ_ADD); + + + ret = kobject_init_and_add(&ppd->pport_cc_kobj, &port_cc_ktype, + kobj, "CCMgtA"); + if (ret) { + dd_dev_err(dd, + "Skipping Congestion Control sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_vl2mtu; + } + + kobject_uevent(&ppd->pport_cc_kobj, KOBJ_ADD); + + ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, + &cc_setting_bin_attr); + if (ret) { + dd_dev_err(dd, + "Skipping Congestion Control setting sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_cc; + } + + ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, + &cc_table_bin_attr); + if (ret) { + dd_dev_err(dd, + "Skipping Congestion Control table sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_cc_entry_bin; + } + + dd_dev_info(dd, + "IB%u: Congestion Control Agent enabled for port %d\n", + dd->unit, port_num); + + return 0; + +bail_cc_entry_bin: + sysfs_remove_bin_file(&ppd->pport_cc_kobj, + &cc_setting_bin_attr); +bail_cc: + kobject_put(&ppd->pport_cc_kobj); +bail_vl2mtu: + kobject_put(&ppd->vl2mtu_kobj); +bail_sl2sc: + kobject_put(&ppd->sl2sc_kobj); +bail_sc2vl: + kobject_put(&ppd->sc2vl_kobj); +bail: + return ret; +} + +/* + * Register and create our files in /sys/class/infiniband. + */ +int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd) +{ + struct ib_device *dev = &dd->verbs_dev.ibdev; + int i, ret; + + for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) { + ret = device_create_file(&dev->dev, hfi1_attributes[i]); + if (ret) + goto bail; + } + + return 0; +bail: + for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) + device_remove_file(&dev->dev, hfi1_attributes[i]); + return ret; +} + +/* + * Unregister and remove our files in /sys/class/infiniband. + */ +void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + int i; + + for (i = 0; i < dd->num_pports; i++) { + ppd = &dd->pport[i]; + + sysfs_remove_bin_file(&ppd->pport_cc_kobj, + &cc_setting_bin_attr); + sysfs_remove_bin_file(&ppd->pport_cc_kobj, + &cc_table_bin_attr); + kobject_put(&ppd->pport_cc_kobj); + kobject_put(&ppd->vl2mtu_kobj); + kobject_put(&ppd->sl2sc_kobj); + kobject_put(&ppd->sc2vl_kobj); + } +} diff --git a/drivers/staging/rdma/hfi1/trace.c b/drivers/staging/rdma/hfi1/trace.c new file mode 100644 index 000000000..70ad7b9fc --- /dev/null +++ b/drivers/staging/rdma/hfi1/trace.c @@ -0,0 +1,221 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#define CREATE_TRACE_POINTS +#include "trace.h" + +u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr) +{ + struct hfi1_other_headers *ohdr; + u8 opcode; + u8 lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3); + + if (lnh == HFI1_LRH_BTH) + ohdr = &hdr->u.oth; + else + ohdr = &hdr->u.l.oth; + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + return hdr_len_by_opcode[opcode] == 0 ? + 0 : hdr_len_by_opcode[opcode] - (12 + 8); +} + +#define IMM_PRN "imm %d" +#define RETH_PRN "reth vaddr 0x%.16llx rkey 0x%.8x dlen 0x%.8x" +#define AETH_PRN "aeth syn 0x%.2x msn 0x%.8x" +#define DETH_PRN "deth qkey 0x%.8x sqpn 0x%.6x" +#define ATOMICACKETH_PRN "origdata %lld" +#define ATOMICETH_PRN "vaddr 0x%llx rkey 0x%.8x sdata %lld cdata %lld" + +#define OP(transport, op) IB_OPCODE_## transport ## _ ## op + +static u64 ib_u64_get(__be32 *p) +{ + return ((u64)be32_to_cpu(p[0]) << 32) | be32_to_cpu(p[1]); +} + +const char *parse_everbs_hdrs( + struct trace_seq *p, + u8 opcode, + void *ehdrs) +{ + union ib_ehdrs *eh = ehdrs; + const char *ret = trace_seq_buffer_ptr(p); + + switch (opcode) { + /* imm */ + case OP(RC, SEND_LAST_WITH_IMMEDIATE): + case OP(UC, SEND_LAST_WITH_IMMEDIATE): + case OP(RC, SEND_ONLY_WITH_IMMEDIATE): + case OP(UC, SEND_ONLY_WITH_IMMEDIATE): + case OP(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE): + case OP(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE): + trace_seq_printf(p, IMM_PRN, + be32_to_cpu(eh->imm_data)); + break; + /* reth + imm */ + case OP(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE): + case OP(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE): + trace_seq_printf(p, RETH_PRN " " IMM_PRN, + (unsigned long long)ib_u64_get( + (__be32 *)&eh->rc.reth.vaddr), + be32_to_cpu(eh->rc.reth.rkey), + be32_to_cpu(eh->rc.reth.length), + be32_to_cpu(eh->rc.imm_data)); + break; + /* reth */ + case OP(RC, RDMA_READ_REQUEST): + case OP(RC, RDMA_WRITE_FIRST): + case OP(UC, RDMA_WRITE_FIRST): + case OP(RC, RDMA_WRITE_ONLY): + case OP(UC, RDMA_WRITE_ONLY): + trace_seq_printf(p, RETH_PRN, + (unsigned long long)ib_u64_get( + (__be32 *)&eh->rc.reth.vaddr), + be32_to_cpu(eh->rc.reth.rkey), + be32_to_cpu(eh->rc.reth.length)); + break; + case OP(RC, RDMA_READ_RESPONSE_FIRST): + case OP(RC, RDMA_READ_RESPONSE_LAST): + case OP(RC, RDMA_READ_RESPONSE_ONLY): + case OP(RC, ACKNOWLEDGE): + trace_seq_printf(p, AETH_PRN, + be32_to_cpu(eh->aeth) >> 24, + be32_to_cpu(eh->aeth) & HFI1_QPN_MASK); + break; + /* aeth + atomicacketh */ + case OP(RC, ATOMIC_ACKNOWLEDGE): + trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN, + (be32_to_cpu(eh->at.aeth) >> 24) & 0xff, + be32_to_cpu(eh->at.aeth) & HFI1_QPN_MASK, + (unsigned long long)ib_u64_get(eh->at.atomic_ack_eth)); + break; + /* atomiceth */ + case OP(RC, COMPARE_SWAP): + case OP(RC, FETCH_ADD): + trace_seq_printf(p, ATOMICETH_PRN, + (unsigned long long)ib_u64_get(eh->atomic_eth.vaddr), + eh->atomic_eth.rkey, + (unsigned long long)ib_u64_get( + (__be32 *)&eh->atomic_eth.swap_data), + (unsigned long long) ib_u64_get( + (__be32 *)&eh->atomic_eth.compare_data)); + break; + /* deth */ + case OP(UD, SEND_ONLY): + case OP(UD, SEND_ONLY_WITH_IMMEDIATE): + trace_seq_printf(p, DETH_PRN, + be32_to_cpu(eh->ud.deth[0]), + be32_to_cpu(eh->ud.deth[1]) & HFI1_QPN_MASK); + break; + } + trace_seq_putc(p, 0); + return ret; +} + +const char *parse_sdma_flags( + struct trace_seq *p, + u64 desc0, u64 desc1) +{ + const char *ret = trace_seq_buffer_ptr(p); + char flags[5] = { 'x', 'x', 'x', 'x', 0 }; + + flags[0] = (desc1 & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-'; + flags[1] = (desc1 & SDMA_DESC1_HEAD_TO_HOST_FLAG) ? 'H' : '-'; + flags[2] = (desc0 & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-'; + flags[3] = (desc0 & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-'; + trace_seq_printf(p, "%s", flags); + if (desc0 & SDMA_DESC0_FIRST_DESC_FLAG) + trace_seq_printf(p, " amode:%u aidx:%u alen:%u", + (u8)((desc1 >> SDMA_DESC1_HEADER_MODE_SHIFT) + & SDMA_DESC1_HEADER_MODE_MASK), + (u8)((desc1 >> SDMA_DESC1_HEADER_INDEX_SHIFT) + & SDMA_DESC1_HEADER_INDEX_MASK), + (u8)((desc1 >> SDMA_DESC1_HEADER_DWS_SHIFT) + & SDMA_DESC1_HEADER_DWS_MASK)); + return ret; +} + +const char *print_u32_array( + struct trace_seq *p, + u32 *arr, int len) +{ + int i; + const char *ret = trace_seq_buffer_ptr(p); + + for (i = 0; i < len ; i++) + trace_seq_printf(p, "%s%#x", i == 0 ? "" : " ", arr[i]); + trace_seq_putc(p, 0); + return ret; +} + +const char *print_u64_array( + struct trace_seq *p, + u64 *arr, int len) +{ + int i; + const char *ret = trace_seq_buffer_ptr(p); + + for (i = 0; i < len; i++) + trace_seq_printf(p, "%s0x%016llx", i == 0 ? "" : " ", arr[i]); + trace_seq_putc(p, 0); + return ret; +} + +__hfi1_trace_fn(PKT); +__hfi1_trace_fn(PROC); +__hfi1_trace_fn(SDMA); +__hfi1_trace_fn(LINKVERB); +__hfi1_trace_fn(DEBUG); +__hfi1_trace_fn(SNOOP); +__hfi1_trace_fn(CNTR); +__hfi1_trace_fn(PIO); +__hfi1_trace_fn(DC8051); +__hfi1_trace_fn(FIRMWARE); +__hfi1_trace_fn(RCVCTRL); +__hfi1_trace_fn(TID); diff --git a/drivers/staging/rdma/hfi1/trace.h b/drivers/staging/rdma/hfi1/trace.h new file mode 100644 index 000000000..d7851c0a0 --- /dev/null +++ b/drivers/staging/rdma/hfi1/trace.h @@ -0,0 +1,1409 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#undef TRACE_SYSTEM_VAR +#define TRACE_SYSTEM_VAR hfi1 + +#if !defined(__HFI1_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_H + +#include +#include + +#include "hfi.h" +#include "mad.h" +#include "sdma.h" + +#define DD_DEV_ENTRY(dd) __string(dev, dev_name(&(dd)->pcidev->dev)) +#define DD_DEV_ASSIGN(dd) __assign_str(dev, dev_name(&(dd)->pcidev->dev)) + +#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype } +#define show_packettype(etype) \ +__print_symbolic(etype, \ + packettype_name(EXPECTED), \ + packettype_name(EAGER), \ + packettype_name(IB), \ + packettype_name(ERROR), \ + packettype_name(BYPASS)) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_rx + +TRACE_EVENT(hfi1_rcvhdr, + TP_PROTO(struct hfi1_devdata *dd, + u64 eflags, + u32 ctxt, + u32 etype, + u32 hlen, + u32 tlen, + u32 updegr, + u32 etail), + TP_ARGS(dd, ctxt, eflags, etype, hlen, tlen, updegr, etail), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u64, eflags) + __field(u32, ctxt) + __field(u32, etype) + __field(u32, hlen) + __field(u32, tlen) + __field(u32, updegr) + __field(u32, etail) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->eflags = eflags; + __entry->ctxt = ctxt; + __entry->etype = etype; + __entry->hlen = hlen; + __entry->tlen = tlen; + __entry->updegr = updegr; + __entry->etail = etail; + ), + TP_printk( +"[%s] ctxt %d eflags 0x%llx etype %d,%s hlen %d tlen %d updegr %d etail %d", + __get_str(dev), + __entry->ctxt, + __entry->eflags, + __entry->etype, show_packettype(__entry->etype), + __entry->hlen, + __entry->tlen, + __entry->updegr, + __entry->etail + ) +); + +TRACE_EVENT(hfi1_receive_interrupt, + TP_PROTO(struct hfi1_devdata *dd, u32 ctxt), + TP_ARGS(dd, ctxt), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u32, ctxt) + __field(u8, slow_path) + __field(u8, dma_rtail) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + if (dd->rcd[ctxt]->do_interrupt == + &handle_receive_interrupt) { + __entry->slow_path = 1; + __entry->dma_rtail = 0xFF; + } else if (dd->rcd[ctxt]->do_interrupt == + &handle_receive_interrupt_dma_rtail){ + __entry->dma_rtail = 1; + __entry->slow_path = 0; + } else if (dd->rcd[ctxt]->do_interrupt == + &handle_receive_interrupt_nodma_rtail) { + __entry->dma_rtail = 0; + __entry->slow_path = 0; + } + ), + TP_printk( + "[%s] ctxt %d SlowPath: %d DmaRtail: %d", + __get_str(dev), + __entry->ctxt, + __entry->slow_path, + __entry->dma_rtail + ) +); + +const char *print_u64_array(struct trace_seq *, u64 *, int); + +TRACE_EVENT(hfi1_exp_tid_map, + TP_PROTO(unsigned ctxt, u16 subctxt, int dir, + unsigned long *maps, u16 count), + TP_ARGS(ctxt, subctxt, dir, maps, count), + TP_STRUCT__entry( + __field(unsigned, ctxt) + __field(u16, subctxt) + __field(int, dir) + __field(u16, count) + __dynamic_array(unsigned long, maps, sizeof(*maps) * count) + ), + TP_fast_assign( + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->dir = dir; + __entry->count = count; + memcpy(__get_dynamic_array(maps), maps, + sizeof(*maps) * count); + ), + TP_printk("[%3u:%02u] %s tidmaps %s", + __entry->ctxt, + __entry->subctxt, + (__entry->dir ? ">" : "<"), + print_u64_array(p, __get_dynamic_array(maps), + __entry->count) + ) + ); + +TRACE_EVENT(hfi1_exp_rcv_set, + TP_PROTO(unsigned ctxt, u16 subctxt, u32 tid, + unsigned long vaddr, u64 phys_addr, void *page), + TP_ARGS(ctxt, subctxt, tid, vaddr, phys_addr, page), + TP_STRUCT__entry( + __field(unsigned, ctxt) + __field(u16, subctxt) + __field(u32, tid) + __field(unsigned long, vaddr) + __field(u64, phys_addr) + __field(void *, page) + ), + TP_fast_assign( + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->tid = tid; + __entry->vaddr = vaddr; + __entry->phys_addr = phys_addr; + __entry->page = page; + ), + TP_printk("[%u:%u] TID %u, vaddrs 0x%lx, physaddr 0x%llx, pgp %p", + __entry->ctxt, + __entry->subctxt, + __entry->tid, + __entry->vaddr, + __entry->phys_addr, + __entry->page + ) + ); + +TRACE_EVENT(hfi1_exp_rcv_free, + TP_PROTO(unsigned ctxt, u16 subctxt, u32 tid, + unsigned long phys, void *page), + TP_ARGS(ctxt, subctxt, tid, phys, page), + TP_STRUCT__entry( + __field(unsigned, ctxt) + __field(u16, subctxt) + __field(u32, tid) + __field(unsigned long, phys) + __field(void *, page) + ), + TP_fast_assign( + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->tid = tid; + __entry->phys = phys; + __entry->page = page; + ), + TP_printk("[%u:%u] freeing TID %u, 0x%lx, pgp %p", + __entry->ctxt, + __entry->subctxt, + __entry->tid, + __entry->phys, + __entry->page + ) + ); +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_tx + +TRACE_EVENT(hfi1_piofree, + TP_PROTO(struct send_context *sc, int extra), + TP_ARGS(sc, extra), + TP_STRUCT__entry( + DD_DEV_ENTRY(sc->dd) + __field(u32, sw_index) + __field(u32, hw_context) + __field(int, extra) + ), + TP_fast_assign( + DD_DEV_ASSIGN(sc->dd); + __entry->sw_index = sc->sw_index; + __entry->hw_context = sc->hw_context; + __entry->extra = extra; + ), + TP_printk( + "[%s] ctxt %u(%u) extra %d", + __get_str(dev), + __entry->sw_index, + __entry->hw_context, + __entry->extra + ) +); + +TRACE_EVENT(hfi1_wantpiointr, + TP_PROTO(struct send_context *sc, u32 needint, u64 credit_ctrl), + TP_ARGS(sc, needint, credit_ctrl), + TP_STRUCT__entry( + DD_DEV_ENTRY(sc->dd) + __field(u32, sw_index) + __field(u32, hw_context) + __field(u32, needint) + __field(u64, credit_ctrl) + ), + TP_fast_assign( + DD_DEV_ASSIGN(sc->dd); + __entry->sw_index = sc->sw_index; + __entry->hw_context = sc->hw_context; + __entry->needint = needint; + __entry->credit_ctrl = credit_ctrl; + ), + TP_printk( + "[%s] ctxt %u(%u) on %d credit_ctrl 0x%llx", + __get_str(dev), + __entry->sw_index, + __entry->hw_context, + __entry->needint, + (unsigned long long)__entry->credit_ctrl + ) +); + +DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template, + TP_PROTO(struct hfi1_qp *qp, u32 flags), + TP_ARGS(qp, flags), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, flags) + __field(u32, s_flags) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->flags = flags; + __entry->qpn = qp->ibqp.qp_num; + __entry->s_flags = qp->s_flags; + ), + TP_printk( + "[%s] qpn 0x%x flags 0x%x s_flags 0x%x", + __get_str(dev), + __entry->qpn, + __entry->flags, + __entry->s_flags + ) +); + +DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpwakeup, + TP_PROTO(struct hfi1_qp *qp, u32 flags), + TP_ARGS(qp, flags)); + +DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpsleep, + TP_PROTO(struct hfi1_qp *qp, u32 flags), + TP_ARGS(qp, flags)); + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_qphash +DECLARE_EVENT_CLASS(hfi1_qphash_template, + TP_PROTO(struct hfi1_qp *qp, u32 bucket), + TP_ARGS(qp, bucket), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, bucket) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->qpn = qp->ibqp.qp_num; + __entry->bucket = bucket; + ), + TP_printk( + "[%s] qpn 0x%x bucket %u", + __get_str(dev), + __entry->qpn, + __entry->bucket + ) +); + +DEFINE_EVENT(hfi1_qphash_template, hfi1_qpinsert, + TP_PROTO(struct hfi1_qp *qp, u32 bucket), + TP_ARGS(qp, bucket)); + +DEFINE_EVENT(hfi1_qphash_template, hfi1_qpremove, + TP_PROTO(struct hfi1_qp *qp, u32 bucket), + TP_ARGS(qp, bucket)); + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_ibhdrs + +u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr); +const char *parse_everbs_hdrs( + struct trace_seq *p, + u8 opcode, + void *ehdrs); + +#define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs) + +const char *parse_sdma_flags( + struct trace_seq *p, + u64 desc0, u64 desc1); + +#define __parse_sdma_flags(desc0, desc1) parse_sdma_flags(p, desc0, desc1) + + +#define lrh_name(lrh) { HFI1_##lrh, #lrh } +#define show_lnh(lrh) \ +__print_symbolic(lrh, \ + lrh_name(LRH_BTH), \ + lrh_name(LRH_GRH)) + +#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode } +#define show_ib_opcode(opcode) \ +__print_symbolic(opcode, \ + ib_opcode_name(RC_SEND_FIRST), \ + ib_opcode_name(RC_SEND_MIDDLE), \ + ib_opcode_name(RC_SEND_LAST), \ + ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(RC_SEND_ONLY), \ + ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_WRITE_FIRST), \ + ib_opcode_name(RC_RDMA_WRITE_MIDDLE), \ + ib_opcode_name(RC_RDMA_WRITE_LAST), \ + ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_WRITE_ONLY), \ + ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_READ_REQUEST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY), \ + ib_opcode_name(RC_ACKNOWLEDGE), \ + ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \ + ib_opcode_name(RC_COMPARE_SWAP), \ + ib_opcode_name(RC_FETCH_ADD), \ + ib_opcode_name(UC_SEND_FIRST), \ + ib_opcode_name(UC_SEND_MIDDLE), \ + ib_opcode_name(UC_SEND_LAST), \ + ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(UC_SEND_ONLY), \ + ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(UC_RDMA_WRITE_FIRST), \ + ib_opcode_name(UC_RDMA_WRITE_MIDDLE), \ + ib_opcode_name(UC_RDMA_WRITE_LAST), \ + ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(UC_RDMA_WRITE_ONLY), \ + ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(UD_SEND_ONLY), \ + ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE)) + + +#define LRH_PRN "vl %d lver %d sl %d lnh %d,%s dlid %.4x len %d slid %.4x" +#define BTH_PRN \ + "op 0x%.2x,%s se %d m %d pad %d tver %d pkey 0x%.4x " \ + "f %d b %d qpn 0x%.6x a %d psn 0x%.8x" +#define EHDR_PRN "%s" + +DECLARE_EVENT_CLASS(hfi1_ibhdr_template, + TP_PROTO(struct hfi1_devdata *dd, + struct hfi1_ib_header *hdr), + TP_ARGS(dd, hdr), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + /* LRH */ + __field(u8, vl) + __field(u8, lver) + __field(u8, sl) + __field(u8, lnh) + __field(u16, dlid) + __field(u16, len) + __field(u16, slid) + /* BTH */ + __field(u8, opcode) + __field(u8, se) + __field(u8, m) + __field(u8, pad) + __field(u8, tver) + __field(u16, pkey) + __field(u8, f) + __field(u8, b) + __field(u32, qpn) + __field(u8, a) + __field(u32, psn) + /* extended headers */ + __dynamic_array(u8, ehdrs, ibhdr_exhdr_len(hdr)) + ), + TP_fast_assign( + struct hfi1_other_headers *ohdr; + + DD_DEV_ASSIGN(dd); + /* LRH */ + __entry->vl = + (u8)(be16_to_cpu(hdr->lrh[0]) >> 12); + __entry->lver = + (u8)(be16_to_cpu(hdr->lrh[0]) >> 8) & 0xf; + __entry->sl = + (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf; + __entry->lnh = + (u8)(be16_to_cpu(hdr->lrh[0]) & 3); + __entry->dlid = + be16_to_cpu(hdr->lrh[1]); + /* allow for larger len */ + __entry->len = + be16_to_cpu(hdr->lrh[2]); + __entry->slid = + be16_to_cpu(hdr->lrh[3]); + /* BTH */ + if (__entry->lnh == HFI1_LRH_BTH) + ohdr = &hdr->u.oth; + else + ohdr = &hdr->u.l.oth; + __entry->opcode = + (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; + __entry->se = + (be32_to_cpu(ohdr->bth[0]) >> 23) & 1; + __entry->m = + (be32_to_cpu(ohdr->bth[0]) >> 22) & 1; + __entry->pad = + (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + __entry->tver = + (be32_to_cpu(ohdr->bth[0]) >> 16) & 0xf; + __entry->pkey = + be32_to_cpu(ohdr->bth[0]) & 0xffff; + __entry->f = + (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT) + & HFI1_FECN_MASK; + __entry->b = + (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT) + & HFI1_BECN_MASK; + __entry->qpn = + be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK; + __entry->a = + (be32_to_cpu(ohdr->bth[2]) >> 31) & 1; + /* allow for larger PSN */ + __entry->psn = + be32_to_cpu(ohdr->bth[2]) & 0x7fffffff; + /* extended headers */ + memcpy( + __get_dynamic_array(ehdrs), + &ohdr->u, + ibhdr_exhdr_len(hdr)); + ), + TP_printk("[%s] " LRH_PRN " " BTH_PRN " " EHDR_PRN, + __get_str(dev), + /* LRH */ + __entry->vl, + __entry->lver, + __entry->sl, + __entry->lnh, show_lnh(__entry->lnh), + __entry->dlid, + __entry->len, + __entry->slid, + /* BTH */ + __entry->opcode, show_ib_opcode(__entry->opcode), + __entry->se, + __entry->m, + __entry->pad, + __entry->tver, + __entry->pkey, + __entry->f, + __entry->b, + __entry->qpn, + __entry->a, + __entry->psn, + /* extended headers */ + __parse_ib_ehdrs( + __entry->opcode, + (void *)__get_dynamic_array(ehdrs)) + ) +); + +DEFINE_EVENT(hfi1_ibhdr_template, input_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), + TP_ARGS(dd, hdr)); + +DEFINE_EVENT(hfi1_ibhdr_template, output_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), + TP_ARGS(dd, hdr)); + +#define SNOOP_PRN \ + "slid %.4x dlid %.4x qpn 0x%.6x opcode 0x%.2x,%s " \ + "svc lvl %d pkey 0x%.4x [header = %d bytes] [data = %d bytes]" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_snoop + + +TRACE_EVENT(snoop_capture, + TP_PROTO(struct hfi1_devdata *dd, + int hdr_len, + struct hfi1_ib_header *hdr, + int data_len, + void *data), + TP_ARGS(dd, hdr_len, hdr, data_len, data), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u16, slid) + __field(u16, dlid) + __field(u32, qpn) + __field(u8, opcode) + __field(u8, sl) + __field(u16, pkey) + __field(u32, hdr_len) + __field(u32, data_len) + __field(u8, lnh) + __dynamic_array(u8, raw_hdr, hdr_len) + __dynamic_array(u8, raw_pkt, data_len) + ), + TP_fast_assign( + struct hfi1_other_headers *ohdr; + + __entry->lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3); + if (__entry->lnh == HFI1_LRH_BTH) + ohdr = &hdr->u.oth; + else + ohdr = &hdr->u.l.oth; + DD_DEV_ASSIGN(dd); + __entry->slid = be16_to_cpu(hdr->lrh[3]); + __entry->dlid = be16_to_cpu(hdr->lrh[1]); + __entry->qpn = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK; + __entry->opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; + __entry->sl = (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf; + __entry->pkey = be32_to_cpu(ohdr->bth[0]) & 0xffff; + __entry->hdr_len = hdr_len; + __entry->data_len = data_len; + memcpy(__get_dynamic_array(raw_hdr), hdr, hdr_len); + memcpy(__get_dynamic_array(raw_pkt), data, data_len); + ), + TP_printk("[%s] " SNOOP_PRN, + __get_str(dev), + __entry->slid, + __entry->dlid, + __entry->qpn, + __entry->opcode, + show_ib_opcode(__entry->opcode), + __entry->sl, + __entry->pkey, + __entry->hdr_len, + __entry->data_len + ) +); + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_ctxts + +#define UCTXT_FMT \ + "cred:%u, credaddr:0x%llx, piobase:0x%llx, rcvhdr_cnt:%u, " \ + "rcvbase:0x%llx, rcvegrc:%u, rcvegrb:0x%llx" +TRACE_EVENT(hfi1_uctxtdata, + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt), + TP_ARGS(dd, uctxt), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(unsigned, ctxt) + __field(u32, credits) + __field(u64, hw_free) + __field(u64, piobase) + __field(u16, rcvhdrq_cnt) + __field(u64, rcvhdrq_phys) + __field(u32, eager_cnt) + __field(u64, rcvegr_phys) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->ctxt = uctxt->ctxt; + __entry->credits = uctxt->sc->credits; + __entry->hw_free = (u64)uctxt->sc->hw_free; + __entry->piobase = (u64)uctxt->sc->base_addr; + __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt; + __entry->rcvhdrq_phys = uctxt->rcvhdrq_phys; + __entry->eager_cnt = uctxt->egrbufs.alloced; + __entry->rcvegr_phys = uctxt->egrbufs.rcvtids[0].phys; + ), + TP_printk( + "[%s] ctxt %u " UCTXT_FMT, + __get_str(dev), + __entry->ctxt, + __entry->credits, + __entry->hw_free, + __entry->piobase, + __entry->rcvhdrq_cnt, + __entry->rcvhdrq_phys, + __entry->eager_cnt, + __entry->rcvegr_phys + ) + ); + +#define CINFO_FMT \ + "egrtids:%u, egr_size:%u, hdrq_cnt:%u, hdrq_size:%u, sdma_ring_size:%u" +TRACE_EVENT(hfi1_ctxt_info, + TP_PROTO(struct hfi1_devdata *dd, unsigned ctxt, unsigned subctxt, + struct hfi1_ctxt_info cinfo), + TP_ARGS(dd, ctxt, subctxt, cinfo), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(unsigned, ctxt) + __field(unsigned, subctxt) + __field(u16, egrtids) + __field(u16, rcvhdrq_cnt) + __field(u16, rcvhdrq_size) + __field(u16, sdma_ring_size) + __field(u32, rcvegr_size) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->egrtids = cinfo.egrtids; + __entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt; + __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize; + __entry->sdma_ring_size = cinfo.sdma_ring_size; + __entry->rcvegr_size = cinfo.rcvegr_size; + ), + TP_printk( + "[%s] ctxt %u:%u " CINFO_FMT, + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->egrtids, + __entry->rcvegr_size, + __entry->rcvhdrq_cnt, + __entry->rcvhdrq_size, + __entry->sdma_ring_size + ) + ); + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_sma + +#define BCT_FORMAT \ + "shared_limit %x vls 0-7 [%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x] 15 [%x,%x]" + +#define BCT(field) \ + be16_to_cpu( \ + ((struct buffer_control *)__get_dynamic_array(bct))->field \ + ) + +DECLARE_EVENT_CLASS(hfi1_bct_template, + TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc), + TP_ARGS(dd, bc), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __dynamic_array(u8, bct, sizeof(*bc)) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + memcpy( + __get_dynamic_array(bct), + bc, + sizeof(*bc)); + ), + TP_printk(BCT_FORMAT, + BCT(overall_shared_limit), + + BCT(vl[0].dedicated), + BCT(vl[0].shared), + + BCT(vl[1].dedicated), + BCT(vl[1].shared), + + BCT(vl[2].dedicated), + BCT(vl[2].shared), + + BCT(vl[3].dedicated), + BCT(vl[3].shared), + + BCT(vl[4].dedicated), + BCT(vl[4].shared), + + BCT(vl[5].dedicated), + BCT(vl[5].shared), + + BCT(vl[6].dedicated), + BCT(vl[6].shared), + + BCT(vl[7].dedicated), + BCT(vl[7].shared), + + BCT(vl[15].dedicated), + BCT(vl[15].shared) + ) +); + + +DEFINE_EVENT(hfi1_bct_template, bct_set, + TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc), + TP_ARGS(dd, bc)); + +DEFINE_EVENT(hfi1_bct_template, bct_get, + TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc), + TP_ARGS(dd, bc)); + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_sdma + +TRACE_EVENT(hfi1_sdma_descriptor, + TP_PROTO( + struct sdma_engine *sde, + u64 desc0, + u64 desc1, + u16 e, + void *descp), + TP_ARGS(sde, desc0, desc1, e, descp), + TP_STRUCT__entry( + DD_DEV_ENTRY(sde->dd) + __field(void *, descp) + __field(u64, desc0) + __field(u64, desc1) + __field(u16, e) + __field(u8, idx) + ), + TP_fast_assign( + DD_DEV_ASSIGN(sde->dd); + __entry->desc0 = desc0; + __entry->desc1 = desc1; + __entry->idx = sde->this_idx; + __entry->descp = descp; + __entry->e = e; + ), + TP_printk( + "[%s] SDE(%u) flags:%s addr:0x%016llx gen:%u len:%u d0:%016llx d1:%016llx to %p,%u", + __get_str(dev), + __entry->idx, + __parse_sdma_flags(__entry->desc0, __entry->desc1), + (__entry->desc0 >> SDMA_DESC0_PHY_ADDR_SHIFT) + & SDMA_DESC0_PHY_ADDR_MASK, + (u8)((__entry->desc1 >> SDMA_DESC1_GENERATION_SHIFT) + & SDMA_DESC1_GENERATION_MASK), + (u16)((__entry->desc0 >> SDMA_DESC0_BYTE_COUNT_SHIFT) + & SDMA_DESC0_BYTE_COUNT_MASK), + __entry->desc0, + __entry->desc1, + __entry->descp, + __entry->e + ) +); + +TRACE_EVENT(hfi1_sdma_engine_select, + TP_PROTO(struct hfi1_devdata *dd, u32 sel, u8 vl, u8 idx), + TP_ARGS(dd, sel, vl, idx), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u32, sel) + __field(u8, vl) + __field(u8, idx) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->sel = sel; + __entry->vl = vl; + __entry->idx = idx; + ), + TP_printk( + "[%s] selecting SDE %u sel 0x%x vl %u", + __get_str(dev), + __entry->idx, + __entry->sel, + __entry->vl + ) +); + +DECLARE_EVENT_CLASS(hfi1_sdma_engine_class, + TP_PROTO( + struct sdma_engine *sde, + u64 status + ), + TP_ARGS(sde, status), + TP_STRUCT__entry( + DD_DEV_ENTRY(sde->dd) + __field(u64, status) + __field(u8, idx) + ), + TP_fast_assign( + DD_DEV_ASSIGN(sde->dd); + __entry->status = status; + __entry->idx = sde->this_idx; + ), + TP_printk( + "[%s] SDE(%u) status %llx", + __get_str(dev), + __entry->idx, + (unsigned long long)__entry->status + ) +); + +DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_interrupt, + TP_PROTO( + struct sdma_engine *sde, + u64 status + ), + TP_ARGS(sde, status) +); + +DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_progress, + TP_PROTO( + struct sdma_engine *sde, + u64 status + ), + TP_ARGS(sde, status) +); + +DECLARE_EVENT_CLASS(hfi1_sdma_ahg_ad, + TP_PROTO( + struct sdma_engine *sde, + int aidx + ), + TP_ARGS(sde, aidx), + TP_STRUCT__entry( + DD_DEV_ENTRY(sde->dd) + __field(int, aidx) + __field(u8, idx) + ), + TP_fast_assign( + DD_DEV_ASSIGN(sde->dd); + __entry->idx = sde->this_idx; + __entry->aidx = aidx; + ), + TP_printk( + "[%s] SDE(%u) aidx %d", + __get_str(dev), + __entry->idx, + __entry->aidx + ) +); + +DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_allocate, + TP_PROTO( + struct sdma_engine *sde, + int aidx + ), + TP_ARGS(sde, aidx)); + +DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_deallocate, + TP_PROTO( + struct sdma_engine *sde, + int aidx + ), + TP_ARGS(sde, aidx)); + +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER +TRACE_EVENT(hfi1_sdma_progress, + TP_PROTO( + struct sdma_engine *sde, + u16 hwhead, + u16 swhead, + struct sdma_txreq *txp + ), + TP_ARGS(sde, hwhead, swhead, txp), + TP_STRUCT__entry( + DD_DEV_ENTRY(sde->dd) + __field(u64, sn) + __field(u16, hwhead) + __field(u16, swhead) + __field(u16, txnext) + __field(u16, tx_tail) + __field(u16, tx_head) + __field(u8, idx) + ), + TP_fast_assign( + DD_DEV_ASSIGN(sde->dd); + __entry->hwhead = hwhead; + __entry->swhead = swhead; + __entry->tx_tail = sde->tx_tail; + __entry->tx_head = sde->tx_head; + __entry->txnext = txp ? txp->next_descq_idx : ~0; + __entry->idx = sde->this_idx; + __entry->sn = txp ? txp->sn : ~0; + ), + TP_printk( + "[%s] SDE(%u) sn %llu hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u", + __get_str(dev), + __entry->idx, + __entry->sn, + __entry->hwhead, + __entry->swhead, + __entry->txnext, + __entry->tx_head, + __entry->tx_tail + ) +); +#else +TRACE_EVENT(hfi1_sdma_progress, + TP_PROTO( + struct sdma_engine *sde, + u16 hwhead, + u16 swhead, + struct sdma_txreq *txp + ), + TP_ARGS(sde, hwhead, swhead, txp), + TP_STRUCT__entry( + DD_DEV_ENTRY(sde->dd) + __field(u16, hwhead) + __field(u16, swhead) + __field(u16, txnext) + __field(u16, tx_tail) + __field(u16, tx_head) + __field(u8, idx) + ), + TP_fast_assign( + DD_DEV_ASSIGN(sde->dd); + __entry->hwhead = hwhead; + __entry->swhead = swhead; + __entry->tx_tail = sde->tx_tail; + __entry->tx_head = sde->tx_head; + __entry->txnext = txp ? txp->next_descq_idx : ~0; + __entry->idx = sde->this_idx; + ), + TP_printk( + "[%s] SDE(%u) hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u", + __get_str(dev), + __entry->idx, + __entry->hwhead, + __entry->swhead, + __entry->txnext, + __entry->tx_head, + __entry->tx_tail + ) +); +#endif + +DECLARE_EVENT_CLASS(hfi1_sdma_sn, + TP_PROTO( + struct sdma_engine *sde, + u64 sn + ), + TP_ARGS(sde, sn), + TP_STRUCT__entry( + DD_DEV_ENTRY(sde->dd) + __field(u64, sn) + __field(u8, idx) + ), + TP_fast_assign( + DD_DEV_ASSIGN(sde->dd); + __entry->sn = sn; + __entry->idx = sde->this_idx; + ), + TP_printk( + "[%s] SDE(%u) sn %llu", + __get_str(dev), + __entry->idx, + __entry->sn + ) +); + +DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_out_sn, + TP_PROTO( + struct sdma_engine *sde, + u64 sn + ), + TP_ARGS(sde, sn) +); + +DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_in_sn, + TP_PROTO( + struct sdma_engine *sde, + u64 sn + ), + TP_ARGS(sde, sn) +); + +#define USDMA_HDR_FORMAT \ + "[%s:%u:%u:%u] PBC=(0x%x 0x%x) LRH=(0x%x 0x%x) BTH=(0x%x 0x%x 0x%x) KDETH=(0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x) TIDVal=0x%x" + +TRACE_EVENT(hfi1_sdma_user_header, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req, + struct hfi1_pkt_header *hdr, u32 tidval), + TP_ARGS(dd, ctxt, subctxt, req, hdr, tidval), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u8, subctxt) + __field(u16, req) + __field(__le32, pbc0) + __field(__le32, pbc1) + __field(__be32, lrh0) + __field(__be32, lrh1) + __field(__be32, bth0) + __field(__be32, bth1) + __field(__be32, bth2) + __field(__le32, kdeth0) + __field(__le32, kdeth1) + __field(__le32, kdeth2) + __field(__le32, kdeth3) + __field(__le32, kdeth4) + __field(__le32, kdeth5) + __field(__le32, kdeth6) + __field(__le32, kdeth7) + __field(__le32, kdeth8) + __field(u32, tidval) + ), + TP_fast_assign( + __le32 *pbc = (__le32 *)hdr->pbc; + __be32 *lrh = (__be32 *)hdr->lrh; + __be32 *bth = (__be32 *)hdr->bth; + __le32 *kdeth = (__le32 *)&hdr->kdeth; + + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->req = req; + __entry->pbc0 = pbc[0]; + __entry->pbc1 = pbc[1]; + __entry->lrh0 = be32_to_cpu(lrh[0]); + __entry->lrh1 = be32_to_cpu(lrh[1]); + __entry->bth0 = be32_to_cpu(bth[0]); + __entry->bth1 = be32_to_cpu(bth[1]); + __entry->bth2 = be32_to_cpu(bth[2]); + __entry->kdeth0 = kdeth[0]; + __entry->kdeth1 = kdeth[1]; + __entry->kdeth2 = kdeth[2]; + __entry->kdeth3 = kdeth[3]; + __entry->kdeth4 = kdeth[4]; + __entry->kdeth5 = kdeth[5]; + __entry->kdeth6 = kdeth[6]; + __entry->kdeth7 = kdeth[7]; + __entry->kdeth8 = kdeth[8]; + __entry->tidval = tidval; + ), + TP_printk(USDMA_HDR_FORMAT, + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->req, + __entry->pbc1, + __entry->pbc0, + __entry->lrh0, + __entry->lrh1, + __entry->bth0, + __entry->bth1, + __entry->bth2, + __entry->kdeth0, + __entry->kdeth1, + __entry->kdeth2, + __entry->kdeth3, + __entry->kdeth4, + __entry->kdeth5, + __entry->kdeth6, + __entry->kdeth7, + __entry->kdeth8, + __entry->tidval + ) + ); + +#define SDMA_UREQ_FMT \ + "[%s:%u:%u] ver/op=0x%x, iovcnt=%u, npkts=%u, frag=%u, idx=%u" +TRACE_EVENT(hfi1_sdma_user_reqinfo, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 *i), + TP_ARGS(dd, ctxt, subctxt, i), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd); + __field(u16, ctxt) + __field(u8, subctxt) + __field(u8, ver_opcode) + __field(u8, iovcnt) + __field(u16, npkts) + __field(u16, fragsize) + __field(u16, comp_idx) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->ver_opcode = i[0] & 0xff; + __entry->iovcnt = (i[0] >> 8) & 0xff; + __entry->npkts = i[1]; + __entry->fragsize = i[2]; + __entry->comp_idx = i[3]; + ), + TP_printk(SDMA_UREQ_FMT, + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->ver_opcode, + __entry->iovcnt, + __entry->npkts, + __entry->fragsize, + __entry->comp_idx + ) + ); + +#define usdma_complete_name(st) { st, #st } +#define show_usdma_complete_state(st) \ + __print_symbolic(st, \ + usdma_complete_name(FREE), \ + usdma_complete_name(QUEUED), \ + usdma_complete_name(COMPLETE), \ + usdma_complete_name(ERROR)) + +TRACE_EVENT(hfi1_sdma_user_completion, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 idx, + u8 state, int code), + TP_ARGS(dd, ctxt, subctxt, idx, state, code), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u8, subctxt) + __field(u16, idx) + __field(u8, state) + __field(int, code) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->idx = idx; + __entry->state = state; + __entry->code = code; + ), + TP_printk("[%s:%u:%u:%u] SDMA completion state %s (%d)", + __get_str(dev), __entry->ctxt, __entry->subctxt, + __entry->idx, show_usdma_complete_state(__entry->state), + __entry->code) + ); + +const char *print_u32_array(struct trace_seq *, u32 *, int); +#define __print_u32_hex(arr, len) print_u32_array(p, arr, len) + +TRACE_EVENT(hfi1_sdma_user_header_ahg, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req, + u8 sde, u8 ahgidx, u32 *ahg, int len, u32 tidval), + TP_ARGS(dd, ctxt, subctxt, req, sde, ahgidx, ahg, len, tidval), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u8, subctxt) + __field(u16, req) + __field(u8, sde) + __field(u8, idx) + __field(int, len) + __field(u32, tidval) + __array(u32, ahg, 10) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->req = req; + __entry->sde = sde; + __entry->idx = ahgidx; + __entry->len = len; + __entry->tidval = tidval; + memcpy(__entry->ahg, ahg, len * sizeof(u32)); + ), + TP_printk("[%s:%u:%u:%u] (SDE%u/AHG%u) ahg[0-%d]=(%s) TIDVal=0x%x", + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->req, + __entry->sde, + __entry->idx, + __entry->len - 1, + __print_u32_hex(__entry->ahg, __entry->len), + __entry->tidval + ) + ); + +TRACE_EVENT(hfi1_sdma_state, + TP_PROTO( + struct sdma_engine *sde, + const char *cstate, + const char *nstate + ), + TP_ARGS(sde, cstate, nstate), + TP_STRUCT__entry( + DD_DEV_ENTRY(sde->dd) + __string(curstate, cstate) + __string(newstate, nstate) + ), + TP_fast_assign( + DD_DEV_ASSIGN(sde->dd); + __assign_str(curstate, cstate); + __assign_str(newstate, nstate); + ), + TP_printk("[%s] current state %s new state %s", + __get_str(dev), + __get_str(curstate), + __get_str(newstate) + ) +); + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_rc + +DECLARE_EVENT_CLASS(hfi1_sdma_rc, + TP_PROTO(struct hfi1_qp *qp, u32 psn), + TP_ARGS(qp, psn), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, flags) + __field(u32, psn) + __field(u32, sending_psn) + __field(u32, sending_hpsn) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->qpn = qp->ibqp.qp_num; + __entry->flags = qp->s_flags; + __entry->psn = psn; + __entry->sending_psn = qp->s_sending_psn; + __entry->sending_hpsn = qp->s_sending_hpsn; + ), + TP_printk( + "[%s] qpn 0x%x flags 0x%x psn 0x%x sending_psn 0x%x sending_hpsn 0x%x", + __get_str(dev), + __entry->qpn, + __entry->flags, + __entry->psn, + __entry->sending_psn, + __entry->sending_psn + ) +); + +DEFINE_EVENT(hfi1_sdma_rc, hfi1_rc_sendcomplete, + TP_PROTO(struct hfi1_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_misc + +TRACE_EVENT(hfi1_interrupt, + TP_PROTO(struct hfi1_devdata *dd, const struct is_table *is_entry, + int src), + TP_ARGS(dd, is_entry, src), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __array(char, buf, 64) + __field(int, src) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd) + is_entry->is_name(__entry->buf, 64, src - is_entry->start); + __entry->src = src; + ), + TP_printk("[%s] source: %s [%d]", __get_str(dev), __entry->buf, + __entry->src) +); + +/* + * Note: + * This produces a REALLY ugly trace in the console output when the string is + * too long. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_trace + +#define MAX_MSG_LEN 512 + +DECLARE_EVENT_CLASS(hfi1_trace_template, + TP_PROTO(const char *function, struct va_format *vaf), + TP_ARGS(function, vaf), + TP_STRUCT__entry( + __string(function, function) + __dynamic_array(char, msg, MAX_MSG_LEN) + ), + TP_fast_assign( + __assign_str(function, function); + WARN_ON_ONCE(vsnprintf(__get_dynamic_array(msg), + MAX_MSG_LEN, vaf->fmt, + *vaf->va) >= MAX_MSG_LEN); + ), + TP_printk("(%s) %s", + __get_str(function), + __get_str(msg)) +); + +/* + * It may be nice to macroize the __hfi1_trace but the va_* stuff requires an + * actual function to work and can not be in a macro. + */ +#define __hfi1_trace_def(lvl) \ +void __hfi1_trace_##lvl(const char *funct, char *fmt, ...); \ + \ +DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl, \ + TP_PROTO(const char *function, struct va_format *vaf), \ + TP_ARGS(function, vaf)) + +#define __hfi1_trace_fn(lvl) \ +void __hfi1_trace_##lvl(const char *func, char *fmt, ...) \ +{ \ + struct va_format vaf = { \ + .fmt = fmt, \ + }; \ + va_list args; \ + \ + va_start(args, fmt); \ + vaf.va = &args; \ + trace_hfi1_ ##lvl(func, &vaf); \ + va_end(args); \ + return; \ +} + +/* + * To create a new trace level simply define it below and as a __hfi1_trace_fn + * in trace.c. This will create all the hooks for calling + * hfi1_cdbg(LVL, fmt, ...); as well as take care of all + * the debugfs stuff. + */ +__hfi1_trace_def(PKT); +__hfi1_trace_def(PROC); +__hfi1_trace_def(SDMA); +__hfi1_trace_def(LINKVERB); +__hfi1_trace_def(DEBUG); +__hfi1_trace_def(SNOOP); +__hfi1_trace_def(CNTR); +__hfi1_trace_def(PIO); +__hfi1_trace_def(DC8051); +__hfi1_trace_def(FIRMWARE); +__hfi1_trace_def(RCVCTRL); +__hfi1_trace_def(TID); + +#define hfi1_cdbg(which, fmt, ...) \ + __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__) + +#define hfi1_dbg(fmt, ...) \ + hfi1_cdbg(DEBUG, fmt, ##__VA_ARGS__) + +/* + * Define HFI1_EARLY_DBG at compile time or here to enable early trace + * messages. Do not check in an enablement for this. + */ + +#ifdef HFI1_EARLY_DBG +#define hfi1_dbg_early(fmt, ...) \ + trace_printk(fmt, ##__VA_ARGS__) +#else +#define hfi1_dbg_early(fmt, ...) +#endif + +#endif /* __HFI1_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace +#include diff --git a/drivers/staging/rdma/hfi1/twsi.c b/drivers/staging/rdma/hfi1/twsi.c new file mode 100644 index 000000000..ea54fd270 --- /dev/null +++ b/drivers/staging/rdma/hfi1/twsi.c @@ -0,0 +1,518 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include + +#include "hfi.h" +#include "twsi.h" + +/* + * "Two Wire Serial Interface" support. + * + * Originally written for a not-quite-i2c serial eeprom, which is + * still used on some supported boards. Later boards have added a + * variety of other uses, most board-specific, so the bit-boffing + * part has been split off to this file, while the other parts + * have been moved to chip-specific files. + * + * We have also dropped all pretense of fully generic (e.g. pretend + * we don't know whether '1' is the higher voltage) interface, as + * the restrictions of the generic i2c interface (e.g. no access from + * driver itself) make it unsuitable for this use. + */ + +#define READ_CMD 1 +#define WRITE_CMD 0 + +/** + * i2c_wait_for_writes - wait for a write + * @dd: the hfi1_ib device + * + * We use this instead of udelay directly, so we can make sure + * that previous register writes have been flushed all the way + * to the chip. Since we are delaying anyway, the cost doesn't + * hurt, and makes the bit twiddling more regular + */ +static void i2c_wait_for_writes(struct hfi1_devdata *dd, u32 target) +{ + /* + * implicit read of EXTStatus is as good as explicit + * read of scratch, if all we want to do is flush + * writes. + */ + hfi1_gpio_mod(dd, target, 0, 0, 0); + rmb(); /* inlined, so prevent compiler reordering */ +} + +/* + * QSFP modules are allowed to hold SCL low for 500uSec. Allow twice that + * for "almost compliant" modules + */ +#define SCL_WAIT_USEC 1000 + +/* BUF_WAIT is time bus must be free between STOP or ACK and to next START. + * Should be 20, but some chips need more. + */ +#define TWSI_BUF_WAIT_USEC 60 + +static void scl_out(struct hfi1_devdata *dd, u32 target, u8 bit) +{ + u32 mask; + + udelay(1); + + mask = QSFP_HFI0_I2CCLK; + + /* SCL is meant to be bare-drain, so never set "OUT", just DIR */ + hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask); + + /* + * Allow for slow slaves by simple + * delay for falling edge, sampling on rise. + */ + if (!bit) + udelay(2); + else { + int rise_usec; + + for (rise_usec = SCL_WAIT_USEC; rise_usec > 0; rise_usec -= 2) { + if (mask & hfi1_gpio_mod(dd, target, 0, 0, 0)) + break; + udelay(2); + } + if (rise_usec <= 0) + dd_dev_err(dd, "SCL interface stuck low > %d uSec\n", + SCL_WAIT_USEC); + } + i2c_wait_for_writes(dd, target); +} + +static void sda_out(struct hfi1_devdata *dd, u32 target, u8 bit) +{ + u32 mask; + + mask = QSFP_HFI0_I2CDAT; + + /* SDA is meant to be bare-drain, so never set "OUT", just DIR */ + hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask); + + i2c_wait_for_writes(dd, target); + udelay(2); +} + +static u8 sda_in(struct hfi1_devdata *dd, u32 target, int wait) +{ + u32 read_val, mask; + + mask = QSFP_HFI0_I2CDAT; + /* SDA is meant to be bare-drain, so never set "OUT", just DIR */ + hfi1_gpio_mod(dd, target, 0, 0, mask); + read_val = hfi1_gpio_mod(dd, target, 0, 0, 0); + if (wait) + i2c_wait_for_writes(dd, target); + return (read_val & mask) >> GPIO_SDA_NUM; +} + +/** + * i2c_ackrcv - see if ack following write is true + * @dd: the hfi1_ib device + */ +static int i2c_ackrcv(struct hfi1_devdata *dd, u32 target) +{ + u8 ack_received; + + /* AT ENTRY SCL = LOW */ + /* change direction, ignore data */ + ack_received = sda_in(dd, target, 1); + scl_out(dd, target, 1); + ack_received = sda_in(dd, target, 1) == 0; + scl_out(dd, target, 0); + return ack_received; +} + +static void stop_cmd(struct hfi1_devdata *dd, u32 target); + +/** + * rd_byte - read a byte, sending STOP on last, else ACK + * @dd: the hfi1_ib device + * + * Returns byte shifted out of device + */ +static int rd_byte(struct hfi1_devdata *dd, u32 target, int last) +{ + int bit_cntr, data; + + data = 0; + + for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) { + data <<= 1; + scl_out(dd, target, 1); + data |= sda_in(dd, target, 0); + scl_out(dd, target, 0); + } + if (last) { + scl_out(dd, target, 1); + stop_cmd(dd, target); + } else { + sda_out(dd, target, 0); + scl_out(dd, target, 1); + scl_out(dd, target, 0); + sda_out(dd, target, 1); + } + return data; +} + +/** + * wr_byte - write a byte, one bit at a time + * @dd: the hfi1_ib device + * @data: the byte to write + * + * Returns 0 if we got the following ack, otherwise 1 + */ +static int wr_byte(struct hfi1_devdata *dd, u32 target, u8 data) +{ + int bit_cntr; + u8 bit; + + for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) { + bit = (data >> bit_cntr) & 1; + sda_out(dd, target, bit); + scl_out(dd, target, 1); + scl_out(dd, target, 0); + } + return (!i2c_ackrcv(dd, target)) ? 1 : 0; +} + +/* + * issue TWSI start sequence: + * (both clock/data high, clock high, data low while clock is high) + */ +static void start_seq(struct hfi1_devdata *dd, u32 target) +{ + sda_out(dd, target, 1); + scl_out(dd, target, 1); + sda_out(dd, target, 0); + udelay(1); + scl_out(dd, target, 0); +} + +/** + * stop_seq - transmit the stop sequence + * @dd: the hfi1_ib device + * + * (both clock/data low, clock high, data high while clock is high) + */ +static void stop_seq(struct hfi1_devdata *dd, u32 target) +{ + scl_out(dd, target, 0); + sda_out(dd, target, 0); + scl_out(dd, target, 1); + sda_out(dd, target, 1); +} + +/** + * stop_cmd - transmit the stop condition + * @dd: the hfi1_ib device + * + * (both clock/data low, clock high, data high while clock is high) + */ +static void stop_cmd(struct hfi1_devdata *dd, u32 target) +{ + stop_seq(dd, target); + udelay(TWSI_BUF_WAIT_USEC); +} + +/** + * hfi1_twsi_reset - reset I2C communication + * @dd: the hfi1_ib device + */ + +int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target) +{ + int clock_cycles_left = 9; + int was_high = 0; + u32 pins, mask; + + /* Both SCL and SDA should be high. If not, there + * is something wrong. + */ + mask = QSFP_HFI0_I2CCLK | QSFP_HFI0_I2CDAT; + + /* + * Force pins to desired innocuous state. + * This is the default power-on state with out=0 and dir=0, + * So tri-stated and should be floating high (barring HW problems) + */ + hfi1_gpio_mod(dd, target, 0, 0, mask); + + /* + * Clock nine times to get all listeners into a sane state. + * If SDA does not go high at any point, we are wedged. + * One vendor recommends then issuing START followed by STOP. + * we cannot use our "normal" functions to do that, because + * if SCL drops between them, another vendor's part will + * wedge, dropping SDA and keeping it low forever, at the end of + * the next transaction (even if it was not the device addressed). + * So our START and STOP take place with SCL held high. + */ + while (clock_cycles_left--) { + scl_out(dd, target, 0); + scl_out(dd, target, 1); + /* Note if SDA is high, but keep clocking to sync slave */ + was_high |= sda_in(dd, target, 0); + } + + if (was_high) { + /* + * We saw a high, which we hope means the slave is sync'd. + * Issue START, STOP, pause for T_BUF. + */ + + pins = hfi1_gpio_mod(dd, target, 0, 0, 0); + if ((pins & mask) != mask) + dd_dev_err(dd, "GPIO pins not at rest: %d\n", + pins & mask); + /* Drop SDA to issue START */ + udelay(1); /* Guarantee .6 uSec setup */ + sda_out(dd, target, 0); + udelay(1); /* Guarantee .6 uSec hold */ + /* At this point, SCL is high, SDA low. Raise SDA for STOP */ + sda_out(dd, target, 1); + udelay(TWSI_BUF_WAIT_USEC); + } + + return !was_high; +} + +#define HFI1_TWSI_START 0x100 +#define HFI1_TWSI_STOP 0x200 + +/* Write byte to TWSI, optionally prefixed with START or suffixed with + * STOP. + * returns 0 if OK (ACK received), else != 0 + */ +static int twsi_wr(struct hfi1_devdata *dd, u32 target, int data, int flags) +{ + int ret = 1; + + if (flags & HFI1_TWSI_START) + start_seq(dd, target); + + /* Leaves SCL low (from i2c_ackrcv()) */ + ret = wr_byte(dd, target, data); + + if (flags & HFI1_TWSI_STOP) + stop_cmd(dd, target); + return ret; +} + +/* Added functionality for IBA7220-based cards */ +#define HFI1_TEMP_DEV 0x98 + +/* + * hfi1_twsi_blk_rd + * General interface for data transfer from twsi devices. + * One vestige of its former role is that it recognizes a device + * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part, + * which responded to all TWSI device codes, interpreting them as + * address within device. On all other devices found on board handled by + * this driver, the device is followed by a one-byte "address" which selects + * the "register" or "offset" within the device from which data should + * be read. + */ +int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr, + void *buffer, int len) +{ + int ret; + u8 *bp = buffer; + + ret = 1; + + if (dev == HFI1_TWSI_NO_DEV) { + /* legacy not-really-I2C */ + addr = (addr << 1) | READ_CMD; + ret = twsi_wr(dd, target, addr, HFI1_TWSI_START); + } else { + /* Actual I2C */ + ret = twsi_wr(dd, target, dev | WRITE_CMD, HFI1_TWSI_START); + if (ret) { + stop_cmd(dd, target); + ret = 1; + goto bail; + } + /* + * SFF spec claims we do _not_ stop after the addr + * but simply issue a start with the "read" dev-addr. + * Since we are implicitly waiting for ACK here, + * we need t_buf (nominally 20uSec) before that start, + * and cannot rely on the delay built in to the STOP + */ + ret = twsi_wr(dd, target, addr, 0); + udelay(TWSI_BUF_WAIT_USEC); + + if (ret) { + dd_dev_err(dd, + "Failed to write interface read addr %02X\n", + addr); + ret = 1; + goto bail; + } + ret = twsi_wr(dd, target, dev | READ_CMD, HFI1_TWSI_START); + } + if (ret) { + stop_cmd(dd, target); + ret = 1; + goto bail; + } + + /* + * block devices keeps clocking data out as long as we ack, + * automatically incrementing the address. Some have "pages" + * whose boundaries will not be crossed, but the handling + * of these is left to the caller, who is in a better + * position to know. + */ + while (len-- > 0) { + /* + * Get and store data, sending ACK if length remaining, + * else STOP + */ + *bp++ = rd_byte(dd, target, !len); + } + + ret = 0; + +bail: + return ret; +} + +/* + * hfi1_twsi_blk_wr + * General interface for data transfer to twsi devices. + * One vestige of its former role is that it recognizes a device + * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part, + * which responded to all TWSI device codes, interpreting them as + * address within device. On all other devices found on board handled by + * this driver, the device is followed by a one-byte "address" which selects + * the "register" or "offset" within the device to which data should + * be written. + */ +int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr, + const void *buffer, int len) +{ + int sub_len; + const u8 *bp = buffer; + int max_wait_time, i; + int ret = 1; + + while (len > 0) { + if (dev == HFI1_TWSI_NO_DEV) { + if (twsi_wr(dd, target, (addr << 1) | WRITE_CMD, + HFI1_TWSI_START)) { + goto failed_write; + } + } else { + /* Real I2C */ + if (twsi_wr(dd, target, + dev | WRITE_CMD, HFI1_TWSI_START)) + goto failed_write; + ret = twsi_wr(dd, target, addr, 0); + if (ret) { + dd_dev_err(dd, + "Failed to write interface write addr %02X\n", + addr); + goto failed_write; + } + } + + sub_len = min(len, 4); + addr += sub_len; + len -= sub_len; + + for (i = 0; i < sub_len; i++) + if (twsi_wr(dd, target, *bp++, 0)) + goto failed_write; + + stop_cmd(dd, target); + + /* + * Wait for write complete by waiting for a successful + * read (the chip replies with a zero after the write + * cmd completes, and before it writes to the eeprom. + * The startcmd for the read will fail the ack until + * the writes have completed. We do this inline to avoid + * the debug prints that are in the real read routine + * if the startcmd fails. + * We also use the proper device address, so it doesn't matter + * whether we have real eeprom_dev. Legacy likes any address. + */ + max_wait_time = 100; + while (twsi_wr(dd, target, + dev | READ_CMD, HFI1_TWSI_START)) { + stop_cmd(dd, target); + if (!--max_wait_time) + goto failed_write; + } + /* now read (and ignore) the resulting byte */ + rd_byte(dd, target, 1); + } + + ret = 0; + goto bail; + +failed_write: + stop_cmd(dd, target); + ret = 1; + +bail: + return ret; +} diff --git a/drivers/staging/rdma/hfi1/twsi.h b/drivers/staging/rdma/hfi1/twsi.h new file mode 100644 index 000000000..5907e0296 --- /dev/null +++ b/drivers/staging/rdma/hfi1/twsi.h @@ -0,0 +1,68 @@ +#ifndef _TWSI_H +#define _TWSI_H +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#define HFI1_TWSI_NO_DEV 0xFF + +struct hfi1_devdata; + +/* Bit position of SDA pin in ASIC_QSFP* registers */ +#define GPIO_SDA_NUM 1 + +/* these functions must be called with qsfp_lock held */ +int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target); +int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr, + void *buffer, int len); +int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr, + const void *buffer, int len); + + +#endif /* _TWSI_H */ diff --git a/drivers/staging/rdma/hfi1/uc.c b/drivers/staging/rdma/hfi1/uc.c new file mode 100644 index 000000000..b536f3977 --- /dev/null +++ b/drivers/staging/rdma/hfi1/uc.c @@ -0,0 +1,585 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "hfi.h" +#include "sdma.h" +#include "qp.h" + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_UC_##x + +/** + * hfi1_make_uc_req - construct a request packet (SEND, RDMA write) + * @qp: a pointer to the QP + * + * Return 1 if constructed; otherwise, return 0. + */ +int hfi1_make_uc_req(struct hfi1_qp *qp) +{ + struct hfi1_other_headers *ohdr; + struct hfi1_swqe *wqe; + unsigned long flags; + u32 hwords = 5; + u32 bth0 = 0; + u32 len; + u32 pmtu = qp->pmtu; + int ret = 0; + int middle = 0; + + spin_lock_irqsave(&qp->s_lock, flags); + + if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_SEND_OK)) { + if (!(ib_hfi1_state_ops[qp->state] & HFI1_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_iowait.sdma_busy)) { + qp->s_flags |= HFI1_S_WAIT_DMA; + goto bail; + } + clear_ahg(qp); + wqe = get_swqe_ptr(qp, qp->s_last); + hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done; + } + + ohdr = &qp->s_hdr->ibh.u.oth; + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + ohdr = &qp->s_hdr->ibh.u.l.oth; + + /* Get the next send request. */ + wqe = get_swqe_ptr(qp, qp->s_cur); + qp->s_wqe = NULL; + switch (qp->s_state) { + default: + if (!(ib_hfi1_state_ops[qp->state] & + HFI1_PROCESS_NEXT_SEND_OK)) + goto bail; + /* Check if send work queue is empty. */ + if (qp->s_cur == qp->s_head) { + clear_ahg(qp); + goto bail; + } + /* + * Start a new request. + */ + wqe->psn = qp->s_next_psn; + qp->s_psn = qp->s_next_psn; + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + len = wqe->length; + qp->s_len = len; + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + if (len > pmtu) { + qp->s_state = OP(SEND_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_ONLY); + else { + qp->s_state = + OP(SEND_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + hwords += sizeof(struct ib_reth) / 4; + if (len > pmtu) { + qp->s_state = OP(RDMA_WRITE_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_ONLY); + else { + qp->s_state = + OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the RETH */ + ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + default: + goto bail; + } + break; + + case OP(SEND_FIRST): + qp->s_state = OP(SEND_MIDDLE); + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + middle = HFI1_CAP_IS_KSET(SDMA_AHG); + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_LAST); + else { + qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_WRITE_FIRST): + qp->s_state = OP(RDMA_WRITE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + middle = HFI1_CAP_IS_KSET(SDMA_AHG); + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_LAST); + else { + qp->s_state = + OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + } + qp->s_len -= len; + qp->s_hdrwords = hwords; + qp->s_cur_sge = &qp->s_sge; + qp->s_cur_size = len; + hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), + mask_psn(qp->s_next_psn++), middle); +done: + ret = 1; + goto unlock; + +bail: + qp->s_flags &= ~HFI1_S_BUSY; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +/** + * hfi1_uc_rcv - handle an incoming UC packet + * @ibp: the port the packet came in on + * @hdr: the header of the packet + * @rcv_flags: flags relevant to rcv processing + * @data: the packet data + * @tlen: the length of the packet + * @qp: the QP for this packet. + * + * This is called from qp_rcv() to process an incoming UC packet + * for the given QP. + * Called at interrupt level. + */ +void hfi1_uc_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data; + struct hfi1_ib_header *hdr = packet->hdr; + u32 rcv_flags = packet->rcv_flags; + void *data = packet->ebuf; + u32 tlen = packet->tlen; + struct hfi1_qp *qp = packet->qp; + struct hfi1_other_headers *ohdr = packet->ohdr; + u32 opcode; + u32 hdrsize = packet->hlen; + u32 psn; + u32 pad; + struct ib_wc wc; + u32 pmtu = qp->pmtu; + struct ib_reth *reth; + int has_grh = rcv_flags & HFI1_HAS_GRH; + int ret; + u32 bth1; + struct ib_grh *grh = NULL; + + opcode = be32_to_cpu(ohdr->bth[0]); + if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode)) + return; + + bth1 = be32_to_cpu(ohdr->bth[1]); + if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) { + if (bth1 & HFI1_BECN_SMASK) { + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 rqpn, lqpn; + u16 rlid = be16_to_cpu(hdr->lrh[3]); + u8 sl, sc5; + + lqpn = bth1 & HFI1_QPN_MASK; + rqpn = qp->remote_qpn; + + sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl]; + sl = ibp->sc_to_sl[sc5]; + + process_becn(ppd, sl, rlid, lqpn, rqpn, + IB_CC_SVCTYPE_UC); + } + + if (bth1 & HFI1_FECN_SMASK) { + u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]); + u16 slid = be16_to_cpu(hdr->lrh[3]); + u16 dlid = be16_to_cpu(hdr->lrh[1]); + u32 src_qp = qp->remote_qpn; + u8 sc5; + + sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl]; + + return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5, grh); + } + } + + psn = be32_to_cpu(ohdr->bth[2]); + opcode >>= 24; + + /* Compare the PSN verses the expected PSN. */ + if (unlikely(cmp_psn(psn, qp->r_psn) != 0)) { + /* + * Handle a sequence error. + * Silently drop any current message. + */ + qp->r_psn = psn; +inv: + if (qp->r_state == OP(SEND_FIRST) || + qp->r_state == OP(SEND_MIDDLE)) { + set_bit(HFI1_R_REWIND_SGE, &qp->r_aflags); + qp->r_sge.num_sge = 0; + } else + hfi1_put_ss(&qp->r_sge); + qp->r_state = OP(SEND_LAST); + switch (opcode) { + case OP(SEND_FIRST): + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + goto send_first; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): + goto rdma_first; + + default: + goto drop; + } + } + + /* Check for opcode sequence errors. */ + switch (qp->r_state) { + case OP(SEND_FIRST): + case OP(SEND_MIDDLE): + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE)) + break; + goto inv; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_MIDDLE): + if (opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + break; + goto inv; + + default: + if (opcode == OP(SEND_FIRST) || + opcode == OP(SEND_ONLY) || + opcode == OP(SEND_ONLY_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_FIRST) || + opcode == OP(RDMA_WRITE_ONLY) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + break; + goto inv; + } + + if (qp->state == IB_QPS_RTR && !(qp->r_flags & HFI1_R_COMM_EST)) + qp_comm_est(qp); + + /* OK, process the packet. */ + switch (opcode) { + case OP(SEND_FIRST): + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): +send_first: + if (test_and_clear_bit(HFI1_R_REWIND_SGE, &qp->r_aflags)) + qp->r_sge = qp->s_rdma_read_sge; + else { + ret = hfi1_get_rwqe(qp, 0); + if (ret < 0) + goto op_err; + if (!ret) + goto drop; + /* + * qp->s_rdma_read_sge will be the owner + * of the mr references. + */ + qp->s_rdma_read_sge = qp->r_sge; + } + qp->r_rcv_len = 0; + if (opcode == OP(SEND_ONLY)) + goto no_immediate_data; + else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE)) + goto send_last_imm; + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto rewind; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto rewind; + hfi1_copy_sge(&qp->r_sge, data, pmtu, 0); + break; + + case OP(SEND_LAST_WITH_IMMEDIATE): +send_last_imm: + wc.ex.imm_data = ohdr->u.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + goto send_last; + case OP(SEND_LAST): +no_immediate_data: + wc.ex.imm_data = 0; + wc.wc_flags = 0; +send_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto rewind; + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + wc.byte_len = tlen + qp->r_rcv_len; + if (unlikely(wc.byte_len > qp->r_len)) + goto rewind; + wc.opcode = IB_WC_RECV; + hfi1_copy_sge(&qp->r_sge, data, tlen, 0); + hfi1_put_ss(&qp->s_rdma_read_sge); +last_imm: + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + /* + * It seems that IB mandates the presence of an SL in a + * work completion only for the UD transport (see section + * 11.4.2 of IBTA Vol. 1). + * + * However, the way the SL is chosen below is consistent + * with the way that IB/qib works and is trying avoid + * introducing incompatibilities. + * + * See also OPA Vol. 1, section 9.7.6, and table 9-17. + */ + wc.sl = qp->remote_ah_attr.sl; + /* zero fields that are N/A */ + wc.vendor_err = 0; + wc.pkey_index = 0; + wc.dlid_path_bits = 0; + wc.port_num = 0; + /* Signal completion event if the solicited bit is set. */ + hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + cpu_to_be32(IB_BTH_SOLICITED)) != 0); + break; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */ +rdma_first: + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_WRITE))) { + goto drop; + } + reth = &ohdr->u.rc.reth; + qp->r_len = be32_to_cpu(reth->length); + qp->r_rcv_len = 0; + qp->r_sge.sg_list = NULL; + if (qp->r_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey */ + ok = hfi1_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, + vaddr, rkey, IB_ACCESS_REMOTE_WRITE); + if (unlikely(!ok)) + goto drop; + qp->r_sge.num_sge = 1; + } else { + qp->r_sge.num_sge = 0; + qp->r_sge.sge.mr = NULL; + qp->r_sge.sge.vaddr = NULL; + qp->r_sge.sge.length = 0; + qp->r_sge.sge.sge_length = 0; + } + if (opcode == OP(RDMA_WRITE_ONLY)) + goto rdma_last; + else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) { + wc.ex.imm_data = ohdr->u.rc.imm_data; + goto rdma_last_imm; + } + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto drop; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto drop; + hfi1_copy_sge(&qp->r_sge, data, pmtu, 1); + break; + + case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): + wc.ex.imm_data = ohdr->u.imm_data; +rdma_last_imm: + wc.wc_flags = IB_WC_WITH_IMM; + + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto drop; + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) + goto drop; + if (test_and_clear_bit(HFI1_R_REWIND_SGE, &qp->r_aflags)) + hfi1_put_ss(&qp->s_rdma_read_sge); + else { + ret = hfi1_get_rwqe(qp, 1); + if (ret < 0) + goto op_err; + if (!ret) + goto drop; + } + wc.byte_len = qp->r_len; + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + hfi1_copy_sge(&qp->r_sge, data, tlen, 1); + hfi1_put_ss(&qp->r_sge); + goto last_imm; + + case OP(RDMA_WRITE_LAST): +rdma_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto drop; + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) + goto drop; + hfi1_copy_sge(&qp->r_sge, data, tlen, 1); + hfi1_put_ss(&qp->r_sge); + break; + + default: + /* Drop packet for unknown opcodes. */ + goto drop; + } + qp->r_psn++; + qp->r_state = opcode; + return; + +rewind: + set_bit(HFI1_R_REWIND_SGE, &qp->r_aflags); + qp->r_sge.num_sge = 0; +drop: + ibp->n_pkt_drops++; + return; + +op_err: + hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + return; + +} diff --git a/drivers/staging/rdma/hfi1/ud.c b/drivers/staging/rdma/hfi1/ud.c new file mode 100644 index 000000000..d40d1a1e1 --- /dev/null +++ b/drivers/staging/rdma/hfi1/ud.c @@ -0,0 +1,885 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include + +#include "hfi.h" +#include "mad.h" +#include "qp.h" + +/** + * ud_loopback - handle send on loopback QPs + * @sqp: the sending QP + * @swqe: the send work request + * + * This is called from hfi1_make_ud_req() to forward a WQE addressed + * to the same HFI. + * Note that the receive interrupt handler may be calling hfi1_ud_rcv() + * while this is being called. + */ +static void ud_loopback(struct hfi1_qp *sqp, struct hfi1_swqe *swqe) +{ + struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); + struct hfi1_pportdata *ppd; + struct hfi1_qp *qp; + struct ib_ah_attr *ah_attr; + unsigned long flags; + struct hfi1_sge_state ssge; + struct hfi1_sge *sge; + struct ib_wc wc; + u32 length; + enum ib_qp_type sqptype, dqptype; + + rcu_read_lock(); + + qp = hfi1_lookup_qpn(ibp, swqe->wr.wr.ud.remote_qpn); + if (!qp) { + ibp->n_pkt_drops++; + rcu_read_unlock(); + return; + } + + sqptype = sqp->ibqp.qp_type == IB_QPT_GSI ? + IB_QPT_UD : sqp->ibqp.qp_type; + dqptype = qp->ibqp.qp_type == IB_QPT_GSI ? + IB_QPT_UD : qp->ibqp.qp_type; + + if (dqptype != sqptype || + !(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK)) { + ibp->n_pkt_drops++; + goto drop; + } + + ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr; + ppd = ppd_from_ibp(ibp); + + if (qp->ibqp.qp_num > 1) { + u16 pkey; + u16 slid; + u8 sc5 = ibp->sl_to_sc[ah_attr->sl]; + + pkey = hfi1_get_pkey(ibp, sqp->s_pkey_index); + slid = ppd->lid | (ah_attr->src_path_bits & + ((1 << ppd->lmc) - 1)); + if (unlikely(ingress_pkey_check(ppd, pkey, sc5, + qp->s_pkey_index, slid))) { + hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY, pkey, + ah_attr->sl, + sqp->ibqp.qp_num, qp->ibqp.qp_num, + cpu_to_be16(slid), + cpu_to_be16(ah_attr->dlid)); + goto drop; + } + } + + /* + * Check that the qkey matches (except for QP0, see 9.6.1.4.1). + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + if (qp->ibqp.qp_num) { + u32 qkey; + + qkey = (int)swqe->wr.wr.ud.remote_qkey < 0 ? + sqp->qkey : swqe->wr.wr.ud.remote_qkey; + if (unlikely(qkey != qp->qkey)) { + u16 lid; + + lid = ppd->lid | (ah_attr->src_path_bits & + ((1 << ppd->lmc) - 1)); + hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_QKEY, qkey, + ah_attr->sl, + sqp->ibqp.qp_num, qp->ibqp.qp_num, + cpu_to_be16(lid), + cpu_to_be16(ah_attr->dlid)); + goto drop; + } + } + + /* + * A GRH is expected to precede the data even if not + * present on the wire. + */ + length = swqe->length; + memset(&wc, 0, sizeof(wc)); + wc.byte_len = length + sizeof(struct ib_grh); + + if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = swqe->wr.ex.imm_data; + } + + spin_lock_irqsave(&qp->r_lock, flags); + + /* + * Get the next work request entry to find where to put the data. + */ + if (qp->r_flags & HFI1_R_REUSE_SGE) + qp->r_flags &= ~HFI1_R_REUSE_SGE; + else { + int ret; + + ret = hfi1_get_rwqe(qp, 0); + if (ret < 0) { + hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + goto bail_unlock; + } + if (!ret) { + if (qp->ibqp.qp_num == 0) + ibp->n_vl15_dropped++; + goto bail_unlock; + } + } + /* Silently drop packets which are too big. */ + if (unlikely(wc.byte_len > qp->r_len)) { + qp->r_flags |= HFI1_R_REUSE_SGE; + ibp->n_pkt_drops++; + goto bail_unlock; + } + + if (ah_attr->ah_flags & IB_AH_GRH) { + hfi1_copy_sge(&qp->r_sge, &ah_attr->grh, + sizeof(struct ib_grh), 1); + wc.wc_flags |= IB_WC_GRH; + } else + hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1); + ssge.sg_list = swqe->sg_list + 1; + ssge.sge = *swqe->sg_list; + ssge.num_sge = swqe->wr.num_sge; + sge = &ssge.sge; + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + WARN_ON_ONCE(len == 0); + hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, 1); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ssge.num_sge) + *sge = *ssge.sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= HFI1_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } + hfi1_put_ss(&qp->r_sge); + if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags)) + goto bail_unlock; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + wc.src_qp = sqp->ibqp.qp_num; + if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) { + if (sqp->ibqp.qp_type == IB_QPT_GSI || + sqp->ibqp.qp_type == IB_QPT_SMI) + wc.pkey_index = swqe->wr.wr.ud.pkey_index; + else + wc.pkey_index = sqp->s_pkey_index; + } else { + wc.pkey_index = 0; + } + wc.slid = ppd->lid | (ah_attr->src_path_bits & ((1 << ppd->lmc) - 1)); + /* Check for loopback when the port lid is not set */ + if (wc.slid == 0 && sqp->ibqp.qp_type == IB_QPT_GSI) + wc.slid = HFI1_PERMISSIVE_LID; + wc.sl = ah_attr->sl; + wc.dlid_path_bits = ah_attr->dlid & ((1 << ppd->lmc) - 1); + wc.port_num = qp->port_num; + /* Signal completion event if the solicited bit is set. */ + hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + swqe->wr.send_flags & IB_SEND_SOLICITED); + ibp->n_loop_pkts++; +bail_unlock: + spin_unlock_irqrestore(&qp->r_lock, flags); +drop: + rcu_read_unlock(); +} + +/** + * hfi1_make_ud_req - construct a UD request packet + * @qp: the QP + * + * Return 1 if constructed; otherwise, return 0. + */ +int hfi1_make_ud_req(struct hfi1_qp *qp) +{ + struct hfi1_other_headers *ohdr; + struct ib_ah_attr *ah_attr; + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + struct hfi1_swqe *wqe; + unsigned long flags; + u32 nwords; + u32 extra_bytes; + u32 bth0; + u16 lrh0; + u16 lid; + int ret = 0; + int next_cur; + u8 sc5; + + spin_lock_irqsave(&qp->s_lock, flags); + + if (!(ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_NEXT_SEND_OK)) { + if (!(ib_hfi1_state_ops[qp->state] & HFI1_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_iowait.sdma_busy)) { + qp->s_flags |= HFI1_S_WAIT_DMA; + goto bail; + } + wqe = get_swqe_ptr(qp, qp->s_last); + hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done; + } + + if (qp->s_cur == qp->s_head) + goto bail; + + wqe = get_swqe_ptr(qp, qp->s_cur); + next_cur = qp->s_cur + 1; + if (next_cur >= qp->s_size) + next_cur = 0; + + /* Construct the header. */ + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr; + if (ah_attr->dlid < HFI1_MULTICAST_LID_BASE || + ah_attr->dlid == HFI1_PERMISSIVE_LID) { + lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1); + if (unlikely(!loopback && (lid == ppd->lid || + (lid == HFI1_PERMISSIVE_LID && + qp->ibqp.qp_type == IB_QPT_GSI)))) { + /* + * If DMAs are in progress, we can't generate + * a completion for the loopback packet since + * it would be out of order. + * Instead of waiting, we could queue a + * zero length descriptor so we get a callback. + */ + if (atomic_read(&qp->s_iowait.sdma_busy)) { + qp->s_flags |= HFI1_S_WAIT_DMA; + goto bail; + } + qp->s_cur = next_cur; + spin_unlock_irqrestore(&qp->s_lock, flags); + ud_loopback(qp, wqe); + spin_lock_irqsave(&qp->s_lock, flags); + hfi1_send_complete(qp, wqe, IB_WC_SUCCESS); + goto done; + } + } + + qp->s_cur = next_cur; + extra_bytes = -wqe->length & 3; + nwords = (wqe->length + extra_bytes) >> 2; + + /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */ + qp->s_hdrwords = 7; + qp->s_cur_size = wqe->length; + qp->s_cur_sge = &qp->s_sge; + qp->s_srate = ah_attr->static_rate; + qp->srate_mbps = ib_rate_to_mbps(qp->s_srate); + qp->s_wqe = wqe; + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + + if (ah_attr->ah_flags & IB_AH_GRH) { + /* Header size in 32-bit words. */ + qp->s_hdrwords += hfi1_make_grh(ibp, &qp->s_hdr->ibh.u.l.grh, + &ah_attr->grh, + qp->s_hdrwords, nwords); + lrh0 = HFI1_LRH_GRH; + ohdr = &qp->s_hdr->ibh.u.l.oth; + /* + * Don't worry about sending to locally attached multicast + * QPs. It is unspecified by the spec. what happens. + */ + } else { + /* Header size in 32-bit words. */ + lrh0 = HFI1_LRH_BTH; + ohdr = &qp->s_hdr->ibh.u.oth; + } + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + qp->s_hdrwords++; + ohdr->u.ud.imm_data = wqe->wr.ex.imm_data; + bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24; + } else + bth0 = IB_OPCODE_UD_SEND_ONLY << 24; + sc5 = ibp->sl_to_sc[ah_attr->sl]; + lrh0 |= (ah_attr->sl & 0xf) << 4; + if (qp->ibqp.qp_type == IB_QPT_SMI) { + lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */ + qp->s_sc = 0xf; + } else { + lrh0 |= (sc5 & 0xf) << 12; + qp->s_sc = sc5; + } + qp->s_hdr->ibh.lrh[0] = cpu_to_be16(lrh0); + qp->s_hdr->ibh.lrh[1] = cpu_to_be16(ah_attr->dlid); /* DEST LID */ + qp->s_hdr->ibh.lrh[2] = + cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); + if (ah_attr->dlid == be16_to_cpu(IB_LID_PERMISSIVE)) + qp->s_hdr->ibh.lrh[3] = IB_LID_PERMISSIVE; + else { + lid = ppd->lid; + if (lid) { + lid |= ah_attr->src_path_bits & ((1 << ppd->lmc) - 1); + qp->s_hdr->ibh.lrh[3] = cpu_to_be16(lid); + } else + qp->s_hdr->ibh.lrh[3] = IB_LID_PERMISSIVE; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth0 |= extra_bytes << 20; + if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) + bth0 |= hfi1_get_pkey(ibp, wqe->wr.wr.ud.pkey_index); + else + bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index); + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(wqe->wr.wr.ud.remote_qpn); + ohdr->bth[2] = cpu_to_be32(mask_psn(qp->s_next_psn++)); + /* + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->wr.wr.ud.remote_qkey < 0 ? + qp->qkey : wqe->wr.wr.ud.remote_qkey); + ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); + /* disarm any ahg */ + qp->s_hdr->ahgcount = 0; + qp->s_hdr->ahgidx = 0; + qp->s_hdr->tx_flags = 0; + qp->s_hdr->sde = NULL; + +done: + ret = 1; + goto unlock; + +bail: + qp->s_flags &= ~HFI1_S_BUSY; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +/* + * Hardware can't check this so we do it here. + * + * This is a slightly different algorithm than the standard pkey check. It + * special cases the management keys and allows for 0x7fff and 0xffff to be in + * the table at the same time. + * + * @returns the index found or -1 if not found + */ +int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + unsigned i; + + if (pkey == FULL_MGMT_P_KEY || pkey == LIM_MGMT_P_KEY) { + unsigned lim_idx = -1; + + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i) { + /* here we look for an exact match */ + if (ppd->pkeys[i] == pkey) + return i; + if (ppd->pkeys[i] == LIM_MGMT_P_KEY) + lim_idx = i; + } + + /* did not find 0xffff return 0x7fff idx if found */ + if (pkey == FULL_MGMT_P_KEY) + return lim_idx; + + /* no match... */ + return -1; + } + + pkey &= 0x7fff; /* remove limited/full membership bit */ + + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i) + if ((ppd->pkeys[i] & 0x7fff) == pkey) + return i; + + /* + * Should not get here, this means hardware failed to validate pkeys. + */ + return -1; +} + +void return_cnp(struct hfi1_ibport *ibp, struct hfi1_qp *qp, u32 remote_qpn, + u32 pkey, u32 slid, u32 dlid, u8 sc5, + const struct ib_grh *old_grh) +{ + u64 pbc, pbc_flags = 0; + u32 bth0, plen, vl, hwords = 5; + u16 lrh0; + u8 sl = ibp->sc_to_sl[sc5]; + struct hfi1_ib_header hdr; + struct hfi1_other_headers *ohdr; + struct pio_buf *pbuf; + struct send_context *ctxt = qp_to_send_context(qp, sc5); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + if (old_grh) { + struct ib_grh *grh = &hdr.u.l.grh; + + grh->version_tclass_flow = old_grh->version_tclass_flow; + grh->paylen = cpu_to_be16((hwords - 2 + SIZE_OF_CRC) << 2); + grh->hop_limit = 0xff; + grh->sgid = old_grh->dgid; + grh->dgid = old_grh->sgid; + ohdr = &hdr.u.l.oth; + lrh0 = HFI1_LRH_GRH; + hwords += sizeof(struct ib_grh) / sizeof(u32); + } else { + ohdr = &hdr.u.oth; + lrh0 = HFI1_LRH_BTH; + } + + lrh0 |= (sc5 & 0xf) << 12 | sl << 4; + + bth0 = pkey | (IB_OPCODE_CNP << 24); + ohdr->bth[0] = cpu_to_be32(bth0); + + ohdr->bth[1] = cpu_to_be32(remote_qpn | (1 << HFI1_BECN_SHIFT)); + ohdr->bth[2] = 0; /* PSN 0 */ + + hdr.lrh[0] = cpu_to_be16(lrh0); + hdr.lrh[1] = cpu_to_be16(dlid); + hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC); + hdr.lrh[3] = cpu_to_be16(slid); + + plen = 2 /* PBC */ + hwords; + pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT; + vl = sc_to_vlt(ppd->dd, sc5); + pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen); + if (ctxt) { + pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL); + if (pbuf) + ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, + &hdr, hwords); + } +} + +/* + * opa_smp_check() - Do the regular pkey checking, and the additional + * checks for SMPs specified in OPAv1 rev 0.90, section 9.10.26 + * ("SMA Packet Checks"). + * + * Note that: + * - Checks are done using the pkey directly from the packet's BTH, + * and specifically _not_ the pkey that we attach to the completion, + * which may be different. + * - These checks are specifically for "non-local" SMPs (i.e., SMPs + * which originated on another node). SMPs which are sent from, and + * destined to this node are checked in opa_local_smp_check(). + * + * At the point where opa_smp_check() is called, we know: + * - destination QP is QP0 + * + * opa_smp_check() returns 0 if all checks succeed, 1 otherwise. + */ +static int opa_smp_check(struct hfi1_ibport *ibp, u16 pkey, u8 sc5, + struct hfi1_qp *qp, u16 slid, struct opa_smp *smp) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + /* + * I don't think it's possible for us to get here with sc != 0xf, + * but check it to be certain. + */ + if (sc5 != 0xf) + return 1; + + if (rcv_pkey_check(ppd, pkey, sc5, slid)) + return 1; + + /* + * At this point we know (and so don't need to check again) that + * the pkey is either LIM_MGMT_P_KEY, or FULL_MGMT_P_KEY + * (see ingress_pkey_check). + */ + if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE && + smp->mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED) { + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; + } + + /* + * SMPs fall into one of four (disjoint) categories: + * SMA request, SMA response, trap, or trap repress. + * Our response depends, in part, on which type of + * SMP we're processing. + * + * If this is not an SMA request, or trap repress: + * - accept MAD if the port is running an SM + * - pkey == FULL_MGMT_P_KEY => + * reply with unsupported method (i.e., just mark + * the smp's status field here, and let it be + * processed normally) + * - pkey != LIM_MGMT_P_KEY => + * increment port recv constraint errors, drop MAD + * If this is an SMA request or trap repress: + * - pkey != FULL_MGMT_P_KEY => + * increment port recv constraint errors, drop MAD + */ + switch (smp->method) { + case IB_MGMT_METHOD_GET: + case IB_MGMT_METHOD_SET: + case IB_MGMT_METHOD_REPORT: + case IB_MGMT_METHOD_TRAP_REPRESS: + if (pkey != FULL_MGMT_P_KEY) { + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; + } + break; + case IB_MGMT_METHOD_SEND: + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_GET_RESP: + case IB_MGMT_METHOD_REPORT_RESP: + if (ibp->port_cap_flags & IB_PORT_SM) + return 0; + if (pkey == FULL_MGMT_P_KEY) { + smp->status |= IB_SMP_UNSUP_METHOD; + return 0; + } + if (pkey != LIM_MGMT_P_KEY) { + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; + } + break; + default: + break; + } + return 0; +} + + +/** + * hfi1_ud_rcv - receive an incoming UD packet + * @ibp: the port the packet came in on + * @hdr: the packet header + * @rcv_flags: flags relevant to rcv processing + * @data: the packet data + * @tlen: the packet length + * @qp: the QP the packet came on + * + * This is called from qp_rcv() to process an incoming UD packet + * for the given QP. + * Called at interrupt level. + */ +void hfi1_ud_rcv(struct hfi1_packet *packet) +{ + struct hfi1_other_headers *ohdr = packet->ohdr; + int opcode; + u32 hdrsize = packet->hlen; + u32 pad; + struct ib_wc wc; + u32 qkey; + u32 src_qp; + u16 dlid, pkey; + int mgmt_pkey_idx = -1; + struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data; + struct hfi1_ib_header *hdr = packet->hdr; + u32 rcv_flags = packet->rcv_flags; + void *data = packet->ebuf; + u32 tlen = packet->tlen; + struct hfi1_qp *qp = packet->qp; + bool has_grh = rcv_flags & HFI1_HAS_GRH; + bool sc4_bit = has_sc4_bit(packet); + u8 sc; + u32 bth1; + int is_mcast; + struct ib_grh *grh = NULL; + + qkey = be32_to_cpu(ohdr->u.ud.deth[0]); + src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & HFI1_QPN_MASK; + dlid = be16_to_cpu(hdr->lrh[1]); + is_mcast = (dlid > HFI1_MULTICAST_LID_BASE) && + (dlid != HFI1_PERMISSIVE_LID); + bth1 = be32_to_cpu(ohdr->bth[1]); + if (unlikely(bth1 & HFI1_BECN_SMASK)) { + /* + * In pre-B0 h/w the CNP_OPCODE is handled via an + * error path (errata 291394). + */ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 lqpn = be32_to_cpu(ohdr->bth[1]) & HFI1_QPN_MASK; + u8 sl, sc5; + + sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf; + sc5 |= sc4_bit; + sl = ibp->sc_to_sl[sc5]; + + process_becn(ppd, sl, 0, lqpn, 0, IB_CC_SVCTYPE_UD); + } + + /* + * The opcode is in the low byte when its in network order + * (top byte when in host order). + */ + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + opcode &= 0xff; + + pkey = (u16)be32_to_cpu(ohdr->bth[0]); + + if (!is_mcast && (opcode != IB_OPCODE_CNP) && bth1 & HFI1_FECN_SMASK) { + u16 slid = be16_to_cpu(hdr->lrh[3]); + u8 sc5; + + sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf; + sc5 |= sc4_bit; + + return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5, grh); + } + /* + * Get the number of bytes the message was padded by + * and drop incomplete packets. + */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + if (unlikely(tlen < (hdrsize + pad + 4))) + goto drop; + + tlen -= hdrsize + pad + 4; + + /* + * Check that the permissive LID is only used on QP0 + * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1). + */ + if (qp->ibqp.qp_num) { + if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE || + hdr->lrh[3] == IB_LID_PERMISSIVE)) + goto drop; + if (qp->ibqp.qp_num > 1) { + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u16 slid; + u8 sc5; + + sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf; + sc5 |= sc4_bit; + + slid = be16_to_cpu(hdr->lrh[3]); + if (unlikely(rcv_pkey_check(ppd, pkey, sc5, slid))) { + /* + * Traps will not be sent for packets dropped + * by the HW. This is fine, as sending trap + * for invalid pkeys is optional according to + * IB spec (release 1.3, section 10.9.4) + */ + hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY, + pkey, + (be16_to_cpu(hdr->lrh[0]) >> 4) & + 0xF, + src_qp, qp->ibqp.qp_num, + hdr->lrh[3], hdr->lrh[1]); + return; + } + } else { + /* GSI packet */ + mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey); + if (mgmt_pkey_idx < 0) + goto drop; + + } + if (unlikely(qkey != qp->qkey)) { + hfi1_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_QKEY, qkey, + (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF, + src_qp, qp->ibqp.qp_num, + hdr->lrh[3], hdr->lrh[1]); + return; + } + /* Drop invalid MAD packets (see 13.5.3.1). */ + if (unlikely(qp->ibqp.qp_num == 1 && + (tlen > 2048 || + (be16_to_cpu(hdr->lrh[0]) >> 12) == 15))) + goto drop; + } else { + /* Received on QP0, and so by definition, this is an SMP */ + struct opa_smp *smp = (struct opa_smp *)data; + u16 slid = be16_to_cpu(hdr->lrh[3]); + u8 sc5; + + sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf; + sc5 |= sc4_bit; + + if (opa_smp_check(ibp, pkey, sc5, qp, slid, smp)) + goto drop; + + if (tlen > 2048) + goto drop; + if ((hdr->lrh[1] == IB_LID_PERMISSIVE || + hdr->lrh[3] == IB_LID_PERMISSIVE) && + smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + goto drop; + + /* look up SMI pkey */ + mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey); + if (mgmt_pkey_idx < 0) + goto drop; + + } + + if (qp->ibqp.qp_num > 1 && + opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) { + wc.ex.imm_data = ohdr->u.ud.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + tlen -= sizeof(u32); + } else if (opcode == IB_OPCODE_UD_SEND_ONLY) { + wc.ex.imm_data = 0; + wc.wc_flags = 0; + } else + goto drop; + + /* + * A GRH is expected to precede the data even if not + * present on the wire. + */ + wc.byte_len = tlen + sizeof(struct ib_grh); + + /* + * Get the next work request entry to find where to put the data. + */ + if (qp->r_flags & HFI1_R_REUSE_SGE) + qp->r_flags &= ~HFI1_R_REUSE_SGE; + else { + int ret; + + ret = hfi1_get_rwqe(qp, 0); + if (ret < 0) { + hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + return; + } + if (!ret) { + if (qp->ibqp.qp_num == 0) + ibp->n_vl15_dropped++; + return; + } + } + /* Silently drop packets which are too big. */ + if (unlikely(wc.byte_len > qp->r_len)) { + qp->r_flags |= HFI1_R_REUSE_SGE; + goto drop; + } + if (has_grh) { + hfi1_copy_sge(&qp->r_sge, &hdr->u.l.grh, + sizeof(struct ib_grh), 1); + wc.wc_flags |= IB_WC_GRH; + } else + hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1); + hfi1_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), 1); + hfi1_put_ss(&qp->r_sge); + if (!test_and_clear_bit(HFI1_R_WRID_VALID, &qp->r_aflags)) + return; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.vendor_err = 0; + wc.qp = &qp->ibqp; + wc.src_qp = src_qp; + + if (qp->ibqp.qp_type == IB_QPT_GSI || + qp->ibqp.qp_type == IB_QPT_SMI) { + if (mgmt_pkey_idx < 0) { + if (net_ratelimit()) { + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_devdata *dd = ppd->dd; + + dd_dev_err(dd, "QP type %d mgmt_pkey_idx < 0 and packet not dropped???\n", + qp->ibqp.qp_type); + mgmt_pkey_idx = 0; + } + } + wc.pkey_index = (unsigned)mgmt_pkey_idx; + } else + wc.pkey_index = 0; + + wc.slid = be16_to_cpu(hdr->lrh[3]); + sc = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf; + sc |= sc4_bit; + wc.sl = ibp->sc_to_sl[sc]; + + /* + * Save the LMC lower bits if the destination LID is a unicast LID. + */ + wc.dlid_path_bits = dlid >= HFI1_MULTICAST_LID_BASE ? 0 : + dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1); + wc.port_num = qp->port_num; + /* Signal completion event if the solicited bit is set. */ + hfi1_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + cpu_to_be32(IB_BTH_SOLICITED)) != 0); + return; + +drop: + ibp->n_pkt_drops++; +} diff --git a/drivers/staging/rdma/hfi1/user_pages.c b/drivers/staging/rdma/hfi1/user_pages.c new file mode 100644 index 000000000..9071afbd7 --- /dev/null +++ b/drivers/staging/rdma/hfi1/user_pages.c @@ -0,0 +1,156 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include + +#include "hfi.h" + +static void __hfi1_release_user_pages(struct page **p, size_t num_pages, + int dirty) +{ + size_t i; + + for (i = 0; i < num_pages; i++) { + if (dirty) + set_page_dirty_lock(p[i]); + put_page(p[i]); + } +} + +/* + * Call with current->mm->mmap_sem held. + */ +static int __hfi1_get_user_pages(unsigned long start_page, size_t num_pages, + struct page **p) +{ + unsigned long lock_limit; + size_t got; + int ret; + + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if (num_pages > lock_limit && !capable(CAP_IPC_LOCK)) { + ret = -ENOMEM; + goto bail; + } + + for (got = 0; got < num_pages; got += ret) { + ret = get_user_pages(current, current->mm, + start_page + got * PAGE_SIZE, + num_pages - got, 1, 1, + p + got, NULL); + if (ret < 0) + goto bail_release; + } + + current->mm->pinned_vm += num_pages; + + ret = 0; + goto bail; + +bail_release: + __hfi1_release_user_pages(p, got, 0); +bail: + return ret; +} + +/** + * hfi1_map_page - a safety wrapper around pci_map_page() + * + */ +dma_addr_t hfi1_map_page(struct pci_dev *hwdev, struct page *page, + unsigned long offset, size_t size, int direction) +{ + dma_addr_t phys; + + phys = pci_map_page(hwdev, page, offset, size, direction); + + return phys; +} + +/** + * hfi1_get_user_pages - lock user pages into memory + * @start_page: the start page + * @num_pages: the number of pages + * @p: the output page structures + * + * This function takes a given start page (page aligned user virtual + * address) and pins it and the following specified number of pages. For + * now, num_pages is always 1, but that will probably change at some point + * (because caller is doing expected sends on a single virtually contiguous + * buffer, so we can do all pages at once). + */ +int hfi1_get_user_pages(unsigned long start_page, size_t num_pages, + struct page **p) +{ + int ret; + + down_write(¤t->mm->mmap_sem); + + ret = __hfi1_get_user_pages(start_page, num_pages, p); + + up_write(¤t->mm->mmap_sem); + + return ret; +} + +void hfi1_release_user_pages(struct page **p, size_t num_pages) +{ + if (current->mm) /* during close after signal, mm can be NULL */ + down_write(¤t->mm->mmap_sem); + + __hfi1_release_user_pages(p, num_pages, 1); + + if (current->mm) { + current->mm->pinned_vm -= num_pages; + up_write(¤t->mm->mmap_sem); + } +} diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/staging/rdma/hfi1/user_sdma.c new file mode 100644 index 000000000..55526613a --- /dev/null +++ b/drivers/staging/rdma/hfi1/user_sdma.c @@ -0,0 +1,1444 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfi.h" +#include "sdma.h" +#include "user_sdma.h" +#include "sdma.h" +#include "verbs.h" /* for the headers */ +#include "common.h" /* for struct hfi1_tid_info */ +#include "trace.h" + +static uint hfi1_sdma_comp_ring_size = 128; +module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO); +MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128"); + +/* The maximum number of Data io vectors per message/request */ +#define MAX_VECTORS_PER_REQ 8 +/* + * Maximum number of packet to send from each message/request + * before moving to the next one. + */ +#define MAX_PKTS_PER_QUEUE 16 + +#define num_pages(x) (1 + ((((x) - 1) & PAGE_MASK) >> PAGE_SHIFT)) + +#define req_opcode(x) \ + (((x) >> HFI1_SDMA_REQ_OPCODE_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK) +#define req_version(x) \ + (((x) >> HFI1_SDMA_REQ_VERSION_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK) +#define req_iovcnt(x) \ + (((x) >> HFI1_SDMA_REQ_IOVCNT_SHIFT) & HFI1_SDMA_REQ_IOVCNT_MASK) + +/* Number of BTH.PSN bits used for sequence number in expected rcvs */ +#define BTH_SEQ_MASK 0x7ffull + +/* + * Define fields in the KDETH header so we can update the header + * template. + */ +#define KDETH_OFFSET_SHIFT 0 +#define KDETH_OFFSET_MASK 0x7fff +#define KDETH_OM_SHIFT 15 +#define KDETH_OM_MASK 0x1 +#define KDETH_TID_SHIFT 16 +#define KDETH_TID_MASK 0x3ff +#define KDETH_TIDCTRL_SHIFT 26 +#define KDETH_TIDCTRL_MASK 0x3 +#define KDETH_INTR_SHIFT 28 +#define KDETH_INTR_MASK 0x1 +#define KDETH_SH_SHIFT 29 +#define KDETH_SH_MASK 0x1 +#define KDETH_HCRC_UPPER_SHIFT 16 +#define KDETH_HCRC_UPPER_MASK 0xff +#define KDETH_HCRC_LOWER_SHIFT 24 +#define KDETH_HCRC_LOWER_MASK 0xff + +#define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4) +#define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff) + +#define KDETH_GET(val, field) \ + (((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK) +#define KDETH_SET(dw, field, val) do { \ + u32 dwval = le32_to_cpu(dw); \ + dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \ + dwval |= (((val) & KDETH_##field##_MASK) << \ + KDETH_##field##_SHIFT); \ + dw = cpu_to_le32(dwval); \ + } while (0) + +#define AHG_HEADER_SET(arr, idx, dw, bit, width, value) \ + do { \ + if ((idx) < ARRAY_SIZE((arr))) \ + (arr)[(idx++)] = sdma_build_ahg_descriptor( \ + (__force u16)(value), (dw), (bit), \ + (width)); \ + else \ + return -ERANGE; \ + } while (0) + +/* KDETH OM multipliers and switch over point */ +#define KDETH_OM_SMALL 4 +#define KDETH_OM_LARGE 64 +#define KDETH_OM_MAX_SIZE (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1)) + +/* Last packet in the request */ +#define USER_SDMA_TXREQ_FLAGS_LAST_PKT (1 << 0) + +#define SDMA_REQ_IN_USE 0 +#define SDMA_REQ_FOR_THREAD 1 +#define SDMA_REQ_SEND_DONE 2 +#define SDMA_REQ_HAVE_AHG 3 +#define SDMA_REQ_HAS_ERROR 4 +#define SDMA_REQ_DONE_ERROR 5 + +#define SDMA_PKT_Q_INACTIVE (1 << 0) +#define SDMA_PKT_Q_ACTIVE (1 << 1) +#define SDMA_PKT_Q_DEFERRED (1 << 2) + +/* + * Maximum retry attempts to submit a TX request + * before putting the process to sleep. + */ +#define MAX_DEFER_RETRY_COUNT 1 + +static unsigned initial_pkt_count = 8; + +#define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */ + +struct user_sdma_iovec { + struct iovec iov; + /* number of pages in this vector */ + unsigned npages; + /* array of pinned pages for this vector */ + struct page **pages; + /* offset into the virtual address space of the vector at + * which we last left off. */ + u64 offset; +}; + +struct user_sdma_request { + struct sdma_req_info info; + struct hfi1_user_sdma_pkt_q *pq; + struct hfi1_user_sdma_comp_q *cq; + /* This is the original header from user space */ + struct hfi1_pkt_header hdr; + /* + * Pointer to the SDMA engine for this request. + * Since different request could be on different VLs, + * each request will need it's own engine pointer. + */ + struct sdma_engine *sde; + u8 ahg_idx; + u32 ahg[9]; + /* + * KDETH.Offset (Eager) field + * We need to remember the initial value so the headers + * can be updated properly. + */ + u32 koffset; + /* + * KDETH.OFFSET (TID) field + * The offset can cover multiple packets, depending on the + * size of the TID entry. + */ + u32 tidoffset; + /* + * KDETH.OM + * Remember this because the header template always sets it + * to 0. + */ + u8 omfactor; + /* + * pointer to the user's task_struct. We are going to + * get a reference to it so we can process io vectors + * at a later time. + */ + struct task_struct *user_proc; + /* + * pointer to the user's mm_struct. We are going to + * get a reference to it so it doesn't get freed + * since we might not be in process context when we + * are processing the iov's. + * Using this mm_struct, we can get vma based on the + * iov's address (find_vma()). + */ + struct mm_struct *user_mm; + /* + * We copy the iovs for this request (based on + * info.iovcnt). These are only the data vectors + */ + unsigned data_iovs; + /* total length of the data in the request */ + u32 data_len; + /* progress index moving along the iovs array */ + unsigned iov_idx; + struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ]; + /* number of elements copied to the tids array */ + u16 n_tids; + /* TID array values copied from the tid_iov vector */ + u32 *tids; + u16 tididx; + u32 sent; + u64 seqnum; + spinlock_t list_lock; + struct list_head txps; + unsigned long flags; +}; + +struct user_sdma_txreq { + /* Packet header for the txreq */ + struct hfi1_pkt_header hdr; + struct sdma_txreq txreq; + struct user_sdma_request *req; + struct user_sdma_iovec *iovec1; + struct user_sdma_iovec *iovec2; + u16 flags; + unsigned busycount; + u64 seqnum; +}; + +#define SDMA_DBG(req, fmt, ...) \ + hfi1_cdbg(SDMA, "[%u:%u:%u:%u] " fmt, (req)->pq->dd->unit, \ + (req)->pq->ctxt, (req)->pq->subctxt, (req)->info.comp_idx, \ + ##__VA_ARGS__) +#define SDMA_Q_DBG(pq, fmt, ...) \ + hfi1_cdbg(SDMA, "[%u:%u:%u] " fmt, (pq)->dd->unit, (pq)->ctxt, \ + (pq)->subctxt, ##__VA_ARGS__) + +static int user_sdma_send_pkts(struct user_sdma_request *, unsigned); +static int num_user_pages(const struct iovec *); +static void user_sdma_txreq_cb(struct sdma_txreq *, int, int); +static void user_sdma_free_request(struct user_sdma_request *); +static int pin_vector_pages(struct user_sdma_request *, + struct user_sdma_iovec *); +static void unpin_vector_pages(struct user_sdma_iovec *); +static int check_header_template(struct user_sdma_request *, + struct hfi1_pkt_header *, u32, u32); +static int set_txreq_header(struct user_sdma_request *, + struct user_sdma_txreq *, u32); +static int set_txreq_header_ahg(struct user_sdma_request *, + struct user_sdma_txreq *, u32); +static inline void set_comp_state(struct user_sdma_request *, + enum hfi1_sdma_comp_state, int); +static inline u32 set_pkt_bth_psn(__be32, u8, u32); +static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len); + +static int defer_packet_queue( + struct sdma_engine *, + struct iowait *, + struct sdma_txreq *, + unsigned seq); +static void activate_packet_queue(struct iowait *, int); + +static inline int iovec_may_free(struct user_sdma_iovec *iovec, + void (*free)(struct user_sdma_iovec *)) +{ + if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) { + free(iovec); + return 1; + } + return 0; +} + +static inline void iovec_set_complete(struct user_sdma_iovec *iovec) +{ + iovec->offset = iovec->iov.iov_len; +} + +static int defer_packet_queue( + struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *txreq, + unsigned seq) +{ + struct hfi1_user_sdma_pkt_q *pq = + container_of(wait, struct hfi1_user_sdma_pkt_q, busy); + struct hfi1_ibdev *dev = &pq->dd->verbs_dev; + struct user_sdma_txreq *tx = + container_of(txreq, struct user_sdma_txreq, txreq); + + if (sdma_progress(sde, seq, txreq)) { + if (tx->busycount++ < MAX_DEFER_RETRY_COUNT) + goto eagain; + } + /* + * We are assuming that if the list is enqueued somewhere, it + * is to the dmawait list since that is the only place where + * it is supposed to be enqueued. + */ + xchg(&pq->state, SDMA_PKT_Q_DEFERRED); + write_seqlock(&dev->iowait_lock); + if (list_empty(&pq->busy.list)) + list_add_tail(&pq->busy.list, &sde->dmawait); + write_sequnlock(&dev->iowait_lock); + return -EBUSY; +eagain: + return -EAGAIN; +} + +static void activate_packet_queue(struct iowait *wait, int reason) +{ + struct hfi1_user_sdma_pkt_q *pq = + container_of(wait, struct hfi1_user_sdma_pkt_q, busy); + xchg(&pq->state, SDMA_PKT_Q_ACTIVE); + wake_up(&wait->wait_dma); +}; + +static void sdma_kmem_cache_ctor(void *obj) +{ + struct user_sdma_txreq *tx = (struct user_sdma_txreq *)obj; + + memset(tx, 0, sizeof(*tx)); +} + +int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp) +{ + int ret = 0; + unsigned memsize; + char buf[64]; + struct hfi1_devdata *dd; + struct hfi1_user_sdma_comp_q *cq; + struct hfi1_user_sdma_pkt_q *pq; + unsigned long flags; + + if (!uctxt || !fp) { + ret = -EBADF; + goto done; + } + + if (!hfi1_sdma_comp_ring_size) { + ret = -EINVAL; + goto done; + } + + dd = uctxt->dd; + + pq = kzalloc(sizeof(*pq), GFP_KERNEL); + if (!pq) { + dd_dev_err(dd, + "[%u:%u] Failed to allocate SDMA request struct\n", + uctxt->ctxt, subctxt_fp(fp)); + goto pq_nomem; + } + memsize = sizeof(*pq->reqs) * hfi1_sdma_comp_ring_size; + pq->reqs = kmalloc(memsize, GFP_KERNEL); + if (!pq->reqs) { + dd_dev_err(dd, + "[%u:%u] Failed to allocate SDMA request queue (%u)\n", + uctxt->ctxt, subctxt_fp(fp), memsize); + goto pq_reqs_nomem; + } + INIT_LIST_HEAD(&pq->list); + pq->dd = dd; + pq->ctxt = uctxt->ctxt; + pq->subctxt = subctxt_fp(fp); + pq->n_max_reqs = hfi1_sdma_comp_ring_size; + pq->state = SDMA_PKT_Q_INACTIVE; + atomic_set(&pq->n_reqs, 0); + + iowait_init(&pq->busy, 0, NULL, defer_packet_queue, + activate_packet_queue); + pq->reqidx = 0; + snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt, + subctxt_fp(fp)); + pq->txreq_cache = kmem_cache_create(buf, + sizeof(struct user_sdma_txreq), + L1_CACHE_BYTES, + SLAB_HWCACHE_ALIGN, + sdma_kmem_cache_ctor); + if (!pq->txreq_cache) { + dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n", + uctxt->ctxt); + goto pq_txreq_nomem; + } + user_sdma_pkt_fp(fp) = pq; + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) { + dd_dev_err(dd, + "[%u:%u] Failed to allocate SDMA completion queue\n", + uctxt->ctxt, subctxt_fp(fp)); + goto cq_nomem; + } + + memsize = ALIGN(sizeof(*cq->comps) * hfi1_sdma_comp_ring_size, + PAGE_SIZE); + cq->comps = vmalloc_user(memsize); + if (!cq->comps) { + dd_dev_err(dd, + "[%u:%u] Failed to allocate SDMA completion queue entries\n", + uctxt->ctxt, subctxt_fp(fp)); + goto cq_comps_nomem; + } + cq->nentries = hfi1_sdma_comp_ring_size; + user_sdma_comp_fp(fp) = cq; + + spin_lock_irqsave(&uctxt->sdma_qlock, flags); + list_add(&pq->list, &uctxt->sdma_queues); + spin_unlock_irqrestore(&uctxt->sdma_qlock, flags); + goto done; + +cq_comps_nomem: + kfree(cq); +cq_nomem: + kmem_cache_destroy(pq->txreq_cache); +pq_txreq_nomem: + kfree(pq->reqs); +pq_reqs_nomem: + kfree(pq); + user_sdma_pkt_fp(fp) = NULL; +pq_nomem: + ret = -ENOMEM; +done: + return ret; +} + +int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd) +{ + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_user_sdma_pkt_q *pq; + unsigned long flags; + + hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit, + uctxt->ctxt, fd->subctxt); + pq = fd->pq; + if (pq) { + u16 i, j; + + spin_lock_irqsave(&uctxt->sdma_qlock, flags); + if (!list_empty(&pq->list)) + list_del_init(&pq->list); + spin_unlock_irqrestore(&uctxt->sdma_qlock, flags); + iowait_sdma_drain(&pq->busy); + if (pq->reqs) { + for (i = 0, j = 0; i < atomic_read(&pq->n_reqs) && + j < pq->n_max_reqs; j++) { + struct user_sdma_request *req = &pq->reqs[j]; + + if (test_bit(SDMA_REQ_IN_USE, &req->flags)) { + set_comp_state(req, ERROR, -ECOMM); + user_sdma_free_request(req); + i++; + } + } + kfree(pq->reqs); + } + if (pq->txreq_cache) + kmem_cache_destroy(pq->txreq_cache); + kfree(pq); + fd->pq = NULL; + } + if (fd->cq) { + if (fd->cq->comps) + vfree(fd->cq->comps); + kfree(fd->cq); + fd->cq = NULL; + } + return 0; +} + +int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, + unsigned long dim, unsigned long *count) +{ + int ret = 0, i = 0, sent; + struct hfi1_ctxtdata *uctxt = ctxt_fp(fp); + struct hfi1_user_sdma_pkt_q *pq = user_sdma_pkt_fp(fp); + struct hfi1_user_sdma_comp_q *cq = user_sdma_comp_fp(fp); + struct hfi1_devdata *dd = pq->dd; + unsigned long idx = 0; + u8 pcount = initial_pkt_count; + struct sdma_req_info info; + struct user_sdma_request *req; + u8 opcode, sc, vl; + + if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) { + hfi1_cdbg( + SDMA, + "[%u:%u:%u] First vector not big enough for header %lu/%lu", + dd->unit, uctxt->ctxt, subctxt_fp(fp), + iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr)); + ret = -EINVAL; + goto done; + } + ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info)); + if (ret) { + hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)", + dd->unit, uctxt->ctxt, subctxt_fp(fp), ret); + ret = -EFAULT; + goto done; + } + trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, subctxt_fp(fp), + (u16 *)&info); + if (cq->comps[info.comp_idx].status == QUEUED) { + hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in QUEUED state", + dd->unit, uctxt->ctxt, subctxt_fp(fp), + info.comp_idx); + ret = -EBADSLT; + goto done; + } + if (!info.fragsize) { + hfi1_cdbg(SDMA, + "[%u:%u:%u:%u] Request does not specify fragsize", + dd->unit, uctxt->ctxt, subctxt_fp(fp), info.comp_idx); + ret = -EINVAL; + goto done; + } + /* + * We've done all the safety checks that we can up to this point, + * "allocate" the request entry. + */ + hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit, + uctxt->ctxt, subctxt_fp(fp), info.comp_idx); + req = pq->reqs + info.comp_idx; + memset(req, 0, sizeof(*req)); + /* Mark the request as IN_USE before we start filling it in. */ + set_bit(SDMA_REQ_IN_USE, &req->flags); + req->data_iovs = req_iovcnt(info.ctrl) - 1; + req->pq = pq; + req->cq = cq; + INIT_LIST_HEAD(&req->txps); + spin_lock_init(&req->list_lock); + memcpy(&req->info, &info, sizeof(info)); + + if (req_opcode(info.ctrl) == EXPECTED) + req->data_iovs--; + + if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) { + SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs, + MAX_VECTORS_PER_REQ); + ret = -EINVAL; + goto done; + } + /* Copy the header from the user buffer */ + ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info), + sizeof(req->hdr)); + if (ret) { + SDMA_DBG(req, "Failed to copy header template (%d)", ret); + ret = -EFAULT; + goto free_req; + } + + /* If Static rate control is not enabled, sanitize the header. */ + if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL)) + req->hdr.pbc[2] = 0; + + /* Validate the opcode. Do not trust packets from user space blindly. */ + opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff; + if ((opcode & USER_OPCODE_CHECK_MASK) != + USER_OPCODE_CHECK_VAL) { + SDMA_DBG(req, "Invalid opcode (%d)", opcode); + ret = -EINVAL; + goto free_req; + } + /* + * Validate the vl. Do not trust packets from user space blindly. + * VL comes from PBC, SC comes from LRH, and the VL needs to + * match the SC look up. + */ + vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF; + sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) | + (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4)); + if (vl >= dd->pport->vls_operational || + vl != sc_to_vlt(dd, sc)) { + SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl); + ret = -EINVAL; + goto free_req; + } + + /* + * Also should check the BTH.lnh. If it says the next header is GRH then + * the RXE parsing will be off and will land in the middle of the KDETH + * or miss it entirely. + */ + if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) { + SDMA_DBG(req, "User tried to pass in a GRH"); + ret = -EINVAL; + goto free_req; + } + + req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]); + /* Calculate the initial TID offset based on the values of + KDETH.OFFSET and KDETH.OM that are passed in. */ + req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) * + (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ? + KDETH_OM_LARGE : KDETH_OM_SMALL); + SDMA_DBG(req, "Initial TID offset %u", req->tidoffset); + idx++; + + /* Save all the IO vector structures */ + while (i < req->data_iovs) { + memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec)); + req->iovs[i].offset = 0; + req->data_len += req->iovs[i++].iov.iov_len; + } + SDMA_DBG(req, "total data length %u", req->data_len); + + if (pcount > req->info.npkts) + pcount = req->info.npkts; + /* + * Copy any TID info + * User space will provide the TID info only when the + * request type is EXPECTED. This is true even if there is + * only one packet in the request and the header is already + * setup. The reason for the singular TID case is that the + * driver needs to perform safety checks. + */ + if (req_opcode(req->info.ctrl) == EXPECTED) { + u16 ntids = iovec[idx].iov_len / sizeof(*req->tids); + + if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) { + ret = -EINVAL; + goto free_req; + } + req->tids = kcalloc(ntids, sizeof(*req->tids), GFP_KERNEL); + if (!req->tids) { + ret = -ENOMEM; + goto free_req; + } + /* + * We have to copy all of the tids because they may vary + * in size and, therefore, the TID count might not be + * equal to the pkt count. However, there is no way to + * tell at this point. + */ + ret = copy_from_user(req->tids, iovec[idx].iov_base, + ntids * sizeof(*req->tids)); + if (ret) { + SDMA_DBG(req, "Failed to copy %d TIDs (%d)", + ntids, ret); + ret = -EFAULT; + goto free_req; + } + req->n_tids = ntids; + idx++; + } + + /* Have to select the engine */ + req->sde = sdma_select_engine_vl(dd, + (u32)(uctxt->ctxt + subctxt_fp(fp)), + vl); + if (!req->sde || !sdma_running(req->sde)) { + ret = -ECOMM; + goto free_req; + } + + /* We don't need an AHG entry if the request contains only one packet */ + if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG)) { + int ahg = sdma_ahg_alloc(req->sde); + + if (likely(ahg >= 0)) { + req->ahg_idx = (u8)ahg; + set_bit(SDMA_REQ_HAVE_AHG, &req->flags); + } + } + + set_comp_state(req, QUEUED, 0); + /* Send the first N packets in the request to buy us some time */ + sent = user_sdma_send_pkts(req, pcount); + if (unlikely(sent < 0)) { + if (sent != -EBUSY) { + ret = sent; + goto send_err; + } else + sent = 0; + } + atomic_inc(&pq->n_reqs); + + if (sent < req->info.npkts) { + /* Take the references to the user's task and mm_struct */ + get_task_struct(current); + req->user_proc = current; + + /* + * This is a somewhat blocking send implementation. + * The driver will block the caller until all packets of the + * request have been submitted to the SDMA engine. However, it + * will not wait for send completions. + */ + while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) { + ret = user_sdma_send_pkts(req, pcount); + if (ret < 0) { + if (ret != -EBUSY) + goto send_err; + wait_event_interruptible_timeout( + pq->busy.wait_dma, + (pq->state == SDMA_PKT_Q_ACTIVE), + msecs_to_jiffies( + SDMA_IOWAIT_TIMEOUT)); + } + } + + } + ret = 0; + *count += idx; + goto done; +send_err: + set_comp_state(req, ERROR, ret); +free_req: + user_sdma_free_request(req); +done: + return ret; +} + +static inline u32 compute_data_length(struct user_sdma_request *req, + struct user_sdma_txreq *tx) +{ + /* + * Determine the proper size of the packet data. + * The size of the data of the first packet is in the header + * template. However, it includes the header and ICRC, which need + * to be subtracted. + * The size of the remaining packets is the minimum of the frag + * size (MTU) or remaining data in the request. + */ + u32 len; + + if (!req->seqnum) { + len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) - + (sizeof(tx->hdr) - 4)); + } else if (req_opcode(req->info.ctrl) == EXPECTED) { + u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) * + PAGE_SIZE; + /* Get the data length based on the remaining space in the + * TID pair. */ + len = min(tidlen - req->tidoffset, (u32)req->info.fragsize); + /* If we've filled up the TID pair, move to the next one. */ + if (unlikely(!len) && ++req->tididx < req->n_tids && + req->tids[req->tididx]) { + tidlen = EXP_TID_GET(req->tids[req->tididx], + LEN) * PAGE_SIZE; + req->tidoffset = 0; + len = min_t(u32, tidlen, req->info.fragsize); + } + /* Since the TID pairs map entire pages, make sure that we + * are not going to try to send more data that we have + * remaining. */ + len = min(len, req->data_len - req->sent); + } else + len = min(req->data_len - req->sent, (u32)req->info.fragsize); + SDMA_DBG(req, "Data Length = %u", len); + return len; +} + +static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len) +{ + /* (Size of complete header - size of PBC) + 4B ICRC + data length */ + return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len); +} + +static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) +{ + int ret = 0; + unsigned npkts = 0; + struct user_sdma_txreq *tx = NULL; + struct hfi1_user_sdma_pkt_q *pq = NULL; + struct user_sdma_iovec *iovec = NULL; + + if (!req->pq) { + ret = -EINVAL; + goto done; + } + + pq = req->pq; + + /* + * Check if we might have sent the entire request already + */ + if (unlikely(req->seqnum == req->info.npkts)) { + if (!list_empty(&req->txps)) + goto dosend; + goto done; + } + + if (!maxpkts || maxpkts > req->info.npkts - req->seqnum) + maxpkts = req->info.npkts - req->seqnum; + + while (npkts < maxpkts) { + u32 datalen = 0, queued = 0, data_sent = 0; + u64 iov_offset = 0; + + /* + * Check whether any of the completions have come back + * with errors. If so, we are not going to process any + * more packets from this request. + */ + if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) { + set_bit(SDMA_REQ_DONE_ERROR, &req->flags); + ret = -EFAULT; + goto done; + } + + tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL); + if (!tx) { + ret = -ENOMEM; + goto done; + } + tx->flags = 0; + tx->req = req; + tx->busycount = 0; + tx->iovec1 = NULL; + tx->iovec2 = NULL; + + if (req->seqnum == req->info.npkts - 1) + tx->flags |= USER_SDMA_TXREQ_FLAGS_LAST_PKT; + + /* + * Calculate the payload size - this is min of the fragment + * (MTU) size or the remaining bytes in the request but only + * if we have payload data. + */ + if (req->data_len) { + iovec = &req->iovs[req->iov_idx]; + if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) { + if (++req->iov_idx == req->data_iovs) { + ret = -EFAULT; + goto free_txreq; + } + iovec = &req->iovs[req->iov_idx]; + WARN_ON(iovec->offset); + } + + /* + * This request might include only a header and no user + * data, so pin pages only if there is data and it the + * pages have not been pinned already. + */ + if (unlikely(!iovec->pages && iovec->iov.iov_len)) { + ret = pin_vector_pages(req, iovec); + if (ret) + goto free_tx; + } + + tx->iovec1 = iovec; + datalen = compute_data_length(req, tx); + if (!datalen) { + SDMA_DBG(req, + "Request has data but pkt len is 0"); + ret = -EFAULT; + goto free_tx; + } + } + + if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) { + if (!req->seqnum) { + u16 pbclen = le16_to_cpu(req->hdr.pbc[0]); + u32 lrhlen = get_lrh_len(req->hdr, datalen); + /* + * Copy the request header into the tx header + * because the HW needs a cacheline-aligned + * address. + * This copy can be optimized out if the hdr + * member of user_sdma_request were also + * cacheline aligned. + */ + memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr)); + if (PBC2LRH(pbclen) != lrhlen) { + pbclen = (pbclen & 0xf000) | + LRH2PBC(lrhlen); + tx->hdr.pbc[0] = cpu_to_le16(pbclen); + } + ret = sdma_txinit_ahg(&tx->txreq, + SDMA_TXREQ_F_AHG_COPY, + sizeof(tx->hdr) + datalen, + req->ahg_idx, 0, NULL, 0, + user_sdma_txreq_cb); + if (ret) + goto free_tx; + ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, + &tx->hdr, + sizeof(tx->hdr)); + if (ret) + goto free_txreq; + } else { + int changes; + + changes = set_txreq_header_ahg(req, tx, + datalen); + if (changes < 0) + goto free_tx; + sdma_txinit_ahg(&tx->txreq, + SDMA_TXREQ_F_USE_AHG, + datalen, req->ahg_idx, changes, + req->ahg, sizeof(req->hdr), + user_sdma_txreq_cb); + } + } else { + ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) + + datalen, user_sdma_txreq_cb); + if (ret) + goto free_tx; + /* + * Modify the header for this packet. This only needs + * to be done if we are not going to use AHG. Otherwise, + * the HW will do it based on the changes we gave it + * during sdma_txinit_ahg(). + */ + ret = set_txreq_header(req, tx, datalen); + if (ret) + goto free_txreq; + } + + /* + * If the request contains any data vectors, add up to + * fragsize bytes to the descriptor. + */ + while (queued < datalen && + (req->sent + data_sent) < req->data_len) { + unsigned long base, offset; + unsigned pageidx, len; + + base = (unsigned long)iovec->iov.iov_base; + offset = ((base + iovec->offset + iov_offset) & + ~PAGE_MASK); + pageidx = (((iovec->offset + iov_offset + + base) - (base & PAGE_MASK)) >> PAGE_SHIFT); + len = offset + req->info.fragsize > PAGE_SIZE ? + PAGE_SIZE - offset : req->info.fragsize; + len = min((datalen - queued), len); + ret = sdma_txadd_page(pq->dd, &tx->txreq, + iovec->pages[pageidx], + offset, len); + if (ret) { + dd_dev_err(pq->dd, + "SDMA txreq add page failed %d\n", + ret); + iovec_set_complete(iovec); + goto free_txreq; + } + iov_offset += len; + queued += len; + data_sent += len; + if (unlikely(queued < datalen && + pageidx == iovec->npages && + req->iov_idx < req->data_iovs - 1)) { + iovec->offset += iov_offset; + iovec = &req->iovs[++req->iov_idx]; + if (!iovec->pages) { + ret = pin_vector_pages(req, iovec); + if (ret) + goto free_txreq; + } + iov_offset = 0; + tx->iovec2 = iovec; + + } + } + /* + * The txreq was submitted successfully so we can update + * the counters. + */ + req->koffset += datalen; + if (req_opcode(req->info.ctrl) == EXPECTED) + req->tidoffset += datalen; + req->sent += data_sent; + if (req->data_len) { + if (tx->iovec1 && !tx->iovec2) + tx->iovec1->offset += iov_offset; + else if (tx->iovec2) + tx->iovec2->offset += iov_offset; + } + /* + * It is important to increment this here as it is used to + * generate the BTH.PSN and, therefore, can't be bulk-updated + * outside of the loop. + */ + tx->seqnum = req->seqnum++; + list_add_tail(&tx->txreq.list, &req->txps); + npkts++; + } +dosend: + ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps); + if (list_empty(&req->txps)) + if (req->seqnum == req->info.npkts) { + set_bit(SDMA_REQ_SEND_DONE, &req->flags); + /* + * The txreq has already been submitted to the HW queue + * so we can free the AHG entry now. Corruption will not + * happen due to the sequential manner in which + * descriptors are processed. + */ + if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) + sdma_ahg_free(req->sde, req->ahg_idx); + } + goto done; +free_txreq: + sdma_txclean(pq->dd, &tx->txreq); +free_tx: + kmem_cache_free(pq->txreq_cache, tx); +done: + return ret; +} + +/* + * How many pages in this iovec element? + */ +static inline int num_user_pages(const struct iovec *iov) +{ + const unsigned long addr = (unsigned long) iov->iov_base; + const unsigned long len = iov->iov_len; + const unsigned long spage = addr & PAGE_MASK; + const unsigned long epage = (addr + len - 1) & PAGE_MASK; + + return 1 + ((epage - spage) >> PAGE_SHIFT); +} + +static int pin_vector_pages(struct user_sdma_request *req, + struct user_sdma_iovec *iovec) { + int ret = 0; + unsigned pinned; + + iovec->npages = num_user_pages(&iovec->iov); + iovec->pages = kzalloc(sizeof(*iovec->pages) * + iovec->npages, GFP_KERNEL); + if (!iovec->pages) { + SDMA_DBG(req, "Failed page array alloc"); + ret = -ENOMEM; + goto done; + } + /* If called by the kernel thread, use the user's mm */ + if (current->flags & PF_KTHREAD) + use_mm(req->user_proc->mm); + pinned = get_user_pages_fast( + (unsigned long)iovec->iov.iov_base, + iovec->npages, 0, iovec->pages); + /* If called by the kernel thread, unuse the user's mm */ + if (current->flags & PF_KTHREAD) + unuse_mm(req->user_proc->mm); + if (pinned != iovec->npages) { + SDMA_DBG(req, "Failed to pin pages (%u/%u)", pinned, + iovec->npages); + ret = -EFAULT; + goto pfree; + } + goto done; +pfree: + unpin_vector_pages(iovec); +done: + return ret; +} + +static void unpin_vector_pages(struct user_sdma_iovec *iovec) +{ + unsigned i; + + if (ACCESS_ONCE(iovec->offset) != iovec->iov.iov_len) { + hfi1_cdbg(SDMA, + "the complete vector has not been sent yet %llu %zu", + iovec->offset, iovec->iov.iov_len); + return; + } + for (i = 0; i < iovec->npages; i++) + if (iovec->pages[i]) + put_page(iovec->pages[i]); + kfree(iovec->pages); + iovec->pages = NULL; + iovec->npages = 0; + iovec->offset = 0; +} + +static int check_header_template(struct user_sdma_request *req, + struct hfi1_pkt_header *hdr, u32 lrhlen, + u32 datalen) +{ + /* + * Perform safety checks for any type of packet: + * - transfer size is multiple of 64bytes + * - packet length is multiple of 4bytes + * - entire request length is multiple of 4bytes + * - packet length is not larger than MTU size + * + * These checks are only done for the first packet of the + * transfer since the header is "given" to us by user space. + * For the remainder of the packets we compute the values. + */ + if (req->info.fragsize % PIO_BLOCK_SIZE || + lrhlen & 0x3 || req->data_len & 0x3 || + lrhlen > get_lrh_len(*hdr, req->info.fragsize)) + return -EINVAL; + + if (req_opcode(req->info.ctrl) == EXPECTED) { + /* + * The header is checked only on the first packet. Furthermore, + * we ensure that at least one TID entry is copied when the + * request is submitted. Therefore, we don't have to verify that + * tididx points to something sane. + */ + u32 tidval = req->tids[req->tididx], + tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE, + tididx = EXP_TID_GET(tidval, IDX), + tidctrl = EXP_TID_GET(tidval, CTRL), + tidoff; + __le32 kval = hdr->kdeth.ver_tid_offset; + + tidoff = KDETH_GET(kval, OFFSET) * + (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ? + KDETH_OM_LARGE : KDETH_OM_SMALL); + /* + * Expected receive packets have the following + * additional checks: + * - offset is not larger than the TID size + * - TIDCtrl values match between header and TID array + * - TID indexes match between header and TID array + */ + if ((tidoff + datalen > tidlen) || + KDETH_GET(kval, TIDCTRL) != tidctrl || + KDETH_GET(kval, TID) != tididx) + return -EINVAL; + } + return 0; +} + +/* + * Correctly set the BTH.PSN field based on type of + * transfer - eager packets can just increment the PSN but + * expected packets encode generation and sequence in the + * BTH.PSN field so just incrementing will result in errors. + */ +static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags) +{ + u32 val = be32_to_cpu(bthpsn), + mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull : + 0xffffffull), + psn = val & mask; + if (expct) + psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK); + else + psn = psn + frags; + return psn & mask; +} + +static int set_txreq_header(struct user_sdma_request *req, + struct user_sdma_txreq *tx, u32 datalen) +{ + struct hfi1_user_sdma_pkt_q *pq = req->pq; + struct hfi1_pkt_header *hdr = &tx->hdr; + u16 pbclen; + int ret; + u32 tidval = 0, lrhlen = get_lrh_len(*hdr, datalen); + + /* Copy the header template to the request before modification */ + memcpy(hdr, &req->hdr, sizeof(*hdr)); + + /* + * Check if the PBC and LRH length are mismatched. If so + * adjust both in the header. + */ + pbclen = le16_to_cpu(hdr->pbc[0]); + if (PBC2LRH(pbclen) != lrhlen) { + pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen); + hdr->pbc[0] = cpu_to_le16(pbclen); + hdr->lrh[2] = cpu_to_be16(lrhlen >> 2); + /* + * Third packet + * This is the first packet in the sequence that has + * a "static" size that can be used for the rest of + * the packets (besides the last one). + */ + if (unlikely(req->seqnum == 2)) { + /* + * From this point on the lengths in both the + * PBC and LRH are the same until the last + * packet. + * Adjust the template so we don't have to update + * every packet + */ + req->hdr.pbc[0] = hdr->pbc[0]; + req->hdr.lrh[2] = hdr->lrh[2]; + } + } + /* + * We only have to modify the header if this is not the + * first packet in the request. Otherwise, we use the + * header given to us. + */ + if (unlikely(!req->seqnum)) { + ret = check_header_template(req, hdr, lrhlen, datalen); + if (ret) + return ret; + goto done; + + } + + hdr->bth[2] = cpu_to_be32( + set_pkt_bth_psn(hdr->bth[2], + (req_opcode(req->info.ctrl) == EXPECTED), + req->seqnum)); + + /* Set ACK request on last packet */ + if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT)) + hdr->bth[2] |= cpu_to_be32(1UL<<31); + + /* Set the new offset */ + hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset); + /* Expected packets have to fill in the new TID information */ + if (req_opcode(req->info.ctrl) == EXPECTED) { + tidval = req->tids[req->tididx]; + /* + * If the offset puts us at the end of the current TID, + * advance everything. + */ + if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * + PAGE_SIZE)) { + req->tidoffset = 0; + /* Since we don't copy all the TIDs, all at once, + * we have to check again. */ + if (++req->tididx > req->n_tids - 1 || + !req->tids[req->tididx]) { + return -EINVAL; + } + tidval = req->tids[req->tididx]; + } + req->omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >= + KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE : KDETH_OM_SMALL; + /* Set KDETH.TIDCtrl based on value for this TID. */ + KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL, + EXP_TID_GET(tidval, CTRL)); + /* Set KDETH.TID based on value for this TID */ + KDETH_SET(hdr->kdeth.ver_tid_offset, TID, + EXP_TID_GET(tidval, IDX)); + /* Clear KDETH.SH only on the last packet */ + if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT)) + KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0); + /* + * Set the KDETH.OFFSET and KDETH.OM based on size of + * transfer. + */ + SDMA_DBG(req, "TID offset %ubytes %uunits om%u", + req->tidoffset, req->tidoffset / req->omfactor, + !!(req->omfactor - KDETH_OM_SMALL)); + KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET, + req->tidoffset / req->omfactor); + KDETH_SET(hdr->kdeth.ver_tid_offset, OM, + !!(req->omfactor - KDETH_OM_SMALL)); + } +done: + trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt, + req->info.comp_idx, hdr, tidval); + return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr)); +} + +static int set_txreq_header_ahg(struct user_sdma_request *req, + struct user_sdma_txreq *tx, u32 len) +{ + int diff = 0; + struct hfi1_user_sdma_pkt_q *pq = req->pq; + struct hfi1_pkt_header *hdr = &req->hdr; + u16 pbclen = le16_to_cpu(hdr->pbc[0]); + u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, len); + + if (PBC2LRH(pbclen) != lrhlen) { + /* PBC.PbcLengthDWs */ + AHG_HEADER_SET(req->ahg, diff, 0, 0, 12, + cpu_to_le16(LRH2PBC(lrhlen))); + /* LRH.PktLen (we need the full 16 bits due to byte swap) */ + AHG_HEADER_SET(req->ahg, diff, 3, 0, 16, + cpu_to_be16(lrhlen >> 2)); + } + + /* + * Do the common updates + */ + /* BTH.PSN and BTH.A */ + val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) & + (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff); + if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT)) + val32 |= 1UL << 31; + AHG_HEADER_SET(req->ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16)); + AHG_HEADER_SET(req->ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff)); + /* KDETH.Offset */ + AHG_HEADER_SET(req->ahg, diff, 15, 0, 16, + cpu_to_le16(req->koffset & 0xffff)); + AHG_HEADER_SET(req->ahg, diff, 15, 16, 16, + cpu_to_le16(req->koffset >> 16)); + if (req_opcode(req->info.ctrl) == EXPECTED) { + __le16 val; + + tidval = req->tids[req->tididx]; + + /* + * If the offset puts us at the end of the current TID, + * advance everything. + */ + if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * + PAGE_SIZE)) { + req->tidoffset = 0; + /* Since we don't copy all the TIDs, all at once, + * we have to check again. */ + if (++req->tididx > req->n_tids - 1 || + !req->tids[req->tididx]) { + return -EINVAL; + } + tidval = req->tids[req->tididx]; + } + req->omfactor = ((EXP_TID_GET(tidval, LEN) * + PAGE_SIZE) >= + KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE : + KDETH_OM_SMALL; + /* KDETH.OM and KDETH.OFFSET (TID) */ + AHG_HEADER_SET(req->ahg, diff, 7, 0, 16, + ((!!(req->omfactor - KDETH_OM_SMALL)) << 15 | + ((req->tidoffset / req->omfactor) & 0x7fff))); + /* KDETH.TIDCtrl, KDETH.TID */ + val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) | + (EXP_TID_GET(tidval, IDX) & 0x3ff)); + /* Clear KDETH.SH on last packet */ + if (unlikely(tx->flags & USER_SDMA_TXREQ_FLAGS_LAST_PKT)) { + val |= cpu_to_le16(KDETH_GET(hdr->kdeth.ver_tid_offset, + INTR) >> 16); + val &= cpu_to_le16(~(1U << 13)); + AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val); + } else + AHG_HEADER_SET(req->ahg, diff, 7, 16, 12, val); + } + + trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt, + req->info.comp_idx, req->sde->this_idx, + req->ahg_idx, req->ahg, diff, tidval); + return diff; +} + +static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status, + int drain) +{ + struct user_sdma_txreq *tx = + container_of(txreq, struct user_sdma_txreq, txreq); + struct user_sdma_request *req = tx->req; + struct hfi1_user_sdma_pkt_q *pq = req ? req->pq : NULL; + u64 tx_seqnum; + + if (unlikely(!req || !pq)) + return; + + if (tx->iovec1) + iovec_may_free(tx->iovec1, unpin_vector_pages); + if (tx->iovec2) + iovec_may_free(tx->iovec2, unpin_vector_pages); + + tx_seqnum = tx->seqnum; + kmem_cache_free(pq->txreq_cache, tx); + + if (status != SDMA_TXREQ_S_OK) { + dd_dev_err(pq->dd, "SDMA completion with error %d", status); + set_comp_state(req, ERROR, status); + set_bit(SDMA_REQ_HAS_ERROR, &req->flags); + /* Do not free the request until the sender loop has ack'ed + * the error and we've seen all txreqs. */ + if (tx_seqnum == ACCESS_ONCE(req->seqnum) && + test_bit(SDMA_REQ_DONE_ERROR, &req->flags)) { + atomic_dec(&pq->n_reqs); + user_sdma_free_request(req); + } + } else { + if (tx_seqnum == req->info.npkts - 1) { + /* We've sent and completed all packets in this + * request. Signal completion to the user */ + atomic_dec(&pq->n_reqs); + set_comp_state(req, COMPLETE, 0); + user_sdma_free_request(req); + } + } + if (!atomic_read(&pq->n_reqs)) + xchg(&pq->state, SDMA_PKT_Q_INACTIVE); +} + +static void user_sdma_free_request(struct user_sdma_request *req) +{ + if (!list_empty(&req->txps)) { + struct sdma_txreq *t, *p; + + list_for_each_entry_safe(t, p, &req->txps, list) { + struct user_sdma_txreq *tx = + container_of(t, struct user_sdma_txreq, txreq); + list_del_init(&t->list); + sdma_txclean(req->pq->dd, t); + kmem_cache_free(req->pq->txreq_cache, tx); + } + } + if (req->data_iovs) { + int i; + + for (i = 0; i < req->data_iovs; i++) + if (req->iovs[i].npages && req->iovs[i].pages) + unpin_vector_pages(&req->iovs[i]); + } + if (req->user_proc) + put_task_struct(req->user_proc); + kfree(req->tids); + clear_bit(SDMA_REQ_IN_USE, &req->flags); +} + +static inline void set_comp_state(struct user_sdma_request *req, + enum hfi1_sdma_comp_state state, + int ret) +{ + SDMA_DBG(req, "Setting completion status %u %d", state, ret); + req->cq->comps[req->info.comp_idx].status = state; + if (state == ERROR) + req->cq->comps[req->info.comp_idx].errcode = -ret; + trace_hfi1_sdma_user_completion(req->pq->dd, req->pq->ctxt, + req->pq->subctxt, req->info.comp_idx, + state, ret); +} diff --git a/drivers/staging/rdma/hfi1/user_sdma.h b/drivers/staging/rdma/hfi1/user_sdma.h new file mode 100644 index 000000000..fa4422553 --- /dev/null +++ b/drivers/staging/rdma/hfi1/user_sdma.h @@ -0,0 +1,89 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include +#include + +#include "common.h" +#include "iowait.h" + +#define EXP_TID_TIDLEN_MASK 0x7FFULL +#define EXP_TID_TIDLEN_SHIFT 0 +#define EXP_TID_TIDCTRL_MASK 0x3ULL +#define EXP_TID_TIDCTRL_SHIFT 20 +#define EXP_TID_TIDIDX_MASK 0x7FFULL +#define EXP_TID_TIDIDX_SHIFT 22 +#define EXP_TID_GET(tid, field) \ + (((tid) >> EXP_TID_TID##field##_SHIFT) & EXP_TID_TID##field##_MASK) + +extern uint extended_psn; + +struct hfi1_user_sdma_pkt_q { + struct list_head list; + unsigned ctxt; + unsigned subctxt; + u16 n_max_reqs; + atomic_t n_reqs; + u16 reqidx; + struct hfi1_devdata *dd; + struct kmem_cache *txreq_cache; + struct user_sdma_request *reqs; + struct iowait busy; + unsigned state; +}; + +struct hfi1_user_sdma_comp_q { + u16 nentries; + struct hfi1_sdma_comp_entry *comps; +}; + +int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *, struct file *); +int hfi1_user_sdma_free_queues(struct hfi1_filedata *); +int hfi1_user_sdma_process_request(struct file *, struct iovec *, unsigned long, + unsigned long *); diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/staging/rdma/hfi1/verbs.c new file mode 100644 index 000000000..41bb59eb0 --- /dev/null +++ b/drivers/staging/rdma/hfi1/verbs.c @@ -0,0 +1,2145 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfi.h" +#include "common.h" +#include "device.h" +#include "trace.h" +#include "qp.h" +#include "sdma.h" + +unsigned int hfi1_lkey_table_size = 16; +module_param_named(lkey_table_size, hfi1_lkey_table_size, uint, + S_IRUGO); +MODULE_PARM_DESC(lkey_table_size, + "LKEY table size in bits (2^n, 1 <= n <= 23)"); + +static unsigned int hfi1_max_pds = 0xFFFF; +module_param_named(max_pds, hfi1_max_pds, uint, S_IRUGO); +MODULE_PARM_DESC(max_pds, + "Maximum number of protection domains to support"); + +static unsigned int hfi1_max_ahs = 0xFFFF; +module_param_named(max_ahs, hfi1_max_ahs, uint, S_IRUGO); +MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support"); + +unsigned int hfi1_max_cqes = 0x2FFFF; +module_param_named(max_cqes, hfi1_max_cqes, uint, S_IRUGO); +MODULE_PARM_DESC(max_cqes, + "Maximum number of completion queue entries to support"); + +unsigned int hfi1_max_cqs = 0x1FFFF; +module_param_named(max_cqs, hfi1_max_cqs, uint, S_IRUGO); +MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support"); + +unsigned int hfi1_max_qp_wrs = 0x3FFF; +module_param_named(max_qp_wrs, hfi1_max_qp_wrs, uint, S_IRUGO); +MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support"); + +unsigned int hfi1_max_qps = 16384; +module_param_named(max_qps, hfi1_max_qps, uint, S_IRUGO); +MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support"); + +unsigned int hfi1_max_sges = 0x60; +module_param_named(max_sges, hfi1_max_sges, uint, S_IRUGO); +MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support"); + +unsigned int hfi1_max_mcast_grps = 16384; +module_param_named(max_mcast_grps, hfi1_max_mcast_grps, uint, S_IRUGO); +MODULE_PARM_DESC(max_mcast_grps, + "Maximum number of multicast groups to support"); + +unsigned int hfi1_max_mcast_qp_attached = 16; +module_param_named(max_mcast_qp_attached, hfi1_max_mcast_qp_attached, + uint, S_IRUGO); +MODULE_PARM_DESC(max_mcast_qp_attached, + "Maximum number of attached QPs to support"); + +unsigned int hfi1_max_srqs = 1024; +module_param_named(max_srqs, hfi1_max_srqs, uint, S_IRUGO); +MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support"); + +unsigned int hfi1_max_srq_sges = 128; +module_param_named(max_srq_sges, hfi1_max_srq_sges, uint, S_IRUGO); +MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support"); + +unsigned int hfi1_max_srq_wrs = 0x1FFFF; +module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO); +MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support"); + +static void verbs_sdma_complete( + struct sdma_txreq *cookie, + int status, + int drained); + +/* + * Note that it is OK to post send work requests in the SQE and ERR + * states; hfi1_do_send() will process them and generate error + * completions as per IB 1.2 C10-96. + */ +const int ib_hfi1_state_ops[IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = 0, + [IB_QPS_INIT] = HFI1_POST_RECV_OK, + [IB_QPS_RTR] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK, + [IB_QPS_RTS] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK | + HFI1_POST_SEND_OK | HFI1_PROCESS_SEND_OK | + HFI1_PROCESS_NEXT_SEND_OK, + [IB_QPS_SQD] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK | + HFI1_POST_SEND_OK | HFI1_PROCESS_SEND_OK, + [IB_QPS_SQE] = HFI1_POST_RECV_OK | HFI1_PROCESS_RECV_OK | + HFI1_POST_SEND_OK | HFI1_FLUSH_SEND, + [IB_QPS_ERR] = HFI1_POST_RECV_OK | HFI1_FLUSH_RECV | + HFI1_POST_SEND_OK | HFI1_FLUSH_SEND, +}; + +struct hfi1_ucontext { + struct ib_ucontext ibucontext; +}; + +static inline struct hfi1_ucontext *to_iucontext(struct ib_ucontext + *ibucontext) +{ + return container_of(ibucontext, struct hfi1_ucontext, ibucontext); +} + +/* + * Translate ib_wr_opcode into ib_wc_opcode. + */ +const enum ib_wc_opcode ib_hfi1_wc_opcode[] = { + [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE, + [IB_WR_SEND] = IB_WC_SEND, + [IB_WR_SEND_WITH_IMM] = IB_WC_SEND, + [IB_WR_RDMA_READ] = IB_WC_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP, + [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD +}; + +/* + * Length of header by opcode, 0 --> not supported + */ +const u8 hdr_len_by_opcode[256] = { + /* RC */ + [IB_OPCODE_RC_SEND_FIRST] = 12 + 8, + [IB_OPCODE_RC_SEND_MIDDLE] = 12 + 8, + [IB_OPCODE_RC_SEND_LAST] = 12 + 8, + [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_RC_SEND_ONLY] = 12 + 8, + [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_RC_RDMA_WRITE_FIRST] = 12 + 8 + 16, + [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = 12 + 8, + [IB_OPCODE_RC_RDMA_WRITE_LAST] = 12 + 8, + [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_RC_RDMA_WRITE_ONLY] = 12 + 8 + 16, + [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20, + [IB_OPCODE_RC_RDMA_READ_REQUEST] = 12 + 8 + 16, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = 12 + 8 + 4, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = 12 + 8, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = 12 + 8 + 4, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = 12 + 8 + 4, + [IB_OPCODE_RC_ACKNOWLEDGE] = 12 + 8 + 4, + [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = 12 + 8 + 4, + [IB_OPCODE_RC_COMPARE_SWAP] = 12 + 8 + 28, + [IB_OPCODE_RC_FETCH_ADD] = 12 + 8 + 28, + /* UC */ + [IB_OPCODE_UC_SEND_FIRST] = 12 + 8, + [IB_OPCODE_UC_SEND_MIDDLE] = 12 + 8, + [IB_OPCODE_UC_SEND_LAST] = 12 + 8, + [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_UC_SEND_ONLY] = 12 + 8, + [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_UC_RDMA_WRITE_FIRST] = 12 + 8 + 16, + [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = 12 + 8, + [IB_OPCODE_UC_RDMA_WRITE_LAST] = 12 + 8, + [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_UC_RDMA_WRITE_ONLY] = 12 + 8 + 16, + [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20, + /* UD */ + [IB_OPCODE_UD_SEND_ONLY] = 12 + 8 + 8, + [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 12 +}; + +static const opcode_handler opcode_handler_tbl[256] = { + /* RC */ + [IB_OPCODE_RC_SEND_FIRST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_MIDDLE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_LAST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_ONLY] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_FIRST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_LAST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_ONLY] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_REQUEST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = &hfi1_rc_rcv, + [IB_OPCODE_RC_ACKNOWLEDGE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_COMPARE_SWAP] = &hfi1_rc_rcv, + [IB_OPCODE_RC_FETCH_ADD] = &hfi1_rc_rcv, + /* UC */ + [IB_OPCODE_UC_SEND_FIRST] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_MIDDLE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_LAST] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_ONLY] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_FIRST] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_LAST] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_ONLY] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv, + /* UD */ + [IB_OPCODE_UD_SEND_ONLY] = &hfi1_ud_rcv, + [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_ud_rcv, + /* CNP */ + [IB_OPCODE_CNP] = &hfi1_cnp_rcv +}; + +/* + * System image GUID. + */ +__be64 ib_hfi1_sys_image_guid; + +/** + * hfi1_copy_sge - copy data to SGE memory + * @ss: the SGE state + * @data: the data to copy + * @length: the length of the data + */ +void hfi1_copy_sge( + struct hfi1_sge_state *ss, + void *data, u32 length, + int release) +{ + struct hfi1_sge *sge = &ss->sge; + + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + WARN_ON_ONCE(len == 0); + memcpy(sge->vaddr, data, len); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (release) + hfi1_put_mr(sge->mr); + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= HFI1_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + data += len; + length -= len; + } +} + +/** + * hfi1_skip_sge - skip over SGE memory + * @ss: the SGE state + * @length: the number of bytes to skip + */ +void hfi1_skip_sge(struct hfi1_sge_state *ss, u32 length, int release) +{ + struct hfi1_sge *sge = &ss->sge; + + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + WARN_ON_ONCE(len == 0); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (release) + hfi1_put_mr(sge->mr); + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= HFI1_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } +} + +/** + * post_one_send - post one RC, UC, or UD send work request + * @qp: the QP to post on + * @wr: the work request to send + */ +static int post_one_send(struct hfi1_qp *qp, struct ib_send_wr *wr) +{ + struct hfi1_swqe *wqe; + u32 next; + int i; + int j; + int acc; + struct hfi1_lkey_table *rkt; + struct hfi1_pd *pd; + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + + /* IB spec says that num_sge == 0 is OK. */ + if (unlikely(wr->num_sge > qp->s_max_sge)) + return -EINVAL; + + ppd = &dd->pport[qp->port_num - 1]; + ibp = &ppd->ibport_data; + + /* + * Don't allow RDMA reads or atomic operations on UC or + * undefined operations. + * Make sure buffer is large enough to hold the result for atomics. + */ + if (wr->opcode == IB_WR_FAST_REG_MR) { + return -EINVAL; + } else if (qp->ibqp.qp_type == IB_QPT_UC) { + if ((unsigned) wr->opcode >= IB_WR_RDMA_READ) + return -EINVAL; + } else if (qp->ibqp.qp_type != IB_QPT_RC) { + /* Check IB_QPT_SMI, IB_QPT_GSI, IB_QPT_UD opcode */ + if (wr->opcode != IB_WR_SEND && + wr->opcode != IB_WR_SEND_WITH_IMM) + return -EINVAL; + /* Check UD destination address PD */ + if (qp->ibqp.pd != wr->wr.ud.ah->pd) + return -EINVAL; + } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) + return -EINVAL; + else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP && + (wr->num_sge == 0 || + wr->sg_list[0].length < sizeof(u64) || + wr->sg_list[0].addr & (sizeof(u64) - 1))) + return -EINVAL; + else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) + return -EINVAL; + + next = qp->s_head + 1; + if (next >= qp->s_size) + next = 0; + if (next == qp->s_last) + return -ENOMEM; + + rkt = &to_idev(qp->ibqp.device)->lk_table; + pd = to_ipd(qp->ibqp.pd); + wqe = get_swqe_ptr(qp, qp->s_head); + wqe->wr = *wr; + wqe->length = 0; + j = 0; + if (wr->num_sge) { + acc = wr->opcode >= IB_WR_RDMA_READ ? + IB_ACCESS_LOCAL_WRITE : 0; + for (i = 0; i < wr->num_sge; i++) { + u32 length = wr->sg_list[i].length; + int ok; + + if (length == 0) + continue; + ok = hfi1_lkey_ok(rkt, pd, &wqe->sg_list[j], + &wr->sg_list[i], acc); + if (!ok) + goto bail_inval_free; + wqe->length += length; + j++; + } + wqe->wr.num_sge = j; + } + if (qp->ibqp.qp_type == IB_QPT_UC || + qp->ibqp.qp_type == IB_QPT_RC) { + if (wqe->length > 0x80000000U) + goto bail_inval_free; + } else { + struct hfi1_ah *ah = to_iah(wr->wr.ud.ah); + + atomic_inc(&ah->refcount); + } + wqe->ssn = qp->s_ssn++; + qp->s_head = next; + + return 0; + +bail_inval_free: + /* release mr holds */ + while (j) { + struct hfi1_sge *sge = &wqe->sg_list[--j]; + + hfi1_put_mr(sge->mr); + } + return -EINVAL; +} + +/** + * post_send - post a send on a QP + * @ibqp: the QP to post the send on + * @wr: the list of work requests to post + * @bad_wr: the first bad WR is put here + * + * This may be called from interrupt context. + */ +static int post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct hfi1_qp *qp = to_iqp(ibqp); + int err = 0; + int call_send; + unsigned long flags; + unsigned nreq = 0; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Check that state is OK to post send. */ + if (unlikely(!(ib_hfi1_state_ops[qp->state] & HFI1_POST_SEND_OK))) { + spin_unlock_irqrestore(&qp->s_lock, flags); + return -EINVAL; + } + + /* sq empty and not list -> call send */ + call_send = qp->s_head == qp->s_last && !wr->next; + + for (; wr; wr = wr->next) { + err = post_one_send(qp, wr); + if (unlikely(err)) { + *bad_wr = wr; + goto bail; + } + nreq++; + } +bail: + if (nreq && !call_send) + hfi1_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + if (nreq && call_send) + hfi1_do_send(&qp->s_iowait.iowork); + return err; +} + +/** + * post_receive - post a receive on a QP + * @ibqp: the QP to post the receive on + * @wr: the WR to post + * @bad_wr: the first bad WR is put here + * + * This may be called from interrupt context. + */ +static int post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct hfi1_qp *qp = to_iqp(ibqp); + struct hfi1_rwq *wq = qp->r_rq.wq; + unsigned long flags; + int ret; + + /* Check that state is OK to post receive. */ + if (!(ib_hfi1_state_ops[qp->state] & HFI1_POST_RECV_OK) || !wq) { + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + + for (; wr; wr = wr->next) { + struct hfi1_rwqe *wqe; + u32 next; + int i; + + if ((unsigned) wr->num_sge > qp->r_rq.max_sge) { + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + + spin_lock_irqsave(&qp->r_rq.lock, flags); + next = wq->head + 1; + if (next >= qp->r_rq.size) + next = 0; + if (next == wq->tail) { + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + *bad_wr = wr; + ret = -ENOMEM; + goto bail; + } + + wqe = get_rwqe_ptr(&qp->r_rq, wq->head); + wqe->wr_id = wr->wr_id; + wqe->num_sge = wr->num_sge; + for (i = 0; i < wr->num_sge; i++) + wqe->sg_list[i] = wr->sg_list[i]; + /* Make sure queue entry is written before the head index. */ + smp_wmb(); + wq->head = next; + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + } + ret = 0; + +bail: + return ret; +} + +/* + * Make sure the QP is ready and able to accept the given opcode. + */ +static inline int qp_ok(int opcode, struct hfi1_packet *packet) +{ + struct hfi1_ibport *ibp; + + if (!(ib_hfi1_state_ops[packet->qp->state] & HFI1_PROCESS_RECV_OK)) + goto dropit; + if (((opcode & OPCODE_QP_MASK) == packet->qp->allowed_ops) || + (opcode == IB_OPCODE_CNP)) + return 1; +dropit: + ibp = &packet->rcd->ppd->ibport_data; + ibp->n_pkt_drops++; + return 0; +} + + +/** + * hfi1_ib_rcv - process an incoming packet + * @packet: data packet information + * + * This is called to process an incoming packet at interrupt level. + * + * Tlen is the length of the header + data + CRC in bytes. + */ +void hfi1_ib_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct hfi1_ib_header *hdr = packet->hdr; + u32 tlen = packet->tlen; + struct hfi1_pportdata *ppd = rcd->ppd; + struct hfi1_ibport *ibp = &ppd->ibport_data; + u32 qp_num; + int lnh; + u8 opcode; + u16 lid; + + /* Check for GRH */ + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + if (lnh == HFI1_LRH_BTH) + packet->ohdr = &hdr->u.oth; + else if (lnh == HFI1_LRH_GRH) { + u32 vtf; + + packet->ohdr = &hdr->u.l.oth; + if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR) + goto drop; + vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow); + if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) + goto drop; + packet->rcv_flags |= HFI1_HAS_GRH; + } else + goto drop; + + trace_input_ibhdr(rcd->dd, hdr); + + opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24); + inc_opstats(tlen, &rcd->opstats->stats[opcode]); + + /* Get the destination QP number. */ + qp_num = be32_to_cpu(packet->ohdr->bth[1]) & HFI1_QPN_MASK; + lid = be16_to_cpu(hdr->lrh[1]); + if (unlikely((lid >= HFI1_MULTICAST_LID_BASE) && + (lid != HFI1_PERMISSIVE_LID))) { + struct hfi1_mcast *mcast; + struct hfi1_mcast_qp *p; + + if (lnh != HFI1_LRH_GRH) + goto drop; + mcast = hfi1_mcast_find(ibp, &hdr->u.l.grh.dgid); + if (mcast == NULL) + goto drop; + list_for_each_entry_rcu(p, &mcast->qp_list, list) { + packet->qp = p->qp; + spin_lock(&packet->qp->r_lock); + if (likely((qp_ok(opcode, packet)))) + opcode_handler_tbl[opcode](packet); + spin_unlock(&packet->qp->r_lock); + } + /* + * Notify hfi1_multicast_detach() if it is waiting for us + * to finish. + */ + if (atomic_dec_return(&mcast->refcount) <= 1) + wake_up(&mcast->wait); + } else { + rcu_read_lock(); + packet->qp = hfi1_lookup_qpn(ibp, qp_num); + if (!packet->qp) { + rcu_read_unlock(); + goto drop; + } + spin_lock(&packet->qp->r_lock); + if (likely((qp_ok(opcode, packet)))) + opcode_handler_tbl[opcode](packet); + spin_unlock(&packet->qp->r_lock); + rcu_read_unlock(); + } + return; + +drop: + ibp->n_pkt_drops++; +} + +/* + * This is called from a timer to check for QPs + * which need kernel memory in order to send a packet. + */ +static void mem_timer(unsigned long data) +{ + struct hfi1_ibdev *dev = (struct hfi1_ibdev *)data; + struct list_head *list = &dev->memwait; + struct hfi1_qp *qp = NULL; + struct iowait *wait; + unsigned long flags; + + write_seqlock_irqsave(&dev->iowait_lock, flags); + if (!list_empty(list)) { + wait = list_first_entry(list, struct iowait, list); + qp = container_of(wait, struct hfi1_qp, s_iowait); + list_del_init(&qp->s_iowait.list); + /* refcount held until actual wake up */ + if (!list_empty(list)) + mod_timer(&dev->mem_timer, jiffies + 1); + } + write_sequnlock_irqrestore(&dev->iowait_lock, flags); + + if (qp) + hfi1_qp_wakeup(qp, HFI1_S_WAIT_KMEM); +} + +void update_sge(struct hfi1_sge_state *ss, u32 length) +{ + struct hfi1_sge *sge = &ss->sge; + + sge->vaddr += length; + sge->length -= length; + sge->sge_length -= length; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= HFI1_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + return; + sge->n = 0; + } + sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = sge->mr->map[sge->m]->segs[sge->n].length; + } +} + +static noinline struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev, + struct hfi1_qp *qp) +{ + struct verbs_txreq *tx; + unsigned long flags; + + tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC); + if (!tx) { + spin_lock_irqsave(&qp->s_lock, flags); + write_seqlock(&dev->iowait_lock); + if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK && + list_empty(&qp->s_iowait.list)) { + dev->n_txwait++; + qp->s_flags |= HFI1_S_WAIT_TX; + list_add_tail(&qp->s_iowait.list, &dev->txwait); + trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TX); + atomic_inc(&qp->refcount); + } + qp->s_flags &= ~HFI1_S_BUSY; + write_sequnlock(&dev->iowait_lock); + spin_unlock_irqrestore(&qp->s_lock, flags); + tx = ERR_PTR(-EBUSY); + } + return tx; +} + +static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev, + struct hfi1_qp *qp) +{ + struct verbs_txreq *tx; + + tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC); + if (!tx) { + /* call slow path to get the lock */ + tx = __get_txreq(dev, qp); + if (IS_ERR(tx)) + return tx; + } + tx->qp = qp; + return tx; +} + +void hfi1_put_txreq(struct verbs_txreq *tx) +{ + struct hfi1_ibdev *dev; + struct hfi1_qp *qp; + unsigned long flags; + unsigned int seq; + + qp = tx->qp; + dev = to_idev(qp->ibqp.device); + + if (tx->mr) { + hfi1_put_mr(tx->mr); + tx->mr = NULL; + } + sdma_txclean(dd_from_dev(dev), &tx->txreq); + + /* Free verbs_txreq and return to slab cache */ + kmem_cache_free(dev->verbs_txreq_cache, tx); + + do { + seq = read_seqbegin(&dev->iowait_lock); + if (!list_empty(&dev->txwait)) { + struct iowait *wait; + + write_seqlock_irqsave(&dev->iowait_lock, flags); + /* Wake up first QP wanting a free struct */ + wait = list_first_entry(&dev->txwait, struct iowait, + list); + qp = container_of(wait, struct hfi1_qp, s_iowait); + list_del_init(&qp->s_iowait.list); + /* refcount held until actual wake up */ + write_sequnlock_irqrestore(&dev->iowait_lock, flags); + hfi1_qp_wakeup(qp, HFI1_S_WAIT_TX); + break; + } + } while (read_seqretry(&dev->iowait_lock, seq)); +} + +/* + * This is called with progress side lock held. + */ +/* New API */ +static void verbs_sdma_complete( + struct sdma_txreq *cookie, + int status, + int drained) +{ + struct verbs_txreq *tx = + container_of(cookie, struct verbs_txreq, txreq); + struct hfi1_qp *qp = tx->qp; + + spin_lock(&qp->s_lock); + if (tx->wqe) + hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS); + else if (qp->ibqp.qp_type == IB_QPT_RC) { + struct hfi1_ib_header *hdr; + + hdr = &tx->phdr.hdr; + hfi1_rc_send_complete(qp, hdr); + } + if (drained) { + /* + * This happens when the send engine notes + * a QP in the error state and cannot + * do the flush work until that QP's + * sdma work has finished. + */ + if (qp->s_flags & HFI1_S_WAIT_DMA) { + qp->s_flags &= ~HFI1_S_WAIT_DMA; + hfi1_schedule_send(qp); + } + } + spin_unlock(&qp->s_lock); + + hfi1_put_txreq(tx); +} + +static int wait_kmem(struct hfi1_ibdev *dev, struct hfi1_qp *qp) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) { + write_seqlock(&dev->iowait_lock); + if (list_empty(&qp->s_iowait.list)) { + if (list_empty(&dev->memwait)) + mod_timer(&dev->mem_timer, jiffies + 1); + qp->s_flags |= HFI1_S_WAIT_KMEM; + list_add_tail(&qp->s_iowait.list, &dev->memwait); + trace_hfi1_qpsleep(qp, HFI1_S_WAIT_KMEM); + atomic_inc(&qp->refcount); + } + write_sequnlock(&dev->iowait_lock); + qp->s_flags &= ~HFI1_S_BUSY; + ret = -EBUSY; + } + spin_unlock_irqrestore(&qp->s_lock, flags); + + return ret; +} + +/* + * This routine calls txadds for each sg entry. + * + * Add failures will revert the sge cursor + */ +static int build_verbs_ulp_payload( + struct sdma_engine *sde, + struct hfi1_sge_state *ss, + u32 length, + struct verbs_txreq *tx) +{ + struct hfi1_sge *sg_list = ss->sg_list; + struct hfi1_sge sge = ss->sge; + u8 num_sge = ss->num_sge; + u32 len; + int ret = 0; + + while (length) { + len = ss->sge.length; + if (len > length) + len = length; + if (len > ss->sge.sge_length) + len = ss->sge.sge_length; + WARN_ON_ONCE(len == 0); + ret = sdma_txadd_kvaddr( + sde->dd, + &tx->txreq, + ss->sge.vaddr, + len); + if (ret) + goto bail_txadd; + update_sge(ss, len); + length -= len; + } + return ret; +bail_txadd: + /* unwind cursor */ + ss->sge = sge; + ss->num_sge = num_sge; + ss->sg_list = sg_list; + return ret; +} + +/* + * Build the number of DMA descriptors needed to send length bytes of data. + * + * NOTE: DMA mapping is held in the tx until completed in the ring or + * the tx desc is freed without having been submitted to the ring + * + * This routine insures the following all the helper routine + * calls succeed. + */ +/* New API */ +static int build_verbs_tx_desc( + struct sdma_engine *sde, + struct hfi1_sge_state *ss, + u32 length, + struct verbs_txreq *tx, + struct ahg_ib_header *ahdr, + u64 pbc) +{ + int ret = 0; + struct hfi1_pio_header *phdr; + u16 hdrbytes = tx->hdr_dwords << 2; + + phdr = &tx->phdr; + if (!ahdr->ahgcount) { + ret = sdma_txinit_ahg( + &tx->txreq, + ahdr->tx_flags, + hdrbytes + length, + ahdr->ahgidx, + 0, + NULL, + 0, + verbs_sdma_complete); + if (ret) + goto bail_txadd; + phdr->pbc = cpu_to_le64(pbc); + memcpy(&phdr->hdr, &ahdr->ibh, hdrbytes - sizeof(phdr->pbc)); + /* add the header */ + ret = sdma_txadd_kvaddr( + sde->dd, + &tx->txreq, + &tx->phdr, + tx->hdr_dwords << 2); + if (ret) + goto bail_txadd; + } else { + struct hfi1_other_headers *sohdr = &ahdr->ibh.u.oth; + struct hfi1_other_headers *dohdr = &phdr->hdr.u.oth; + + /* needed in rc_send_complete() */ + phdr->hdr.lrh[0] = ahdr->ibh.lrh[0]; + if ((be16_to_cpu(phdr->hdr.lrh[0]) & 3) == HFI1_LRH_GRH) { + sohdr = &ahdr->ibh.u.l.oth; + dohdr = &phdr->hdr.u.l.oth; + } + /* opcode */ + dohdr->bth[0] = sohdr->bth[0]; + /* PSN/ACK */ + dohdr->bth[2] = sohdr->bth[2]; + ret = sdma_txinit_ahg( + &tx->txreq, + ahdr->tx_flags, + length, + ahdr->ahgidx, + ahdr->ahgcount, + ahdr->ahgdesc, + hdrbytes, + verbs_sdma_complete); + if (ret) + goto bail_txadd; + } + + /* add the ulp payload - if any. ss can be NULL for acks */ + if (ss) + ret = build_verbs_ulp_payload(sde, ss, length, tx); +bail_txadd: + return ret; +} + +int hfi1_verbs_send_dma(struct hfi1_qp *qp, struct ahg_ib_header *ahdr, + u32 hdrwords, struct hfi1_sge_state *ss, u32 len, + u32 plen, u32 dwords, u64 pbc) +{ + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct verbs_txreq *tx; + struct sdma_txreq *stx; + u64 pbc_flags = 0; + struct sdma_engine *sde; + u8 sc5 = qp->s_sc; + int ret; + + if (!list_empty(&qp->s_iowait.tx_head)) { + stx = list_first_entry( + &qp->s_iowait.tx_head, + struct sdma_txreq, + list); + list_del_init(&stx->list); + tx = container_of(stx, struct verbs_txreq, txreq); + ret = sdma_send_txreq(tx->sde, &qp->s_iowait, stx); + if (unlikely(ret == -ECOMM)) + goto bail_ecomm; + return ret; + } + + tx = get_txreq(dev, qp); + if (IS_ERR(tx)) + goto bail_tx; + + if (!qp->s_hdr->sde) { + tx->sde = sde = qp_to_sdma_engine(qp, sc5); + if (!sde) + goto bail_no_sde; + } else + tx->sde = sde = qp->s_hdr->sde; + + if (likely(pbc == 0)) { + u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5); + /* No vl15 here */ + /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */ + pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT; + + pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen); + } + tx->wqe = qp->s_wqe; + tx->mr = qp->s_rdma_mr; + if (qp->s_rdma_mr) + qp->s_rdma_mr = NULL; + tx->hdr_dwords = hdrwords + 2; + ret = build_verbs_tx_desc(sde, ss, len, tx, ahdr, pbc); + if (unlikely(ret)) + goto bail_build; + trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &ahdr->ibh); + ret = sdma_send_txreq(sde, &qp->s_iowait, &tx->txreq); + if (unlikely(ret == -ECOMM)) + goto bail_ecomm; + return ret; + +bail_no_sde: + hfi1_put_txreq(tx); +bail_ecomm: + /* The current one got "sent" */ + return 0; +bail_build: + /* kmalloc or mapping fail */ + hfi1_put_txreq(tx); + return wait_kmem(dev, qp); +bail_tx: + return PTR_ERR(tx); +} + +/* + * If we are now in the error state, return zero to flush the + * send work request. + */ +static int no_bufs_available(struct hfi1_qp *qp, struct send_context *sc) +{ + struct hfi1_devdata *dd = sc->dd; + struct hfi1_ibdev *dev = &dd->verbs_dev; + unsigned long flags; + int ret = 0; + + /* + * Note that as soon as want_buffer() is called and + * possibly before it returns, sc_piobufavail() + * could be called. Therefore, put QP on the I/O wait list before + * enabling the PIO avail interrupt. + */ + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_hfi1_state_ops[qp->state] & HFI1_PROCESS_RECV_OK) { + write_seqlock(&dev->iowait_lock); + if (list_empty(&qp->s_iowait.list)) { + struct hfi1_ibdev *dev = &dd->verbs_dev; + int was_empty; + + dev->n_piowait++; + qp->s_flags |= HFI1_S_WAIT_PIO; + was_empty = list_empty(&sc->piowait); + list_add_tail(&qp->s_iowait.list, &sc->piowait); + trace_hfi1_qpsleep(qp, HFI1_S_WAIT_PIO); + atomic_inc(&qp->refcount); + /* counting: only call wantpiobuf_intr if first user */ + if (was_empty) + hfi1_sc_wantpiobuf_intr(sc, 1); + } + write_sequnlock(&dev->iowait_lock); + qp->s_flags &= ~HFI1_S_BUSY; + ret = -EBUSY; + } + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +struct send_context *qp_to_send_context(struct hfi1_qp *qp, u8 sc5) +{ + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct hfi1_pportdata *ppd = dd->pport + (qp->port_num - 1); + u8 vl; + + vl = sc_to_vlt(dd, sc5); + if (vl >= ppd->vls_supported && vl != 15) + return NULL; + return dd->vld[vl].sc; +} + +int hfi1_verbs_send_pio(struct hfi1_qp *qp, struct ahg_ib_header *ahdr, + u32 hdrwords, struct hfi1_sge_state *ss, u32 len, + u32 plen, u32 dwords, u64 pbc) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 *hdr = (u32 *)&ahdr->ibh; + u64 pbc_flags = 0; + u32 sc5; + unsigned long flags = 0; + struct send_context *sc; + struct pio_buf *pbuf; + int wc_status = IB_WC_SUCCESS; + + /* vl15 special case taken care of in ud.c */ + sc5 = qp->s_sc; + sc = qp_to_send_context(qp, sc5); + + if (!sc) + return -EINVAL; + if (likely(pbc == 0)) { + u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5); + /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */ + pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT; + pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen); + } + pbuf = sc_buffer_alloc(sc, plen, NULL, NULL); + if (unlikely(pbuf == NULL)) { + if (ppd->host_link_state != HLS_UP_ACTIVE) { + /* + * If we have filled the PIO buffers to capacity and are + * not in an active state this request is not going to + * go out to so just complete it with an error or else a + * ULP or the core may be stuck waiting. + */ + hfi1_cdbg( + PIO, + "alloc failed. state not active, completing"); + wc_status = IB_WC_GENERAL_ERR; + goto pio_bail; + } else { + /* + * This is a normal occurrence. The PIO buffs are full + * up but we are still happily sending, well we could be + * so lets continue to queue the request. + */ + hfi1_cdbg(PIO, "alloc failed. state active, queuing"); + return no_bufs_available(qp, sc); + } + } + + if (len == 0) { + pio_copy(ppd->dd, pbuf, pbc, hdr, hdrwords); + } else { + if (ss) { + seg_pio_copy_start(pbuf, pbc, hdr, hdrwords*4); + while (len) { + void *addr = ss->sge.vaddr; + u32 slen = ss->sge.length; + + if (slen > len) + slen = len; + update_sge(ss, slen); + seg_pio_copy_mid(pbuf, addr, slen); + len -= slen; + } + seg_pio_copy_end(pbuf); + } + } + + trace_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &ahdr->ibh); + + if (qp->s_rdma_mr) { + hfi1_put_mr(qp->s_rdma_mr); + qp->s_rdma_mr = NULL; + } + +pio_bail: + if (qp->s_wqe) { + spin_lock_irqsave(&qp->s_lock, flags); + hfi1_send_complete(qp, qp->s_wqe, wc_status); + spin_unlock_irqrestore(&qp->s_lock, flags); + } else if (qp->ibqp.qp_type == IB_QPT_RC) { + spin_lock_irqsave(&qp->s_lock, flags); + hfi1_rc_send_complete(qp, &ahdr->ibh); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + return 0; +} +/* + * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent + * being an entry from the ingress partition key table), return 0 + * otherwise. Use the matching criteria for egress partition keys + * specified in the OPAv1 spec., section 9.1l.7. + */ +static inline int egress_pkey_matches_entry(u16 pkey, u16 ent) +{ + u16 mkey = pkey & PKEY_LOW_15_MASK; + u16 ment = ent & PKEY_LOW_15_MASK; + + if (mkey == ment) { + /* + * If pkey[15] is set (full partition member), + * is bit 15 in the corresponding table element + * clear (limited member)? + */ + if (pkey & PKEY_MEMBER_MASK) + return !!(ent & PKEY_MEMBER_MASK); + return 1; + } + return 0; +} + +/* + * egress_pkey_check - return 0 if hdr's pkey matches according to the + * criteria in the OPAv1 spec., section 9.11.7. + */ +static inline int egress_pkey_check(struct hfi1_pportdata *ppd, + struct hfi1_ib_header *hdr, + struct hfi1_qp *qp) +{ + struct hfi1_other_headers *ohdr; + struct hfi1_devdata *dd; + int i = 0; + u16 pkey; + u8 lnh, sc5 = qp->s_sc; + + if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT)) + return 0; + + /* locate the pkey within the headers */ + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + if (lnh == HFI1_LRH_GRH) + ohdr = &hdr->u.l.oth; + else + ohdr = &hdr->u.oth; + + pkey = (u16)be32_to_cpu(ohdr->bth[0]); + + /* If SC15, pkey[0:14] must be 0x7fff */ + if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK)) + goto bad; + + + /* Is the pkey = 0x0, or 0x8000? */ + if ((pkey & PKEY_LOW_15_MASK) == 0) + goto bad; + + /* The most likely matching pkey has index qp->s_pkey_index */ + if (unlikely(!egress_pkey_matches_entry(pkey, + ppd->pkeys[qp->s_pkey_index]))) { + /* no match - try the entire table */ + for (; i < MAX_PKEY_VALUES; i++) { + if (egress_pkey_matches_entry(pkey, ppd->pkeys[i])) + break; + } + } + + if (i < MAX_PKEY_VALUES) + return 0; +bad: + incr_cntr64(&ppd->port_xmit_constraint_errors); + dd = ppd->dd; + if (!(dd->err_info_xmit_constraint.status & OPA_EI_STATUS_SMASK)) { + u16 slid = be16_to_cpu(hdr->lrh[3]); + + dd->err_info_xmit_constraint.status |= OPA_EI_STATUS_SMASK; + dd->err_info_xmit_constraint.slid = slid; + dd->err_info_xmit_constraint.pkey = pkey; + } + return 1; +} + +/** + * hfi1_verbs_send - send a packet + * @qp: the QP to send on + * @ahdr: the packet header + * @hdrwords: the number of 32-bit words in the header + * @ss: the SGE to send + * @len: the length of the packet in bytes + * + * Return zero if packet is sent or queued OK. + * Return non-zero and clear qp->s_flags HFI1_S_BUSY otherwise. + */ +int hfi1_verbs_send(struct hfi1_qp *qp, struct ahg_ib_header *ahdr, + u32 hdrwords, struct hfi1_sge_state *ss, u32 len) +{ + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + u32 plen; + int ret; + int pio = 0; + unsigned long flags = 0; + u32 dwords = (len + 3) >> 2; + + /* + * VL15 packets (IB_QPT_SMI) will always use PIO, so we + * can defer SDMA restart until link goes ACTIVE without + * worrying about just how we got there. + */ + if ((qp->ibqp.qp_type == IB_QPT_SMI) || + !(dd->flags & HFI1_HAS_SEND_DMA)) + pio = 1; + + ret = egress_pkey_check(dd->pport, &ahdr->ibh, qp); + if (unlikely(ret)) { + /* + * The value we are returning here does not get propagated to + * the verbs caller. Thus we need to complete the request with + * error otherwise the caller could be sitting waiting on the + * completion event. Only do this for PIO. SDMA has its own + * mechanism for handling the errors. So for SDMA we can just + * return. + */ + if (pio) { + hfi1_cdbg(PIO, "%s() Failed. Completing with err", + __func__); + spin_lock_irqsave(&qp->s_lock, flags); + hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + return -EINVAL; + } + + /* + * Calculate the send buffer trigger address. + * The +2 counts for the pbc control qword + */ + plen = hdrwords + dwords + 2; + + if (pio) { + ret = dd->process_pio_send( + qp, ahdr, hdrwords, ss, len, plen, dwords, 0); + } else { +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(dd, "CONFIG SDMA %s:%d %s()\n", + slashstrip(__FILE__), __LINE__, __func__); + dd_dev_err(dd, "SDMA hdrwords = %u, len = %u\n", hdrwords, len); +#endif + ret = dd->process_dma_send( + qp, ahdr, hdrwords, ss, len, plen, dwords, 0); + } + + return ret; +} + +static int query_device(struct ib_device *ibdev, + struct ib_device_attr *props, + struct ib_udata *uhw) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_ibdev *dev = to_idev(ibdev); + + if (uhw->inlen || uhw->outlen) + return -EINVAL; + memset(props, 0, sizeof(*props)); + + props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR | + IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT | + IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE; + + props->page_size_cap = PAGE_SIZE; + props->vendor_id = + dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3; + props->vendor_part_id = dd->pcidev->device; + props->hw_ver = dd->minrev; + props->sys_image_guid = ib_hfi1_sys_image_guid; + props->max_mr_size = ~0ULL; + props->max_qp = hfi1_max_qps; + props->max_qp_wr = hfi1_max_qp_wrs; + props->max_sge = hfi1_max_sges; + props->max_sge_rd = hfi1_max_sges; + props->max_cq = hfi1_max_cqs; + props->max_ah = hfi1_max_ahs; + props->max_cqe = hfi1_max_cqes; + props->max_mr = dev->lk_table.max; + props->max_fmr = dev->lk_table.max; + props->max_map_per_fmr = 32767; + props->max_pd = hfi1_max_pds; + props->max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC; + props->max_qp_init_rd_atom = 255; + /* props->max_res_rd_atom */ + props->max_srq = hfi1_max_srqs; + props->max_srq_wr = hfi1_max_srq_wrs; + props->max_srq_sge = hfi1_max_srq_sges; + /* props->local_ca_ack_delay */ + props->atomic_cap = IB_ATOMIC_GLOB; + props->max_pkeys = hfi1_get_npkeys(dd); + props->max_mcast_grp = hfi1_max_mcast_grps; + props->max_mcast_qp_attach = hfi1_max_mcast_qp_attached; + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + + return 0; +} + +static inline u16 opa_speed_to_ib(u16 in) +{ + u16 out = 0; + + if (in & OPA_LINK_SPEED_25G) + out |= IB_SPEED_EDR; + if (in & OPA_LINK_SPEED_12_5G) + out |= IB_SPEED_FDR; + + return out; +} + +/* + * Convert a single OPA link width (no multiple flags) to an IB value. + * A zero OPA link width means link down, which means the IB width value + * is a don't care. + */ +static inline u16 opa_width_to_ib(u16 in) +{ + switch (in) { + case OPA_LINK_WIDTH_1X: + /* map 2x and 3x to 1x as they don't exist in IB */ + case OPA_LINK_WIDTH_2X: + case OPA_LINK_WIDTH_3X: + return IB_WIDTH_1X; + default: /* link down or unknown, return our largest width */ + case OPA_LINK_WIDTH_4X: + return IB_WIDTH_4X; + } +} + +static int query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u16 lid = ppd->lid; + + memset(props, 0, sizeof(*props)); + props->lid = lid ? lid : 0; + props->lmc = ppd->lmc; + props->sm_lid = ibp->sm_lid; + props->sm_sl = ibp->sm_sl; + /* OPA logical states match IB logical states */ + props->state = driver_lstate(ppd); + props->phys_state = hfi1_ibphys_portstate(ppd); + props->port_cap_flags = ibp->port_cap_flags; + props->gid_tbl_len = HFI1_GUIDS_PER_PORT; + props->max_msg_sz = 0x80000000; + props->pkey_tbl_len = hfi1_get_npkeys(dd); + props->bad_pkey_cntr = ibp->pkey_violations; + props->qkey_viol_cntr = ibp->qkey_violations; + props->active_width = (u8)opa_width_to_ib(ppd->link_width_active); + /* see rate_show() in ib core/sysfs.c */ + props->active_speed = (u8)opa_speed_to_ib(ppd->link_speed_active); + props->max_vl_num = ppd->vls_supported; + props->init_type_reply = 0; + + /* Once we are a "first class" citizen and have added the OPA MTUs to + * the core we can advertise the larger MTU enum to the ULPs, for now + * advertise only 4K. + * + * Those applications which are either OPA aware or pass the MTU enum + * from the Path Records to us will get the new 8k MTU. Those that + * attempt to process the MTU enum may fail in various ways. + */ + props->max_mtu = mtu_to_enum((!valid_ib_mtu(hfi1_max_mtu) ? + 4096 : hfi1_max_mtu), IB_MTU_4096); + props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu : + mtu_to_enum(ppd->ibmtu, IB_MTU_2048); + props->subnet_timeout = ibp->subnet_timeout; + + return 0; +} + +static int port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + err = query_port(ibdev, port_num, &attr); + if (err) + return err; + + memset(immutable, 0, sizeof(*immutable)); + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + immutable->core_cap_flags = RDMA_CORE_PORT_INTEL_OPA; + immutable->max_mad_size = OPA_MGMT_MAD_SIZE; + + return 0; +} + +static int modify_device(struct ib_device *device, + int device_modify_mask, + struct ib_device_modify *device_modify) +{ + struct hfi1_devdata *dd = dd_from_ibdev(device); + unsigned i; + int ret; + + if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID | + IB_DEVICE_MODIFY_NODE_DESC)) { + ret = -EOPNOTSUPP; + goto bail; + } + + if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) { + memcpy(device->node_desc, device_modify->node_desc, 64); + for (i = 0; i < dd->num_pports; i++) { + struct hfi1_ibport *ibp = &dd->pport[i].ibport_data; + + hfi1_node_desc_chg(ibp); + } + } + + if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) { + ib_hfi1_sys_image_guid = + cpu_to_be64(device_modify->sys_image_guid); + for (i = 0; i < dd->num_pports; i++) { + struct hfi1_ibport *ibp = &dd->pport[i].ibport_data; + + hfi1_sys_guid_chg(ibp); + } + } + + ret = 0; + +bail: + return ret; +} + +static int modify_port(struct ib_device *ibdev, u8 port, + int port_modify_mask, struct ib_port_modify *props) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + int ret = 0; + + ibp->port_cap_flags |= props->set_port_cap_mask; + ibp->port_cap_flags &= ~props->clr_port_cap_mask; + if (props->set_port_cap_mask || props->clr_port_cap_mask) + hfi1_cap_mask_chg(ibp); + if (port_modify_mask & IB_PORT_SHUTDOWN) { + set_link_down_reason(ppd, OPA_LINKDOWN_REASON_UNKNOWN, 0, + OPA_LINKDOWN_REASON_UNKNOWN); + ret = set_link_state(ppd, HLS_DN_DOWNDEF); + } + if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR) + ibp->qkey_violations = 0; + return ret; +} + +static int query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + int ret = 0; + + if (!port || port > dd->num_pports) + ret = -EINVAL; + else { + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + gid->global.subnet_prefix = ibp->gid_prefix; + if (index == 0) + gid->global.interface_id = cpu_to_be64(ppd->guid); + else if (index < HFI1_GUIDS_PER_PORT) + gid->global.interface_id = ibp->guids[index - 1]; + else + ret = -EINVAL; + } + + return ret; +} + +static struct ib_pd *alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct hfi1_ibdev *dev = to_idev(ibdev); + struct hfi1_pd *pd; + struct ib_pd *ret; + + /* + * This is actually totally arbitrary. Some correctness tests + * assume there's a maximum number of PDs that can be allocated. + * We don't actually have this limit, but we fail the test if + * we allow allocations of more than we report for this value. + */ + + pd = kmalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + spin_lock(&dev->n_pds_lock); + if (dev->n_pds_allocated == hfi1_max_pds) { + spin_unlock(&dev->n_pds_lock); + kfree(pd); + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + dev->n_pds_allocated++; + spin_unlock(&dev->n_pds_lock); + + /* ib_alloc_pd() will initialize pd->ibpd. */ + pd->user = udata != NULL; + + ret = &pd->ibpd; + +bail: + return ret; +} + +static int dealloc_pd(struct ib_pd *ibpd) +{ + struct hfi1_pd *pd = to_ipd(ibpd); + struct hfi1_ibdev *dev = to_idev(ibpd->device); + + spin_lock(&dev->n_pds_lock); + dev->n_pds_allocated--; + spin_unlock(&dev->n_pds_lock); + + kfree(pd); + + return 0; +} + +/* + * convert ah port,sl to sc + */ +u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, ah->port_num); + + return ibp->sl_to_sc[ah->sl]; +} + +int hfi1_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr) +{ + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + u8 sc5; + + /* A multicast address requires a GRH (see ch. 8.4.1). */ + if (ah_attr->dlid >= HFI1_MULTICAST_LID_BASE && + ah_attr->dlid != HFI1_PERMISSIVE_LID && + !(ah_attr->ah_flags & IB_AH_GRH)) + goto bail; + if ((ah_attr->ah_flags & IB_AH_GRH) && + ah_attr->grh.sgid_index >= HFI1_GUIDS_PER_PORT) + goto bail; + if (ah_attr->dlid == 0) + goto bail; + if (ah_attr->port_num < 1 || + ah_attr->port_num > ibdev->phys_port_cnt) + goto bail; + if (ah_attr->static_rate != IB_RATE_PORT_CURRENT && + ib_rate_to_mbps(ah_attr->static_rate) < 0) + goto bail; + if (ah_attr->sl >= OPA_MAX_SLS) + goto bail; + /* test the mapping for validity */ + ibp = to_iport(ibdev, ah_attr->port_num); + ppd = ppd_from_ibp(ibp); + sc5 = ibp->sl_to_sc[ah_attr->sl]; + dd = dd_from_ppd(ppd); + if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf) + goto bail; + return 0; +bail: + return -EINVAL; +} + +/** + * create_ah - create an address handle + * @pd: the protection domain + * @ah_attr: the attributes of the AH + * + * This may be called from interrupt context. + */ +static struct ib_ah *create_ah(struct ib_pd *pd, + struct ib_ah_attr *ah_attr) +{ + struct hfi1_ah *ah; + struct ib_ah *ret; + struct hfi1_ibdev *dev = to_idev(pd->device); + unsigned long flags; + + if (hfi1_check_ah(pd->device, ah_attr)) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + ah = kmalloc(sizeof(*ah), GFP_ATOMIC); + if (!ah) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + spin_lock_irqsave(&dev->n_ahs_lock, flags); + if (dev->n_ahs_allocated == hfi1_max_ahs) { + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + kfree(ah); + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + dev->n_ahs_allocated++; + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + + /* ib_create_ah() will initialize ah->ibah. */ + ah->attr = *ah_attr; + atomic_set(&ah->refcount, 0); + + ret = &ah->ibah; + +bail: + return ret; +} + +struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid) +{ + struct ib_ah_attr attr; + struct ib_ah *ah = ERR_PTR(-EINVAL); + struct hfi1_qp *qp0; + + memset(&attr, 0, sizeof(attr)); + attr.dlid = dlid; + attr.port_num = ppd_from_ibp(ibp)->port; + rcu_read_lock(); + qp0 = rcu_dereference(ibp->qp[0]); + if (qp0) + ah = ib_create_ah(qp0->ibqp.pd, &attr); + rcu_read_unlock(); + return ah; +} + +/** + * destroy_ah - destroy an address handle + * @ibah: the AH to destroy + * + * This may be called from interrupt context. + */ +static int destroy_ah(struct ib_ah *ibah) +{ + struct hfi1_ibdev *dev = to_idev(ibah->device); + struct hfi1_ah *ah = to_iah(ibah); + unsigned long flags; + + if (atomic_read(&ah->refcount) != 0) + return -EBUSY; + + spin_lock_irqsave(&dev->n_ahs_lock, flags); + dev->n_ahs_allocated--; + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + + kfree(ah); + + return 0; +} + +static int modify_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + struct hfi1_ah *ah = to_iah(ibah); + + if (hfi1_check_ah(ibah->device, ah_attr)) + return -EINVAL; + + ah->attr = *ah_attr; + + return 0; +} + +static int query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + struct hfi1_ah *ah = to_iah(ibah); + + *ah_attr = ah->attr; + + return 0; +} + +/** + * hfi1_get_npkeys - return the size of the PKEY table for context 0 + * @dd: the hfi1_ib device + */ +unsigned hfi1_get_npkeys(struct hfi1_devdata *dd) +{ + return ARRAY_SIZE(dd->pport[0].pkeys); +} + +static int query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + int ret; + + if (index >= hfi1_get_npkeys(dd)) { + ret = -EINVAL; + goto bail; + } + + *pkey = hfi1_get_pkey(to_iport(ibdev, port), index); + ret = 0; + +bail: + return ret; +} + +/** + * alloc_ucontext - allocate a ucontest + * @ibdev: the infiniband device + * @udata: not used by the driver + */ + +static struct ib_ucontext *alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct hfi1_ucontext *context; + struct ib_ucontext *ret; + + context = kmalloc(sizeof(*context), GFP_KERNEL); + if (!context) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + ret = &context->ibucontext; + +bail: + return ret; +} + +static int dealloc_ucontext(struct ib_ucontext *context) +{ + kfree(to_iucontext(context)); + return 0; +} + +static void init_ibport(struct hfi1_pportdata *ppd) +{ + struct hfi1_ibport *ibp = &ppd->ibport_data; + size_t sz = ARRAY_SIZE(ibp->sl_to_sc); + int i; + + for (i = 0; i < sz; i++) { + ibp->sl_to_sc[i] = i; + ibp->sc_to_sl[i] = i; + } + + spin_lock_init(&ibp->lock); + /* Set the prefix to the default value (see ch. 4.1.1) */ + ibp->gid_prefix = IB_DEFAULT_GID_PREFIX; + ibp->sm_lid = 0; + /* Below should only set bits defined in OPA PortInfo.CapabilityMask */ + ibp->port_cap_flags = IB_PORT_AUTO_MIGR_SUP | + IB_PORT_CAP_MASK_NOTICE_SUP; + ibp->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA; + ibp->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA; + ibp->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS; + ibp->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS; + ibp->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT; + + RCU_INIT_POINTER(ibp->qp[0], NULL); + RCU_INIT_POINTER(ibp->qp[1], NULL); +} + +static void verbs_txreq_kmem_cache_ctor(void *obj) +{ + struct verbs_txreq *tx = (struct verbs_txreq *)obj; + + memset(tx, 0, sizeof(*tx)); +} + +/** + * hfi1_register_ib_device - register our device with the infiniband core + * @dd: the device data structure + * Return 0 if successful, errno if unsuccessful. + */ +int hfi1_register_ib_device(struct hfi1_devdata *dd) +{ + struct hfi1_ibdev *dev = &dd->verbs_dev; + struct ib_device *ibdev = &dev->ibdev; + struct hfi1_pportdata *ppd = dd->pport; + unsigned i, lk_tab_size; + int ret; + size_t lcpysz = IB_DEVICE_NAME_MAX; + u16 descq_cnt; + + ret = hfi1_qp_init(dev); + if (ret) + goto err_qp_init; + + + for (i = 0; i < dd->num_pports; i++) + init_ibport(ppd + i); + + /* Only need to initialize non-zero fields. */ + spin_lock_init(&dev->n_pds_lock); + spin_lock_init(&dev->n_ahs_lock); + spin_lock_init(&dev->n_cqs_lock); + spin_lock_init(&dev->n_qps_lock); + spin_lock_init(&dev->n_srqs_lock); + spin_lock_init(&dev->n_mcast_grps_lock); + init_timer(&dev->mem_timer); + dev->mem_timer.function = mem_timer; + dev->mem_timer.data = (unsigned long) dev; + + /* + * The top hfi1_lkey_table_size bits are used to index the + * table. The lower 8 bits can be owned by the user (copied from + * the LKEY). The remaining bits act as a generation number or tag. + */ + spin_lock_init(&dev->lk_table.lock); + dev->lk_table.max = 1 << hfi1_lkey_table_size; + /* ensure generation is at least 4 bits (keys.c) */ + if (hfi1_lkey_table_size > MAX_LKEY_TABLE_BITS) { + dd_dev_warn(dd, "lkey bits %u too large, reduced to %u\n", + hfi1_lkey_table_size, MAX_LKEY_TABLE_BITS); + hfi1_lkey_table_size = MAX_LKEY_TABLE_BITS; + } + lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table); + dev->lk_table.table = (struct hfi1_mregion __rcu **) + vmalloc(lk_tab_size); + if (dev->lk_table.table == NULL) { + ret = -ENOMEM; + goto err_lk; + } + RCU_INIT_POINTER(dev->dma_mr, NULL); + for (i = 0; i < dev->lk_table.max; i++) + RCU_INIT_POINTER(dev->lk_table.table[i], NULL); + INIT_LIST_HEAD(&dev->pending_mmaps); + spin_lock_init(&dev->pending_lock); + seqlock_init(&dev->iowait_lock); + dev->mmap_offset = PAGE_SIZE; + spin_lock_init(&dev->mmap_offset_lock); + INIT_LIST_HEAD(&dev->txwait); + INIT_LIST_HEAD(&dev->memwait); + + descq_cnt = sdma_get_descq_cnt(); + + /* SLAB_HWCACHE_ALIGN for AHG */ + dev->verbs_txreq_cache = kmem_cache_create("hfi1_vtxreq_cache", + sizeof(struct verbs_txreq), + 0, SLAB_HWCACHE_ALIGN, + verbs_txreq_kmem_cache_ctor); + if (!dev->verbs_txreq_cache) { + ret = -ENOMEM; + goto err_verbs_txreq; + } + + /* + * The system image GUID is supposed to be the same for all + * HFIs in a single system but since there can be other + * device types in the system, we can't be sure this is unique. + */ + if (!ib_hfi1_sys_image_guid) + ib_hfi1_sys_image_guid = cpu_to_be64(ppd->guid); + lcpysz = strlcpy(ibdev->name, class_name(), lcpysz); + strlcpy(ibdev->name + lcpysz, "_%d", IB_DEVICE_NAME_MAX - lcpysz); + ibdev->owner = THIS_MODULE; + ibdev->node_guid = cpu_to_be64(ppd->guid); + ibdev->uverbs_abi_ver = HFI1_UVERBS_ABI_VERSION; + ibdev->uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_MODIFY_AH) | + (1ull << IB_USER_VERBS_CMD_QUERY_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV); + ibdev->node_type = RDMA_NODE_IB_CA; + ibdev->phys_port_cnt = dd->num_pports; + ibdev->num_comp_vectors = 1; + ibdev->dma_device = &dd->pcidev->dev; + ibdev->query_device = query_device; + ibdev->modify_device = modify_device; + ibdev->query_port = query_port; + ibdev->modify_port = modify_port; + ibdev->query_pkey = query_pkey; + ibdev->query_gid = query_gid; + ibdev->alloc_ucontext = alloc_ucontext; + ibdev->dealloc_ucontext = dealloc_ucontext; + ibdev->alloc_pd = alloc_pd; + ibdev->dealloc_pd = dealloc_pd; + ibdev->create_ah = create_ah; + ibdev->destroy_ah = destroy_ah; + ibdev->modify_ah = modify_ah; + ibdev->query_ah = query_ah; + ibdev->create_srq = hfi1_create_srq; + ibdev->modify_srq = hfi1_modify_srq; + ibdev->query_srq = hfi1_query_srq; + ibdev->destroy_srq = hfi1_destroy_srq; + ibdev->create_qp = hfi1_create_qp; + ibdev->modify_qp = hfi1_modify_qp; + ibdev->query_qp = hfi1_query_qp; + ibdev->destroy_qp = hfi1_destroy_qp; + ibdev->post_send = post_send; + ibdev->post_recv = post_receive; + ibdev->post_srq_recv = hfi1_post_srq_receive; + ibdev->create_cq = hfi1_create_cq; + ibdev->destroy_cq = hfi1_destroy_cq; + ibdev->resize_cq = hfi1_resize_cq; + ibdev->poll_cq = hfi1_poll_cq; + ibdev->req_notify_cq = hfi1_req_notify_cq; + ibdev->get_dma_mr = hfi1_get_dma_mr; + ibdev->reg_phys_mr = hfi1_reg_phys_mr; + ibdev->reg_user_mr = hfi1_reg_user_mr; + ibdev->dereg_mr = hfi1_dereg_mr; + ibdev->alloc_mr = hfi1_alloc_mr; + ibdev->alloc_fast_reg_page_list = hfi1_alloc_fast_reg_page_list; + ibdev->free_fast_reg_page_list = hfi1_free_fast_reg_page_list; + ibdev->alloc_fmr = hfi1_alloc_fmr; + ibdev->map_phys_fmr = hfi1_map_phys_fmr; + ibdev->unmap_fmr = hfi1_unmap_fmr; + ibdev->dealloc_fmr = hfi1_dealloc_fmr; + ibdev->attach_mcast = hfi1_multicast_attach; + ibdev->detach_mcast = hfi1_multicast_detach; + ibdev->process_mad = hfi1_process_mad; + ibdev->mmap = hfi1_mmap; + ibdev->dma_ops = &hfi1_dma_mapping_ops; + ibdev->get_port_immutable = port_immutable; + + strncpy(ibdev->node_desc, init_utsname()->nodename, + sizeof(ibdev->node_desc)); + + ret = ib_register_device(ibdev, hfi1_create_port_files); + if (ret) + goto err_reg; + + ret = hfi1_create_agents(dev); + if (ret) + goto err_agents; + + ret = hfi1_verbs_register_sysfs(dd); + if (ret) + goto err_class; + + goto bail; + +err_class: + hfi1_free_agents(dev); +err_agents: + ib_unregister_device(ibdev); +err_reg: +err_verbs_txreq: + kmem_cache_destroy(dev->verbs_txreq_cache); + vfree(dev->lk_table.table); +err_lk: + hfi1_qp_exit(dev); +err_qp_init: + dd_dev_err(dd, "cannot register verbs: %d!\n", -ret); +bail: + return ret; +} + +void hfi1_unregister_ib_device(struct hfi1_devdata *dd) +{ + struct hfi1_ibdev *dev = &dd->verbs_dev; + struct ib_device *ibdev = &dev->ibdev; + + hfi1_verbs_unregister_sysfs(dd); + + hfi1_free_agents(dev); + + ib_unregister_device(ibdev); + + if (!list_empty(&dev->txwait)) + dd_dev_err(dd, "txwait list not empty!\n"); + if (!list_empty(&dev->memwait)) + dd_dev_err(dd, "memwait list not empty!\n"); + if (dev->dma_mr) + dd_dev_err(dd, "DMA MR not NULL!\n"); + + hfi1_qp_exit(dev); + del_timer_sync(&dev->mem_timer); + kmem_cache_destroy(dev->verbs_txreq_cache); + vfree(dev->lk_table.table); +} + +/* + * This must be called with s_lock held. + */ +void hfi1_schedule_send(struct hfi1_qp *qp) +{ + if (hfi1_send_ok(qp)) { + struct hfi1_ibport *ibp = + to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + iowait_schedule(&qp->s_iowait, ppd->hfi1_wq); + } +} + +void hfi1_cnp_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data; + + if (packet->qp->ibqp.qp_type == IB_QPT_UC) + hfi1_uc_rcv(packet); + else if (packet->qp->ibqp.qp_type == IB_QPT_UD) + hfi1_ud_rcv(packet); + else + ibp->n_pkt_drops++; +} diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/staging/rdma/hfi1/verbs.h new file mode 100644 index 000000000..ed903a93b --- /dev/null +++ b/drivers/staging/rdma/hfi1/verbs.h @@ -0,0 +1,1151 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef HFI1_VERBS_H +#define HFI1_VERBS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct hfi1_ctxtdata; +struct hfi1_pportdata; +struct hfi1_devdata; +struct hfi1_packet; + +#include "iowait.h" + +#define HFI1_MAX_RDMA_ATOMIC 16 +#define HFI1_GUIDS_PER_PORT 5 + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define HFI1_UVERBS_ABI_VERSION 2 + +/* + * Define an ib_cq_notify value that is not valid so we know when CQ + * notifications are armed. + */ +#define IB_CQ_NONE (IB_CQ_NEXT_COMP + 1) + +#define IB_SEQ_NAK (3 << 29) + +/* AETH NAK opcode values */ +#define IB_RNR_NAK 0x20 +#define IB_NAK_PSN_ERROR 0x60 +#define IB_NAK_INVALID_REQUEST 0x61 +#define IB_NAK_REMOTE_ACCESS_ERROR 0x62 +#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63 +#define IB_NAK_INVALID_RD_REQUEST 0x64 + +/* Flags for checking QP state (see ib_hfi1_state_ops[]) */ +#define HFI1_POST_SEND_OK 0x01 +#define HFI1_POST_RECV_OK 0x02 +#define HFI1_PROCESS_RECV_OK 0x04 +#define HFI1_PROCESS_SEND_OK 0x08 +#define HFI1_PROCESS_NEXT_SEND_OK 0x10 +#define HFI1_FLUSH_SEND 0x20 +#define HFI1_FLUSH_RECV 0x40 +#define HFI1_PROCESS_OR_FLUSH_SEND \ + (HFI1_PROCESS_SEND_OK | HFI1_FLUSH_SEND) + +/* IB Performance Manager status values */ +#define IB_PMA_SAMPLE_STATUS_DONE 0x00 +#define IB_PMA_SAMPLE_STATUS_STARTED 0x01 +#define IB_PMA_SAMPLE_STATUS_RUNNING 0x02 + +/* Mandatory IB performance counter select values. */ +#define IB_PMA_PORT_XMIT_DATA cpu_to_be16(0x0001) +#define IB_PMA_PORT_RCV_DATA cpu_to_be16(0x0002) +#define IB_PMA_PORT_XMIT_PKTS cpu_to_be16(0x0003) +#define IB_PMA_PORT_RCV_PKTS cpu_to_be16(0x0004) +#define IB_PMA_PORT_XMIT_WAIT cpu_to_be16(0x0005) + +#define HFI1_VENDOR_IPG cpu_to_be16(0xFFA0) + +#define IB_BTH_REQ_ACK (1 << 31) +#define IB_BTH_SOLICITED (1 << 23) +#define IB_BTH_MIG_REQ (1 << 22) + +#define IB_GRH_VERSION 6 +#define IB_GRH_VERSION_MASK 0xF +#define IB_GRH_VERSION_SHIFT 28 +#define IB_GRH_TCLASS_MASK 0xFF +#define IB_GRH_TCLASS_SHIFT 20 +#define IB_GRH_FLOW_MASK 0xFFFFF +#define IB_GRH_FLOW_SHIFT 0 +#define IB_GRH_NEXT_HDR 0x1B + +#define IB_DEFAULT_GID_PREFIX cpu_to_be64(0xfe80000000000000ULL) + +/* flags passed by hfi1_ib_rcv() */ +enum { + HFI1_HAS_GRH = (1 << 0), +}; + +struct ib_reth { + __be64 vaddr; + __be32 rkey; + __be32 length; +} __packed; + +struct ib_atomic_eth { + __be32 vaddr[2]; /* unaligned so access as 2 32-bit words */ + __be32 rkey; + __be64 swap_data; + __be64 compare_data; +} __packed; + +union ib_ehdrs { + struct { + __be32 deth[2]; + __be32 imm_data; + } ud; + struct { + struct ib_reth reth; + __be32 imm_data; + } rc; + struct { + __be32 aeth; + __be32 atomic_ack_eth[2]; + } at; + __be32 imm_data; + __be32 aeth; + struct ib_atomic_eth atomic_eth; +} __packed; + +struct hfi1_other_headers { + __be32 bth[3]; + union ib_ehdrs u; +} __packed; + +/* + * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes + * long (72 w/ imm_data). Only the first 56 bytes of the IB header + * will be in the eager header buffer. The remaining 12 or 16 bytes + * are in the data buffer. + */ +struct hfi1_ib_header { + __be16 lrh[4]; + union { + struct { + struct ib_grh grh; + struct hfi1_other_headers oth; + } l; + struct hfi1_other_headers oth; + } u; +} __packed; + +struct ahg_ib_header { + struct sdma_engine *sde; + u32 ahgdesc[2]; + u16 tx_flags; + u8 ahgcount; + u8 ahgidx; + struct hfi1_ib_header ibh; +}; + +struct hfi1_pio_header { + __le64 pbc; + struct hfi1_ib_header hdr; +} __packed; + +/* + * used for force cacheline alignment for AHG + */ +struct tx_pio_header { + struct hfi1_pio_header phdr; +} ____cacheline_aligned; + +/* + * There is one struct hfi1_mcast for each multicast GID. + * All attached QPs are then stored as a list of + * struct hfi1_mcast_qp. + */ +struct hfi1_mcast_qp { + struct list_head list; + struct hfi1_qp *qp; +}; + +struct hfi1_mcast { + struct rb_node rb_node; + union ib_gid mgid; + struct list_head qp_list; + wait_queue_head_t wait; + atomic_t refcount; + int n_attached; +}; + +/* Protection domain */ +struct hfi1_pd { + struct ib_pd ibpd; + int user; /* non-zero if created from user space */ +}; + +/* Address Handle */ +struct hfi1_ah { + struct ib_ah ibah; + struct ib_ah_attr attr; + atomic_t refcount; +}; + +/* + * This structure is used by hfi1_mmap() to validate an offset + * when an mmap() request is made. The vm_area_struct then uses + * this as its vm_private_data. + */ +struct hfi1_mmap_info { + struct list_head pending_mmaps; + struct ib_ucontext *context; + void *obj; + __u64 offset; + struct kref ref; + unsigned size; +}; + +/* + * This structure is used to contain the head pointer, tail pointer, + * and completion queue entries as a single memory allocation so + * it can be mmap'ed into user space. + */ +struct hfi1_cq_wc { + u32 head; /* index of next entry to fill */ + u32 tail; /* index of next ib_poll_cq() entry */ + union { + /* these are actually size ibcq.cqe + 1 */ + struct ib_uverbs_wc uqueue[0]; + struct ib_wc kqueue[0]; + }; +}; + +/* + * The completion queue structure. + */ +struct hfi1_cq { + struct ib_cq ibcq; + struct kthread_work comptask; + struct hfi1_devdata *dd; + spinlock_t lock; /* protect changes in this struct */ + u8 notify; + u8 triggered; + struct hfi1_cq_wc *queue; + struct hfi1_mmap_info *ip; +}; + +/* + * A segment is a linear region of low physical memory. + * Used by the verbs layer. + */ +struct hfi1_seg { + void *vaddr; + size_t length; +}; + +/* The number of hfi1_segs that fit in a page. */ +#define HFI1_SEGSZ (PAGE_SIZE / sizeof(struct hfi1_seg)) + +struct hfi1_segarray { + struct hfi1_seg segs[HFI1_SEGSZ]; +}; + +struct hfi1_mregion { + struct ib_pd *pd; /* shares refcnt of ibmr.pd */ + u64 user_base; /* User's address for this region */ + u64 iova; /* IB start address of this region */ + size_t length; + u32 lkey; + u32 offset; /* offset (bytes) to start of region */ + int access_flags; + u32 max_segs; /* number of hfi1_segs in all the arrays */ + u32 mapsz; /* size of the map array */ + u8 page_shift; /* 0 - non unform/non powerof2 sizes */ + u8 lkey_published; /* in global table */ + struct completion comp; /* complete when refcount goes to zero */ + atomic_t refcount; + struct hfi1_segarray *map[0]; /* the segments */ +}; + +/* + * These keep track of the copy progress within a memory region. + * Used by the verbs layer. + */ +struct hfi1_sge { + struct hfi1_mregion *mr; + void *vaddr; /* kernel virtual address of segment */ + u32 sge_length; /* length of the SGE */ + u32 length; /* remaining length of the segment */ + u16 m; /* current index: mr->map[m] */ + u16 n; /* current index: mr->map[m]->segs[n] */ +}; + +/* Memory region */ +struct hfi1_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct hfi1_mregion mr; /* must be last */ +}; + +/* + * Send work request queue entry. + * The size of the sg_list is determined when the QP is created and stored + * in qp->s_max_sge. + */ +struct hfi1_swqe { + struct ib_send_wr wr; /* don't use wr.sg_list */ + u32 psn; /* first packet sequence number */ + u32 lpsn; /* last packet sequence number */ + u32 ssn; /* send sequence number */ + u32 length; /* total length of data in sg_list */ + struct hfi1_sge sg_list[0]; +}; + +/* + * Receive work request queue entry. + * The size of the sg_list is determined when the QP (or SRQ) is created + * and stored in qp->r_rq.max_sge (or srq->rq.max_sge). + */ +struct hfi1_rwqe { + u64 wr_id; + u8 num_sge; + struct ib_sge sg_list[0]; +}; + +/* + * This structure is used to contain the head pointer, tail pointer, + * and receive work queue entries as a single memory allocation so + * it can be mmap'ed into user space. + * Note that the wq array elements are variable size so you can't + * just index into the array to get the N'th element; + * use get_rwqe_ptr() instead. + */ +struct hfi1_rwq { + u32 head; /* new work requests posted to the head */ + u32 tail; /* receives pull requests from here. */ + struct hfi1_rwqe wq[0]; +}; + +struct hfi1_rq { + struct hfi1_rwq *wq; + u32 size; /* size of RWQE array */ + u8 max_sge; + /* protect changes in this struct */ + spinlock_t lock ____cacheline_aligned_in_smp; +}; + +struct hfi1_srq { + struct ib_srq ibsrq; + struct hfi1_rq rq; + struct hfi1_mmap_info *ip; + /* send signal when number of RWQEs < limit */ + u32 limit; +}; + +struct hfi1_sge_state { + struct hfi1_sge *sg_list; /* next SGE to be used if any */ + struct hfi1_sge sge; /* progress state for the current SGE */ + u32 total_len; + u8 num_sge; +}; + +/* + * This structure holds the information that the send tasklet needs + * to send a RDMA read response or atomic operation. + */ +struct hfi1_ack_entry { + u8 opcode; + u8 sent; + u32 psn; + u32 lpsn; + union { + struct hfi1_sge rdma_sge; + u64 atomic_data; + }; +}; + +/* + * Variables prefixed with s_ are for the requester (sender). + * Variables prefixed with r_ are for the responder (receiver). + * Variables prefixed with ack_ are for responder replies. + * + * Common variables are protected by both r_rq.lock and s_lock in that order + * which only happens in modify_qp() or changing the QP 'state'. + */ +struct hfi1_qp { + struct ib_qp ibqp; + /* read mostly fields above and below */ + struct ib_ah_attr remote_ah_attr; + struct ib_ah_attr alt_ah_attr; + struct hfi1_qp __rcu *next; /* link list for QPN hash table */ + struct hfi1_swqe *s_wq; /* send work queue */ + struct hfi1_mmap_info *ip; + struct ahg_ib_header *s_hdr; /* next packet header to send */ + u8 s_sc; /* SC[0..4] for next packet */ + unsigned long timeout_jiffies; /* computed from timeout */ + + enum ib_mtu path_mtu; + int srate_mbps; /* s_srate (below) converted to Mbit/s */ + u32 remote_qpn; + u32 pmtu; /* decoded from path_mtu */ + u32 qkey; /* QKEY for this QP (for UD or RD) */ + u32 s_size; /* send work queue size */ + u32 s_rnr_timeout; /* number of milliseconds for RNR timeout */ + u32 s_ahgpsn; /* set to the psn in the copy of the header */ + + u8 state; /* QP state */ + u8 allowed_ops; /* high order bits of allowed opcodes */ + u8 qp_access_flags; + u8 alt_timeout; /* Alternate path timeout for this QP */ + u8 timeout; /* Timeout for this QP */ + u8 s_srate; + u8 s_mig_state; + u8 port_num; + u8 s_pkey_index; /* PKEY index to use */ + u8 s_alt_pkey_index; /* Alternate path PKEY index to use */ + u8 r_max_rd_atomic; /* max number of RDMA read/atomic to receive */ + u8 s_max_rd_atomic; /* max number of RDMA read/atomic to send */ + u8 s_retry_cnt; /* number of times to retry */ + u8 s_rnr_retry_cnt; + u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */ + u8 s_max_sge; /* size of s_wq->sg_list */ + u8 s_draining; + + /* start of read/write fields */ + atomic_t refcount ____cacheline_aligned_in_smp; + wait_queue_head_t wait; + + + struct hfi1_ack_entry s_ack_queue[HFI1_MAX_RDMA_ATOMIC + 1] + ____cacheline_aligned_in_smp; + struct hfi1_sge_state s_rdma_read_sge; + + spinlock_t r_lock ____cacheline_aligned_in_smp; /* used for APM */ + unsigned long r_aflags; + u64 r_wr_id; /* ID for current receive WQE */ + u32 r_ack_psn; /* PSN for next ACK or atomic ACK */ + u32 r_len; /* total length of r_sge */ + u32 r_rcv_len; /* receive data len processed */ + u32 r_psn; /* expected rcv packet sequence number */ + u32 r_msn; /* message sequence number */ + + u8 r_state; /* opcode of last packet received */ + u8 r_flags; + u8 r_head_ack_queue; /* index into s_ack_queue[] */ + + struct list_head rspwait; /* link for waiting to respond */ + + struct hfi1_sge_state r_sge; /* current receive data */ + struct hfi1_rq r_rq; /* receive work queue */ + + spinlock_t s_lock ____cacheline_aligned_in_smp; + struct hfi1_sge_state *s_cur_sge; + u32 s_flags; + struct hfi1_swqe *s_wqe; + struct hfi1_sge_state s_sge; /* current send request data */ + struct hfi1_mregion *s_rdma_mr; + struct sdma_engine *s_sde; /* current sde */ + u32 s_cur_size; /* size of send packet in bytes */ + u32 s_len; /* total length of s_sge */ + u32 s_rdma_read_len; /* total length of s_rdma_read_sge */ + u32 s_next_psn; /* PSN for next request */ + u32 s_last_psn; /* last response PSN processed */ + u32 s_sending_psn; /* lowest PSN that is being sent */ + u32 s_sending_hpsn; /* highest PSN that is being sent */ + u32 s_psn; /* current packet sequence number */ + u32 s_ack_rdma_psn; /* PSN for sending RDMA read responses */ + u32 s_ack_psn; /* PSN for acking sends and RDMA writes */ + u32 s_head; /* new entries added here */ + u32 s_tail; /* next entry to process */ + u32 s_cur; /* current work queue entry */ + u32 s_acked; /* last un-ACK'ed entry */ + u32 s_last; /* last completed entry */ + u32 s_ssn; /* SSN of tail entry */ + u32 s_lsn; /* limit sequence number (credit) */ + u16 s_hdrwords; /* size of s_hdr in 32 bit words */ + u16 s_rdma_ack_cnt; + s8 s_ahgidx; + u8 s_state; /* opcode of last packet sent */ + u8 s_ack_state; /* opcode of packet to ACK */ + u8 s_nak_state; /* non-zero if NAK is pending */ + u8 r_nak_state; /* non-zero if NAK is pending */ + u8 s_retry; /* requester retry counter */ + u8 s_rnr_retry; /* requester RNR retry counter */ + u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */ + u8 s_tail_ack_queue; /* index into s_ack_queue[] */ + + struct hfi1_sge_state s_ack_rdma_sge; + struct timer_list s_timer; + + struct iowait s_iowait; + + struct hfi1_sge r_sg_list[0] /* verified SGEs */ + ____cacheline_aligned_in_smp; +}; + +/* + * Atomic bit definitions for r_aflags. + */ +#define HFI1_R_WRID_VALID 0 +#define HFI1_R_REWIND_SGE 1 + +/* + * Bit definitions for r_flags. + */ +#define HFI1_R_REUSE_SGE 0x01 +#define HFI1_R_RDMAR_SEQ 0x02 +#define HFI1_R_RSP_NAK 0x04 +#define HFI1_R_RSP_SEND 0x08 +#define HFI1_R_COMM_EST 0x10 + +/* + * Bit definitions for s_flags. + * + * HFI1_S_SIGNAL_REQ_WR - set if QP send WRs contain completion signaled + * HFI1_S_BUSY - send tasklet is processing the QP + * HFI1_S_TIMER - the RC retry timer is active + * HFI1_S_ACK_PENDING - an ACK is waiting to be sent after RDMA read/atomics + * HFI1_S_WAIT_FENCE - waiting for all prior RDMA read or atomic SWQEs + * before processing the next SWQE + * HFI1_S_WAIT_RDMAR - waiting for a RDMA read or atomic SWQE to complete + * before processing the next SWQE + * HFI1_S_WAIT_RNR - waiting for RNR timeout + * HFI1_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE + * HFI1_S_WAIT_DMA - waiting for send DMA queue to drain before generating + * next send completion entry not via send DMA + * HFI1_S_WAIT_PIO - waiting for a send buffer to be available + * HFI1_S_WAIT_TX - waiting for a struct verbs_txreq to be available + * HFI1_S_WAIT_DMA_DESC - waiting for DMA descriptors to be available + * HFI1_S_WAIT_KMEM - waiting for kernel memory to be available + * HFI1_S_WAIT_PSN - waiting for a packet to exit the send DMA queue + * HFI1_S_WAIT_ACK - waiting for an ACK packet before sending more requests + * HFI1_S_SEND_ONE - send one packet, request ACK, then wait for ACK + * HFI1_S_ECN - a BECN was queued to the send engine + */ +#define HFI1_S_SIGNAL_REQ_WR 0x0001 +#define HFI1_S_BUSY 0x0002 +#define HFI1_S_TIMER 0x0004 +#define HFI1_S_RESP_PENDING 0x0008 +#define HFI1_S_ACK_PENDING 0x0010 +#define HFI1_S_WAIT_FENCE 0x0020 +#define HFI1_S_WAIT_RDMAR 0x0040 +#define HFI1_S_WAIT_RNR 0x0080 +#define HFI1_S_WAIT_SSN_CREDIT 0x0100 +#define HFI1_S_WAIT_DMA 0x0200 +#define HFI1_S_WAIT_PIO 0x0400 +#define HFI1_S_WAIT_TX 0x0800 +#define HFI1_S_WAIT_DMA_DESC 0x1000 +#define HFI1_S_WAIT_KMEM 0x2000 +#define HFI1_S_WAIT_PSN 0x4000 +#define HFI1_S_WAIT_ACK 0x8000 +#define HFI1_S_SEND_ONE 0x10000 +#define HFI1_S_UNLIMITED_CREDIT 0x20000 +#define HFI1_S_AHG_VALID 0x40000 +#define HFI1_S_AHG_CLEAR 0x80000 +#define HFI1_S_ECN 0x100000 + +/* + * Wait flags that would prevent any packet type from being sent. + */ +#define HFI1_S_ANY_WAIT_IO (HFI1_S_WAIT_PIO | HFI1_S_WAIT_TX | \ + HFI1_S_WAIT_DMA_DESC | HFI1_S_WAIT_KMEM) + +/* + * Wait flags that would prevent send work requests from making progress. + */ +#define HFI1_S_ANY_WAIT_SEND (HFI1_S_WAIT_FENCE | HFI1_S_WAIT_RDMAR | \ + HFI1_S_WAIT_RNR | HFI1_S_WAIT_SSN_CREDIT | HFI1_S_WAIT_DMA | \ + HFI1_S_WAIT_PSN | HFI1_S_WAIT_ACK) + +#define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | HFI1_S_ANY_WAIT_SEND) + +#define HFI1_PSN_CREDIT 16 + +/* + * Since struct hfi1_swqe is not a fixed size, we can't simply index into + * struct hfi1_qp.s_wq. This function does the array index computation. + */ +static inline struct hfi1_swqe *get_swqe_ptr(struct hfi1_qp *qp, + unsigned n) +{ + return (struct hfi1_swqe *)((char *)qp->s_wq + + (sizeof(struct hfi1_swqe) + + qp->s_max_sge * + sizeof(struct hfi1_sge)) * n); +} + +/* + * Since struct hfi1_rwqe is not a fixed size, we can't simply index into + * struct hfi1_rwq.wq. This function does the array index computation. + */ +static inline struct hfi1_rwqe *get_rwqe_ptr(struct hfi1_rq *rq, unsigned n) +{ + return (struct hfi1_rwqe *) + ((char *) rq->wq->wq + + (sizeof(struct hfi1_rwqe) + + rq->max_sge * sizeof(struct ib_sge)) * n); +} + +#define MAX_LKEY_TABLE_BITS 23 + +struct hfi1_lkey_table { + spinlock_t lock; /* protect changes in this struct */ + u32 next; /* next unused index (speeds search) */ + u32 gen; /* generation count */ + u32 max; /* size of the table */ + struct hfi1_mregion __rcu **table; +}; + +struct hfi1_opcode_stats { + u64 n_packets; /* number of packets */ + u64 n_bytes; /* total number of bytes */ +}; + +struct hfi1_opcode_stats_perctx { + struct hfi1_opcode_stats stats[256]; +}; + +static inline void inc_opstats( + u32 tlen, + struct hfi1_opcode_stats *stats) +{ +#ifdef CONFIG_DEBUG_FS + stats->n_bytes += tlen; + stats->n_packets++; +#endif +} + +struct hfi1_ibport { + struct hfi1_qp __rcu *qp[2]; + struct ib_mad_agent *send_agent; /* agent for SMI (traps) */ + struct hfi1_ah *sm_ah; + struct hfi1_ah *smi_ah; + struct rb_root mcast_tree; + spinlock_t lock; /* protect changes in this struct */ + + /* non-zero when timer is set */ + unsigned long mkey_lease_timeout; + unsigned long trap_timeout; + __be64 gid_prefix; /* in network order */ + __be64 mkey; + __be64 guids[HFI1_GUIDS_PER_PORT - 1]; /* writable GUIDs */ + u64 tid; /* TID for traps */ + u64 n_rc_resends; + u64 n_seq_naks; + u64 n_rdma_seq; + u64 n_rnr_naks; + u64 n_other_naks; + u64 n_loop_pkts; + u64 n_pkt_drops; + u64 n_vl15_dropped; + u64 n_rc_timeouts; + u64 n_dmawait; + u64 n_unaligned; + u64 n_rc_dupreq; + u64 n_rc_seqnak; + + /* Hot-path per CPU counters to avoid cacheline trading to update */ + u64 z_rc_acks; + u64 z_rc_qacks; + u64 z_rc_delayed_comp; + u64 __percpu *rc_acks; + u64 __percpu *rc_qacks; + u64 __percpu *rc_delayed_comp; + + u32 port_cap_flags; + u32 pma_sample_start; + u32 pma_sample_interval; + __be16 pma_counter_select[5]; + u16 pma_tag; + u16 pkey_violations; + u16 qkey_violations; + u16 mkey_violations; + u16 mkey_lease_period; + u16 sm_lid; + u16 repress_traps; + u8 sm_sl; + u8 mkeyprot; + u8 subnet_timeout; + u8 vl_high_limit; + /* the first 16 entries are sl_to_vl for !OPA */ + u8 sl_to_sc[32]; + u8 sc_to_sl[32]; +}; + + +struct hfi1_qp_ibdev; +struct hfi1_ibdev { + struct ib_device ibdev; + struct list_head pending_mmaps; + spinlock_t mmap_offset_lock; /* protect mmap_offset */ + u32 mmap_offset; + struct hfi1_mregion __rcu *dma_mr; + + struct hfi1_qp_ibdev *qp_dev; + + /* QP numbers are shared by all IB ports */ + struct hfi1_lkey_table lk_table; + /* protect wait lists */ + seqlock_t iowait_lock; + struct list_head txwait; /* list for wait verbs_txreq */ + struct list_head memwait; /* list for wait kernel memory */ + struct list_head txreq_free; + struct kmem_cache *verbs_txreq_cache; + struct timer_list mem_timer; + + /* other waiters */ + spinlock_t pending_lock; + + u64 n_piowait; + u64 n_txwait; + u64 n_kmem_wait; + + u32 n_pds_allocated; /* number of PDs allocated for device */ + spinlock_t n_pds_lock; + u32 n_ahs_allocated; /* number of AHs allocated for device */ + spinlock_t n_ahs_lock; + u32 n_cqs_allocated; /* number of CQs allocated for device */ + spinlock_t n_cqs_lock; + u32 n_qps_allocated; /* number of QPs allocated for device */ + spinlock_t n_qps_lock; + u32 n_srqs_allocated; /* number of SRQs allocated for device */ + spinlock_t n_srqs_lock; + u32 n_mcast_grps_allocated; /* number of mcast groups allocated */ + spinlock_t n_mcast_grps_lock; +#ifdef CONFIG_DEBUG_FS + /* per HFI debugfs */ + struct dentry *hfi1_ibdev_dbg; + /* per HFI symlinks to above */ + struct dentry *hfi1_ibdev_link; +#endif +}; + +struct hfi1_verbs_counters { + u64 symbol_error_counter; + u64 link_error_recovery_counter; + u64 link_downed_counter; + u64 port_rcv_errors; + u64 port_rcv_remphys_errors; + u64 port_xmit_discards; + u64 port_xmit_data; + u64 port_rcv_data; + u64 port_xmit_packets; + u64 port_rcv_packets; + u32 local_link_integrity_errors; + u32 excessive_buffer_overrun_errors; + u32 vl15_dropped; +}; + +static inline struct hfi1_mr *to_imr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct hfi1_mr, ibmr); +} + +static inline struct hfi1_pd *to_ipd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct hfi1_pd, ibpd); +} + +static inline struct hfi1_ah *to_iah(struct ib_ah *ibah) +{ + return container_of(ibah, struct hfi1_ah, ibah); +} + +static inline struct hfi1_cq *to_icq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct hfi1_cq, ibcq); +} + +static inline struct hfi1_srq *to_isrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct hfi1_srq, ibsrq); +} + +static inline struct hfi1_qp *to_iqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct hfi1_qp, ibqp); +} + +static inline struct hfi1_ibdev *to_idev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct hfi1_ibdev, ibdev); +} + +/* + * Send if not busy or waiting for I/O and either + * a RC response is pending or we can process send work requests. + */ +static inline int hfi1_send_ok(struct hfi1_qp *qp) +{ + return !(qp->s_flags & (HFI1_S_BUSY | HFI1_S_ANY_WAIT_IO)) && + (qp->s_hdrwords || (qp->s_flags & HFI1_S_RESP_PENDING) || + !(qp->s_flags & HFI1_S_ANY_WAIT_SEND)); +} + +/* + * This must be called with s_lock held. + */ +void hfi1_schedule_send(struct hfi1_qp *qp); +void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl, + u32 qp1, u32 qp2, __be16 lid1, __be16 lid2); +void hfi1_cap_mask_chg(struct hfi1_ibport *ibp); +void hfi1_sys_guid_chg(struct hfi1_ibport *ibp); +void hfi1_node_desc_chg(struct hfi1_ibport *ibp); +int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in_mad, size_t in_mad_size, + struct ib_mad_hdr *out_mad, size_t *out_mad_size, + u16 *out_mad_pkey_index); +int hfi1_create_agents(struct hfi1_ibdev *dev); +void hfi1_free_agents(struct hfi1_ibdev *dev); + +/* + * The PSN_MASK and PSN_SHIFT allow for + * 1) comparing two PSNs + * 2) returning the PSN with any upper bits masked + * 3) returning the difference between to PSNs + * + * The number of significant bits in the PSN must + * necessarily be at least one bit less than + * the container holding the PSN. + */ +#ifndef CONFIG_HFI1_VERBS_31BIT_PSN +#define PSN_MASK 0xFFFFFF +#define PSN_SHIFT 8 +#else +#define PSN_MASK 0x7FFFFFFF +#define PSN_SHIFT 1 +#endif +#define PSN_MODIFY_MASK 0xFFFFFF + +/* Number of bits to pay attention to in the opcode for checking qp type */ +#define OPCODE_QP_MASK 0xE0 + +/* + * Compare the lower 24 bits of the msn values. + * Returns an integer <, ==, or > than zero. + */ +static inline int cmp_msn(u32 a, u32 b) +{ + return (((int) a) - ((int) b)) << 8; +} + +/* + * Compare two PSNs + * Returns an integer <, ==, or > than zero. + */ +static inline int cmp_psn(u32 a, u32 b) +{ + return (((int) a) - ((int) b)) << PSN_SHIFT; +} + +/* + * Return masked PSN + */ +static inline u32 mask_psn(u32 a) +{ + return a & PSN_MASK; +} + +/* + * Return delta between two PSNs + */ +static inline u32 delta_psn(u32 a, u32 b) +{ + return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT; +} + +struct hfi1_mcast *hfi1_mcast_find(struct hfi1_ibport *ibp, union ib_gid *mgid); + +int hfi1_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); + +int hfi1_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); + +int hfi1_mcast_tree_empty(struct hfi1_ibport *ibp); + +struct verbs_txreq; +void hfi1_put_txreq(struct verbs_txreq *tx); + +int hfi1_verbs_send(struct hfi1_qp *qp, struct ahg_ib_header *ahdr, + u32 hdrwords, struct hfi1_sge_state *ss, u32 len); + +void hfi1_copy_sge(struct hfi1_sge_state *ss, void *data, u32 length, + int release); + +void hfi1_skip_sge(struct hfi1_sge_state *ss, u32 length, int release); + +void hfi1_cnp_rcv(struct hfi1_packet *packet); + +void hfi1_uc_rcv(struct hfi1_packet *packet); + +void hfi1_rc_rcv(struct hfi1_packet *packet); + +void hfi1_rc_hdrerr( + struct hfi1_ctxtdata *rcd, + struct hfi1_ib_header *hdr, + u32 rcv_flags, + struct hfi1_qp *qp); + +u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah_attr); + +int hfi1_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr); + +struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid); + +void hfi1_rc_rnr_retry(unsigned long arg); + +void hfi1_rc_send_complete(struct hfi1_qp *qp, struct hfi1_ib_header *hdr); + +void hfi1_rc_error(struct hfi1_qp *qp, enum ib_wc_status err); + +void hfi1_ud_rcv(struct hfi1_packet *packet); + +int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey); + +int hfi1_alloc_lkey(struct hfi1_mregion *mr, int dma_region); + +void hfi1_free_lkey(struct hfi1_mregion *mr); + +int hfi1_lkey_ok(struct hfi1_lkey_table *rkt, struct hfi1_pd *pd, + struct hfi1_sge *isge, struct ib_sge *sge, int acc); + +int hfi1_rkey_ok(struct hfi1_qp *qp, struct hfi1_sge *sge, + u32 len, u64 vaddr, u32 rkey, int acc); + +int hfi1_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); + +struct ib_srq *hfi1_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata); + +int hfi1_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, + struct ib_udata *udata); + +int hfi1_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr); + +int hfi1_destroy_srq(struct ib_srq *ibsrq); + +int hfi1_cq_init(struct hfi1_devdata *dd); + +void hfi1_cq_exit(struct hfi1_devdata *dd); + +void hfi1_cq_enter(struct hfi1_cq *cq, struct ib_wc *entry, int sig); + +int hfi1_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); + +struct ib_cq *hfi1_create_cq( + struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata); + +int hfi1_destroy_cq(struct ib_cq *ibcq); + +int hfi1_req_notify_cq( + struct ib_cq *ibcq, + enum ib_cq_notify_flags notify_flags); + +int hfi1_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); + +struct ib_mr *hfi1_get_dma_mr(struct ib_pd *pd, int acc); + +struct ib_mr *hfi1_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, int acc, u64 *iova_start); + +struct ib_mr *hfi1_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata); + +int hfi1_dereg_mr(struct ib_mr *ibmr); + +struct ib_mr *hfi1_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_entries); + +struct ib_fast_reg_page_list *hfi1_alloc_fast_reg_page_list( + struct ib_device *ibdev, int page_list_len); + +void hfi1_free_fast_reg_page_list(struct ib_fast_reg_page_list *pl); + +int hfi1_fast_reg_mr(struct hfi1_qp *qp, struct ib_send_wr *wr); + +struct ib_fmr *hfi1_alloc_fmr(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr); + +int hfi1_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, + int list_len, u64 iova); + +int hfi1_unmap_fmr(struct list_head *fmr_list); + +int hfi1_dealloc_fmr(struct ib_fmr *ibfmr); + +static inline void hfi1_get_mr(struct hfi1_mregion *mr) +{ + atomic_inc(&mr->refcount); +} + +static inline void hfi1_put_mr(struct hfi1_mregion *mr) +{ + if (unlikely(atomic_dec_and_test(&mr->refcount))) + complete(&mr->comp); +} + +static inline void hfi1_put_ss(struct hfi1_sge_state *ss) +{ + while (ss->num_sge) { + hfi1_put_mr(ss->sge.mr); + if (--ss->num_sge) + ss->sge = *ss->sg_list++; + } +} + +void hfi1_release_mmap_info(struct kref *ref); + +struct hfi1_mmap_info *hfi1_create_mmap_info(struct hfi1_ibdev *dev, u32 size, + struct ib_ucontext *context, + void *obj); + +void hfi1_update_mmap_info(struct hfi1_ibdev *dev, struct hfi1_mmap_info *ip, + u32 size, void *obj); + +int hfi1_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); + +int hfi1_get_rwqe(struct hfi1_qp *qp, int wr_id_only); + +void hfi1_migrate_qp(struct hfi1_qp *qp); + +int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr, + int has_grh, struct hfi1_qp *qp, u32 bth0); + +u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr, + struct ib_global_route *grh, u32 hwords, u32 nwords); + +void clear_ahg(struct hfi1_qp *qp); + +void hfi1_make_ruc_header(struct hfi1_qp *qp, struct hfi1_other_headers *ohdr, + u32 bth0, u32 bth2, int middle); + +void hfi1_do_send(struct work_struct *work); + +void hfi1_send_complete(struct hfi1_qp *qp, struct hfi1_swqe *wqe, + enum ib_wc_status status); + +void hfi1_send_rc_ack(struct hfi1_ctxtdata *, struct hfi1_qp *qp, int is_fecn); + +int hfi1_make_rc_req(struct hfi1_qp *qp); + +int hfi1_make_uc_req(struct hfi1_qp *qp); + +int hfi1_make_ud_req(struct hfi1_qp *qp); + +int hfi1_register_ib_device(struct hfi1_devdata *); + +void hfi1_unregister_ib_device(struct hfi1_devdata *); + +void hfi1_ib_rcv(struct hfi1_packet *packet); + +unsigned hfi1_get_npkeys(struct hfi1_devdata *); + +int hfi1_verbs_send_dma(struct hfi1_qp *qp, struct ahg_ib_header *hdr, + u32 hdrwords, struct hfi1_sge_state *ss, u32 len, + u32 plen, u32 dwords, u64 pbc); + +int hfi1_verbs_send_pio(struct hfi1_qp *qp, struct ahg_ib_header *hdr, + u32 hdrwords, struct hfi1_sge_state *ss, u32 len, + u32 plen, u32 dwords, u64 pbc); + +struct send_context *qp_to_send_context(struct hfi1_qp *qp, u8 sc5); + +extern const enum ib_wc_opcode ib_hfi1_wc_opcode[]; + +extern const u8 hdr_len_by_opcode[]; + +extern const int ib_hfi1_state_ops[]; + +extern __be64 ib_hfi1_sys_image_guid; /* in network order */ + +extern unsigned int hfi1_lkey_table_size; + +extern unsigned int hfi1_max_cqes; + +extern unsigned int hfi1_max_cqs; + +extern unsigned int hfi1_max_qp_wrs; + +extern unsigned int hfi1_max_qps; + +extern unsigned int hfi1_max_sges; + +extern unsigned int hfi1_max_mcast_grps; + +extern unsigned int hfi1_max_mcast_qp_attached; + +extern unsigned int hfi1_max_srqs; + +extern unsigned int hfi1_max_srq_sges; + +extern unsigned int hfi1_max_srq_wrs; + +extern const u32 ib_hfi1_rnr_table[]; + +extern struct ib_dma_mapping_ops hfi1_dma_mapping_ops; + +#endif /* HFI1_VERBS_H */ diff --git a/drivers/staging/rdma/hfi1/verbs_mcast.c b/drivers/staging/rdma/hfi1/verbs_mcast.c new file mode 100644 index 000000000..afc6b4c61 --- /dev/null +++ b/drivers/staging/rdma/hfi1/verbs_mcast.c @@ -0,0 +1,385 @@ +/* + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include + +#include "hfi.h" + +/** + * mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct + * @qp: the QP to link + */ +static struct hfi1_mcast_qp *mcast_qp_alloc(struct hfi1_qp *qp) +{ + struct hfi1_mcast_qp *mqp; + + mqp = kmalloc(sizeof(*mqp), GFP_KERNEL); + if (!mqp) + goto bail; + + mqp->qp = qp; + atomic_inc(&qp->refcount); + +bail: + return mqp; +} + +static void mcast_qp_free(struct hfi1_mcast_qp *mqp) +{ + struct hfi1_qp *qp = mqp->qp; + + /* Notify hfi1_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + + kfree(mqp); +} + +/** + * mcast_alloc - allocate the multicast GID structure + * @mgid: the multicast GID + * + * A list of QPs will be attached to this structure. + */ +static struct hfi1_mcast *mcast_alloc(union ib_gid *mgid) +{ + struct hfi1_mcast *mcast; + + mcast = kmalloc(sizeof(*mcast), GFP_KERNEL); + if (!mcast) + goto bail; + + mcast->mgid = *mgid; + INIT_LIST_HEAD(&mcast->qp_list); + init_waitqueue_head(&mcast->wait); + atomic_set(&mcast->refcount, 0); + mcast->n_attached = 0; + +bail: + return mcast; +} + +static void mcast_free(struct hfi1_mcast *mcast) +{ + struct hfi1_mcast_qp *p, *tmp; + + list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) + mcast_qp_free(p); + + kfree(mcast); +} + +/** + * hfi1_mcast_find - search the global table for the given multicast GID + * @ibp: the IB port structure + * @mgid: the multicast GID to search for + * + * Returns NULL if not found. + * + * The caller is responsible for decrementing the reference count if found. + */ +struct hfi1_mcast *hfi1_mcast_find(struct hfi1_ibport *ibp, union ib_gid *mgid) +{ + struct rb_node *n; + unsigned long flags; + struct hfi1_mcast *mcast; + + spin_lock_irqsave(&ibp->lock, flags); + n = ibp->mcast_tree.rb_node; + while (n) { + int ret; + + mcast = rb_entry(n, struct hfi1_mcast, rb_node); + + ret = memcmp(mgid->raw, mcast->mgid.raw, + sizeof(union ib_gid)); + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else { + atomic_inc(&mcast->refcount); + spin_unlock_irqrestore(&ibp->lock, flags); + goto bail; + } + } + spin_unlock_irqrestore(&ibp->lock, flags); + + mcast = NULL; + +bail: + return mcast; +} + +/** + * mcast_add - insert mcast GID into table and attach QP struct + * @mcast: the mcast GID table + * @mqp: the QP to attach + * + * Return zero if both were added. Return EEXIST if the GID was already in + * the table but the QP was added. Return ESRCH if the QP was already + * attached and neither structure was added. + */ +static int mcast_add(struct hfi1_ibdev *dev, struct hfi1_ibport *ibp, + struct hfi1_mcast *mcast, struct hfi1_mcast_qp *mqp) +{ + struct rb_node **n = &ibp->mcast_tree.rb_node; + struct rb_node *pn = NULL; + int ret; + + spin_lock_irq(&ibp->lock); + + while (*n) { + struct hfi1_mcast *tmcast; + struct hfi1_mcast_qp *p; + + pn = *n; + tmcast = rb_entry(pn, struct hfi1_mcast, rb_node); + + ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw, + sizeof(union ib_gid)); + if (ret < 0) { + n = &pn->rb_left; + continue; + } + if (ret > 0) { + n = &pn->rb_right; + continue; + } + + /* Search the QP list to see if this is already there. */ + list_for_each_entry_rcu(p, &tmcast->qp_list, list) { + if (p->qp == mqp->qp) { + ret = ESRCH; + goto bail; + } + } + if (tmcast->n_attached == hfi1_max_mcast_qp_attached) { + ret = ENOMEM; + goto bail; + } + + tmcast->n_attached++; + + list_add_tail_rcu(&mqp->list, &tmcast->qp_list); + ret = EEXIST; + goto bail; + } + + spin_lock(&dev->n_mcast_grps_lock); + if (dev->n_mcast_grps_allocated == hfi1_max_mcast_grps) { + spin_unlock(&dev->n_mcast_grps_lock); + ret = ENOMEM; + goto bail; + } + + dev->n_mcast_grps_allocated++; + spin_unlock(&dev->n_mcast_grps_lock); + + mcast->n_attached++; + + list_add_tail_rcu(&mqp->list, &mcast->qp_list); + + atomic_inc(&mcast->refcount); + rb_link_node(&mcast->rb_node, pn, n); + rb_insert_color(&mcast->rb_node, &ibp->mcast_tree); + + ret = 0; + +bail: + spin_unlock_irq(&ibp->lock); + + return ret; +} + +int hfi1_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct hfi1_qp *qp = to_iqp(ibqp); + struct hfi1_ibdev *dev = to_idev(ibqp->device); + struct hfi1_ibport *ibp; + struct hfi1_mcast *mcast; + struct hfi1_mcast_qp *mqp; + int ret; + + if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) { + ret = -EINVAL; + goto bail; + } + + /* + * Allocate data structures since its better to do this outside of + * spin locks and it will most likely be needed. + */ + mcast = mcast_alloc(gid); + if (mcast == NULL) { + ret = -ENOMEM; + goto bail; + } + mqp = mcast_qp_alloc(qp); + if (mqp == NULL) { + mcast_free(mcast); + ret = -ENOMEM; + goto bail; + } + ibp = to_iport(ibqp->device, qp->port_num); + switch (mcast_add(dev, ibp, mcast, mqp)) { + case ESRCH: + /* Neither was used: OK to attach the same QP twice. */ + mcast_qp_free(mqp); + mcast_free(mcast); + break; + + case EEXIST: /* The mcast wasn't used */ + mcast_free(mcast); + break; + + case ENOMEM: + /* Exceeded the maximum number of mcast groups. */ + mcast_qp_free(mqp); + mcast_free(mcast); + ret = -ENOMEM; + goto bail; + + default: + break; + } + + ret = 0; + +bail: + return ret; +} + +int hfi1_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct hfi1_qp *qp = to_iqp(ibqp); + struct hfi1_ibdev *dev = to_idev(ibqp->device); + struct hfi1_ibport *ibp = to_iport(ibqp->device, qp->port_num); + struct hfi1_mcast *mcast = NULL; + struct hfi1_mcast_qp *p, *tmp; + struct rb_node *n; + int last = 0; + int ret; + + if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) { + ret = -EINVAL; + goto bail; + } + + spin_lock_irq(&ibp->lock); + + /* Find the GID in the mcast table. */ + n = ibp->mcast_tree.rb_node; + while (1) { + if (n == NULL) { + spin_unlock_irq(&ibp->lock); + ret = -EINVAL; + goto bail; + } + + mcast = rb_entry(n, struct hfi1_mcast, rb_node); + ret = memcmp(gid->raw, mcast->mgid.raw, + sizeof(union ib_gid)); + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else + break; + } + + /* Search the QP list. */ + list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) { + if (p->qp != qp) + continue; + /* + * We found it, so remove it, but don't poison the forward + * link until we are sure there are no list walkers. + */ + list_del_rcu(&p->list); + mcast->n_attached--; + + /* If this was the last attached QP, remove the GID too. */ + if (list_empty(&mcast->qp_list)) { + rb_erase(&mcast->rb_node, &ibp->mcast_tree); + last = 1; + } + break; + } + + spin_unlock_irq(&ibp->lock); + + if (p) { + /* + * Wait for any list walkers to finish before freeing the + * list element. + */ + wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1); + mcast_qp_free(p); + } + if (last) { + atomic_dec(&mcast->refcount); + wait_event(mcast->wait, !atomic_read(&mcast->refcount)); + mcast_free(mcast); + spin_lock_irq(&dev->n_mcast_grps_lock); + dev->n_mcast_grps_allocated--; + spin_unlock_irq(&dev->n_mcast_grps_lock); + } + + ret = 0; + +bail: + return ret; +} + +int hfi1_mcast_tree_empty(struct hfi1_ibport *ibp) +{ + return ibp->mcast_tree.rb_node == NULL; +} diff --git a/drivers/staging/rdma/ipath/Kconfig b/drivers/staging/rdma/ipath/Kconfig new file mode 100644 index 000000000..041ce0634 --- /dev/null +++ b/drivers/staging/rdma/ipath/Kconfig @@ -0,0 +1,16 @@ +config INFINIBAND_IPATH + tristate "QLogic HTX HCA support" + depends on 64BIT && NET && HT_IRQ + ---help--- + This is a driver for the deprecated QLogic Hyper-Transport + IB host channel adapter (model QHT7140), + including InfiniBand verbs support. This driver allows these + devices to be used with both kernel upper level protocols such + as IP-over-InfiniBand as well as with userspace applications + (in conjunction with InfiniBand userspace access). + For QLogic PCIe QLE based cards, use the QIB driver instead. + + If you have this hardware you will need to boot with PAT disabled + on your x86-64 systems, use the nopat kernel parameter. + + Note that this driver will soon be removed entirely from the kernel. diff --git a/drivers/staging/rdma/ipath/Makefile b/drivers/staging/rdma/ipath/Makefile new file mode 100644 index 000000000..4496f2820 --- /dev/null +++ b/drivers/staging/rdma/ipath/Makefile @@ -0,0 +1,37 @@ +ccflags-y := -DIPATH_IDSTR='"QLogic kernel.org driver"' \ + -DIPATH_KERN_TYPE=0 + +obj-$(CONFIG_INFINIBAND_IPATH) += ib_ipath.o + +ib_ipath-y := \ + ipath_cq.o \ + ipath_diag.o \ + ipath_dma.o \ + ipath_driver.o \ + ipath_eeprom.o \ + ipath_file_ops.o \ + ipath_fs.o \ + ipath_init_chip.o \ + ipath_intr.o \ + ipath_keys.o \ + ipath_mad.o \ + ipath_mmap.o \ + ipath_mr.o \ + ipath_qp.o \ + ipath_rc.o \ + ipath_ruc.o \ + ipath_sdma.o \ + ipath_srq.o \ + ipath_stats.o \ + ipath_sysfs.o \ + ipath_uc.o \ + ipath_ud.o \ + ipath_user_pages.o \ + ipath_user_sdma.o \ + ipath_verbs_mcast.o \ + ipath_verbs.o + +ib_ipath-$(CONFIG_HT_IRQ) += ipath_iba6110.o + +ib_ipath-$(CONFIG_X86_64) += ipath_wc_x86_64.o +ib_ipath-$(CONFIG_PPC64) += ipath_wc_ppc64.o diff --git a/drivers/staging/rdma/ipath/TODO b/drivers/staging/rdma/ipath/TODO new file mode 100644 index 000000000..cb00158d6 --- /dev/null +++ b/drivers/staging/rdma/ipath/TODO @@ -0,0 +1,5 @@ +The ipath driver has been moved to staging in preparation for its removal in a +few releases. The driver will be deleted during the 4.6 merge window. + +Contact Dennis Dalessandro and +Cc: linux-rdma@vger.kernel.org diff --git a/drivers/staging/rdma/ipath/ipath_common.h b/drivers/staging/rdma/ipath/ipath_common.h new file mode 100644 index 000000000..28cfe97cf --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_common.h @@ -0,0 +1,851 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_COMMON_H +#define _IPATH_COMMON_H + +/* + * This file contains defines, structures, etc. that are used + * to communicate between kernel and user code. + */ + + +/* This is the IEEE-assigned OUI for QLogic Inc. InfiniPath */ +#define IPATH_SRC_OUI_1 0x00 +#define IPATH_SRC_OUI_2 0x11 +#define IPATH_SRC_OUI_3 0x75 + +/* version of protocol header (known to chip also). In the long run, + * we should be able to generate and accept a range of version numbers; + * for now we only accept one, and it's compiled in. + */ +#define IPS_PROTO_VERSION 2 + +/* + * These are compile time constants that you may want to enable or disable + * if you are trying to debug problems with code or performance. + * IPATH_VERBOSE_TRACING define as 1 if you want additional tracing in + * fastpath code + * IPATH_TRACE_REGWRITES define as 1 if you want register writes to be + * traced in faspath code + * _IPATH_TRACING define as 0 if you want to remove all tracing in a + * compilation unit + * _IPATH_DEBUGGING define as 0 if you want to remove debug prints + */ + +/* + * The value in the BTH QP field that InfiniPath uses to differentiate + * an infinipath protocol IB packet vs standard IB transport + */ +#define IPATH_KD_QP 0x656b79 + +/* + * valid states passed to ipath_set_linkstate() user call + */ +#define IPATH_IB_LINKDOWN 0 +#define IPATH_IB_LINKARM 1 +#define IPATH_IB_LINKACTIVE 2 +#define IPATH_IB_LINKDOWN_ONLY 3 +#define IPATH_IB_LINKDOWN_SLEEP 4 +#define IPATH_IB_LINKDOWN_DISABLE 5 +#define IPATH_IB_LINK_LOOPBACK 6 /* enable local loopback */ +#define IPATH_IB_LINK_EXTERNAL 7 /* normal, disable local loopback */ +#define IPATH_IB_LINK_NO_HRTBT 8 /* disable Heartbeat, e.g. for loopback */ +#define IPATH_IB_LINK_HRTBT 9 /* enable heartbeat, normal, non-loopback */ + +/* + * These 3 values (SDR and DDR may be ORed for auto-speed + * negotiation) are used for the 3rd argument to path_f_set_ib_cfg + * with cmd IPATH_IB_CFG_SPD_ENB, by direct calls or via sysfs. They + * are also the the possible values for ipath_link_speed_enabled and active + * The values were chosen to match values used within the IB spec. + */ +#define IPATH_IB_SDR 1 +#define IPATH_IB_DDR 2 + +/* + * stats maintained by the driver. For now, at least, this is global + * to all minor devices. + */ +struct infinipath_stats { + /* number of interrupts taken */ + __u64 sps_ints; + /* number of interrupts for errors */ + __u64 sps_errints; + /* number of errors from chip (not incl. packet errors or CRC) */ + __u64 sps_errs; + /* number of packet errors from chip other than CRC */ + __u64 sps_pkterrs; + /* number of packets with CRC errors (ICRC and VCRC) */ + __u64 sps_crcerrs; + /* number of hardware errors reported (parity, etc.) */ + __u64 sps_hwerrs; + /* number of times IB link changed state unexpectedly */ + __u64 sps_iblink; + __u64 sps_unused; /* was fastrcvint, no longer implemented */ + /* number of kernel (port0) packets received */ + __u64 sps_port0pkts; + /* number of "ethernet" packets sent by driver */ + __u64 sps_ether_spkts; + /* number of "ethernet" packets received by driver */ + __u64 sps_ether_rpkts; + /* number of SMA packets sent by driver. Obsolete. */ + __u64 sps_sma_spkts; + /* number of SMA packets received by driver. Obsolete. */ + __u64 sps_sma_rpkts; + /* number of times all ports rcvhdrq was full and packet dropped */ + __u64 sps_hdrqfull; + /* number of times all ports egrtid was full and packet dropped */ + __u64 sps_etidfull; + /* + * number of times we tried to send from driver, but no pio buffers + * avail + */ + __u64 sps_nopiobufs; + /* number of ports currently open */ + __u64 sps_ports; + /* list of pkeys (other than default) accepted (0 means not set) */ + __u16 sps_pkeys[4]; + __u16 sps_unused16[4]; /* available; maintaining compatible layout */ + /* number of user ports per chip (not IB ports) */ + __u32 sps_nports; + /* not our interrupt, or already handled */ + __u32 sps_nullintr; + /* max number of packets handled per receive call */ + __u32 sps_maxpkts_call; + /* avg number of packets handled per receive call */ + __u32 sps_avgpkts_call; + /* total number of pages locked */ + __u64 sps_pagelocks; + /* total number of pages unlocked */ + __u64 sps_pageunlocks; + /* + * Number of packets dropped in kernel other than errors (ether + * packets if ipath not configured, etc.) + */ + __u64 sps_krdrops; + __u64 sps_txeparity; /* PIO buffer parity error, recovered */ + /* pad for future growth */ + __u64 __sps_pad[45]; +}; + +/* + * These are the status bits readable (in ascii form, 64bit value) + * from the "status" sysfs file. + */ +#define IPATH_STATUS_INITTED 0x1 /* basic initialization done */ +#define IPATH_STATUS_DISABLED 0x2 /* hardware disabled */ +/* Device has been disabled via admin request */ +#define IPATH_STATUS_ADMIN_DISABLED 0x4 +/* Chip has been found and initted */ +#define IPATH_STATUS_CHIP_PRESENT 0x20 +/* IB link is at ACTIVE, usable for data traffic */ +#define IPATH_STATUS_IB_READY 0x40 +/* link is configured, LID, MTU, etc. have been set */ +#define IPATH_STATUS_IB_CONF 0x80 +/* no link established, probably no cable */ +#define IPATH_STATUS_IB_NOCABLE 0x100 +/* A Fatal hardware error has occurred. */ +#define IPATH_STATUS_HWERROR 0x200 + +/* + * The list of usermode accessible registers. Also see Reg_* later in file. + */ +typedef enum _ipath_ureg { + /* (RO) DMA RcvHdr to be used next. */ + ur_rcvhdrtail = 0, + /* (RW) RcvHdr entry to be processed next by host. */ + ur_rcvhdrhead = 1, + /* (RO) Index of next Eager index to use. */ + ur_rcvegrindextail = 2, + /* (RW) Eager TID to be processed next */ + ur_rcvegrindexhead = 3, + /* For internal use only; max register number. */ + _IPATH_UregMax +} ipath_ureg; + +/* bit values for spi_runtime_flags */ +#define IPATH_RUNTIME_HT 0x1 +#define IPATH_RUNTIME_PCIE 0x2 +#define IPATH_RUNTIME_FORCE_WC_ORDER 0x4 +#define IPATH_RUNTIME_RCVHDR_COPY 0x8 +#define IPATH_RUNTIME_MASTER 0x10 +#define IPATH_RUNTIME_NODMA_RTAIL 0x80 +#define IPATH_RUNTIME_SDMA 0x200 +#define IPATH_RUNTIME_FORCE_PIOAVAIL 0x400 +#define IPATH_RUNTIME_PIO_REGSWAPPED 0x800 + +/* + * This structure is returned by ipath_userinit() immediately after + * open to get implementation-specific info, and info specific to this + * instance. + * + * This struct must have explict pad fields where type sizes + * may result in different alignments between 32 and 64 bit + * programs, since the 64 bit * bit kernel requires the user code + * to have matching offsets + */ +struct ipath_base_info { + /* version of hardware, for feature checking. */ + __u32 spi_hw_version; + /* version of software, for feature checking. */ + __u32 spi_sw_version; + /* InfiniPath port assigned, goes into sent packets */ + __u16 spi_port; + __u16 spi_subport; + /* + * IB MTU, packets IB data must be less than this. + * The MTU is in bytes, and will be a multiple of 4 bytes. + */ + __u32 spi_mtu; + /* + * Size of a PIO buffer. Any given packet's total size must be less + * than this (in words). Included is the starting control word, so + * if 513 is returned, then total pkt size is 512 words or less. + */ + __u32 spi_piosize; + /* size of the TID cache in infinipath, in entries */ + __u32 spi_tidcnt; + /* size of the TID Eager list in infinipath, in entries */ + __u32 spi_tidegrcnt; + /* size of a single receive header queue entry in words. */ + __u32 spi_rcvhdrent_size; + /* + * Count of receive header queue entries allocated. + * This may be less than the spu_rcvhdrcnt passed in!. + */ + __u32 spi_rcvhdr_cnt; + + /* per-chip and other runtime features bitmap (IPATH_RUNTIME_*) */ + __u32 spi_runtime_flags; + + /* address where receive buffer queue is mapped into */ + __u64 spi_rcvhdr_base; + + /* user program. */ + + /* base address of eager TID receive buffers. */ + __u64 spi_rcv_egrbufs; + + /* Allocated by initialization code, not by protocol. */ + + /* + * Size of each TID buffer in host memory, starting at + * spi_rcv_egrbufs. The buffers are virtually contiguous. + */ + __u32 spi_rcv_egrbufsize; + /* + * The special QP (queue pair) value that identifies an infinipath + * protocol packet from standard IB packets. More, probably much + * more, to be added. + */ + __u32 spi_qpair; + + /* + * User register base for init code, not to be used directly by + * protocol or applications. + */ + __u64 __spi_uregbase; + /* + * Maximum buffer size in bytes that can be used in a single TID + * entry (assuming the buffer is aligned to this boundary). This is + * the minimum of what the hardware and software support Guaranteed + * to be a power of 2. + */ + __u32 spi_tid_maxsize; + /* + * alignment of each pio send buffer (byte count + * to add to spi_piobufbase to get to second buffer) + */ + __u32 spi_pioalign; + /* + * The index of the first pio buffer available to this process; + * needed to do lookup in spi_pioavailaddr; not added to + * spi_piobufbase. + */ + __u32 spi_pioindex; + /* number of buffers mapped for this process */ + __u32 spi_piocnt; + + /* + * Base address of writeonly pio buffers for this process. + * Each buffer has spi_piosize words, and is aligned on spi_pioalign + * boundaries. spi_piocnt buffers are mapped from this address + */ + __u64 spi_piobufbase; + + /* + * Base address of readonly memory copy of the pioavail registers. + * There are 2 bits for each buffer. + */ + __u64 spi_pioavailaddr; + + /* + * Address where driver updates a copy of the interface and driver + * status (IPATH_STATUS_*) as a 64 bit value. It's followed by a + * string indicating hardware error, if there was one. + */ + __u64 spi_status; + + /* number of chip ports available to user processes */ + __u32 spi_nports; + /* unit number of chip we are using */ + __u32 spi_unit; + /* num bufs in each contiguous set */ + __u32 spi_rcv_egrperchunk; + /* size in bytes of each contiguous set */ + __u32 spi_rcv_egrchunksize; + /* total size of mmap to cover full rcvegrbuffers */ + __u32 spi_rcv_egrbuftotlen; + __u32 spi_filler_for_align; + /* address of readonly memory copy of the rcvhdrq tail register. */ + __u64 spi_rcvhdr_tailaddr; + + /* shared memory pages for subports if port is shared */ + __u64 spi_subport_uregbase; + __u64 spi_subport_rcvegrbuf; + __u64 spi_subport_rcvhdr_base; + + /* shared memory page for hardware port if it is shared */ + __u64 spi_port_uregbase; + __u64 spi_port_rcvegrbuf; + __u64 spi_port_rcvhdr_base; + __u64 spi_port_rcvhdr_tailaddr; + +} __attribute__ ((aligned(8))); + + +/* + * This version number is given to the driver by the user code during + * initialization in the spu_userversion field of ipath_user_info, so + * the driver can check for compatibility with user code. + * + * The major version changes when data structures + * change in an incompatible way. The driver must be the same or higher + * for initialization to succeed. In some cases, a higher version + * driver will not interoperate with older software, and initialization + * will return an error. + */ +#define IPATH_USER_SWMAJOR 1 + +/* + * Minor version differences are always compatible + * a within a major version, however if user software is larger + * than driver software, some new features and/or structure fields + * may not be implemented; the user code must deal with this if it + * cares, or it must abort after initialization reports the difference. + */ +#define IPATH_USER_SWMINOR 6 + +#define IPATH_USER_SWVERSION ((IPATH_USER_SWMAJOR<<16) | IPATH_USER_SWMINOR) + +#define IPATH_KERN_TYPE 0 + +/* + * Similarly, this is the kernel version going back to the user. It's + * slightly different, in that we want to tell if the driver was built as + * part of a QLogic release, or from the driver from openfabrics.org, + * kernel.org, or a standard distribution, for support reasons. + * The high bit is 0 for non-QLogic and 1 for QLogic-built/supplied. + * + * It's returned by the driver to the user code during initialization in the + * spi_sw_version field of ipath_base_info, so the user code can in turn + * check for compatibility with the kernel. +*/ +#define IPATH_KERN_SWVERSION ((IPATH_KERN_TYPE<<31) | IPATH_USER_SWVERSION) + +/* + * This structure is passed to ipath_userinit() to tell the driver where + * user code buffers are, sizes, etc. The offsets and sizes of the + * fields must remain unchanged, for binary compatibility. It can + * be extended, if userversion is changed so user code can tell, if needed + */ +struct ipath_user_info { + /* + * version of user software, to detect compatibility issues. + * Should be set to IPATH_USER_SWVERSION. + */ + __u32 spu_userversion; + + /* desired number of receive header queue entries */ + __u32 spu_rcvhdrcnt; + + /* size of struct base_info to write to */ + __u32 spu_base_info_size; + + /* + * number of words in KD protocol header + * This tells InfiniPath how many words to copy to rcvhdrq. If 0, + * kernel uses a default. Once set, attempts to set any other value + * are an error (EAGAIN) until driver is reloaded. + */ + __u32 spu_rcvhdrsize; + + /* + * If two or more processes wish to share a port, each process + * must set the spu_subport_cnt and spu_subport_id to the same + * values. The only restriction on the spu_subport_id is that + * it be unique for a given node. + */ + __u16 spu_subport_cnt; + __u16 spu_subport_id; + + __u32 spu_unused; /* kept for compatible layout */ + + /* + * address of struct base_info to write to + */ + __u64 spu_base_info; + +} __attribute__ ((aligned(8))); + +/* User commands. */ + +#define IPATH_CMD_MIN 16 + +#define __IPATH_CMD_USER_INIT 16 /* old set up userspace (for old user code) */ +#define IPATH_CMD_PORT_INFO 17 /* find out what resources we got */ +#define IPATH_CMD_RECV_CTRL 18 /* control receipt of packets */ +#define IPATH_CMD_TID_UPDATE 19 /* update expected TID entries */ +#define IPATH_CMD_TID_FREE 20 /* free expected TID entries */ +#define IPATH_CMD_SET_PART_KEY 21 /* add partition key */ +#define __IPATH_CMD_SLAVE_INFO 22 /* return info on slave processes (for old user code) */ +#define IPATH_CMD_ASSIGN_PORT 23 /* allocate HCA and port */ +#define IPATH_CMD_USER_INIT 24 /* set up userspace */ +#define IPATH_CMD_UNUSED_1 25 +#define IPATH_CMD_UNUSED_2 26 +#define IPATH_CMD_PIOAVAILUPD 27 /* force an update of PIOAvail reg */ +#define IPATH_CMD_POLL_TYPE 28 /* set the kind of polling we want */ +#define IPATH_CMD_ARMLAUNCH_CTRL 29 /* armlaunch detection control */ +/* 30 is unused */ +#define IPATH_CMD_SDMA_INFLIGHT 31 /* sdma inflight counter request */ +#define IPATH_CMD_SDMA_COMPLETE 32 /* sdma completion counter request */ + +/* + * Poll types + */ +#define IPATH_POLL_TYPE_URGENT 0x01 +#define IPATH_POLL_TYPE_OVERFLOW 0x02 + +struct ipath_port_info { + __u32 num_active; /* number of active units */ + __u32 unit; /* unit (chip) assigned to caller */ + __u16 port; /* port on unit assigned to caller */ + __u16 subport; /* subport on unit assigned to caller */ + __u16 num_ports; /* number of ports available on unit */ + __u16 num_subports; /* number of subports opened on port */ +}; + +struct ipath_tid_info { + __u32 tidcnt; + /* make structure same size in 32 and 64 bit */ + __u32 tid__unused; + /* virtual address of first page in transfer */ + __u64 tidvaddr; + /* pointer (same size 32/64 bit) to __u16 tid array */ + __u64 tidlist; + + /* + * pointer (same size 32/64 bit) to bitmap of TIDs used + * for this call; checked for being large enough at open + */ + __u64 tidmap; +}; + +struct ipath_cmd { + __u32 type; /* command type */ + union { + struct ipath_tid_info tid_info; + struct ipath_user_info user_info; + + /* + * address in userspace where we should put the sdma + * inflight counter + */ + __u64 sdma_inflight; + /* + * address in userspace where we should put the sdma + * completion counter + */ + __u64 sdma_complete; + /* address in userspace of struct ipath_port_info to + write result to */ + __u64 port_info; + /* enable/disable receipt of packets */ + __u32 recv_ctrl; + /* enable/disable armlaunch errors (non-zero to enable) */ + __u32 armlaunch_ctrl; + /* partition key to set */ + __u16 part_key; + /* user address of __u32 bitmask of active slaves */ + __u64 slave_mask_addr; + /* type of polling we want */ + __u16 poll_type; + } cmd; +}; + +struct ipath_iovec { + /* Pointer to data, but same size 32 and 64 bit */ + __u64 iov_base; + + /* + * Length of data; don't need 64 bits, but want + * ipath_sendpkt to remain same size as before 32 bit changes, so... + */ + __u64 iov_len; +}; + +/* + * Describes a single packet for send. Each packet can have one or more + * buffers, but the total length (exclusive of IB headers) must be less + * than the MTU, and if using the PIO method, entire packet length, + * including IB headers, must be less than the ipath_piosize value (words). + * Use of this necessitates including sys/uio.h + */ +struct __ipath_sendpkt { + __u32 sps_flags; /* flags for packet (TBD) */ + __u32 sps_cnt; /* number of entries to use in sps_iov */ + /* array of iov's describing packet. TEMPORARY */ + struct ipath_iovec sps_iov[4]; +}; + +/* + * diagnostics can send a packet by "writing" one of the following + * two structs to diag data special file + * The first is the legacy version for backward compatibility + */ +struct ipath_diag_pkt { + __u32 unit; + __u64 data; + __u32 len; +}; + +/* The second diag_pkt struct is the expanded version that allows + * more control over the packet, specifically, by allowing a custom + * pbc (+ static rate) qword, so that special modes and deliberate + * changes to CRCs can be used. The elements were also re-ordered + * for better alignment and to avoid padding issues. + */ +struct ipath_diag_xpkt { + __u64 data; + __u64 pbc_wd; + __u32 unit; + __u32 len; +}; + +/* + * Data layout in I2C flash (for GUID, etc.) + * All fields are little-endian binary unless otherwise stated + */ +#define IPATH_FLASH_VERSION 2 +struct ipath_flash { + /* flash layout version (IPATH_FLASH_VERSION) */ + __u8 if_fversion; + /* checksum protecting if_length bytes */ + __u8 if_csum; + /* + * valid length (in use, protected by if_csum), including + * if_fversion and if_csum themselves) + */ + __u8 if_length; + /* the GUID, in network order */ + __u8 if_guid[8]; + /* number of GUIDs to use, starting from if_guid */ + __u8 if_numguid; + /* the (last 10 characters of) board serial number, in ASCII */ + char if_serial[12]; + /* board mfg date (YYYYMMDD ASCII) */ + char if_mfgdate[8]; + /* last board rework/test date (YYYYMMDD ASCII) */ + char if_testdate[8]; + /* logging of error counts, TBD */ + __u8 if_errcntp[4]; + /* powered on hours, updated at driver unload */ + __u8 if_powerhour[2]; + /* ASCII free-form comment field */ + char if_comment[32]; + /* Backwards compatible prefix for longer QLogic Serial Numbers */ + char if_sprefix[4]; + /* 82 bytes used, min flash size is 128 bytes */ + __u8 if_future[46]; +}; + +/* + * These are the counters implemented in the chip, and are listed in order. + * The InterCaps naming is taken straight from the chip spec. + */ +struct infinipath_counters { + __u64 LBIntCnt; + __u64 LBFlowStallCnt; + __u64 TxSDmaDescCnt; /* was Reserved1 */ + __u64 TxUnsupVLErrCnt; + __u64 TxDataPktCnt; + __u64 TxFlowPktCnt; + __u64 TxDwordCnt; + __u64 TxLenErrCnt; + __u64 TxMaxMinLenErrCnt; + __u64 TxUnderrunCnt; + __u64 TxFlowStallCnt; + __u64 TxDroppedPktCnt; + __u64 RxDroppedPktCnt; + __u64 RxDataPktCnt; + __u64 RxFlowPktCnt; + __u64 RxDwordCnt; + __u64 RxLenErrCnt; + __u64 RxMaxMinLenErrCnt; + __u64 RxICRCErrCnt; + __u64 RxVCRCErrCnt; + __u64 RxFlowCtrlErrCnt; + __u64 RxBadFormatCnt; + __u64 RxLinkProblemCnt; + __u64 RxEBPCnt; + __u64 RxLPCRCErrCnt; + __u64 RxBufOvflCnt; + __u64 RxTIDFullErrCnt; + __u64 RxTIDValidErrCnt; + __u64 RxPKeyMismatchCnt; + __u64 RxP0HdrEgrOvflCnt; + __u64 RxP1HdrEgrOvflCnt; + __u64 RxP2HdrEgrOvflCnt; + __u64 RxP3HdrEgrOvflCnt; + __u64 RxP4HdrEgrOvflCnt; + __u64 RxP5HdrEgrOvflCnt; + __u64 RxP6HdrEgrOvflCnt; + __u64 RxP7HdrEgrOvflCnt; + __u64 RxP8HdrEgrOvflCnt; + __u64 RxP9HdrEgrOvflCnt; /* was Reserved6 */ + __u64 RxP10HdrEgrOvflCnt; /* was Reserved7 */ + __u64 RxP11HdrEgrOvflCnt; /* new for IBA7220 */ + __u64 RxP12HdrEgrOvflCnt; /* new for IBA7220 */ + __u64 RxP13HdrEgrOvflCnt; /* new for IBA7220 */ + __u64 RxP14HdrEgrOvflCnt; /* new for IBA7220 */ + __u64 RxP15HdrEgrOvflCnt; /* new for IBA7220 */ + __u64 RxP16HdrEgrOvflCnt; /* new for IBA7220 */ + __u64 IBStatusChangeCnt; + __u64 IBLinkErrRecoveryCnt; + __u64 IBLinkDownedCnt; + __u64 IBSymbolErrCnt; + /* The following are new for IBA7220 */ + __u64 RxVL15DroppedPktCnt; + __u64 RxOtherLocalPhyErrCnt; + __u64 PcieRetryBufDiagQwordCnt; + __u64 ExcessBufferOvflCnt; + __u64 LocalLinkIntegrityErrCnt; + __u64 RxVlErrCnt; + __u64 RxDlidFltrCnt; +}; + +/* + * The next set of defines are for packet headers, and chip register + * and memory bits that are visible to and/or used by user-mode software + * The other bits that are used only by the driver or diags are in + * ipath_registers.h + */ + +/* RcvHdrFlags bits */ +#define INFINIPATH_RHF_LENGTH_MASK 0x7FF +#define INFINIPATH_RHF_LENGTH_SHIFT 0 +#define INFINIPATH_RHF_RCVTYPE_MASK 0x7 +#define INFINIPATH_RHF_RCVTYPE_SHIFT 11 +#define INFINIPATH_RHF_EGRINDEX_MASK 0xFFF +#define INFINIPATH_RHF_EGRINDEX_SHIFT 16 +#define INFINIPATH_RHF_SEQ_MASK 0xF +#define INFINIPATH_RHF_SEQ_SHIFT 0 +#define INFINIPATH_RHF_HDRQ_OFFSET_MASK 0x7FF +#define INFINIPATH_RHF_HDRQ_OFFSET_SHIFT 4 +#define INFINIPATH_RHF_H_ICRCERR 0x80000000 +#define INFINIPATH_RHF_H_VCRCERR 0x40000000 +#define INFINIPATH_RHF_H_PARITYERR 0x20000000 +#define INFINIPATH_RHF_H_LENERR 0x10000000 +#define INFINIPATH_RHF_H_MTUERR 0x08000000 +#define INFINIPATH_RHF_H_IHDRERR 0x04000000 +#define INFINIPATH_RHF_H_TIDERR 0x02000000 +#define INFINIPATH_RHF_H_MKERR 0x01000000 +#define INFINIPATH_RHF_H_IBERR 0x00800000 +#define INFINIPATH_RHF_H_ERR_MASK 0xFF800000 +#define INFINIPATH_RHF_L_USE_EGR 0x80000000 +#define INFINIPATH_RHF_L_SWA 0x00008000 +#define INFINIPATH_RHF_L_SWB 0x00004000 + +/* infinipath header fields */ +#define INFINIPATH_I_VERS_MASK 0xF +#define INFINIPATH_I_VERS_SHIFT 28 +#define INFINIPATH_I_PORT_MASK 0xF +#define INFINIPATH_I_PORT_SHIFT 24 +#define INFINIPATH_I_TID_MASK 0x7FF +#define INFINIPATH_I_TID_SHIFT 13 +#define INFINIPATH_I_OFFSET_MASK 0x1FFF +#define INFINIPATH_I_OFFSET_SHIFT 0 + +/* K_PktFlags bits */ +#define INFINIPATH_KPF_INTR 0x1 +#define INFINIPATH_KPF_SUBPORT_MASK 0x3 +#define INFINIPATH_KPF_SUBPORT_SHIFT 1 + +#define INFINIPATH_MAX_SUBPORT 4 + +/* SendPIO per-buffer control */ +#define INFINIPATH_SP_TEST 0x40 +#define INFINIPATH_SP_TESTEBP 0x20 +#define INFINIPATH_SP_TRIGGER_SHIFT 15 + +/* SendPIOAvail bits */ +#define INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT 1 +#define INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT 0 + +/* infinipath header format */ +struct ipath_header { + /* + * Version - 4 bits, Port - 4 bits, TID - 10 bits and Offset - + * 14 bits before ECO change ~28 Dec 03. After that, Vers 4, + * Port 4, TID 11, offset 13. + */ + __le32 ver_port_tid_offset; + __le16 chksum; + __le16 pkt_flags; +}; + +/* infinipath user message header format. + * This structure contains the first 4 fields common to all protocols + * that employ infinipath. + */ +struct ipath_message_header { + __be16 lrh[4]; + __be32 bth[3]; + /* fields below this point are in host byte order */ + struct ipath_header iph; + __u8 sub_opcode; +}; + +/* infinipath ethernet header format */ +struct ether_header { + __be16 lrh[4]; + __be32 bth[3]; + struct ipath_header iph; + __u8 sub_opcode; + __u8 cmd; + __be16 lid; + __u16 mac[3]; + __u8 frag_num; + __u8 seq_num; + __le32 len; + /* MUST be of word size due to PIO write requirements */ + __le32 csum; + __le16 csum_offset; + __le16 flags; + __u16 first_2_bytes; + __u8 unused[2]; /* currently unused */ +}; + + +/* IB - LRH header consts */ +#define IPATH_LRH_GRH 0x0003 /* 1. word of IB LRH - next header: GRH */ +#define IPATH_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */ + +/* misc. */ +#define SIZE_OF_CRC 1 + +#define IPATH_DEFAULT_P_KEY 0xFFFF +#define IPATH_PERMISSIVE_LID 0xFFFF +#define IPATH_AETH_CREDIT_SHIFT 24 +#define IPATH_AETH_CREDIT_MASK 0x1F +#define IPATH_AETH_CREDIT_INVAL 0x1F +#define IPATH_PSN_MASK 0xFFFFFF +#define IPATH_MSN_MASK 0xFFFFFF +#define IPATH_QPN_MASK 0xFFFFFF +#define IPATH_MULTICAST_LID_BASE 0xC000 +#define IPATH_EAGER_TID_ID INFINIPATH_I_TID_MASK +#define IPATH_MULTICAST_QPN 0xFFFFFF + +/* Receive Header Queue: receive type (from infinipath) */ +#define RCVHQ_RCV_TYPE_EXPECTED 0 +#define RCVHQ_RCV_TYPE_EAGER 1 +#define RCVHQ_RCV_TYPE_NON_KD 2 +#define RCVHQ_RCV_TYPE_ERROR 3 + + +/* sub OpCodes - ith4x */ +#define IPATH_ITH4X_OPCODE_ENCAP 0x81 +#define IPATH_ITH4X_OPCODE_LID_ARP 0x82 + +#define IPATH_HEADER_QUEUE_WORDS 9 + +/* functions for extracting fields from rcvhdrq entries for the driver. + */ +static inline __u32 ipath_hdrget_err_flags(const __le32 * rbuf) +{ + return __le32_to_cpu(rbuf[1]) & INFINIPATH_RHF_H_ERR_MASK; +} + +static inline __u32 ipath_hdrget_rcv_type(const __le32 * rbuf) +{ + return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_RCVTYPE_SHIFT) + & INFINIPATH_RHF_RCVTYPE_MASK; +} + +static inline __u32 ipath_hdrget_length_in_bytes(const __le32 * rbuf) +{ + return ((__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_LENGTH_SHIFT) + & INFINIPATH_RHF_LENGTH_MASK) << 2; +} + +static inline __u32 ipath_hdrget_index(const __le32 * rbuf) +{ + return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_EGRINDEX_SHIFT) + & INFINIPATH_RHF_EGRINDEX_MASK; +} + +static inline __u32 ipath_hdrget_seq(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_SEQ_SHIFT) + & INFINIPATH_RHF_SEQ_MASK; +} + +static inline __u32 ipath_hdrget_offset(const __le32 *rbuf) +{ + return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_HDRQ_OFFSET_SHIFT) + & INFINIPATH_RHF_HDRQ_OFFSET_MASK; +} + +static inline __u32 ipath_hdrget_use_egr_buf(const __le32 *rbuf) +{ + return __le32_to_cpu(rbuf[0]) & INFINIPATH_RHF_L_USE_EGR; +} + +static inline __u32 ipath_hdrget_ipath_ver(__le32 hdrword) +{ + return (__le32_to_cpu(hdrword) >> INFINIPATH_I_VERS_SHIFT) + & INFINIPATH_I_VERS_MASK; +} + +#endif /* _IPATH_COMMON_H */ diff --git a/drivers/staging/rdma/ipath/ipath_cq.c b/drivers/staging/rdma/ipath/ipath_cq.c new file mode 100644 index 000000000..e9dd9112e --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_cq.c @@ -0,0 +1,483 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "ipath_verbs.h" + +/** + * ipath_cq_enter - add a new entry to the completion queue + * @cq: completion queue + * @entry: work completion entry to add + * @sig: true if @entry is a solicitated entry + * + * This may be called with qp->s_lock held. + */ +void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited) +{ + struct ipath_cq_wc *wc; + unsigned long flags; + u32 head; + u32 next; + + spin_lock_irqsave(&cq->lock, flags); + + /* + * Note that the head pointer might be writable by user processes. + * Take care to verify it is a sane value. + */ + wc = cq->queue; + head = wc->head; + if (head >= (unsigned) cq->ibcq.cqe) { + head = cq->ibcq.cqe; + next = 0; + } else + next = head + 1; + if (unlikely(next == wc->tail)) { + spin_unlock_irqrestore(&cq->lock, flags); + if (cq->ibcq.event_handler) { + struct ib_event ev; + + ev.device = cq->ibcq.device; + ev.element.cq = &cq->ibcq; + ev.event = IB_EVENT_CQ_ERR; + cq->ibcq.event_handler(&ev, cq->ibcq.cq_context); + } + return; + } + if (cq->ip) { + wc->uqueue[head].wr_id = entry->wr_id; + wc->uqueue[head].status = entry->status; + wc->uqueue[head].opcode = entry->opcode; + wc->uqueue[head].vendor_err = entry->vendor_err; + wc->uqueue[head].byte_len = entry->byte_len; + wc->uqueue[head].ex.imm_data = (__u32 __force) entry->ex.imm_data; + wc->uqueue[head].qp_num = entry->qp->qp_num; + wc->uqueue[head].src_qp = entry->src_qp; + wc->uqueue[head].wc_flags = entry->wc_flags; + wc->uqueue[head].pkey_index = entry->pkey_index; + wc->uqueue[head].slid = entry->slid; + wc->uqueue[head].sl = entry->sl; + wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits; + wc->uqueue[head].port_num = entry->port_num; + /* Make sure entry is written before the head index. */ + smp_wmb(); + } else + wc->kqueue[head] = *entry; + wc->head = next; + + if (cq->notify == IB_CQ_NEXT_COMP || + (cq->notify == IB_CQ_SOLICITED && solicited)) { + cq->notify = IB_CQ_NONE; + cq->triggered++; + /* + * This will cause send_complete() to be called in + * another thread. + */ + tasklet_hi_schedule(&cq->comptask); + } + + spin_unlock_irqrestore(&cq->lock, flags); + + if (entry->status != IB_WC_SUCCESS) + to_idev(cq->ibcq.device)->n_wqe_errs++; +} + +/** + * ipath_poll_cq - poll for work completion entries + * @ibcq: the completion queue to poll + * @num_entries: the maximum number of entries to return + * @entry: pointer to array where work completions are placed + * + * Returns the number of completion entries polled. + * + * This may be called from interrupt context. Also called by ib_poll_cq() + * in the generic verbs code. + */ +int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) +{ + struct ipath_cq *cq = to_icq(ibcq); + struct ipath_cq_wc *wc; + unsigned long flags; + int npolled; + u32 tail; + + /* The kernel can only poll a kernel completion queue */ + if (cq->ip) { + npolled = -EINVAL; + goto bail; + } + + spin_lock_irqsave(&cq->lock, flags); + + wc = cq->queue; + tail = wc->tail; + if (tail > (u32) cq->ibcq.cqe) + tail = (u32) cq->ibcq.cqe; + for (npolled = 0; npolled < num_entries; ++npolled, ++entry) { + if (tail == wc->head) + break; + /* The kernel doesn't need a RMB since it has the lock. */ + *entry = wc->kqueue[tail]; + if (tail >= cq->ibcq.cqe) + tail = 0; + else + tail++; + } + wc->tail = tail; + + spin_unlock_irqrestore(&cq->lock, flags); + +bail: + return npolled; +} + +static void send_complete(unsigned long data) +{ + struct ipath_cq *cq = (struct ipath_cq *)data; + + /* + * The completion handler will most likely rearm the notification + * and poll for all pending entries. If a new completion entry + * is added while we are in this routine, tasklet_hi_schedule() + * won't call us again until we return so we check triggered to + * see if we need to call the handler again. + */ + for (;;) { + u8 triggered = cq->triggered; + + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); + + if (cq->triggered == triggered) + return; + } +} + +/** + * ipath_create_cq - create a completion queue + * @ibdev: the device this completion queue is attached to + * @attr: creation attributes + * @context: unused by the InfiniPath driver + * @udata: unused by the InfiniPath driver + * + * Returns a pointer to the completion queue or negative errno values + * for failure. + * + * Called by ib_create_cq() in the generic verbs code. + */ +struct ib_cq *ipath_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + int entries = attr->cqe; + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_cq *cq; + struct ipath_cq_wc *wc; + struct ib_cq *ret; + u32 sz; + + if (attr->flags) + return ERR_PTR(-EINVAL); + + if (entries < 1 || entries > ib_ipath_max_cqes) { + ret = ERR_PTR(-EINVAL); + goto done; + } + + /* Allocate the completion queue structure. */ + cq = kmalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) { + ret = ERR_PTR(-ENOMEM); + goto done; + } + + /* + * Allocate the completion queue entries and head/tail pointers. + * This is allocated separately so that it can be resized and + * also mapped into user space. + * We need to use vmalloc() in order to support mmap and large + * numbers of entries. + */ + sz = sizeof(*wc); + if (udata && udata->outlen >= sizeof(__u64)) + sz += sizeof(struct ib_uverbs_wc) * (entries + 1); + else + sz += sizeof(struct ib_wc) * (entries + 1); + wc = vmalloc_user(sz); + if (!wc) { + ret = ERR_PTR(-ENOMEM); + goto bail_cq; + } + + /* + * Return the address of the WC as the offset to mmap. + * See ipath_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + int err; + + cq->ip = ipath_create_mmap_info(dev, sz, context, wc); + if (!cq->ip) { + ret = ERR_PTR(-ENOMEM); + goto bail_wc; + } + + err = ib_copy_to_udata(udata, &cq->ip->offset, + sizeof(cq->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } else + cq->ip = NULL; + + spin_lock(&dev->n_cqs_lock); + if (dev->n_cqs_allocated == ib_ipath_max_cqs) { + spin_unlock(&dev->n_cqs_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + dev->n_cqs_allocated++; + spin_unlock(&dev->n_cqs_lock); + + if (cq->ip) { + spin_lock_irq(&dev->pending_lock); + list_add(&cq->ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + /* + * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe. + * The number of entries should be >= the number requested or return + * an error. + */ + cq->ibcq.cqe = entries; + cq->notify = IB_CQ_NONE; + cq->triggered = 0; + spin_lock_init(&cq->lock); + tasklet_init(&cq->comptask, send_complete, (unsigned long)cq); + wc->head = 0; + wc->tail = 0; + cq->queue = wc; + + ret = &cq->ibcq; + + goto done; + +bail_ip: + kfree(cq->ip); +bail_wc: + vfree(wc); +bail_cq: + kfree(cq); +done: + return ret; +} + +/** + * ipath_destroy_cq - destroy a completion queue + * @ibcq: the completion queue to destroy. + * + * Returns 0 for success. + * + * Called by ib_destroy_cq() in the generic verbs code. + */ +int ipath_destroy_cq(struct ib_cq *ibcq) +{ + struct ipath_ibdev *dev = to_idev(ibcq->device); + struct ipath_cq *cq = to_icq(ibcq); + + tasklet_kill(&cq->comptask); + spin_lock(&dev->n_cqs_lock); + dev->n_cqs_allocated--; + spin_unlock(&dev->n_cqs_lock); + if (cq->ip) + kref_put(&cq->ip->ref, ipath_release_mmap_info); + else + vfree(cq->queue); + kfree(cq); + + return 0; +} + +/** + * ipath_req_notify_cq - change the notification type for a completion queue + * @ibcq: the completion queue + * @notify_flags: the type of notification to request + * + * Returns 0 for success. + * + * This may be called from interrupt context. Also called by + * ib_req_notify_cq() in the generic verbs code. + */ +int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) +{ + struct ipath_cq *cq = to_icq(ibcq); + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&cq->lock, flags); + /* + * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow + * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2). + */ + if (cq->notify != IB_CQ_NEXT_COMP) + cq->notify = notify_flags & IB_CQ_SOLICITED_MASK; + + if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) && + cq->queue->head != cq->queue->tail) + ret = 1; + + spin_unlock_irqrestore(&cq->lock, flags); + + return ret; +} + +/** + * ipath_resize_cq - change the size of the CQ + * @ibcq: the completion queue + * + * Returns 0 for success. + */ +int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) +{ + struct ipath_cq *cq = to_icq(ibcq); + struct ipath_cq_wc *old_wc; + struct ipath_cq_wc *wc; + u32 head, tail, n; + int ret; + u32 sz; + + if (cqe < 1 || cqe > ib_ipath_max_cqes) { + ret = -EINVAL; + goto bail; + } + + /* + * Need to use vmalloc() if we want to support large #s of entries. + */ + sz = sizeof(*wc); + if (udata && udata->outlen >= sizeof(__u64)) + sz += sizeof(struct ib_uverbs_wc) * (cqe + 1); + else + sz += sizeof(struct ib_wc) * (cqe + 1); + wc = vmalloc_user(sz); + if (!wc) { + ret = -ENOMEM; + goto bail; + } + + /* Check that we can write the offset to mmap. */ + if (udata && udata->outlen >= sizeof(__u64)) { + __u64 offset = 0; + + ret = ib_copy_to_udata(udata, &offset, sizeof(offset)); + if (ret) + goto bail_free; + } + + spin_lock_irq(&cq->lock); + /* + * Make sure head and tail are sane since they + * might be user writable. + */ + old_wc = cq->queue; + head = old_wc->head; + if (head > (u32) cq->ibcq.cqe) + head = (u32) cq->ibcq.cqe; + tail = old_wc->tail; + if (tail > (u32) cq->ibcq.cqe) + tail = (u32) cq->ibcq.cqe; + if (head < tail) + n = cq->ibcq.cqe + 1 + head - tail; + else + n = head - tail; + if (unlikely((u32)cqe < n)) { + ret = -EINVAL; + goto bail_unlock; + } + for (n = 0; tail != head; n++) { + if (cq->ip) + wc->uqueue[n] = old_wc->uqueue[tail]; + else + wc->kqueue[n] = old_wc->kqueue[tail]; + if (tail == (u32) cq->ibcq.cqe) + tail = 0; + else + tail++; + } + cq->ibcq.cqe = cqe; + wc->head = n; + wc->tail = 0; + cq->queue = wc; + spin_unlock_irq(&cq->lock); + + vfree(old_wc); + + if (cq->ip) { + struct ipath_ibdev *dev = to_idev(ibcq->device); + struct ipath_mmap_info *ip = cq->ip; + + ipath_update_mmap_info(dev, ip, sz, wc); + + /* + * Return the offset to mmap. + * See ipath_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + ret = ib_copy_to_udata(udata, &ip->offset, + sizeof(ip->offset)); + if (ret) + goto bail; + } + + spin_lock_irq(&dev->pending_lock); + if (list_empty(&ip->pending_mmaps)) + list_add(&ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + ret = 0; + goto bail; + +bail_unlock: + spin_unlock_irq(&cq->lock); +bail_free: + vfree(wc); +bail: + return ret; +} diff --git a/drivers/staging/rdma/ipath/ipath_debug.h b/drivers/staging/rdma/ipath/ipath_debug.h new file mode 100644 index 000000000..65926cd35 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_debug.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_DEBUG_H +#define _IPATH_DEBUG_H + +#ifndef _IPATH_DEBUGGING /* debugging enabled or not */ +#define _IPATH_DEBUGGING 1 +#endif + +#if _IPATH_DEBUGGING + +/* + * Mask values for debugging. The scheme allows us to compile out any + * of the debug tracing stuff, and if compiled in, to enable or disable + * dynamically. This can be set at modprobe time also: + * modprobe infinipath.ko infinipath_debug=7 + */ + +#define __IPATH_INFO 0x1 /* generic low verbosity stuff */ +#define __IPATH_DBG 0x2 /* generic debug */ +#define __IPATH_TRSAMPLE 0x8 /* generate trace buffer sample entries */ +/* leave some low verbosity spots open */ +#define __IPATH_VERBDBG 0x40 /* very verbose debug */ +#define __IPATH_PKTDBG 0x80 /* print packet data */ +/* print process startup (init)/exit messages */ +#define __IPATH_PROCDBG 0x100 +/* print mmap/fault stuff, not using VDBG any more */ +#define __IPATH_MMDBG 0x200 +#define __IPATH_ERRPKTDBG 0x400 +#define __IPATH_USER_SEND 0x1000 /* use user mode send */ +#define __IPATH_KERNEL_SEND 0x2000 /* use kernel mode send */ +#define __IPATH_EPKTDBG 0x4000 /* print ethernet packet data */ +#define __IPATH_IPATHDBG 0x10000 /* Ethernet (IPATH) gen debug */ +#define __IPATH_IPATHWARN 0x20000 /* Ethernet (IPATH) warnings */ +#define __IPATH_IPATHERR 0x40000 /* Ethernet (IPATH) errors */ +#define __IPATH_IPATHPD 0x80000 /* Ethernet (IPATH) packet dump */ +#define __IPATH_IPATHTABLE 0x100000 /* Ethernet (IPATH) table dump */ +#define __IPATH_LINKVERBDBG 0x200000 /* very verbose linkchange debug */ + +#else /* _IPATH_DEBUGGING */ + +/* + * define all of these even with debugging off, for the few places that do + * if(infinipath_debug & _IPATH_xyzzy), but in a way that will make the + * compiler eliminate the code + */ + +#define __IPATH_INFO 0x0 /* generic low verbosity stuff */ +#define __IPATH_DBG 0x0 /* generic debug */ +#define __IPATH_TRSAMPLE 0x0 /* generate trace buffer sample entries */ +#define __IPATH_VERBDBG 0x0 /* very verbose debug */ +#define __IPATH_PKTDBG 0x0 /* print packet data */ +#define __IPATH_PROCDBG 0x0 /* process startup (init)/exit messages */ +/* print mmap/fault stuff, not using VDBG any more */ +#define __IPATH_MMDBG 0x0 +#define __IPATH_EPKTDBG 0x0 /* print ethernet packet data */ +#define __IPATH_IPATHDBG 0x0 /* Ethernet (IPATH) table dump on */ +#define __IPATH_IPATHWARN 0x0 /* Ethernet (IPATH) warnings on */ +#define __IPATH_IPATHERR 0x0 /* Ethernet (IPATH) errors on */ +#define __IPATH_IPATHPD 0x0 /* Ethernet (IPATH) packet dump on */ +#define __IPATH_IPATHTABLE 0x0 /* Ethernet (IPATH) packet dump on */ +#define __IPATH_LINKVERBDBG 0x0 /* very verbose linkchange debug */ + +#endif /* _IPATH_DEBUGGING */ + +#define __IPATH_VERBOSEDBG __IPATH_VERBDBG + +#endif /* _IPATH_DEBUG_H */ diff --git a/drivers/staging/rdma/ipath/ipath_diag.c b/drivers/staging/rdma/ipath/ipath_diag.c new file mode 100644 index 000000000..45802e973 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_diag.c @@ -0,0 +1,551 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file contains support for diagnostic functions. It is accessed by + * opening the ipath_diag device, normally minor number 129. Diagnostic use + * of the InfiniPath chip may render the chip or board unusable until the + * driver is unloaded, or in some cases, until the system is rebooted. + * + * Accesses to the chip through this interface are not similar to going + * through the /sys/bus/pci resource mmap interface. + */ + +#include +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" +#include "ipath_common.h" + +int ipath_diag_inuse; +static int diag_set_link; + +static int ipath_diag_open(struct inode *in, struct file *fp); +static int ipath_diag_release(struct inode *in, struct file *fp); +static ssize_t ipath_diag_read(struct file *fp, char __user *data, + size_t count, loff_t *off); +static ssize_t ipath_diag_write(struct file *fp, const char __user *data, + size_t count, loff_t *off); + +static const struct file_operations diag_file_ops = { + .owner = THIS_MODULE, + .write = ipath_diag_write, + .read = ipath_diag_read, + .open = ipath_diag_open, + .release = ipath_diag_release, + .llseek = default_llseek, +}; + +static ssize_t ipath_diagpkt_write(struct file *fp, + const char __user *data, + size_t count, loff_t *off); + +static const struct file_operations diagpkt_file_ops = { + .owner = THIS_MODULE, + .write = ipath_diagpkt_write, + .llseek = noop_llseek, +}; + +static atomic_t diagpkt_count = ATOMIC_INIT(0); +static struct cdev *diagpkt_cdev; +static struct device *diagpkt_dev; + +int ipath_diag_add(struct ipath_devdata *dd) +{ + char name[16]; + int ret = 0; + + if (atomic_inc_return(&diagpkt_count) == 1) { + ret = ipath_cdev_init(IPATH_DIAGPKT_MINOR, + "ipath_diagpkt", &diagpkt_file_ops, + &diagpkt_cdev, &diagpkt_dev); + + if (ret) { + ipath_dev_err(dd, "Couldn't create ipath_diagpkt " + "device: %d", ret); + goto done; + } + } + + snprintf(name, sizeof(name), "ipath_diag%d", dd->ipath_unit); + + ret = ipath_cdev_init(IPATH_DIAG_MINOR_BASE + dd->ipath_unit, name, + &diag_file_ops, &dd->diag_cdev, + &dd->diag_dev); + if (ret) + ipath_dev_err(dd, "Couldn't create %s device: %d", + name, ret); + +done: + return ret; +} + +void ipath_diag_remove(struct ipath_devdata *dd) +{ + if (atomic_dec_and_test(&diagpkt_count)) + ipath_cdev_cleanup(&diagpkt_cdev, &diagpkt_dev); + + ipath_cdev_cleanup(&dd->diag_cdev, &dd->diag_dev); +} + +/** + * ipath_read_umem64 - read a 64-bit quantity from the chip into user space + * @dd: the infinipath device + * @uaddr: the location to store the data in user memory + * @caddr: the source chip address (full pointer, not offset) + * @count: number of bytes to copy (multiple of 32 bits) + * + * This function also localizes all chip memory accesses. + * The copy should be written such that we read full cacheline packets + * from the chip. This is usually used for a single qword + * + * NOTE: This assumes the chip address is 64-bit aligned. + */ +static int ipath_read_umem64(struct ipath_devdata *dd, void __user *uaddr, + const void __iomem *caddr, size_t count) +{ + const u64 __iomem *reg_addr = caddr; + const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64)); + int ret; + + /* not very efficient, but it works for now */ + if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) { + ret = -EINVAL; + goto bail; + } + while (reg_addr < reg_end) { + u64 data = readq(reg_addr); + if (copy_to_user(uaddr, &data, sizeof(u64))) { + ret = -EFAULT; + goto bail; + } + reg_addr++; + uaddr += sizeof(u64); + } + ret = 0; +bail: + return ret; +} + +/** + * ipath_write_umem64 - write a 64-bit quantity to the chip from user space + * @dd: the infinipath device + * @caddr: the destination chip address (full pointer, not offset) + * @uaddr: the source of the data in user memory + * @count: the number of bytes to copy (multiple of 32 bits) + * + * This is usually used for a single qword + * NOTE: This assumes the chip address is 64-bit aligned. + */ + +static int ipath_write_umem64(struct ipath_devdata *dd, void __iomem *caddr, + const void __user *uaddr, size_t count) +{ + u64 __iomem *reg_addr = caddr; + const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64)); + int ret; + + /* not very efficient, but it works for now */ + if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) { + ret = -EINVAL; + goto bail; + } + while (reg_addr < reg_end) { + u64 data; + if (copy_from_user(&data, uaddr, sizeof(data))) { + ret = -EFAULT; + goto bail; + } + writeq(data, reg_addr); + + reg_addr++; + uaddr += sizeof(u64); + } + ret = 0; +bail: + return ret; +} + +/** + * ipath_read_umem32 - read a 32-bit quantity from the chip into user space + * @dd: the infinipath device + * @uaddr: the location to store the data in user memory + * @caddr: the source chip address (full pointer, not offset) + * @count: number of bytes to copy + * + * read 32 bit values, not 64 bit; for memories that only + * support 32 bit reads; usually a single dword. + */ +static int ipath_read_umem32(struct ipath_devdata *dd, void __user *uaddr, + const void __iomem *caddr, size_t count) +{ + const u32 __iomem *reg_addr = caddr; + const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32)); + int ret; + + if (reg_addr < (u32 __iomem *) dd->ipath_kregbase || + reg_end > (u32 __iomem *) dd->ipath_kregend) { + ret = -EINVAL; + goto bail; + } + /* not very efficient, but it works for now */ + while (reg_addr < reg_end) { + u32 data = readl(reg_addr); + if (copy_to_user(uaddr, &data, sizeof(data))) { + ret = -EFAULT; + goto bail; + } + + reg_addr++; + uaddr += sizeof(u32); + + } + ret = 0; +bail: + return ret; +} + +/** + * ipath_write_umem32 - write a 32-bit quantity to the chip from user space + * @dd: the infinipath device + * @caddr: the destination chip address (full pointer, not offset) + * @uaddr: the source of the data in user memory + * @count: number of bytes to copy + * + * write 32 bit values, not 64 bit; for memories that only + * support 32 bit write; usually a single dword. + */ + +static int ipath_write_umem32(struct ipath_devdata *dd, void __iomem *caddr, + const void __user *uaddr, size_t count) +{ + u32 __iomem *reg_addr = caddr; + const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32)); + int ret; + + if (reg_addr < (u32 __iomem *) dd->ipath_kregbase || + reg_end > (u32 __iomem *) dd->ipath_kregend) { + ret = -EINVAL; + goto bail; + } + while (reg_addr < reg_end) { + u32 data; + if (copy_from_user(&data, uaddr, sizeof(data))) { + ret = -EFAULT; + goto bail; + } + writel(data, reg_addr); + + reg_addr++; + uaddr += sizeof(u32); + } + ret = 0; +bail: + return ret; +} + +static int ipath_diag_open(struct inode *in, struct file *fp) +{ + int unit = iminor(in) - IPATH_DIAG_MINOR_BASE; + struct ipath_devdata *dd; + int ret; + + mutex_lock(&ipath_mutex); + + if (ipath_diag_inuse) { + ret = -EBUSY; + goto bail; + } + + dd = ipath_lookup(unit); + + if (dd == NULL || !(dd->ipath_flags & IPATH_PRESENT) || + !dd->ipath_kregbase) { + ret = -ENODEV; + goto bail; + } + + fp->private_data = dd; + ipath_diag_inuse = -2; + diag_set_link = 0; + ret = 0; + + /* Only expose a way to reset the device if we + make it into diag mode. */ + ipath_expose_reset(&dd->pcidev->dev); + +bail: + mutex_unlock(&ipath_mutex); + + return ret; +} + +/** + * ipath_diagpkt_write - write an IB packet + * @fp: the diag data device file pointer + * @data: ipath_diag_pkt structure saying where to get the packet + * @count: size of data to write + * @off: unused by this code + */ +static ssize_t ipath_diagpkt_write(struct file *fp, + const char __user *data, + size_t count, loff_t *off) +{ + u32 __iomem *piobuf; + u32 plen, pbufn, maxlen_reserve; + struct ipath_diag_pkt odp; + struct ipath_diag_xpkt dp; + u32 *tmpbuf = NULL; + struct ipath_devdata *dd; + ssize_t ret = 0; + u64 val; + u32 l_state, lt_state; /* LinkState, LinkTrainingState */ + + + if (count == sizeof(dp)) { + if (copy_from_user(&dp, data, sizeof(dp))) { + ret = -EFAULT; + goto bail; + } + } else if (count == sizeof(odp)) { + if (copy_from_user(&odp, data, sizeof(odp))) { + ret = -EFAULT; + goto bail; + } + dp.len = odp.len; + dp.unit = odp.unit; + dp.data = odp.data; + dp.pbc_wd = 0; + } else { + ret = -EINVAL; + goto bail; + } + + /* send count must be an exact number of dwords */ + if (dp.len & 3) { + ret = -EINVAL; + goto bail; + } + + plen = dp.len >> 2; + + dd = ipath_lookup(dp.unit); + if (!dd || !(dd->ipath_flags & IPATH_PRESENT) || + !dd->ipath_kregbase) { + ipath_cdbg(VERBOSE, "illegal unit %u for diag data send\n", + dp.unit); + ret = -ENODEV; + goto bail; + } + + if (ipath_diag_inuse && !diag_set_link && + !(dd->ipath_flags & IPATH_LINKACTIVE)) { + diag_set_link = 1; + ipath_cdbg(VERBOSE, "Trying to set to set link active for " + "diag pkt\n"); + ipath_set_linkstate(dd, IPATH_IB_LINKARM); + ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE); + } + + if (!(dd->ipath_flags & IPATH_INITTED)) { + /* no hardware, freeze, etc. */ + ipath_cdbg(VERBOSE, "unit %u not usable\n", dd->ipath_unit); + ret = -ENODEV; + goto bail; + } + /* + * Want to skip check for l_state if using custom PBC, + * because we might be trying to force an SM packet out. + * first-cut, skip _all_ state checking in that case. + */ + val = ipath_ib_state(dd, dd->ipath_lastibcstat); + lt_state = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat); + l_state = ipath_ib_linkstate(dd, dd->ipath_lastibcstat); + if (!dp.pbc_wd && (lt_state != INFINIPATH_IBCS_LT_STATE_LINKUP || + (val != dd->ib_init && val != dd->ib_arm && + val != dd->ib_active))) { + ipath_cdbg(VERBOSE, "unit %u not ready (state %llx)\n", + dd->ipath_unit, (unsigned long long) val); + ret = -EINVAL; + goto bail; + } + + /* + * need total length before first word written, plus 2 Dwords. One Dword + * is for padding so we get the full user data when not aligned on + * a word boundary. The other Dword is to make sure we have room for the + * ICRC which gets tacked on later. + */ + maxlen_reserve = 2 * sizeof(u32); + if (dp.len > dd->ipath_ibmaxlen - maxlen_reserve) { + ipath_dbg("Pkt len 0x%x > ibmaxlen %x\n", + dp.len, dd->ipath_ibmaxlen); + ret = -EINVAL; + goto bail; + } + + plen = sizeof(u32) + dp.len; + + tmpbuf = vmalloc(plen); + if (!tmpbuf) { + dev_info(&dd->pcidev->dev, "Unable to allocate tmp buffer, " + "failing\n"); + ret = -ENOMEM; + goto bail; + } + + if (copy_from_user(tmpbuf, + (const void __user *) (unsigned long) dp.data, + dp.len)) { + ret = -EFAULT; + goto bail; + } + + plen >>= 2; /* in dwords */ + + piobuf = ipath_getpiobuf(dd, plen, &pbufn); + if (!piobuf) { + ipath_cdbg(VERBOSE, "No PIO buffers avail unit for %u\n", + dd->ipath_unit); + ret = -EBUSY; + goto bail; + } + /* disarm it just to be extra sure */ + ipath_disarm_piobufs(dd, pbufn, 1); + + if (ipath_debug & __IPATH_PKTDBG) + ipath_cdbg(VERBOSE, "unit %u 0x%x+1w pio%d\n", + dd->ipath_unit, plen - 1, pbufn); + + if (dp.pbc_wd == 0) + dp.pbc_wd = plen; + writeq(dp.pbc_wd, piobuf); + /* + * Copy all by the trigger word, then flush, so it's written + * to chip before trigger word, then write trigger word, then + * flush again, so packet is sent. + */ + if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) { + ipath_flush_wc(); + __iowrite32_copy(piobuf + 2, tmpbuf, plen - 1); + ipath_flush_wc(); + __raw_writel(tmpbuf[plen - 1], piobuf + plen + 1); + } else + __iowrite32_copy(piobuf + 2, tmpbuf, plen); + + ipath_flush_wc(); + + ret = sizeof(dp); + +bail: + vfree(tmpbuf); + return ret; +} + +static int ipath_diag_release(struct inode *in, struct file *fp) +{ + mutex_lock(&ipath_mutex); + ipath_diag_inuse = 0; + fp->private_data = NULL; + mutex_unlock(&ipath_mutex); + return 0; +} + +static ssize_t ipath_diag_read(struct file *fp, char __user *data, + size_t count, loff_t *off) +{ + struct ipath_devdata *dd = fp->private_data; + void __iomem *kreg_base; + ssize_t ret; + + kreg_base = dd->ipath_kregbase; + + if (count == 0) + ret = 0; + else if ((count % 4) || (*off % 4)) + /* address or length is not 32-bit aligned, hence invalid */ + ret = -EINVAL; + else if (ipath_diag_inuse < 1 && (*off || count != 8)) + ret = -EINVAL; /* prevent cat /dev/ipath_diag* */ + else if ((count % 8) || (*off % 8)) + /* address or length not 64-bit aligned; do 32-bit reads */ + ret = ipath_read_umem32(dd, data, kreg_base + *off, count); + else + ret = ipath_read_umem64(dd, data, kreg_base + *off, count); + + if (ret >= 0) { + *off += count; + ret = count; + if (ipath_diag_inuse == -2) + ipath_diag_inuse++; + } + + return ret; +} + +static ssize_t ipath_diag_write(struct file *fp, const char __user *data, + size_t count, loff_t *off) +{ + struct ipath_devdata *dd = fp->private_data; + void __iomem *kreg_base; + ssize_t ret; + + kreg_base = dd->ipath_kregbase; + + if (count == 0) + ret = 0; + else if ((count % 4) || (*off % 4)) + /* address or length is not 32-bit aligned, hence invalid */ + ret = -EINVAL; + else if ((ipath_diag_inuse == -1 && (*off || count != 8)) || + ipath_diag_inuse == -2) /* read qw off 0, write qw off 0 */ + ret = -EINVAL; /* before any other write allowed */ + else if ((count % 8) || (*off % 8)) + /* address or length not 64-bit aligned; do 32-bit writes */ + ret = ipath_write_umem32(dd, kreg_base + *off, data, count); + else + ret = ipath_write_umem64(dd, kreg_base + *off, data, count); + + if (ret >= 0) { + *off += count; + ret = count; + if (ipath_diag_inuse == -1) + ipath_diag_inuse = 1; /* all read/write OK now */ + } + + return ret; +} diff --git a/drivers/staging/rdma/ipath/ipath_dma.c b/drivers/staging/rdma/ipath/ipath_dma.c new file mode 100644 index 000000000..123a8c053 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_dma.c @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2006 QLogic, Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "ipath_verbs.h" + +#define BAD_DMA_ADDRESS ((u64) 0) + +/* + * The following functions implement driver specific replacements + * for the ib_dma_*() functions. + * + * These functions return kernel virtual addresses instead of + * device bus addresses since the driver uses the CPU to copy + * data instead of using hardware DMA. + */ + +static int ipath_mapping_error(struct ib_device *dev, u64 dma_addr) +{ + return dma_addr == BAD_DMA_ADDRESS; +} + +static u64 ipath_dma_map_single(struct ib_device *dev, + void *cpu_addr, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); + return (u64) cpu_addr; +} + +static void ipath_dma_unmap_single(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); +} + +static u64 ipath_dma_map_page(struct ib_device *dev, + struct page *page, + unsigned long offset, + size_t size, + enum dma_data_direction direction) +{ + u64 addr; + + BUG_ON(!valid_dma_direction(direction)); + + if (offset + size > PAGE_SIZE) { + addr = BAD_DMA_ADDRESS; + goto done; + } + + addr = (u64) page_address(page); + if (addr) + addr += offset; + /* TODO: handle highmem pages */ + +done: + return addr; +} + +static void ipath_dma_unmap_page(struct ib_device *dev, + u64 addr, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); +} + +static int ipath_map_sg(struct ib_device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction direction) +{ + struct scatterlist *sg; + u64 addr; + int i; + int ret = nents; + + BUG_ON(!valid_dma_direction(direction)); + + for_each_sg(sgl, sg, nents, i) { + addr = (u64) page_address(sg_page(sg)); + /* TODO: handle highmem pages */ + if (!addr) { + ret = 0; + break; + } + sg->dma_address = addr + sg->offset; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + sg->dma_length = sg->length; +#endif + } + return ret; +} + +static void ipath_unmap_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + BUG_ON(!valid_dma_direction(direction)); +} + +static void ipath_sync_single_for_cpu(struct ib_device *dev, + u64 addr, + size_t size, + enum dma_data_direction dir) +{ +} + +static void ipath_sync_single_for_device(struct ib_device *dev, + u64 addr, + size_t size, + enum dma_data_direction dir) +{ +} + +static void *ipath_dma_alloc_coherent(struct ib_device *dev, size_t size, + u64 *dma_handle, gfp_t flag) +{ + struct page *p; + void *addr = NULL; + + p = alloc_pages(flag, get_order(size)); + if (p) + addr = page_address(p); + if (dma_handle) + *dma_handle = (u64) addr; + return addr; +} + +static void ipath_dma_free_coherent(struct ib_device *dev, size_t size, + void *cpu_addr, u64 dma_handle) +{ + free_pages((unsigned long) cpu_addr, get_order(size)); +} + +struct ib_dma_mapping_ops ipath_dma_mapping_ops = { + .mapping_error = ipath_mapping_error, + .map_single = ipath_dma_map_single, + .unmap_single = ipath_dma_unmap_single, + .map_page = ipath_dma_map_page, + .unmap_page = ipath_dma_unmap_page, + .map_sg = ipath_map_sg, + .unmap_sg = ipath_unmap_sg, + .sync_single_for_cpu = ipath_sync_single_for_cpu, + .sync_single_for_device = ipath_sync_single_for_device, + .alloc_coherent = ipath_dma_alloc_coherent, + .free_coherent = ipath_dma_free_coherent +}; diff --git a/drivers/staging/rdma/ipath/ipath_driver.c b/drivers/staging/rdma/ipath/ipath_driver.c new file mode 100644 index 000000000..871dbe562 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_driver.c @@ -0,0 +1,2789 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_X86_64 +#include +#endif + +#include "ipath_kernel.h" +#include "ipath_verbs.h" + +static void ipath_update_pio_bufs(struct ipath_devdata *); + +const char *ipath_get_unit_name(int unit) +{ + static char iname[16]; + snprintf(iname, sizeof iname, "infinipath%u", unit); + return iname; +} + +#define DRIVER_LOAD_MSG "QLogic " IPATH_DRV_NAME " loaded: " +#define PFX IPATH_DRV_NAME ": " + +/* + * The size has to be longer than this string, so we can append + * board/chip information to it in the init code. + */ +const char ib_ipath_version[] = IPATH_IDSTR "\n"; + +static struct idr unit_table; +DEFINE_SPINLOCK(ipath_devs_lock); +LIST_HEAD(ipath_dev_list); + +wait_queue_head_t ipath_state_wait; + +unsigned ipath_debug = __IPATH_INFO; + +module_param_named(debug, ipath_debug, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(debug, "mask for debug prints"); +EXPORT_SYMBOL_GPL(ipath_debug); + +unsigned ipath_mtu4096 = 1; /* max 4KB IB mtu by default, if supported */ +module_param_named(mtu4096, ipath_mtu4096, uint, S_IRUGO); +MODULE_PARM_DESC(mtu4096, "enable MTU of 4096 bytes, if supported"); + +static unsigned ipath_hol_timeout_ms = 13000; +module_param_named(hol_timeout_ms, ipath_hol_timeout_ms, uint, S_IRUGO); +MODULE_PARM_DESC(hol_timeout_ms, + "duration of user app suspension after link failure"); + +unsigned ipath_linkrecovery = 1; +module_param_named(linkrecovery, ipath_linkrecovery, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(linkrecovery, "enable workaround for link recovery issue"); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("QLogic "); +MODULE_DESCRIPTION("QLogic InfiniPath driver"); + +/* + * Table to translate the LINKTRAININGSTATE portion of + * IBCStatus to a human-readable form. + */ +const char *ipath_ibcstatus_str[] = { + "Disabled", + "LinkUp", + "PollActive", + "PollQuiet", + "SleepDelay", + "SleepQuiet", + "LState6", /* unused */ + "LState7", /* unused */ + "CfgDebounce", + "CfgRcvfCfg", + "CfgWaitRmt", + "CfgIdle", + "RecovRetrain", + "CfgTxRevLane", /* unused before IBA7220 */ + "RecovWaitRmt", + "RecovIdle", + /* below were added for IBA7220 */ + "CfgEnhanced", + "CfgTest", + "CfgWaitRmtTest", + "CfgWaitCfgEnhanced", + "SendTS_T", + "SendTstIdles", + "RcvTS_T", + "SendTst_TS1s", + "LTState18", "LTState19", "LTState1A", "LTState1B", + "LTState1C", "LTState1D", "LTState1E", "LTState1F" +}; + +static void ipath_remove_one(struct pci_dev *); +static int ipath_init_one(struct pci_dev *, const struct pci_device_id *); + +/* Only needed for registration, nothing else needs this info */ +#define PCI_VENDOR_ID_PATHSCALE 0x1fc1 +#define PCI_DEVICE_ID_INFINIPATH_HT 0xd + +/* Number of seconds before our card status check... */ +#define STATUS_TIMEOUT 60 + +static const struct pci_device_id ipath_pci_tbl[] = { + { PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_INFINIPATH_HT) }, + { 0, } +}; + +MODULE_DEVICE_TABLE(pci, ipath_pci_tbl); + +static struct pci_driver ipath_driver = { + .name = IPATH_DRV_NAME, + .probe = ipath_init_one, + .remove = ipath_remove_one, + .id_table = ipath_pci_tbl, + .driver = { + .groups = ipath_driver_attr_groups, + }, +}; + +static inline void read_bars(struct ipath_devdata *dd, struct pci_dev *dev, + u32 *bar0, u32 *bar1) +{ + int ret; + + ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_0, bar0); + if (ret) + ipath_dev_err(dd, "failed to read bar0 before enable: " + "error %d\n", -ret); + + ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_1, bar1); + if (ret) + ipath_dev_err(dd, "failed to read bar1 before enable: " + "error %d\n", -ret); + + ipath_dbg("Read bar0 %x bar1 %x\n", *bar0, *bar1); +} + +static void ipath_free_devdata(struct pci_dev *pdev, + struct ipath_devdata *dd) +{ + unsigned long flags; + + pci_set_drvdata(pdev, NULL); + + if (dd->ipath_unit != -1) { + spin_lock_irqsave(&ipath_devs_lock, flags); + idr_remove(&unit_table, dd->ipath_unit); + list_del(&dd->ipath_list); + spin_unlock_irqrestore(&ipath_devs_lock, flags); + } + vfree(dd); +} + +static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev) +{ + unsigned long flags; + struct ipath_devdata *dd; + int ret; + + dd = vzalloc(sizeof(*dd)); + if (!dd) { + dd = ERR_PTR(-ENOMEM); + goto bail; + } + dd->ipath_unit = -1; + + idr_preload(GFP_KERNEL); + spin_lock_irqsave(&ipath_devs_lock, flags); + + ret = idr_alloc(&unit_table, dd, 0, 0, GFP_NOWAIT); + if (ret < 0) { + printk(KERN_ERR IPATH_DRV_NAME + ": Could not allocate unit ID: error %d\n", -ret); + ipath_free_devdata(pdev, dd); + dd = ERR_PTR(ret); + goto bail_unlock; + } + dd->ipath_unit = ret; + + dd->pcidev = pdev; + pci_set_drvdata(pdev, dd); + + list_add(&dd->ipath_list, &ipath_dev_list); + +bail_unlock: + spin_unlock_irqrestore(&ipath_devs_lock, flags); + idr_preload_end(); +bail: + return dd; +} + +static inline struct ipath_devdata *__ipath_lookup(int unit) +{ + return idr_find(&unit_table, unit); +} + +struct ipath_devdata *ipath_lookup(int unit) +{ + struct ipath_devdata *dd; + unsigned long flags; + + spin_lock_irqsave(&ipath_devs_lock, flags); + dd = __ipath_lookup(unit); + spin_unlock_irqrestore(&ipath_devs_lock, flags); + + return dd; +} + +int ipath_count_units(int *npresentp, int *nupp, int *maxportsp) +{ + int nunits, npresent, nup; + struct ipath_devdata *dd; + unsigned long flags; + int maxports; + + nunits = npresent = nup = maxports = 0; + + spin_lock_irqsave(&ipath_devs_lock, flags); + + list_for_each_entry(dd, &ipath_dev_list, ipath_list) { + nunits++; + if ((dd->ipath_flags & IPATH_PRESENT) && dd->ipath_kregbase) + npresent++; + if (dd->ipath_lid && + !(dd->ipath_flags & (IPATH_DISABLED | IPATH_LINKDOWN + | IPATH_LINKUNK))) + nup++; + if (dd->ipath_cfgports > maxports) + maxports = dd->ipath_cfgports; + } + + spin_unlock_irqrestore(&ipath_devs_lock, flags); + + if (npresentp) + *npresentp = npresent; + if (nupp) + *nupp = nup; + if (maxportsp) + *maxportsp = maxports; + + return nunits; +} + +/* + * These next two routines are placeholders in case we don't have per-arch + * code for controlling write combining. If explicit control of write + * combining is not available, performance will probably be awful. + */ + +int __attribute__((weak)) ipath_enable_wc(struct ipath_devdata *dd) +{ + return -EOPNOTSUPP; +} + +void __attribute__((weak)) ipath_disable_wc(struct ipath_devdata *dd) +{ +} + +/* + * Perform a PIO buffer bandwidth write test, to verify proper system + * configuration. Even when all the setup calls work, occasionally + * BIOS or other issues can prevent write combining from working, or + * can cause other bandwidth problems to the chip. + * + * This test simply writes the same buffer over and over again, and + * measures close to the peak bandwidth to the chip (not testing + * data bandwidth to the wire). On chips that use an address-based + * trigger to send packets to the wire, this is easy. On chips that + * use a count to trigger, we want to make sure that the packet doesn't + * go out on the wire, or trigger flow control checks. + */ +static void ipath_verify_pioperf(struct ipath_devdata *dd) +{ + u32 pbnum, cnt, lcnt; + u32 __iomem *piobuf; + u32 *addr; + u64 msecs, emsecs; + + piobuf = ipath_getpiobuf(dd, 0, &pbnum); + if (!piobuf) { + dev_info(&dd->pcidev->dev, + "No PIObufs for checking perf, skipping\n"); + return; + } + + /* + * Enough to give us a reasonable test, less than piobuf size, and + * likely multiple of store buffer length. + */ + cnt = 1024; + + addr = vmalloc(cnt); + if (!addr) { + dev_info(&dd->pcidev->dev, + "Couldn't get memory for checking PIO perf," + " skipping\n"); + goto done; + } + + preempt_disable(); /* we want reasonably accurate elapsed time */ + msecs = 1 + jiffies_to_msecs(jiffies); + for (lcnt = 0; lcnt < 10000U; lcnt++) { + /* wait until we cross msec boundary */ + if (jiffies_to_msecs(jiffies) >= msecs) + break; + udelay(1); + } + + ipath_disable_armlaunch(dd); + + /* + * length 0, no dwords actually sent, and mark as VL15 + * on chips where that may matter (due to IB flowcontrol) + */ + if ((dd->ipath_flags & IPATH_HAS_PBC_CNT)) + writeq(1UL << 63, piobuf); + else + writeq(0, piobuf); + ipath_flush_wc(); + + /* + * this is only roughly accurate, since even with preempt we + * still take interrupts that could take a while. Running for + * >= 5 msec seems to get us "close enough" to accurate values + */ + msecs = jiffies_to_msecs(jiffies); + for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) { + __iowrite32_copy(piobuf + 64, addr, cnt >> 2); + emsecs = jiffies_to_msecs(jiffies) - msecs; + } + + /* 1 GiB/sec, slightly over IB SDR line rate */ + if (lcnt < (emsecs * 1024U)) + ipath_dev_err(dd, + "Performance problem: bandwidth to PIO buffers is " + "only %u MiB/sec\n", + lcnt / (u32) emsecs); + else + ipath_dbg("PIO buffer bandwidth %u MiB/sec is OK\n", + lcnt / (u32) emsecs); + + preempt_enable(); + + vfree(addr); + +done: + /* disarm piobuf, so it's available again */ + ipath_disarm_piobufs(dd, pbnum, 1); + ipath_enable_armlaunch(dd); +} + +static void cleanup_device(struct ipath_devdata *dd); + +static int ipath_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int ret, len, j; + struct ipath_devdata *dd; + unsigned long long addr; + u32 bar0 = 0, bar1 = 0; + +#ifdef CONFIG_X86_64 + if (pat_enabled()) { + pr_warn("ipath needs PAT disabled, boot with nopat kernel parameter\n"); + ret = -ENODEV; + goto bail; + } +#endif + + dd = ipath_alloc_devdata(pdev); + if (IS_ERR(dd)) { + ret = PTR_ERR(dd); + printk(KERN_ERR IPATH_DRV_NAME + ": Could not allocate devdata: error %d\n", -ret); + goto bail; + } + + ipath_cdbg(VERBOSE, "initializing unit #%u\n", dd->ipath_unit); + + ret = pci_enable_device(pdev); + if (ret) { + /* This can happen iff: + * + * We did a chip reset, and then failed to reprogram the + * BAR, or the chip reset due to an internal error. We then + * unloaded the driver and reloaded it. + * + * Both reset cases set the BAR back to initial state. For + * the latter case, the AER sticky error bit at offset 0x718 + * should be set, but the Linux kernel doesn't yet know + * about that, it appears. If the original BAR was retained + * in the kernel data structures, this may be OK. + */ + ipath_dev_err(dd, "enable unit %d failed: error %d\n", + dd->ipath_unit, -ret); + goto bail_devdata; + } + addr = pci_resource_start(pdev, 0); + len = pci_resource_len(pdev, 0); + ipath_cdbg(VERBOSE, "regbase (0) %llx len %d irq %d, vend %x/%x " + "driver_data %lx\n", addr, len, pdev->irq, ent->vendor, + ent->device, ent->driver_data); + + read_bars(dd, pdev, &bar0, &bar1); + + if (!bar1 && !(bar0 & ~0xf)) { + if (addr) { + dev_info(&pdev->dev, "BAR is 0 (probable RESET), " + "rewriting as %llx\n", addr); + ret = pci_write_config_dword( + pdev, PCI_BASE_ADDRESS_0, addr); + if (ret) { + ipath_dev_err(dd, "rewrite of BAR0 " + "failed: err %d\n", -ret); + goto bail_disable; + } + ret = pci_write_config_dword( + pdev, PCI_BASE_ADDRESS_1, addr >> 32); + if (ret) { + ipath_dev_err(dd, "rewrite of BAR1 " + "failed: err %d\n", -ret); + goto bail_disable; + } + } else { + ipath_dev_err(dd, "BAR is 0 (probable RESET), " + "not usable until reboot\n"); + ret = -ENODEV; + goto bail_disable; + } + } + + ret = pci_request_regions(pdev, IPATH_DRV_NAME); + if (ret) { + dev_info(&pdev->dev, "pci_request_regions unit %u fails: " + "err %d\n", dd->ipath_unit, -ret); + goto bail_disable; + } + + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (ret) { + /* + * if the 64 bit setup fails, try 32 bit. Some systems + * do not setup 64 bit maps on systems with 2GB or less + * memory installed. + */ + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (ret) { + dev_info(&pdev->dev, + "Unable to set DMA mask for unit %u: %d\n", + dd->ipath_unit, ret); + goto bail_regions; + } + else { + ipath_dbg("No 64bit DMA mask, used 32 bit mask\n"); + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (ret) + dev_info(&pdev->dev, + "Unable to set DMA consistent mask " + "for unit %u: %d\n", + dd->ipath_unit, ret); + + } + } + else { + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (ret) + dev_info(&pdev->dev, + "Unable to set DMA consistent mask " + "for unit %u: %d\n", + dd->ipath_unit, ret); + } + + pci_set_master(pdev); + + /* + * Save BARs to rewrite after device reset. Save all 64 bits of + * BAR, just in case. + */ + dd->ipath_pcibar0 = addr; + dd->ipath_pcibar1 = addr >> 32; + dd->ipath_deviceid = ent->device; /* save for later use */ + dd->ipath_vendorid = ent->vendor; + + /* setup the chip-specific functions, as early as possible. */ + switch (ent->device) { + case PCI_DEVICE_ID_INFINIPATH_HT: + ipath_init_iba6110_funcs(dd); + break; + + default: + ipath_dev_err(dd, "Found unknown QLogic deviceid 0x%x, " + "failing\n", ent->device); + return -ENODEV; + } + + for (j = 0; j < 6; j++) { + if (!pdev->resource[j].start) + continue; + ipath_cdbg(VERBOSE, "BAR %d %pR, len %llx\n", + j, &pdev->resource[j], + (unsigned long long)pci_resource_len(pdev, j)); + } + + if (!addr) { + ipath_dev_err(dd, "No valid address in BAR 0!\n"); + ret = -ENODEV; + goto bail_regions; + } + + dd->ipath_pcirev = pdev->revision; + +#if defined(__powerpc__) + /* There isn't a generic way to specify writethrough mappings */ + dd->ipath_kregbase = __ioremap(addr, len, + (_PAGE_NO_CACHE|_PAGE_WRITETHRU)); +#else + /* XXX: split this properly to enable on PAT */ + dd->ipath_kregbase = ioremap_nocache(addr, len); +#endif + + if (!dd->ipath_kregbase) { + ipath_dbg("Unable to map io addr %llx to kvirt, failing\n", + addr); + ret = -ENOMEM; + goto bail_iounmap; + } + dd->ipath_kregend = (u64 __iomem *) + ((void __iomem *)dd->ipath_kregbase + len); + dd->ipath_physaddr = addr; /* used for io_remap, etc. */ + /* for user mmap */ + ipath_cdbg(VERBOSE, "mapped io addr %llx to kregbase %p\n", + addr, dd->ipath_kregbase); + + if (dd->ipath_f_bus(dd, pdev)) + ipath_dev_err(dd, "Failed to setup config space; " + "continuing anyway\n"); + + /* + * set up our interrupt handler; IRQF_SHARED probably not needed, + * since MSI interrupts shouldn't be shared but won't hurt for now. + * check 0 irq after we return from chip-specific bus setup, since + * that can affect this due to setup + */ + if (!dd->ipath_irq) + ipath_dev_err(dd, "irq is 0, BIOS error? Interrupts won't " + "work\n"); + else { + ret = request_irq(dd->ipath_irq, ipath_intr, IRQF_SHARED, + IPATH_DRV_NAME, dd); + if (ret) { + ipath_dev_err(dd, "Couldn't setup irq handler, " + "irq=%d: %d\n", dd->ipath_irq, ret); + goto bail_iounmap; + } + } + + ret = ipath_init_chip(dd, 0); /* do the chip-specific init */ + if (ret) + goto bail_irqsetup; + + ret = ipath_enable_wc(dd); + + if (ret) + ret = 0; + + ipath_verify_pioperf(dd); + + ipath_device_create_group(&pdev->dev, dd); + ipathfs_add_device(dd); + ipath_user_add(dd); + ipath_diag_add(dd); + ipath_register_ib_device(dd); + + goto bail; + +bail_irqsetup: + cleanup_device(dd); + + if (dd->ipath_irq) + dd->ipath_f_free_irq(dd); + + if (dd->ipath_f_cleanup) + dd->ipath_f_cleanup(dd); + +bail_iounmap: + iounmap((volatile void __iomem *) dd->ipath_kregbase); + +bail_regions: + pci_release_regions(pdev); + +bail_disable: + pci_disable_device(pdev); + +bail_devdata: + ipath_free_devdata(pdev, dd); + +bail: + return ret; +} + +static void cleanup_device(struct ipath_devdata *dd) +{ + int port; + struct ipath_portdata **tmp; + unsigned long flags; + + if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) { + /* can't do anything more with chip; needs re-init */ + *dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT; + if (dd->ipath_kregbase) { + /* + * if we haven't already cleaned up before these are + * to ensure any register reads/writes "fail" until + * re-init + */ + dd->ipath_kregbase = NULL; + dd->ipath_uregbase = 0; + dd->ipath_sregbase = 0; + dd->ipath_cregbase = 0; + dd->ipath_kregsize = 0; + } + ipath_disable_wc(dd); + } + + if (dd->ipath_spectriggerhit) + dev_info(&dd->pcidev->dev, "%lu special trigger hits\n", + dd->ipath_spectriggerhit); + + if (dd->ipath_pioavailregs_dma) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + (void *) dd->ipath_pioavailregs_dma, + dd->ipath_pioavailregs_phys); + dd->ipath_pioavailregs_dma = NULL; + } + if (dd->ipath_dummy_hdrq) { + dma_free_coherent(&dd->pcidev->dev, + dd->ipath_pd[0]->port_rcvhdrq_size, + dd->ipath_dummy_hdrq, dd->ipath_dummy_hdrq_phys); + dd->ipath_dummy_hdrq = NULL; + } + + if (dd->ipath_pageshadow) { + struct page **tmpp = dd->ipath_pageshadow; + dma_addr_t *tmpd = dd->ipath_physshadow; + int i, cnt = 0; + + ipath_cdbg(VERBOSE, "Unlocking any expTID pages still " + "locked\n"); + for (port = 0; port < dd->ipath_cfgports; port++) { + int port_tidbase = port * dd->ipath_rcvtidcnt; + int maxtid = port_tidbase + dd->ipath_rcvtidcnt; + for (i = port_tidbase; i < maxtid; i++) { + if (!tmpp[i]) + continue; + pci_unmap_page(dd->pcidev, tmpd[i], + PAGE_SIZE, PCI_DMA_FROMDEVICE); + ipath_release_user_pages(&tmpp[i], 1); + tmpp[i] = NULL; + cnt++; + } + } + if (cnt) { + ipath_stats.sps_pageunlocks += cnt; + ipath_cdbg(VERBOSE, "There were still %u expTID " + "entries locked\n", cnt); + } + if (ipath_stats.sps_pagelocks || + ipath_stats.sps_pageunlocks) + ipath_cdbg(VERBOSE, "%llu pages locked, %llu " + "unlocked via ipath_m{un}lock\n", + (unsigned long long) + ipath_stats.sps_pagelocks, + (unsigned long long) + ipath_stats.sps_pageunlocks); + + ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n", + dd->ipath_pageshadow); + tmpp = dd->ipath_pageshadow; + dd->ipath_pageshadow = NULL; + vfree(tmpp); + + dd->ipath_egrtidbase = NULL; + } + + /* + * free any resources still in use (usually just kernel ports) + * at unload; we do for portcnt, because that's what we allocate. + * We acquire lock to be really paranoid that ipath_pd isn't being + * accessed from some interrupt-related code (that should not happen, + * but best to be sure). + */ + spin_lock_irqsave(&dd->ipath_uctxt_lock, flags); + tmp = dd->ipath_pd; + dd->ipath_pd = NULL; + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + for (port = 0; port < dd->ipath_portcnt; port++) { + struct ipath_portdata *pd = tmp[port]; + tmp[port] = NULL; /* debugging paranoia */ + ipath_free_pddata(dd, pd); + } + kfree(tmp); +} + +static void ipath_remove_one(struct pci_dev *pdev) +{ + struct ipath_devdata *dd = pci_get_drvdata(pdev); + + ipath_cdbg(VERBOSE, "removing, pdev=%p, dd=%p\n", pdev, dd); + + /* + * disable the IB link early, to be sure no new packets arrive, which + * complicates the shutdown process + */ + ipath_shutdown_device(dd); + + flush_workqueue(ib_wq); + + if (dd->verbs_dev) + ipath_unregister_ib_device(dd->verbs_dev); + + ipath_diag_remove(dd); + ipath_user_remove(dd); + ipathfs_remove_device(dd); + ipath_device_remove_group(&pdev->dev, dd); + + ipath_cdbg(VERBOSE, "Releasing pci memory regions, dd %p, " + "unit %u\n", dd, (u32) dd->ipath_unit); + + cleanup_device(dd); + + /* + * turn off rcv, send, and interrupts for all ports, all drivers + * should also hard reset the chip here? + * free up port 0 (kernel) rcvhdr, egr bufs, and eventually tid bufs + * for all versions of the driver, if they were allocated + */ + if (dd->ipath_irq) { + ipath_cdbg(VERBOSE, "unit %u free irq %d\n", + dd->ipath_unit, dd->ipath_irq); + dd->ipath_f_free_irq(dd); + } else + ipath_dbg("irq is 0, not doing free_irq " + "for unit %u\n", dd->ipath_unit); + /* + * we check for NULL here, because it's outside + * the kregbase check, and we need to call it + * after the free_irq. Thus it's possible that + * the function pointers were never initialized. + */ + if (dd->ipath_f_cleanup) + /* clean up chip-specific stuff */ + dd->ipath_f_cleanup(dd); + + ipath_cdbg(VERBOSE, "Unmapping kregbase %p\n", dd->ipath_kregbase); + iounmap((volatile void __iomem *) dd->ipath_kregbase); + pci_release_regions(pdev); + ipath_cdbg(VERBOSE, "calling pci_disable_device\n"); + pci_disable_device(pdev); + + ipath_free_devdata(pdev, dd); +} + +/* general driver use */ +DEFINE_MUTEX(ipath_mutex); + +static DEFINE_SPINLOCK(ipath_pioavail_lock); + +/** + * ipath_disarm_piobufs - cancel a range of PIO buffers + * @dd: the infinipath device + * @first: the first PIO buffer to cancel + * @cnt: the number of PIO buffers to cancel + * + * cancel a range of PIO buffers, used when they might be armed, but + * not triggered. Used at init to ensure buffer state, and also user + * process close, in case it died while writing to a PIO buffer + * Also after errors. + */ +void ipath_disarm_piobufs(struct ipath_devdata *dd, unsigned first, + unsigned cnt) +{ + unsigned i, last = first + cnt; + unsigned long flags; + + ipath_cdbg(PKT, "disarm %u PIObufs first=%u\n", cnt, first); + for (i = first; i < last; i++) { + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + /* + * The disarm-related bits are write-only, so it + * is ok to OR them in with our copy of sendctrl + * while we hold the lock. + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl | INFINIPATH_S_DISARM | + (i << INFINIPATH_S_DISARMPIOBUF_SHIFT)); + /* can't disarm bufs back-to-back per iba7220 spec */ + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + } + /* on some older chips, update may not happen after cancel */ + ipath_force_pio_avail_update(dd); +} + +/** + * ipath_wait_linkstate - wait for an IB link state change to occur + * @dd: the infinipath device + * @state: the state to wait for + * @msecs: the number of milliseconds to wait + * + * wait up to msecs milliseconds for IB link state change to occur for + * now, take the easy polling route. Currently used only by + * ipath_set_linkstate. Returns 0 if state reached, otherwise + * -ETIMEDOUT state can have multiple states set, for any of several + * transitions. + */ +int ipath_wait_linkstate(struct ipath_devdata *dd, u32 state, int msecs) +{ + dd->ipath_state_wanted = state; + wait_event_interruptible_timeout(ipath_state_wait, + (dd->ipath_flags & state), + msecs_to_jiffies(msecs)); + dd->ipath_state_wanted = 0; + + if (!(dd->ipath_flags & state)) { + u64 val; + ipath_cdbg(VERBOSE, "Didn't reach linkstate %s within %u" + " ms\n", + /* test INIT ahead of DOWN, both can be set */ + (state & IPATH_LINKINIT) ? "INIT" : + ((state & IPATH_LINKDOWN) ? "DOWN" : + ((state & IPATH_LINKARMED) ? "ARM" : "ACTIVE")), + msecs); + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus); + ipath_cdbg(VERBOSE, "ibcc=%llx ibcstatus=%llx (%s)\n", + (unsigned long long) ipath_read_kreg64( + dd, dd->ipath_kregs->kr_ibcctrl), + (unsigned long long) val, + ipath_ibcstatus_str[val & dd->ibcs_lts_mask]); + } + return (dd->ipath_flags & state) ? 0 : -ETIMEDOUT; +} + +static void decode_sdma_errs(struct ipath_devdata *dd, ipath_err_t err, + char *buf, size_t blen) +{ + static const struct { + ipath_err_t err; + const char *msg; + } errs[] = { + { INFINIPATH_E_SDMAGENMISMATCH, "SDmaGenMismatch" }, + { INFINIPATH_E_SDMAOUTOFBOUND, "SDmaOutOfBound" }, + { INFINIPATH_E_SDMATAILOUTOFBOUND, "SDmaTailOutOfBound" }, + { INFINIPATH_E_SDMABASE, "SDmaBase" }, + { INFINIPATH_E_SDMA1STDESC, "SDma1stDesc" }, + { INFINIPATH_E_SDMARPYTAG, "SDmaRpyTag" }, + { INFINIPATH_E_SDMADWEN, "SDmaDwEn" }, + { INFINIPATH_E_SDMAMISSINGDW, "SDmaMissingDw" }, + { INFINIPATH_E_SDMAUNEXPDATA, "SDmaUnexpData" }, + { INFINIPATH_E_SDMADESCADDRMISALIGN, "SDmaDescAddrMisalign" }, + { INFINIPATH_E_SENDBUFMISUSE, "SendBufMisuse" }, + { INFINIPATH_E_SDMADISABLED, "SDmaDisabled" }, + }; + int i; + int expected; + size_t bidx = 0; + + for (i = 0; i < ARRAY_SIZE(errs); i++) { + expected = (errs[i].err != INFINIPATH_E_SDMADISABLED) ? 0 : + test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status); + if ((err & errs[i].err) && !expected) + bidx += snprintf(buf + bidx, blen - bidx, + "%s ", errs[i].msg); + } +} + +/* + * Decode the error status into strings, deciding whether to always + * print * it or not depending on "normal packet errors" vs everything + * else. Return 1 if "real" errors, otherwise 0 if only packet + * errors, so caller can decide what to print with the string. + */ +int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen, + ipath_err_t err) +{ + int iserr = 1; + *buf = '\0'; + if (err & INFINIPATH_E_PKTERRS) { + if (!(err & ~INFINIPATH_E_PKTERRS)) + iserr = 0; // if only packet errors. + if (ipath_debug & __IPATH_ERRPKTDBG) { + if (err & INFINIPATH_E_REBP) + strlcat(buf, "EBP ", blen); + if (err & INFINIPATH_E_RVCRC) + strlcat(buf, "VCRC ", blen); + if (err & INFINIPATH_E_RICRC) { + strlcat(buf, "CRC ", blen); + // clear for check below, so only once + err &= INFINIPATH_E_RICRC; + } + if (err & INFINIPATH_E_RSHORTPKTLEN) + strlcat(buf, "rshortpktlen ", blen); + if (err & INFINIPATH_E_SDROPPEDDATAPKT) + strlcat(buf, "sdroppeddatapkt ", blen); + if (err & INFINIPATH_E_SPKTLEN) + strlcat(buf, "spktlen ", blen); + } + if ((err & INFINIPATH_E_RICRC) && + !(err&(INFINIPATH_E_RVCRC|INFINIPATH_E_REBP))) + strlcat(buf, "CRC ", blen); + if (!iserr) + goto done; + } + if (err & INFINIPATH_E_RHDRLEN) + strlcat(buf, "rhdrlen ", blen); + if (err & INFINIPATH_E_RBADTID) + strlcat(buf, "rbadtid ", blen); + if (err & INFINIPATH_E_RBADVERSION) + strlcat(buf, "rbadversion ", blen); + if (err & INFINIPATH_E_RHDR) + strlcat(buf, "rhdr ", blen); + if (err & INFINIPATH_E_SENDSPECIALTRIGGER) + strlcat(buf, "sendspecialtrigger ", blen); + if (err & INFINIPATH_E_RLONGPKTLEN) + strlcat(buf, "rlongpktlen ", blen); + if (err & INFINIPATH_E_RMAXPKTLEN) + strlcat(buf, "rmaxpktlen ", blen); + if (err & INFINIPATH_E_RMINPKTLEN) + strlcat(buf, "rminpktlen ", blen); + if (err & INFINIPATH_E_SMINPKTLEN) + strlcat(buf, "sminpktlen ", blen); + if (err & INFINIPATH_E_RFORMATERR) + strlcat(buf, "rformaterr ", blen); + if (err & INFINIPATH_E_RUNSUPVL) + strlcat(buf, "runsupvl ", blen); + if (err & INFINIPATH_E_RUNEXPCHAR) + strlcat(buf, "runexpchar ", blen); + if (err & INFINIPATH_E_RIBFLOW) + strlcat(buf, "ribflow ", blen); + if (err & INFINIPATH_E_SUNDERRUN) + strlcat(buf, "sunderrun ", blen); + if (err & INFINIPATH_E_SPIOARMLAUNCH) + strlcat(buf, "spioarmlaunch ", blen); + if (err & INFINIPATH_E_SUNEXPERRPKTNUM) + strlcat(buf, "sunexperrpktnum ", blen); + if (err & INFINIPATH_E_SDROPPEDSMPPKT) + strlcat(buf, "sdroppedsmppkt ", blen); + if (err & INFINIPATH_E_SMAXPKTLEN) + strlcat(buf, "smaxpktlen ", blen); + if (err & INFINIPATH_E_SUNSUPVL) + strlcat(buf, "sunsupVL ", blen); + if (err & INFINIPATH_E_INVALIDADDR) + strlcat(buf, "invalidaddr ", blen); + if (err & INFINIPATH_E_RRCVEGRFULL) + strlcat(buf, "rcvegrfull ", blen); + if (err & INFINIPATH_E_RRCVHDRFULL) + strlcat(buf, "rcvhdrfull ", blen); + if (err & INFINIPATH_E_IBSTATUSCHANGED) + strlcat(buf, "ibcstatuschg ", blen); + if (err & INFINIPATH_E_RIBLOSTLINK) + strlcat(buf, "riblostlink ", blen); + if (err & INFINIPATH_E_HARDWARE) + strlcat(buf, "hardware ", blen); + if (err & INFINIPATH_E_RESET) + strlcat(buf, "reset ", blen); + if (err & INFINIPATH_E_SDMAERRS) + decode_sdma_errs(dd, err, buf, blen); + if (err & INFINIPATH_E_INVALIDEEPCMD) + strlcat(buf, "invalideepromcmd ", blen); +done: + return iserr; +} + +/** + * get_rhf_errstring - decode RHF errors + * @err: the err number + * @msg: the output buffer + * @len: the length of the output buffer + * + * only used one place now, may want more later + */ +static void get_rhf_errstring(u32 err, char *msg, size_t len) +{ + /* if no errors, and so don't need to check what's first */ + *msg = '\0'; + + if (err & INFINIPATH_RHF_H_ICRCERR) + strlcat(msg, "icrcerr ", len); + if (err & INFINIPATH_RHF_H_VCRCERR) + strlcat(msg, "vcrcerr ", len); + if (err & INFINIPATH_RHF_H_PARITYERR) + strlcat(msg, "parityerr ", len); + if (err & INFINIPATH_RHF_H_LENERR) + strlcat(msg, "lenerr ", len); + if (err & INFINIPATH_RHF_H_MTUERR) + strlcat(msg, "mtuerr ", len); + if (err & INFINIPATH_RHF_H_IHDRERR) + /* infinipath hdr checksum error */ + strlcat(msg, "ipathhdrerr ", len); + if (err & INFINIPATH_RHF_H_TIDERR) + strlcat(msg, "tiderr ", len); + if (err & INFINIPATH_RHF_H_MKERR) + /* bad port, offset, etc. */ + strlcat(msg, "invalid ipathhdr ", len); + if (err & INFINIPATH_RHF_H_IBERR) + strlcat(msg, "iberr ", len); + if (err & INFINIPATH_RHF_L_SWA) + strlcat(msg, "swA ", len); + if (err & INFINIPATH_RHF_L_SWB) + strlcat(msg, "swB ", len); +} + +/** + * ipath_get_egrbuf - get an eager buffer + * @dd: the infinipath device + * @bufnum: the eager buffer to get + * + * must only be called if ipath_pd[port] is known to be allocated + */ +static inline void *ipath_get_egrbuf(struct ipath_devdata *dd, u32 bufnum) +{ + return dd->ipath_port0_skbinfo ? + (void *) dd->ipath_port0_skbinfo[bufnum].skb->data : NULL; +} + +/** + * ipath_alloc_skb - allocate an skb and buffer with possible constraints + * @dd: the infinipath device + * @gfp_mask: the sk_buff SFP mask + */ +struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd, + gfp_t gfp_mask) +{ + struct sk_buff *skb; + u32 len; + + /* + * Only fully supported way to handle this is to allocate lots + * extra, align as needed, and then do skb_reserve(). That wastes + * a lot of memory... I'll have to hack this into infinipath_copy + * also. + */ + + /* + * We need 2 extra bytes for ipath_ether data sent in the + * key header. In order to keep everything dword aligned, + * we'll reserve 4 bytes. + */ + len = dd->ipath_ibmaxlen + 4; + + if (dd->ipath_flags & IPATH_4BYTE_TID) { + /* We need a 2KB multiple alignment, and there is no way + * to do it except to allocate extra and then skb_reserve + * enough to bring it up to the right alignment. + */ + len += 2047; + } + + skb = __dev_alloc_skb(len, gfp_mask); + if (!skb) { + ipath_dev_err(dd, "Failed to allocate skbuff, length %u\n", + len); + goto bail; + } + + skb_reserve(skb, 4); + + if (dd->ipath_flags & IPATH_4BYTE_TID) { + u32 una = (unsigned long)skb->data & 2047; + if (una) + skb_reserve(skb, 2048 - una); + } + +bail: + return skb; +} + +static void ipath_rcv_hdrerr(struct ipath_devdata *dd, + u32 eflags, + u32 l, + u32 etail, + __le32 *rhf_addr, + struct ipath_message_header *hdr) +{ + char emsg[128]; + + get_rhf_errstring(eflags, emsg, sizeof emsg); + ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u " + "tlen=%x opcode=%x egridx=%x: %s\n", + eflags, l, + ipath_hdrget_rcv_type(rhf_addr), + ipath_hdrget_length_in_bytes(rhf_addr), + be32_to_cpu(hdr->bth[0]) >> 24, + etail, emsg); + + /* Count local link integrity errors. */ + if (eflags & (INFINIPATH_RHF_H_ICRCERR | INFINIPATH_RHF_H_VCRCERR)) { + u8 n = (dd->ipath_ibcctrl >> + INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) & + INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK; + + if (++dd->ipath_lli_counter > n) { + dd->ipath_lli_counter = 0; + dd->ipath_lli_errors++; + } + } +} + +/* + * ipath_kreceive - receive a packet + * @pd: the infinipath port + * + * called from interrupt handler for errors or receive interrupt + */ +void ipath_kreceive(struct ipath_portdata *pd) +{ + struct ipath_devdata *dd = pd->port_dd; + __le32 *rhf_addr; + void *ebuf; + const u32 rsize = dd->ipath_rcvhdrentsize; /* words */ + const u32 maxcnt = dd->ipath_rcvhdrcnt * rsize; /* words */ + u32 etail = -1, l, hdrqtail; + struct ipath_message_header *hdr; + u32 eflags, i, etype, tlen, pkttot = 0, updegr = 0, reloop = 0; + static u64 totcalls; /* stats, may eventually remove */ + int last; + + l = pd->port_head; + rhf_addr = (__le32 *) pd->port_rcvhdrq + l + dd->ipath_rhf_offset; + if (dd->ipath_flags & IPATH_NODMA_RTAIL) { + u32 seq = ipath_hdrget_seq(rhf_addr); + + if (seq != pd->port_seq_cnt) + goto bail; + hdrqtail = 0; + } else { + hdrqtail = ipath_get_rcvhdrtail(pd); + if (l == hdrqtail) + goto bail; + smp_rmb(); + } + +reloop: + for (last = 0, i = 1; !last; i += !last) { + hdr = dd->ipath_f_get_msgheader(dd, rhf_addr); + eflags = ipath_hdrget_err_flags(rhf_addr); + etype = ipath_hdrget_rcv_type(rhf_addr); + /* total length */ + tlen = ipath_hdrget_length_in_bytes(rhf_addr); + ebuf = NULL; + if ((dd->ipath_flags & IPATH_NODMA_RTAIL) ? + ipath_hdrget_use_egr_buf(rhf_addr) : + (etype != RCVHQ_RCV_TYPE_EXPECTED)) { + /* + * It turns out that the chip uses an eager buffer + * for all non-expected packets, whether it "needs" + * one or not. So always get the index, but don't + * set ebuf (so we try to copy data) unless the + * length requires it. + */ + etail = ipath_hdrget_index(rhf_addr); + updegr = 1; + if (tlen > sizeof(*hdr) || + etype == RCVHQ_RCV_TYPE_NON_KD) + ebuf = ipath_get_egrbuf(dd, etail); + } + + /* + * both tiderr and ipathhdrerr are set for all plain IB + * packets; only ipathhdrerr should be set. + */ + + if (etype != RCVHQ_RCV_TYPE_NON_KD && + etype != RCVHQ_RCV_TYPE_ERROR && + ipath_hdrget_ipath_ver(hdr->iph.ver_port_tid_offset) != + IPS_PROTO_VERSION) + ipath_cdbg(PKT, "Bad InfiniPath protocol version " + "%x\n", etype); + + if (unlikely(eflags)) + ipath_rcv_hdrerr(dd, eflags, l, etail, rhf_addr, hdr); + else if (etype == RCVHQ_RCV_TYPE_NON_KD) { + ipath_ib_rcv(dd->verbs_dev, (u32 *)hdr, ebuf, tlen); + if (dd->ipath_lli_counter) + dd->ipath_lli_counter--; + } else if (etype == RCVHQ_RCV_TYPE_EAGER) { + u8 opcode = be32_to_cpu(hdr->bth[0]) >> 24; + u32 qp = be32_to_cpu(hdr->bth[1]) & 0xffffff; + ipath_cdbg(PKT, "typ %x, opcode %x (eager, " + "qp=%x), len %x; ignored\n", + etype, opcode, qp, tlen); + } + else if (etype == RCVHQ_RCV_TYPE_EXPECTED) + ipath_dbg("Bug: Expected TID, opcode %x; ignored\n", + be32_to_cpu(hdr->bth[0]) >> 24); + else { + /* + * error packet, type of error unknown. + * Probably type 3, but we don't know, so don't + * even try to print the opcode, etc. + * Usually caused by a "bad packet", that has no + * BTH, when the LRH says it should. + */ + ipath_cdbg(ERRPKT, "Error Pkt, but no eflags! egrbuf" + " %x, len %x hdrq+%x rhf: %Lx\n", + etail, tlen, l, (unsigned long long) + le64_to_cpu(*(__le64 *) rhf_addr)); + if (ipath_debug & __IPATH_ERRPKTDBG) { + u32 j, *d, dw = rsize-2; + if (rsize > (tlen>>2)) + dw = tlen>>2; + d = (u32 *)hdr; + printk(KERN_DEBUG "EPkt rcvhdr(%x dw):\n", + dw); + for (j = 0; j < dw; j++) + printk(KERN_DEBUG "%8x%s", d[j], + (j%8) == 7 ? "\n" : " "); + printk(KERN_DEBUG ".\n"); + } + } + l += rsize; + if (l >= maxcnt) + l = 0; + rhf_addr = (__le32 *) pd->port_rcvhdrq + + l + dd->ipath_rhf_offset; + if (dd->ipath_flags & IPATH_NODMA_RTAIL) { + u32 seq = ipath_hdrget_seq(rhf_addr); + + if (++pd->port_seq_cnt > 13) + pd->port_seq_cnt = 1; + if (seq != pd->port_seq_cnt) + last = 1; + } else if (l == hdrqtail) + last = 1; + /* + * update head regs on last packet, and every 16 packets. + * Reduce bus traffic, while still trying to prevent + * rcvhdrq overflows, for when the queue is nearly full + */ + if (last || !(i & 0xf)) { + u64 lval = l; + + /* request IBA6120 and 7220 interrupt only on last */ + if (last) + lval |= dd->ipath_rhdrhead_intr_off; + ipath_write_ureg(dd, ur_rcvhdrhead, lval, + pd->port_port); + if (updegr) { + ipath_write_ureg(dd, ur_rcvegrindexhead, + etail, pd->port_port); + updegr = 0; + } + } + } + + if (!dd->ipath_rhdrhead_intr_off && !reloop && + !(dd->ipath_flags & IPATH_NODMA_RTAIL)) { + /* IBA6110 workaround; we can have a race clearing chip + * interrupt with another interrupt about to be delivered, + * and can clear it before it is delivered on the GPIO + * workaround. By doing the extra check here for the + * in-memory tail register updating while we were doing + * earlier packets, we "almost" guarantee we have covered + * that case. + */ + u32 hqtail = ipath_get_rcvhdrtail(pd); + if (hqtail != hdrqtail) { + hdrqtail = hqtail; + reloop = 1; /* loop 1 extra time at most */ + goto reloop; + } + } + + pkttot += i; + + pd->port_head = l; + + if (pkttot > ipath_stats.sps_maxpkts_call) + ipath_stats.sps_maxpkts_call = pkttot; + ipath_stats.sps_port0pkts += pkttot; + ipath_stats.sps_avgpkts_call = + ipath_stats.sps_port0pkts / ++totcalls; + +bail:; +} + +/** + * ipath_update_pio_bufs - update shadow copy of the PIO availability map + * @dd: the infinipath device + * + * called whenever our local copy indicates we have run out of send buffers + * NOTE: This can be called from interrupt context by some code + * and from non-interrupt context by ipath_getpiobuf(). + */ + +static void ipath_update_pio_bufs(struct ipath_devdata *dd) +{ + unsigned long flags; + int i; + const unsigned piobregs = (unsigned)dd->ipath_pioavregs; + + /* If the generation (check) bits have changed, then we update the + * busy bit for the corresponding PIO buffer. This algorithm will + * modify positions to the value they already have in some cases + * (i.e., no change), but it's faster than changing only the bits + * that have changed. + * + * We would like to do this atomicly, to avoid spinlocks in the + * critical send path, but that's not really possible, given the + * type of changes, and that this routine could be called on + * multiple cpu's simultaneously, so we lock in this routine only, + * to avoid conflicting updates; all we change is the shadow, and + * it's a single 64 bit memory location, so by definition the update + * is atomic in terms of what other cpu's can see in testing the + * bits. The spin_lock overhead isn't too bad, since it only + * happens when all buffers are in use, so only cpu overhead, not + * latency or bandwidth is affected. + */ + if (!dd->ipath_pioavailregs_dma) { + ipath_dbg("Update shadow pioavail, but regs_dma NULL!\n"); + return; + } + if (ipath_debug & __IPATH_VERBDBG) { + /* only if packet debug and verbose */ + volatile __le64 *dma = dd->ipath_pioavailregs_dma; + unsigned long *shadow = dd->ipath_pioavailshadow; + + ipath_cdbg(PKT, "Refill avail, dma0=%llx shad0=%lx, " + "d1=%llx s1=%lx, d2=%llx s2=%lx, d3=%llx " + "s3=%lx\n", + (unsigned long long) le64_to_cpu(dma[0]), + shadow[0], + (unsigned long long) le64_to_cpu(dma[1]), + shadow[1], + (unsigned long long) le64_to_cpu(dma[2]), + shadow[2], + (unsigned long long) le64_to_cpu(dma[3]), + shadow[3]); + if (piobregs > 4) + ipath_cdbg( + PKT, "2nd group, dma4=%llx shad4=%lx, " + "d5=%llx s5=%lx, d6=%llx s6=%lx, " + "d7=%llx s7=%lx\n", + (unsigned long long) le64_to_cpu(dma[4]), + shadow[4], + (unsigned long long) le64_to_cpu(dma[5]), + shadow[5], + (unsigned long long) le64_to_cpu(dma[6]), + shadow[6], + (unsigned long long) le64_to_cpu(dma[7]), + shadow[7]); + } + spin_lock_irqsave(&ipath_pioavail_lock, flags); + for (i = 0; i < piobregs; i++) { + u64 pchbusy, pchg, piov, pnew; + /* + * Chip Errata: bug 6641; even and odd qwords>3 are swapped + */ + if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) + piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i ^ 1]); + else + piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i]); + pchg = dd->ipath_pioavailkernel[i] & + ~(dd->ipath_pioavailshadow[i] ^ piov); + pchbusy = pchg << INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT; + if (pchg && (pchbusy & dd->ipath_pioavailshadow[i])) { + pnew = dd->ipath_pioavailshadow[i] & ~pchbusy; + pnew |= piov & pchbusy; + dd->ipath_pioavailshadow[i] = pnew; + } + } + spin_unlock_irqrestore(&ipath_pioavail_lock, flags); +} + +/* + * used to force update of pioavailshadow if we can't get a pio buffer. + * Needed primarily due to exitting freeze mode after recovering + * from errors. Done lazily, because it's safer (known to not + * be writing pio buffers). + */ +static void ipath_reset_availshadow(struct ipath_devdata *dd) +{ + int i, im; + unsigned long flags; + + spin_lock_irqsave(&ipath_pioavail_lock, flags); + for (i = 0; i < dd->ipath_pioavregs; i++) { + u64 val, oldval; + /* deal with 6110 chip bug on high register #s */ + im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ? + i ^ 1 : i; + val = le64_to_cpu(dd->ipath_pioavailregs_dma[im]); + /* + * busy out the buffers not in the kernel avail list, + * without changing the generation bits. + */ + oldval = dd->ipath_pioavailshadow[i]; + dd->ipath_pioavailshadow[i] = val | + ((~dd->ipath_pioavailkernel[i] << + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT) & + 0xaaaaaaaaaaaaaaaaULL); /* All BUSY bits in qword */ + if (oldval != dd->ipath_pioavailshadow[i]) + ipath_dbg("shadow[%d] was %Lx, now %lx\n", + i, (unsigned long long) oldval, + dd->ipath_pioavailshadow[i]); + } + spin_unlock_irqrestore(&ipath_pioavail_lock, flags); +} + +/** + * ipath_setrcvhdrsize - set the receive header size + * @dd: the infinipath device + * @rhdrsize: the receive header size + * + * called from user init code, and also layered driver init + */ +int ipath_setrcvhdrsize(struct ipath_devdata *dd, unsigned rhdrsize) +{ + int ret = 0; + + if (dd->ipath_flags & IPATH_RCVHDRSZ_SET) { + if (dd->ipath_rcvhdrsize != rhdrsize) { + dev_info(&dd->pcidev->dev, + "Error: can't set protocol header " + "size %u, already %u\n", + rhdrsize, dd->ipath_rcvhdrsize); + ret = -EAGAIN; + } else + ipath_cdbg(VERBOSE, "Reuse same protocol header " + "size %u\n", dd->ipath_rcvhdrsize); + } else if (rhdrsize > (dd->ipath_rcvhdrentsize - + (sizeof(u64) / sizeof(u32)))) { + ipath_dbg("Error: can't set protocol header size %u " + "(> max %u)\n", rhdrsize, + dd->ipath_rcvhdrentsize - + (u32) (sizeof(u64) / sizeof(u32))); + ret = -EOVERFLOW; + } else { + dd->ipath_flags |= IPATH_RCVHDRSZ_SET; + dd->ipath_rcvhdrsize = rhdrsize; + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize, + dd->ipath_rcvhdrsize); + ipath_cdbg(VERBOSE, "Set protocol header size to %u\n", + dd->ipath_rcvhdrsize); + } + return ret; +} + +/* + * debugging code and stats updates if no pio buffers available. + */ +static noinline void no_pio_bufs(struct ipath_devdata *dd) +{ + unsigned long *shadow = dd->ipath_pioavailshadow; + __le64 *dma = (__le64 *)dd->ipath_pioavailregs_dma; + + dd->ipath_upd_pio_shadow = 1; + + /* + * not atomic, but if we lose a stat count in a while, that's OK + */ + ipath_stats.sps_nopiobufs++; + if (!(++dd->ipath_consec_nopiobuf % 100000)) { + ipath_force_pio_avail_update(dd); /* at start */ + ipath_dbg("%u tries no piobufavail ts%lx; dmacopy: " + "%llx %llx %llx %llx\n" + "ipath shadow: %lx %lx %lx %lx\n", + dd->ipath_consec_nopiobuf, + (unsigned long)get_cycles(), + (unsigned long long) le64_to_cpu(dma[0]), + (unsigned long long) le64_to_cpu(dma[1]), + (unsigned long long) le64_to_cpu(dma[2]), + (unsigned long long) le64_to_cpu(dma[3]), + shadow[0], shadow[1], shadow[2], shadow[3]); + /* + * 4 buffers per byte, 4 registers above, cover rest + * below + */ + if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) > + (sizeof(shadow[0]) * 4 * 4)) + ipath_dbg("2nd group: dmacopy: " + "%llx %llx %llx %llx\n" + "ipath shadow: %lx %lx %lx %lx\n", + (unsigned long long)le64_to_cpu(dma[4]), + (unsigned long long)le64_to_cpu(dma[5]), + (unsigned long long)le64_to_cpu(dma[6]), + (unsigned long long)le64_to_cpu(dma[7]), + shadow[4], shadow[5], shadow[6], shadow[7]); + + /* at end, so update likely happened */ + ipath_reset_availshadow(dd); + } +} + +/* + * common code for normal driver pio buffer allocation, and reserved + * allocation. + * + * do appropriate marking as busy, etc. + * returns buffer number if one found (>=0), negative number is error. + */ +static u32 __iomem *ipath_getpiobuf_range(struct ipath_devdata *dd, + u32 *pbufnum, u32 first, u32 last, u32 firsti) +{ + int i, j, updated = 0; + unsigned piobcnt; + unsigned long flags; + unsigned long *shadow = dd->ipath_pioavailshadow; + u32 __iomem *buf; + + piobcnt = last - first; + if (dd->ipath_upd_pio_shadow) { + /* + * Minor optimization. If we had no buffers on last call, + * start out by doing the update; continue and do scan even + * if no buffers were updated, to be paranoid + */ + ipath_update_pio_bufs(dd); + updated++; + i = first; + } else + i = firsti; +rescan: + /* + * while test_and_set_bit() is atomic, we do that and then the + * change_bit(), and the pair is not. See if this is the cause + * of the remaining armlaunch errors. + */ + spin_lock_irqsave(&ipath_pioavail_lock, flags); + for (j = 0; j < piobcnt; j++, i++) { + if (i >= last) + i = first; + if (__test_and_set_bit((2 * i) + 1, shadow)) + continue; + /* flip generation bit */ + __change_bit(2 * i, shadow); + break; + } + spin_unlock_irqrestore(&ipath_pioavail_lock, flags); + + if (j == piobcnt) { + if (!updated) { + /* + * first time through; shadow exhausted, but may be + * buffers available, try an update and then rescan. + */ + ipath_update_pio_bufs(dd); + updated++; + i = first; + goto rescan; + } else if (updated == 1 && piobcnt <= + ((dd->ipath_sendctrl + >> INFINIPATH_S_UPDTHRESH_SHIFT) & + INFINIPATH_S_UPDTHRESH_MASK)) { + /* + * for chips supporting and using the update + * threshold we need to force an update of the + * in-memory copy if the count is less than the + * thershold, then check one more time. + */ + ipath_force_pio_avail_update(dd); + ipath_update_pio_bufs(dd); + updated++; + i = first; + goto rescan; + } + + no_pio_bufs(dd); + buf = NULL; + } else { + if (i < dd->ipath_piobcnt2k) + buf = (u32 __iomem *) (dd->ipath_pio2kbase + + i * dd->ipath_palign); + else + buf = (u32 __iomem *) + (dd->ipath_pio4kbase + + (i - dd->ipath_piobcnt2k) * dd->ipath_4kalign); + if (pbufnum) + *pbufnum = i; + } + + return buf; +} + +/** + * ipath_getpiobuf - find an available pio buffer + * @dd: the infinipath device + * @plen: the size of the PIO buffer needed in 32-bit words + * @pbufnum: the buffer number is placed here + */ +u32 __iomem *ipath_getpiobuf(struct ipath_devdata *dd, u32 plen, u32 *pbufnum) +{ + u32 __iomem *buf; + u32 pnum, nbufs; + u32 first, lasti; + + if (plen + 1 >= IPATH_SMALLBUF_DWORDS) { + first = dd->ipath_piobcnt2k; + lasti = dd->ipath_lastpioindexl; + } else { + first = 0; + lasti = dd->ipath_lastpioindex; + } + nbufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; + buf = ipath_getpiobuf_range(dd, &pnum, first, nbufs, lasti); + + if (buf) { + /* + * Set next starting place. It's just an optimization, + * it doesn't matter who wins on this, so no locking + */ + if (plen + 1 >= IPATH_SMALLBUF_DWORDS) + dd->ipath_lastpioindexl = pnum + 1; + else + dd->ipath_lastpioindex = pnum + 1; + if (dd->ipath_upd_pio_shadow) + dd->ipath_upd_pio_shadow = 0; + if (dd->ipath_consec_nopiobuf) + dd->ipath_consec_nopiobuf = 0; + ipath_cdbg(VERBOSE, "Return piobuf%u %uk @ %p\n", + pnum, (pnum < dd->ipath_piobcnt2k) ? 2 : 4, buf); + if (pbufnum) + *pbufnum = pnum; + + } + return buf; +} + +/** + * ipath_chg_pioavailkernel - change which send buffers are available for kernel + * @dd: the infinipath device + * @start: the starting send buffer number + * @len: the number of send buffers + * @avail: true if the buffers are available for kernel use, false otherwise + */ +void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start, + unsigned len, int avail) +{ + unsigned long flags; + unsigned end, cnt = 0; + + /* There are two bits per send buffer (busy and generation) */ + start *= 2; + end = start + len * 2; + + spin_lock_irqsave(&ipath_pioavail_lock, flags); + /* Set or clear the busy bit in the shadow. */ + while (start < end) { + if (avail) { + unsigned long dma; + int i, im; + /* + * the BUSY bit will never be set, because we disarm + * the user buffers before we hand them back to the + * kernel. We do have to make sure the generation + * bit is set correctly in shadow, since it could + * have changed many times while allocated to user. + * We can't use the bitmap functions on the full + * dma array because it is always little-endian, so + * we have to flip to host-order first. + * BITS_PER_LONG is slightly wrong, since it's + * always 64 bits per register in chip... + * We only work on 64 bit kernels, so that's OK. + */ + /* deal with 6110 chip bug on high register #s */ + i = start / BITS_PER_LONG; + im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ? + i ^ 1 : i; + __clear_bit(INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT + + start, dd->ipath_pioavailshadow); + dma = (unsigned long) le64_to_cpu( + dd->ipath_pioavailregs_dma[im]); + if (test_bit((INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start) % BITS_PER_LONG, &dma)) + __set_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start, dd->ipath_pioavailshadow); + else + __clear_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT + + start, dd->ipath_pioavailshadow); + __set_bit(start, dd->ipath_pioavailkernel); + } else { + __set_bit(start + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT, + dd->ipath_pioavailshadow); + __clear_bit(start, dd->ipath_pioavailkernel); + } + start += 2; + } + + if (dd->ipath_pioupd_thresh) { + end = 2 * (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k); + cnt = bitmap_weight(dd->ipath_pioavailkernel, end); + } + spin_unlock_irqrestore(&ipath_pioavail_lock, flags); + + /* + * When moving buffers from kernel to user, if number assigned to + * the user is less than the pio update threshold, and threshold + * is supported (cnt was computed > 0), drop the update threshold + * so we update at least once per allocated number of buffers. + * In any case, if the kernel buffers are less than the threshold, + * drop the threshold. We don't bother increasing it, having once + * decreased it, since it would typically just cycle back and forth. + * If we don't decrease below buffers in use, we can wait a long + * time for an update, until some other context uses PIO buffers. + */ + if (!avail && len < cnt) + cnt = len; + if (cnt < dd->ipath_pioupd_thresh) { + dd->ipath_pioupd_thresh = cnt; + ipath_dbg("Decreased pio update threshold to %u\n", + dd->ipath_pioupd_thresh); + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~(INFINIPATH_S_UPDTHRESH_MASK + << INFINIPATH_S_UPDTHRESH_SHIFT); + dd->ipath_sendctrl |= dd->ipath_pioupd_thresh + << INFINIPATH_S_UPDTHRESH_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + } +} + +/** + * ipath_create_rcvhdrq - create a receive header queue + * @dd: the infinipath device + * @pd: the port data + * + * this must be contiguous memory (from an i/o perspective), and must be + * DMA'able (which means for some systems, it will go through an IOMMU, + * or be forced into a low address range). + */ +int ipath_create_rcvhdrq(struct ipath_devdata *dd, + struct ipath_portdata *pd) +{ + int ret = 0; + + if (!pd->port_rcvhdrq) { + dma_addr_t phys_hdrqtail; + gfp_t gfp_flags = GFP_USER | __GFP_COMP; + int amt = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize * + sizeof(u32), PAGE_SIZE); + + pd->port_rcvhdrq = dma_alloc_coherent( + &dd->pcidev->dev, amt, &pd->port_rcvhdrq_phys, + gfp_flags); + + if (!pd->port_rcvhdrq) { + ipath_dev_err(dd, "attempt to allocate %d bytes " + "for port %u rcvhdrq failed\n", + amt, pd->port_port); + ret = -ENOMEM; + goto bail; + } + + if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) { + pd->port_rcvhdrtail_kvaddr = dma_alloc_coherent( + &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail, + GFP_KERNEL); + if (!pd->port_rcvhdrtail_kvaddr) { + ipath_dev_err(dd, "attempt to allocate 1 page " + "for port %u rcvhdrqtailaddr " + "failed\n", pd->port_port); + ret = -ENOMEM; + dma_free_coherent(&dd->pcidev->dev, amt, + pd->port_rcvhdrq, + pd->port_rcvhdrq_phys); + pd->port_rcvhdrq = NULL; + goto bail; + } + pd->port_rcvhdrqtailaddr_phys = phys_hdrqtail; + ipath_cdbg(VERBOSE, "port %d hdrtailaddr, %llx " + "physical\n", pd->port_port, + (unsigned long long) phys_hdrqtail); + } + + pd->port_rcvhdrq_size = amt; + + ipath_cdbg(VERBOSE, "%d pages at %p (phys %lx) size=%lu " + "for port %u rcvhdr Q\n", + amt >> PAGE_SHIFT, pd->port_rcvhdrq, + (unsigned long) pd->port_rcvhdrq_phys, + (unsigned long) pd->port_rcvhdrq_size, + pd->port_port); + } + else + ipath_cdbg(VERBOSE, "reuse port %d rcvhdrq @%p %llx phys; " + "hdrtailaddr@%p %llx physical\n", + pd->port_port, pd->port_rcvhdrq, + (unsigned long long) pd->port_rcvhdrq_phys, + pd->port_rcvhdrtail_kvaddr, (unsigned long long) + pd->port_rcvhdrqtailaddr_phys); + + /* clear for security and sanity on each use */ + memset(pd->port_rcvhdrq, 0, pd->port_rcvhdrq_size); + if (pd->port_rcvhdrtail_kvaddr) + memset(pd->port_rcvhdrtail_kvaddr, 0, PAGE_SIZE); + + /* + * tell chip each time we init it, even if we are re-using previous + * memory (we zero the register at process close) + */ + ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdrtailaddr, + pd->port_port, pd->port_rcvhdrqtailaddr_phys); + ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr, + pd->port_port, pd->port_rcvhdrq_phys); + +bail: + return ret; +} + + +/* + * Flush all sends that might be in the ready to send state, as well as any + * that are in the process of being sent. Used whenever we need to be + * sure the send side is idle. Cleans up all buffer state by canceling + * all pio buffers, and issuing an abort, which cleans up anything in the + * launch fifo. The cancel is superfluous on some chip versions, but + * it's safer to always do it. + * PIOAvail bits are updated by the chip as if normal send had happened. + */ +void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl) +{ + unsigned long flags; + + if (dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) { + ipath_cdbg(VERBOSE, "Ignore while in autonegotiation\n"); + goto bail; + } + /* + * If we have SDMA, and it's not disabled, we have to kick off the + * abort state machine, provided we aren't already aborting. + * If we are in the process of aborting SDMA (!DISABLED, but ABORTING), + * we skip the rest of this routine. It is already "in progress" + */ + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) { + int skip_cancel; + unsigned long *statp = &dd->ipath_sdma_status; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + skip_cancel = + test_and_set_bit(IPATH_SDMA_ABORTING, statp) + && !test_bit(IPATH_SDMA_DISABLED, statp); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + if (skip_cancel) + goto bail; + } + + ipath_dbg("Cancelling all in-progress send buffers\n"); + + /* skip armlaunch errs for a while */ + dd->ipath_lastcancel = jiffies + HZ / 2; + + /* + * The abort bit is auto-clearing. We also don't want pioavail + * update happening during this, and we don't want any other + * sends going out, so turn those off for the duration. We read + * the scratch register to be sure that cancels and the abort + * have taken effect in the chip. Otherwise two parts are same + * as ipath_force_pio_avail_update() + */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~(INFINIPATH_S_PIOBUFAVAILUPD + | INFINIPATH_S_PIOENABLE); + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl | INFINIPATH_S_ABORT); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + /* disarm all send buffers */ + ipath_disarm_piobufs(dd, 0, + dd->ipath_piobcnt2k + dd->ipath_piobcnt4k); + + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + set_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status); + + if (restore_sendctrl) { + /* else done by caller later if needed */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl |= INFINIPATH_S_PIOBUFAVAILUPD | + INFINIPATH_S_PIOENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + /* and again, be sure all have hit the chip */ + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + } + + if ((dd->ipath_flags & IPATH_HAS_SEND_DMA) && + !test_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status) && + test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)) { + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + /* only wait so long for intr */ + dd->ipath_sdma_abort_intr_timeout = jiffies + HZ; + dd->ipath_sdma_reset_wait = 200; + if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + tasklet_hi_schedule(&dd->ipath_sdma_abort_task); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + } +bail:; +} + +/* + * Force an update of in-memory copy of the pioavail registers, when + * needed for any of a variety of reasons. We read the scratch register + * to make it highly likely that the update will have happened by the + * time we return. If already off (as in cancel_sends above), this + * routine is a nop, on the assumption that the caller will "do the + * right thing". + */ +void ipath_force_pio_avail_update(struct ipath_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + if (dd->ipath_sendctrl & INFINIPATH_S_PIOBUFAVAILUPD) { + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl & ~INFINIPATH_S_PIOBUFAVAILUPD); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + } + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); +} + +static void ipath_set_ib_lstate(struct ipath_devdata *dd, int linkcmd, + int linitcmd) +{ + u64 mod_wd; + static const char *what[4] = { + [0] = "NOP", + [INFINIPATH_IBCC_LINKCMD_DOWN] = "DOWN", + [INFINIPATH_IBCC_LINKCMD_ARMED] = "ARMED", + [INFINIPATH_IBCC_LINKCMD_ACTIVE] = "ACTIVE" + }; + + if (linitcmd == INFINIPATH_IBCC_LINKINITCMD_DISABLE) { + /* + * If we are told to disable, note that so link-recovery + * code does not attempt to bring us back up. + */ + preempt_disable(); + dd->ipath_flags |= IPATH_IB_LINK_DISABLED; + preempt_enable(); + } else if (linitcmd) { + /* + * Any other linkinitcmd will lead to LINKDOWN and then + * to INIT (if all is well), so clear flag to let + * link-recovery code attempt to bring us back up. + */ + preempt_disable(); + dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED; + preempt_enable(); + } + + mod_wd = (linkcmd << dd->ibcc_lc_shift) | + (linitcmd << INFINIPATH_IBCC_LINKINITCMD_SHIFT); + ipath_cdbg(VERBOSE, + "Moving unit %u to %s (initcmd=0x%x), current ltstate is %s\n", + dd->ipath_unit, what[linkcmd], linitcmd, + ipath_ibcstatus_str[ipath_ib_linktrstate(dd, + ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus))]); + + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl | mod_wd); + /* read from chip so write is flushed */ + (void) ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus); +} + +int ipath_set_linkstate(struct ipath_devdata *dd, u8 newstate) +{ + u32 lstate; + int ret; + + switch (newstate) { + case IPATH_IB_LINKDOWN_ONLY: + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, 0); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINKDOWN: + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, + INFINIPATH_IBCC_LINKINITCMD_POLL); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINKDOWN_SLEEP: + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, + INFINIPATH_IBCC_LINKINITCMD_SLEEP); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINKDOWN_DISABLE: + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, + INFINIPATH_IBCC_LINKINITCMD_DISABLE); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINKARM: + if (dd->ipath_flags & IPATH_LINKARMED) { + ret = 0; + goto bail; + } + if (!(dd->ipath_flags & + (IPATH_LINKINIT | IPATH_LINKACTIVE))) { + ret = -EINVAL; + goto bail; + } + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ARMED, 0); + + /* + * Since the port can transition to ACTIVE by receiving + * a non VL 15 packet, wait for either state. + */ + lstate = IPATH_LINKARMED | IPATH_LINKACTIVE; + break; + + case IPATH_IB_LINKACTIVE: + if (dd->ipath_flags & IPATH_LINKACTIVE) { + ret = 0; + goto bail; + } + if (!(dd->ipath_flags & IPATH_LINKARMED)) { + ret = -EINVAL; + goto bail; + } + ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ACTIVE, 0); + lstate = IPATH_LINKACTIVE; + break; + + case IPATH_IB_LINK_LOOPBACK: + dev_info(&dd->pcidev->dev, "Enabling IB local loopback\n"); + dd->ipath_ibcctrl |= INFINIPATH_IBCC_LOOPBACK; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + + /* turn heartbeat off, as it causes loopback to fail */ + dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, + IPATH_IB_HRTBT_OFF); + /* don't wait */ + ret = 0; + goto bail; + + case IPATH_IB_LINK_EXTERNAL: + dev_info(&dd->pcidev->dev, + "Disabling IB local loopback (normal)\n"); + dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, + IPATH_IB_HRTBT_ON); + dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LOOPBACK; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + /* don't wait */ + ret = 0; + goto bail; + + /* + * Heartbeat can be explicitly enabled by the user via + * "hrtbt_enable" "file", and if disabled, trying to enable here + * will have no effect. Implicit changes (heartbeat off when + * loopback on, and vice versa) are included to ease testing. + */ + case IPATH_IB_LINK_HRTBT: + ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, + IPATH_IB_HRTBT_ON); + goto bail; + + case IPATH_IB_LINK_NO_HRTBT: + ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, + IPATH_IB_HRTBT_OFF); + goto bail; + + default: + ipath_dbg("Invalid linkstate 0x%x requested\n", newstate); + ret = -EINVAL; + goto bail; + } + ret = ipath_wait_linkstate(dd, lstate, 2000); + +bail: + return ret; +} + +/** + * ipath_set_mtu - set the MTU + * @dd: the infinipath device + * @arg: the new MTU + * + * we can handle "any" incoming size, the issue here is whether we + * need to restrict our outgoing size. For now, we don't do any + * sanity checking on this, and we don't deal with what happens to + * programs that are already running when the size changes. + * NOTE: changing the MTU will usually cause the IBC to go back to + * link INIT state... + */ +int ipath_set_mtu(struct ipath_devdata *dd, u16 arg) +{ + u32 piosize; + int changed = 0; + int ret; + + /* + * mtu is IB data payload max. It's the largest power of 2 less + * than piosize (or even larger, since it only really controls the + * largest we can receive; we can send the max of the mtu and + * piosize). We check that it's one of the valid IB sizes. + */ + if (arg != 256 && arg != 512 && arg != 1024 && arg != 2048 && + (arg != 4096 || !ipath_mtu4096)) { + ipath_dbg("Trying to set invalid mtu %u, failing\n", arg); + ret = -EINVAL; + goto bail; + } + if (dd->ipath_ibmtu == arg) { + ret = 0; /* same as current */ + goto bail; + } + + piosize = dd->ipath_ibmaxlen; + dd->ipath_ibmtu = arg; + + if (arg >= (piosize - IPATH_PIO_MAXIBHDR)) { + /* Only if it's not the initial value (or reset to it) */ + if (piosize != dd->ipath_init_ibmaxlen) { + if (arg > piosize && arg <= dd->ipath_init_ibmaxlen) + piosize = dd->ipath_init_ibmaxlen; + dd->ipath_ibmaxlen = piosize; + changed = 1; + } + } else if ((arg + IPATH_PIO_MAXIBHDR) != dd->ipath_ibmaxlen) { + piosize = arg + IPATH_PIO_MAXIBHDR; + ipath_cdbg(VERBOSE, "ibmaxlen was 0x%x, setting to 0x%x " + "(mtu 0x%x)\n", dd->ipath_ibmaxlen, piosize, + arg); + dd->ipath_ibmaxlen = piosize; + changed = 1; + } + + if (changed) { + u64 ibc = dd->ipath_ibcctrl, ibdw; + /* + * update our housekeeping variables, and set IBC max + * size, same as init code; max IBC is max we allow in + * buffer, less the qword pbc, plus 1 for ICRC, in dwords + */ + dd->ipath_ibmaxlen = piosize - 2 * sizeof(u32); + ibdw = (dd->ipath_ibmaxlen >> 2) + 1; + ibc &= ~(INFINIPATH_IBCC_MAXPKTLEN_MASK << + dd->ibcc_mpl_shift); + ibc |= ibdw << dd->ibcc_mpl_shift; + dd->ipath_ibcctrl = ibc; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + dd->ipath_f_tidtemplate(dd); + } + + ret = 0; + +bail: + return ret; +} + +int ipath_set_lid(struct ipath_devdata *dd, u32 lid, u8 lmc) +{ + dd->ipath_lid = lid; + dd->ipath_lmc = lmc; + + dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LIDLMC, lid | + (~((1U << lmc) - 1)) << 16); + + dev_info(&dd->pcidev->dev, "We got a lid: 0x%x\n", lid); + + return 0; +} + + +/** + * ipath_write_kreg_port - write a device's per-port 64-bit kernel register + * @dd: the infinipath device + * @regno: the register number to write + * @port: the port containing the register + * @value: the value to write + * + * Registers that vary with the chip implementation constants (port) + * use this routine. + */ +void ipath_write_kreg_port(const struct ipath_devdata *dd, ipath_kreg regno, + unsigned port, u64 value) +{ + u16 where; + + if (port < dd->ipath_portcnt && + (regno == dd->ipath_kregs->kr_rcvhdraddr || + regno == dd->ipath_kregs->kr_rcvhdrtailaddr)) + where = regno + port; + else + where = -1; + + ipath_write_kreg(dd, where, value); +} + +/* + * Following deal with the "obviously simple" task of overriding the state + * of the LEDS, which normally indicate link physical and logical status. + * The complications arise in dealing with different hardware mappings + * and the board-dependent routine being called from interrupts. + * and then there's the requirement to _flash_ them. + */ +#define LED_OVER_FREQ_SHIFT 8 +#define LED_OVER_FREQ_MASK (0xFF<ipath_flags & IPATH_INITTED)) + return; + + pidx = dd->ipath_led_override_phase++ & 1; + dd->ipath_led_override = dd->ipath_led_override_vals[pidx]; + timeoff = dd->ipath_led_override_timeoff; + + /* + * below potentially restores the LED values per current status, + * should also possibly setup the traffic-blink register, + * but leave that to per-chip functions. + */ + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus); + ltstate = ipath_ib_linktrstate(dd, val); + lstate = ipath_ib_linkstate(dd, val); + + dd->ipath_f_setextled(dd, lstate, ltstate); + mod_timer(&dd->ipath_led_override_timer, jiffies + timeoff); +} + +void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val) +{ + int timeoff, freq; + + if (!(dd->ipath_flags & IPATH_INITTED)) + return; + + /* First check if we are blinking. If not, use 1HZ polling */ + timeoff = HZ; + freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT; + + if (freq) { + /* For blink, set each phase from one nybble of val */ + dd->ipath_led_override_vals[0] = val & 0xF; + dd->ipath_led_override_vals[1] = (val >> 4) & 0xF; + timeoff = (HZ << 4)/freq; + } else { + /* Non-blink set both phases the same. */ + dd->ipath_led_override_vals[0] = val & 0xF; + dd->ipath_led_override_vals[1] = val & 0xF; + } + dd->ipath_led_override_timeoff = timeoff; + + /* + * If the timer has not already been started, do so. Use a "quick" + * timeout so the function will be called soon, to look at our request. + */ + if (atomic_inc_return(&dd->ipath_led_override_timer_active) == 1) { + /* Need to start timer */ + init_timer(&dd->ipath_led_override_timer); + dd->ipath_led_override_timer.function = + ipath_run_led_override; + dd->ipath_led_override_timer.data = (unsigned long) dd; + dd->ipath_led_override_timer.expires = jiffies + 1; + add_timer(&dd->ipath_led_override_timer); + } else + atomic_dec(&dd->ipath_led_override_timer_active); +} + +/** + * ipath_shutdown_device - shut down a device + * @dd: the infinipath device + * + * This is called to make the device quiet when we are about to + * unload the driver, and also when the device is administratively + * disabled. It does not free any data structures. + * Everything it does has to be setup again by ipath_init_chip(dd,1) + */ +void ipath_shutdown_device(struct ipath_devdata *dd) +{ + unsigned long flags; + + ipath_dbg("Shutting down the device\n"); + + ipath_hol_up(dd); /* make sure user processes aren't suspended */ + + dd->ipath_flags |= IPATH_LINKUNK; + dd->ipath_flags &= ~(IPATH_INITTED | IPATH_LINKDOWN | + IPATH_LINKINIT | IPATH_LINKARMED | + IPATH_LINKACTIVE); + *dd->ipath_statusp &= ~(IPATH_STATUS_IB_CONF | + IPATH_STATUS_IB_READY); + + /* mask interrupts, but not errors */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL); + + dd->ipath_rcvctrl = 0; + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + teardown_sdma(dd); + + /* + * gracefully stop all sends allowing any in progress to trickle out + * first. + */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl = 0; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + /* flush it */ + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + /* + * enough for anything that's going to trickle out to have actually + * done so. + */ + udelay(5); + + dd->ipath_f_setextled(dd, 0, 0); /* make sure LEDs are off */ + + ipath_set_ib_lstate(dd, 0, INFINIPATH_IBCC_LINKINITCMD_DISABLE); + ipath_cancel_sends(dd, 0); + + /* + * we are shutting down, so tell components that care. We don't do + * this on just a link state change, much like ethernet, a cable + * unplug, etc. doesn't change driver state + */ + signal_ib_event(dd, IB_EVENT_PORT_ERR); + + /* disable IBC */ + dd->ipath_control &= ~INFINIPATH_C_LINKENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control | INFINIPATH_C_FREEZEMODE); + + /* + * clear SerdesEnable and turn the leds off; do this here because + * we are unloading, so don't count on interrupts to move along + * Turn the LEDs off explicitly for the same reason. + */ + dd->ipath_f_quiet_serdes(dd); + + /* stop all the timers that might still be running */ + del_timer_sync(&dd->ipath_hol_timer); + if (dd->ipath_stats_timer_active) { + del_timer_sync(&dd->ipath_stats_timer); + dd->ipath_stats_timer_active = 0; + } + if (dd->ipath_intrchk_timer.data) { + del_timer_sync(&dd->ipath_intrchk_timer); + dd->ipath_intrchk_timer.data = 0; + } + if (atomic_read(&dd->ipath_led_override_timer_active)) { + del_timer_sync(&dd->ipath_led_override_timer); + atomic_set(&dd->ipath_led_override_timer_active, 0); + } + + /* + * clear all interrupts and errors, so that the next time the driver + * is loaded or device is enabled, we know that whatever is set + * happened while we were unloaded + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, + ~0ULL & ~INFINIPATH_HWE_MEMBISTFAILED); + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL); + + ipath_cdbg(VERBOSE, "Flush time and errors to EEPROM\n"); + ipath_update_eeprom_log(dd); +} + +/** + * ipath_free_pddata - free a port's allocated data + * @dd: the infinipath device + * @pd: the portdata structure + * + * free up any allocated data for a port + * This should not touch anything that would affect a simultaneous + * re-allocation of port data, because it is called after ipath_mutex + * is released (and can be called from reinit as well). + * It should never change any chip state, or global driver state. + * (The only exception to global state is freeing the port0 port0_skbs.) + */ +void ipath_free_pddata(struct ipath_devdata *dd, struct ipath_portdata *pd) +{ + if (!pd) + return; + + if (pd->port_rcvhdrq) { + ipath_cdbg(VERBOSE, "free closed port %d rcvhdrq @ %p " + "(size=%lu)\n", pd->port_port, pd->port_rcvhdrq, + (unsigned long) pd->port_rcvhdrq_size); + dma_free_coherent(&dd->pcidev->dev, pd->port_rcvhdrq_size, + pd->port_rcvhdrq, pd->port_rcvhdrq_phys); + pd->port_rcvhdrq = NULL; + if (pd->port_rcvhdrtail_kvaddr) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + pd->port_rcvhdrtail_kvaddr, + pd->port_rcvhdrqtailaddr_phys); + pd->port_rcvhdrtail_kvaddr = NULL; + } + } + if (pd->port_port && pd->port_rcvegrbuf) { + unsigned e; + + for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) { + void *base = pd->port_rcvegrbuf[e]; + size_t size = pd->port_rcvegrbuf_size; + + ipath_cdbg(VERBOSE, "egrbuf free(%p, %lu), " + "chunk %u/%u\n", base, + (unsigned long) size, + e, pd->port_rcvegrbuf_chunks); + dma_free_coherent(&dd->pcidev->dev, size, + base, pd->port_rcvegrbuf_phys[e]); + } + kfree(pd->port_rcvegrbuf); + pd->port_rcvegrbuf = NULL; + kfree(pd->port_rcvegrbuf_phys); + pd->port_rcvegrbuf_phys = NULL; + pd->port_rcvegrbuf_chunks = 0; + } else if (pd->port_port == 0 && dd->ipath_port0_skbinfo) { + unsigned e; + struct ipath_skbinfo *skbinfo = dd->ipath_port0_skbinfo; + + dd->ipath_port0_skbinfo = NULL; + ipath_cdbg(VERBOSE, "free closed port %d " + "ipath_port0_skbinfo @ %p\n", pd->port_port, + skbinfo); + for (e = 0; e < dd->ipath_p0_rcvegrcnt; e++) + if (skbinfo[e].skb) { + pci_unmap_single(dd->pcidev, skbinfo[e].phys, + dd->ipath_ibmaxlen, + PCI_DMA_FROMDEVICE); + dev_kfree_skb(skbinfo[e].skb); + } + vfree(skbinfo); + } + kfree(pd->port_tid_pg_list); + vfree(pd->subport_uregbase); + vfree(pd->subport_rcvegrbuf); + vfree(pd->subport_rcvhdr_base); + kfree(pd); +} + +static int __init infinipath_init(void) +{ + int ret; + + if (ipath_debug & __IPATH_DBG) + printk(KERN_INFO DRIVER_LOAD_MSG "%s", ib_ipath_version); + + /* + * These must be called before the driver is registered with + * the PCI subsystem. + */ + idr_init(&unit_table); + + ret = pci_register_driver(&ipath_driver); + if (ret < 0) { + printk(KERN_ERR IPATH_DRV_NAME + ": Unable to register driver: error %d\n", -ret); + goto bail_unit; + } + + ret = ipath_init_ipathfs(); + if (ret < 0) { + printk(KERN_ERR IPATH_DRV_NAME ": Unable to create " + "ipathfs: error %d\n", -ret); + goto bail_pci; + } + + goto bail; + +bail_pci: + pci_unregister_driver(&ipath_driver); + +bail_unit: + idr_destroy(&unit_table); + +bail: + return ret; +} + +static void __exit infinipath_cleanup(void) +{ + ipath_exit_ipathfs(); + + ipath_cdbg(VERBOSE, "Unregistering pci driver\n"); + pci_unregister_driver(&ipath_driver); + + idr_destroy(&unit_table); +} + +/** + * ipath_reset_device - reset the chip if possible + * @unit: the device to reset + * + * Whether or not reset is successful, we attempt to re-initialize the chip + * (that is, much like a driver unload/reload). We clear the INITTED flag + * so that the various entry points will fail until we reinitialize. For + * now, we only allow this if no user ports are open that use chip resources + */ +int ipath_reset_device(int unit) +{ + int ret, i; + struct ipath_devdata *dd = ipath_lookup(unit); + unsigned long flags; + + if (!dd) { + ret = -ENODEV; + goto bail; + } + + if (atomic_read(&dd->ipath_led_override_timer_active)) { + /* Need to stop LED timer, _then_ shut off LEDs */ + del_timer_sync(&dd->ipath_led_override_timer); + atomic_set(&dd->ipath_led_override_timer_active, 0); + } + + /* Shut off LEDs after we are sure timer is not running */ + dd->ipath_led_override = LED_OVER_BOTH_OFF; + dd->ipath_f_setextled(dd, 0, 0); + + dev_info(&dd->pcidev->dev, "Reset on unit %u requested\n", unit); + + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) { + dev_info(&dd->pcidev->dev, "Invalid unit number %u or " + "not initialized or not present\n", unit); + ret = -ENXIO; + goto bail; + } + + spin_lock_irqsave(&dd->ipath_uctxt_lock, flags); + if (dd->ipath_pd) + for (i = 1; i < dd->ipath_cfgports; i++) { + if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt) + continue; + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + ipath_dbg("unit %u port %d is in use " + "(PID %u cmd %s), can't reset\n", + unit, i, + pid_nr(dd->ipath_pd[i]->port_pid), + dd->ipath_pd[i]->port_comm); + ret = -EBUSY; + goto bail; + } + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + teardown_sdma(dd); + + dd->ipath_flags &= ~IPATH_INITTED; + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL); + ret = dd->ipath_f_reset(dd); + if (ret == 1) { + ipath_dbg("Reinitializing unit %u after reset attempt\n", + unit); + ret = ipath_init_chip(dd, 1); + } else + ret = -EAGAIN; + if (ret) + ipath_dev_err(dd, "Reinitialize unit %u after " + "reset failed with %d\n", unit, ret); + else + dev_info(&dd->pcidev->dev, "Reinitialized unit %u after " + "resetting\n", unit); + +bail: + return ret; +} + +/* + * send a signal to all the processes that have the driver open + * through the normal interfaces (i.e., everything other than diags + * interface). Returns number of signalled processes. + */ +static int ipath_signal_procs(struct ipath_devdata *dd, int sig) +{ + int i, sub, any = 0; + struct pid *pid; + unsigned long flags; + + if (!dd->ipath_pd) + return 0; + + spin_lock_irqsave(&dd->ipath_uctxt_lock, flags); + for (i = 1; i < dd->ipath_cfgports; i++) { + if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt) + continue; + pid = dd->ipath_pd[i]->port_pid; + if (!pid) + continue; + + dev_info(&dd->pcidev->dev, "context %d in use " + "(PID %u), sending signal %d\n", + i, pid_nr(pid), sig); + kill_pid(pid, sig, 1); + any++; + for (sub = 0; sub < INFINIPATH_MAX_SUBPORT; sub++) { + pid = dd->ipath_pd[i]->port_subpid[sub]; + if (!pid) + continue; + dev_info(&dd->pcidev->dev, "sub-context " + "%d:%d in use (PID %u), sending " + "signal %d\n", i, sub, pid_nr(pid), sig); + kill_pid(pid, sig, 1); + any++; + } + } + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + return any; +} + +static void ipath_hol_signal_down(struct ipath_devdata *dd) +{ + if (ipath_signal_procs(dd, SIGSTOP)) + ipath_dbg("Stopped some processes\n"); + ipath_cancel_sends(dd, 1); +} + + +static void ipath_hol_signal_up(struct ipath_devdata *dd) +{ + if (ipath_signal_procs(dd, SIGCONT)) + ipath_dbg("Continued some processes\n"); +} + +/* + * link is down, stop any users processes, and flush pending sends + * to prevent HoL blocking, then start the HoL timer that + * periodically continues, then stop procs, so they can detect + * link down if they want, and do something about it. + * Timer may already be running, so use mod_timer, not add_timer. + */ +void ipath_hol_down(struct ipath_devdata *dd) +{ + dd->ipath_hol_state = IPATH_HOL_DOWN; + ipath_hol_signal_down(dd); + dd->ipath_hol_next = IPATH_HOL_DOWNCONT; + dd->ipath_hol_timer.expires = jiffies + + msecs_to_jiffies(ipath_hol_timeout_ms); + mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires); +} + +/* + * link is up, continue any user processes, and ensure timer + * is a nop, if running. Let timer keep running, if set; it + * will nop when it sees the link is up + */ +void ipath_hol_up(struct ipath_devdata *dd) +{ + ipath_hol_signal_up(dd); + dd->ipath_hol_state = IPATH_HOL_UP; +} + +/* + * toggle the running/not running state of user proceses + * to prevent HoL blocking on chip resources, but still allow + * user processes to do link down special case handling. + * Should only be called via the timer + */ +void ipath_hol_event(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *)opaque; + + if (dd->ipath_hol_next == IPATH_HOL_DOWNSTOP + && dd->ipath_hol_state != IPATH_HOL_UP) { + dd->ipath_hol_next = IPATH_HOL_DOWNCONT; + ipath_dbg("Stopping processes\n"); + ipath_hol_signal_down(dd); + } else { /* may do "extra" if also in ipath_hol_up() */ + dd->ipath_hol_next = IPATH_HOL_DOWNSTOP; + ipath_dbg("Continuing processes\n"); + ipath_hol_signal_up(dd); + } + if (dd->ipath_hol_state == IPATH_HOL_UP) + ipath_dbg("link's up, don't resched timer\n"); + else { + dd->ipath_hol_timer.expires = jiffies + + msecs_to_jiffies(ipath_hol_timeout_ms); + mod_timer(&dd->ipath_hol_timer, + dd->ipath_hol_timer.expires); + } +} + +int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv) +{ + u64 val; + + if (new_pol_inv > INFINIPATH_XGXS_RX_POL_MASK) + return -1; + if (dd->ipath_rx_pol_inv != new_pol_inv) { + dd->ipath_rx_pol_inv = new_pol_inv; + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig); + val &= ~(INFINIPATH_XGXS_RX_POL_MASK << + INFINIPATH_XGXS_RX_POL_SHIFT); + val |= ((u64)dd->ipath_rx_pol_inv) << + INFINIPATH_XGXS_RX_POL_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val); + } + return 0; +} + +/* + * Disable and enable the armlaunch error. Used for PIO bandwidth testing on + * the 7220, which is count-based, rather than trigger-based. Safe for the + * driver check, since it's at init. Not completely safe when used for + * user-mode checking, since some error checking can be lost, but not + * particularly risky, and only has problematic side-effects in the face of + * very buggy user code. There is no reference counting, but that's also + * fine, given the intended use. + */ +void ipath_enable_armlaunch(struct ipath_devdata *dd) +{ + dd->ipath_lasterror &= ~INFINIPATH_E_SPIOARMLAUNCH; + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, + INFINIPATH_E_SPIOARMLAUNCH); + dd->ipath_errormask |= INFINIPATH_E_SPIOARMLAUNCH; + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); +} + +void ipath_disable_armlaunch(struct ipath_devdata *dd) +{ + /* so don't re-enable if already set */ + dd->ipath_maskederrs &= ~INFINIPATH_E_SPIOARMLAUNCH; + dd->ipath_errormask &= ~INFINIPATH_E_SPIOARMLAUNCH; + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); +} + +module_init(infinipath_init); +module_exit(infinipath_cleanup); diff --git a/drivers/staging/rdma/ipath/ipath_eeprom.c b/drivers/staging/rdma/ipath/ipath_eeprom.c new file mode 100644 index 000000000..fc7181985 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_eeprom.c @@ -0,0 +1,1183 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "ipath_kernel.h" + +/* + * InfiniPath I2C driver for a serial eeprom. This is not a generic + * I2C interface. For a start, the device we're using (Atmel AT24C11) + * doesn't work like a regular I2C device. It looks like one + * electrically, but not logically. Normal I2C devices have a single + * 7-bit or 10-bit I2C address that they respond to. Valid 7-bit + * addresses range from 0x03 to 0x77. Addresses 0x00 to 0x02 and 0x78 + * to 0x7F are special reserved addresses (e.g. 0x00 is the "general + * call" address.) The Atmel device, on the other hand, responds to ALL + * 7-bit addresses. It's designed to be the only device on a given I2C + * bus. A 7-bit address corresponds to the memory address within the + * Atmel device itself. + * + * Also, the timing requirements mean more than simple software + * bitbanging, with readbacks from chip to ensure timing (simple udelay + * is not enough). + * + * This all means that accessing the device is specialized enough + * that using the standard kernel I2C bitbanging interface would be + * impossible. For example, the core I2C eeprom driver expects to find + * a device at one or more of a limited set of addresses only. It doesn't + * allow writing to an eeprom. It also doesn't provide any means of + * accessing eeprom contents from within the kernel, only via sysfs. + */ + +/* Added functionality for IBA7220-based cards */ +#define IPATH_EEPROM_DEV_V1 0xA0 +#define IPATH_EEPROM_DEV_V2 0xA2 +#define IPATH_TEMP_DEV 0x98 +#define IPATH_BAD_DEV (IPATH_EEPROM_DEV_V2+2) +#define IPATH_NO_DEV (0xFF) + +/* + * The number of I2C chains is proliferating. Table below brings + * some order to the madness. The basic principle is that the + * table is scanned from the top, and a "probe" is made to the + * device probe_dev. If that succeeds, the chain is considered + * to be of that type, and dd->i2c_chain_type is set to the index+1 + * of the entry. + * The +1 is so static initialization can mean "unknown, do probe." + */ +static struct i2c_chain_desc { + u8 probe_dev; /* If seen at probe, chain is this type */ + u8 eeprom_dev; /* Dev addr (if any) for EEPROM */ + u8 temp_dev; /* Dev Addr (if any) for Temp-sense */ +} i2c_chains[] = { + { IPATH_BAD_DEV, IPATH_NO_DEV, IPATH_NO_DEV }, /* pre-iba7220 bds */ + { IPATH_EEPROM_DEV_V1, IPATH_EEPROM_DEV_V1, IPATH_TEMP_DEV}, /* V1 */ + { IPATH_EEPROM_DEV_V2, IPATH_EEPROM_DEV_V2, IPATH_TEMP_DEV}, /* V2 */ + { IPATH_NO_DEV } +}; + +enum i2c_type { + i2c_line_scl = 0, + i2c_line_sda +}; + +enum i2c_state { + i2c_line_low = 0, + i2c_line_high +}; + +#define READ_CMD 1 +#define WRITE_CMD 0 + +/** + * i2c_gpio_set - set a GPIO line + * @dd: the infinipath device + * @line: the line to set + * @new_line_state: the state to set + * + * Returns 0 if the line was set to the new state successfully, non-zero + * on error. + */ +static int i2c_gpio_set(struct ipath_devdata *dd, + enum i2c_type line, + enum i2c_state new_line_state) +{ + u64 out_mask, dir_mask, *gpioval; + unsigned long flags = 0; + + gpioval = &dd->ipath_gpio_out; + + if (line == i2c_line_scl) { + dir_mask = dd->ipath_gpio_scl; + out_mask = (1UL << dd->ipath_gpio_scl_num); + } else { + dir_mask = dd->ipath_gpio_sda; + out_mask = (1UL << dd->ipath_gpio_sda_num); + } + + spin_lock_irqsave(&dd->ipath_gpio_lock, flags); + if (new_line_state == i2c_line_high) { + /* tri-state the output rather than force high */ + dd->ipath_extctrl &= ~dir_mask; + } else { + /* config line to be an output */ + dd->ipath_extctrl |= dir_mask; + } + ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl); + + /* set output as well (no real verify) */ + if (new_line_state == i2c_line_high) + *gpioval |= out_mask; + else + *gpioval &= ~out_mask; + + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_out, *gpioval); + spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags); + + return 0; +} + +/** + * i2c_gpio_get - get a GPIO line state + * @dd: the infinipath device + * @line: the line to get + * @curr_statep: where to put the line state + * + * Returns 0 if the line was set to the new state successfully, non-zero + * on error. curr_state is not set on error. + */ +static int i2c_gpio_get(struct ipath_devdata *dd, + enum i2c_type line, + enum i2c_state *curr_statep) +{ + u64 read_val, mask; + int ret; + unsigned long flags = 0; + + /* check args */ + if (curr_statep == NULL) { + ret = 1; + goto bail; + } + + /* config line to be an input */ + if (line == i2c_line_scl) + mask = dd->ipath_gpio_scl; + else + mask = dd->ipath_gpio_sda; + + spin_lock_irqsave(&dd->ipath_gpio_lock, flags); + dd->ipath_extctrl &= ~mask; + ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl); + /* + * Below is very unlikely to reflect true input state if Output + * Enable actually changed. + */ + read_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus); + spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags); + + if (read_val & mask) + *curr_statep = i2c_line_high; + else + *curr_statep = i2c_line_low; + + ret = 0; + +bail: + return ret; +} + +/** + * i2c_wait_for_writes - wait for a write + * @dd: the infinipath device + * + * We use this instead of udelay directly, so we can make sure + * that previous register writes have been flushed all the way + * to the chip. Since we are delaying anyway, the cost doesn't + * hurt, and makes the bit twiddling more regular + */ +static void i2c_wait_for_writes(struct ipath_devdata *dd) +{ + (void)ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch); + rmb(); +} + +static void scl_out(struct ipath_devdata *dd, u8 bit) +{ + udelay(1); + i2c_gpio_set(dd, i2c_line_scl, bit ? i2c_line_high : i2c_line_low); + + i2c_wait_for_writes(dd); +} + +static void sda_out(struct ipath_devdata *dd, u8 bit) +{ + i2c_gpio_set(dd, i2c_line_sda, bit ? i2c_line_high : i2c_line_low); + + i2c_wait_for_writes(dd); +} + +static u8 sda_in(struct ipath_devdata *dd, int wait) +{ + enum i2c_state bit; + + if (i2c_gpio_get(dd, i2c_line_sda, &bit)) + ipath_dbg("get bit failed!\n"); + + if (wait) + i2c_wait_for_writes(dd); + + return bit == i2c_line_high ? 1U : 0; +} + +/** + * i2c_ackrcv - see if ack following write is true + * @dd: the infinipath device + */ +static int i2c_ackrcv(struct ipath_devdata *dd) +{ + u8 ack_received; + + /* AT ENTRY SCL = LOW */ + /* change direction, ignore data */ + ack_received = sda_in(dd, 1); + scl_out(dd, i2c_line_high); + ack_received = sda_in(dd, 1) == 0; + scl_out(dd, i2c_line_low); + return ack_received; +} + +/** + * rd_byte - read a byte, leaving ACK, STOP, etc up to caller + * @dd: the infinipath device + * + * Returns byte shifted out of device + */ +static int rd_byte(struct ipath_devdata *dd) +{ + int bit_cntr, data; + + data = 0; + + for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) { + data <<= 1; + scl_out(dd, i2c_line_high); + data |= sda_in(dd, 0); + scl_out(dd, i2c_line_low); + } + return data; +} + +/** + * wr_byte - write a byte, one bit at a time + * @dd: the infinipath device + * @data: the byte to write + * + * Returns 0 if we got the following ack, otherwise 1 + */ +static int wr_byte(struct ipath_devdata *dd, u8 data) +{ + int bit_cntr; + u8 bit; + + for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) { + bit = (data >> bit_cntr) & 1; + sda_out(dd, bit); + scl_out(dd, i2c_line_high); + scl_out(dd, i2c_line_low); + } + return (!i2c_ackrcv(dd)) ? 1 : 0; +} + +static void send_ack(struct ipath_devdata *dd) +{ + sda_out(dd, i2c_line_low); + scl_out(dd, i2c_line_high); + scl_out(dd, i2c_line_low); + sda_out(dd, i2c_line_high); +} + +/** + * i2c_startcmd - transmit the start condition, followed by address/cmd + * @dd: the infinipath device + * @offset_dir: direction byte + * + * (both clock/data high, clock high, data low while clock is high) + */ +static int i2c_startcmd(struct ipath_devdata *dd, u8 offset_dir) +{ + int res; + + /* issue start sequence */ + sda_out(dd, i2c_line_high); + scl_out(dd, i2c_line_high); + sda_out(dd, i2c_line_low); + scl_out(dd, i2c_line_low); + + /* issue length and direction byte */ + res = wr_byte(dd, offset_dir); + + if (res) + ipath_cdbg(VERBOSE, "No ack to complete start\n"); + + return res; +} + +/** + * stop_cmd - transmit the stop condition + * @dd: the infinipath device + * + * (both clock/data low, clock high, data high while clock is high) + */ +static void stop_cmd(struct ipath_devdata *dd) +{ + scl_out(dd, i2c_line_low); + sda_out(dd, i2c_line_low); + scl_out(dd, i2c_line_high); + sda_out(dd, i2c_line_high); + udelay(2); +} + +/** + * eeprom_reset - reset I2C communication + * @dd: the infinipath device + */ + +static int eeprom_reset(struct ipath_devdata *dd) +{ + int clock_cycles_left = 9; + u64 *gpioval = &dd->ipath_gpio_out; + int ret; + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_gpio_lock, flags); + /* Make sure shadows are consistent */ + dd->ipath_extctrl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extctrl); + *gpioval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_out); + spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags); + + ipath_cdbg(VERBOSE, "Resetting i2c eeprom; initial gpioout reg " + "is %llx\n", (unsigned long long) *gpioval); + + /* + * This is to get the i2c into a known state, by first going low, + * then tristate sda (and then tristate scl as first thing + * in loop) + */ + scl_out(dd, i2c_line_low); + sda_out(dd, i2c_line_high); + + /* Clock up to 9 cycles looking for SDA hi, then issue START and STOP */ + while (clock_cycles_left--) { + scl_out(dd, i2c_line_high); + + /* SDA seen high, issue START by dropping it while SCL high */ + if (sda_in(dd, 0)) { + sda_out(dd, i2c_line_low); + scl_out(dd, i2c_line_low); + /* ATMEL spec says must be followed by STOP. */ + scl_out(dd, i2c_line_high); + sda_out(dd, i2c_line_high); + ret = 0; + goto bail; + } + + scl_out(dd, i2c_line_low); + } + + ret = 1; + +bail: + return ret; +} + +/* + * Probe for I2C device at specified address. Returns 0 for "success" + * to match rest of this file. + * Leave bus in "reasonable" state for further commands. + */ +static int i2c_probe(struct ipath_devdata *dd, int devaddr) +{ + int ret = 0; + + ret = eeprom_reset(dd); + if (ret) { + ipath_dev_err(dd, "Failed reset probing device 0x%02X\n", + devaddr); + return ret; + } + /* + * Reset no longer leaves bus in start condition, so normal + * i2c_startcmd() will do. + */ + ret = i2c_startcmd(dd, devaddr | READ_CMD); + if (ret) + ipath_cdbg(VERBOSE, "Failed startcmd for device 0x%02X\n", + devaddr); + else { + /* + * Device did respond. Complete a single-byte read, because some + * devices apparently cannot handle STOP immediately after they + * ACK the start-cmd. + */ + int data; + data = rd_byte(dd); + stop_cmd(dd); + ipath_cdbg(VERBOSE, "Response from device 0x%02X\n", devaddr); + } + return ret; +} + +/* + * Returns the "i2c type". This is a pointer to a struct that describes + * the I2C chain on this board. To minimize impact on struct ipath_devdata, + * the (small integer) index into the table is actually memoized, rather + * then the pointer. + * Memoization is because the type is determined on the first call per chip. + * An alternative would be to move type determination to early + * init code. + */ +static struct i2c_chain_desc *ipath_i2c_type(struct ipath_devdata *dd) +{ + int idx; + + /* Get memoized index, from previous successful probes */ + idx = dd->ipath_i2c_chain_type - 1; + if (idx >= 0 && idx < (ARRAY_SIZE(i2c_chains) - 1)) + goto done; + + idx = 0; + while (i2c_chains[idx].probe_dev != IPATH_NO_DEV) { + /* if probe succeeds, this is type */ + if (!i2c_probe(dd, i2c_chains[idx].probe_dev)) + break; + ++idx; + } + + /* + * Old EEPROM (first entry) may require a reset after probe, + * rather than being able to "start" after "stop" + */ + if (idx == 0) + eeprom_reset(dd); + + if (i2c_chains[idx].probe_dev == IPATH_NO_DEV) + idx = -1; + else + dd->ipath_i2c_chain_type = idx + 1; +done: + return (idx >= 0) ? i2c_chains + idx : NULL; +} + +static int ipath_eeprom_internal_read(struct ipath_devdata *dd, + u8 eeprom_offset, void *buffer, int len) +{ + int ret; + struct i2c_chain_desc *icd; + u8 *bp = buffer; + + ret = 1; + icd = ipath_i2c_type(dd); + if (!icd) + goto bail; + + if (icd->eeprom_dev == IPATH_NO_DEV) { + /* legacy not-really-I2C */ + ipath_cdbg(VERBOSE, "Start command only address\n"); + eeprom_offset = (eeprom_offset << 1) | READ_CMD; + ret = i2c_startcmd(dd, eeprom_offset); + } else { + /* Actual I2C */ + ipath_cdbg(VERBOSE, "Start command uses devaddr\n"); + if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) { + ipath_dbg("Failed EEPROM startcmd\n"); + stop_cmd(dd); + ret = 1; + goto bail; + } + ret = wr_byte(dd, eeprom_offset); + stop_cmd(dd); + if (ret) { + ipath_dev_err(dd, "Failed to write EEPROM address\n"); + ret = 1; + goto bail; + } + ret = i2c_startcmd(dd, icd->eeprom_dev | READ_CMD); + } + if (ret) { + ipath_dbg("Failed startcmd for dev %02X\n", icd->eeprom_dev); + stop_cmd(dd); + ret = 1; + goto bail; + } + + /* + * eeprom keeps clocking data out as long as we ack, automatically + * incrementing the address. + */ + while (len-- > 0) { + /* get and store data */ + *bp++ = rd_byte(dd); + /* send ack if not the last byte */ + if (len) + send_ack(dd); + } + + stop_cmd(dd); + + ret = 0; + +bail: + return ret; +} + +static int ipath_eeprom_internal_write(struct ipath_devdata *dd, u8 eeprom_offset, + const void *buffer, int len) +{ + int sub_len; + const u8 *bp = buffer; + int max_wait_time, i; + int ret; + struct i2c_chain_desc *icd; + + ret = 1; + icd = ipath_i2c_type(dd); + if (!icd) + goto bail; + + while (len > 0) { + if (icd->eeprom_dev == IPATH_NO_DEV) { + if (i2c_startcmd(dd, + (eeprom_offset << 1) | WRITE_CMD)) { + ipath_dbg("Failed to start cmd offset %u\n", + eeprom_offset); + goto failed_write; + } + } else { + /* Real I2C */ + if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) { + ipath_dbg("Failed EEPROM startcmd\n"); + goto failed_write; + } + ret = wr_byte(dd, eeprom_offset); + if (ret) { + ipath_dev_err(dd, "Failed to write EEPROM " + "address\n"); + goto failed_write; + } + } + + sub_len = min(len, 4); + eeprom_offset += sub_len; + len -= sub_len; + + for (i = 0; i < sub_len; i++) { + if (wr_byte(dd, *bp++)) { + ipath_dbg("no ack after byte %u/%u (%u " + "total remain)\n", i, sub_len, + len + sub_len - i); + goto failed_write; + } + } + + stop_cmd(dd); + + /* + * wait for write complete by waiting for a successful + * read (the chip replies with a zero after the write + * cmd completes, and before it writes to the eeprom. + * The startcmd for the read will fail the ack until + * the writes have completed. We do this inline to avoid + * the debug prints that are in the real read routine + * if the startcmd fails. + * We also use the proper device address, so it doesn't matter + * whether we have real eeprom_dev. legacy likes any address. + */ + max_wait_time = 100; + while (i2c_startcmd(dd, icd->eeprom_dev | READ_CMD)) { + stop_cmd(dd); + if (!--max_wait_time) { + ipath_dbg("Did not get successful read to " + "complete write\n"); + goto failed_write; + } + } + /* now read (and ignore) the resulting byte */ + rd_byte(dd); + stop_cmd(dd); + } + + ret = 0; + goto bail; + +failed_write: + stop_cmd(dd); + ret = 1; + +bail: + return ret; +} + +/** + * ipath_eeprom_read - receives bytes from the eeprom via I2C + * @dd: the infinipath device + * @eeprom_offset: address to read from + * @buffer: where to store result + * @len: number of bytes to receive + */ +int ipath_eeprom_read(struct ipath_devdata *dd, u8 eeprom_offset, + void *buff, int len) +{ + int ret; + + ret = mutex_lock_interruptible(&dd->ipath_eep_lock); + if (!ret) { + ret = ipath_eeprom_internal_read(dd, eeprom_offset, buff, len); + mutex_unlock(&dd->ipath_eep_lock); + } + + return ret; +} + +/** + * ipath_eeprom_write - writes data to the eeprom via I2C + * @dd: the infinipath device + * @eeprom_offset: where to place data + * @buffer: data to write + * @len: number of bytes to write + */ +int ipath_eeprom_write(struct ipath_devdata *dd, u8 eeprom_offset, + const void *buff, int len) +{ + int ret; + + ret = mutex_lock_interruptible(&dd->ipath_eep_lock); + if (!ret) { + ret = ipath_eeprom_internal_write(dd, eeprom_offset, buff, len); + mutex_unlock(&dd->ipath_eep_lock); + } + + return ret; +} + +static u8 flash_csum(struct ipath_flash *ifp, int adjust) +{ + u8 *ip = (u8 *) ifp; + u8 csum = 0, len; + + /* + * Limit length checksummed to max length of actual data. + * Checksum of erased eeprom will still be bad, but we avoid + * reading past the end of the buffer we were passed. + */ + len = ifp->if_length; + if (len > sizeof(struct ipath_flash)) + len = sizeof(struct ipath_flash); + while (len--) + csum += *ip++; + csum -= ifp->if_csum; + csum = ~csum; + if (adjust) + ifp->if_csum = csum; + + return csum; +} + +/** + * ipath_get_guid - get the GUID from the i2c device + * @dd: the infinipath device + * + * We have the capability to use the ipath_nguid field, and get + * the guid from the first chip's flash, to use for all of them. + */ +void ipath_get_eeprom_info(struct ipath_devdata *dd) +{ + void *buf; + struct ipath_flash *ifp; + __be64 guid; + int len, eep_stat; + u8 csum, *bguid; + int t = dd->ipath_unit; + struct ipath_devdata *dd0 = ipath_lookup(0); + + if (t && dd0->ipath_nguid > 1 && t <= dd0->ipath_nguid) { + u8 oguid; + dd->ipath_guid = dd0->ipath_guid; + bguid = (u8 *) & dd->ipath_guid; + + oguid = bguid[7]; + bguid[7] += t; + if (oguid > bguid[7]) { + if (bguid[6] == 0xff) { + if (bguid[5] == 0xff) { + ipath_dev_err( + dd, + "Can't set %s GUID from " + "base, wraps to OUI!\n", + ipath_get_unit_name(t)); + dd->ipath_guid = 0; + goto bail; + } + bguid[5]++; + } + bguid[6]++; + } + dd->ipath_nguid = 1; + + ipath_dbg("nguid %u, so adding %u to device 0 guid, " + "for %llx\n", + dd0->ipath_nguid, t, + (unsigned long long) be64_to_cpu(dd->ipath_guid)); + goto bail; + } + + /* + * read full flash, not just currently used part, since it may have + * been written with a newer definition + * */ + len = sizeof(struct ipath_flash); + buf = vmalloc(len); + if (!buf) { + ipath_dev_err(dd, "Couldn't allocate memory to read %u " + "bytes from eeprom for GUID\n", len); + goto bail; + } + + mutex_lock(&dd->ipath_eep_lock); + eep_stat = ipath_eeprom_internal_read(dd, 0, buf, len); + mutex_unlock(&dd->ipath_eep_lock); + + if (eep_stat) { + ipath_dev_err(dd, "Failed reading GUID from eeprom\n"); + goto done; + } + ifp = (struct ipath_flash *)buf; + + csum = flash_csum(ifp, 0); + if (csum != ifp->if_csum) { + dev_info(&dd->pcidev->dev, "Bad I2C flash checksum: " + "0x%x, not 0x%x\n", csum, ifp->if_csum); + goto done; + } + if (*(__be64 *) ifp->if_guid == cpu_to_be64(0) || + *(__be64 *) ifp->if_guid == ~cpu_to_be64(0)) { + ipath_dev_err(dd, "Invalid GUID %llx from flash; " + "ignoring\n", + *(unsigned long long *) ifp->if_guid); + /* don't allow GUID if all 0 or all 1's */ + goto done; + } + + /* complain, but allow it */ + if (*(u64 *) ifp->if_guid == 0x100007511000000ULL) + dev_info(&dd->pcidev->dev, "Warning, GUID %llx is " + "default, probably not correct!\n", + *(unsigned long long *) ifp->if_guid); + + bguid = ifp->if_guid; + if (!bguid[0] && !bguid[1] && !bguid[2]) { + /* original incorrect GUID format in flash; fix in + * core copy, by shifting up 2 octets; don't need to + * change top octet, since both it and shifted are + * 0.. */ + bguid[1] = bguid[3]; + bguid[2] = bguid[4]; + bguid[3] = bguid[4] = 0; + guid = *(__be64 *) ifp->if_guid; + ipath_cdbg(VERBOSE, "Old GUID format in flash, top 3 zero, " + "shifting 2 octets\n"); + } else + guid = *(__be64 *) ifp->if_guid; + dd->ipath_guid = guid; + dd->ipath_nguid = ifp->if_numguid; + /* + * Things are slightly complicated by the desire to transparently + * support both the Pathscale 10-digit serial number and the QLogic + * 13-character version. + */ + if ((ifp->if_fversion > 1) && ifp->if_sprefix[0] + && ((u8 *)ifp->if_sprefix)[0] != 0xFF) { + /* This board has a Serial-prefix, which is stored + * elsewhere for backward-compatibility. + */ + char *snp = dd->ipath_serial; + memcpy(snp, ifp->if_sprefix, sizeof ifp->if_sprefix); + snp[sizeof ifp->if_sprefix] = '\0'; + len = strlen(snp); + snp += len; + len = (sizeof dd->ipath_serial) - len; + if (len > sizeof ifp->if_serial) { + len = sizeof ifp->if_serial; + } + memcpy(snp, ifp->if_serial, len); + } else + memcpy(dd->ipath_serial, ifp->if_serial, + sizeof ifp->if_serial); + if (!strstr(ifp->if_comment, "Tested successfully")) + ipath_dev_err(dd, "Board SN %s did not pass functional " + "test: %s\n", dd->ipath_serial, + ifp->if_comment); + + ipath_cdbg(VERBOSE, "Initted GUID to %llx from eeprom\n", + (unsigned long long) be64_to_cpu(dd->ipath_guid)); + + memcpy(&dd->ipath_eep_st_errs, &ifp->if_errcntp, IPATH_EEP_LOG_CNT); + /* + * Power-on (actually "active") hours are kept as little-endian value + * in EEPROM, but as seconds in a (possibly as small as 24-bit) + * atomic_t while running. + */ + atomic_set(&dd->ipath_active_time, 0); + dd->ipath_eep_hrs = ifp->if_powerhour[0] | (ifp->if_powerhour[1] << 8); + +done: + vfree(buf); + +bail:; +} + +/** + * ipath_update_eeprom_log - copy active-time and error counters to eeprom + * @dd: the infinipath device + * + * Although the time is kept as seconds in the ipath_devdata struct, it is + * rounded to hours for re-write, as we have only 16 bits in EEPROM. + * First-cut code reads whole (expected) struct ipath_flash, modifies, + * re-writes. Future direction: read/write only what we need, assuming + * that the EEPROM had to have been "good enough" for driver init, and + * if not, we aren't making it worse. + * + */ + +int ipath_update_eeprom_log(struct ipath_devdata *dd) +{ + void *buf; + struct ipath_flash *ifp; + int len, hi_water; + uint32_t new_time, new_hrs; + u8 csum; + int ret, idx; + unsigned long flags; + + /* first, check if we actually need to do anything. */ + ret = 0; + for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) { + if (dd->ipath_eep_st_new_errs[idx]) { + ret = 1; + break; + } + } + new_time = atomic_read(&dd->ipath_active_time); + + if (ret == 0 && new_time < 3600) + return 0; + + /* + * The quick-check above determined that there is something worthy + * of logging, so get current contents and do a more detailed idea. + * read full flash, not just currently used part, since it may have + * been written with a newer definition + */ + len = sizeof(struct ipath_flash); + buf = vmalloc(len); + ret = 1; + if (!buf) { + ipath_dev_err(dd, "Couldn't allocate memory to read %u " + "bytes from eeprom for logging\n", len); + goto bail; + } + + /* Grab semaphore and read current EEPROM. If we get an + * error, let go, but if not, keep it until we finish write. + */ + ret = mutex_lock_interruptible(&dd->ipath_eep_lock); + if (ret) { + ipath_dev_err(dd, "Unable to acquire EEPROM for logging\n"); + goto free_bail; + } + ret = ipath_eeprom_internal_read(dd, 0, buf, len); + if (ret) { + mutex_unlock(&dd->ipath_eep_lock); + ipath_dev_err(dd, "Unable read EEPROM for logging\n"); + goto free_bail; + } + ifp = (struct ipath_flash *)buf; + + csum = flash_csum(ifp, 0); + if (csum != ifp->if_csum) { + mutex_unlock(&dd->ipath_eep_lock); + ipath_dev_err(dd, "EEPROM cks err (0x%02X, S/B 0x%02X)\n", + csum, ifp->if_csum); + ret = 1; + goto free_bail; + } + hi_water = 0; + spin_lock_irqsave(&dd->ipath_eep_st_lock, flags); + for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) { + int new_val = dd->ipath_eep_st_new_errs[idx]; + if (new_val) { + /* + * If we have seen any errors, add to EEPROM values + * We need to saturate at 0xFF (255) and we also + * would need to adjust the checksum if we were + * trying to minimize EEPROM traffic + * Note that we add to actual current count in EEPROM, + * in case it was altered while we were running. + */ + new_val += ifp->if_errcntp[idx]; + if (new_val > 0xFF) + new_val = 0xFF; + if (ifp->if_errcntp[idx] != new_val) { + ifp->if_errcntp[idx] = new_val; + hi_water = offsetof(struct ipath_flash, + if_errcntp) + idx; + } + /* + * update our shadow (used to minimize EEPROM + * traffic), to match what we are about to write. + */ + dd->ipath_eep_st_errs[idx] = new_val; + dd->ipath_eep_st_new_errs[idx] = 0; + } + } + /* + * now update active-time. We would like to round to the nearest hour + * but unless atomic_t are sure to be proper signed ints we cannot, + * because we need to account for what we "transfer" to EEPROM and + * if we log an hour at 31 minutes, then we would need to set + * active_time to -29 to accurately count the _next_ hour. + */ + if (new_time >= 3600) { + new_hrs = new_time / 3600; + atomic_sub((new_hrs * 3600), &dd->ipath_active_time); + new_hrs += dd->ipath_eep_hrs; + if (new_hrs > 0xFFFF) + new_hrs = 0xFFFF; + dd->ipath_eep_hrs = new_hrs; + if ((new_hrs & 0xFF) != ifp->if_powerhour[0]) { + ifp->if_powerhour[0] = new_hrs & 0xFF; + hi_water = offsetof(struct ipath_flash, if_powerhour); + } + if ((new_hrs >> 8) != ifp->if_powerhour[1]) { + ifp->if_powerhour[1] = new_hrs >> 8; + hi_water = offsetof(struct ipath_flash, if_powerhour) + + 1; + } + } + /* + * There is a tiny possibility that we could somehow fail to write + * the EEPROM after updating our shadows, but problems from holding + * the spinlock too long are a much bigger issue. + */ + spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags); + if (hi_water) { + /* we made some change to the data, uopdate cksum and write */ + csum = flash_csum(ifp, 1); + ret = ipath_eeprom_internal_write(dd, 0, buf, hi_water + 1); + } + mutex_unlock(&dd->ipath_eep_lock); + if (ret) + ipath_dev_err(dd, "Failed updating EEPROM\n"); + +free_bail: + vfree(buf); +bail: + return ret; + +} + +/** + * ipath_inc_eeprom_err - increment one of the four error counters + * that are logged to EEPROM. + * @dd: the infinipath device + * @eidx: 0..3, the counter to increment + * @incr: how much to add + * + * Each counter is 8-bits, and saturates at 255 (0xFF). They + * are copied to the EEPROM (aka flash) whenever ipath_update_eeprom_log() + * is called, but it can only be called in a context that allows sleep. + * This function can be called even at interrupt level. + */ + +void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr) +{ + uint new_val; + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_eep_st_lock, flags); + new_val = dd->ipath_eep_st_new_errs[eidx] + incr; + if (new_val > 255) + new_val = 255; + dd->ipath_eep_st_new_errs[eidx] = new_val; + spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags); + return; +} + +static int ipath_tempsense_internal_read(struct ipath_devdata *dd, u8 regnum) +{ + int ret; + struct i2c_chain_desc *icd; + + ret = -ENOENT; + + icd = ipath_i2c_type(dd); + if (!icd) + goto bail; + + if (icd->temp_dev == IPATH_NO_DEV) { + /* tempsense only exists on new, real-I2C boards */ + ret = -ENXIO; + goto bail; + } + + if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) { + ipath_dbg("Failed tempsense startcmd\n"); + stop_cmd(dd); + ret = -ENXIO; + goto bail; + } + ret = wr_byte(dd, regnum); + stop_cmd(dd); + if (ret) { + ipath_dev_err(dd, "Failed tempsense WR command %02X\n", + regnum); + ret = -ENXIO; + goto bail; + } + if (i2c_startcmd(dd, icd->temp_dev | READ_CMD)) { + ipath_dbg("Failed tempsense RD startcmd\n"); + stop_cmd(dd); + ret = -ENXIO; + goto bail; + } + /* + * We can only clock out one byte per command, sensibly + */ + ret = rd_byte(dd); + stop_cmd(dd); + +bail: + return ret; +} + +#define VALID_TS_RD_REG_MASK 0xBF + +/** + * ipath_tempsense_read - read register of temp sensor via I2C + * @dd: the infinipath device + * @regnum: register to read from + * + * returns reg contents (0..255) or < 0 for error + */ +int ipath_tempsense_read(struct ipath_devdata *dd, u8 regnum) +{ + int ret; + + if (regnum > 7) + return -EINVAL; + + /* return a bogus value for (the one) register we do not have */ + if (!((1 << regnum) & VALID_TS_RD_REG_MASK)) + return 0; + + ret = mutex_lock_interruptible(&dd->ipath_eep_lock); + if (!ret) { + ret = ipath_tempsense_internal_read(dd, regnum); + mutex_unlock(&dd->ipath_eep_lock); + } + + /* + * There are three possibilities here: + * ret is actual value (0..255) + * ret is -ENXIO or -EINVAL from code in this file + * ret is -EINTR from mutex_lock_interruptible. + */ + return ret; +} + +static int ipath_tempsense_internal_write(struct ipath_devdata *dd, + u8 regnum, u8 data) +{ + int ret = -ENOENT; + struct i2c_chain_desc *icd; + + icd = ipath_i2c_type(dd); + if (!icd) + goto bail; + + if (icd->temp_dev == IPATH_NO_DEV) { + /* tempsense only exists on new, real-I2C boards */ + ret = -ENXIO; + goto bail; + } + if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) { + ipath_dbg("Failed tempsense startcmd\n"); + stop_cmd(dd); + ret = -ENXIO; + goto bail; + } + ret = wr_byte(dd, regnum); + if (ret) { + stop_cmd(dd); + ipath_dev_err(dd, "Failed to write tempsense command %02X\n", + regnum); + ret = -ENXIO; + goto bail; + } + ret = wr_byte(dd, data); + stop_cmd(dd); + ret = i2c_startcmd(dd, icd->temp_dev | READ_CMD); + if (ret) { + ipath_dev_err(dd, "Failed tempsense data wrt to %02X\n", + regnum); + ret = -ENXIO; + } + +bail: + return ret; +} + +#define VALID_TS_WR_REG_MASK ((1 << 9) | (1 << 0xB) | (1 << 0xD)) + +/** + * ipath_tempsense_write - write register of temp sensor via I2C + * @dd: the infinipath device + * @regnum: register to write + * @data: data to write + * + * returns 0 for success or < 0 for error + */ +int ipath_tempsense_write(struct ipath_devdata *dd, u8 regnum, u8 data) +{ + int ret; + + if (regnum > 15 || !((1 << regnum) & VALID_TS_WR_REG_MASK)) + return -EINVAL; + + ret = mutex_lock_interruptible(&dd->ipath_eep_lock); + if (!ret) { + ret = ipath_tempsense_internal_write(dd, regnum, data); + mutex_unlock(&dd->ipath_eep_lock); + } + + /* + * There are three possibilities here: + * ret is 0 for success + * ret is -ENXIO or -EINVAL from code in this file + * ret is -EINTR from mutex_lock_interruptible. + */ + return ret; +} diff --git a/drivers/staging/rdma/ipath/ipath_file_ops.c b/drivers/staging/rdma/ipath/ipath_file_ops.c new file mode 100644 index 000000000..450d15965 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_file_ops.c @@ -0,0 +1,2620 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" +#include "ipath_common.h" +#include "ipath_user_sdma.h" + +static int ipath_open(struct inode *, struct file *); +static int ipath_close(struct inode *, struct file *); +static ssize_t ipath_write(struct file *, const char __user *, size_t, + loff_t *); +static ssize_t ipath_write_iter(struct kiocb *, struct iov_iter *from); +static unsigned int ipath_poll(struct file *, struct poll_table_struct *); +static int ipath_mmap(struct file *, struct vm_area_struct *); + +/* + * This is really, really weird shit - write() and writev() here + * have completely unrelated semantics. Sucky userland ABI, + * film at 11. + */ +static const struct file_operations ipath_file_ops = { + .owner = THIS_MODULE, + .write = ipath_write, + .write_iter = ipath_write_iter, + .open = ipath_open, + .release = ipath_close, + .poll = ipath_poll, + .mmap = ipath_mmap, + .llseek = noop_llseek, +}; + +/* + * Convert kernel virtual addresses to physical addresses so they don't + * potentially conflict with the chip addresses used as mmap offsets. + * It doesn't really matter what mmap offset we use as long as we can + * interpret it correctly. + */ +static u64 cvt_kvaddr(void *p) +{ + struct page *page; + u64 paddr = 0; + + page = vmalloc_to_page(p); + if (page) + paddr = page_to_pfn(page) << PAGE_SHIFT; + + return paddr; +} + +static int ipath_get_base_info(struct file *fp, + void __user *ubase, size_t ubase_size) +{ + struct ipath_portdata *pd = port_fp(fp); + int ret = 0; + struct ipath_base_info *kinfo = NULL; + struct ipath_devdata *dd = pd->port_dd; + unsigned subport_cnt; + int shared, master; + size_t sz; + + subport_cnt = pd->port_subport_cnt; + if (!subport_cnt) { + shared = 0; + master = 0; + subport_cnt = 1; + } else { + shared = 1; + master = !subport_fp(fp); + } + + sz = sizeof(*kinfo); + /* If port sharing is not requested, allow the old size structure */ + if (!shared) + sz -= 7 * sizeof(u64); + if (ubase_size < sz) { + ipath_cdbg(PROC, + "Base size %zu, need %zu (version mismatch?)\n", + ubase_size, sz); + ret = -EINVAL; + goto bail; + } + + kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL); + if (kinfo == NULL) { + ret = -ENOMEM; + goto bail; + } + + ret = dd->ipath_f_get_base_info(pd, kinfo); + if (ret < 0) + goto bail; + + kinfo->spi_rcvhdr_cnt = dd->ipath_rcvhdrcnt; + kinfo->spi_rcvhdrent_size = dd->ipath_rcvhdrentsize; + kinfo->spi_tidegrcnt = dd->ipath_rcvegrcnt; + kinfo->spi_rcv_egrbufsize = dd->ipath_rcvegrbufsize; + /* + * have to mmap whole thing + */ + kinfo->spi_rcv_egrbuftotlen = + pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size; + kinfo->spi_rcv_egrperchunk = pd->port_rcvegrbufs_perchunk; + kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen / + pd->port_rcvegrbuf_chunks; + kinfo->spi_tidcnt = dd->ipath_rcvtidcnt / subport_cnt; + if (master) + kinfo->spi_tidcnt += dd->ipath_rcvtidcnt % subport_cnt; + /* + * for this use, may be ipath_cfgports summed over all chips that + * are are configured and present + */ + kinfo->spi_nports = dd->ipath_cfgports; + /* unit (chip/board) our port is on */ + kinfo->spi_unit = dd->ipath_unit; + /* for now, only a single page */ + kinfo->spi_tid_maxsize = PAGE_SIZE; + + /* + * Doing this per port, and based on the skip value, etc. This has + * to be the actual buffer size, since the protocol code treats it + * as an array. + * + * These have to be set to user addresses in the user code via mmap. + * These values are used on return to user code for the mmap target + * addresses only. For 32 bit, same 44 bit address problem, so use + * the physical address, not virtual. Before 2.6.11, using the + * page_address() macro worked, but in 2.6.11, even that returns the + * full 64 bit address (upper bits all 1's). So far, using the + * physical addresses (or chip offsets, for chip mapping) works, but + * no doubt some future kernel release will change that, and we'll be + * on to yet another method of dealing with this. + */ + kinfo->spi_rcvhdr_base = (u64) pd->port_rcvhdrq_phys; + kinfo->spi_rcvhdr_tailaddr = (u64) pd->port_rcvhdrqtailaddr_phys; + kinfo->spi_rcv_egrbufs = (u64) pd->port_rcvegr_phys; + kinfo->spi_pioavailaddr = (u64) dd->ipath_pioavailregs_phys; + kinfo->spi_status = (u64) kinfo->spi_pioavailaddr + + (void *) dd->ipath_statusp - + (void *) dd->ipath_pioavailregs_dma; + if (!shared) { + kinfo->spi_piocnt = pd->port_piocnt; + kinfo->spi_piobufbase = (u64) pd->port_piobufs; + kinfo->__spi_uregbase = (u64) dd->ipath_uregbase + + dd->ipath_ureg_align * pd->port_port; + } else if (master) { + kinfo->spi_piocnt = (pd->port_piocnt / subport_cnt) + + (pd->port_piocnt % subport_cnt); + /* Master's PIO buffers are after all the slave's */ + kinfo->spi_piobufbase = (u64) pd->port_piobufs + + dd->ipath_palign * + (pd->port_piocnt - kinfo->spi_piocnt); + } else { + unsigned slave = subport_fp(fp) - 1; + + kinfo->spi_piocnt = pd->port_piocnt / subport_cnt; + kinfo->spi_piobufbase = (u64) pd->port_piobufs + + dd->ipath_palign * kinfo->spi_piocnt * slave; + } + + if (shared) { + kinfo->spi_port_uregbase = (u64) dd->ipath_uregbase + + dd->ipath_ureg_align * pd->port_port; + kinfo->spi_port_rcvegrbuf = kinfo->spi_rcv_egrbufs; + kinfo->spi_port_rcvhdr_base = kinfo->spi_rcvhdr_base; + kinfo->spi_port_rcvhdr_tailaddr = kinfo->spi_rcvhdr_tailaddr; + + kinfo->__spi_uregbase = cvt_kvaddr(pd->subport_uregbase + + PAGE_SIZE * subport_fp(fp)); + + kinfo->spi_rcvhdr_base = cvt_kvaddr(pd->subport_rcvhdr_base + + pd->port_rcvhdrq_size * subport_fp(fp)); + kinfo->spi_rcvhdr_tailaddr = 0; + kinfo->spi_rcv_egrbufs = cvt_kvaddr(pd->subport_rcvegrbuf + + pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size * + subport_fp(fp)); + + kinfo->spi_subport_uregbase = + cvt_kvaddr(pd->subport_uregbase); + kinfo->spi_subport_rcvegrbuf = + cvt_kvaddr(pd->subport_rcvegrbuf); + kinfo->spi_subport_rcvhdr_base = + cvt_kvaddr(pd->subport_rcvhdr_base); + ipath_cdbg(PROC, "port %u flags %x %llx %llx %llx\n", + kinfo->spi_port, kinfo->spi_runtime_flags, + (unsigned long long) kinfo->spi_subport_uregbase, + (unsigned long long) kinfo->spi_subport_rcvegrbuf, + (unsigned long long) kinfo->spi_subport_rcvhdr_base); + } + + /* + * All user buffers are 2KB buffers. If we ever support + * giving 4KB buffers to user processes, this will need some + * work. + */ + kinfo->spi_pioindex = (kinfo->spi_piobufbase - + (dd->ipath_piobufbase & 0xffffffff)) / dd->ipath_palign; + kinfo->spi_pioalign = dd->ipath_palign; + + kinfo->spi_qpair = IPATH_KD_QP; + /* + * user mode PIO buffers are always 2KB, even when 4KB can + * be received, and sent via the kernel; this is ibmaxlen + * for 2K MTU. + */ + kinfo->spi_piosize = dd->ipath_piosize2k - 2 * sizeof(u32); + kinfo->spi_mtu = dd->ipath_ibmaxlen; /* maxlen, not ibmtu */ + kinfo->spi_port = pd->port_port; + kinfo->spi_subport = subport_fp(fp); + kinfo->spi_sw_version = IPATH_KERN_SWVERSION; + kinfo->spi_hw_version = dd->ipath_revision; + + if (master) { + kinfo->spi_runtime_flags |= IPATH_RUNTIME_MASTER; + } + + sz = (ubase_size < sizeof(*kinfo)) ? ubase_size : sizeof(*kinfo); + if (copy_to_user(ubase, kinfo, sz)) + ret = -EFAULT; + +bail: + kfree(kinfo); + return ret; +} + +/** + * ipath_tid_update - update a port TID + * @pd: the port + * @fp: the ipath device file + * @ti: the TID information + * + * The new implementation as of Oct 2004 is that the driver assigns + * the tid and returns it to the caller. To make it easier to + * catch bugs, and to reduce search time, we keep a cursor for + * each port, walking the shadow tid array to find one that's not + * in use. + * + * For now, if we can't allocate the full list, we fail, although + * in the long run, we'll allocate as many as we can, and the + * caller will deal with that by trying the remaining pages later. + * That means that when we fail, we have to mark the tids as not in + * use again, in our shadow copy. + * + * It's up to the caller to free the tids when they are done. + * We'll unlock the pages as they free them. + * + * Also, right now we are locking one page at a time, but since + * the intended use of this routine is for a single group of + * virtually contiguous pages, that should change to improve + * performance. + */ +static int ipath_tid_update(struct ipath_portdata *pd, struct file *fp, + const struct ipath_tid_info *ti) +{ + int ret = 0, ntids; + u32 tid, porttid, cnt, i, tidcnt, tidoff; + u16 *tidlist; + struct ipath_devdata *dd = pd->port_dd; + u64 physaddr; + unsigned long vaddr; + u64 __iomem *tidbase; + unsigned long tidmap[8]; + struct page **pagep = NULL; + unsigned subport = subport_fp(fp); + + if (!dd->ipath_pageshadow) { + ret = -ENOMEM; + goto done; + } + + cnt = ti->tidcnt; + if (!cnt) { + ipath_dbg("After copyin, tidcnt 0, tidlist %llx\n", + (unsigned long long) ti->tidlist); + /* + * Should we treat as success? likely a bug + */ + ret = -EFAULT; + goto done; + } + porttid = pd->port_port * dd->ipath_rcvtidcnt; + if (!pd->port_subport_cnt) { + tidcnt = dd->ipath_rcvtidcnt; + tid = pd->port_tidcursor; + tidoff = 0; + } else if (!subport) { + tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) + + (dd->ipath_rcvtidcnt % pd->port_subport_cnt); + tidoff = dd->ipath_rcvtidcnt - tidcnt; + porttid += tidoff; + tid = tidcursor_fp(fp); + } else { + tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt; + tidoff = tidcnt * (subport - 1); + porttid += tidoff; + tid = tidcursor_fp(fp); + } + if (cnt > tidcnt) { + /* make sure it all fits in port_tid_pg_list */ + dev_info(&dd->pcidev->dev, "Process tried to allocate %u " + "TIDs, only trying max (%u)\n", cnt, tidcnt); + cnt = tidcnt; + } + pagep = &((struct page **) pd->port_tid_pg_list)[tidoff]; + tidlist = &((u16 *) &pagep[dd->ipath_rcvtidcnt])[tidoff]; + + memset(tidmap, 0, sizeof(tidmap)); + /* before decrement; chip actual # */ + ntids = tidcnt; + tidbase = (u64 __iomem *) (((char __iomem *) dd->ipath_kregbase) + + dd->ipath_rcvtidbase + + porttid * sizeof(*tidbase)); + + ipath_cdbg(VERBOSE, "Port%u %u tids, cursor %u, tidbase %p\n", + pd->port_port, cnt, tid, tidbase); + + /* virtual address of first page in transfer */ + vaddr = ti->tidvaddr; + if (!access_ok(VERIFY_WRITE, (void __user *) vaddr, + cnt * PAGE_SIZE)) { + ipath_dbg("Fail vaddr %p, %u pages, !access_ok\n", + (void *)vaddr, cnt); + ret = -EFAULT; + goto done; + } + ret = ipath_get_user_pages(vaddr, cnt, pagep); + if (ret) { + if (ret == -EBUSY) { + ipath_dbg("Failed to lock addr %p, %u pages " + "(already locked)\n", + (void *) vaddr, cnt); + /* + * for now, continue, and see what happens but with + * the new implementation, this should never happen, + * unless perhaps the user has mpin'ed the pages + * themselves (something we need to test) + */ + ret = 0; + } else { + dev_info(&dd->pcidev->dev, + "Failed to lock addr %p, %u pages: " + "errno %d\n", (void *) vaddr, cnt, -ret); + goto done; + } + } + for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) { + for (; ntids--; tid++) { + if (tid == tidcnt) + tid = 0; + if (!dd->ipath_pageshadow[porttid + tid]) + break; + } + if (ntids < 0) { + /* + * oops, wrapped all the way through their TIDs, + * and didn't have enough free; see comments at + * start of routine + */ + ipath_dbg("Not enough free TIDs for %u pages " + "(index %d), failing\n", cnt, i); + i--; /* last tidlist[i] not filled in */ + ret = -ENOMEM; + break; + } + tidlist[i] = tid + tidoff; + ipath_cdbg(VERBOSE, "Updating idx %u to TID %u, " + "vaddr %lx\n", i, tid + tidoff, vaddr); + /* we "know" system pages and TID pages are same size */ + dd->ipath_pageshadow[porttid + tid] = pagep[i]; + dd->ipath_physshadow[porttid + tid] = ipath_map_page( + dd->pcidev, pagep[i], 0, PAGE_SIZE, + PCI_DMA_FROMDEVICE); + /* + * don't need atomic or it's overhead + */ + __set_bit(tid, tidmap); + physaddr = dd->ipath_physshadow[porttid + tid]; + ipath_stats.sps_pagelocks++; + ipath_cdbg(VERBOSE, + "TID %u, vaddr %lx, physaddr %llx pgp %p\n", + tid, vaddr, (unsigned long long) physaddr, + pagep[i]); + dd->ipath_f_put_tid(dd, &tidbase[tid], RCVHQ_RCV_TYPE_EXPECTED, + physaddr); + /* + * don't check this tid in ipath_portshadow, since we + * just filled it in; start with the next one. + */ + tid++; + } + + if (ret) { + u32 limit; + cleanup: + /* jump here if copy out of updated info failed... */ + ipath_dbg("After failure (ret=%d), undo %d of %d entries\n", + -ret, i, cnt); + /* same code that's in ipath_free_tid() */ + limit = sizeof(tidmap) * BITS_PER_BYTE; + if (limit > tidcnt) + /* just in case size changes in future */ + limit = tidcnt; + tid = find_first_bit((const unsigned long *)tidmap, limit); + for (; tid < limit; tid++) { + if (!test_bit(tid, tidmap)) + continue; + if (dd->ipath_pageshadow[porttid + tid]) { + ipath_cdbg(VERBOSE, "Freeing TID %u\n", + tid); + dd->ipath_f_put_tid(dd, &tidbase[tid], + RCVHQ_RCV_TYPE_EXPECTED, + dd->ipath_tidinvalid); + pci_unmap_page(dd->pcidev, + dd->ipath_physshadow[porttid + tid], + PAGE_SIZE, PCI_DMA_FROMDEVICE); + dd->ipath_pageshadow[porttid + tid] = NULL; + ipath_stats.sps_pageunlocks++; + } + } + ipath_release_user_pages(pagep, cnt); + } else { + /* + * Copy the updated array, with ipath_tid's filled in, back + * to user. Since we did the copy in already, this "should + * never fail" If it does, we have to clean up... + */ + if (copy_to_user((void __user *) + (unsigned long) ti->tidlist, + tidlist, cnt * sizeof(*tidlist))) { + ret = -EFAULT; + goto cleanup; + } + if (copy_to_user((void __user *) (unsigned long) ti->tidmap, + tidmap, sizeof tidmap)) { + ret = -EFAULT; + goto cleanup; + } + if (tid == tidcnt) + tid = 0; + if (!pd->port_subport_cnt) + pd->port_tidcursor = tid; + else + tidcursor_fp(fp) = tid; + } + +done: + if (ret) + ipath_dbg("Failed to map %u TID pages, failing with %d\n", + ti->tidcnt, -ret); + return ret; +} + +/** + * ipath_tid_free - free a port TID + * @pd: the port + * @subport: the subport + * @ti: the TID info + * + * right now we are unlocking one page at a time, but since + * the intended use of this routine is for a single group of + * virtually contiguous pages, that should change to improve + * performance. We check that the TID is in range for this port + * but otherwise don't check validity; if user has an error and + * frees the wrong tid, it's only their own data that can thereby + * be corrupted. We do check that the TID was in use, for sanity + * We always use our idea of the saved address, not the address that + * they pass in to us. + */ + +static int ipath_tid_free(struct ipath_portdata *pd, unsigned subport, + const struct ipath_tid_info *ti) +{ + int ret = 0; + u32 tid, porttid, cnt, limit, tidcnt; + struct ipath_devdata *dd = pd->port_dd; + u64 __iomem *tidbase; + unsigned long tidmap[8]; + + if (!dd->ipath_pageshadow) { + ret = -ENOMEM; + goto done; + } + + if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap, + sizeof tidmap)) { + ret = -EFAULT; + goto done; + } + + porttid = pd->port_port * dd->ipath_rcvtidcnt; + if (!pd->port_subport_cnt) + tidcnt = dd->ipath_rcvtidcnt; + else if (!subport) { + tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) + + (dd->ipath_rcvtidcnt % pd->port_subport_cnt); + porttid += dd->ipath_rcvtidcnt - tidcnt; + } else { + tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt; + porttid += tidcnt * (subport - 1); + } + tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) + + dd->ipath_rcvtidbase + + porttid * sizeof(*tidbase)); + + limit = sizeof(tidmap) * BITS_PER_BYTE; + if (limit > tidcnt) + /* just in case size changes in future */ + limit = tidcnt; + tid = find_first_bit(tidmap, limit); + ipath_cdbg(VERBOSE, "Port%u free %u tids; first bit (max=%d) " + "set is %d, porttid %u\n", pd->port_port, ti->tidcnt, + limit, tid, porttid); + for (cnt = 0; tid < limit; tid++) { + /* + * small optimization; if we detect a run of 3 or so without + * any set, use find_first_bit again. That's mainly to + * accelerate the case where we wrapped, so we have some at + * the beginning, and some at the end, and a big gap + * in the middle. + */ + if (!test_bit(tid, tidmap)) + continue; + cnt++; + if (dd->ipath_pageshadow[porttid + tid]) { + struct page *p; + p = dd->ipath_pageshadow[porttid + tid]; + dd->ipath_pageshadow[porttid + tid] = NULL; + ipath_cdbg(VERBOSE, "PID %u freeing TID %u\n", + pid_nr(pd->port_pid), tid); + dd->ipath_f_put_tid(dd, &tidbase[tid], + RCVHQ_RCV_TYPE_EXPECTED, + dd->ipath_tidinvalid); + pci_unmap_page(dd->pcidev, + dd->ipath_physshadow[porttid + tid], + PAGE_SIZE, PCI_DMA_FROMDEVICE); + ipath_release_user_pages(&p, 1); + ipath_stats.sps_pageunlocks++; + } else + ipath_dbg("Unused tid %u, ignoring\n", tid); + } + if (cnt != ti->tidcnt) + ipath_dbg("passed in tidcnt %d, only %d bits set in map\n", + ti->tidcnt, cnt); +done: + if (ret) + ipath_dbg("Failed to unmap %u TID pages, failing with %d\n", + ti->tidcnt, -ret); + return ret; +} + +/** + * ipath_set_part_key - set a partition key + * @pd: the port + * @key: the key + * + * We can have up to 4 active at a time (other than the default, which is + * always allowed). This is somewhat tricky, since multiple ports may set + * the same key, so we reference count them, and clean up at exit. All 4 + * partition keys are packed into a single infinipath register. It's an + * error for a process to set the same pkey multiple times. We provide no + * mechanism to de-allocate a pkey at this time, we may eventually need to + * do that. I've used the atomic operations, and no locking, and only make + * a single pass through what's available. This should be more than + * adequate for some time. I'll think about spinlocks or the like if and as + * it's necessary. + */ +static int ipath_set_part_key(struct ipath_portdata *pd, u16 key) +{ + struct ipath_devdata *dd = pd->port_dd; + int i, any = 0, pidx = -1; + u16 lkey = key & 0x7FFF; + int ret; + + if (lkey == (IPATH_DEFAULT_P_KEY & 0x7FFF)) { + /* nothing to do; this key always valid */ + ret = 0; + goto bail; + } + + ipath_cdbg(VERBOSE, "p%u try to set pkey %hx, current keys " + "%hx:%x %hx:%x %hx:%x %hx:%x\n", + pd->port_port, key, dd->ipath_pkeys[0], + atomic_read(&dd->ipath_pkeyrefs[0]), dd->ipath_pkeys[1], + atomic_read(&dd->ipath_pkeyrefs[1]), dd->ipath_pkeys[2], + atomic_read(&dd->ipath_pkeyrefs[2]), dd->ipath_pkeys[3], + atomic_read(&dd->ipath_pkeyrefs[3])); + + if (!lkey) { + ipath_cdbg(PROC, "p%u tries to set key 0, not allowed\n", + pd->port_port); + ret = -EINVAL; + goto bail; + } + + /* + * Set the full membership bit, because it has to be + * set in the register or the packet, and it seems + * cleaner to set in the register than to force all + * callers to set it. (see bug 4331) + */ + key |= 0x8000; + + for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) { + if (!pd->port_pkeys[i] && pidx == -1) + pidx = i; + if (pd->port_pkeys[i] == key) { + ipath_cdbg(VERBOSE, "p%u tries to set same pkey " + "(%x) more than once\n", + pd->port_port, key); + ret = -EEXIST; + goto bail; + } + } + if (pidx == -1) { + ipath_dbg("All pkeys for port %u already in use, " + "can't set %x\n", pd->port_port, key); + ret = -EBUSY; + goto bail; + } + for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) { + if (!dd->ipath_pkeys[i]) { + any++; + continue; + } + if (dd->ipath_pkeys[i] == key) { + atomic_t *pkrefs = &dd->ipath_pkeyrefs[i]; + + if (atomic_inc_return(pkrefs) > 1) { + pd->port_pkeys[pidx] = key; + ipath_cdbg(VERBOSE, "p%u set key %x " + "matches #%d, count now %d\n", + pd->port_port, key, i, + atomic_read(pkrefs)); + ret = 0; + goto bail; + } else { + /* + * lost race, decrement count, catch below + */ + atomic_dec(pkrefs); + ipath_cdbg(VERBOSE, "Lost race, count was " + "0, after dec, it's %d\n", + atomic_read(pkrefs)); + any++; + } + } + if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) { + /* + * It makes no sense to have both the limited and + * full membership PKEY set at the same time since + * the unlimited one will disable the limited one. + */ + ret = -EEXIST; + goto bail; + } + } + if (!any) { + ipath_dbg("port %u, all pkeys already in use, " + "can't set %x\n", pd->port_port, key); + ret = -EBUSY; + goto bail; + } + for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) { + if (!dd->ipath_pkeys[i] && + atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) { + u64 pkey; + + /* for ipathstats, etc. */ + ipath_stats.sps_pkeys[i] = lkey; + pd->port_pkeys[pidx] = dd->ipath_pkeys[i] = key; + pkey = + (u64) dd->ipath_pkeys[0] | + ((u64) dd->ipath_pkeys[1] << 16) | + ((u64) dd->ipath_pkeys[2] << 32) | + ((u64) dd->ipath_pkeys[3] << 48); + ipath_cdbg(PROC, "p%u set key %x in #%d, " + "portidx %d, new pkey reg %llx\n", + pd->port_port, key, i, pidx, + (unsigned long long) pkey); + ipath_write_kreg( + dd, dd->ipath_kregs->kr_partitionkey, pkey); + + ret = 0; + goto bail; + } + } + ipath_dbg("port %u, all pkeys already in use 2nd pass, " + "can't set %x\n", pd->port_port, key); + ret = -EBUSY; + +bail: + return ret; +} + +/** + * ipath_manage_rcvq - manage a port's receive queue + * @pd: the port + * @subport: the subport + * @start_stop: action to carry out + * + * start_stop == 0 disables receive on the port, for use in queue + * overflow conditions. start_stop==1 re-enables, to be used to + * re-init the software copy of the head register + */ +static int ipath_manage_rcvq(struct ipath_portdata *pd, unsigned subport, + int start_stop) +{ + struct ipath_devdata *dd = pd->port_dd; + + ipath_cdbg(PROC, "%sabling rcv for unit %u port %u:%u\n", + start_stop ? "en" : "dis", dd->ipath_unit, + pd->port_port, subport); + if (subport) + goto bail; + /* atomically clear receive enable port. */ + if (start_stop) { + /* + * On enable, force in-memory copy of the tail register to + * 0, so that protocol code doesn't have to worry about + * whether or not the chip has yet updated the in-memory + * copy or not on return from the system call. The chip + * always resets it's tail register back to 0 on a + * transition from disabled to enabled. This could cause a + * problem if software was broken, and did the enable w/o + * the disable, but eventually the in-memory copy will be + * updated and correct itself, even in the face of software + * bugs. + */ + if (pd->port_rcvhdrtail_kvaddr) + ipath_clear_rcvhdrtail(pd); + set_bit(dd->ipath_r_portenable_shift + pd->port_port, + &dd->ipath_rcvctrl); + } else + clear_bit(dd->ipath_r_portenable_shift + pd->port_port, + &dd->ipath_rcvctrl); + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + /* now be sure chip saw it before we return */ + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + if (start_stop) { + /* + * And try to be sure that tail reg update has happened too. + * This should in theory interlock with the RXE changes to + * the tail register. Don't assign it to the tail register + * in memory copy, since we could overwrite an update by the + * chip if we did. + */ + ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port); + } + /* always; new head should be equal to new tail; see above */ +bail: + return 0; +} + +static void ipath_clean_part_key(struct ipath_portdata *pd, + struct ipath_devdata *dd) +{ + int i, j, pchanged = 0; + u64 oldpkey; + + /* for debugging only */ + oldpkey = (u64) dd->ipath_pkeys[0] | + ((u64) dd->ipath_pkeys[1] << 16) | + ((u64) dd->ipath_pkeys[2] << 32) | + ((u64) dd->ipath_pkeys[3] << 48); + + for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) { + if (!pd->port_pkeys[i]) + continue; + ipath_cdbg(VERBOSE, "look for key[%d] %hx in pkeys\n", i, + pd->port_pkeys[i]); + for (j = 0; j < ARRAY_SIZE(dd->ipath_pkeys); j++) { + /* check for match independent of the global bit */ + if ((dd->ipath_pkeys[j] & 0x7fff) != + (pd->port_pkeys[i] & 0x7fff)) + continue; + if (atomic_dec_and_test(&dd->ipath_pkeyrefs[j])) { + ipath_cdbg(VERBOSE, "p%u clear key " + "%x matches #%d\n", + pd->port_port, + pd->port_pkeys[i], j); + ipath_stats.sps_pkeys[j] = + dd->ipath_pkeys[j] = 0; + pchanged++; + } + else ipath_cdbg( + VERBOSE, "p%u key %x matches #%d, " + "but ref still %d\n", pd->port_port, + pd->port_pkeys[i], j, + atomic_read(&dd->ipath_pkeyrefs[j])); + break; + } + pd->port_pkeys[i] = 0; + } + if (pchanged) { + u64 pkey = (u64) dd->ipath_pkeys[0] | + ((u64) dd->ipath_pkeys[1] << 16) | + ((u64) dd->ipath_pkeys[2] << 32) | + ((u64) dd->ipath_pkeys[3] << 48); + ipath_cdbg(VERBOSE, "p%u old pkey reg %llx, " + "new pkey reg %llx\n", pd->port_port, + (unsigned long long) oldpkey, + (unsigned long long) pkey); + ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey, + pkey); + } +} + +/* + * Initialize the port data with the receive buffer sizes + * so this can be done while the master port is locked. + * Otherwise, there is a race with a slave opening the port + * and seeing these fields uninitialized. + */ +static void init_user_egr_sizes(struct ipath_portdata *pd) +{ + struct ipath_devdata *dd = pd->port_dd; + unsigned egrperchunk, egrcnt, size; + + /* + * to avoid wasting a lot of memory, we allocate 32KB chunks of + * physically contiguous memory, advance through it until used up + * and then allocate more. Of course, we need memory to store those + * extra pointers, now. Started out with 256KB, but under heavy + * memory pressure (creating large files and then copying them over + * NFS while doing lots of MPI jobs), we hit some allocation + * failures, even though we can sleep... (2.6.10) Still get + * failures at 64K. 32K is the lowest we can go without wasting + * additional memory. + */ + size = 0x8000; + egrperchunk = size / dd->ipath_rcvegrbufsize; + egrcnt = dd->ipath_rcvegrcnt; + pd->port_rcvegrbuf_chunks = (egrcnt + egrperchunk - 1) / egrperchunk; + pd->port_rcvegrbufs_perchunk = egrperchunk; + pd->port_rcvegrbuf_size = size; +} + +/** + * ipath_create_user_egr - allocate eager TID buffers + * @pd: the port to allocate TID buffers for + * + * This routine is now quite different for user and kernel, because + * the kernel uses skb's, for the accelerated network performance + * This is the user port version + * + * Allocate the eager TID buffers and program them into infinipath + * They are no longer completely contiguous, we do multiple allocation + * calls. + */ +static int ipath_create_user_egr(struct ipath_portdata *pd) +{ + struct ipath_devdata *dd = pd->port_dd; + unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff; + size_t size; + int ret; + gfp_t gfp_flags; + + /* + * GFP_USER, but without GFP_FS, so buffer cache can be + * coalesced (we hope); otherwise, even at order 4, + * heavy filesystem activity makes these fail, and we can + * use compound pages. + */ + gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP; + + egrcnt = dd->ipath_rcvegrcnt; + /* TID number offset for this port */ + egroff = (pd->port_port - 1) * egrcnt + dd->ipath_p0_rcvegrcnt; + egrsize = dd->ipath_rcvegrbufsize; + ipath_cdbg(VERBOSE, "Allocating %d egr buffers, at egrtid " + "offset %x, egrsize %u\n", egrcnt, egroff, egrsize); + + chunk = pd->port_rcvegrbuf_chunks; + egrperchunk = pd->port_rcvegrbufs_perchunk; + size = pd->port_rcvegrbuf_size; + pd->port_rcvegrbuf = kmalloc(chunk * sizeof(pd->port_rcvegrbuf[0]), + GFP_KERNEL); + if (!pd->port_rcvegrbuf) { + ret = -ENOMEM; + goto bail; + } + pd->port_rcvegrbuf_phys = + kmalloc(chunk * sizeof(pd->port_rcvegrbuf_phys[0]), + GFP_KERNEL); + if (!pd->port_rcvegrbuf_phys) { + ret = -ENOMEM; + goto bail_rcvegrbuf; + } + for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) { + + pd->port_rcvegrbuf[e] = dma_alloc_coherent( + &dd->pcidev->dev, size, &pd->port_rcvegrbuf_phys[e], + gfp_flags); + + if (!pd->port_rcvegrbuf[e]) { + ret = -ENOMEM; + goto bail_rcvegrbuf_phys; + } + } + + pd->port_rcvegr_phys = pd->port_rcvegrbuf_phys[0]; + + for (e = chunk = 0; chunk < pd->port_rcvegrbuf_chunks; chunk++) { + dma_addr_t pa = pd->port_rcvegrbuf_phys[chunk]; + unsigned i; + + for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) { + dd->ipath_f_put_tid(dd, e + egroff + + (u64 __iomem *) + ((char __iomem *) + dd->ipath_kregbase + + dd->ipath_rcvegrbase), + RCVHQ_RCV_TYPE_EAGER, pa); + pa += egrsize; + } + cond_resched(); /* don't hog the cpu */ + } + + ret = 0; + goto bail; + +bail_rcvegrbuf_phys: + for (e = 0; e < pd->port_rcvegrbuf_chunks && + pd->port_rcvegrbuf[e]; e++) { + dma_free_coherent(&dd->pcidev->dev, size, + pd->port_rcvegrbuf[e], + pd->port_rcvegrbuf_phys[e]); + + } + kfree(pd->port_rcvegrbuf_phys); + pd->port_rcvegrbuf_phys = NULL; +bail_rcvegrbuf: + kfree(pd->port_rcvegrbuf); + pd->port_rcvegrbuf = NULL; +bail: + return ret; +} + + +/* common code for the mappings on dma_alloc_coherent mem */ +static int ipath_mmap_mem(struct vm_area_struct *vma, + struct ipath_portdata *pd, unsigned len, int write_ok, + void *kvaddr, char *what) +{ + struct ipath_devdata *dd = pd->port_dd; + unsigned long pfn; + int ret; + + if ((vma->vm_end - vma->vm_start) > len) { + dev_info(&dd->pcidev->dev, + "FAIL on %s: len %lx > %x\n", what, + vma->vm_end - vma->vm_start, len); + ret = -EFAULT; + goto bail; + } + + if (!write_ok) { + if (vma->vm_flags & VM_WRITE) { + dev_info(&dd->pcidev->dev, + "%s must be mapped readonly\n", what); + ret = -EPERM; + goto bail; + } + + /* don't allow them to later change with mprotect */ + vma->vm_flags &= ~VM_MAYWRITE; + } + + pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT; + ret = remap_pfn_range(vma, vma->vm_start, pfn, + len, vma->vm_page_prot); + if (ret) + dev_info(&dd->pcidev->dev, "%s port%u mmap of %lx, %x " + "bytes r%c failed: %d\n", what, pd->port_port, + pfn, len, write_ok?'w':'o', ret); + else + ipath_cdbg(VERBOSE, "%s port%u mmaped %lx, %x bytes " + "r%c\n", what, pd->port_port, pfn, len, + write_ok?'w':'o'); +bail: + return ret; +} + +static int mmap_ureg(struct vm_area_struct *vma, struct ipath_devdata *dd, + u64 ureg) +{ + unsigned long phys; + int ret; + + /* + * This is real hardware, so use io_remap. This is the mechanism + * for the user process to update the head registers for their port + * in the chip. + */ + if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) { + dev_info(&dd->pcidev->dev, "FAIL mmap userreg: reqlen " + "%lx > PAGE\n", vma->vm_end - vma->vm_start); + ret = -EFAULT; + } else { + phys = dd->ipath_physaddr + ureg; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + ret = io_remap_pfn_range(vma, vma->vm_start, + phys >> PAGE_SHIFT, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); + } + return ret; +} + +static int mmap_piobufs(struct vm_area_struct *vma, + struct ipath_devdata *dd, + struct ipath_portdata *pd, + unsigned piobufs, unsigned piocnt) +{ + unsigned long phys; + int ret; + + /* + * When we map the PIO buffers in the chip, we want to map them as + * writeonly, no read possible. This prevents access to previous + * process data, and catches users who might try to read the i/o + * space due to a bug. + */ + if ((vma->vm_end - vma->vm_start) > (piocnt * dd->ipath_palign)) { + dev_info(&dd->pcidev->dev, "FAIL mmap piobufs: " + "reqlen %lx > PAGE\n", + vma->vm_end - vma->vm_start); + ret = -EINVAL; + goto bail; + } + + phys = dd->ipath_physaddr + piobufs; + +#if defined(__powerpc__) + /* There isn't a generic way to specify writethrough mappings */ + pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE; + pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU; + pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED; +#endif + + /* + * don't allow them to later change to readable with mprotect (for when + * not initially mapped readable, as is normally the case) + */ + vma->vm_flags &= ~VM_MAYREAD; + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + + ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); +bail: + return ret; +} + +static int mmap_rcvegrbufs(struct vm_area_struct *vma, + struct ipath_portdata *pd) +{ + struct ipath_devdata *dd = pd->port_dd; + unsigned long start, size; + size_t total_size, i; + unsigned long pfn; + int ret; + + size = pd->port_rcvegrbuf_size; + total_size = pd->port_rcvegrbuf_chunks * size; + if ((vma->vm_end - vma->vm_start) > total_size) { + dev_info(&dd->pcidev->dev, "FAIL on egr bufs: " + "reqlen %lx > actual %lx\n", + vma->vm_end - vma->vm_start, + (unsigned long) total_size); + ret = -EINVAL; + goto bail; + } + + if (vma->vm_flags & VM_WRITE) { + dev_info(&dd->pcidev->dev, "Can't map eager buffers as " + "writable (flags=%lx)\n", vma->vm_flags); + ret = -EPERM; + goto bail; + } + /* don't allow them to later change to writeable with mprotect */ + vma->vm_flags &= ~VM_MAYWRITE; + + start = vma->vm_start; + + for (i = 0; i < pd->port_rcvegrbuf_chunks; i++, start += size) { + pfn = virt_to_phys(pd->port_rcvegrbuf[i]) >> PAGE_SHIFT; + ret = remap_pfn_range(vma, start, pfn, size, + vma->vm_page_prot); + if (ret < 0) + goto bail; + } + ret = 0; + +bail: + return ret; +} + +/* + * ipath_file_vma_fault - handle a VMA page fault. + */ +static int ipath_file_vma_fault(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct page *page; + + page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT)); + if (!page) + return VM_FAULT_SIGBUS; + get_page(page); + vmf->page = page; + + return 0; +} + +static const struct vm_operations_struct ipath_file_vm_ops = { + .fault = ipath_file_vma_fault, +}; + +static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, + struct ipath_portdata *pd, unsigned subport) +{ + unsigned long len; + struct ipath_devdata *dd; + void *addr; + size_t size; + int ret = 0; + + /* If the port is not shared, all addresses should be physical */ + if (!pd->port_subport_cnt) + goto bail; + + dd = pd->port_dd; + size = pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size; + + /* + * Each process has all the subport uregbase, rcvhdrq, and + * rcvegrbufs mmapped - as an array for all the processes, + * and also separately for this process. + */ + if (pgaddr == cvt_kvaddr(pd->subport_uregbase)) { + addr = pd->subport_uregbase; + size = PAGE_SIZE * pd->port_subport_cnt; + } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base)) { + addr = pd->subport_rcvhdr_base; + size = pd->port_rcvhdrq_size * pd->port_subport_cnt; + } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf)) { + addr = pd->subport_rcvegrbuf; + size *= pd->port_subport_cnt; + } else if (pgaddr == cvt_kvaddr(pd->subport_uregbase + + PAGE_SIZE * subport)) { + addr = pd->subport_uregbase + PAGE_SIZE * subport; + size = PAGE_SIZE; + } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base + + pd->port_rcvhdrq_size * subport)) { + addr = pd->subport_rcvhdr_base + + pd->port_rcvhdrq_size * subport; + size = pd->port_rcvhdrq_size; + } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf + + size * subport)) { + addr = pd->subport_rcvegrbuf + size * subport; + /* rcvegrbufs are read-only on the slave */ + if (vma->vm_flags & VM_WRITE) { + dev_info(&dd->pcidev->dev, + "Can't map eager buffers as " + "writable (flags=%lx)\n", vma->vm_flags); + ret = -EPERM; + goto bail; + } + /* + * Don't allow permission to later change to writeable + * with mprotect. + */ + vma->vm_flags &= ~VM_MAYWRITE; + } else { + goto bail; + } + len = vma->vm_end - vma->vm_start; + if (len > size) { + ipath_cdbg(MM, "FAIL: reqlen %lx > %zx\n", len, size); + ret = -EINVAL; + goto bail; + } + + vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT; + vma->vm_ops = &ipath_file_vm_ops; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + ret = 1; + +bail: + return ret; +} + +/** + * ipath_mmap - mmap various structures into user space + * @fp: the file pointer + * @vma: the VM area + * + * We use this to have a shared buffer between the kernel and the user code + * for the rcvhdr queue, egr buffers, and the per-port user regs and pio + * buffers in the chip. We have the open and close entries so we can bump + * the ref count and keep the driver from being unloaded while still mapped. + */ +static int ipath_mmap(struct file *fp, struct vm_area_struct *vma) +{ + struct ipath_portdata *pd; + struct ipath_devdata *dd; + u64 pgaddr, ureg; + unsigned piobufs, piocnt; + int ret; + + pd = port_fp(fp); + if (!pd) { + ret = -EINVAL; + goto bail; + } + dd = pd->port_dd; + + /* + * This is the ipath_do_user_init() code, mapping the shared buffers + * into the user process. The address referred to by vm_pgoff is the + * file offset passed via mmap(). For shared ports, this is the + * kernel vmalloc() address of the pages to share with the master. + * For non-shared or master ports, this is a physical address. + * We only do one mmap for each space mapped. + */ + pgaddr = vma->vm_pgoff << PAGE_SHIFT; + + /* + * Check for 0 in case one of the allocations failed, but user + * called mmap anyway. + */ + if (!pgaddr) { + ret = -EINVAL; + goto bail; + } + + ipath_cdbg(MM, "pgaddr %llx vm_start=%lx len %lx port %u:%u:%u\n", + (unsigned long long) pgaddr, vma->vm_start, + vma->vm_end - vma->vm_start, dd->ipath_unit, + pd->port_port, subport_fp(fp)); + + /* + * Physical addresses must fit in 40 bits for our hardware. + * Check for kernel virtual addresses first, anything else must + * match a HW or memory address. + */ + ret = mmap_kvaddr(vma, pgaddr, pd, subport_fp(fp)); + if (ret) { + if (ret > 0) + ret = 0; + goto bail; + } + + ureg = dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port; + if (!pd->port_subport_cnt) { + /* port is not shared */ + piocnt = pd->port_piocnt; + piobufs = pd->port_piobufs; + } else if (!subport_fp(fp)) { + /* caller is the master */ + piocnt = (pd->port_piocnt / pd->port_subport_cnt) + + (pd->port_piocnt % pd->port_subport_cnt); + piobufs = pd->port_piobufs + + dd->ipath_palign * (pd->port_piocnt - piocnt); + } else { + unsigned slave = subport_fp(fp) - 1; + + /* caller is a slave */ + piocnt = pd->port_piocnt / pd->port_subport_cnt; + piobufs = pd->port_piobufs + dd->ipath_palign * piocnt * slave; + } + + if (pgaddr == ureg) + ret = mmap_ureg(vma, dd, ureg); + else if (pgaddr == piobufs) + ret = mmap_piobufs(vma, dd, pd, piobufs, piocnt); + else if (pgaddr == dd->ipath_pioavailregs_phys) + /* in-memory copy of pioavail registers */ + ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0, + (void *) dd->ipath_pioavailregs_dma, + "pioavail registers"); + else if (pgaddr == pd->port_rcvegr_phys) + ret = mmap_rcvegrbufs(vma, pd); + else if (pgaddr == (u64) pd->port_rcvhdrq_phys) + /* + * The rcvhdrq itself; readonly except on HT (so have + * to allow writable mapping), multiple pages, contiguous + * from an i/o perspective. + */ + ret = ipath_mmap_mem(vma, pd, pd->port_rcvhdrq_size, 1, + pd->port_rcvhdrq, + "rcvhdrq"); + else if (pgaddr == (u64) pd->port_rcvhdrqtailaddr_phys) + /* in-memory copy of rcvhdrq tail register */ + ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0, + pd->port_rcvhdrtail_kvaddr, + "rcvhdrq tail"); + else + ret = -EINVAL; + + vma->vm_private_data = NULL; + + if (ret < 0) + dev_info(&dd->pcidev->dev, + "Failure %d on off %llx len %lx\n", + -ret, (unsigned long long)pgaddr, + vma->vm_end - vma->vm_start); +bail: + return ret; +} + +static unsigned ipath_poll_hdrqfull(struct ipath_portdata *pd) +{ + unsigned pollflag = 0; + + if ((pd->poll_type & IPATH_POLL_TYPE_OVERFLOW) && + pd->port_hdrqfull != pd->port_hdrqfull_poll) { + pollflag |= POLLIN | POLLRDNORM; + pd->port_hdrqfull_poll = pd->port_hdrqfull; + } + + return pollflag; +} + +static unsigned int ipath_poll_urgent(struct ipath_portdata *pd, + struct file *fp, + struct poll_table_struct *pt) +{ + unsigned pollflag = 0; + struct ipath_devdata *dd; + + dd = pd->port_dd; + + /* variable access in ipath_poll_hdrqfull() needs this */ + rmb(); + pollflag = ipath_poll_hdrqfull(pd); + + if (pd->port_urgent != pd->port_urgent_poll) { + pollflag |= POLLIN | POLLRDNORM; + pd->port_urgent_poll = pd->port_urgent; + } + + if (!pollflag) { + /* this saves a spin_lock/unlock in interrupt handler... */ + set_bit(IPATH_PORT_WAITING_URG, &pd->port_flag); + /* flush waiting flag so don't miss an event... */ + wmb(); + poll_wait(fp, &pd->port_wait, pt); + } + + return pollflag; +} + +static unsigned int ipath_poll_next(struct ipath_portdata *pd, + struct file *fp, + struct poll_table_struct *pt) +{ + u32 head; + u32 tail; + unsigned pollflag = 0; + struct ipath_devdata *dd; + + dd = pd->port_dd; + + /* variable access in ipath_poll_hdrqfull() needs this */ + rmb(); + pollflag = ipath_poll_hdrqfull(pd); + + head = ipath_read_ureg32(dd, ur_rcvhdrhead, pd->port_port); + if (pd->port_rcvhdrtail_kvaddr) + tail = ipath_get_rcvhdrtail(pd); + else + tail = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port); + + if (head != tail) + pollflag |= POLLIN | POLLRDNORM; + else { + /* this saves a spin_lock/unlock in interrupt handler */ + set_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag); + /* flush waiting flag so we don't miss an event */ + wmb(); + + set_bit(pd->port_port + dd->ipath_r_intravail_shift, + &dd->ipath_rcvctrl); + + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + + if (dd->ipath_rhdrhead_intr_off) /* arm rcv interrupt */ + ipath_write_ureg(dd, ur_rcvhdrhead, + dd->ipath_rhdrhead_intr_off | head, + pd->port_port); + + poll_wait(fp, &pd->port_wait, pt); + } + + return pollflag; +} + +static unsigned int ipath_poll(struct file *fp, + struct poll_table_struct *pt) +{ + struct ipath_portdata *pd; + unsigned pollflag; + + pd = port_fp(fp); + if (!pd) + pollflag = 0; + else if (pd->poll_type & IPATH_POLL_TYPE_URGENT) + pollflag = ipath_poll_urgent(pd, fp, pt); + else + pollflag = ipath_poll_next(pd, fp, pt); + + return pollflag; +} + +static int ipath_supports_subports(int user_swmajor, int user_swminor) +{ + /* no subport implementation prior to software version 1.3 */ + return (user_swmajor > 1) || (user_swminor >= 3); +} + +static int ipath_compatible_subports(int user_swmajor, int user_swminor) +{ + /* this code is written long-hand for clarity */ + if (IPATH_USER_SWMAJOR != user_swmajor) { + /* no promise of compatibility if major mismatch */ + return 0; + } + if (IPATH_USER_SWMAJOR == 1) { + switch (IPATH_USER_SWMINOR) { + case 0: + case 1: + case 2: + /* no subport implementation so cannot be compatible */ + return 0; + case 3: + /* 3 is only compatible with itself */ + return user_swminor == 3; + default: + /* >= 4 are compatible (or are expected to be) */ + return user_swminor >= 4; + } + } + /* make no promises yet for future major versions */ + return 0; +} + +static int init_subports(struct ipath_devdata *dd, + struct ipath_portdata *pd, + const struct ipath_user_info *uinfo) +{ + int ret = 0; + unsigned num_subports; + size_t size; + + /* + * If the user is requesting zero subports, + * skip the subport allocation. + */ + if (uinfo->spu_subport_cnt <= 0) + goto bail; + + /* Self-consistency check for ipath_compatible_subports() */ + if (ipath_supports_subports(IPATH_USER_SWMAJOR, IPATH_USER_SWMINOR) && + !ipath_compatible_subports(IPATH_USER_SWMAJOR, + IPATH_USER_SWMINOR)) { + dev_info(&dd->pcidev->dev, + "Inconsistent ipath_compatible_subports()\n"); + goto bail; + } + + /* Check for subport compatibility */ + if (!ipath_compatible_subports(uinfo->spu_userversion >> 16, + uinfo->spu_userversion & 0xffff)) { + dev_info(&dd->pcidev->dev, + "Mismatched user version (%d.%d) and driver " + "version (%d.%d) while port sharing. Ensure " + "that driver and library are from the same " + "release.\n", + (int) (uinfo->spu_userversion >> 16), + (int) (uinfo->spu_userversion & 0xffff), + IPATH_USER_SWMAJOR, + IPATH_USER_SWMINOR); + goto bail; + } + if (uinfo->spu_subport_cnt > INFINIPATH_MAX_SUBPORT) { + ret = -EINVAL; + goto bail; + } + + num_subports = uinfo->spu_subport_cnt; + pd->subport_uregbase = vzalloc(PAGE_SIZE * num_subports); + if (!pd->subport_uregbase) { + ret = -ENOMEM; + goto bail; + } + /* Note: pd->port_rcvhdrq_size isn't initialized yet. */ + size = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize * + sizeof(u32), PAGE_SIZE) * num_subports; + pd->subport_rcvhdr_base = vzalloc(size); + if (!pd->subport_rcvhdr_base) { + ret = -ENOMEM; + goto bail_ureg; + } + + pd->subport_rcvegrbuf = vzalloc(pd->port_rcvegrbuf_chunks * + pd->port_rcvegrbuf_size * + num_subports); + if (!pd->subport_rcvegrbuf) { + ret = -ENOMEM; + goto bail_rhdr; + } + + pd->port_subport_cnt = uinfo->spu_subport_cnt; + pd->port_subport_id = uinfo->spu_subport_id; + pd->active_slaves = 1; + set_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag); + goto bail; + +bail_rhdr: + vfree(pd->subport_rcvhdr_base); +bail_ureg: + vfree(pd->subport_uregbase); + pd->subport_uregbase = NULL; +bail: + return ret; +} + +static int try_alloc_port(struct ipath_devdata *dd, int port, + struct file *fp, + const struct ipath_user_info *uinfo) +{ + struct ipath_portdata *pd; + int ret; + + if (!(pd = dd->ipath_pd[port])) { + void *ptmp; + + pd = kzalloc(sizeof(struct ipath_portdata), GFP_KERNEL); + + /* + * Allocate memory for use in ipath_tid_update() just once + * at open, not per call. Reduces cost of expected send + * setup. + */ + ptmp = kmalloc(dd->ipath_rcvtidcnt * sizeof(u16) + + dd->ipath_rcvtidcnt * sizeof(struct page **), + GFP_KERNEL); + if (!pd || !ptmp) { + ipath_dev_err(dd, "Unable to allocate portdata " + "memory, failing open\n"); + ret = -ENOMEM; + kfree(pd); + kfree(ptmp); + goto bail; + } + dd->ipath_pd[port] = pd; + dd->ipath_pd[port]->port_port = port; + dd->ipath_pd[port]->port_dd = dd; + dd->ipath_pd[port]->port_tid_pg_list = ptmp; + init_waitqueue_head(&dd->ipath_pd[port]->port_wait); + } + if (!pd->port_cnt) { + pd->userversion = uinfo->spu_userversion; + init_user_egr_sizes(pd); + if ((ret = init_subports(dd, pd, uinfo)) != 0) + goto bail; + ipath_cdbg(PROC, "%s[%u] opened unit:port %u:%u\n", + current->comm, current->pid, dd->ipath_unit, + port); + pd->port_cnt = 1; + port_fp(fp) = pd; + pd->port_pid = get_pid(task_pid(current)); + strlcpy(pd->port_comm, current->comm, sizeof(pd->port_comm)); + ipath_stats.sps_ports++; + ret = 0; + } else + ret = -EBUSY; + +bail: + return ret; +} + +static inline int usable(struct ipath_devdata *dd) +{ + return dd && + (dd->ipath_flags & IPATH_PRESENT) && + dd->ipath_kregbase && + dd->ipath_lid && + !(dd->ipath_flags & (IPATH_LINKDOWN | IPATH_DISABLED + | IPATH_LINKUNK)); +} + +static int find_free_port(int unit, struct file *fp, + const struct ipath_user_info *uinfo) +{ + struct ipath_devdata *dd = ipath_lookup(unit); + int ret, i; + + if (!dd) { + ret = -ENODEV; + goto bail; + } + + if (!usable(dd)) { + ret = -ENETDOWN; + goto bail; + } + + for (i = 1; i < dd->ipath_cfgports; i++) { + ret = try_alloc_port(dd, i, fp, uinfo); + if (ret != -EBUSY) + goto bail; + } + ret = -EBUSY; + +bail: + return ret; +} + +static int find_best_unit(struct file *fp, + const struct ipath_user_info *uinfo) +{ + int ret = 0, i, prefunit = -1, devmax; + int maxofallports, npresent, nup; + int ndev; + + devmax = ipath_count_units(&npresent, &nup, &maxofallports); + + /* + * This code is present to allow a knowledgeable person to + * specify the layout of processes to processors before opening + * this driver, and then we'll assign the process to the "closest" + * InfiniPath chip to that processor (we assume reasonable connectivity, + * for now). This code assumes that if affinity has been set + * before this point, that at most one cpu is set; for now this + * is reasonable. I check for both cpumask_empty() and cpumask_full(), + * in case some kernel variant sets none of the bits when no + * affinity is set. 2.6.11 and 12 kernels have all present + * cpus set. Some day we'll have to fix it up further to handle + * a cpu subset. This algorithm fails for two HT chips connected + * in tunnel fashion. Eventually this needs real topology + * information. There may be some issues with dual core numbering + * as well. This needs more work prior to release. + */ + if (!cpumask_empty(tsk_cpus_allowed(current)) && + !cpumask_full(tsk_cpus_allowed(current))) { + int ncpus = num_online_cpus(), curcpu = -1, nset = 0; + get_online_cpus(); + for_each_online_cpu(i) + if (cpumask_test_cpu(i, tsk_cpus_allowed(current))) { + ipath_cdbg(PROC, "%s[%u] affinity set for " + "cpu %d/%d\n", current->comm, + current->pid, i, ncpus); + curcpu = i; + nset++; + } + put_online_cpus(); + if (curcpu != -1 && nset != ncpus) { + if (npresent) { + prefunit = curcpu / (ncpus / npresent); + ipath_cdbg(PROC,"%s[%u] %d chips, %d cpus, " + "%d cpus/chip, select unit %d\n", + current->comm, current->pid, + npresent, ncpus, ncpus / npresent, + prefunit); + } + } + } + + /* + * user ports start at 1, kernel port is 0 + * For now, we do round-robin access across all chips + */ + + if (prefunit != -1) + devmax = prefunit + 1; +recheck: + for (i = 1; i < maxofallports; i++) { + for (ndev = prefunit != -1 ? prefunit : 0; ndev < devmax; + ndev++) { + struct ipath_devdata *dd = ipath_lookup(ndev); + + if (!usable(dd)) + continue; /* can't use this unit */ + if (i >= dd->ipath_cfgports) + /* + * Maxed out on users of this unit. Try + * next. + */ + continue; + ret = try_alloc_port(dd, i, fp, uinfo); + if (!ret) + goto done; + } + } + + if (npresent) { + if (nup == 0) { + ret = -ENETDOWN; + ipath_dbg("No ports available (none initialized " + "and ready)\n"); + } else { + if (prefunit > 0) { + /* if started above 0, retry from 0 */ + ipath_cdbg(PROC, + "%s[%u] no ports on prefunit " + "%d, clear and re-check\n", + current->comm, current->pid, + prefunit); + devmax = ipath_count_units(NULL, NULL, + NULL); + prefunit = -1; + goto recheck; + } + ret = -EBUSY; + ipath_dbg("No ports available\n"); + } + } else { + ret = -ENXIO; + ipath_dbg("No boards found\n"); + } + +done: + return ret; +} + +static int find_shared_port(struct file *fp, + const struct ipath_user_info *uinfo) +{ + int devmax, ndev, i; + int ret = 0; + + devmax = ipath_count_units(NULL, NULL, NULL); + + for (ndev = 0; ndev < devmax; ndev++) { + struct ipath_devdata *dd = ipath_lookup(ndev); + + if (!usable(dd)) + continue; + for (i = 1; i < dd->ipath_cfgports; i++) { + struct ipath_portdata *pd = dd->ipath_pd[i]; + + /* Skip ports which are not yet open */ + if (!pd || !pd->port_cnt) + continue; + /* Skip port if it doesn't match the requested one */ + if (pd->port_subport_id != uinfo->spu_subport_id) + continue; + /* Verify the sharing process matches the master */ + if (pd->port_subport_cnt != uinfo->spu_subport_cnt || + pd->userversion != uinfo->spu_userversion || + pd->port_cnt >= pd->port_subport_cnt) { + ret = -EINVAL; + goto done; + } + port_fp(fp) = pd; + subport_fp(fp) = pd->port_cnt++; + pd->port_subpid[subport_fp(fp)] = + get_pid(task_pid(current)); + tidcursor_fp(fp) = 0; + pd->active_slaves |= 1 << subport_fp(fp); + ipath_cdbg(PROC, + "%s[%u] %u sharing %s[%u] unit:port %u:%u\n", + current->comm, current->pid, + subport_fp(fp), + pd->port_comm, pid_nr(pd->port_pid), + dd->ipath_unit, pd->port_port); + ret = 1; + goto done; + } + } + +done: + return ret; +} + +static int ipath_open(struct inode *in, struct file *fp) +{ + /* The real work is performed later in ipath_assign_port() */ + fp->private_data = kzalloc(sizeof(struct ipath_filedata), GFP_KERNEL); + return fp->private_data ? 0 : -ENOMEM; +} + +/* Get port early, so can set affinity prior to memory allocation */ +static int ipath_assign_port(struct file *fp, + const struct ipath_user_info *uinfo) +{ + int ret; + int i_minor; + unsigned swmajor, swminor; + + /* Check to be sure we haven't already initialized this file */ + if (port_fp(fp)) { + ret = -EINVAL; + goto done; + } + + /* for now, if major version is different, bail */ + swmajor = uinfo->spu_userversion >> 16; + if (swmajor != IPATH_USER_SWMAJOR) { + ipath_dbg("User major version %d not same as driver " + "major %d\n", uinfo->spu_userversion >> 16, + IPATH_USER_SWMAJOR); + ret = -ENODEV; + goto done; + } + + swminor = uinfo->spu_userversion & 0xffff; + if (swminor != IPATH_USER_SWMINOR) + ipath_dbg("User minor version %d not same as driver " + "minor %d\n", swminor, IPATH_USER_SWMINOR); + + mutex_lock(&ipath_mutex); + + if (ipath_compatible_subports(swmajor, swminor) && + uinfo->spu_subport_cnt && + (ret = find_shared_port(fp, uinfo))) { + if (ret > 0) + ret = 0; + goto done_chk_sdma; + } + + i_minor = iminor(file_inode(fp)) - IPATH_USER_MINOR_BASE; + ipath_cdbg(VERBOSE, "open on dev %lx (minor %d)\n", + (long)file_inode(fp)->i_rdev, i_minor); + + if (i_minor) + ret = find_free_port(i_minor - 1, fp, uinfo); + else + ret = find_best_unit(fp, uinfo); + +done_chk_sdma: + if (!ret) { + struct ipath_filedata *fd = fp->private_data; + const struct ipath_portdata *pd = fd->pd; + const struct ipath_devdata *dd = pd->port_dd; + + fd->pq = ipath_user_sdma_queue_create(&dd->pcidev->dev, + dd->ipath_unit, + pd->port_port, + fd->subport); + + if (!fd->pq) + ret = -ENOMEM; + } + + mutex_unlock(&ipath_mutex); + +done: + return ret; +} + + +static int ipath_do_user_init(struct file *fp, + const struct ipath_user_info *uinfo) +{ + int ret; + struct ipath_portdata *pd = port_fp(fp); + struct ipath_devdata *dd; + u32 head32; + + /* Subports don't need to initialize anything since master did it. */ + if (subport_fp(fp)) { + ret = wait_event_interruptible(pd->port_wait, + !test_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag)); + goto done; + } + + dd = pd->port_dd; + + if (uinfo->spu_rcvhdrsize) { + ret = ipath_setrcvhdrsize(dd, uinfo->spu_rcvhdrsize); + if (ret) + goto done; + } + + /* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */ + + /* some ports may get extra buffers, calculate that here */ + if (pd->port_port <= dd->ipath_ports_extrabuf) + pd->port_piocnt = dd->ipath_pbufsport + 1; + else + pd->port_piocnt = dd->ipath_pbufsport; + + /* for right now, kernel piobufs are at end, so port 1 is at 0 */ + if (pd->port_port <= dd->ipath_ports_extrabuf) + pd->port_pio_base = (dd->ipath_pbufsport + 1) + * (pd->port_port - 1); + else + pd->port_pio_base = dd->ipath_ports_extrabuf + + dd->ipath_pbufsport * (pd->port_port - 1); + pd->port_piobufs = dd->ipath_piobufbase + + pd->port_pio_base * dd->ipath_palign; + ipath_cdbg(VERBOSE, "piobuf base for port %u is 0x%x, piocnt %u," + " first pio %u\n", pd->port_port, pd->port_piobufs, + pd->port_piocnt, pd->port_pio_base); + ipath_chg_pioavailkernel(dd, pd->port_pio_base, pd->port_piocnt, 0); + + /* + * Now allocate the rcvhdr Q and eager TIDs; skip the TID + * array for time being. If pd->port_port > chip-supported, + * we need to do extra stuff here to handle by handling overflow + * through port 0, someday + */ + ret = ipath_create_rcvhdrq(dd, pd); + if (!ret) + ret = ipath_create_user_egr(pd); + if (ret) + goto done; + + /* + * set the eager head register for this port to the current values + * of the tail pointers, since we don't know if they were + * updated on last use of the port. + */ + head32 = ipath_read_ureg32(dd, ur_rcvegrindextail, pd->port_port); + ipath_write_ureg(dd, ur_rcvegrindexhead, head32, pd->port_port); + pd->port_lastrcvhdrqtail = -1; + ipath_cdbg(VERBOSE, "Wrote port%d egrhead %x from tail regs\n", + pd->port_port, head32); + pd->port_tidcursor = 0; /* start at beginning after open */ + + /* initialize poll variables... */ + pd->port_urgent = 0; + pd->port_urgent_poll = 0; + pd->port_hdrqfull_poll = pd->port_hdrqfull; + + /* + * Now enable the port for receive. + * For chips that are set to DMA the tail register to memory + * when they change (and when the update bit transitions from + * 0 to 1. So for those chips, we turn it off and then back on. + * This will (very briefly) affect any other open ports, but the + * duration is very short, and therefore isn't an issue. We + * explicitly set the in-memory tail copy to 0 beforehand, so we + * don't have to wait to be sure the DMA update has happened + * (chip resets head/tail to 0 on transition to enable). + */ + set_bit(dd->ipath_r_portenable_shift + pd->port_port, + &dd->ipath_rcvctrl); + if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) { + if (pd->port_rcvhdrtail_kvaddr) + ipath_clear_rcvhdrtail(pd); + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl & + ~(1ULL << dd->ipath_r_tailupd_shift)); + } + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + /* Notify any waiting slaves */ + if (pd->port_subport_cnt) { + clear_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag); + wake_up(&pd->port_wait); + } +done: + return ret; +} + +/** + * unlock_exptid - unlock any expected TID entries port still had in use + * @pd: port + * + * We don't actually update the chip here, because we do a bulk update + * below, using ipath_f_clear_tids. + */ +static void unlock_expected_tids(struct ipath_portdata *pd) +{ + struct ipath_devdata *dd = pd->port_dd; + int port_tidbase = pd->port_port * dd->ipath_rcvtidcnt; + int i, cnt = 0, maxtid = port_tidbase + dd->ipath_rcvtidcnt; + + ipath_cdbg(VERBOSE, "Port %u unlocking any locked expTID pages\n", + pd->port_port); + for (i = port_tidbase; i < maxtid; i++) { + struct page *ps = dd->ipath_pageshadow[i]; + + if (!ps) + continue; + + dd->ipath_pageshadow[i] = NULL; + pci_unmap_page(dd->pcidev, dd->ipath_physshadow[i], + PAGE_SIZE, PCI_DMA_FROMDEVICE); + ipath_release_user_pages_on_close(&ps, 1); + cnt++; + ipath_stats.sps_pageunlocks++; + } + if (cnt) + ipath_cdbg(VERBOSE, "Port %u locked %u expTID entries\n", + pd->port_port, cnt); + + if (ipath_stats.sps_pagelocks || ipath_stats.sps_pageunlocks) + ipath_cdbg(VERBOSE, "%llu pages locked, %llu unlocked\n", + (unsigned long long) ipath_stats.sps_pagelocks, + (unsigned long long) + ipath_stats.sps_pageunlocks); +} + +static int ipath_close(struct inode *in, struct file *fp) +{ + int ret = 0; + struct ipath_filedata *fd; + struct ipath_portdata *pd; + struct ipath_devdata *dd; + unsigned long flags; + unsigned port; + struct pid *pid; + + ipath_cdbg(VERBOSE, "close on dev %lx, private data %p\n", + (long)in->i_rdev, fp->private_data); + + mutex_lock(&ipath_mutex); + + fd = fp->private_data; + fp->private_data = NULL; + pd = fd->pd; + if (!pd) { + mutex_unlock(&ipath_mutex); + goto bail; + } + + dd = pd->port_dd; + + /* drain user sdma queue */ + ipath_user_sdma_queue_drain(dd, fd->pq); + ipath_user_sdma_queue_destroy(fd->pq); + + if (--pd->port_cnt) { + /* + * XXX If the master closes the port before the slave(s), + * revoke the mmap for the eager receive queue so + * the slave(s) don't wait for receive data forever. + */ + pd->active_slaves &= ~(1 << fd->subport); + put_pid(pd->port_subpid[fd->subport]); + pd->port_subpid[fd->subport] = NULL; + mutex_unlock(&ipath_mutex); + goto bail; + } + /* early; no interrupt users after this */ + spin_lock_irqsave(&dd->ipath_uctxt_lock, flags); + port = pd->port_port; + dd->ipath_pd[port] = NULL; + pid = pd->port_pid; + pd->port_pid = NULL; + spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags); + + if (pd->port_rcvwait_to || pd->port_piowait_to + || pd->port_rcvnowait || pd->port_pionowait) { + ipath_cdbg(VERBOSE, "port%u, %u rcv, %u pio wait timeo; " + "%u rcv %u, pio already\n", + pd->port_port, pd->port_rcvwait_to, + pd->port_piowait_to, pd->port_rcvnowait, + pd->port_pionowait); + pd->port_rcvwait_to = pd->port_piowait_to = + pd->port_rcvnowait = pd->port_pionowait = 0; + } + if (pd->port_flag) { + ipath_cdbg(PROC, "port %u port_flag set: 0x%lx\n", + pd->port_port, pd->port_flag); + pd->port_flag = 0; + } + + if (dd->ipath_kregbase) { + /* atomically clear receive enable port and intr avail. */ + clear_bit(dd->ipath_r_portenable_shift + port, + &dd->ipath_rcvctrl); + clear_bit(pd->port_port + dd->ipath_r_intravail_shift, + &dd->ipath_rcvctrl); + ipath_write_kreg( dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + /* and read back from chip to be sure that nothing + * else is in flight when we do the rest */ + (void)ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + + /* clean up the pkeys for this port user */ + ipath_clean_part_key(pd, dd); + /* + * be paranoid, and never write 0's to these, just use an + * unused part of the port 0 tail page. Of course, + * rcvhdraddr points to a large chunk of memory, so this + * could still trash things, but at least it won't trash + * page 0, and by disabling the port, it should stop "soon", + * even if a packet or two is in already in flight after we + * disabled the port. + */ + ipath_write_kreg_port(dd, + dd->ipath_kregs->kr_rcvhdrtailaddr, port, + dd->ipath_dummy_hdrq_phys); + ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr, + pd->port_port, dd->ipath_dummy_hdrq_phys); + + ipath_disarm_piobufs(dd, pd->port_pio_base, pd->port_piocnt); + ipath_chg_pioavailkernel(dd, pd->port_pio_base, + pd->port_piocnt, 1); + + dd->ipath_f_clear_tids(dd, pd->port_port); + + if (dd->ipath_pageshadow) + unlock_expected_tids(pd); + ipath_stats.sps_ports--; + ipath_cdbg(PROC, "%s[%u] closed port %u:%u\n", + pd->port_comm, pid_nr(pid), + dd->ipath_unit, port); + } + + put_pid(pid); + mutex_unlock(&ipath_mutex); + ipath_free_pddata(dd, pd); /* after releasing the mutex */ + +bail: + kfree(fd); + return ret; +} + +static int ipath_port_info(struct ipath_portdata *pd, u16 subport, + struct ipath_port_info __user *uinfo) +{ + struct ipath_port_info info; + int nup; + int ret; + size_t sz; + + (void) ipath_count_units(NULL, &nup, NULL); + info.num_active = nup; + info.unit = pd->port_dd->ipath_unit; + info.port = pd->port_port; + info.subport = subport; + /* Don't return new fields if old library opened the port. */ + if (ipath_supports_subports(pd->userversion >> 16, + pd->userversion & 0xffff)) { + /* Number of user ports available for this device. */ + info.num_ports = pd->port_dd->ipath_cfgports - 1; + info.num_subports = pd->port_subport_cnt; + sz = sizeof(info); + } else + sz = sizeof(info) - 2 * sizeof(u16); + + if (copy_to_user(uinfo, &info, sz)) { + ret = -EFAULT; + goto bail; + } + ret = 0; + +bail: + return ret; +} + +static int ipath_get_slave_info(struct ipath_portdata *pd, + void __user *slave_mask_addr) +{ + int ret = 0; + + if (copy_to_user(slave_mask_addr, &pd->active_slaves, sizeof(u32))) + ret = -EFAULT; + return ret; +} + +static int ipath_sdma_get_inflight(struct ipath_user_sdma_queue *pq, + u32 __user *inflightp) +{ + const u32 val = ipath_user_sdma_inflight_counter(pq); + + if (put_user(val, inflightp)) + return -EFAULT; + + return 0; +} + +static int ipath_sdma_get_complete(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + u32 __user *completep) +{ + u32 val; + int err; + + err = ipath_user_sdma_make_progress(dd, pq); + if (err < 0) + return err; + + val = ipath_user_sdma_complete_counter(pq); + if (put_user(val, completep)) + return -EFAULT; + + return 0; +} + +static ssize_t ipath_write(struct file *fp, const char __user *data, + size_t count, loff_t *off) +{ + const struct ipath_cmd __user *ucmd; + struct ipath_portdata *pd; + const void __user *src; + size_t consumed, copy; + struct ipath_cmd cmd; + ssize_t ret = 0; + void *dest; + + if (count < sizeof(cmd.type)) { + ret = -EINVAL; + goto bail; + } + + ucmd = (const struct ipath_cmd __user *) data; + + if (copy_from_user(&cmd.type, &ucmd->type, sizeof(cmd.type))) { + ret = -EFAULT; + goto bail; + } + + consumed = sizeof(cmd.type); + + switch (cmd.type) { + case IPATH_CMD_ASSIGN_PORT: + case __IPATH_CMD_USER_INIT: + case IPATH_CMD_USER_INIT: + copy = sizeof(cmd.cmd.user_info); + dest = &cmd.cmd.user_info; + src = &ucmd->cmd.user_info; + break; + case IPATH_CMD_RECV_CTRL: + copy = sizeof(cmd.cmd.recv_ctrl); + dest = &cmd.cmd.recv_ctrl; + src = &ucmd->cmd.recv_ctrl; + break; + case IPATH_CMD_PORT_INFO: + copy = sizeof(cmd.cmd.port_info); + dest = &cmd.cmd.port_info; + src = &ucmd->cmd.port_info; + break; + case IPATH_CMD_TID_UPDATE: + case IPATH_CMD_TID_FREE: + copy = sizeof(cmd.cmd.tid_info); + dest = &cmd.cmd.tid_info; + src = &ucmd->cmd.tid_info; + break; + case IPATH_CMD_SET_PART_KEY: + copy = sizeof(cmd.cmd.part_key); + dest = &cmd.cmd.part_key; + src = &ucmd->cmd.part_key; + break; + case __IPATH_CMD_SLAVE_INFO: + copy = sizeof(cmd.cmd.slave_mask_addr); + dest = &cmd.cmd.slave_mask_addr; + src = &ucmd->cmd.slave_mask_addr; + break; + case IPATH_CMD_PIOAVAILUPD: // force an update of PIOAvail reg + copy = 0; + src = NULL; + dest = NULL; + break; + case IPATH_CMD_POLL_TYPE: + copy = sizeof(cmd.cmd.poll_type); + dest = &cmd.cmd.poll_type; + src = &ucmd->cmd.poll_type; + break; + case IPATH_CMD_ARMLAUNCH_CTRL: + copy = sizeof(cmd.cmd.armlaunch_ctrl); + dest = &cmd.cmd.armlaunch_ctrl; + src = &ucmd->cmd.armlaunch_ctrl; + break; + case IPATH_CMD_SDMA_INFLIGHT: + copy = sizeof(cmd.cmd.sdma_inflight); + dest = &cmd.cmd.sdma_inflight; + src = &ucmd->cmd.sdma_inflight; + break; + case IPATH_CMD_SDMA_COMPLETE: + copy = sizeof(cmd.cmd.sdma_complete); + dest = &cmd.cmd.sdma_complete; + src = &ucmd->cmd.sdma_complete; + break; + default: + ret = -EINVAL; + goto bail; + } + + if (copy) { + if ((count - consumed) < copy) { + ret = -EINVAL; + goto bail; + } + + if (copy_from_user(dest, src, copy)) { + ret = -EFAULT; + goto bail; + } + + consumed += copy; + } + + pd = port_fp(fp); + if (!pd && cmd.type != __IPATH_CMD_USER_INIT && + cmd.type != IPATH_CMD_ASSIGN_PORT) { + ret = -EINVAL; + goto bail; + } + + switch (cmd.type) { + case IPATH_CMD_ASSIGN_PORT: + ret = ipath_assign_port(fp, &cmd.cmd.user_info); + if (ret) + goto bail; + break; + case __IPATH_CMD_USER_INIT: + /* backwards compatibility, get port first */ + ret = ipath_assign_port(fp, &cmd.cmd.user_info); + if (ret) + goto bail; + /* and fall through to current version. */ + case IPATH_CMD_USER_INIT: + ret = ipath_do_user_init(fp, &cmd.cmd.user_info); + if (ret) + goto bail; + ret = ipath_get_base_info( + fp, (void __user *) (unsigned long) + cmd.cmd.user_info.spu_base_info, + cmd.cmd.user_info.spu_base_info_size); + break; + case IPATH_CMD_RECV_CTRL: + ret = ipath_manage_rcvq(pd, subport_fp(fp), cmd.cmd.recv_ctrl); + break; + case IPATH_CMD_PORT_INFO: + ret = ipath_port_info(pd, subport_fp(fp), + (struct ipath_port_info __user *) + (unsigned long) cmd.cmd.port_info); + break; + case IPATH_CMD_TID_UPDATE: + ret = ipath_tid_update(pd, fp, &cmd.cmd.tid_info); + break; + case IPATH_CMD_TID_FREE: + ret = ipath_tid_free(pd, subport_fp(fp), &cmd.cmd.tid_info); + break; + case IPATH_CMD_SET_PART_KEY: + ret = ipath_set_part_key(pd, cmd.cmd.part_key); + break; + case __IPATH_CMD_SLAVE_INFO: + ret = ipath_get_slave_info(pd, + (void __user *) (unsigned long) + cmd.cmd.slave_mask_addr); + break; + case IPATH_CMD_PIOAVAILUPD: + ipath_force_pio_avail_update(pd->port_dd); + break; + case IPATH_CMD_POLL_TYPE: + pd->poll_type = cmd.cmd.poll_type; + break; + case IPATH_CMD_ARMLAUNCH_CTRL: + if (cmd.cmd.armlaunch_ctrl) + ipath_enable_armlaunch(pd->port_dd); + else + ipath_disable_armlaunch(pd->port_dd); + break; + case IPATH_CMD_SDMA_INFLIGHT: + ret = ipath_sdma_get_inflight(user_sdma_queue_fp(fp), + (u32 __user *) (unsigned long) + cmd.cmd.sdma_inflight); + break; + case IPATH_CMD_SDMA_COMPLETE: + ret = ipath_sdma_get_complete(pd->port_dd, + user_sdma_queue_fp(fp), + (u32 __user *) (unsigned long) + cmd.cmd.sdma_complete); + break; + } + + if (ret >= 0) + ret = consumed; + +bail: + return ret; +} + +static ssize_t ipath_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *filp = iocb->ki_filp; + struct ipath_filedata *fp = filp->private_data; + struct ipath_portdata *pd = port_fp(filp); + struct ipath_user_sdma_queue *pq = fp->pq; + + if (!iter_is_iovec(from) || !from->nr_segs) + return -EINVAL; + + return ipath_user_sdma_writev(pd->port_dd, pq, from->iov, from->nr_segs); +} + +static struct class *ipath_class; + +static int init_cdev(int minor, char *name, const struct file_operations *fops, + struct cdev **cdevp, struct device **devp) +{ + const dev_t dev = MKDEV(IPATH_MAJOR, minor); + struct cdev *cdev = NULL; + struct device *device = NULL; + int ret; + + cdev = cdev_alloc(); + if (!cdev) { + printk(KERN_ERR IPATH_DRV_NAME + ": Could not allocate cdev for minor %d, %s\n", + minor, name); + ret = -ENOMEM; + goto done; + } + + cdev->owner = THIS_MODULE; + cdev->ops = fops; + kobject_set_name(&cdev->kobj, name); + + ret = cdev_add(cdev, dev, 1); + if (ret < 0) { + printk(KERN_ERR IPATH_DRV_NAME + ": Could not add cdev for minor %d, %s (err %d)\n", + minor, name, -ret); + goto err_cdev; + } + + device = device_create(ipath_class, NULL, dev, NULL, name); + + if (IS_ERR(device)) { + ret = PTR_ERR(device); + printk(KERN_ERR IPATH_DRV_NAME ": Could not create " + "device for minor %d, %s (err %d)\n", + minor, name, -ret); + goto err_cdev; + } + + goto done; + +err_cdev: + cdev_del(cdev); + cdev = NULL; + +done: + if (ret >= 0) { + *cdevp = cdev; + *devp = device; + } else { + *cdevp = NULL; + *devp = NULL; + } + + return ret; +} + +int ipath_cdev_init(int minor, char *name, const struct file_operations *fops, + struct cdev **cdevp, struct device **devp) +{ + return init_cdev(minor, name, fops, cdevp, devp); +} + +static void cleanup_cdev(struct cdev **cdevp, + struct device **devp) +{ + struct device *dev = *devp; + + if (dev) { + device_unregister(dev); + *devp = NULL; + } + + if (*cdevp) { + cdev_del(*cdevp); + *cdevp = NULL; + } +} + +void ipath_cdev_cleanup(struct cdev **cdevp, + struct device **devp) +{ + cleanup_cdev(cdevp, devp); +} + +static struct cdev *wildcard_cdev; +static struct device *wildcard_dev; + +static const dev_t dev = MKDEV(IPATH_MAJOR, 0); + +static int user_init(void) +{ + int ret; + + ret = register_chrdev_region(dev, IPATH_NMINORS, IPATH_DRV_NAME); + if (ret < 0) { + printk(KERN_ERR IPATH_DRV_NAME ": Could not register " + "chrdev region (err %d)\n", -ret); + goto done; + } + + ipath_class = class_create(THIS_MODULE, IPATH_DRV_NAME); + + if (IS_ERR(ipath_class)) { + ret = PTR_ERR(ipath_class); + printk(KERN_ERR IPATH_DRV_NAME ": Could not create " + "device class (err %d)\n", -ret); + goto bail; + } + + goto done; +bail: + unregister_chrdev_region(dev, IPATH_NMINORS); +done: + return ret; +} + +static void user_cleanup(void) +{ + if (ipath_class) { + class_destroy(ipath_class); + ipath_class = NULL; + } + + unregister_chrdev_region(dev, IPATH_NMINORS); +} + +static atomic_t user_count = ATOMIC_INIT(0); +static atomic_t user_setup = ATOMIC_INIT(0); + +int ipath_user_add(struct ipath_devdata *dd) +{ + char name[10]; + int ret; + + if (atomic_inc_return(&user_count) == 1) { + ret = user_init(); + if (ret < 0) { + ipath_dev_err(dd, "Unable to set up user support: " + "error %d\n", -ret); + goto bail; + } + ret = init_cdev(0, "ipath", &ipath_file_ops, &wildcard_cdev, + &wildcard_dev); + if (ret < 0) { + ipath_dev_err(dd, "Could not create wildcard " + "minor: error %d\n", -ret); + goto bail_user; + } + + atomic_set(&user_setup, 1); + } + + snprintf(name, sizeof(name), "ipath%d", dd->ipath_unit); + + ret = init_cdev(dd->ipath_unit + 1, name, &ipath_file_ops, + &dd->user_cdev, &dd->user_dev); + if (ret < 0) + ipath_dev_err(dd, "Could not create user minor %d, %s\n", + dd->ipath_unit + 1, name); + + goto bail; + +bail_user: + user_cleanup(); +bail: + return ret; +} + +void ipath_user_remove(struct ipath_devdata *dd) +{ + cleanup_cdev(&dd->user_cdev, &dd->user_dev); + + if (atomic_dec_return(&user_count) == 0) { + if (atomic_read(&user_setup) == 0) + goto bail; + + cleanup_cdev(&wildcard_cdev, &wildcard_dev); + user_cleanup(); + + atomic_set(&user_setup, 0); + } +bail: + return; +} diff --git a/drivers/staging/rdma/ipath/ipath_fs.c b/drivers/staging/rdma/ipath/ipath_fs.c new file mode 100644 index 000000000..25422a3a7 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_fs.c @@ -0,0 +1,422 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" + +#define IPATHFS_MAGIC 0x726a77 + +static struct super_block *ipath_super; + +static int ipathfs_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, const struct file_operations *fops, + void *data) +{ + int error; + struct inode *inode = new_inode(dir->i_sb); + + if (!inode) { + error = -EPERM; + goto bail; + } + + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_private = data; + if (S_ISDIR(mode)) { + inode->i_op = &simple_dir_inode_operations; + inc_nlink(inode); + inc_nlink(dir); + } + + inode->i_fop = fops; + + d_instantiate(dentry, inode); + error = 0; + +bail: + return error; +} + +static int create_file(const char *name, umode_t mode, + struct dentry *parent, struct dentry **dentry, + const struct file_operations *fops, void *data) +{ + int error; + + mutex_lock(&d_inode(parent)->i_mutex); + *dentry = lookup_one_len(name, parent, strlen(name)); + if (!IS_ERR(*dentry)) + error = ipathfs_mknod(d_inode(parent), *dentry, + mode, fops, data); + else + error = PTR_ERR(*dentry); + mutex_unlock(&d_inode(parent)->i_mutex); + + return error; +} + +static ssize_t atomic_stats_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return simple_read_from_buffer(buf, count, ppos, &ipath_stats, + sizeof ipath_stats); +} + +static const struct file_operations atomic_stats_ops = { + .read = atomic_stats_read, + .llseek = default_llseek, +}; + +static ssize_t atomic_counters_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct infinipath_counters counters; + struct ipath_devdata *dd; + + dd = file_inode(file)->i_private; + dd->ipath_f_read_counters(dd, &counters); + + return simple_read_from_buffer(buf, count, ppos, &counters, + sizeof counters); +} + +static const struct file_operations atomic_counters_ops = { + .read = atomic_counters_read, + .llseek = default_llseek, +}; + +static ssize_t flash_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct ipath_devdata *dd; + ssize_t ret; + loff_t pos; + char *tmp; + + pos = *ppos; + + if ( pos < 0) { + ret = -EINVAL; + goto bail; + } + + if (pos >= sizeof(struct ipath_flash)) { + ret = 0; + goto bail; + } + + if (count > sizeof(struct ipath_flash) - pos) + count = sizeof(struct ipath_flash) - pos; + + tmp = kmalloc(count, GFP_KERNEL); + if (!tmp) { + ret = -ENOMEM; + goto bail; + } + + dd = file_inode(file)->i_private; + if (ipath_eeprom_read(dd, pos, tmp, count)) { + ipath_dev_err(dd, "failed to read from flash\n"); + ret = -ENXIO; + goto bail_tmp; + } + + if (copy_to_user(buf, tmp, count)) { + ret = -EFAULT; + goto bail_tmp; + } + + *ppos = pos + count; + ret = count; + +bail_tmp: + kfree(tmp); + +bail: + return ret; +} + +static ssize_t flash_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct ipath_devdata *dd; + ssize_t ret; + loff_t pos; + char *tmp; + + pos = *ppos; + + if (pos != 0) { + ret = -EINVAL; + goto bail; + } + + if (count != sizeof(struct ipath_flash)) { + ret = -EINVAL; + goto bail; + } + + tmp = kmalloc(count, GFP_KERNEL); + if (!tmp) { + ret = -ENOMEM; + goto bail; + } + + if (copy_from_user(tmp, buf, count)) { + ret = -EFAULT; + goto bail_tmp; + } + + dd = file_inode(file)->i_private; + if (ipath_eeprom_write(dd, pos, tmp, count)) { + ret = -ENXIO; + ipath_dev_err(dd, "failed to write to flash\n"); + goto bail_tmp; + } + + *ppos = pos + count; + ret = count; + +bail_tmp: + kfree(tmp); + +bail: + return ret; +} + +static const struct file_operations flash_ops = { + .read = flash_read, + .write = flash_write, + .llseek = default_llseek, +}; + +static int create_device_files(struct super_block *sb, + struct ipath_devdata *dd) +{ + struct dentry *dir, *tmp; + char unit[10]; + int ret; + + snprintf(unit, sizeof unit, "%02d", dd->ipath_unit); + ret = create_file(unit, S_IFDIR|S_IRUGO|S_IXUGO, sb->s_root, &dir, + &simple_dir_operations, dd); + if (ret) { + printk(KERN_ERR "create_file(%s) failed: %d\n", unit, ret); + goto bail; + } + + ret = create_file("atomic_counters", S_IFREG|S_IRUGO, dir, &tmp, + &atomic_counters_ops, dd); + if (ret) { + printk(KERN_ERR "create_file(%s/atomic_counters) " + "failed: %d\n", unit, ret); + goto bail; + } + + ret = create_file("flash", S_IFREG|S_IWUSR|S_IRUGO, dir, &tmp, + &flash_ops, dd); + if (ret) { + printk(KERN_ERR "create_file(%s/flash) " + "failed: %d\n", unit, ret); + goto bail; + } + +bail: + return ret; +} + +static int remove_file(struct dentry *parent, char *name) +{ + struct dentry *tmp; + int ret; + + tmp = lookup_one_len(name, parent, strlen(name)); + + if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); + goto bail; + } + + spin_lock(&tmp->d_lock); + if (simple_positive(tmp)) { + dget_dlock(tmp); + __d_drop(tmp); + spin_unlock(&tmp->d_lock); + simple_unlink(d_inode(parent), tmp); + } else + spin_unlock(&tmp->d_lock); + + ret = 0; +bail: + /* + * We don't expect clients to care about the return value, but + * it's there if they need it. + */ + return ret; +} + +static int remove_device_files(struct super_block *sb, + struct ipath_devdata *dd) +{ + struct dentry *dir, *root; + char unit[10]; + int ret; + + root = dget(sb->s_root); + mutex_lock(&d_inode(root)->i_mutex); + snprintf(unit, sizeof unit, "%02d", dd->ipath_unit); + dir = lookup_one_len(unit, root, strlen(unit)); + + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + printk(KERN_ERR "Lookup of %s failed\n", unit); + goto bail; + } + + remove_file(dir, "flash"); + remove_file(dir, "atomic_counters"); + d_delete(dir); + ret = simple_rmdir(d_inode(root), dir); + +bail: + mutex_unlock(&d_inode(root)->i_mutex); + dput(root); + return ret; +} + +static int ipathfs_fill_super(struct super_block *sb, void *data, + int silent) +{ + struct ipath_devdata *dd, *tmp; + unsigned long flags; + int ret; + + static struct tree_descr files[] = { + [2] = {"atomic_stats", &atomic_stats_ops, S_IRUGO}, + {""}, + }; + + ret = simple_fill_super(sb, IPATHFS_MAGIC, files); + if (ret) { + printk(KERN_ERR "simple_fill_super failed: %d\n", ret); + goto bail; + } + + spin_lock_irqsave(&ipath_devs_lock, flags); + + list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) { + spin_unlock_irqrestore(&ipath_devs_lock, flags); + ret = create_device_files(sb, dd); + if (ret) + goto bail; + spin_lock_irqsave(&ipath_devs_lock, flags); + } + + spin_unlock_irqrestore(&ipath_devs_lock, flags); + +bail: + return ret; +} + +static struct dentry *ipathfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + struct dentry *ret; + ret = mount_single(fs_type, flags, data, ipathfs_fill_super); + if (!IS_ERR(ret)) + ipath_super = ret->d_sb; + return ret; +} + +static void ipathfs_kill_super(struct super_block *s) +{ + kill_litter_super(s); + ipath_super = NULL; +} + +int ipathfs_add_device(struct ipath_devdata *dd) +{ + int ret; + + if (ipath_super == NULL) { + ret = 0; + goto bail; + } + + ret = create_device_files(ipath_super, dd); + +bail: + return ret; +} + +int ipathfs_remove_device(struct ipath_devdata *dd) +{ + int ret; + + if (ipath_super == NULL) { + ret = 0; + goto bail; + } + + ret = remove_device_files(ipath_super, dd); + +bail: + return ret; +} + +static struct file_system_type ipathfs_fs_type = { + .owner = THIS_MODULE, + .name = "ipathfs", + .mount = ipathfs_mount, + .kill_sb = ipathfs_kill_super, +}; +MODULE_ALIAS_FS("ipathfs"); + +int __init ipath_init_ipathfs(void) +{ + return register_filesystem(&ipathfs_fs_type); +} + +void __exit ipath_exit_ipathfs(void) +{ + unregister_filesystem(&ipathfs_fs_type); +} diff --git a/drivers/staging/rdma/ipath/ipath_iba6110.c b/drivers/staging/rdma/ipath/ipath_iba6110.c new file mode 100644 index 000000000..7cc305488 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_iba6110.c @@ -0,0 +1,1940 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file contains all of the code that is specific to the InfiniPath + * HT chip. + */ + +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" +#include "ipath_registers.h" + +static void ipath_setup_ht_setextled(struct ipath_devdata *, u64, u64); + + +/* + * This lists the InfiniPath registers, in the actual chip layout. + * This structure should never be directly accessed. + * + * The names are in InterCap form because they're taken straight from + * the chip specification. Since they're only used in this file, they + * don't pollute the rest of the source. +*/ + +struct _infinipath_do_not_use_kernel_regs { + unsigned long long Revision; + unsigned long long Control; + unsigned long long PageAlign; + unsigned long long PortCnt; + unsigned long long DebugPortSelect; + unsigned long long DebugPort; + unsigned long long SendRegBase; + unsigned long long UserRegBase; + unsigned long long CounterRegBase; + unsigned long long Scratch; + unsigned long long ReservedMisc1; + unsigned long long InterruptConfig; + unsigned long long IntBlocked; + unsigned long long IntMask; + unsigned long long IntStatus; + unsigned long long IntClear; + unsigned long long ErrorMask; + unsigned long long ErrorStatus; + unsigned long long ErrorClear; + unsigned long long HwErrMask; + unsigned long long HwErrStatus; + unsigned long long HwErrClear; + unsigned long long HwDiagCtrl; + unsigned long long MDIO; + unsigned long long IBCStatus; + unsigned long long IBCCtrl; + unsigned long long ExtStatus; + unsigned long long ExtCtrl; + unsigned long long GPIOOut; + unsigned long long GPIOMask; + unsigned long long GPIOStatus; + unsigned long long GPIOClear; + unsigned long long RcvCtrl; + unsigned long long RcvBTHQP; + unsigned long long RcvHdrSize; + unsigned long long RcvHdrCnt; + unsigned long long RcvHdrEntSize; + unsigned long long RcvTIDBase; + unsigned long long RcvTIDCnt; + unsigned long long RcvEgrBase; + unsigned long long RcvEgrCnt; + unsigned long long RcvBufBase; + unsigned long long RcvBufSize; + unsigned long long RxIntMemBase; + unsigned long long RxIntMemSize; + unsigned long long RcvPartitionKey; + unsigned long long ReservedRcv[10]; + unsigned long long SendCtrl; + unsigned long long SendPIOBufBase; + unsigned long long SendPIOSize; + unsigned long long SendPIOBufCnt; + unsigned long long SendPIOAvailAddr; + unsigned long long TxIntMemBase; + unsigned long long TxIntMemSize; + unsigned long long ReservedSend[9]; + unsigned long long SendBufferError; + unsigned long long SendBufferErrorCONT1; + unsigned long long SendBufferErrorCONT2; + unsigned long long SendBufferErrorCONT3; + unsigned long long ReservedSBE[4]; + unsigned long long RcvHdrAddr0; + unsigned long long RcvHdrAddr1; + unsigned long long RcvHdrAddr2; + unsigned long long RcvHdrAddr3; + unsigned long long RcvHdrAddr4; + unsigned long long RcvHdrAddr5; + unsigned long long RcvHdrAddr6; + unsigned long long RcvHdrAddr7; + unsigned long long RcvHdrAddr8; + unsigned long long ReservedRHA[7]; + unsigned long long RcvHdrTailAddr0; + unsigned long long RcvHdrTailAddr1; + unsigned long long RcvHdrTailAddr2; + unsigned long long RcvHdrTailAddr3; + unsigned long long RcvHdrTailAddr4; + unsigned long long RcvHdrTailAddr5; + unsigned long long RcvHdrTailAddr6; + unsigned long long RcvHdrTailAddr7; + unsigned long long RcvHdrTailAddr8; + unsigned long long ReservedRHTA[7]; + unsigned long long Sync; /* Software only */ + unsigned long long Dump; /* Software only */ + unsigned long long SimVer; /* Software only */ + unsigned long long ReservedSW[5]; + unsigned long long SerdesConfig0; + unsigned long long SerdesConfig1; + unsigned long long SerdesStatus; + unsigned long long XGXSConfig; + unsigned long long ReservedSW2[4]; +}; + +struct _infinipath_do_not_use_counters { + __u64 LBIntCnt; + __u64 LBFlowStallCnt; + __u64 Reserved1; + __u64 TxUnsupVLErrCnt; + __u64 TxDataPktCnt; + __u64 TxFlowPktCnt; + __u64 TxDwordCnt; + __u64 TxLenErrCnt; + __u64 TxMaxMinLenErrCnt; + __u64 TxUnderrunCnt; + __u64 TxFlowStallCnt; + __u64 TxDroppedPktCnt; + __u64 RxDroppedPktCnt; + __u64 RxDataPktCnt; + __u64 RxFlowPktCnt; + __u64 RxDwordCnt; + __u64 RxLenErrCnt; + __u64 RxMaxMinLenErrCnt; + __u64 RxICRCErrCnt; + __u64 RxVCRCErrCnt; + __u64 RxFlowCtrlErrCnt; + __u64 RxBadFormatCnt; + __u64 RxLinkProblemCnt; + __u64 RxEBPCnt; + __u64 RxLPCRCErrCnt; + __u64 RxBufOvflCnt; + __u64 RxTIDFullErrCnt; + __u64 RxTIDValidErrCnt; + __u64 RxPKeyMismatchCnt; + __u64 RxP0HdrEgrOvflCnt; + __u64 RxP1HdrEgrOvflCnt; + __u64 RxP2HdrEgrOvflCnt; + __u64 RxP3HdrEgrOvflCnt; + __u64 RxP4HdrEgrOvflCnt; + __u64 RxP5HdrEgrOvflCnt; + __u64 RxP6HdrEgrOvflCnt; + __u64 RxP7HdrEgrOvflCnt; + __u64 RxP8HdrEgrOvflCnt; + __u64 Reserved6; + __u64 Reserved7; + __u64 IBStatusChangeCnt; + __u64 IBLinkErrRecoveryCnt; + __u64 IBLinkDownedCnt; + __u64 IBSymbolErrCnt; +}; + +#define IPATH_KREG_OFFSET(field) (offsetof( \ + struct _infinipath_do_not_use_kernel_regs, field) / sizeof(u64)) +#define IPATH_CREG_OFFSET(field) (offsetof( \ + struct _infinipath_do_not_use_counters, field) / sizeof(u64)) + +static const struct ipath_kregs ipath_ht_kregs = { + .kr_control = IPATH_KREG_OFFSET(Control), + .kr_counterregbase = IPATH_KREG_OFFSET(CounterRegBase), + .kr_debugport = IPATH_KREG_OFFSET(DebugPort), + .kr_debugportselect = IPATH_KREG_OFFSET(DebugPortSelect), + .kr_errorclear = IPATH_KREG_OFFSET(ErrorClear), + .kr_errormask = IPATH_KREG_OFFSET(ErrorMask), + .kr_errorstatus = IPATH_KREG_OFFSET(ErrorStatus), + .kr_extctrl = IPATH_KREG_OFFSET(ExtCtrl), + .kr_extstatus = IPATH_KREG_OFFSET(ExtStatus), + .kr_gpio_clear = IPATH_KREG_OFFSET(GPIOClear), + .kr_gpio_mask = IPATH_KREG_OFFSET(GPIOMask), + .kr_gpio_out = IPATH_KREG_OFFSET(GPIOOut), + .kr_gpio_status = IPATH_KREG_OFFSET(GPIOStatus), + .kr_hwdiagctrl = IPATH_KREG_OFFSET(HwDiagCtrl), + .kr_hwerrclear = IPATH_KREG_OFFSET(HwErrClear), + .kr_hwerrmask = IPATH_KREG_OFFSET(HwErrMask), + .kr_hwerrstatus = IPATH_KREG_OFFSET(HwErrStatus), + .kr_ibcctrl = IPATH_KREG_OFFSET(IBCCtrl), + .kr_ibcstatus = IPATH_KREG_OFFSET(IBCStatus), + .kr_intblocked = IPATH_KREG_OFFSET(IntBlocked), + .kr_intclear = IPATH_KREG_OFFSET(IntClear), + .kr_interruptconfig = IPATH_KREG_OFFSET(InterruptConfig), + .kr_intmask = IPATH_KREG_OFFSET(IntMask), + .kr_intstatus = IPATH_KREG_OFFSET(IntStatus), + .kr_mdio = IPATH_KREG_OFFSET(MDIO), + .kr_pagealign = IPATH_KREG_OFFSET(PageAlign), + .kr_partitionkey = IPATH_KREG_OFFSET(RcvPartitionKey), + .kr_portcnt = IPATH_KREG_OFFSET(PortCnt), + .kr_rcvbthqp = IPATH_KREG_OFFSET(RcvBTHQP), + .kr_rcvbufbase = IPATH_KREG_OFFSET(RcvBufBase), + .kr_rcvbufsize = IPATH_KREG_OFFSET(RcvBufSize), + .kr_rcvctrl = IPATH_KREG_OFFSET(RcvCtrl), + .kr_rcvegrbase = IPATH_KREG_OFFSET(RcvEgrBase), + .kr_rcvegrcnt = IPATH_KREG_OFFSET(RcvEgrCnt), + .kr_rcvhdrcnt = IPATH_KREG_OFFSET(RcvHdrCnt), + .kr_rcvhdrentsize = IPATH_KREG_OFFSET(RcvHdrEntSize), + .kr_rcvhdrsize = IPATH_KREG_OFFSET(RcvHdrSize), + .kr_rcvintmembase = IPATH_KREG_OFFSET(RxIntMemBase), + .kr_rcvintmemsize = IPATH_KREG_OFFSET(RxIntMemSize), + .kr_rcvtidbase = IPATH_KREG_OFFSET(RcvTIDBase), + .kr_rcvtidcnt = IPATH_KREG_OFFSET(RcvTIDCnt), + .kr_revision = IPATH_KREG_OFFSET(Revision), + .kr_scratch = IPATH_KREG_OFFSET(Scratch), + .kr_sendbuffererror = IPATH_KREG_OFFSET(SendBufferError), + .kr_sendctrl = IPATH_KREG_OFFSET(SendCtrl), + .kr_sendpioavailaddr = IPATH_KREG_OFFSET(SendPIOAvailAddr), + .kr_sendpiobufbase = IPATH_KREG_OFFSET(SendPIOBufBase), + .kr_sendpiobufcnt = IPATH_KREG_OFFSET(SendPIOBufCnt), + .kr_sendpiosize = IPATH_KREG_OFFSET(SendPIOSize), + .kr_sendregbase = IPATH_KREG_OFFSET(SendRegBase), + .kr_txintmembase = IPATH_KREG_OFFSET(TxIntMemBase), + .kr_txintmemsize = IPATH_KREG_OFFSET(TxIntMemSize), + .kr_userregbase = IPATH_KREG_OFFSET(UserRegBase), + .kr_serdesconfig0 = IPATH_KREG_OFFSET(SerdesConfig0), + .kr_serdesconfig1 = IPATH_KREG_OFFSET(SerdesConfig1), + .kr_serdesstatus = IPATH_KREG_OFFSET(SerdesStatus), + .kr_xgxsconfig = IPATH_KREG_OFFSET(XGXSConfig), + /* + * These should not be used directly via ipath_write_kreg64(), + * use them with ipath_write_kreg64_port(), + */ + .kr_rcvhdraddr = IPATH_KREG_OFFSET(RcvHdrAddr0), + .kr_rcvhdrtailaddr = IPATH_KREG_OFFSET(RcvHdrTailAddr0) +}; + +static const struct ipath_cregs ipath_ht_cregs = { + .cr_badformatcnt = IPATH_CREG_OFFSET(RxBadFormatCnt), + .cr_erricrccnt = IPATH_CREG_OFFSET(RxICRCErrCnt), + .cr_errlinkcnt = IPATH_CREG_OFFSET(RxLinkProblemCnt), + .cr_errlpcrccnt = IPATH_CREG_OFFSET(RxLPCRCErrCnt), + .cr_errpkey = IPATH_CREG_OFFSET(RxPKeyMismatchCnt), + .cr_errrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowCtrlErrCnt), + .cr_err_rlencnt = IPATH_CREG_OFFSET(RxLenErrCnt), + .cr_errslencnt = IPATH_CREG_OFFSET(TxLenErrCnt), + .cr_errtidfull = IPATH_CREG_OFFSET(RxTIDFullErrCnt), + .cr_errtidvalid = IPATH_CREG_OFFSET(RxTIDValidErrCnt), + .cr_errvcrccnt = IPATH_CREG_OFFSET(RxVCRCErrCnt), + .cr_ibstatuschange = IPATH_CREG_OFFSET(IBStatusChangeCnt), + /* calc from Reg_CounterRegBase + offset */ + .cr_intcnt = IPATH_CREG_OFFSET(LBIntCnt), + .cr_invalidrlencnt = IPATH_CREG_OFFSET(RxMaxMinLenErrCnt), + .cr_invalidslencnt = IPATH_CREG_OFFSET(TxMaxMinLenErrCnt), + .cr_lbflowstallcnt = IPATH_CREG_OFFSET(LBFlowStallCnt), + .cr_pktrcvcnt = IPATH_CREG_OFFSET(RxDataPktCnt), + .cr_pktrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowPktCnt), + .cr_pktsendcnt = IPATH_CREG_OFFSET(TxDataPktCnt), + .cr_pktsendflowcnt = IPATH_CREG_OFFSET(TxFlowPktCnt), + .cr_portovflcnt = IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt), + .cr_rcvebpcnt = IPATH_CREG_OFFSET(RxEBPCnt), + .cr_rcvovflcnt = IPATH_CREG_OFFSET(RxBufOvflCnt), + .cr_senddropped = IPATH_CREG_OFFSET(TxDroppedPktCnt), + .cr_sendstallcnt = IPATH_CREG_OFFSET(TxFlowStallCnt), + .cr_sendunderruncnt = IPATH_CREG_OFFSET(TxUnderrunCnt), + .cr_wordrcvcnt = IPATH_CREG_OFFSET(RxDwordCnt), + .cr_wordsendcnt = IPATH_CREG_OFFSET(TxDwordCnt), + .cr_unsupvlcnt = IPATH_CREG_OFFSET(TxUnsupVLErrCnt), + .cr_rxdroppktcnt = IPATH_CREG_OFFSET(RxDroppedPktCnt), + .cr_iblinkerrrecovcnt = IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt), + .cr_iblinkdowncnt = IPATH_CREG_OFFSET(IBLinkDownedCnt), + .cr_ibsymbolerrcnt = IPATH_CREG_OFFSET(IBSymbolErrCnt) +}; + +/* kr_intstatus, kr_intclear, kr_intmask bits */ +#define INFINIPATH_I_RCVURG_MASK ((1U<<9)-1) +#define INFINIPATH_I_RCVURG_SHIFT 0 +#define INFINIPATH_I_RCVAVAIL_MASK ((1U<<9)-1) +#define INFINIPATH_I_RCVAVAIL_SHIFT 12 + +/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */ +#define INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT 0 +#define INFINIPATH_HWE_HTCMEMPARITYERR_MASK 0x3FFFFFULL +#define INFINIPATH_HWE_HTCLNKABYTE0CRCERR 0x0000000000800000ULL +#define INFINIPATH_HWE_HTCLNKABYTE1CRCERR 0x0000000001000000ULL +#define INFINIPATH_HWE_HTCLNKBBYTE0CRCERR 0x0000000002000000ULL +#define INFINIPATH_HWE_HTCLNKBBYTE1CRCERR 0x0000000004000000ULL +#define INFINIPATH_HWE_HTCMISCERR4 0x0000000008000000ULL +#define INFINIPATH_HWE_HTCMISCERR5 0x0000000010000000ULL +#define INFINIPATH_HWE_HTCMISCERR6 0x0000000020000000ULL +#define INFINIPATH_HWE_HTCMISCERR7 0x0000000040000000ULL +#define INFINIPATH_HWE_HTCBUSTREQPARITYERR 0x0000000080000000ULL +#define INFINIPATH_HWE_HTCBUSTRESPPARITYERR 0x0000000100000000ULL +#define INFINIPATH_HWE_HTCBUSIREQPARITYERR 0x0000000200000000ULL +#define INFINIPATH_HWE_COREPLL_FBSLIP 0x0080000000000000ULL +#define INFINIPATH_HWE_COREPLL_RFSLIP 0x0100000000000000ULL +#define INFINIPATH_HWE_HTBPLL_FBSLIP 0x0200000000000000ULL +#define INFINIPATH_HWE_HTBPLL_RFSLIP 0x0400000000000000ULL +#define INFINIPATH_HWE_HTAPLL_FBSLIP 0x0800000000000000ULL +#define INFINIPATH_HWE_HTAPLL_RFSLIP 0x1000000000000000ULL +#define INFINIPATH_HWE_SERDESPLLFAILED 0x2000000000000000ULL + +#define IBA6110_IBCS_LINKTRAININGSTATE_MASK 0xf +#define IBA6110_IBCS_LINKSTATE_SHIFT 4 + +/* kr_extstatus bits */ +#define INFINIPATH_EXTS_FREQSEL 0x2 +#define INFINIPATH_EXTS_SERDESSEL 0x4 +#define INFINIPATH_EXTS_MEMBIST_ENDTEST 0x0000000000004000 +#define INFINIPATH_EXTS_MEMBIST_CORRECT 0x0000000000008000 + + +/* TID entries (memory), HT-only */ +#define INFINIPATH_RT_ADDR_MASK 0xFFFFFFFFFFULL /* 40 bits valid */ +#define INFINIPATH_RT_VALID 0x8000000000000000ULL +#define INFINIPATH_RT_ADDR_SHIFT 0 +#define INFINIPATH_RT_BUFSIZE_MASK 0x3FFFULL +#define INFINIPATH_RT_BUFSIZE_SHIFT 48 + +#define INFINIPATH_R_INTRAVAIL_SHIFT 16 +#define INFINIPATH_R_TAILUPD_SHIFT 31 + +/* kr_xgxsconfig bits */ +#define INFINIPATH_XGXS_RESET 0x7ULL + +/* + * masks and bits that are different in different chips, or present only + * in one + */ +static const ipath_err_t infinipath_hwe_htcmemparityerr_mask = + INFINIPATH_HWE_HTCMEMPARITYERR_MASK; +static const ipath_err_t infinipath_hwe_htcmemparityerr_shift = + INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT; + +static const ipath_err_t infinipath_hwe_htclnkabyte0crcerr = + INFINIPATH_HWE_HTCLNKABYTE0CRCERR; +static const ipath_err_t infinipath_hwe_htclnkabyte1crcerr = + INFINIPATH_HWE_HTCLNKABYTE1CRCERR; +static const ipath_err_t infinipath_hwe_htclnkbbyte0crcerr = + INFINIPATH_HWE_HTCLNKBBYTE0CRCERR; +static const ipath_err_t infinipath_hwe_htclnkbbyte1crcerr = + INFINIPATH_HWE_HTCLNKBBYTE1CRCERR; + +#define _IPATH_GPIO_SDA_NUM 1 +#define _IPATH_GPIO_SCL_NUM 0 + +#define IPATH_GPIO_SDA \ + (1ULL << (_IPATH_GPIO_SDA_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT)) +#define IPATH_GPIO_SCL \ + (1ULL << (_IPATH_GPIO_SCL_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT)) + +/* keep the code below somewhat more readable; not used elsewhere */ +#define _IPATH_HTLINK0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr | \ + infinipath_hwe_htclnkabyte1crcerr) +#define _IPATH_HTLINK1_CRCBITS (infinipath_hwe_htclnkbbyte0crcerr | \ + infinipath_hwe_htclnkbbyte1crcerr) +#define _IPATH_HTLANE0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr | \ + infinipath_hwe_htclnkbbyte0crcerr) +#define _IPATH_HTLANE1_CRCBITS (infinipath_hwe_htclnkabyte1crcerr | \ + infinipath_hwe_htclnkbbyte1crcerr) + +static void hwerr_crcbits(struct ipath_devdata *dd, ipath_err_t hwerrs, + char *msg, size_t msgl) +{ + char bitsmsg[64]; + ipath_err_t crcbits = hwerrs & + (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS); + /* don't check if 8bit HT */ + if (dd->ipath_flags & IPATH_8BIT_IN_HT0) + crcbits &= ~infinipath_hwe_htclnkabyte1crcerr; + /* don't check if 8bit HT */ + if (dd->ipath_flags & IPATH_8BIT_IN_HT1) + crcbits &= ~infinipath_hwe_htclnkbbyte1crcerr; + /* + * we'll want to ignore link errors on link that is + * not in use, if any. For now, complain about both + */ + if (crcbits) { + u16 ctrl0, ctrl1; + snprintf(bitsmsg, sizeof bitsmsg, + "[HT%s lane %s CRC (%llx); powercycle to completely clear]", + !(crcbits & _IPATH_HTLINK1_CRCBITS) ? + "0 (A)" : (!(crcbits & _IPATH_HTLINK0_CRCBITS) + ? "1 (B)" : "0+1 (A+B)"), + !(crcbits & _IPATH_HTLANE1_CRCBITS) ? "0" + : (!(crcbits & _IPATH_HTLANE0_CRCBITS) ? "1" : + "0+1"), (unsigned long long) crcbits); + strlcat(msg, bitsmsg, msgl); + + /* + * print extra info for debugging. slave/primary + * config word 4, 8 (link control 0, 1) + */ + + if (pci_read_config_word(dd->pcidev, + dd->ipath_ht_slave_off + 0x4, + &ctrl0)) + dev_info(&dd->pcidev->dev, "Couldn't read " + "linkctrl0 of slave/primary " + "config block\n"); + else if (!(ctrl0 & 1 << 6)) + /* not if EOC bit set */ + ipath_dbg("HT linkctrl0 0x%x%s%s\n", ctrl0, + ((ctrl0 >> 8) & 7) ? " CRC" : "", + ((ctrl0 >> 4) & 1) ? "linkfail" : + ""); + if (pci_read_config_word(dd->pcidev, + dd->ipath_ht_slave_off + 0x8, + &ctrl1)) + dev_info(&dd->pcidev->dev, "Couldn't read " + "linkctrl1 of slave/primary " + "config block\n"); + else if (!(ctrl1 & 1 << 6)) + /* not if EOC bit set */ + ipath_dbg("HT linkctrl1 0x%x%s%s\n", ctrl1, + ((ctrl1 >> 8) & 7) ? " CRC" : "", + ((ctrl1 >> 4) & 1) ? "linkfail" : + ""); + + /* disable until driver reloaded */ + dd->ipath_hwerrmask &= ~crcbits; + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, + dd->ipath_hwerrmask); + ipath_dbg("HT crc errs: %s\n", msg); + } else + ipath_dbg("ignoring HT crc errors 0x%llx, " + "not in use\n", (unsigned long long) + (hwerrs & (_IPATH_HTLINK0_CRCBITS | + _IPATH_HTLINK1_CRCBITS))); +} + +/* 6110 specific hardware errors... */ +static const struct ipath_hwerror_msgs ipath_6110_hwerror_msgs[] = { + INFINIPATH_HWE_MSG(HTCBUSIREQPARITYERR, "HTC Ireq Parity"), + INFINIPATH_HWE_MSG(HTCBUSTREQPARITYERR, "HTC Treq Parity"), + INFINIPATH_HWE_MSG(HTCBUSTRESPPARITYERR, "HTC Tresp Parity"), + INFINIPATH_HWE_MSG(HTCMISCERR5, "HT core Misc5"), + INFINIPATH_HWE_MSG(HTCMISCERR6, "HT core Misc6"), + INFINIPATH_HWE_MSG(HTCMISCERR7, "HT core Misc7"), + INFINIPATH_HWE_MSG(RXDSYNCMEMPARITYERR, "Rx Dsync"), + INFINIPATH_HWE_MSG(SERDESPLLFAILED, "SerDes PLL"), +}; + +#define TXE_PIO_PARITY ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | \ + INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) \ + << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) +#define RXE_EAGER_PARITY (INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID \ + << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) + +static void ipath_ht_txe_recover(struct ipath_devdata *dd) +{ + ++ipath_stats.sps_txeparity; + dev_info(&dd->pcidev->dev, + "Recovering from TXE PIO parity error\n"); +} + + +/** + * ipath_ht_handle_hwerrors - display hardware errors. + * @dd: the infinipath device + * @msg: the output buffer + * @msgl: the size of the output buffer + * + * Use same msg buffer as regular errors to avoid excessive stack + * use. Most hardware errors are catastrophic, but for right now, + * we'll print them and continue. We reuse the same message buffer as + * ipath_handle_errors() to avoid excessive stack usage. + */ +static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg, + size_t msgl) +{ + ipath_err_t hwerrs; + u32 bits, ctrl; + int isfatal = 0; + char bitsmsg[64]; + int log_idx; + + hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus); + + if (!hwerrs) { + ipath_cdbg(VERBOSE, "Called but no hardware errors set\n"); + /* + * better than printing cofusing messages + * This seems to be related to clearing the crc error, or + * the pll error during init. + */ + goto bail; + } else if (hwerrs == -1LL) { + ipath_dev_err(dd, "Read of hardware error status failed " + "(all bits set); ignoring\n"); + goto bail; + } + ipath_stats.sps_hwerrs++; + + /* Always clear the error status register, except MEMBISTFAIL, + * regardless of whether we continue or stop using the chip. + * We want that set so we know it failed, even across driver reload. + * We'll still ignore it in the hwerrmask. We do this partly for + * diagnostics, but also for support */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, + hwerrs&~INFINIPATH_HWE_MEMBISTFAILED); + + hwerrs &= dd->ipath_hwerrmask; + + /* We log some errors to EEPROM, check if we have any of those. */ + for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx) + if (hwerrs & dd->ipath_eep_st_masks[log_idx].hwerrs_to_log) + ipath_inc_eeprom_err(dd, log_idx, 1); + + /* + * make sure we get this much out, unless told to be quiet, + * it's a parity error we may recover from, + * or it's occurred within the last 5 seconds + */ + if ((hwerrs & ~(dd->ipath_lasthwerror | TXE_PIO_PARITY | + RXE_EAGER_PARITY)) || + (ipath_debug & __IPATH_VERBDBG)) + dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx " + "(cleared)\n", (unsigned long long) hwerrs); + dd->ipath_lasthwerror |= hwerrs; + + if (hwerrs & ~dd->ipath_hwe_bitsextant) + ipath_dev_err(dd, "hwerror interrupt with unknown errors " + "%llx set\n", (unsigned long long) + (hwerrs & ~dd->ipath_hwe_bitsextant)); + + ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control); + if ((ctrl & INFINIPATH_C_FREEZEMODE) && !ipath_diag_inuse) { + /* + * parity errors in send memory are recoverable, + * just cancel the send (if indicated in * sendbuffererror), + * count the occurrence, unfreeze (if no other handled + * hardware error bits are set), and continue. They can + * occur if a processor speculative read is done to the PIO + * buffer while we are sending a packet, for example. + */ + if (hwerrs & TXE_PIO_PARITY) { + ipath_ht_txe_recover(dd); + hwerrs &= ~TXE_PIO_PARITY; + } + + if (!hwerrs) { + ipath_dbg("Clearing freezemode on ignored or " + "recovered hardware error\n"); + ipath_clear_freeze(dd); + } + } + + *msg = '\0'; + + /* + * may someday want to decode into which bits are which + * functional area for parity errors, etc. + */ + if (hwerrs & (infinipath_hwe_htcmemparityerr_mask + << INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT)) { + bits = (u32) ((hwerrs >> + INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) & + INFINIPATH_HWE_HTCMEMPARITYERR_MASK); + snprintf(bitsmsg, sizeof bitsmsg, "[HTC Parity Errs %x] ", + bits); + strlcat(msg, bitsmsg, msgl); + } + + ipath_format_hwerrors(hwerrs, + ipath_6110_hwerror_msgs, + ARRAY_SIZE(ipath_6110_hwerror_msgs), + msg, msgl); + + if (hwerrs & (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS)) + hwerr_crcbits(dd, hwerrs, msg, msgl); + + if (hwerrs & INFINIPATH_HWE_MEMBISTFAILED) { + strlcat(msg, "[Memory BIST test failed, InfiniPath hardware unusable]", + msgl); + /* ignore from now on, so disable until driver reloaded */ + dd->ipath_hwerrmask &= ~INFINIPATH_HWE_MEMBISTFAILED; + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, + dd->ipath_hwerrmask); + } +#define _IPATH_PLL_FAIL (INFINIPATH_HWE_COREPLL_FBSLIP | \ + INFINIPATH_HWE_COREPLL_RFSLIP | \ + INFINIPATH_HWE_HTBPLL_FBSLIP | \ + INFINIPATH_HWE_HTBPLL_RFSLIP | \ + INFINIPATH_HWE_HTAPLL_FBSLIP | \ + INFINIPATH_HWE_HTAPLL_RFSLIP) + + if (hwerrs & _IPATH_PLL_FAIL) { + snprintf(bitsmsg, sizeof bitsmsg, + "[PLL failed (%llx), InfiniPath hardware unusable]", + (unsigned long long) (hwerrs & _IPATH_PLL_FAIL)); + strlcat(msg, bitsmsg, msgl); + /* ignore from now on, so disable until driver reloaded */ + dd->ipath_hwerrmask &= ~(hwerrs & _IPATH_PLL_FAIL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, + dd->ipath_hwerrmask); + } + + if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED) { + /* + * If it occurs, it is left masked since the eternal + * interface is unused + */ + dd->ipath_hwerrmask &= ~INFINIPATH_HWE_SERDESPLLFAILED; + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, + dd->ipath_hwerrmask); + } + + if (hwerrs) { + /* + * if any set that we aren't ignoring; only + * make the complaint once, in case it's stuck + * or recurring, and we get here multiple + * times. + * force link down, so switch knows, and + * LEDs are turned off + */ + if (dd->ipath_flags & IPATH_INITTED) { + ipath_set_linkstate(dd, IPATH_IB_LINKDOWN); + ipath_setup_ht_setextled(dd, + INFINIPATH_IBCS_L_STATE_DOWN, + INFINIPATH_IBCS_LT_STATE_DISABLED); + ipath_dev_err(dd, "Fatal Hardware Error (freeze " + "mode), no longer usable, SN %.16s\n", + dd->ipath_serial); + isfatal = 1; + } + *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; + /* mark as having had error */ + *dd->ipath_statusp |= IPATH_STATUS_HWERROR; + /* + * mark as not usable, at a minimum until driver + * is reloaded, probably until reboot, since no + * other reset is possible. + */ + dd->ipath_flags &= ~IPATH_INITTED; + } + else + *msg = 0; /* recovered from all of them */ + if (*msg) + ipath_dev_err(dd, "%s hardware error\n", msg); + if (isfatal && !ipath_diag_inuse && dd->ipath_freezemsg) + /* + * for status file; if no trailing brace is copied, + * we'll know it was truncated. + */ + snprintf(dd->ipath_freezemsg, + dd->ipath_freezelen, "{%s}", msg); + +bail:; +} + +/** + * ipath_ht_boardname - fill in the board name + * @dd: the infinipath device + * @name: the output buffer + * @namelen: the size of the output buffer + * + * fill in the board name, based on the board revision register + */ +static int ipath_ht_boardname(struct ipath_devdata *dd, char *name, + size_t namelen) +{ + char *n = NULL; + u8 boardrev = dd->ipath_boardrev; + int ret = 0; + + switch (boardrev) { + case 5: + /* + * original production board; two production levels, with + * different serial number ranges. See ipath_ht_early_init() for + * case where we enable IPATH_GPIO_INTR for later serial # range. + * Original 112* serial number is no longer supported. + */ + n = "InfiniPath_QHT7040"; + break; + case 7: + /* small form factor production board */ + n = "InfiniPath_QHT7140"; + break; + default: /* don't know, just print the number */ + ipath_dev_err(dd, "Don't yet know about board " + "with ID %u\n", boardrev); + snprintf(name, namelen, "Unknown_InfiniPath_QHT7xxx_%u", + boardrev); + break; + } + if (n) + snprintf(name, namelen, "%s", n); + + if (ret) { + ipath_dev_err(dd, "Unsupported InfiniPath board %s!\n", name); + goto bail; + } + if (dd->ipath_majrev != 3 || (dd->ipath_minrev < 2 || + dd->ipath_minrev > 4)) { + /* + * This version of the driver only supports Rev 3.2 - 3.4 + */ + ipath_dev_err(dd, + "Unsupported InfiniPath hardware revision %u.%u!\n", + dd->ipath_majrev, dd->ipath_minrev); + ret = 1; + goto bail; + } + /* + * pkt/word counters are 32 bit, and therefore wrap fast enough + * that we snapshot them from a timer, and maintain 64 bit shadow + * copies + */ + dd->ipath_flags |= IPATH_32BITCOUNTERS; + dd->ipath_flags |= IPATH_GPIO_INTR; + if (dd->ipath_lbus_speed != 800) + ipath_dev_err(dd, + "Incorrectly configured for HT @ %uMHz\n", + dd->ipath_lbus_speed); + + /* + * set here, not in ipath_init_*_funcs because we have to do + * it after we can read chip registers. + */ + dd->ipath_ureg_align = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign); + +bail: + return ret; +} + +static void ipath_check_htlink(struct ipath_devdata *dd) +{ + u8 linkerr, link_off, i; + + for (i = 0; i < 2; i++) { + link_off = dd->ipath_ht_slave_off + i * 4 + 0xd; + if (pci_read_config_byte(dd->pcidev, link_off, &linkerr)) + dev_info(&dd->pcidev->dev, "Couldn't read " + "linkerror%d of HT slave/primary block\n", + i); + else if (linkerr & 0xf0) { + ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, " + "clearing\n", linkerr >> 4, i); + /* + * writing the linkerr bits that are set should + * clear them + */ + if (pci_write_config_byte(dd->pcidev, link_off, + linkerr)) + ipath_dbg("Failed write to clear HT " + "linkerror%d\n", i); + if (pci_read_config_byte(dd->pcidev, link_off, + &linkerr)) + dev_info(&dd->pcidev->dev, + "Couldn't reread linkerror%d of " + "HT slave/primary block\n", i); + else if (linkerr & 0xf0) + dev_info(&dd->pcidev->dev, + "HT linkerror%d bits 0x%x " + "couldn't be cleared\n", + i, linkerr >> 4); + } + } +} + +static int ipath_setup_ht_reset(struct ipath_devdata *dd) +{ + ipath_dbg("No reset possible for this InfiniPath hardware\n"); + return 0; +} + +#define HT_INTR_DISC_CONFIG 0x80 /* HT interrupt and discovery cap */ +#define HT_INTR_REG_INDEX 2 /* intconfig requires indirect accesses */ + +/* + * Bits 13-15 of command==0 is slave/primary block. Clear any HT CRC + * errors. We only bother to do this at load time, because it's OK if + * it happened before we were loaded (first time after boot/reset), + * but any time after that, it's fatal anyway. Also need to not check + * for upper byte errors if we are in 8 bit mode, so figure out + * our width. For now, at least, also complain if it's 8 bit. + */ +static void slave_or_pri_blk(struct ipath_devdata *dd, struct pci_dev *pdev, + int pos, u8 cap_type) +{ + u8 linkwidth = 0, linkerr, link_a_b_off, link_off; + u16 linkctrl = 0; + int i; + + dd->ipath_ht_slave_off = pos; + /* command word, master_host bit */ + /* master host || slave */ + if ((cap_type >> 2) & 1) + link_a_b_off = 4; + else + link_a_b_off = 0; + ipath_cdbg(VERBOSE, "HT%u (Link %c) connected to processor\n", + link_a_b_off ? 1 : 0, + link_a_b_off ? 'B' : 'A'); + + link_a_b_off += pos; + + /* + * check both link control registers; clear both HT CRC sets if + * necessary. + */ + for (i = 0; i < 2; i++) { + link_off = pos + i * 4 + 0x4; + if (pci_read_config_word(pdev, link_off, &linkctrl)) + ipath_dev_err(dd, "Couldn't read HT link control%d " + "register\n", i); + else if (linkctrl & (0xf << 8)) { + ipath_cdbg(VERBOSE, "Clear linkctrl%d CRC Error " + "bits %x\n", i, linkctrl & (0xf << 8)); + /* + * now write them back to clear the error. + */ + pci_write_config_word(pdev, link_off, + linkctrl & (0xf << 8)); + } + } + + /* + * As with HT CRC bits, same for protocol errors that might occur + * during boot. + */ + for (i = 0; i < 2; i++) { + link_off = pos + i * 4 + 0xd; + if (pci_read_config_byte(pdev, link_off, &linkerr)) + dev_info(&pdev->dev, "Couldn't read linkerror%d " + "of HT slave/primary block\n", i); + else if (linkerr & 0xf0) { + ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, " + "clearing\n", linkerr >> 4, i); + /* + * writing the linkerr bits that are set will clear + * them + */ + if (pci_write_config_byte + (pdev, link_off, linkerr)) + ipath_dbg("Failed write to clear HT " + "linkerror%d\n", i); + if (pci_read_config_byte(pdev, link_off, &linkerr)) + dev_info(&pdev->dev, "Couldn't reread " + "linkerror%d of HT slave/primary " + "block\n", i); + else if (linkerr & 0xf0) + dev_info(&pdev->dev, "HT linkerror%d bits " + "0x%x couldn't be cleared\n", + i, linkerr >> 4); + } + } + + /* + * this is just for our link to the host, not devices connected + * through tunnel. + */ + + if (pci_read_config_byte(pdev, link_a_b_off + 7, &linkwidth)) + ipath_dev_err(dd, "Couldn't read HT link width " + "config register\n"); + else { + u32 width; + switch (linkwidth & 7) { + case 5: + width = 4; + break; + case 4: + width = 2; + break; + case 3: + width = 32; + break; + case 1: + width = 16; + break; + case 0: + default: /* if wrong, assume 8 bit */ + width = 8; + break; + } + + dd->ipath_lbus_width = width; + + if (linkwidth != 0x11) { + ipath_dev_err(dd, "Not configured for 16 bit HT " + "(%x)\n", linkwidth); + if (!(linkwidth & 0xf)) { + ipath_dbg("Will ignore HT lane1 errors\n"); + dd->ipath_flags |= IPATH_8BIT_IN_HT0; + } + } + } + + /* + * this is just for our link to the host, not devices connected + * through tunnel. + */ + if (pci_read_config_byte(pdev, link_a_b_off + 0xd, &linkwidth)) + ipath_dev_err(dd, "Couldn't read HT link frequency " + "config register\n"); + else { + u32 speed; + switch (linkwidth & 0xf) { + case 6: + speed = 1000; + break; + case 5: + speed = 800; + break; + case 4: + speed = 600; + break; + case 3: + speed = 500; + break; + case 2: + speed = 400; + break; + case 1: + speed = 300; + break; + default: + /* + * assume reserved and vendor-specific are 200... + */ + case 0: + speed = 200; + break; + } + dd->ipath_lbus_speed = speed; + } + + snprintf(dd->ipath_lbus_info, sizeof(dd->ipath_lbus_info), + "HyperTransport,%uMHz,x%u\n", + dd->ipath_lbus_speed, + dd->ipath_lbus_width); +} + +static int ipath_ht_intconfig(struct ipath_devdata *dd) +{ + int ret; + + if (dd->ipath_intconfig) { + ipath_write_kreg(dd, dd->ipath_kregs->kr_interruptconfig, + dd->ipath_intconfig); /* interrupt address */ + ret = 0; + } else { + ipath_dev_err(dd, "No interrupts enabled, couldn't setup " + "interrupt address\n"); + ret = -EINVAL; + } + + return ret; +} + +static void ipath_ht_irq_update(struct pci_dev *dev, int irq, + struct ht_irq_msg *msg) +{ + struct ipath_devdata *dd = pci_get_drvdata(dev); + u64 prev_intconfig = dd->ipath_intconfig; + + dd->ipath_intconfig = msg->address_lo; + dd->ipath_intconfig |= ((u64) msg->address_hi) << 32; + + /* + * If the previous value of dd->ipath_intconfig is zero, we're + * getting configured for the first time, and must not program the + * intconfig register here (it will be programmed later, when the + * hardware is ready). Otherwise, we should. + */ + if (prev_intconfig) + ipath_ht_intconfig(dd); +} + +/** + * ipath_setup_ht_config - setup the interruptconfig register + * @dd: the infinipath device + * @pdev: the PCI device + * + * setup the interruptconfig register from the HT config info. + * Also clear CRC errors in HT linkcontrol, if necessary. + * This is done only for the real hardware. It is done before + * chip address space is initted, so can't touch infinipath registers + */ +static int ipath_setup_ht_config(struct ipath_devdata *dd, + struct pci_dev *pdev) +{ + int pos, ret; + + ret = __ht_create_irq(pdev, 0, ipath_ht_irq_update); + if (ret < 0) { + ipath_dev_err(dd, "Couldn't create interrupt handler: " + "err %d\n", ret); + goto bail; + } + dd->ipath_irq = ret; + ret = 0; + + /* + * Handle clearing CRC errors in linkctrl register if necessary. We + * do this early, before we ever enable errors or hardware errors, + * mostly to avoid causing the chip to enter freeze mode. + */ + pos = pci_find_capability(pdev, PCI_CAP_ID_HT); + if (!pos) { + ipath_dev_err(dd, "Couldn't find HyperTransport " + "capability; no interrupts\n"); + ret = -ENODEV; + goto bail; + } + do { + u8 cap_type; + + /* + * The HT capability type byte is 3 bytes after the + * capability byte. + */ + if (pci_read_config_byte(pdev, pos + 3, &cap_type)) { + dev_info(&pdev->dev, "Couldn't read config " + "command @ %d\n", pos); + continue; + } + if (!(cap_type & 0xE0)) + slave_or_pri_blk(dd, pdev, pos, cap_type); + } while ((pos = pci_find_next_capability(pdev, pos, + PCI_CAP_ID_HT))); + + dd->ipath_flags |= IPATH_SWAP_PIOBUFS; + +bail: + return ret; +} + +/** + * ipath_setup_ht_cleanup - clean up any per-chip chip-specific stuff + * @dd: the infinipath device + * + * Called during driver unload. + * This is currently a nop for the HT chip, not for all chips + */ +static void ipath_setup_ht_cleanup(struct ipath_devdata *dd) +{ +} + +/** + * ipath_setup_ht_setextled - set the state of the two external LEDs + * @dd: the infinipath device + * @lst: the L state + * @ltst: the LT state + * + * Set the state of the two external LEDs, to indicate physical and + * logical state of IB link. For this chip (at least with recommended + * board pinouts), LED1 is Green (physical state), and LED2 is Yellow + * (logical state) + * + * Note: We try to match the Mellanox HCA LED behavior as best + * we can. Green indicates physical link state is OK (something is + * plugged in, and we can train). + * Amber indicates the link is logically up (ACTIVE). + * Mellanox further blinks the amber LED to indicate data packet + * activity, but we have no hardware support for that, so it would + * require waking up every 10-20 msecs and checking the counters + * on the chip, and then turning the LED off if appropriate. That's + * visible overhead, so not something we will do. + * + */ +static void ipath_setup_ht_setextled(struct ipath_devdata *dd, + u64 lst, u64 ltst) +{ + u64 extctl; + unsigned long flags = 0; + + /* the diags use the LED to indicate diag info, so we leave + * the external LED alone when the diags are running */ + if (ipath_diag_inuse) + return; + + /* Allow override of LED display for, e.g. Locating system in rack */ + if (dd->ipath_led_override) { + ltst = (dd->ipath_led_override & IPATH_LED_PHYS) + ? INFINIPATH_IBCS_LT_STATE_LINKUP + : INFINIPATH_IBCS_LT_STATE_DISABLED; + lst = (dd->ipath_led_override & IPATH_LED_LOG) + ? INFINIPATH_IBCS_L_STATE_ACTIVE + : INFINIPATH_IBCS_L_STATE_DOWN; + } + + spin_lock_irqsave(&dd->ipath_gpio_lock, flags); + /* + * start by setting both LED control bits to off, then turn + * on the appropriate bit(s). + */ + if (dd->ipath_boardrev == 8) { /* LS/X-1 uses different pins */ + /* + * major difference is that INFINIPATH_EXTC_LEDGBLERR_OFF + * is inverted, because it is normally used to indicate + * a hardware fault at reset, if there were errors + */ + extctl = (dd->ipath_extctrl & ~INFINIPATH_EXTC_LEDGBLOK_ON) + | INFINIPATH_EXTC_LEDGBLERR_OFF; + if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP) + extctl &= ~INFINIPATH_EXTC_LEDGBLERR_OFF; + if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE) + extctl |= INFINIPATH_EXTC_LEDGBLOK_ON; + } + else { + extctl = dd->ipath_extctrl & + ~(INFINIPATH_EXTC_LED1PRIPORT_ON | + INFINIPATH_EXTC_LED2PRIPORT_ON); + if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP) + extctl |= INFINIPATH_EXTC_LED1PRIPORT_ON; + if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE) + extctl |= INFINIPATH_EXTC_LED2PRIPORT_ON; + } + dd->ipath_extctrl = extctl; + ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, extctl); + spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags); +} + +static void ipath_init_ht_variables(struct ipath_devdata *dd) +{ + /* + * setup the register offsets, since they are different for each + * chip + */ + dd->ipath_kregs = &ipath_ht_kregs; + dd->ipath_cregs = &ipath_ht_cregs; + + dd->ipath_gpio_sda_num = _IPATH_GPIO_SDA_NUM; + dd->ipath_gpio_scl_num = _IPATH_GPIO_SCL_NUM; + dd->ipath_gpio_sda = IPATH_GPIO_SDA; + dd->ipath_gpio_scl = IPATH_GPIO_SCL; + + /* + * Fill in data for field-values that change in newer chips. + * We dynamically specify only the mask for LINKTRAININGSTATE + * and only the shift for LINKSTATE, as they are the only ones + * that change. Also precalculate the 3 link states of interest + * and the combined mask. + */ + dd->ibcs_ls_shift = IBA6110_IBCS_LINKSTATE_SHIFT; + dd->ibcs_lts_mask = IBA6110_IBCS_LINKTRAININGSTATE_MASK; + dd->ibcs_mask = (INFINIPATH_IBCS_LINKSTATE_MASK << + dd->ibcs_ls_shift) | dd->ibcs_lts_mask; + dd->ib_init = (INFINIPATH_IBCS_LT_STATE_LINKUP << + INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) | + (INFINIPATH_IBCS_L_STATE_INIT << dd->ibcs_ls_shift); + dd->ib_arm = (INFINIPATH_IBCS_LT_STATE_LINKUP << + INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) | + (INFINIPATH_IBCS_L_STATE_ARM << dd->ibcs_ls_shift); + dd->ib_active = (INFINIPATH_IBCS_LT_STATE_LINKUP << + INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) | + (INFINIPATH_IBCS_L_STATE_ACTIVE << dd->ibcs_ls_shift); + + /* + * Fill in data for ibcc field-values that change in newer chips. + * We dynamically specify only the mask for LINKINITCMD + * and only the shift for LINKCMD and MAXPKTLEN, as they are + * the only ones that change. + */ + dd->ibcc_lic_mask = INFINIPATH_IBCC_LINKINITCMD_MASK; + dd->ibcc_lc_shift = INFINIPATH_IBCC_LINKCMD_SHIFT; + dd->ibcc_mpl_shift = INFINIPATH_IBCC_MAXPKTLEN_SHIFT; + + /* Fill in shifts for RcvCtrl. */ + dd->ipath_r_portenable_shift = INFINIPATH_R_PORTENABLE_SHIFT; + dd->ipath_r_intravail_shift = INFINIPATH_R_INTRAVAIL_SHIFT; + dd->ipath_r_tailupd_shift = INFINIPATH_R_TAILUPD_SHIFT; + dd->ipath_r_portcfg_shift = 0; /* Not on IBA6110 */ + + dd->ipath_i_bitsextant = + (INFINIPATH_I_RCVURG_MASK << INFINIPATH_I_RCVURG_SHIFT) | + (INFINIPATH_I_RCVAVAIL_MASK << + INFINIPATH_I_RCVAVAIL_SHIFT) | + INFINIPATH_I_ERROR | INFINIPATH_I_SPIOSENT | + INFINIPATH_I_SPIOBUFAVAIL | INFINIPATH_I_GPIO; + + dd->ipath_e_bitsextant = + INFINIPATH_E_RFORMATERR | INFINIPATH_E_RVCRC | + INFINIPATH_E_RICRC | INFINIPATH_E_RMINPKTLEN | + INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RLONGPKTLEN | + INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RUNEXPCHAR | + INFINIPATH_E_RUNSUPVL | INFINIPATH_E_REBP | + INFINIPATH_E_RIBFLOW | INFINIPATH_E_RBADVERSION | + INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL | + INFINIPATH_E_RBADTID | INFINIPATH_E_RHDRLEN | + INFINIPATH_E_RHDR | INFINIPATH_E_RIBLOSTLINK | + INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SMAXPKTLEN | + INFINIPATH_E_SUNDERRUN | INFINIPATH_E_SPKTLEN | + INFINIPATH_E_SDROPPEDSMPPKT | INFINIPATH_E_SDROPPEDDATAPKT | + INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM | + INFINIPATH_E_SUNSUPVL | INFINIPATH_E_IBSTATUSCHANGED | + INFINIPATH_E_INVALIDADDR | INFINIPATH_E_RESET | + INFINIPATH_E_HARDWARE; + + dd->ipath_hwe_bitsextant = + (INFINIPATH_HWE_HTCMEMPARITYERR_MASK << + INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) | + (INFINIPATH_HWE_TXEMEMPARITYERR_MASK << + INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) | + (INFINIPATH_HWE_RXEMEMPARITYERR_MASK << + INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) | + INFINIPATH_HWE_HTCLNKABYTE0CRCERR | + INFINIPATH_HWE_HTCLNKABYTE1CRCERR | + INFINIPATH_HWE_HTCLNKBBYTE0CRCERR | + INFINIPATH_HWE_HTCLNKBBYTE1CRCERR | + INFINIPATH_HWE_HTCMISCERR4 | + INFINIPATH_HWE_HTCMISCERR5 | INFINIPATH_HWE_HTCMISCERR6 | + INFINIPATH_HWE_HTCMISCERR7 | + INFINIPATH_HWE_HTCBUSTREQPARITYERR | + INFINIPATH_HWE_HTCBUSTRESPPARITYERR | + INFINIPATH_HWE_HTCBUSIREQPARITYERR | + INFINIPATH_HWE_RXDSYNCMEMPARITYERR | + INFINIPATH_HWE_MEMBISTFAILED | + INFINIPATH_HWE_COREPLL_FBSLIP | + INFINIPATH_HWE_COREPLL_RFSLIP | + INFINIPATH_HWE_HTBPLL_FBSLIP | + INFINIPATH_HWE_HTBPLL_RFSLIP | + INFINIPATH_HWE_HTAPLL_FBSLIP | + INFINIPATH_HWE_HTAPLL_RFSLIP | + INFINIPATH_HWE_SERDESPLLFAILED | + INFINIPATH_HWE_IBCBUSTOSPCPARITYERR | + INFINIPATH_HWE_IBCBUSFRSPCPARITYERR; + + dd->ipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK; + dd->ipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK; + dd->ipath_i_rcvavail_shift = INFINIPATH_I_RCVAVAIL_SHIFT; + dd->ipath_i_rcvurg_shift = INFINIPATH_I_RCVURG_SHIFT; + + /* + * EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity. + * 2 is Some Misc, 3 is reserved for future. + */ + dd->ipath_eep_st_masks[0].hwerrs_to_log = + INFINIPATH_HWE_TXEMEMPARITYERR_MASK << + INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT; + + dd->ipath_eep_st_masks[1].hwerrs_to_log = + INFINIPATH_HWE_RXEMEMPARITYERR_MASK << + INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT; + + dd->ipath_eep_st_masks[2].errs_to_log = INFINIPATH_E_RESET; + + dd->delay_mult = 2; /* SDR, 4X, can't change */ + + dd->ipath_link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X; + dd->ipath_link_speed_supported = IPATH_IB_SDR; + dd->ipath_link_width_enabled = IB_WIDTH_4X; + dd->ipath_link_speed_enabled = dd->ipath_link_speed_supported; + /* these can't change for this chip, so set once */ + dd->ipath_link_width_active = dd->ipath_link_width_enabled; + dd->ipath_link_speed_active = dd->ipath_link_speed_enabled; +} + +/** + * ipath_ht_init_hwerrors - enable hardware errors + * @dd: the infinipath device + * + * now that we have finished initializing everything that might reasonably + * cause a hardware error, and cleared those errors bits as they occur, + * we can enable hardware errors in the mask (potentially enabling + * freeze mode), and enable hardware errors as errors (along with + * everything else) in errormask + */ +static void ipath_ht_init_hwerrors(struct ipath_devdata *dd) +{ + ipath_err_t val; + u64 extsval; + + extsval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus); + + if (!(extsval & INFINIPATH_EXTS_MEMBIST_ENDTEST)) + ipath_dev_err(dd, "MemBIST did not complete!\n"); + if (extsval & INFINIPATH_EXTS_MEMBIST_CORRECT) + ipath_dbg("MemBIST corrected\n"); + + ipath_check_htlink(dd); + + /* barring bugs, all hwerrors become interrupts, which can */ + val = -1LL; + /* don't look at crc lane1 if 8 bit */ + if (dd->ipath_flags & IPATH_8BIT_IN_HT0) + val &= ~infinipath_hwe_htclnkabyte1crcerr; + /* don't look at crc lane1 if 8 bit */ + if (dd->ipath_flags & IPATH_8BIT_IN_HT1) + val &= ~infinipath_hwe_htclnkbbyte1crcerr; + + /* + * disable RXDSYNCMEMPARITY because external serdes is unused, + * and therefore the logic will never be used or initialized, + * and uninitialized state will normally result in this error + * being asserted. Similarly for the external serdess pll + * lock signal. + */ + val &= ~(INFINIPATH_HWE_SERDESPLLFAILED | + INFINIPATH_HWE_RXDSYNCMEMPARITYERR); + + /* + * Disable MISCERR4 because of an inversion in the HT core + * logic checking for errors that cause this bit to be set. + * The errata can also cause the protocol error bit to be set + * in the HT config space linkerror register(s). + */ + val &= ~INFINIPATH_HWE_HTCMISCERR4; + + /* + * PLL ignored because unused MDIO interface has a logic problem + */ + if (dd->ipath_boardrev == 4 || dd->ipath_boardrev == 9) + val &= ~INFINIPATH_HWE_SERDESPLLFAILED; + dd->ipath_hwerrmask = val; +} + + + + +/** + * ipath_ht_bringup_serdes - bring up the serdes + * @dd: the infinipath device + */ +static int ipath_ht_bringup_serdes(struct ipath_devdata *dd) +{ + u64 val, config1; + int ret = 0, change = 0; + + ipath_dbg("Trying to bringup serdes\n"); + + if (ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus) & + INFINIPATH_HWE_SERDESPLLFAILED) + { + ipath_dbg("At start, serdes PLL failed bit set in " + "hwerrstatus, clearing and continuing\n"); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, + INFINIPATH_HWE_SERDESPLLFAILED); + } + + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0); + config1 = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig1); + + ipath_cdbg(VERBOSE, "Initial serdes status is config0=%llx " + "config1=%llx, sstatus=%llx xgxs %llx\n", + (unsigned long long) val, (unsigned long long) config1, + (unsigned long long) + ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus), + (unsigned long long) + ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig)); + + /* force reset on */ + val |= INFINIPATH_SERDC0_RESET_PLL + /* | INFINIPATH_SERDC0_RESET_MASK */ + ; + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val); + udelay(15); /* need pll reset set at least for a bit */ + + if (val & INFINIPATH_SERDC0_RESET_PLL) { + u64 val2 = val &= ~INFINIPATH_SERDC0_RESET_PLL; + /* set lane resets, and tx idle, during pll reset */ + val2 |= INFINIPATH_SERDC0_RESET_MASK | + INFINIPATH_SERDC0_TXIDLE; + ipath_cdbg(VERBOSE, "Clearing serdes PLL reset (writing " + "%llx)\n", (unsigned long long) val2); + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, + val2); + /* + * be sure chip saw it + */ + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + /* + * need pll reset clear at least 11 usec before lane + * resets cleared; give it a few more + */ + udelay(15); + val = val2; /* for check below */ + } + + if (val & (INFINIPATH_SERDC0_RESET_PLL | + INFINIPATH_SERDC0_RESET_MASK | + INFINIPATH_SERDC0_TXIDLE)) { + val &= ~(INFINIPATH_SERDC0_RESET_PLL | + INFINIPATH_SERDC0_RESET_MASK | + INFINIPATH_SERDC0_TXIDLE); + /* clear them */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, + val); + } + + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig); + if (val & INFINIPATH_XGXS_RESET) { + /* normally true after boot */ + val &= ~INFINIPATH_XGXS_RESET; + change = 1; + } + if (((val >> INFINIPATH_XGXS_RX_POL_SHIFT) & + INFINIPATH_XGXS_RX_POL_MASK) != dd->ipath_rx_pol_inv ) { + /* need to compensate for Tx inversion in partner */ + val &= ~(INFINIPATH_XGXS_RX_POL_MASK << + INFINIPATH_XGXS_RX_POL_SHIFT); + val |= dd->ipath_rx_pol_inv << + INFINIPATH_XGXS_RX_POL_SHIFT; + change = 1; + } + if (change) + ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val); + + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0); + + /* clear current and de-emphasis bits */ + config1 &= ~0x0ffffffff00ULL; + /* set current to 20ma */ + config1 |= 0x00000000000ULL; + /* set de-emphasis to -5.68dB */ + config1 |= 0x0cccc000000ULL; + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig1, config1); + + ipath_cdbg(VERBOSE, "After setup: serdes status is config0=%llx " + "config1=%llx, sstatus=%llx xgxs %llx\n", + (unsigned long long) val, (unsigned long long) config1, + (unsigned long long) + ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus), + (unsigned long long) + ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig)); + + return ret; /* for now, say we always succeeded */ +} + +/** + * ipath_ht_quiet_serdes - set serdes to txidle + * @dd: the infinipath device + * driver is being unloaded + */ +static void ipath_ht_quiet_serdes(struct ipath_devdata *dd) +{ + u64 val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0); + + val |= INFINIPATH_SERDC0_TXIDLE; + ipath_dbg("Setting TxIdleEn on serdes (config0 = %llx)\n", + (unsigned long long) val); + ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val); +} + +/** + * ipath_pe_put_tid - write a TID in chip + * @dd: the infinipath device + * @tidptr: pointer to the expected TID (in chip) to update + * @tidtype: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0) for expected + * @pa: physical address of in memory buffer; ipath_tidinvalid if freeing + * + * This exists as a separate routine to allow for special locking etc. + * It's used for both the full cleanup on exit, as well as the normal + * setup and teardown. + */ +static void ipath_ht_put_tid(struct ipath_devdata *dd, + u64 __iomem *tidptr, u32 type, + unsigned long pa) +{ + if (!dd->ipath_kregbase) + return; + + if (pa != dd->ipath_tidinvalid) { + if (unlikely((pa & ~INFINIPATH_RT_ADDR_MASK))) { + dev_info(&dd->pcidev->dev, + "physaddr %lx has more than " + "40 bits, using only 40!!!\n", pa); + pa &= INFINIPATH_RT_ADDR_MASK; + } + if (type == RCVHQ_RCV_TYPE_EAGER) + pa |= dd->ipath_tidtemplate; + else { + /* in words (fixed, full page). */ + u64 lenvalid = PAGE_SIZE >> 2; + lenvalid <<= INFINIPATH_RT_BUFSIZE_SHIFT; + pa |= lenvalid | INFINIPATH_RT_VALID; + } + } + + writeq(pa, tidptr); +} + + +/** + * ipath_ht_clear_tid - clear all TID entries for a port, expected and eager + * @dd: the infinipath device + * @port: the port + * + * Used from ipath_close(), and at chip initialization. + */ +static void ipath_ht_clear_tids(struct ipath_devdata *dd, unsigned port) +{ + u64 __iomem *tidbase; + int i; + + if (!dd->ipath_kregbase) + return; + + ipath_cdbg(VERBOSE, "Invalidate TIDs for port %u\n", port); + + /* + * need to invalidate all of the expected TID entries for this + * port, so we don't have valid entries that might somehow get + * used (early in next use of this port, or through some bug) + */ + tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) + + dd->ipath_rcvtidbase + + port * dd->ipath_rcvtidcnt * + sizeof(*tidbase)); + for (i = 0; i < dd->ipath_rcvtidcnt; i++) + ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED, + dd->ipath_tidinvalid); + + tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) + + dd->ipath_rcvegrbase + + port * dd->ipath_rcvegrcnt * + sizeof(*tidbase)); + + for (i = 0; i < dd->ipath_rcvegrcnt; i++) + ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER, + dd->ipath_tidinvalid); +} + +/** + * ipath_ht_tidtemplate - setup constants for TID updates + * @dd: the infinipath device + * + * We setup stuff that we use a lot, to avoid calculating each time + */ +static void ipath_ht_tidtemplate(struct ipath_devdata *dd) +{ + dd->ipath_tidtemplate = dd->ipath_ibmaxlen >> 2; + dd->ipath_tidtemplate <<= INFINIPATH_RT_BUFSIZE_SHIFT; + dd->ipath_tidtemplate |= INFINIPATH_RT_VALID; + + /* + * work around chip errata bug 7358, by marking invalid tids + * as having max length + */ + dd->ipath_tidinvalid = (-1LL & INFINIPATH_RT_BUFSIZE_MASK) << + INFINIPATH_RT_BUFSIZE_SHIFT; +} + +static int ipath_ht_early_init(struct ipath_devdata *dd) +{ + u32 __iomem *piobuf; + u32 pioincr, val32; + int i; + + /* + * one cache line; long IB headers will spill over into received + * buffer + */ + dd->ipath_rcvhdrentsize = 16; + dd->ipath_rcvhdrsize = IPATH_DFLT_RCVHDRSIZE; + + /* + * For HT, we allocate a somewhat overly large eager buffer, + * such that we can guarantee that we can receive the largest + * packet that we can send out. To truly support a 4KB MTU, + * we need to bump this to a large value. To date, other than + * testing, we have never encountered an HCA that can really + * send 4KB MTU packets, so we do not handle that (we'll get + * errors interrupts if we ever see one). + */ + dd->ipath_rcvegrbufsize = dd->ipath_piosize2k; + + /* + * the min() check here is currently a nop, but it may not + * always be, depending on just how we do ipath_rcvegrbufsize + */ + dd->ipath_ibmaxlen = min(dd->ipath_piosize2k, + dd->ipath_rcvegrbufsize); + dd->ipath_init_ibmaxlen = dd->ipath_ibmaxlen; + ipath_ht_tidtemplate(dd); + + /* + * zero all the TID entries at startup. We do this for sanity, + * in case of a previous driver crash of some kind, and also + * because the chip powers up with these memories in an unknown + * state. Use portcnt, not cfgports, since this is for the + * full chip, not for current (possibly different) configuration + * value. + * Chip Errata bug 6447 + */ + for (val32 = 0; val32 < dd->ipath_portcnt; val32++) + ipath_ht_clear_tids(dd, val32); + + /* + * write the pbc of each buffer, to be sure it's initialized, then + * cancel all the buffers, and also abort any packets that might + * have been in flight for some reason (the latter is for driver + * unload/reload, but isn't a bad idea at first init). PIO send + * isn't enabled at this point, so there is no danger of sending + * these out on the wire. + * Chip Errata bug 6610 + */ + piobuf = (u32 __iomem *) (((char __iomem *)(dd->ipath_kregbase)) + + dd->ipath_piobufbase); + pioincr = dd->ipath_palign / sizeof(*piobuf); + for (i = 0; i < dd->ipath_piobcnt2k; i++) { + /* + * reasonable word count, just to init pbc + */ + writel(16, piobuf); + piobuf += pioincr; + } + + ipath_get_eeprom_info(dd); + if (dd->ipath_boardrev == 5) { + /* + * Later production QHT7040 has same changes as QHT7140, so + * can use GPIO interrupts. They have serial #'s starting + * with 128, rather than 112. + */ + if (dd->ipath_serial[0] == '1' && + dd->ipath_serial[1] == '2' && + dd->ipath_serial[2] == '8') + dd->ipath_flags |= IPATH_GPIO_INTR; + else { + ipath_dev_err(dd, "Unsupported InfiniPath board " + "(serial number %.16s)!\n", + dd->ipath_serial); + return 1; + } + } + + if (dd->ipath_minrev >= 4) { + /* Rev4+ reports extra errors via internal GPIO pins */ + dd->ipath_flags |= IPATH_GPIO_ERRINTRS; + dd->ipath_gpio_mask |= IPATH_GPIO_ERRINTR_MASK; + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, + dd->ipath_gpio_mask); + } + + return 0; +} + + +/** + * ipath_init_ht_get_base_info - set chip-specific flags for user code + * @dd: the infinipath device + * @kbase: ipath_base_info pointer + * + * We set the PCIE flag because the lower bandwidth on PCIe vs + * HyperTransport can affect some user packet algorithms. + */ +static int ipath_ht_get_base_info(struct ipath_portdata *pd, void *kbase) +{ + struct ipath_base_info *kinfo = kbase; + + kinfo->spi_runtime_flags |= IPATH_RUNTIME_HT | + IPATH_RUNTIME_PIO_REGSWAPPED; + + if (pd->port_dd->ipath_minrev < 4) + kinfo->spi_runtime_flags |= IPATH_RUNTIME_RCVHDR_COPY; + + return 0; +} + +static void ipath_ht_free_irq(struct ipath_devdata *dd) +{ + free_irq(dd->ipath_irq, dd); + ht_destroy_irq(dd->ipath_irq); + dd->ipath_irq = 0; + dd->ipath_intconfig = 0; +} + +static struct ipath_message_header * +ipath_ht_get_msgheader(struct ipath_devdata *dd, __le32 *rhf_addr) +{ + return (struct ipath_message_header *) + &rhf_addr[sizeof(u64) / sizeof(u32)]; +} + +static void ipath_ht_config_ports(struct ipath_devdata *dd, ushort cfgports) +{ + dd->ipath_portcnt = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_portcnt); + dd->ipath_p0_rcvegrcnt = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt); +} + +static void ipath_ht_read_counters(struct ipath_devdata *dd, + struct infinipath_counters *cntrs) +{ + cntrs->LBIntCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBIntCnt)); + cntrs->LBFlowStallCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBFlowStallCnt)); + cntrs->TxSDmaDescCnt = 0; + cntrs->TxUnsupVLErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnsupVLErrCnt)); + cntrs->TxDataPktCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDataPktCnt)); + cntrs->TxFlowPktCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowPktCnt)); + cntrs->TxDwordCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDwordCnt)); + cntrs->TxLenErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxLenErrCnt)); + cntrs->TxMaxMinLenErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxMaxMinLenErrCnt)); + cntrs->TxUnderrunCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnderrunCnt)); + cntrs->TxFlowStallCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowStallCnt)); + cntrs->TxDroppedPktCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDroppedPktCnt)); + cntrs->RxDroppedPktCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDroppedPktCnt)); + cntrs->RxDataPktCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDataPktCnt)); + cntrs->RxFlowPktCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowPktCnt)); + cntrs->RxDwordCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDwordCnt)); + cntrs->RxLenErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLenErrCnt)); + cntrs->RxMaxMinLenErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxMaxMinLenErrCnt)); + cntrs->RxICRCErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxICRCErrCnt)); + cntrs->RxVCRCErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxVCRCErrCnt)); + cntrs->RxFlowCtrlErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowCtrlErrCnt)); + cntrs->RxBadFormatCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBadFormatCnt)); + cntrs->RxLinkProblemCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLinkProblemCnt)); + cntrs->RxEBPCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxEBPCnt)); + cntrs->RxLPCRCErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLPCRCErrCnt)); + cntrs->RxBufOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBufOvflCnt)); + cntrs->RxTIDFullErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDFullErrCnt)); + cntrs->RxTIDValidErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDValidErrCnt)); + cntrs->RxPKeyMismatchCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxPKeyMismatchCnt)); + cntrs->RxP0HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt)); + cntrs->RxP1HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP1HdrEgrOvflCnt)); + cntrs->RxP2HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP2HdrEgrOvflCnt)); + cntrs->RxP3HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP3HdrEgrOvflCnt)); + cntrs->RxP4HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP4HdrEgrOvflCnt)); + cntrs->RxP5HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP5HdrEgrOvflCnt)); + cntrs->RxP6HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP6HdrEgrOvflCnt)); + cntrs->RxP7HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP7HdrEgrOvflCnt)); + cntrs->RxP8HdrEgrOvflCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP8HdrEgrOvflCnt)); + cntrs->RxP9HdrEgrOvflCnt = 0; + cntrs->RxP10HdrEgrOvflCnt = 0; + cntrs->RxP11HdrEgrOvflCnt = 0; + cntrs->RxP12HdrEgrOvflCnt = 0; + cntrs->RxP13HdrEgrOvflCnt = 0; + cntrs->RxP14HdrEgrOvflCnt = 0; + cntrs->RxP15HdrEgrOvflCnt = 0; + cntrs->RxP16HdrEgrOvflCnt = 0; + cntrs->IBStatusChangeCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBStatusChangeCnt)); + cntrs->IBLinkErrRecoveryCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt)); + cntrs->IBLinkDownedCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkDownedCnt)); + cntrs->IBSymbolErrCnt = + ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBSymbolErrCnt)); + cntrs->RxVL15DroppedPktCnt = 0; + cntrs->RxOtherLocalPhyErrCnt = 0; + cntrs->PcieRetryBufDiagQwordCnt = 0; + cntrs->ExcessBufferOvflCnt = dd->ipath_overrun_thresh_errs; + cntrs->LocalLinkIntegrityErrCnt = + (dd->ipath_flags & IPATH_GPIO_ERRINTRS) ? + dd->ipath_lli_errs : dd->ipath_lli_errors; + cntrs->RxVlErrCnt = 0; + cntrs->RxDlidFltrCnt = 0; +} + + +/* no interrupt fallback for these chips */ +static int ipath_ht_nointr_fallback(struct ipath_devdata *dd) +{ + return 0; +} + + +/* + * reset the XGXS (between serdes and IBC). Slightly less intrusive + * than resetting the IBC or external link state, and useful in some + * cases to cause some retraining. To do this right, we reset IBC + * as well. + */ +static void ipath_ht_xgxs_reset(struct ipath_devdata *dd) +{ + u64 val, prev_val; + + prev_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig); + val = prev_val | INFINIPATH_XGXS_RESET; + prev_val &= ~INFINIPATH_XGXS_RESET; /* be sure */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control & ~INFINIPATH_C_LINKENABLE); + ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val); + ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch); + ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, prev_val); + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control); +} + + +static int ipath_ht_get_ib_cfg(struct ipath_devdata *dd, int which) +{ + int ret; + + switch (which) { + case IPATH_IB_CFG_LWID: + ret = dd->ipath_link_width_active; + break; + case IPATH_IB_CFG_SPD: + ret = dd->ipath_link_speed_active; + break; + case IPATH_IB_CFG_LWID_ENB: + ret = dd->ipath_link_width_enabled; + break; + case IPATH_IB_CFG_SPD_ENB: + ret = dd->ipath_link_speed_enabled; + break; + default: + ret = -ENOTSUPP; + break; + } + return ret; +} + + +/* we assume range checking is already done, if needed */ +static int ipath_ht_set_ib_cfg(struct ipath_devdata *dd, int which, u32 val) +{ + int ret = 0; + + if (which == IPATH_IB_CFG_LWID_ENB) + dd->ipath_link_width_enabled = val; + else if (which == IPATH_IB_CFG_SPD_ENB) + dd->ipath_link_speed_enabled = val; + else + ret = -ENOTSUPP; + return ret; +} + + +static void ipath_ht_config_jint(struct ipath_devdata *dd, u16 a, u16 b) +{ +} + + +static int ipath_ht_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs) +{ + ipath_setup_ht_setextled(dd, ipath_ib_linkstate(dd, ibcs), + ipath_ib_linktrstate(dd, ibcs)); + return 0; +} + + +/** + * ipath_init_iba6110_funcs - set up the chip-specific function pointers + * @dd: the infinipath device + * + * This is global, and is called directly at init to set up the + * chip-specific function pointers for later use. + */ +void ipath_init_iba6110_funcs(struct ipath_devdata *dd) +{ + dd->ipath_f_intrsetup = ipath_ht_intconfig; + dd->ipath_f_bus = ipath_setup_ht_config; + dd->ipath_f_reset = ipath_setup_ht_reset; + dd->ipath_f_get_boardname = ipath_ht_boardname; + dd->ipath_f_init_hwerrors = ipath_ht_init_hwerrors; + dd->ipath_f_early_init = ipath_ht_early_init; + dd->ipath_f_handle_hwerrors = ipath_ht_handle_hwerrors; + dd->ipath_f_quiet_serdes = ipath_ht_quiet_serdes; + dd->ipath_f_bringup_serdes = ipath_ht_bringup_serdes; + dd->ipath_f_clear_tids = ipath_ht_clear_tids; + dd->ipath_f_put_tid = ipath_ht_put_tid; + dd->ipath_f_cleanup = ipath_setup_ht_cleanup; + dd->ipath_f_setextled = ipath_setup_ht_setextled; + dd->ipath_f_get_base_info = ipath_ht_get_base_info; + dd->ipath_f_free_irq = ipath_ht_free_irq; + dd->ipath_f_tidtemplate = ipath_ht_tidtemplate; + dd->ipath_f_intr_fallback = ipath_ht_nointr_fallback; + dd->ipath_f_get_msgheader = ipath_ht_get_msgheader; + dd->ipath_f_config_ports = ipath_ht_config_ports; + dd->ipath_f_read_counters = ipath_ht_read_counters; + dd->ipath_f_xgxs_reset = ipath_ht_xgxs_reset; + dd->ipath_f_get_ib_cfg = ipath_ht_get_ib_cfg; + dd->ipath_f_set_ib_cfg = ipath_ht_set_ib_cfg; + dd->ipath_f_config_jint = ipath_ht_config_jint; + dd->ipath_f_ib_updown = ipath_ht_ib_updown; + + /* + * initialize chip-specific variables + */ + ipath_init_ht_variables(dd); +} diff --git a/drivers/staging/rdma/ipath/ipath_init_chip.c b/drivers/staging/rdma/ipath/ipath_init_chip.c new file mode 100644 index 000000000..be2a60e14 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_init_chip.c @@ -0,0 +1,1066 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" +#include "ipath_common.h" + +/* + * min buffers we want to have per port, after driver + */ +#define IPATH_MIN_USER_PORT_BUFCNT 7 + +/* + * Number of ports we are configured to use (to allow for more pio + * buffers per port, etc.) Zero means use chip value. + */ +static ushort ipath_cfgports; + +module_param_named(cfgports, ipath_cfgports, ushort, S_IRUGO); +MODULE_PARM_DESC(cfgports, "Set max number of ports to use"); + +/* + * Number of buffers reserved for driver (verbs and layered drivers.) + * Initialized based on number of PIO buffers if not set via module interface. + * The problem with this is that it's global, but we'll use different + * numbers for different chip types. + */ +static ushort ipath_kpiobufs; + +static int ipath_set_kpiobufs(const char *val, struct kernel_param *kp); + +module_param_call(kpiobufs, ipath_set_kpiobufs, param_get_ushort, + &ipath_kpiobufs, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(kpiobufs, "Set number of PIO buffers for driver"); + +/** + * create_port0_egr - allocate the eager TID buffers + * @dd: the infinipath device + * + * This code is now quite different for user and kernel, because + * the kernel uses skb's, for the accelerated network performance. + * This is the kernel (port0) version. + * + * Allocate the eager TID buffers and program them into infinipath. + * We use the network layer alloc_skb() allocator to allocate the + * memory, and either use the buffers as is for things like verbs + * packets, or pass the buffers up to the ipath layered driver and + * thence the network layer, replacing them as we do so (see + * ipath_rcv_layer()). + */ +static int create_port0_egr(struct ipath_devdata *dd) +{ + unsigned e, egrcnt; + struct ipath_skbinfo *skbinfo; + int ret; + + egrcnt = dd->ipath_p0_rcvegrcnt; + + skbinfo = vmalloc(sizeof(*dd->ipath_port0_skbinfo) * egrcnt); + if (skbinfo == NULL) { + ipath_dev_err(dd, "allocation error for eager TID " + "skb array\n"); + ret = -ENOMEM; + goto bail; + } + for (e = 0; e < egrcnt; e++) { + /* + * This is a bit tricky in that we allocate extra + * space for 2 bytes of the 14 byte ethernet header. + * These two bytes are passed in the ipath header so + * the rest of the data is word aligned. We allocate + * 4 bytes so that the data buffer stays word aligned. + * See ipath_kreceive() for more details. + */ + skbinfo[e].skb = ipath_alloc_skb(dd, GFP_KERNEL); + if (!skbinfo[e].skb) { + ipath_dev_err(dd, "SKB allocation error for " + "eager TID %u\n", e); + while (e != 0) + dev_kfree_skb(skbinfo[--e].skb); + vfree(skbinfo); + ret = -ENOMEM; + goto bail; + } + } + /* + * After loop above, so we can test non-NULL to see if ready + * to use at receive, etc. + */ + dd->ipath_port0_skbinfo = skbinfo; + + for (e = 0; e < egrcnt; e++) { + dd->ipath_port0_skbinfo[e].phys = + ipath_map_single(dd->pcidev, + dd->ipath_port0_skbinfo[e].skb->data, + dd->ipath_ibmaxlen, PCI_DMA_FROMDEVICE); + dd->ipath_f_put_tid(dd, e + (u64 __iomem *) + ((char __iomem *) dd->ipath_kregbase + + dd->ipath_rcvegrbase), + RCVHQ_RCV_TYPE_EAGER, + dd->ipath_port0_skbinfo[e].phys); + } + + ret = 0; + +bail: + return ret; +} + +static int bringup_link(struct ipath_devdata *dd) +{ + u64 val, ibc; + int ret = 0; + + /* hold IBC in reset */ + dd->ipath_control &= ~INFINIPATH_C_LINKENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control); + + /* + * set initial max size pkt IBC will send, including ICRC; it's the + * PIO buffer size in dwords, less 1; also see ipath_set_mtu() + */ + val = (dd->ipath_ibmaxlen >> 2) + 1; + ibc = val << dd->ibcc_mpl_shift; + + /* flowcontrolwatermark is in units of KBytes */ + ibc |= 0x5ULL << INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT; + /* + * How often flowctrl sent. More or less in usecs; balance against + * watermark value, so that in theory senders always get a flow + * control update in time to not let the IB link go idle. + */ + ibc |= 0x3ULL << INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT; + /* max error tolerance */ + ibc |= 0xfULL << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT; + /* use "real" buffer space for */ + ibc |= 4ULL << INFINIPATH_IBCC_CREDITSCALE_SHIFT; + /* IB credit flow control. */ + ibc |= 0xfULL << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT; + /* initially come up waiting for TS1, without sending anything. */ + dd->ipath_ibcctrl = ibc; + /* + * Want to start out with both LINKCMD and LINKINITCMD in NOP + * (0 and 0). Don't put linkinitcmd in ipath_ibcctrl, want that + * to stay a NOP. Flag that we are disabled, for the (unlikely) + * case that some recovery path is trying to bring the link up + * before we are ready. + */ + ibc |= INFINIPATH_IBCC_LINKINITCMD_DISABLE << + INFINIPATH_IBCC_LINKINITCMD_SHIFT; + dd->ipath_flags |= IPATH_IB_LINK_DISABLED; + ipath_cdbg(VERBOSE, "Writing 0x%llx to ibcctrl\n", + (unsigned long long) ibc); + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, ibc); + + // be sure chip saw it + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + + ret = dd->ipath_f_bringup_serdes(dd); + + if (ret) + dev_info(&dd->pcidev->dev, "Could not initialize SerDes, " + "not usable\n"); + else { + /* enable IBC */ + dd->ipath_control |= INFINIPATH_C_LINKENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control); + } + + return ret; +} + +static struct ipath_portdata *create_portdata0(struct ipath_devdata *dd) +{ + struct ipath_portdata *pd = NULL; + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (pd) { + pd->port_dd = dd; + pd->port_cnt = 1; + /* The port 0 pkey table is used by the layer interface. */ + pd->port_pkeys[0] = IPATH_DEFAULT_P_KEY; + pd->port_seq_cnt = 1; + } + return pd; +} + +static int init_chip_first(struct ipath_devdata *dd) +{ + struct ipath_portdata *pd; + int ret = 0; + u64 val; + + spin_lock_init(&dd->ipath_kernel_tid_lock); + spin_lock_init(&dd->ipath_user_tid_lock); + spin_lock_init(&dd->ipath_sendctrl_lock); + spin_lock_init(&dd->ipath_uctxt_lock); + spin_lock_init(&dd->ipath_sdma_lock); + spin_lock_init(&dd->ipath_gpio_lock); + spin_lock_init(&dd->ipath_eep_st_lock); + spin_lock_init(&dd->ipath_sdepb_lock); + mutex_init(&dd->ipath_eep_lock); + + /* + * skip cfgports stuff because we are not allocating memory, + * and we don't want problems if the portcnt changed due to + * cfgports. We do still check and report a difference, if + * not same (should be impossible). + */ + dd->ipath_f_config_ports(dd, ipath_cfgports); + if (!ipath_cfgports) + dd->ipath_cfgports = dd->ipath_portcnt; + else if (ipath_cfgports <= dd->ipath_portcnt) { + dd->ipath_cfgports = ipath_cfgports; + ipath_dbg("Configured to use %u ports out of %u in chip\n", + dd->ipath_cfgports, ipath_read_kreg32(dd, + dd->ipath_kregs->kr_portcnt)); + } else { + dd->ipath_cfgports = dd->ipath_portcnt; + ipath_dbg("Tried to configured to use %u ports; chip " + "only supports %u\n", ipath_cfgports, + ipath_read_kreg32(dd, + dd->ipath_kregs->kr_portcnt)); + } + /* + * Allocate full portcnt array, rather than just cfgports, because + * cleanup iterates across all possible ports. + */ + dd->ipath_pd = kzalloc(sizeof(*dd->ipath_pd) * dd->ipath_portcnt, + GFP_KERNEL); + + if (!dd->ipath_pd) { + ipath_dev_err(dd, "Unable to allocate portdata array, " + "failing\n"); + ret = -ENOMEM; + goto done; + } + + pd = create_portdata0(dd); + if (!pd) { + ipath_dev_err(dd, "Unable to allocate portdata for port " + "0, failing\n"); + ret = -ENOMEM; + goto done; + } + dd->ipath_pd[0] = pd; + + dd->ipath_rcvtidcnt = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt); + dd->ipath_rcvtidbase = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase); + dd->ipath_rcvegrcnt = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt); + dd->ipath_rcvegrbase = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase); + dd->ipath_palign = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign); + dd->ipath_piobufbase = + ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufbase); + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiosize); + dd->ipath_piosize2k = val & ~0U; + dd->ipath_piosize4k = val >> 32; + if (dd->ipath_piosize4k == 0 && ipath_mtu4096) + ipath_mtu4096 = 0; /* 4KB not supported by this chip */ + dd->ipath_ibmtu = ipath_mtu4096 ? 4096 : 2048; + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufcnt); + dd->ipath_piobcnt2k = val & ~0U; + dd->ipath_piobcnt4k = val >> 32; + dd->ipath_pio2kbase = + (u32 __iomem *) (((char __iomem *) dd->ipath_kregbase) + + (dd->ipath_piobufbase & 0xffffffff)); + if (dd->ipath_piobcnt4k) { + dd->ipath_pio4kbase = (u32 __iomem *) + (((char __iomem *) dd->ipath_kregbase) + + (dd->ipath_piobufbase >> 32)); + /* + * 4K buffers take 2 pages; we use roundup just to be + * paranoid; we calculate it once here, rather than on + * ever buf allocate + */ + dd->ipath_4kalign = ALIGN(dd->ipath_piosize4k, + dd->ipath_palign); + ipath_dbg("%u 2k(%x) piobufs @ %p, %u 4k(%x) @ %p " + "(%x aligned)\n", + dd->ipath_piobcnt2k, dd->ipath_piosize2k, + dd->ipath_pio2kbase, dd->ipath_piobcnt4k, + dd->ipath_piosize4k, dd->ipath_pio4kbase, + dd->ipath_4kalign); + } + else ipath_dbg("%u 2k piobufs @ %p\n", + dd->ipath_piobcnt2k, dd->ipath_pio2kbase); + +done: + return ret; +} + +/** + * init_chip_reset - re-initialize after a reset, or enable + * @dd: the infinipath device + * + * sanity check at least some of the values after reset, and + * ensure no receive or transmit (explicitly, in case reset + * failed + */ +static int init_chip_reset(struct ipath_devdata *dd) +{ + u32 rtmp; + int i; + unsigned long flags; + + /* + * ensure chip does no sends or receives, tail updates, or + * pioavail updates while we re-initialize + */ + dd->ipath_rcvctrl &= ~(1ULL << dd->ipath_r_tailupd_shift); + for (i = 0; i < dd->ipath_portcnt; i++) { + clear_bit(dd->ipath_r_portenable_shift + i, + &dd->ipath_rcvctrl); + clear_bit(dd->ipath_r_intravail_shift + i, + &dd->ipath_rcvctrl); + } + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl = 0U; /* no sdma, etc */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL); + + rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt); + if (rtmp != dd->ipath_rcvtidcnt) + dev_info(&dd->pcidev->dev, "tidcnt was %u before " + "reset, now %u, using original\n", + dd->ipath_rcvtidcnt, rtmp); + rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase); + if (rtmp != dd->ipath_rcvtidbase) + dev_info(&dd->pcidev->dev, "tidbase was %u before " + "reset, now %u, using original\n", + dd->ipath_rcvtidbase, rtmp); + rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt); + if (rtmp != dd->ipath_rcvegrcnt) + dev_info(&dd->pcidev->dev, "egrcnt was %u before " + "reset, now %u, using original\n", + dd->ipath_rcvegrcnt, rtmp); + rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase); + if (rtmp != dd->ipath_rcvegrbase) + dev_info(&dd->pcidev->dev, "egrbase was %u before " + "reset, now %u, using original\n", + dd->ipath_rcvegrbase, rtmp); + + return 0; +} + +static int init_pioavailregs(struct ipath_devdata *dd) +{ + int ret; + + dd->ipath_pioavailregs_dma = dma_alloc_coherent( + &dd->pcidev->dev, PAGE_SIZE, &dd->ipath_pioavailregs_phys, + GFP_KERNEL); + if (!dd->ipath_pioavailregs_dma) { + ipath_dev_err(dd, "failed to allocate PIOavail reg area " + "in memory\n"); + ret = -ENOMEM; + goto done; + } + + /* + * we really want L2 cache aligned, but for current CPUs of + * interest, they are the same. + */ + dd->ipath_statusp = (u64 *) + ((char *)dd->ipath_pioavailregs_dma + + ((2 * L1_CACHE_BYTES + + dd->ipath_pioavregs * sizeof(u64)) & ~L1_CACHE_BYTES)); + /* copy the current value now that it's really allocated */ + *dd->ipath_statusp = dd->_ipath_status; + /* + * setup buffer to hold freeze msg, accessible to apps, + * following statusp + */ + dd->ipath_freezemsg = (char *)&dd->ipath_statusp[1]; + /* and its length */ + dd->ipath_freezelen = L1_CACHE_BYTES - sizeof(dd->ipath_statusp[0]); + + ret = 0; + +done: + return ret; +} + +/** + * init_shadow_tids - allocate the shadow TID array + * @dd: the infinipath device + * + * allocate the shadow TID array, so we can ipath_munlock previous + * entries. It may make more sense to move the pageshadow to the + * port data structure, so we only allocate memory for ports actually + * in use, since we at 8k per port, now. + */ +static void init_shadow_tids(struct ipath_devdata *dd) +{ + struct page **pages; + dma_addr_t *addrs; + + pages = vzalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt * + sizeof(struct page *)); + if (!pages) { + ipath_dev_err(dd, "failed to allocate shadow page * " + "array, no expected sends!\n"); + dd->ipath_pageshadow = NULL; + return; + } + + addrs = vmalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt * + sizeof(dma_addr_t)); + if (!addrs) { + ipath_dev_err(dd, "failed to allocate shadow dma handle " + "array, no expected sends!\n"); + vfree(pages); + dd->ipath_pageshadow = NULL; + return; + } + + dd->ipath_pageshadow = pages; + dd->ipath_physshadow = addrs; +} + +static void enable_chip(struct ipath_devdata *dd, int reinit) +{ + u32 val; + u64 rcvmask; + unsigned long flags; + int i; + + if (!reinit) + init_waitqueue_head(&ipath_state_wait); + + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + /* Enable PIO send, and update of PIOavail regs to memory. */ + dd->ipath_sendctrl = INFINIPATH_S_PIOENABLE | + INFINIPATH_S_PIOBUFAVAILUPD; + + /* + * Set the PIO avail update threshold to host memory + * on chips that support it. + */ + if (dd->ipath_pioupd_thresh) + dd->ipath_sendctrl |= dd->ipath_pioupd_thresh + << INFINIPATH_S_UPDTHRESH_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + /* + * Enable kernel ports' receive and receive interrupt. + * Other ports done as user opens and inits them. + */ + rcvmask = 1ULL; + dd->ipath_rcvctrl |= (rcvmask << dd->ipath_r_portenable_shift) | + (rcvmask << dd->ipath_r_intravail_shift); + if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) + dd->ipath_rcvctrl |= (1ULL << dd->ipath_r_tailupd_shift); + + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + + /* + * now ready for use. this should be cleared whenever we + * detect a reset, or initiate one. + */ + dd->ipath_flags |= IPATH_INITTED; + + /* + * Init our shadow copies of head from tail values, + * and write head values to match. + */ + val = ipath_read_ureg32(dd, ur_rcvegrindextail, 0); + ipath_write_ureg(dd, ur_rcvegrindexhead, val, 0); + + /* Initialize so we interrupt on next packet received */ + ipath_write_ureg(dd, ur_rcvhdrhead, + dd->ipath_rhdrhead_intr_off | + dd->ipath_pd[0]->port_head, 0); + + /* + * by now pioavail updates to memory should have occurred, so + * copy them into our working/shadow registers; this is in + * case something went wrong with abort, but mostly to get the + * initial values of the generation bit correct. + */ + for (i = 0; i < dd->ipath_pioavregs; i++) { + __le64 pioavail; + + /* + * Chip Errata bug 6641; even and odd qwords>3 are swapped. + */ + if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) + pioavail = dd->ipath_pioavailregs_dma[i ^ 1]; + else + pioavail = dd->ipath_pioavailregs_dma[i]; + /* + * don't need to worry about ipath_pioavailkernel here + * because we will call ipath_chg_pioavailkernel() later + * in initialization, to busy out buffers as needed + */ + dd->ipath_pioavailshadow[i] = le64_to_cpu(pioavail); + } + /* can get counters, stats, etc. */ + dd->ipath_flags |= IPATH_PRESENT; +} + +static int init_housekeeping(struct ipath_devdata *dd, int reinit) +{ + char boardn[40]; + int ret = 0; + + /* + * have to clear shadow copies of registers at init that are + * not otherwise set here, or all kinds of bizarre things + * happen with driver on chip reset + */ + dd->ipath_rcvhdrsize = 0; + + /* + * Don't clear ipath_flags as 8bit mode was set before + * entering this func. However, we do set the linkstate to + * unknown, so we can watch for a transition. + * PRESENT is set because we want register reads to work, + * and the kernel infrastructure saw it in config space; + * We clear it if we have failures. + */ + dd->ipath_flags |= IPATH_LINKUNK | IPATH_PRESENT; + dd->ipath_flags &= ~(IPATH_LINKACTIVE | IPATH_LINKARMED | + IPATH_LINKDOWN | IPATH_LINKINIT); + + ipath_cdbg(VERBOSE, "Try to read spc chip revision\n"); + dd->ipath_revision = + ipath_read_kreg64(dd, dd->ipath_kregs->kr_revision); + + /* + * set up fundamental info we need to use the chip; we assume + * if the revision reg and these regs are OK, we don't need to + * special case the rest + */ + dd->ipath_sregbase = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_sendregbase); + dd->ipath_cregbase = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_counterregbase); + dd->ipath_uregbase = + ipath_read_kreg32(dd, dd->ipath_kregs->kr_userregbase); + ipath_cdbg(VERBOSE, "ipath_kregbase %p, sendbase %x usrbase %x, " + "cntrbase %x\n", dd->ipath_kregbase, dd->ipath_sregbase, + dd->ipath_uregbase, dd->ipath_cregbase); + if ((dd->ipath_revision & 0xffffffff) == 0xffffffff + || (dd->ipath_sregbase & 0xffffffff) == 0xffffffff + || (dd->ipath_cregbase & 0xffffffff) == 0xffffffff + || (dd->ipath_uregbase & 0xffffffff) == 0xffffffff) { + ipath_dev_err(dd, "Register read failures from chip, " + "giving up initialization\n"); + dd->ipath_flags &= ~IPATH_PRESENT; + ret = -ENODEV; + goto done; + } + + + /* clear diagctrl register, in case diags were running and crashed */ + ipath_write_kreg (dd, dd->ipath_kregs->kr_hwdiagctrl, 0); + + /* clear the initial reset flag, in case first driver load */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, + INFINIPATH_E_RESET); + + ipath_cdbg(VERBOSE, "Revision %llx (PCI %x)\n", + (unsigned long long) dd->ipath_revision, + dd->ipath_pcirev); + + if (((dd->ipath_revision >> INFINIPATH_R_SOFTWARE_SHIFT) & + INFINIPATH_R_SOFTWARE_MASK) != IPATH_CHIP_SWVERSION) { + ipath_dev_err(dd, "Driver only handles version %d, " + "chip swversion is %d (%llx), failng\n", + IPATH_CHIP_SWVERSION, + (int)(dd->ipath_revision >> + INFINIPATH_R_SOFTWARE_SHIFT) & + INFINIPATH_R_SOFTWARE_MASK, + (unsigned long long) dd->ipath_revision); + ret = -ENOSYS; + goto done; + } + dd->ipath_majrev = (u8) ((dd->ipath_revision >> + INFINIPATH_R_CHIPREVMAJOR_SHIFT) & + INFINIPATH_R_CHIPREVMAJOR_MASK); + dd->ipath_minrev = (u8) ((dd->ipath_revision >> + INFINIPATH_R_CHIPREVMINOR_SHIFT) & + INFINIPATH_R_CHIPREVMINOR_MASK); + dd->ipath_boardrev = (u8) ((dd->ipath_revision >> + INFINIPATH_R_BOARDID_SHIFT) & + INFINIPATH_R_BOARDID_MASK); + + ret = dd->ipath_f_get_boardname(dd, boardn, sizeof boardn); + + snprintf(dd->ipath_boardversion, sizeof(dd->ipath_boardversion), + "ChipABI %u.%u, %s, InfiniPath%u %u.%u, PCI %u, " + "SW Compat %u\n", + IPATH_CHIP_VERS_MAJ, IPATH_CHIP_VERS_MIN, boardn, + (unsigned)(dd->ipath_revision >> INFINIPATH_R_ARCH_SHIFT) & + INFINIPATH_R_ARCH_MASK, + dd->ipath_majrev, dd->ipath_minrev, dd->ipath_pcirev, + (unsigned)(dd->ipath_revision >> + INFINIPATH_R_SOFTWARE_SHIFT) & + INFINIPATH_R_SOFTWARE_MASK); + + ipath_dbg("%s", dd->ipath_boardversion); + + if (ret) + goto done; + + if (reinit) + ret = init_chip_reset(dd); + else + ret = init_chip_first(dd); + +done: + return ret; +} + +static void verify_interrupt(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *) opaque; + + if (!dd) + return; /* being torn down */ + + /* + * If we don't have any interrupts, let the user know and + * don't bother checking again. + */ + if (dd->ipath_int_counter == 0) { + if (!dd->ipath_f_intr_fallback(dd)) + dev_err(&dd->pcidev->dev, "No interrupts detected, " + "not usable.\n"); + else /* re-arm the timer to see if fallback works */ + mod_timer(&dd->ipath_intrchk_timer, jiffies + HZ/2); + } else + ipath_cdbg(VERBOSE, "%u interrupts at timer check\n", + dd->ipath_int_counter); +} + +/** + * ipath_init_chip - do the actual initialization sequence on the chip + * @dd: the infinipath device + * @reinit: reinitializing, so don't allocate new memory + * + * Do the actual initialization sequence on the chip. This is done + * both from the init routine called from the PCI infrastructure, and + * when we reset the chip, or detect that it was reset internally, + * or it's administratively re-enabled. + * + * Memory allocation here and in called routines is only done in + * the first case (reinit == 0). We have to be careful, because even + * without memory allocation, we need to re-write all the chip registers + * TIDs, etc. after the reset or enable has completed. + */ +int ipath_init_chip(struct ipath_devdata *dd, int reinit) +{ + int ret = 0; + u32 kpiobufs, defkbufs; + u32 piobufs, uports; + u64 val; + struct ipath_portdata *pd; + gfp_t gfp_flags = GFP_USER | __GFP_COMP; + + ret = init_housekeeping(dd, reinit); + if (ret) + goto done; + + /* + * We could bump this to allow for full rcvegrcnt + rcvtidcnt, + * but then it no longer nicely fits power of two, and since + * we now use routines that backend onto __get_free_pages, the + * rest would be wasted. + */ + dd->ipath_rcvhdrcnt = max(dd->ipath_p0_rcvegrcnt, dd->ipath_rcvegrcnt); + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrcnt, + dd->ipath_rcvhdrcnt); + + /* + * Set up the shadow copies of the piobufavail registers, + * which we compare against the chip registers for now, and + * the in memory DMA'ed copies of the registers. This has to + * be done early, before we calculate lastport, etc. + */ + piobufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; + /* + * calc number of pioavail registers, and save it; we have 2 + * bits per buffer. + */ + dd->ipath_pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2) + / (sizeof(u64) * BITS_PER_BYTE / 2); + uports = dd->ipath_cfgports ? dd->ipath_cfgports - 1 : 0; + if (piobufs > 144) + defkbufs = 32 + dd->ipath_pioreserved; + else + defkbufs = 16 + dd->ipath_pioreserved; + + if (ipath_kpiobufs && (ipath_kpiobufs + + (uports * IPATH_MIN_USER_PORT_BUFCNT)) > piobufs) { + int i = (int) piobufs - + (int) (uports * IPATH_MIN_USER_PORT_BUFCNT); + if (i < 1) + i = 1; + dev_info(&dd->pcidev->dev, "Allocating %d PIO bufs of " + "%d for kernel leaves too few for %d user ports " + "(%d each); using %u\n", ipath_kpiobufs, + piobufs, uports, IPATH_MIN_USER_PORT_BUFCNT, i); + /* + * shouldn't change ipath_kpiobufs, because could be + * different for different devices... + */ + kpiobufs = i; + } else if (ipath_kpiobufs) + kpiobufs = ipath_kpiobufs; + else + kpiobufs = defkbufs; + dd->ipath_lastport_piobuf = piobufs - kpiobufs; + dd->ipath_pbufsport = + uports ? dd->ipath_lastport_piobuf / uports : 0; + /* if not an even divisor, some user ports get extra buffers */ + dd->ipath_ports_extrabuf = dd->ipath_lastport_piobuf - + (dd->ipath_pbufsport * uports); + if (dd->ipath_ports_extrabuf) + ipath_dbg("%u pbufs/port leaves some unused, add 1 buffer to " + "ports <= %u\n", dd->ipath_pbufsport, + dd->ipath_ports_extrabuf); + dd->ipath_lastpioindex = 0; + dd->ipath_lastpioindexl = dd->ipath_piobcnt2k; + /* ipath_pioavailshadow initialized earlier */ + ipath_cdbg(VERBOSE, "%d PIO bufs for kernel out of %d total %u " + "each for %u user ports\n", kpiobufs, + piobufs, dd->ipath_pbufsport, uports); + ret = dd->ipath_f_early_init(dd); + if (ret) { + ipath_dev_err(dd, "Early initialization failure\n"); + goto done; + } + + /* + * Early_init sets rcvhdrentsize and rcvhdrsize, so this must be + * done after early_init. + */ + dd->ipath_hdrqlast = + dd->ipath_rcvhdrentsize * (dd->ipath_rcvhdrcnt - 1); + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrentsize, + dd->ipath_rcvhdrentsize); + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize, + dd->ipath_rcvhdrsize); + + if (!reinit) { + ret = init_pioavailregs(dd); + init_shadow_tids(dd); + if (ret) + goto done; + } + + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendpioavailaddr, + dd->ipath_pioavailregs_phys); + + /* + * this is to detect s/w errors, which the h/w works around by + * ignoring the low 6 bits of address, if it wasn't aligned. + */ + val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpioavailaddr); + if (val != dd->ipath_pioavailregs_phys) { + ipath_dev_err(dd, "Catastrophic software error, " + "SendPIOAvailAddr written as %lx, " + "read back as %llx\n", + (unsigned long) dd->ipath_pioavailregs_phys, + (unsigned long long) val); + ret = -EINVAL; + goto done; + } + + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvbthqp, IPATH_KD_QP); + + /* + * make sure we are not in freeze, and PIO send enabled, so + * writes to pbc happen + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, 0ULL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, + ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED); + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL); + + /* + * before error clears, since we expect serdes pll errors during + * this, the first time after reset + */ + if (bringup_link(dd)) { + dev_info(&dd->pcidev->dev, "Failed to bringup IB link\n"); + ret = -ENETDOWN; + goto done; + } + + /* + * clear any "expected" hwerrs from reset and/or initialization + * clear any that aren't enabled (at least this once), and then + * set the enable mask + */ + dd->ipath_f_init_hwerrors(dd); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, + ~0ULL&~INFINIPATH_HWE_MEMBISTFAILED); + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, + dd->ipath_hwerrmask); + + /* clear all */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL); + /* enable errors that are masked, at least this first time. */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + ~dd->ipath_maskederrs); + dd->ipath_maskederrs = 0; /* don't re-enable ignored in timer */ + dd->ipath_errormask = + ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask); + /* clear any interrupts up to this point (ints still not enabled) */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL); + + dd->ipath_f_tidtemplate(dd); + + /* + * Set up the port 0 (kernel) rcvhdr q and egr TIDs. If doing + * re-init, the simplest way to handle this is to free + * existing, and re-allocate. + * Need to re-create rest of port 0 portdata as well. + */ + pd = dd->ipath_pd[0]; + if (reinit) { + struct ipath_portdata *npd; + + /* + * Alloc and init new ipath_portdata for port0, + * Then free old pd. Could lead to fragmentation, but also + * makes later support for hot-swap easier. + */ + npd = create_portdata0(dd); + if (npd) { + ipath_free_pddata(dd, pd); + dd->ipath_pd[0] = npd; + pd = npd; + } else { + ipath_dev_err(dd, "Unable to allocate portdata" + " for port 0, failing\n"); + ret = -ENOMEM; + goto done; + } + } + ret = ipath_create_rcvhdrq(dd, pd); + if (!ret) + ret = create_port0_egr(dd); + if (ret) { + ipath_dev_err(dd, "failed to allocate kernel port's " + "rcvhdrq and/or egr bufs\n"); + goto done; + } + else + enable_chip(dd, reinit); + + /* after enable_chip, so pioavailshadow setup */ + ipath_chg_pioavailkernel(dd, 0, piobufs, 1); + + /* + * Cancel any possible active sends from early driver load. + * Follows early_init because some chips have to initialize + * PIO buffers in early_init to avoid false parity errors. + * After enable and ipath_chg_pioavailkernel so we can safely + * enable pioavail updates and PIOENABLE; packets are now + * ready to go out. + */ + ipath_cancel_sends(dd, 1); + + if (!reinit) { + /* + * Used when we close a port, for DMA already in flight + * at close. + */ + dd->ipath_dummy_hdrq = dma_alloc_coherent( + &dd->pcidev->dev, dd->ipath_pd[0]->port_rcvhdrq_size, + &dd->ipath_dummy_hdrq_phys, + gfp_flags); + if (!dd->ipath_dummy_hdrq) { + dev_info(&dd->pcidev->dev, + "Couldn't allocate 0x%lx bytes for dummy hdrq\n", + dd->ipath_pd[0]->port_rcvhdrq_size); + /* fallback to just 0'ing */ + dd->ipath_dummy_hdrq_phys = 0UL; + } + } + + /* + * cause retrigger of pending interrupts ignored during init, + * even if we had errors + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL); + + if (!dd->ipath_stats_timer_active) { + /* + * first init, or after an admin disable/enable + * set up stats retrieval timer, even if we had errors + * in last portion of setup + */ + init_timer(&dd->ipath_stats_timer); + dd->ipath_stats_timer.function = ipath_get_faststats; + dd->ipath_stats_timer.data = (unsigned long) dd; + /* every 5 seconds; */ + dd->ipath_stats_timer.expires = jiffies + 5 * HZ; + /* takes ~16 seconds to overflow at full IB 4x bandwdith */ + add_timer(&dd->ipath_stats_timer); + dd->ipath_stats_timer_active = 1; + } + + /* Set up SendDMA if chip supports it */ + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + ret = setup_sdma(dd); + + /* Set up HoL state */ + init_timer(&dd->ipath_hol_timer); + dd->ipath_hol_timer.function = ipath_hol_event; + dd->ipath_hol_timer.data = (unsigned long)dd; + dd->ipath_hol_state = IPATH_HOL_UP; + +done: + if (!ret) { + *dd->ipath_statusp |= IPATH_STATUS_CHIP_PRESENT; + if (!dd->ipath_f_intrsetup(dd)) { + /* now we can enable all interrupts from the chip */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, + -1LL); + /* force re-interrupt of any pending interrupts. */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, + 0ULL); + /* chip is usable; mark it as initialized */ + *dd->ipath_statusp |= IPATH_STATUS_INITTED; + + /* + * setup to verify we get an interrupt, and fallback + * to an alternate if necessary and possible + */ + if (!reinit) { + init_timer(&dd->ipath_intrchk_timer); + dd->ipath_intrchk_timer.function = + verify_interrupt; + dd->ipath_intrchk_timer.data = + (unsigned long) dd; + } + dd->ipath_intrchk_timer.expires = jiffies + HZ/2; + add_timer(&dd->ipath_intrchk_timer); + } else + ipath_dev_err(dd, "No interrupts enabled, couldn't " + "setup interrupt address\n"); + + if (dd->ipath_cfgports > ipath_stats.sps_nports) + /* + * sps_nports is a global, so, we set it to + * the highest number of ports of any of the + * chips we find; we never decrement it, at + * least for now. Since this might have changed + * over disable/enable or prior to reset, always + * do the check and potentially adjust. + */ + ipath_stats.sps_nports = dd->ipath_cfgports; + } else + ipath_dbg("Failed (%d) to initialize chip\n", ret); + + /* if ret is non-zero, we probably should do some cleanup + here... */ + return ret; +} + +static int ipath_set_kpiobufs(const char *str, struct kernel_param *kp) +{ + struct ipath_devdata *dd; + unsigned long flags; + unsigned short val; + int ret; + + ret = ipath_parse_ushort(str, &val); + + spin_lock_irqsave(&ipath_devs_lock, flags); + + if (ret < 0) + goto bail; + + if (val == 0) { + ret = -EINVAL; + goto bail; + } + + list_for_each_entry(dd, &ipath_dev_list, ipath_list) { + if (dd->ipath_kregbase) + continue; + if (val > (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - + (dd->ipath_cfgports * + IPATH_MIN_USER_PORT_BUFCNT))) + { + ipath_dev_err( + dd, + "Allocating %d PIO bufs for kernel leaves " + "too few for %d user ports (%d each)\n", + val, dd->ipath_cfgports - 1, + IPATH_MIN_USER_PORT_BUFCNT); + ret = -EINVAL; + goto bail; + } + dd->ipath_lastport_piobuf = + dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - val; + } + + ipath_kpiobufs = val; + ret = 0; +bail: + spin_unlock_irqrestore(&ipath_devs_lock, flags); + + return ret; +} diff --git a/drivers/staging/rdma/ipath/ipath_intr.c b/drivers/staging/rdma/ipath/ipath_intr.c new file mode 100644 index 000000000..01ba79279 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_intr.c @@ -0,0 +1,1273 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "ipath_kernel.h" +#include "ipath_verbs.h" +#include "ipath_common.h" + + +/* + * Called when we might have an error that is specific to a particular + * PIO buffer, and may need to cancel that buffer, so it can be re-used. + */ +void ipath_disarm_senderrbufs(struct ipath_devdata *dd) +{ + u32 piobcnt; + unsigned long sbuf[4]; + /* + * it's possible that sendbuffererror could have bits set; might + * have already done this as a result of hardware error handling + */ + piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; + /* read these before writing errorclear */ + sbuf[0] = ipath_read_kreg64( + dd, dd->ipath_kregs->kr_sendbuffererror); + sbuf[1] = ipath_read_kreg64( + dd, dd->ipath_kregs->kr_sendbuffererror + 1); + if (piobcnt > 128) + sbuf[2] = ipath_read_kreg64( + dd, dd->ipath_kregs->kr_sendbuffererror + 2); + if (piobcnt > 192) + sbuf[3] = ipath_read_kreg64( + dd, dd->ipath_kregs->kr_sendbuffererror + 3); + else + sbuf[3] = 0; + + if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) { + int i; + if (ipath_debug & (__IPATH_PKTDBG|__IPATH_DBG) && + time_after(dd->ipath_lastcancel, jiffies)) { + __IPATH_DBG_WHICH(__IPATH_PKTDBG|__IPATH_DBG, + "SendbufErrs %lx %lx", sbuf[0], + sbuf[1]); + if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128) + printk(" %lx %lx ", sbuf[2], sbuf[3]); + printk("\n"); + } + + for (i = 0; i < piobcnt; i++) + if (test_bit(i, sbuf)) + ipath_disarm_piobufs(dd, i, 1); + /* ignore armlaunch errs for a bit */ + dd->ipath_lastcancel = jiffies+3; + } +} + + +/* These are all rcv-related errors which we want to count for stats */ +#define E_SUM_PKTERRS \ + (INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \ + INFINIPATH_E_RBADVERSION | INFINIPATH_E_RHDR | \ + INFINIPATH_E_RLONGPKTLEN | INFINIPATH_E_RSHORTPKTLEN | \ + INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RMINPKTLEN | \ + INFINIPATH_E_RFORMATERR | INFINIPATH_E_RUNSUPVL | \ + INFINIPATH_E_RUNEXPCHAR | INFINIPATH_E_REBP) + +/* These are all send-related errors which we want to count for stats */ +#define E_SUM_ERRS \ + (INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM | \ + INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \ + INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SUNSUPVL | \ + INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \ + INFINIPATH_E_INVALIDADDR) + +/* + * this is similar to E_SUM_ERRS, but can't ignore armlaunch, don't ignore + * errors not related to freeze and cancelling buffers. Can't ignore + * armlaunch because could get more while still cleaning up, and need + * to cancel those as they happen. + */ +#define E_SPKT_ERRS_IGNORE \ + (INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \ + INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SMINPKTLEN | \ + INFINIPATH_E_SPKTLEN) + +/* + * these are errors that can occur when the link changes state while + * a packet is being sent or received. This doesn't cover things + * like EBP or VCRC that can be the result of a sending having the + * link change state, so we receive a "known bad" packet. + */ +#define E_SUM_LINK_PKTERRS \ + (INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \ + INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \ + INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RMINPKTLEN | \ + INFINIPATH_E_RUNEXPCHAR) + +static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs) +{ + u64 ignore_this_time = 0; + + ipath_disarm_senderrbufs(dd); + if ((errs & E_SUM_LINK_PKTERRS) && + !(dd->ipath_flags & IPATH_LINKACTIVE)) { + /* + * This can happen when SMA is trying to bring the link + * up, but the IB link changes state at the "wrong" time. + * The IB logic then complains that the packet isn't + * valid. We don't want to confuse people, so we just + * don't print them, except at debug + */ + ipath_dbg("Ignoring packet errors %llx, because link not " + "ACTIVE\n", (unsigned long long) errs); + ignore_this_time = errs & E_SUM_LINK_PKTERRS; + } + + return ignore_this_time; +} + +/* generic hw error messages... */ +#define INFINIPATH_HWE_TXEMEMPARITYERR_MSG(a) \ + { \ + .mask = ( INFINIPATH_HWE_TXEMEMPARITYERR_##a << \ + INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT ), \ + .msg = "TXE " #a " Memory Parity" \ + } +#define INFINIPATH_HWE_RXEMEMPARITYERR_MSG(a) \ + { \ + .mask = ( INFINIPATH_HWE_RXEMEMPARITYERR_##a << \ + INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT ), \ + .msg = "RXE " #a " Memory Parity" \ + } + +static const struct ipath_hwerror_msgs ipath_generic_hwerror_msgs[] = { + INFINIPATH_HWE_MSG(IBCBUSFRSPCPARITYERR, "IPATH2IB Parity"), + INFINIPATH_HWE_MSG(IBCBUSTOSPCPARITYERR, "IB2IPATH Parity"), + + INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOBUF), + INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOPBC), + INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOLAUNCHFIFO), + + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(RCVBUF), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(LOOKUPQ), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EAGERTID), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EXPTID), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(FLAGBUF), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(DATAINFO), + INFINIPATH_HWE_RXEMEMPARITYERR_MSG(HDRINFO), +}; + +/** + * ipath_format_hwmsg - format a single hwerror message + * @msg message buffer + * @msgl length of message buffer + * @hwmsg message to add to message buffer + */ +static void ipath_format_hwmsg(char *msg, size_t msgl, const char *hwmsg) +{ + strlcat(msg, "[", msgl); + strlcat(msg, hwmsg, msgl); + strlcat(msg, "]", msgl); +} + +/** + * ipath_format_hwerrors - format hardware error messages for display + * @hwerrs hardware errors bit vector + * @hwerrmsgs hardware error descriptions + * @nhwerrmsgs number of hwerrmsgs + * @msg message buffer + * @msgl message buffer length + */ +void ipath_format_hwerrors(u64 hwerrs, + const struct ipath_hwerror_msgs *hwerrmsgs, + size_t nhwerrmsgs, + char *msg, size_t msgl) +{ + int i; + const int glen = + ARRAY_SIZE(ipath_generic_hwerror_msgs); + + for (i=0; iib_init) + ret = "Init"; + else if (state == dd->ib_arm) + ret = "Arm"; + else if (state == dd->ib_active) + ret = "Active"; + else + ret = "Down"; + return ret; +} + +void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev) +{ + struct ib_event event; + + event.device = &dd->verbs_dev->ibdev; + event.element.port_num = 1; + event.event = ev; + ib_dispatch_event(&event); +} + +static void handle_e_ibstatuschanged(struct ipath_devdata *dd, + ipath_err_t errs) +{ + u32 ltstate, lstate, ibstate, lastlstate; + u32 init = dd->ib_init; + u32 arm = dd->ib_arm; + u32 active = dd->ib_active; + const u64 ibcs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus); + + lstate = ipath_ib_linkstate(dd, ibcs); /* linkstate */ + ibstate = ipath_ib_state(dd, ibcs); + /* linkstate at last interrupt */ + lastlstate = ipath_ib_linkstate(dd, dd->ipath_lastibcstat); + ltstate = ipath_ib_linktrstate(dd, ibcs); /* linktrainingtate */ + + /* + * Since going into a recovery state causes the link state to go + * down and since recovery is transitory, it is better if we "miss" + * ever seeing the link training state go into recovery (i.e., + * ignore this transition for link state special handling purposes) + * without even updating ipath_lastibcstat. + */ + if ((ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN) || + (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT) || + (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERIDLE)) + goto done; + + /* + * if linkstate transitions into INIT from any of the various down + * states, or if it transitions from any of the up (INIT or better) + * states into any of the down states (except link recovery), then + * call the chip-specific code to take appropriate actions. + */ + if (lstate >= INFINIPATH_IBCS_L_STATE_INIT && + lastlstate == INFINIPATH_IBCS_L_STATE_DOWN) { + /* transitioned to UP */ + if (dd->ipath_f_ib_updown(dd, 1, ibcs)) { + /* link came up, so we must no longer be disabled */ + dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED; + ipath_cdbg(LINKVERB, "LinkUp handled, skipped\n"); + goto skip_ibchange; /* chip-code handled */ + } + } else if ((lastlstate >= INFINIPATH_IBCS_L_STATE_INIT || + (dd->ipath_flags & IPATH_IB_FORCE_NOTIFY)) && + ltstate <= INFINIPATH_IBCS_LT_STATE_CFGWAITRMT && + ltstate != INFINIPATH_IBCS_LT_STATE_LINKUP) { + int handled; + handled = dd->ipath_f_ib_updown(dd, 0, ibcs); + dd->ipath_flags &= ~IPATH_IB_FORCE_NOTIFY; + if (handled) { + ipath_cdbg(LINKVERB, "LinkDown handled, skipped\n"); + goto skip_ibchange; /* chip-code handled */ + } + } + + /* + * Significant enough to always print and get into logs, if it was + * unexpected. If it was a requested state change, we'll have + * already cleared the flags, so we won't print this warning + */ + if ((ibstate != arm && ibstate != active) && + (dd->ipath_flags & (IPATH_LINKARMED | IPATH_LINKACTIVE))) { + dev_info(&dd->pcidev->dev, "Link state changed from %s " + "to %s\n", (dd->ipath_flags & IPATH_LINKARMED) ? + "ARM" : "ACTIVE", ib_linkstate(dd, ibcs)); + } + + if (ltstate == INFINIPATH_IBCS_LT_STATE_POLLACTIVE || + ltstate == INFINIPATH_IBCS_LT_STATE_POLLQUIET) { + u32 lastlts; + lastlts = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat); + /* + * Ignore cycling back and forth from Polling.Active to + * Polling.Quiet while waiting for the other end of the link + * to come up, except to try and decide if we are connected + * to a live IB device or not. We will cycle back and + * forth between them if no cable is plugged in, the other + * device is powered off or disabled, etc. + */ + if (lastlts == INFINIPATH_IBCS_LT_STATE_POLLACTIVE || + lastlts == INFINIPATH_IBCS_LT_STATE_POLLQUIET) { + if (!(dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) && + (++dd->ipath_ibpollcnt == 40)) { + dd->ipath_flags |= IPATH_NOCABLE; + *dd->ipath_statusp |= + IPATH_STATUS_IB_NOCABLE; + ipath_cdbg(LINKVERB, "Set NOCABLE\n"); + } + ipath_cdbg(LINKVERB, "POLL change to %s (%x)\n", + ipath_ibcstatus_str[ltstate], ibstate); + goto skip_ibchange; + } + } + + dd->ipath_ibpollcnt = 0; /* not poll*, now */ + ipath_stats.sps_iblink++; + + if (ibstate != init && dd->ipath_lastlinkrecov && ipath_linkrecovery) { + u64 linkrecov; + linkrecov = ipath_snap_cntr(dd, + dd->ipath_cregs->cr_iblinkerrrecovcnt); + if (linkrecov != dd->ipath_lastlinkrecov) { + ipath_dbg("IB linkrecov up %Lx (%s %s) recov %Lu\n", + (unsigned long long) ibcs, + ib_linkstate(dd, ibcs), + ipath_ibcstatus_str[ltstate], + (unsigned long long) linkrecov); + /* and no more until active again */ + dd->ipath_lastlinkrecov = 0; + ipath_set_linkstate(dd, IPATH_IB_LINKDOWN); + goto skip_ibchange; + } + } + + if (ibstate == init || ibstate == arm || ibstate == active) { + *dd->ipath_statusp &= ~IPATH_STATUS_IB_NOCABLE; + if (ibstate == init || ibstate == arm) { + *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; + if (dd->ipath_flags & IPATH_LINKACTIVE) + signal_ib_event(dd, IB_EVENT_PORT_ERR); + } + if (ibstate == arm) { + dd->ipath_flags |= IPATH_LINKARMED; + dd->ipath_flags &= ~(IPATH_LINKUNK | + IPATH_LINKINIT | IPATH_LINKDOWN | + IPATH_LINKACTIVE | IPATH_NOCABLE); + ipath_hol_down(dd); + } else if (ibstate == init) { + /* + * set INIT and DOWN. Down is checked by + * most of the other code, but INIT is + * useful to know in a few places. + */ + dd->ipath_flags |= IPATH_LINKINIT | + IPATH_LINKDOWN; + dd->ipath_flags &= ~(IPATH_LINKUNK | + IPATH_LINKARMED | IPATH_LINKACTIVE | + IPATH_NOCABLE); + ipath_hol_down(dd); + } else { /* active */ + dd->ipath_lastlinkrecov = ipath_snap_cntr(dd, + dd->ipath_cregs->cr_iblinkerrrecovcnt); + *dd->ipath_statusp |= + IPATH_STATUS_IB_READY | IPATH_STATUS_IB_CONF; + dd->ipath_flags |= IPATH_LINKACTIVE; + dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT + | IPATH_LINKDOWN | IPATH_LINKARMED | + IPATH_NOCABLE); + if (dd->ipath_flags & IPATH_HAS_SEND_DMA) + ipath_restart_sdma(dd); + signal_ib_event(dd, IB_EVENT_PORT_ACTIVE); + /* LED active not handled in chip _f_updown */ + dd->ipath_f_setextled(dd, lstate, ltstate); + ipath_hol_up(dd); + } + + /* + * print after we've already done the work, so as not to + * delay the state changes and notifications, for debugging + */ + if (lstate == lastlstate) + ipath_cdbg(LINKVERB, "Unchanged from last: %s " + "(%x)\n", ib_linkstate(dd, ibcs), ibstate); + else + ipath_cdbg(VERBOSE, "Unit %u: link up to %s %s (%x)\n", + dd->ipath_unit, ib_linkstate(dd, ibcs), + ipath_ibcstatus_str[ltstate], ibstate); + } else { /* down */ + if (dd->ipath_flags & IPATH_LINKACTIVE) + signal_ib_event(dd, IB_EVENT_PORT_ERR); + dd->ipath_flags |= IPATH_LINKDOWN; + dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT + | IPATH_LINKACTIVE | + IPATH_LINKARMED); + *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; + dd->ipath_lli_counter = 0; + + if (lastlstate != INFINIPATH_IBCS_L_STATE_DOWN) + ipath_cdbg(VERBOSE, "Unit %u link state down " + "(state 0x%x), from %s\n", + dd->ipath_unit, lstate, + ib_linkstate(dd, dd->ipath_lastibcstat)); + else + ipath_cdbg(LINKVERB, "Unit %u link state changed " + "to %s (0x%x) from down (%x)\n", + dd->ipath_unit, + ipath_ibcstatus_str[ltstate], + ibstate, lastlstate); + } + +skip_ibchange: + dd->ipath_lastibcstat = ibcs; +done: + return; +} + +static void handle_supp_msgs(struct ipath_devdata *dd, + unsigned supp_msgs, char *msg, u32 msgsz) +{ + /* + * Print the message unless it's ibc status change only, which + * happens so often we never want to count it. + */ + if (dd->ipath_lasterror & ~INFINIPATH_E_IBSTATUSCHANGED) { + int iserr; + ipath_err_t mask; + iserr = ipath_decode_err(dd, msg, msgsz, + dd->ipath_lasterror & + ~INFINIPATH_E_IBSTATUSCHANGED); + + mask = INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL | + INFINIPATH_E_PKTERRS | INFINIPATH_E_SDMADISABLED; + + /* if we're in debug, then don't mask SDMADISABLED msgs */ + if (ipath_debug & __IPATH_DBG) + mask &= ~INFINIPATH_E_SDMADISABLED; + + if (dd->ipath_lasterror & ~mask) + ipath_dev_err(dd, "Suppressed %u messages for " + "fast-repeating errors (%s) (%llx)\n", + supp_msgs, msg, + (unsigned long long) + dd->ipath_lasterror); + else { + /* + * rcvegrfull and rcvhdrqfull are "normal", for some + * types of processes (mostly benchmarks) that send + * huge numbers of messages, while not processing + * them. So only complain about these at debug + * level. + */ + if (iserr) + ipath_dbg("Suppressed %u messages for %s\n", + supp_msgs, msg); + else + ipath_cdbg(ERRPKT, + "Suppressed %u messages for %s\n", + supp_msgs, msg); + } + } +} + +static unsigned handle_frequent_errors(struct ipath_devdata *dd, + ipath_err_t errs, char *msg, + u32 msgsz, int *noprint) +{ + unsigned long nc; + static unsigned long nextmsg_time; + static unsigned nmsgs, supp_msgs; + + /* + * Throttle back "fast" messages to no more than 10 per 5 seconds. + * This isn't perfect, but it's a reasonable heuristic. If we get + * more than 10, give a 6x longer delay. + */ + nc = jiffies; + if (nmsgs > 10) { + if (time_before(nc, nextmsg_time)) { + *noprint = 1; + if (!supp_msgs++) + nextmsg_time = nc + HZ * 3; + } + else if (supp_msgs) { + handle_supp_msgs(dd, supp_msgs, msg, msgsz); + supp_msgs = 0; + nmsgs = 0; + } + } + else if (!nmsgs++ || time_after(nc, nextmsg_time)) + nextmsg_time = nc + HZ / 2; + + return supp_msgs; +} + +static void handle_sdma_errors(struct ipath_devdata *dd, ipath_err_t errs) +{ + unsigned long flags; + int expected; + + if (ipath_debug & __IPATH_DBG) { + char msg[128]; + ipath_decode_err(dd, msg, sizeof msg, errs & + INFINIPATH_E_SDMAERRS); + ipath_dbg("errors %lx (%s)\n", (unsigned long)errs, msg); + } + if (ipath_debug & __IPATH_VERBDBG) { + unsigned long tl, hd, status, lengen; + tl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail); + hd = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead); + status = ipath_read_kreg64(dd + , dd->ipath_kregs->kr_senddmastatus); + lengen = ipath_read_kreg64(dd, + dd->ipath_kregs->kr_senddmalengen); + ipath_cdbg(VERBOSE, "sdma tl 0x%lx hd 0x%lx status 0x%lx " + "lengen 0x%lx\n", tl, hd, status, lengen); + } + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status); + expected = test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + if (!expected) + ipath_cancel_sends(dd, 1); +} + +static void handle_sdma_intr(struct ipath_devdata *dd, u64 istat) +{ + unsigned long flags; + int expected; + + if ((istat & INFINIPATH_I_SDMAINT) && + !test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + ipath_sdma_intr(dd); + + if (istat & INFINIPATH_I_SDMADISABLED) { + expected = test_bit(IPATH_SDMA_ABORTING, + &dd->ipath_sdma_status); + ipath_dbg("%s SDmaDisabled intr\n", + expected ? "expected" : "unexpected"); + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + if (!expected) + ipath_cancel_sends(dd, 1); + if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + tasklet_hi_schedule(&dd->ipath_sdma_abort_task); + } +} + +static int handle_hdrq_full(struct ipath_devdata *dd) +{ + int chkerrpkts = 0; + u32 hd, tl; + u32 i; + + ipath_stats.sps_hdrqfull++; + for (i = 0; i < dd->ipath_cfgports; i++) { + struct ipath_portdata *pd = dd->ipath_pd[i]; + + if (i == 0) { + /* + * For kernel receive queues, we just want to know + * if there are packets in the queue that we can + * process. + */ + if (pd->port_head != ipath_get_hdrqtail(pd)) + chkerrpkts |= 1 << i; + continue; + } + + /* Skip if user context is not open */ + if (!pd || !pd->port_cnt) + continue; + + /* Don't report the same point multiple times. */ + if (dd->ipath_flags & IPATH_NODMA_RTAIL) + tl = ipath_read_ureg32(dd, ur_rcvhdrtail, i); + else + tl = ipath_get_rcvhdrtail(pd); + if (tl == pd->port_lastrcvhdrqtail) + continue; + + hd = ipath_read_ureg32(dd, ur_rcvhdrhead, i); + if (hd == (tl + 1) || (!hd && tl == dd->ipath_hdrqlast)) { + pd->port_lastrcvhdrqtail = tl; + pd->port_hdrqfull++; + /* flush hdrqfull so that poll() sees it */ + wmb(); + wake_up_interruptible(&pd->port_wait); + } + } + + return chkerrpkts; +} + +static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) +{ + char msg[128]; + u64 ignore_this_time = 0; + u64 iserr = 0; + int chkerrpkts = 0, noprint = 0; + unsigned supp_msgs; + int log_idx; + + /* + * don't report errors that are masked, either at init + * (not set in ipath_errormask), or temporarily (set in + * ipath_maskederrs) + */ + errs &= dd->ipath_errormask & ~dd->ipath_maskederrs; + + supp_msgs = handle_frequent_errors(dd, errs, msg, (u32)sizeof msg, + &noprint); + + /* do these first, they are most important */ + if (errs & INFINIPATH_E_HARDWARE) { + /* reuse same msg buf */ + dd->ipath_f_handle_hwerrors(dd, msg, sizeof msg); + } else { + u64 mask; + for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx) { + mask = dd->ipath_eep_st_masks[log_idx].errs_to_log; + if (errs & mask) + ipath_inc_eeprom_err(dd, log_idx, 1); + } + } + + if (errs & INFINIPATH_E_SDMAERRS) + handle_sdma_errors(dd, errs); + + if (!noprint && (errs & ~dd->ipath_e_bitsextant)) + ipath_dev_err(dd, "error interrupt with unknown errors " + "%llx set\n", (unsigned long long) + (errs & ~dd->ipath_e_bitsextant)); + + if (errs & E_SUM_ERRS) + ignore_this_time = handle_e_sum_errs(dd, errs); + else if ((errs & E_SUM_LINK_PKTERRS) && + !(dd->ipath_flags & IPATH_LINKACTIVE)) { + /* + * This can happen when SMA is trying to bring the link + * up, but the IB link changes state at the "wrong" time. + * The IB logic then complains that the packet isn't + * valid. We don't want to confuse people, so we just + * don't print them, except at debug + */ + ipath_dbg("Ignoring packet errors %llx, because link not " + "ACTIVE\n", (unsigned long long) errs); + ignore_this_time = errs & E_SUM_LINK_PKTERRS; + } + + if (supp_msgs == 250000) { + int s_iserr; + /* + * It's not entirely reasonable assuming that the errors set + * in the last clear period are all responsible for the + * problem, but the alternative is to assume it's the only + * ones on this particular interrupt, which also isn't great + */ + dd->ipath_maskederrs |= dd->ipath_lasterror | errs; + + dd->ipath_errormask &= ~dd->ipath_maskederrs; + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); + s_iserr = ipath_decode_err(dd, msg, sizeof msg, + dd->ipath_maskederrs); + + if (dd->ipath_maskederrs & + ~(INFINIPATH_E_RRCVEGRFULL | + INFINIPATH_E_RRCVHDRFULL | INFINIPATH_E_PKTERRS)) + ipath_dev_err(dd, "Temporarily disabling " + "error(s) %llx reporting; too frequent (%s)\n", + (unsigned long long) dd->ipath_maskederrs, + msg); + else { + /* + * rcvegrfull and rcvhdrqfull are "normal", + * for some types of processes (mostly benchmarks) + * that send huge numbers of messages, while not + * processing them. So only complain about + * these at debug level. + */ + if (s_iserr) + ipath_dbg("Temporarily disabling reporting " + "too frequent queue full errors (%s)\n", + msg); + else + ipath_cdbg(ERRPKT, + "Temporarily disabling reporting too" + " frequent packet errors (%s)\n", + msg); + } + + /* + * Re-enable the masked errors after around 3 minutes. in + * ipath_get_faststats(). If we have a series of fast + * repeating but different errors, the interval will keep + * stretching out, but that's OK, as that's pretty + * catastrophic. + */ + dd->ipath_unmasktime = jiffies + HZ * 180; + } + + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, errs); + if (ignore_this_time) + errs &= ~ignore_this_time; + if (errs & ~dd->ipath_lasterror) { + errs &= ~dd->ipath_lasterror; + /* never suppress duplicate hwerrors or ibstatuschange */ + dd->ipath_lasterror |= errs & + ~(INFINIPATH_E_HARDWARE | + INFINIPATH_E_IBSTATUSCHANGED); + } + + if (errs & INFINIPATH_E_SENDSPECIALTRIGGER) { + dd->ipath_spectriggerhit++; + ipath_dbg("%lu special trigger hits\n", + dd->ipath_spectriggerhit); + } + + /* likely due to cancel; so suppress message unless verbose */ + if ((errs & (INFINIPATH_E_SPKTLEN | INFINIPATH_E_SPIOARMLAUNCH)) && + time_after(dd->ipath_lastcancel, jiffies)) { + /* armlaunch takes precedence; it often causes both. */ + ipath_cdbg(VERBOSE, + "Suppressed %s error (%llx) after sendbuf cancel\n", + (errs & INFINIPATH_E_SPIOARMLAUNCH) ? + "armlaunch" : "sendpktlen", (unsigned long long)errs); + errs &= ~(INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SPKTLEN); + } + + if (!errs) + return 0; + + if (!noprint) { + ipath_err_t mask; + /* + * The ones we mask off are handled specially below + * or above. Also mask SDMADISABLED by default as it + * is too chatty. + */ + mask = INFINIPATH_E_IBSTATUSCHANGED | + INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL | + INFINIPATH_E_HARDWARE | INFINIPATH_E_SDMADISABLED; + + /* if we're in debug, then don't mask SDMADISABLED msgs */ + if (ipath_debug & __IPATH_DBG) + mask &= ~INFINIPATH_E_SDMADISABLED; + + ipath_decode_err(dd, msg, sizeof msg, errs & ~mask); + } else + /* so we don't need if (!noprint) at strlcat's below */ + *msg = 0; + + if (errs & E_SUM_PKTERRS) { + ipath_stats.sps_pkterrs++; + chkerrpkts = 1; + } + if (errs & E_SUM_ERRS) + ipath_stats.sps_errs++; + + if (errs & (INFINIPATH_E_RICRC | INFINIPATH_E_RVCRC)) { + ipath_stats.sps_crcerrs++; + chkerrpkts = 1; + } + iserr = errs & ~(E_SUM_PKTERRS | INFINIPATH_E_PKTERRS); + + + /* + * We don't want to print these two as they happen, or we can make + * the situation even worse, because it takes so long to print + * messages to serial consoles. Kernel ports get printed from + * fast_stats, no more than every 5 seconds, user ports get printed + * on close + */ + if (errs & INFINIPATH_E_RRCVHDRFULL) + chkerrpkts |= handle_hdrq_full(dd); + if (errs & INFINIPATH_E_RRCVEGRFULL) { + struct ipath_portdata *pd = dd->ipath_pd[0]; + + /* + * since this is of less importance and not likely to + * happen without also getting hdrfull, only count + * occurrences; don't check each port (or even the kernel + * vs user) + */ + ipath_stats.sps_etidfull++; + if (pd->port_head != ipath_get_hdrqtail(pd)) + chkerrpkts |= 1; + } + + /* + * do this before IBSTATUSCHANGED, in case both bits set in a single + * interrupt; we want the STATUSCHANGE to "win", so we do our + * internal copy of state machine correctly + */ + if (errs & INFINIPATH_E_RIBLOSTLINK) { + /* + * force through block below + */ + errs |= INFINIPATH_E_IBSTATUSCHANGED; + ipath_stats.sps_iblink++; + dd->ipath_flags |= IPATH_LINKDOWN; + dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT + | IPATH_LINKARMED | IPATH_LINKACTIVE); + *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY; + + ipath_dbg("Lost link, link now down (%s)\n", + ipath_ibcstatus_str[ipath_read_kreg64(dd, + dd->ipath_kregs->kr_ibcstatus) & 0xf]); + } + if (errs & INFINIPATH_E_IBSTATUSCHANGED) + handle_e_ibstatuschanged(dd, errs); + + if (errs & INFINIPATH_E_RESET) { + if (!noprint) + ipath_dev_err(dd, "Got reset, requires re-init " + "(unload and reload driver)\n"); + dd->ipath_flags &= ~IPATH_INITTED; /* needs re-init */ + /* mark as having had error */ + *dd->ipath_statusp |= IPATH_STATUS_HWERROR; + *dd->ipath_statusp &= ~IPATH_STATUS_IB_CONF; + } + + if (!noprint && *msg) { + if (iserr) + ipath_dev_err(dd, "%s error\n", msg); + } + if (dd->ipath_state_wanted & dd->ipath_flags) { + ipath_cdbg(VERBOSE, "driver wanted state %x, iflags now %x, " + "waking\n", dd->ipath_state_wanted, + dd->ipath_flags); + wake_up_interruptible(&ipath_state_wait); + } + + return chkerrpkts; +} + +/* + * try to cleanup as much as possible for anything that might have gone + * wrong while in freeze mode, such as pio buffers being written by user + * processes (causing armlaunch), send errors due to going into freeze mode, + * etc., and try to avoid causing extra interrupts while doing so. + * Forcibly update the in-memory pioavail register copies after cleanup + * because the chip won't do it while in freeze mode (the register values + * themselves are kept correct). + * Make sure that we don't lose any important interrupts by using the chip + * feature that says that writing 0 to a bit in *clear that is set in + * *status will cause an interrupt to be generated again (if allowed by + * the *mask value). + */ +void ipath_clear_freeze(struct ipath_devdata *dd) +{ + /* disable error interrupts, to avoid confusion */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, 0ULL); + + /* also disable interrupts; errormask is sometimes overwriten */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL); + + ipath_cancel_sends(dd, 1); + + /* clear the freeze, and be sure chip saw it */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_control, + dd->ipath_control); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + + /* force in-memory update now we are out of freeze */ + ipath_force_pio_avail_update(dd); + + /* + * force new interrupt if any hwerr, error or interrupt bits are + * still set, and clear "safe" send packet errors related to freeze + * and cancelling sends. Re-enable error interrupts before possible + * force of re-interrupt on pending interrupts. + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, + E_SPKT_ERRS_IGNORE); + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, -1LL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL); +} + + +/* this is separate to allow for better optimization of ipath_intr() */ + +static noinline void ipath_bad_intr(struct ipath_devdata *dd, u32 *unexpectp) +{ + /* + * sometimes happen during driver init and unload, don't want + * to process any interrupts at that point + */ + + /* this is just a bandaid, not a fix, if something goes badly + * wrong */ + if (++*unexpectp > 100) { + if (++*unexpectp > 105) { + /* + * ok, we must be taking somebody else's interrupts, + * due to a messed up mptable and/or PIRQ table, so + * unregister the interrupt. We've seen this during + * linuxbios development work, and it may happen in + * the future again. + */ + if (dd->pcidev && dd->ipath_irq) { + ipath_dev_err(dd, "Now %u unexpected " + "interrupts, unregistering " + "interrupt handler\n", + *unexpectp); + ipath_dbg("free_irq of irq %d\n", + dd->ipath_irq); + dd->ipath_f_free_irq(dd); + } + } + if (ipath_read_ireg(dd, dd->ipath_kregs->kr_intmask)) { + ipath_dev_err(dd, "%u unexpected interrupts, " + "disabling interrupts completely\n", + *unexpectp); + /* + * disable all interrupts, something is very wrong + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, + 0ULL); + } + } else if (*unexpectp > 1) + ipath_dbg("Interrupt when not ready, should not happen, " + "ignoring\n"); +} + +static noinline void ipath_bad_regread(struct ipath_devdata *dd) +{ + static int allbits; + + /* separate routine, for better optimization of ipath_intr() */ + + /* + * We print the message and disable interrupts, in hope of + * having a better chance of debugging the problem. + */ + ipath_dev_err(dd, + "Read of interrupt status failed (all bits set)\n"); + if (allbits++) { + /* disable all interrupts, something is very wrong */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL); + if (allbits == 2) { + ipath_dev_err(dd, "Still bad interrupt status, " + "unregistering interrupt\n"); + dd->ipath_f_free_irq(dd); + } else if (allbits > 2) { + if ((allbits % 10000) == 0) + printk("."); + } else + ipath_dev_err(dd, "Disabling interrupts, " + "multiple errors\n"); + } +} + +static void handle_layer_pioavail(struct ipath_devdata *dd) +{ + unsigned long flags; + int ret; + + ret = ipath_ib_piobufavail(dd->verbs_dev); + if (ret > 0) + goto set; + + return; +set: + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); +} + +/* + * Handle receive interrupts for user ports; this means a user + * process was waiting for a packet to arrive, and didn't want + * to poll + */ +static void handle_urcv(struct ipath_devdata *dd, u64 istat) +{ + u64 portr; + int i; + int rcvdint = 0; + + /* + * test_and_clear_bit(IPATH_PORT_WAITING_RCV) and + * test_and_clear_bit(IPATH_PORT_WAITING_URG) below + * would both like timely updates of the bits so that + * we don't pass them by unnecessarily. the rmb() + * here ensures that we see them promptly -- the + * corresponding wmb()'s are in ipath_poll_urgent() + * and ipath_poll_next()... + */ + rmb(); + portr = ((istat >> dd->ipath_i_rcvavail_shift) & + dd->ipath_i_rcvavail_mask) | + ((istat >> dd->ipath_i_rcvurg_shift) & + dd->ipath_i_rcvurg_mask); + for (i = 1; i < dd->ipath_cfgports; i++) { + struct ipath_portdata *pd = dd->ipath_pd[i]; + + if (portr & (1 << i) && pd && pd->port_cnt) { + if (test_and_clear_bit(IPATH_PORT_WAITING_RCV, + &pd->port_flag)) { + clear_bit(i + dd->ipath_r_intravail_shift, + &dd->ipath_rcvctrl); + wake_up_interruptible(&pd->port_wait); + rcvdint = 1; + } else if (test_and_clear_bit(IPATH_PORT_WAITING_URG, + &pd->port_flag)) { + pd->port_urgent++; + wake_up_interruptible(&pd->port_wait); + } + } + } + if (rcvdint) { + /* only want to take one interrupt, so turn off the rcv + * interrupt for all the ports that we set the rcv_waiting + * (but never for kernel port) + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl, + dd->ipath_rcvctrl); + } +} + +irqreturn_t ipath_intr(int irq, void *data) +{ + struct ipath_devdata *dd = data; + u64 istat, chk0rcv = 0; + ipath_err_t estat = 0; + irqreturn_t ret; + static unsigned unexpected = 0; + u64 kportrbits; + + ipath_stats.sps_ints++; + + if (dd->ipath_int_counter != (u32) -1) + dd->ipath_int_counter++; + + if (!(dd->ipath_flags & IPATH_PRESENT)) { + /* + * This return value is not great, but we do not want the + * interrupt core code to remove our interrupt handler + * because we don't appear to be handling an interrupt + * during a chip reset. + */ + return IRQ_HANDLED; + } + + /* + * this needs to be flags&initted, not statusp, so we keep + * taking interrupts even after link goes down, etc. + * Also, we *must* clear the interrupt at some point, or we won't + * take it again, which can be real bad for errors, etc... + */ + + if (!(dd->ipath_flags & IPATH_INITTED)) { + ipath_bad_intr(dd, &unexpected); + ret = IRQ_NONE; + goto bail; + } + + istat = ipath_read_ireg(dd, dd->ipath_kregs->kr_intstatus); + + if (unlikely(!istat)) { + ipath_stats.sps_nullintr++; + ret = IRQ_NONE; /* not our interrupt, or already handled */ + goto bail; + } + if (unlikely(istat == -1)) { + ipath_bad_regread(dd); + /* don't know if it was our interrupt or not */ + ret = IRQ_NONE; + goto bail; + } + + if (unexpected) + unexpected = 0; + + if (unlikely(istat & ~dd->ipath_i_bitsextant)) + ipath_dev_err(dd, + "interrupt with unknown interrupts %Lx set\n", + (unsigned long long) + istat & ~dd->ipath_i_bitsextant); + else if (istat & ~INFINIPATH_I_ERROR) /* errors do own printing */ + ipath_cdbg(VERBOSE, "intr stat=0x%Lx\n", + (unsigned long long) istat); + + if (istat & INFINIPATH_I_ERROR) { + ipath_stats.sps_errints++; + estat = ipath_read_kreg64(dd, + dd->ipath_kregs->kr_errorstatus); + if (!estat) + dev_info(&dd->pcidev->dev, "error interrupt (%Lx), " + "but no error bits set!\n", + (unsigned long long) istat); + else if (estat == -1LL) + /* + * should we try clearing all, or hope next read + * works? + */ + ipath_dev_err(dd, "Read of error status failed " + "(all bits set); ignoring\n"); + else + chk0rcv |= handle_errors(dd, estat); + } + + if (istat & INFINIPATH_I_GPIO) { + /* + * GPIO interrupts fall in two broad classes: + * GPIO_2 indicates (on some HT4xx boards) that a packet + * has arrived for Port 0. Checking for this + * is controlled by flag IPATH_GPIO_INTR. + * GPIO_3..5 on IBA6120 Rev2 and IBA6110 Rev4 chips indicate + * errors that we need to count. Checking for this + * is controlled by flag IPATH_GPIO_ERRINTRS. + */ + u32 gpiostatus; + u32 to_clear = 0; + + gpiostatus = ipath_read_kreg32( + dd, dd->ipath_kregs->kr_gpio_status); + /* First the error-counter case. */ + if ((gpiostatus & IPATH_GPIO_ERRINTR_MASK) && + (dd->ipath_flags & IPATH_GPIO_ERRINTRS)) { + /* want to clear the bits we see asserted. */ + to_clear |= (gpiostatus & IPATH_GPIO_ERRINTR_MASK); + + /* + * Count appropriately, clear bits out of our copy, + * as they have been "handled". + */ + if (gpiostatus & (1 << IPATH_GPIO_RXUVL_BIT)) { + ipath_dbg("FlowCtl on UnsupVL\n"); + dd->ipath_rxfc_unsupvl_errs++; + } + if (gpiostatus & (1 << IPATH_GPIO_OVRUN_BIT)) { + ipath_dbg("Overrun Threshold exceeded\n"); + dd->ipath_overrun_thresh_errs++; + } + if (gpiostatus & (1 << IPATH_GPIO_LLI_BIT)) { + ipath_dbg("Local Link Integrity error\n"); + dd->ipath_lli_errs++; + } + gpiostatus &= ~IPATH_GPIO_ERRINTR_MASK; + } + /* Now the Port0 Receive case */ + if ((gpiostatus & (1 << IPATH_GPIO_PORT0_BIT)) && + (dd->ipath_flags & IPATH_GPIO_INTR)) { + /* + * GPIO status bit 2 is set, and we expected it. + * clear it and indicate in p0bits. + * This probably only happens if a Port0 pkt + * arrives at _just_ the wrong time, and we + * handle that by seting chk0rcv; + */ + to_clear |= (1 << IPATH_GPIO_PORT0_BIT); + gpiostatus &= ~(1 << IPATH_GPIO_PORT0_BIT); + chk0rcv = 1; + } + if (gpiostatus) { + /* + * Some unexpected bits remain. If they could have + * caused the interrupt, complain and clear. + * To avoid repetition of this condition, also clear + * the mask. It is almost certainly due to error. + */ + const u32 mask = (u32) dd->ipath_gpio_mask; + + if (mask & gpiostatus) { + ipath_dbg("Unexpected GPIO IRQ bits %x\n", + gpiostatus & mask); + to_clear |= (gpiostatus & mask); + dd->ipath_gpio_mask &= ~(gpiostatus & mask); + ipath_write_kreg(dd, + dd->ipath_kregs->kr_gpio_mask, + dd->ipath_gpio_mask); + } + } + if (to_clear) { + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear, + (u64) to_clear); + } + } + + /* + * Clear the interrupt bits we found set, unless they are receive + * related, in which case we already cleared them above, and don't + * want to clear them again, because we might lose an interrupt. + * Clear it early, so we "know" know the chip will have seen this by + * the time we process the queue, and will re-interrupt if necessary. + * The processor itself won't take the interrupt again until we return. + */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, istat); + + /* + * Handle kernel receive queues before checking for pio buffers + * available since receives can overflow; piobuf waiters can afford + * a few extra cycles, since they were waiting anyway, and user's + * waiting for receive are at the bottom. + */ + kportrbits = (1ULL << dd->ipath_i_rcvavail_shift) | + (1ULL << dd->ipath_i_rcvurg_shift); + if (chk0rcv || (istat & kportrbits)) { + istat &= ~kportrbits; + ipath_kreceive(dd->ipath_pd[0]); + } + + if (istat & ((dd->ipath_i_rcvavail_mask << dd->ipath_i_rcvavail_shift) | + (dd->ipath_i_rcvurg_mask << dd->ipath_i_rcvurg_shift))) + handle_urcv(dd, istat); + + if (istat & (INFINIPATH_I_SDMAINT | INFINIPATH_I_SDMADISABLED)) + handle_sdma_intr(dd, istat); + + if (istat & INFINIPATH_I_SPIOBUFAVAIL) { + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~INFINIPATH_S_PIOINTBUFAVAIL; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + /* always process; sdma verbs uses PIO for acks and VL15 */ + handle_layer_pioavail(dd); + } + + ret = IRQ_HANDLED; + +bail: + return ret; +} diff --git a/drivers/staging/rdma/ipath/ipath_kernel.h b/drivers/staging/rdma/ipath/ipath_kernel.h new file mode 100644 index 000000000..f0f947122 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_kernel.h @@ -0,0 +1,1373 @@ +#ifndef _IPATH_KERNEL_H +#define _IPATH_KERNEL_H +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This header file is the base header file for infinipath kernel code + * ipath_user.h serves a similar purpose for user code. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_common.h" +#include "ipath_debug.h" +#include "ipath_registers.h" + +/* only s/w major version of InfiniPath we can handle */ +#define IPATH_CHIP_VERS_MAJ 2U + +/* don't care about this except printing */ +#define IPATH_CHIP_VERS_MIN 0U + +/* temporary, maybe always */ +extern struct infinipath_stats ipath_stats; + +#define IPATH_CHIP_SWVERSION IPATH_CHIP_VERS_MAJ +/* + * First-cut critierion for "device is active" is + * two thousand dwords combined Tx, Rx traffic per + * 5-second interval. SMA packets are 64 dwords, + * and occur "a few per second", presumably each way. + */ +#define IPATH_TRAFFIC_ACTIVE_THRESHOLD (2000) +/* + * Struct used to indicate which errors are logged in each of the + * error-counters that are logged to EEPROM. A counter is incremented + * _once_ (saturating at 255) for each event with any bits set in + * the error or hwerror register masks below. + */ +#define IPATH_EEP_LOG_CNT (4) +struct ipath_eep_log_mask { + u64 errs_to_log; + u64 hwerrs_to_log; +}; + +struct ipath_portdata { + void **port_rcvegrbuf; + dma_addr_t *port_rcvegrbuf_phys; + /* rcvhdrq base, needs mmap before useful */ + void *port_rcvhdrq; + /* kernel virtual address where hdrqtail is updated */ + void *port_rcvhdrtail_kvaddr; + /* + * temp buffer for expected send setup, allocated at open, instead + * of each setup call + */ + void *port_tid_pg_list; + /* when waiting for rcv or pioavail */ + wait_queue_head_t port_wait; + /* + * rcvegr bufs base, physical, must fit + * in 44 bits so 32 bit programs mmap64 44 bit works) + */ + dma_addr_t port_rcvegr_phys; + /* mmap of hdrq, must fit in 44 bits */ + dma_addr_t port_rcvhdrq_phys; + dma_addr_t port_rcvhdrqtailaddr_phys; + /* + * number of opens (including slave subports) on this instance + * (ignoring forks, dup, etc. for now) + */ + int port_cnt; + /* + * how much space to leave at start of eager TID entries for + * protocol use, on each TID + */ + /* instead of calculating it */ + unsigned port_port; + /* non-zero if port is being shared. */ + u16 port_subport_cnt; + /* non-zero if port is being shared. */ + u16 port_subport_id; + /* number of pio bufs for this port (all procs, if shared) */ + u32 port_piocnt; + /* first pio buffer for this port */ + u32 port_pio_base; + /* chip offset of PIO buffers for this port */ + u32 port_piobufs; + /* how many alloc_pages() chunks in port_rcvegrbuf_pages */ + u32 port_rcvegrbuf_chunks; + /* how many egrbufs per chunk */ + u32 port_rcvegrbufs_perchunk; + /* order for port_rcvegrbuf_pages */ + size_t port_rcvegrbuf_size; + /* rcvhdrq size (for freeing) */ + size_t port_rcvhdrq_size; + /* next expected TID to check when looking for free */ + u32 port_tidcursor; + /* next expected TID to check */ + unsigned long port_flag; + /* what happened */ + unsigned long int_flag; + /* WAIT_RCV that timed out, no interrupt */ + u32 port_rcvwait_to; + /* WAIT_PIO that timed out, no interrupt */ + u32 port_piowait_to; + /* WAIT_RCV already happened, no wait */ + u32 port_rcvnowait; + /* WAIT_PIO already happened, no wait */ + u32 port_pionowait; + /* total number of rcvhdrqfull errors */ + u32 port_hdrqfull; + /* + * Used to suppress multiple instances of same + * port staying stuck at same point. + */ + u32 port_lastrcvhdrqtail; + /* saved total number of rcvhdrqfull errors for poll edge trigger */ + u32 port_hdrqfull_poll; + /* total number of polled urgent packets */ + u32 port_urgent; + /* saved total number of polled urgent packets for poll edge trigger */ + u32 port_urgent_poll; + /* pid of process using this port */ + struct pid *port_pid; + struct pid *port_subpid[INFINIPATH_MAX_SUBPORT]; + /* same size as task_struct .comm[] */ + char port_comm[16]; + /* pkeys set by this use of this port */ + u16 port_pkeys[4]; + /* so file ops can get at unit */ + struct ipath_devdata *port_dd; + /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */ + void *subport_uregbase; + /* An array of pages for the eager receive buffers * N */ + void *subport_rcvegrbuf; + /* An array of pages for the eager header queue entries * N */ + void *subport_rcvhdr_base; + /* The version of the library which opened this port */ + u32 userversion; + /* Bitmask of active slaves */ + u32 active_slaves; + /* Type of packets or conditions we want to poll for */ + u16 poll_type; + /* port rcvhdrq head offset */ + u32 port_head; + /* receive packet sequence counter */ + u32 port_seq_cnt; +}; + +struct sk_buff; +struct ipath_sge_state; +struct ipath_verbs_txreq; + +/* + * control information for layered drivers + */ +struct _ipath_layer { + void *l_arg; +}; + +struct ipath_skbinfo { + struct sk_buff *skb; + dma_addr_t phys; +}; + +struct ipath_sdma_txreq { + int flags; + int sg_count; + union { + struct scatterlist *sg; + void *map_addr; + }; + void (*callback)(void *, int); + void *callback_cookie; + int callback_status; + u16 start_idx; /* sdma private */ + u16 next_descq_idx; /* sdma private */ + struct list_head list; /* sdma private */ +}; + +struct ipath_sdma_desc { + __le64 qw[2]; +}; + +#define IPATH_SDMA_TXREQ_F_USELARGEBUF 0x1 +#define IPATH_SDMA_TXREQ_F_HEADTOHOST 0x2 +#define IPATH_SDMA_TXREQ_F_INTREQ 0x4 +#define IPATH_SDMA_TXREQ_F_FREEBUF 0x8 +#define IPATH_SDMA_TXREQ_F_FREEDESC 0x10 +#define IPATH_SDMA_TXREQ_F_VL15 0x20 + +#define IPATH_SDMA_TXREQ_S_OK 0 +#define IPATH_SDMA_TXREQ_S_SENDERROR 1 +#define IPATH_SDMA_TXREQ_S_ABORTED 2 +#define IPATH_SDMA_TXREQ_S_SHUTDOWN 3 + +#define IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG (1ull << 63) +#define IPATH_SDMA_STATUS_ABORT_IN_PROG (1ull << 62) +#define IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE (1ull << 61) +#define IPATH_SDMA_STATUS_SCB_EMPTY (1ull << 30) + +/* max dwords in small buffer packet */ +#define IPATH_SMALLBUF_DWORDS (dd->ipath_piosize2k >> 2) + +/* + * Possible IB config parameters for ipath_f_get/set_ib_cfg() + */ +#define IPATH_IB_CFG_LIDLMC 0 /* Get/set LID (LS16b) and Mask (MS16b) */ +#define IPATH_IB_CFG_HRTBT 1 /* Get/set Heartbeat off/enable/auto */ +#define IPATH_IB_HRTBT_ON 3 /* Heartbeat enabled, sent every 100msec */ +#define IPATH_IB_HRTBT_OFF 0 /* Heartbeat off */ +#define IPATH_IB_CFG_LWID_ENB 2 /* Get/set allowed Link-width */ +#define IPATH_IB_CFG_LWID 3 /* Get currently active Link-width */ +#define IPATH_IB_CFG_SPD_ENB 4 /* Get/set allowed Link speeds */ +#define IPATH_IB_CFG_SPD 5 /* Get current Link spd */ +#define IPATH_IB_CFG_RXPOL_ENB 6 /* Get/set Auto-RX-polarity enable */ +#define IPATH_IB_CFG_LREV_ENB 7 /* Get/set Auto-Lane-reversal enable */ +#define IPATH_IB_CFG_LINKLATENCY 8 /* Get Auto-Lane-reversal enable */ + + +struct ipath_devdata { + struct list_head ipath_list; + + struct ipath_kregs const *ipath_kregs; + struct ipath_cregs const *ipath_cregs; + + /* mem-mapped pointer to base of chip regs */ + u64 __iomem *ipath_kregbase; + /* end of mem-mapped chip space; range checking */ + u64 __iomem *ipath_kregend; + /* physical address of chip for io_remap, etc. */ + unsigned long ipath_physaddr; + /* base of memory alloced for ipath_kregbase, for free */ + u64 *ipath_kregalloc; + /* ipath_cfgports pointers */ + struct ipath_portdata **ipath_pd; + /* sk_buffs used by port 0 eager receive queue */ + struct ipath_skbinfo *ipath_port0_skbinfo; + /* kvirt address of 1st 2k pio buffer */ + void __iomem *ipath_pio2kbase; + /* kvirt address of 1st 4k pio buffer */ + void __iomem *ipath_pio4kbase; + /* + * points to area where PIOavail registers will be DMA'ed. + * Has to be on a page of it's own, because the page will be + * mapped into user program space. This copy is *ONLY* ever + * written by DMA, not by the driver! Need a copy per device + * when we get to multiple devices + */ + volatile __le64 *ipath_pioavailregs_dma; + /* physical address where updates occur */ + dma_addr_t ipath_pioavailregs_phys; + struct _ipath_layer ipath_layer; + /* setup intr */ + int (*ipath_f_intrsetup)(struct ipath_devdata *); + /* fallback to alternate interrupt type if possible */ + int (*ipath_f_intr_fallback)(struct ipath_devdata *); + /* setup on-chip bus config */ + int (*ipath_f_bus)(struct ipath_devdata *, struct pci_dev *); + /* hard reset chip */ + int (*ipath_f_reset)(struct ipath_devdata *); + int (*ipath_f_get_boardname)(struct ipath_devdata *, char *, + size_t); + void (*ipath_f_init_hwerrors)(struct ipath_devdata *); + void (*ipath_f_handle_hwerrors)(struct ipath_devdata *, char *, + size_t); + void (*ipath_f_quiet_serdes)(struct ipath_devdata *); + int (*ipath_f_bringup_serdes)(struct ipath_devdata *); + int (*ipath_f_early_init)(struct ipath_devdata *); + void (*ipath_f_clear_tids)(struct ipath_devdata *, unsigned); + void (*ipath_f_put_tid)(struct ipath_devdata *, u64 __iomem*, + u32, unsigned long); + void (*ipath_f_tidtemplate)(struct ipath_devdata *); + void (*ipath_f_cleanup)(struct ipath_devdata *); + void (*ipath_f_setextled)(struct ipath_devdata *, u64, u64); + /* fill out chip-specific fields */ + int (*ipath_f_get_base_info)(struct ipath_portdata *, void *); + /* free irq */ + void (*ipath_f_free_irq)(struct ipath_devdata *); + struct ipath_message_header *(*ipath_f_get_msgheader) + (struct ipath_devdata *, __le32 *); + void (*ipath_f_config_ports)(struct ipath_devdata *, ushort); + int (*ipath_f_get_ib_cfg)(struct ipath_devdata *, int); + int (*ipath_f_set_ib_cfg)(struct ipath_devdata *, int, u32); + void (*ipath_f_config_jint)(struct ipath_devdata *, u16 , u16); + void (*ipath_f_read_counters)(struct ipath_devdata *, + struct infinipath_counters *); + void (*ipath_f_xgxs_reset)(struct ipath_devdata *); + /* per chip actions needed for IB Link up/down changes */ + int (*ipath_f_ib_updown)(struct ipath_devdata *, int, u64); + + unsigned ipath_lastegr_idx; + struct ipath_ibdev *verbs_dev; + struct timer_list verbs_timer; + /* total dwords sent (summed from counter) */ + u64 ipath_sword; + /* total dwords rcvd (summed from counter) */ + u64 ipath_rword; + /* total packets sent (summed from counter) */ + u64 ipath_spkts; + /* total packets rcvd (summed from counter) */ + u64 ipath_rpkts; + /* ipath_statusp initially points to this. */ + u64 _ipath_status; + /* GUID for this interface, in network order */ + __be64 ipath_guid; + /* + * aggregrate of error bits reported since last cleared, for + * limiting of error reporting + */ + ipath_err_t ipath_lasterror; + /* + * aggregrate of error bits reported since last cleared, for + * limiting of hwerror reporting + */ + ipath_err_t ipath_lasthwerror; + /* errors masked because they occur too fast */ + ipath_err_t ipath_maskederrs; + u64 ipath_lastlinkrecov; /* link recoveries at last ACTIVE */ + /* these 5 fields are used to establish deltas for IB Symbol + * errors and linkrecovery errors. They can be reported on + * some chips during link negotiation prior to INIT, and with + * DDR when faking DDR negotiations with non-IBTA switches. + * The chip counters are adjusted at driver unload if there is + * a non-zero delta. + */ + u64 ibdeltainprog; + u64 ibsymdelta; + u64 ibsymsnap; + u64 iblnkerrdelta; + u64 iblnkerrsnap; + + /* time in jiffies at which to re-enable maskederrs */ + unsigned long ipath_unmasktime; + /* count of egrfull errors, combined for all ports */ + u64 ipath_last_tidfull; + /* for ipath_qcheck() */ + u64 ipath_lastport0rcv_cnt; + /* template for writing TIDs */ + u64 ipath_tidtemplate; + /* value to write to free TIDs */ + u64 ipath_tidinvalid; + /* IBA6120 rcv interrupt setup */ + u64 ipath_rhdrhead_intr_off; + + /* size of memory at ipath_kregbase */ + u32 ipath_kregsize; + /* number of registers used for pioavail */ + u32 ipath_pioavregs; + /* IPATH_POLL, etc. */ + u32 ipath_flags; + /* ipath_flags driver is waiting for */ + u32 ipath_state_wanted; + /* last buffer for user use, first buf for kernel use is this + * index. */ + u32 ipath_lastport_piobuf; + /* is a stats timer active */ + u32 ipath_stats_timer_active; + /* number of interrupts for this device -- saturates... */ + u32 ipath_int_counter; + /* dwords sent read from counter */ + u32 ipath_lastsword; + /* dwords received read from counter */ + u32 ipath_lastrword; + /* sent packets read from counter */ + u32 ipath_lastspkts; + /* received packets read from counter */ + u32 ipath_lastrpkts; + /* pio bufs allocated per port */ + u32 ipath_pbufsport; + /* if remainder on bufs/port, ports < extrabuf get 1 extra */ + u32 ipath_ports_extrabuf; + u32 ipath_pioupd_thresh; /* update threshold, some chips */ + /* + * number of ports configured as max; zero is set to number chip + * supports, less gives more pio bufs/port, etc. + */ + u32 ipath_cfgports; + /* count of port 0 hdrqfull errors */ + u32 ipath_p0_hdrqfull; + /* port 0 number of receive eager buffers */ + u32 ipath_p0_rcvegrcnt; + + /* + * index of last piobuffer we used. Speeds up searching, by + * starting at this point. Doesn't matter if multiple cpu's use and + * update, last updater is only write that matters. Whenever it + * wraps, we update shadow copies. Need a copy per device when we + * get to multiple devices + */ + u32 ipath_lastpioindex; + u32 ipath_lastpioindexl; + /* max length of freezemsg */ + u32 ipath_freezelen; + /* + * consecutive times we wanted a PIO buffer but were unable to + * get one + */ + u32 ipath_consec_nopiobuf; + /* + * hint that we should update ipath_pioavailshadow before + * looking for a PIO buffer + */ + u32 ipath_upd_pio_shadow; + /* so we can rewrite it after a chip reset */ + u32 ipath_pcibar0; + /* so we can rewrite it after a chip reset */ + u32 ipath_pcibar1; + u32 ipath_x1_fix_tries; + u32 ipath_autoneg_tries; + u32 serdes_first_init_done; + + struct ipath_relock { + atomic_t ipath_relock_timer_active; + struct timer_list ipath_relock_timer; + unsigned int ipath_relock_interval; /* in jiffies */ + } ipath_relock_singleton; + + /* interrupt number */ + int ipath_irq; + /* HT/PCI Vendor ID (here for NodeInfo) */ + u16 ipath_vendorid; + /* HT/PCI Device ID (here for NodeInfo) */ + u16 ipath_deviceid; + /* offset in HT config space of slave/primary interface block */ + u8 ipath_ht_slave_off; + /* for write combining settings */ + int wc_cookie; + /* ref count for each pkey */ + atomic_t ipath_pkeyrefs[4]; + /* shadow copy of struct page *'s for exp tid pages */ + struct page **ipath_pageshadow; + /* shadow copy of dma handles for exp tid pages */ + dma_addr_t *ipath_physshadow; + u64 __iomem *ipath_egrtidbase; + /* lock to workaround chip bug 9437 and others */ + spinlock_t ipath_kernel_tid_lock; + spinlock_t ipath_user_tid_lock; + spinlock_t ipath_sendctrl_lock; + /* around ipath_pd and (user ports) port_cnt use (intr vs free) */ + spinlock_t ipath_uctxt_lock; + + /* + * IPATH_STATUS_*, + * this address is mapped readonly into user processes so they can + * get status cheaply, whenever they want. + */ + u64 *ipath_statusp; + /* freeze msg if hw error put chip in freeze */ + char *ipath_freezemsg; + /* pci access data structure */ + struct pci_dev *pcidev; + struct cdev *user_cdev; + struct cdev *diag_cdev; + struct device *user_dev; + struct device *diag_dev; + /* timer used to prevent stats overflow, error throttling, etc. */ + struct timer_list ipath_stats_timer; + /* timer to verify interrupts work, and fallback if possible */ + struct timer_list ipath_intrchk_timer; + void *ipath_dummy_hdrq; /* used after port close */ + dma_addr_t ipath_dummy_hdrq_phys; + + /* SendDMA related entries */ + spinlock_t ipath_sdma_lock; + unsigned long ipath_sdma_status; + unsigned long ipath_sdma_abort_jiffies; + unsigned long ipath_sdma_abort_intr_timeout; + unsigned long ipath_sdma_buf_jiffies; + struct ipath_sdma_desc *ipath_sdma_descq; + u64 ipath_sdma_descq_added; + u64 ipath_sdma_descq_removed; + int ipath_sdma_desc_nreserved; + u16 ipath_sdma_descq_cnt; + u16 ipath_sdma_descq_tail; + u16 ipath_sdma_descq_head; + u16 ipath_sdma_next_intr; + u16 ipath_sdma_reset_wait; + u8 ipath_sdma_generation; + struct tasklet_struct ipath_sdma_abort_task; + struct tasklet_struct ipath_sdma_notify_task; + struct list_head ipath_sdma_activelist; + struct list_head ipath_sdma_notifylist; + atomic_t ipath_sdma_vl15_count; + struct timer_list ipath_sdma_vl15_timer; + + dma_addr_t ipath_sdma_descq_phys; + volatile __le64 *ipath_sdma_head_dma; + dma_addr_t ipath_sdma_head_phys; + + unsigned long ipath_ureg_align; /* user register alignment */ + + struct delayed_work ipath_autoneg_work; + wait_queue_head_t ipath_autoneg_wait; + + /* HoL blocking / user app forward-progress state */ + unsigned ipath_hol_state; + unsigned ipath_hol_next; + struct timer_list ipath_hol_timer; + + /* + * Shadow copies of registers; size indicates read access size. + * Most of them are readonly, but some are write-only register, + * where we manipulate the bits in the shadow copy, and then write + * the shadow copy to infinipath. + * + * We deliberately make most of these 32 bits, since they have + * restricted range. For any that we read, we won't to generate 32 + * bit accesses, since Opteron will generate 2 separate 32 bit HT + * transactions for a 64 bit read, and we want to avoid unnecessary + * HT transactions. + */ + + /* This is the 64 bit group */ + + /* + * shadow of pioavail, check to be sure it's large enough at + * init time. + */ + unsigned long ipath_pioavailshadow[8]; + /* bitmap of send buffers available for the kernel to use with PIO. */ + unsigned long ipath_pioavailkernel[8]; + /* shadow of kr_gpio_out, for rmw ops */ + u64 ipath_gpio_out; + /* shadow the gpio mask register */ + u64 ipath_gpio_mask; + /* shadow the gpio output enable, etc... */ + u64 ipath_extctrl; + /* kr_revision shadow */ + u64 ipath_revision; + /* + * shadow of ibcctrl, for interrupt handling of link changes, + * etc. + */ + u64 ipath_ibcctrl; + /* + * last ibcstatus, to suppress "duplicate" status change messages, + * mostly from 2 to 3 + */ + u64 ipath_lastibcstat; + /* hwerrmask shadow */ + ipath_err_t ipath_hwerrmask; + ipath_err_t ipath_errormask; /* errormask shadow */ + /* interrupt config reg shadow */ + u64 ipath_intconfig; + /* kr_sendpiobufbase value */ + u64 ipath_piobufbase; + /* kr_ibcddrctrl shadow */ + u64 ipath_ibcddrctrl; + + /* these are the "32 bit" regs */ + + /* + * number of GUIDs in the flash for this interface; may need some + * rethinking for setting on other ifaces + */ + u32 ipath_nguid; + /* + * the following two are 32-bit bitmasks, but {test,clear,set}_bit + * all expect bit fields to be "unsigned long" + */ + /* shadow kr_rcvctrl */ + unsigned long ipath_rcvctrl; + /* shadow kr_sendctrl */ + unsigned long ipath_sendctrl; + /* to not count armlaunch after cancel */ + unsigned long ipath_lastcancel; + /* count cases where special trigger was needed (double write) */ + unsigned long ipath_spectriggerhit; + + /* value we put in kr_rcvhdrcnt */ + u32 ipath_rcvhdrcnt; + /* value we put in kr_rcvhdrsize */ + u32 ipath_rcvhdrsize; + /* value we put in kr_rcvhdrentsize */ + u32 ipath_rcvhdrentsize; + /* offset of last entry in rcvhdrq */ + u32 ipath_hdrqlast; + /* kr_portcnt value */ + u32 ipath_portcnt; + /* kr_pagealign value */ + u32 ipath_palign; + /* number of "2KB" PIO buffers */ + u32 ipath_piobcnt2k; + /* size in bytes of "2KB" PIO buffers */ + u32 ipath_piosize2k; + /* number of "4KB" PIO buffers */ + u32 ipath_piobcnt4k; + /* size in bytes of "4KB" PIO buffers */ + u32 ipath_piosize4k; + u32 ipath_pioreserved; /* reserved special-inkernel; */ + /* kr_rcvegrbase value */ + u32 ipath_rcvegrbase; + /* kr_rcvegrcnt value */ + u32 ipath_rcvegrcnt; + /* kr_rcvtidbase value */ + u32 ipath_rcvtidbase; + /* kr_rcvtidcnt value */ + u32 ipath_rcvtidcnt; + /* kr_sendregbase */ + u32 ipath_sregbase; + /* kr_userregbase */ + u32 ipath_uregbase; + /* kr_counterregbase */ + u32 ipath_cregbase; + /* shadow the control register contents */ + u32 ipath_control; + /* PCI revision register (HTC rev on FPGA) */ + u32 ipath_pcirev; + + /* chip address space used by 4k pio buffers */ + u32 ipath_4kalign; + /* The MTU programmed for this unit */ + u32 ipath_ibmtu; + /* + * The max size IB packet, included IB headers that we can send. + * Starts same as ipath_piosize, but is affected when ibmtu is + * changed, or by size of eager buffers + */ + u32 ipath_ibmaxlen; + /* + * ibmaxlen at init time, limited by chip and by receive buffer + * size. Not changed after init. + */ + u32 ipath_init_ibmaxlen; + /* size of each rcvegrbuffer */ + u32 ipath_rcvegrbufsize; + /* localbus width (1, 2,4,8,16,32) from config space */ + u32 ipath_lbus_width; + /* localbus speed (HT: 200,400,800,1000; PCIe 2500) */ + u32 ipath_lbus_speed; + /* + * number of sequential ibcstatus change for polling active/quiet + * (i.e., link not coming up). + */ + u32 ipath_ibpollcnt; + /* low and high portions of MSI capability/vector */ + u32 ipath_msi_lo; + /* saved after PCIe init for restore after reset */ + u32 ipath_msi_hi; + /* MSI data (vector) saved for restore */ + u16 ipath_msi_data; + /* MLID programmed for this instance */ + u16 ipath_mlid; + /* LID programmed for this instance */ + u16 ipath_lid; + /* list of pkeys programmed; 0 if not set */ + u16 ipath_pkeys[4]; + /* + * ASCII serial number, from flash, large enough for original + * all digit strings, and longer QLogic serial number format + */ + u8 ipath_serial[16]; + /* human readable board version */ + u8 ipath_boardversion[96]; + u8 ipath_lbus_info[32]; /* human readable localbus info */ + /* chip major rev, from ipath_revision */ + u8 ipath_majrev; + /* chip minor rev, from ipath_revision */ + u8 ipath_minrev; + /* board rev, from ipath_revision */ + u8 ipath_boardrev; + /* saved for restore after reset */ + u8 ipath_pci_cacheline; + /* LID mask control */ + u8 ipath_lmc; + /* link width supported */ + u8 ipath_link_width_supported; + /* link speed supported */ + u8 ipath_link_speed_supported; + u8 ipath_link_width_enabled; + u8 ipath_link_speed_enabled; + u8 ipath_link_width_active; + u8 ipath_link_speed_active; + /* Rx Polarity inversion (compensate for ~tx on partner) */ + u8 ipath_rx_pol_inv; + + u8 ipath_r_portenable_shift; + u8 ipath_r_intravail_shift; + u8 ipath_r_tailupd_shift; + u8 ipath_r_portcfg_shift; + + /* unit # of this chip, if present */ + int ipath_unit; + + /* local link integrity counter */ + u32 ipath_lli_counter; + /* local link integrity errors */ + u32 ipath_lli_errors; + /* + * Above counts only cases where _successive_ LocalLinkIntegrity + * errors were seen in the receive headers of kern-packets. + * Below are the three (monotonically increasing) counters + * maintained via GPIO interrupts on iba6120-rev2. + */ + u32 ipath_rxfc_unsupvl_errs; + u32 ipath_overrun_thresh_errs; + u32 ipath_lli_errs; + + /* + * Not all devices managed by a driver instance are the same + * type, so these fields must be per-device. + */ + u64 ipath_i_bitsextant; + ipath_err_t ipath_e_bitsextant; + ipath_err_t ipath_hwe_bitsextant; + + /* + * Below should be computable from number of ports, + * since they are never modified. + */ + u64 ipath_i_rcvavail_mask; + u64 ipath_i_rcvurg_mask; + u16 ipath_i_rcvurg_shift; + u16 ipath_i_rcvavail_shift; + + /* + * Register bits for selecting i2c direction and values, used for + * I2C serial flash. + */ + u8 ipath_gpio_sda_num; + u8 ipath_gpio_scl_num; + u8 ipath_i2c_chain_type; + u64 ipath_gpio_sda; + u64 ipath_gpio_scl; + + /* lock for doing RMW of shadows/regs for ExtCtrl and GPIO */ + spinlock_t ipath_gpio_lock; + + /* + * IB link and linktraining states and masks that vary per chip in + * some way. Set at init, to avoid each IB status change interrupt + */ + u8 ibcs_ls_shift; + u8 ibcs_lts_mask; + u32 ibcs_mask; + u32 ib_init; + u32 ib_arm; + u32 ib_active; + + u16 ipath_rhf_offset; /* offset of RHF within receive header entry */ + + /* + * shift/mask for linkcmd, linkinitcmd, maxpktlen in ibccontol + * reg. Changes for IBA7220 + */ + u8 ibcc_lic_mask; /* LinkInitCmd */ + u8 ibcc_lc_shift; /* LinkCmd */ + u8 ibcc_mpl_shift; /* Maxpktlen */ + + u8 delay_mult; + + /* used to override LED behavior */ + u8 ipath_led_override; /* Substituted for normal value, if non-zero */ + u16 ipath_led_override_timeoff; /* delta to next timer event */ + u8 ipath_led_override_vals[2]; /* Alternates per blink-frame */ + u8 ipath_led_override_phase; /* Just counts, LSB picks from vals[] */ + atomic_t ipath_led_override_timer_active; + /* Used to flash LEDs in override mode */ + struct timer_list ipath_led_override_timer; + + /* Support (including locks) for EEPROM logging of errors and time */ + /* control access to actual counters, timer */ + spinlock_t ipath_eep_st_lock; + /* control high-level access to EEPROM */ + struct mutex ipath_eep_lock; + /* Below inc'd by ipath_snap_cntrs(), locked by ipath_eep_st_lock */ + uint64_t ipath_traffic_wds; + /* active time is kept in seconds, but logged in hours */ + atomic_t ipath_active_time; + /* Below are nominal shadow of EEPROM, new since last EEPROM update */ + uint8_t ipath_eep_st_errs[IPATH_EEP_LOG_CNT]; + uint8_t ipath_eep_st_new_errs[IPATH_EEP_LOG_CNT]; + uint16_t ipath_eep_hrs; + /* + * masks for which bits of errs, hwerrs that cause + * each of the counters to increment. + */ + struct ipath_eep_log_mask ipath_eep_st_masks[IPATH_EEP_LOG_CNT]; + + /* interrupt mitigation reload register info */ + u16 ipath_jint_idle_ticks; /* idle clock ticks */ + u16 ipath_jint_max_packets; /* max packets across all ports */ + + /* + * lock for access to SerDes, and flags to sequence preset + * versus steady-state. 7220-only at the moment. + */ + spinlock_t ipath_sdepb_lock; + u8 ipath_presets_needed; /* Set if presets to be restored next DOWN */ +}; + +/* ipath_hol_state values (stopping/starting user proc, send flushing) */ +#define IPATH_HOL_UP 0 +#define IPATH_HOL_DOWN 1 +/* ipath_hol_next toggle values, used when hol_state IPATH_HOL_DOWN */ +#define IPATH_HOL_DOWNSTOP 0 +#define IPATH_HOL_DOWNCONT 1 + +/* bit positions for sdma_status */ +#define IPATH_SDMA_ABORTING 0 +#define IPATH_SDMA_DISARMED 1 +#define IPATH_SDMA_DISABLED 2 +#define IPATH_SDMA_LAYERBUF 3 +#define IPATH_SDMA_RUNNING 30 +#define IPATH_SDMA_SHUTDOWN 31 + +/* bit combinations that correspond to abort states */ +#define IPATH_SDMA_ABORT_NONE 0 +#define IPATH_SDMA_ABORT_ABORTING (1UL << IPATH_SDMA_ABORTING) +#define IPATH_SDMA_ABORT_DISARMED ((1UL << IPATH_SDMA_ABORTING) | \ + (1UL << IPATH_SDMA_DISARMED)) +#define IPATH_SDMA_ABORT_DISABLED ((1UL << IPATH_SDMA_ABORTING) | \ + (1UL << IPATH_SDMA_DISABLED)) +#define IPATH_SDMA_ABORT_ABORTED ((1UL << IPATH_SDMA_ABORTING) | \ + (1UL << IPATH_SDMA_DISARMED) | (1UL << IPATH_SDMA_DISABLED)) +#define IPATH_SDMA_ABORT_MASK ((1UL<private_data)->pd +#define subport_fp(fp) \ + ((struct ipath_filedata *)(fp)->private_data)->subport +#define tidcursor_fp(fp) \ + ((struct ipath_filedata *)(fp)->private_data)->tidcursor +#define user_sdma_queue_fp(fp) \ + ((struct ipath_filedata *)(fp)->private_data)->pq + +/* + * values for ipath_flags + */ + /* chip can report link latency (IB 1.2) */ +#define IPATH_HAS_LINK_LATENCY 0x1 + /* The chip is up and initted */ +#define IPATH_INITTED 0x2 + /* set if any user code has set kr_rcvhdrsize */ +#define IPATH_RCVHDRSZ_SET 0x4 + /* The chip is present and valid for accesses */ +#define IPATH_PRESENT 0x8 + /* HT link0 is only 8 bits wide, ignore upper byte crc + * errors, etc. */ +#define IPATH_8BIT_IN_HT0 0x10 + /* HT link1 is only 8 bits wide, ignore upper byte crc + * errors, etc. */ +#define IPATH_8BIT_IN_HT1 0x20 + /* The link is down */ +#define IPATH_LINKDOWN 0x40 + /* The link level is up (0x11) */ +#define IPATH_LINKINIT 0x80 + /* The link is in the armed (0x21) state */ +#define IPATH_LINKARMED 0x100 + /* The link is in the active (0x31) state */ +#define IPATH_LINKACTIVE 0x200 + /* link current state is unknown */ +#define IPATH_LINKUNK 0x400 + /* Write combining flush needed for PIO */ +#define IPATH_PIO_FLUSH_WC 0x1000 + /* DMA Receive tail pointer */ +#define IPATH_NODMA_RTAIL 0x2000 + /* no IB cable, or no device on IB cable */ +#define IPATH_NOCABLE 0x4000 + /* Supports port zero per packet receive interrupts via + * GPIO */ +#define IPATH_GPIO_INTR 0x8000 + /* uses the coded 4byte TID, not 8 byte */ +#define IPATH_4BYTE_TID 0x10000 + /* packet/word counters are 32 bit, else those 4 counters + * are 64bit */ +#define IPATH_32BITCOUNTERS 0x20000 + /* Interrupt register is 64 bits */ +#define IPATH_INTREG_64 0x40000 + /* can miss port0 rx interrupts */ +#define IPATH_DISABLED 0x80000 /* administratively disabled */ + /* Use GPIO interrupts for new counters */ +#define IPATH_GPIO_ERRINTRS 0x100000 +#define IPATH_SWAP_PIOBUFS 0x200000 + /* Supports Send DMA */ +#define IPATH_HAS_SEND_DMA 0x400000 + /* Supports Send Count (not just word count) in PBC */ +#define IPATH_HAS_PBC_CNT 0x800000 + /* Suppress heartbeat, even if turning off loopback */ +#define IPATH_NO_HRTBT 0x1000000 +#define IPATH_HAS_THRESH_UPDATE 0x4000000 +#define IPATH_HAS_MULT_IB_SPEED 0x8000000 +#define IPATH_IB_AUTONEG_INPROG 0x10000000 +#define IPATH_IB_AUTONEG_FAILED 0x20000000 + /* Linkdown-disable intentionally, Do not attempt to bring up */ +#define IPATH_IB_LINK_DISABLED 0x40000000 +#define IPATH_IB_FORCE_NOTIFY 0x80000000 /* force notify on next ib change */ + +/* Bits in GPIO for the added interrupts */ +#define IPATH_GPIO_PORT0_BIT 2 +#define IPATH_GPIO_RXUVL_BIT 3 +#define IPATH_GPIO_OVRUN_BIT 4 +#define IPATH_GPIO_LLI_BIT 5 +#define IPATH_GPIO_ERRINTR_MASK 0x38 + +/* portdata flag bit offsets */ + /* waiting for a packet to arrive */ +#define IPATH_PORT_WAITING_RCV 2 + /* master has not finished initializing */ +#define IPATH_PORT_MASTER_UNINIT 4 + /* waiting for an urgent packet to arrive */ +#define IPATH_PORT_WAITING_URG 5 + +/* free up any allocated data at closes */ +void ipath_free_data(struct ipath_portdata *dd); +u32 __iomem *ipath_getpiobuf(struct ipath_devdata *, u32, u32 *); +void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start, + unsigned len, int avail); +void ipath_init_iba6110_funcs(struct ipath_devdata *); +void ipath_get_eeprom_info(struct ipath_devdata *); +int ipath_update_eeprom_log(struct ipath_devdata *dd); +void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr); +u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg); +void ipath_disarm_senderrbufs(struct ipath_devdata *); +void ipath_force_pio_avail_update(struct ipath_devdata *); +void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev); + +/* + * Set LED override, only the two LSBs have "public" meaning, but + * any non-zero value substitutes them for the Link and LinkTrain + * LED states. + */ +#define IPATH_LED_PHYS 1 /* Physical (linktraining) GREEN LED */ +#define IPATH_LED_LOG 2 /* Logical (link) YELLOW LED */ +void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val); + +/* send dma routines */ +int setup_sdma(struct ipath_devdata *); +void teardown_sdma(struct ipath_devdata *); +void ipath_restart_sdma(struct ipath_devdata *); +void ipath_sdma_intr(struct ipath_devdata *); +int ipath_sdma_verbs_send(struct ipath_devdata *, struct ipath_sge_state *, + u32, struct ipath_verbs_txreq *); +/* ipath_sdma_lock should be locked before calling this. */ +int ipath_sdma_make_progress(struct ipath_devdata *dd); + +/* must be called under ipath_sdma_lock */ +static inline u16 ipath_sdma_descq_freecnt(const struct ipath_devdata *dd) +{ + return dd->ipath_sdma_descq_cnt - + (dd->ipath_sdma_descq_added - dd->ipath_sdma_descq_removed) - + 1 - dd->ipath_sdma_desc_nreserved; +} + +static inline void ipath_sdma_desc_reserve(struct ipath_devdata *dd, u16 cnt) +{ + dd->ipath_sdma_desc_nreserved += cnt; +} + +static inline void ipath_sdma_desc_unreserve(struct ipath_devdata *dd, u16 cnt) +{ + dd->ipath_sdma_desc_nreserved -= cnt; +} + +/* + * number of words used for protocol header if not set by ipath_userinit(); + */ +#define IPATH_DFLT_RCVHDRSIZE 9 + +int ipath_get_user_pages(unsigned long, size_t, struct page **); +void ipath_release_user_pages(struct page **, size_t); +void ipath_release_user_pages_on_close(struct page **, size_t); +int ipath_eeprom_read(struct ipath_devdata *, u8, void *, int); +int ipath_eeprom_write(struct ipath_devdata *, u8, const void *, int); +int ipath_tempsense_read(struct ipath_devdata *, u8 regnum); +int ipath_tempsense_write(struct ipath_devdata *, u8 regnum, u8 data); + +/* these are used for the registers that vary with port */ +void ipath_write_kreg_port(const struct ipath_devdata *, ipath_kreg, + unsigned, u64); + +/* + * We could have a single register get/put routine, that takes a group type, + * but this is somewhat clearer and cleaner. It also gives us some error + * checking. 64 bit register reads should always work, but are inefficient + * on opteron (the northbridge always generates 2 separate HT 32 bit reads), + * so we use kreg32 wherever possible. User register and counter register + * reads are always 32 bit reads, so only one form of those routines. + */ + +/* + * At the moment, none of the s-registers are writable, so no + * ipath_write_sreg(). + */ + +/** + * ipath_read_ureg32 - read 32-bit virtualized per-port register + * @dd: device + * @regno: register number + * @port: port number + * + * Return the contents of a register that is virtualized to be per port. + * Returns -1 on errors (not distinguishable from valid contents at + * runtime; we may add a separate error variable at some point). + */ +static inline u32 ipath_read_ureg32(const struct ipath_devdata *dd, + ipath_ureg regno, int port) +{ + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) + return 0; + + return readl(regno + (u64 __iomem *) + (dd->ipath_uregbase + + (char __iomem *)dd->ipath_kregbase + + dd->ipath_ureg_align * port)); +} + +/** + * ipath_write_ureg - write 32-bit virtualized per-port register + * @dd: device + * @regno: register number + * @value: value + * @port: port + * + * Write the contents of a register that is virtualized to be per port. + */ +static inline void ipath_write_ureg(const struct ipath_devdata *dd, + ipath_ureg regno, u64 value, int port) +{ + u64 __iomem *ubase = (u64 __iomem *) + (dd->ipath_uregbase + (char __iomem *) dd->ipath_kregbase + + dd->ipath_ureg_align * port); + if (dd->ipath_kregbase) + writeq(value, &ubase[regno]); +} + +static inline u32 ipath_read_kreg32(const struct ipath_devdata *dd, + ipath_kreg regno) +{ + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) + return -1; + return readl((u32 __iomem *) & dd->ipath_kregbase[regno]); +} + +static inline u64 ipath_read_kreg64(const struct ipath_devdata *dd, + ipath_kreg regno) +{ + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) + return -1; + + return readq(&dd->ipath_kregbase[regno]); +} + +static inline void ipath_write_kreg(const struct ipath_devdata *dd, + ipath_kreg regno, u64 value) +{ + if (dd->ipath_kregbase) + writeq(value, &dd->ipath_kregbase[regno]); +} + +static inline u64 ipath_read_creg(const struct ipath_devdata *dd, + ipath_sreg regno) +{ + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) + return 0; + + return readq(regno + (u64 __iomem *) + (dd->ipath_cregbase + + (char __iomem *)dd->ipath_kregbase)); +} + +static inline u32 ipath_read_creg32(const struct ipath_devdata *dd, + ipath_sreg regno) +{ + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) + return 0; + return readl(regno + (u64 __iomem *) + (dd->ipath_cregbase + + (char __iomem *)dd->ipath_kregbase)); +} + +static inline void ipath_write_creg(const struct ipath_devdata *dd, + ipath_creg regno, u64 value) +{ + if (dd->ipath_kregbase) + writeq(value, regno + (u64 __iomem *) + (dd->ipath_cregbase + + (char __iomem *)dd->ipath_kregbase)); +} + +static inline void ipath_clear_rcvhdrtail(const struct ipath_portdata *pd) +{ + *((u64 *) pd->port_rcvhdrtail_kvaddr) = 0ULL; +} + +static inline u32 ipath_get_rcvhdrtail(const struct ipath_portdata *pd) +{ + return (u32) le64_to_cpu(*((volatile __le64 *) + pd->port_rcvhdrtail_kvaddr)); +} + +static inline u32 ipath_get_hdrqtail(const struct ipath_portdata *pd) +{ + const struct ipath_devdata *dd = pd->port_dd; + u32 hdrqtail; + + if (dd->ipath_flags & IPATH_NODMA_RTAIL) { + __le32 *rhf_addr; + u32 seq; + + rhf_addr = (__le32 *) pd->port_rcvhdrq + + pd->port_head + dd->ipath_rhf_offset; + seq = ipath_hdrget_seq(rhf_addr); + hdrqtail = pd->port_head; + if (seq == pd->port_seq_cnt) + hdrqtail++; + } else + hdrqtail = ipath_get_rcvhdrtail(pd); + + return hdrqtail; +} + +static inline u64 ipath_read_ireg(const struct ipath_devdata *dd, ipath_kreg r) +{ + return (dd->ipath_flags & IPATH_INTREG_64) ? + ipath_read_kreg64(dd, r) : ipath_read_kreg32(dd, r); +} + +/* + * from contents of IBCStatus (or a saved copy), return linkstate + * Report ACTIVE_DEFER as ACTIVE, because we treat them the same + * everywhere, anyway (and should be, for almost all purposes). + */ +static inline u32 ipath_ib_linkstate(struct ipath_devdata *dd, u64 ibcs) +{ + u32 state = (u32)(ibcs >> dd->ibcs_ls_shift) & + INFINIPATH_IBCS_LINKSTATE_MASK; + if (state == INFINIPATH_IBCS_L_STATE_ACT_DEFER) + state = INFINIPATH_IBCS_L_STATE_ACTIVE; + return state; +} + +/* from contents of IBCStatus (or a saved copy), return linktrainingstate */ +static inline u32 ipath_ib_linktrstate(struct ipath_devdata *dd, u64 ibcs) +{ + return (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) & + dd->ibcs_lts_mask; +} + +/* + * from contents of IBCStatus (or a saved copy), return logical link state + * combination of link state and linktraining state (down, active, init, + * arm, etc. + */ +static inline u32 ipath_ib_state(struct ipath_devdata *dd, u64 ibcs) +{ + u32 ibs; + ibs = (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) & + dd->ibcs_lts_mask; + ibs |= (u32)(ibcs & + (INFINIPATH_IBCS_LINKSTATE_MASK << dd->ibcs_ls_shift)); + return ibs; +} + +/* + * sysfs interface. + */ + +struct device_driver; + +extern const char ib_ipath_version[]; + +extern const struct attribute_group *ipath_driver_attr_groups[]; + +int ipath_device_create_group(struct device *, struct ipath_devdata *); +void ipath_device_remove_group(struct device *, struct ipath_devdata *); +int ipath_expose_reset(struct device *); + +int ipath_init_ipathfs(void); +void ipath_exit_ipathfs(void); +int ipathfs_add_device(struct ipath_devdata *); +int ipathfs_remove_device(struct ipath_devdata *); + +/* + * dma_addr wrappers - all 0's invalid for hw + */ +dma_addr_t ipath_map_page(struct pci_dev *, struct page *, unsigned long, + size_t, int); +dma_addr_t ipath_map_single(struct pci_dev *, void *, size_t, int); +const char *ipath_get_unit_name(int unit); + +/* + * Flush write combining store buffers (if present) and perform a write + * barrier. + */ +#if defined(CONFIG_X86_64) +#define ipath_flush_wc() asm volatile("sfence" ::: "memory") +#else +#define ipath_flush_wc() wmb() +#endif + +extern unsigned ipath_debug; /* debugging bit mask */ +extern unsigned ipath_linkrecovery; +extern unsigned ipath_mtu4096; +extern struct mutex ipath_mutex; + +#define IPATH_DRV_NAME "ib_ipath" +#define IPATH_MAJOR 233 +#define IPATH_USER_MINOR_BASE 0 +#define IPATH_DIAGPKT_MINOR 127 +#define IPATH_DIAG_MINOR_BASE 129 +#define IPATH_NMINORS 255 + +#define ipath_dev_err(dd,fmt,...) \ + do { \ + const struct ipath_devdata *__dd = (dd); \ + if (__dd->pcidev) \ + dev_err(&__dd->pcidev->dev, "%s: " fmt, \ + ipath_get_unit_name(__dd->ipath_unit), \ + ##__VA_ARGS__); \ + else \ + printk(KERN_ERR IPATH_DRV_NAME ": %s: " fmt, \ + ipath_get_unit_name(__dd->ipath_unit), \ + ##__VA_ARGS__); \ + } while (0) + +#if _IPATH_DEBUGGING + +# define __IPATH_DBG_WHICH(which,fmt,...) \ + do { \ + if (unlikely(ipath_debug & (which))) \ + printk(KERN_DEBUG IPATH_DRV_NAME ": %s: " fmt, \ + __func__,##__VA_ARGS__); \ + } while(0) + +# define ipath_dbg(fmt,...) \ + __IPATH_DBG_WHICH(__IPATH_DBG,fmt,##__VA_ARGS__) +# define ipath_cdbg(which,fmt,...) \ + __IPATH_DBG_WHICH(__IPATH_##which##DBG,fmt,##__VA_ARGS__) + +#else /* ! _IPATH_DEBUGGING */ + +# define ipath_dbg(fmt,...) +# define ipath_cdbg(which,fmt,...) + +#endif /* _IPATH_DEBUGGING */ + +/* + * this is used for formatting hw error messages... + */ +struct ipath_hwerror_msgs { + u64 mask; + const char *msg; +}; + +#define INFINIPATH_HWE_MSG(a, b) { .mask = INFINIPATH_HWE_##a, .msg = b } + +/* in ipath_intr.c... */ +void ipath_format_hwerrors(u64 hwerrs, + const struct ipath_hwerror_msgs *hwerrmsgs, + size_t nhwerrmsgs, + char *msg, size_t lmsg); + +#endif /* _IPATH_KERNEL_H */ diff --git a/drivers/staging/rdma/ipath/ipath_keys.c b/drivers/staging/rdma/ipath/ipath_keys.c new file mode 100644 index 000000000..c0e933fec --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_keys.c @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "ipath_verbs.h" +#include "ipath_kernel.h" + +/** + * ipath_alloc_lkey - allocate an lkey + * @rkt: lkey table in which to allocate the lkey + * @mr: memory region that this lkey protects + * + * Returns 1 if successful, otherwise returns 0. + */ + +int ipath_alloc_lkey(struct ipath_lkey_table *rkt, struct ipath_mregion *mr) +{ + unsigned long flags; + u32 r; + u32 n; + int ret; + + spin_lock_irqsave(&rkt->lock, flags); + + /* Find the next available LKEY */ + r = n = rkt->next; + for (;;) { + if (rkt->table[r] == NULL) + break; + r = (r + 1) & (rkt->max - 1); + if (r == n) { + spin_unlock_irqrestore(&rkt->lock, flags); + ipath_dbg("LKEY table full\n"); + ret = 0; + goto bail; + } + } + rkt->next = (r + 1) & (rkt->max - 1); + /* + * Make sure lkey is never zero which is reserved to indicate an + * unrestricted LKEY. + */ + rkt->gen++; + mr->lkey = (r << (32 - ib_ipath_lkey_table_size)) | + ((((1 << (24 - ib_ipath_lkey_table_size)) - 1) & rkt->gen) + << 8); + if (mr->lkey == 0) { + mr->lkey |= 1 << 8; + rkt->gen++; + } + rkt->table[r] = mr; + spin_unlock_irqrestore(&rkt->lock, flags); + + ret = 1; + +bail: + return ret; +} + +/** + * ipath_free_lkey - free an lkey + * @rkt: table from which to free the lkey + * @lkey: lkey id to free + */ +void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey) +{ + unsigned long flags; + u32 r; + + if (lkey == 0) + return; + r = lkey >> (32 - ib_ipath_lkey_table_size); + spin_lock_irqsave(&rkt->lock, flags); + rkt->table[r] = NULL; + spin_unlock_irqrestore(&rkt->lock, flags); +} + +/** + * ipath_lkey_ok - check IB SGE for validity and initialize + * @rkt: table containing lkey to check SGE against + * @isge: outgoing internal SGE + * @sge: SGE to check + * @acc: access flags + * + * Return 1 if valid and successful, otherwise returns 0. + * + * Check the IB SGE for validity and initialize our internal version + * of it. + */ +int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge, + struct ib_sge *sge, int acc) +{ + struct ipath_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table; + struct ipath_mregion *mr; + unsigned n, m; + size_t off; + int ret; + + /* + * We use LKEY == zero for kernel virtual addresses + * (see ipath_get_dma_mr and ipath_dma.c). + */ + if (sge->lkey == 0) { + /* always a kernel port, no locking needed */ + struct ipath_pd *pd = to_ipd(qp->ibqp.pd); + + if (pd->user) { + ret = 0; + goto bail; + } + isge->mr = NULL; + isge->vaddr = (void *) sge->addr; + isge->length = sge->length; + isge->sge_length = sge->length; + ret = 1; + goto bail; + } + mr = rkt->table[(sge->lkey >> (32 - ib_ipath_lkey_table_size))]; + if (unlikely(mr == NULL || mr->lkey != sge->lkey || + qp->ibqp.pd != mr->pd)) { + ret = 0; + goto bail; + } + + off = sge->addr - mr->user_base; + if (unlikely(sge->addr < mr->user_base || + off + sge->length > mr->length || + (mr->access_flags & acc) != acc)) { + ret = 0; + goto bail; + } + + off += mr->offset; + m = 0; + n = 0; + while (off >= mr->map[m]->segs[n].length) { + off -= mr->map[m]->segs[n].length; + n++; + if (n >= IPATH_SEGSZ) { + m++; + n = 0; + } + } + isge->mr = mr; + isge->vaddr = mr->map[m]->segs[n].vaddr + off; + isge->length = mr->map[m]->segs[n].length - off; + isge->sge_length = sge->length; + isge->m = m; + isge->n = n; + + ret = 1; + +bail: + return ret; +} + +/** + * ipath_rkey_ok - check the IB virtual address, length, and RKEY + * @dev: infiniband device + * @ss: SGE state + * @len: length of data + * @vaddr: virtual address to place data + * @rkey: rkey to check + * @acc: access flags + * + * Return 1 if successful, otherwise 0. + */ +int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss, + u32 len, u64 vaddr, u32 rkey, int acc) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_lkey_table *rkt = &dev->lk_table; + struct ipath_sge *sge = &ss->sge; + struct ipath_mregion *mr; + unsigned n, m; + size_t off; + int ret; + + /* + * We use RKEY == zero for kernel virtual addresses + * (see ipath_get_dma_mr and ipath_dma.c). + */ + if (rkey == 0) { + /* always a kernel port, no locking needed */ + struct ipath_pd *pd = to_ipd(qp->ibqp.pd); + + if (pd->user) { + ret = 0; + goto bail; + } + sge->mr = NULL; + sge->vaddr = (void *) vaddr; + sge->length = len; + sge->sge_length = len; + ss->sg_list = NULL; + ss->num_sge = 1; + ret = 1; + goto bail; + } + + mr = rkt->table[(rkey >> (32 - ib_ipath_lkey_table_size))]; + if (unlikely(mr == NULL || mr->lkey != rkey || + qp->ibqp.pd != mr->pd)) { + ret = 0; + goto bail; + } + + off = vaddr - mr->iova; + if (unlikely(vaddr < mr->iova || off + len > mr->length || + (mr->access_flags & acc) == 0)) { + ret = 0; + goto bail; + } + + off += mr->offset; + m = 0; + n = 0; + while (off >= mr->map[m]->segs[n].length) { + off -= mr->map[m]->segs[n].length; + n++; + if (n >= IPATH_SEGSZ) { + m++; + n = 0; + } + } + sge->mr = mr; + sge->vaddr = mr->map[m]->segs[n].vaddr + off; + sge->length = mr->map[m]->segs[n].length - off; + sge->sge_length = len; + sge->m = m; + sge->n = n; + ss->sg_list = NULL; + ss->num_sge = 1; + + ret = 1; + +bail: + return ret; +} diff --git a/drivers/staging/rdma/ipath/ipath_mad.c b/drivers/staging/rdma/ipath/ipath_mad.c new file mode 100644 index 000000000..ad3a926ab --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_mad.c @@ -0,0 +1,1521 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "ipath_kernel.h" +#include "ipath_verbs.h" +#include "ipath_common.h" + +#define IB_SMP_UNSUP_VERSION cpu_to_be16(0x0004) +#define IB_SMP_UNSUP_METHOD cpu_to_be16(0x0008) +#define IB_SMP_UNSUP_METH_ATTR cpu_to_be16(0x000C) +#define IB_SMP_INVALID_FIELD cpu_to_be16(0x001C) + +static int reply(struct ib_smp *smp) +{ + /* + * The verbs framework will handle the directed/LID route + * packet changes. + */ + smp->method = IB_MGMT_METHOD_GET_RESP; + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + smp->status |= IB_SMP_DIRECTION; + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static int recv_subn_get_nodedescription(struct ib_smp *smp, + struct ib_device *ibdev) +{ + if (smp->attr_mod) + smp->status |= IB_SMP_INVALID_FIELD; + + memcpy(smp->data, ibdev->node_desc, sizeof(smp->data)); + + return reply(smp); +} + +struct nodeinfo { + u8 base_version; + u8 class_version; + u8 node_type; + u8 num_ports; + __be64 sys_guid; + __be64 node_guid; + __be64 port_guid; + __be16 partition_cap; + __be16 device_id; + __be32 revision; + u8 local_port_num; + u8 vendor_id[3]; +} __attribute__ ((packed)); + +static int recv_subn_get_nodeinfo(struct ib_smp *smp, + struct ib_device *ibdev, u8 port) +{ + struct nodeinfo *nip = (struct nodeinfo *)&smp->data; + struct ipath_devdata *dd = to_idev(ibdev)->dd; + u32 vendor, majrev, minrev; + + /* GUID 0 is illegal */ + if (smp->attr_mod || (dd->ipath_guid == 0)) + smp->status |= IB_SMP_INVALID_FIELD; + + nip->base_version = 1; + nip->class_version = 1; + nip->node_type = 1; /* channel adapter */ + /* + * XXX The num_ports value will need a layer function to get + * the value if we ever have more than one IB port on a chip. + * We will also need to get the GUID for the port. + */ + nip->num_ports = ibdev->phys_port_cnt; + /* This is already in network order */ + nip->sys_guid = to_idev(ibdev)->sys_image_guid; + nip->node_guid = dd->ipath_guid; + nip->port_guid = dd->ipath_guid; + nip->partition_cap = cpu_to_be16(ipath_get_npkeys(dd)); + nip->device_id = cpu_to_be16(dd->ipath_deviceid); + majrev = dd->ipath_majrev; + minrev = dd->ipath_minrev; + nip->revision = cpu_to_be32((majrev << 16) | minrev); + nip->local_port_num = port; + vendor = dd->ipath_vendorid; + nip->vendor_id[0] = IPATH_SRC_OUI_1; + nip->vendor_id[1] = IPATH_SRC_OUI_2; + nip->vendor_id[2] = IPATH_SRC_OUI_3; + + return reply(smp); +} + +static int recv_subn_get_guidinfo(struct ib_smp *smp, + struct ib_device *ibdev) +{ + u32 startgx = 8 * be32_to_cpu(smp->attr_mod); + __be64 *p = (__be64 *) smp->data; + + /* 32 blocks of 8 64-bit GUIDs per block */ + + memset(smp->data, 0, sizeof(smp->data)); + + /* + * We only support one GUID for now. If this changes, the + * portinfo.guid_cap field needs to be updated too. + */ + if (startgx == 0) { + __be64 g = to_idev(ibdev)->dd->ipath_guid; + if (g == 0) + /* GUID 0 is illegal */ + smp->status |= IB_SMP_INVALID_FIELD; + else + /* The first is a copy of the read-only HW GUID. */ + *p = g; + } else + smp->status |= IB_SMP_INVALID_FIELD; + + return reply(smp); +} + +static void set_link_width_enabled(struct ipath_devdata *dd, u32 w) +{ + (void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, w); +} + +static void set_link_speed_enabled(struct ipath_devdata *dd, u32 s) +{ + (void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, s); +} + +static int get_overrunthreshold(struct ipath_devdata *dd) +{ + return (dd->ipath_ibcctrl >> + INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) & + INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK; +} + +/** + * set_overrunthreshold - set the overrun threshold + * @dd: the infinipath device + * @n: the new threshold + * + * Note that this will only take effect when the link state changes. + */ +static int set_overrunthreshold(struct ipath_devdata *dd, unsigned n) +{ + unsigned v; + + v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) & + INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK; + if (v != n) { + dd->ipath_ibcctrl &= + ~(INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK << + INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT); + dd->ipath_ibcctrl |= + (u64) n << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + } + return 0; +} + +static int get_phyerrthreshold(struct ipath_devdata *dd) +{ + return (dd->ipath_ibcctrl >> + INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) & + INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK; +} + +/** + * set_phyerrthreshold - set the physical error threshold + * @dd: the infinipath device + * @n: the new threshold + * + * Note that this will only take effect when the link state changes. + */ +static int set_phyerrthreshold(struct ipath_devdata *dd, unsigned n) +{ + unsigned v; + + v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) & + INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK; + if (v != n) { + dd->ipath_ibcctrl &= + ~(INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK << + INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT); + dd->ipath_ibcctrl |= + (u64) n << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + } + return 0; +} + +/** + * get_linkdowndefaultstate - get the default linkdown state + * @dd: the infinipath device + * + * Returns zero if the default is POLL, 1 if the default is SLEEP. + */ +static int get_linkdowndefaultstate(struct ipath_devdata *dd) +{ + return !!(dd->ipath_ibcctrl & INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE); +} + +static int recv_subn_get_portinfo(struct ib_smp *smp, + struct ib_device *ibdev, u8 port) +{ + struct ipath_ibdev *dev; + struct ipath_devdata *dd; + struct ib_port_info *pip = (struct ib_port_info *)smp->data; + u16 lid; + u8 ibcstat; + u8 mtu; + int ret; + + if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt) { + smp->status |= IB_SMP_INVALID_FIELD; + ret = reply(smp); + goto bail; + } + + dev = to_idev(ibdev); + dd = dev->dd; + + /* Clear all fields. Only set the non-zero fields. */ + memset(smp->data, 0, sizeof(smp->data)); + + /* Only return the mkey if the protection field allows it. */ + if (smp->method == IB_MGMT_METHOD_SET || dev->mkey == smp->mkey || + dev->mkeyprot == 0) + pip->mkey = dev->mkey; + pip->gid_prefix = dev->gid_prefix; + lid = dd->ipath_lid; + pip->lid = lid ? cpu_to_be16(lid) : IB_LID_PERMISSIVE; + pip->sm_lid = cpu_to_be16(dev->sm_lid); + pip->cap_mask = cpu_to_be32(dev->port_cap_flags); + /* pip->diag_code; */ + pip->mkey_lease_period = cpu_to_be16(dev->mkey_lease_period); + pip->local_port_num = port; + pip->link_width_enabled = dd->ipath_link_width_enabled; + pip->link_width_supported = dd->ipath_link_width_supported; + pip->link_width_active = dd->ipath_link_width_active; + pip->linkspeed_portstate = dd->ipath_link_speed_supported << 4; + ibcstat = dd->ipath_lastibcstat; + /* map LinkState to IB portinfo values. */ + pip->linkspeed_portstate |= ipath_ib_linkstate(dd, ibcstat) + 1; + + pip->portphysstate_linkdown = + (ipath_cvt_physportstate[ibcstat & dd->ibcs_lts_mask] << 4) | + (get_linkdowndefaultstate(dd) ? 1 : 2); + pip->mkeyprot_resv_lmc = (dev->mkeyprot << 6) | dd->ipath_lmc; + pip->linkspeedactive_enabled = (dd->ipath_link_speed_active << 4) | + dd->ipath_link_speed_enabled; + switch (dd->ipath_ibmtu) { + case 4096: + mtu = IB_MTU_4096; + break; + case 2048: + mtu = IB_MTU_2048; + break; + case 1024: + mtu = IB_MTU_1024; + break; + case 512: + mtu = IB_MTU_512; + break; + case 256: + mtu = IB_MTU_256; + break; + default: /* oops, something is wrong */ + mtu = IB_MTU_2048; + break; + } + pip->neighbormtu_mastersmsl = (mtu << 4) | dev->sm_sl; + pip->vlcap_inittype = 0x10; /* VLCap = VL0, InitType = 0 */ + pip->vl_high_limit = dev->vl_high_limit; + /* pip->vl_arb_high_cap; // only one VL */ + /* pip->vl_arb_low_cap; // only one VL */ + /* InitTypeReply = 0 */ + /* our mtu cap depends on whether 4K MTU enabled or not */ + pip->inittypereply_mtucap = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048; + /* HCAs ignore VLStallCount and HOQLife */ + /* pip->vlstallcnt_hoqlife; */ + pip->operationalvl_pei_peo_fpi_fpo = 0x10; /* OVLs = 1 */ + pip->mkey_violations = cpu_to_be16(dev->mkey_violations); + /* P_KeyViolations are counted by hardware. */ + pip->pkey_violations = + cpu_to_be16((ipath_get_cr_errpkey(dd) - + dev->z_pkey_violations) & 0xFFFF); + pip->qkey_violations = cpu_to_be16(dev->qkey_violations); + /* Only the hardware GUID is supported for now */ + pip->guid_cap = 1; + pip->clientrereg_resv_subnetto = dev->subnet_timeout; + /* 32.768 usec. response time (guessing) */ + pip->resv_resptimevalue = 3; + pip->localphyerrors_overrunerrors = + (get_phyerrthreshold(dd) << 4) | + get_overrunthreshold(dd); + /* pip->max_credit_hint; */ + if (dev->port_cap_flags & IB_PORT_LINK_LATENCY_SUP) { + u32 v; + + v = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LINKLATENCY); + pip->link_roundtrip_latency[0] = v >> 16; + pip->link_roundtrip_latency[1] = v >> 8; + pip->link_roundtrip_latency[2] = v; + } + + ret = reply(smp); + +bail: + return ret; +} + +/** + * get_pkeys - return the PKEY table for port 0 + * @dd: the infinipath device + * @pkeys: the pkey table is placed here + */ +static int get_pkeys(struct ipath_devdata *dd, u16 * pkeys) +{ + /* always a kernel port, no locking needed */ + struct ipath_portdata *pd = dd->ipath_pd[0]; + + memcpy(pkeys, pd->port_pkeys, sizeof(pd->port_pkeys)); + + return 0; +} + +static int recv_subn_get_pkeytable(struct ib_smp *smp, + struct ib_device *ibdev) +{ + u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff); + u16 *p = (u16 *) smp->data; + __be16 *q = (__be16 *) smp->data; + + /* 64 blocks of 32 16-bit P_Key entries */ + + memset(smp->data, 0, sizeof(smp->data)); + if (startpx == 0) { + struct ipath_ibdev *dev = to_idev(ibdev); + unsigned i, n = ipath_get_npkeys(dev->dd); + + get_pkeys(dev->dd, p); + + for (i = 0; i < n; i++) + q[i] = cpu_to_be16(p[i]); + } else + smp->status |= IB_SMP_INVALID_FIELD; + + return reply(smp); +} + +static int recv_subn_set_guidinfo(struct ib_smp *smp, + struct ib_device *ibdev) +{ + /* The only GUID we support is the first read-only entry. */ + return recv_subn_get_guidinfo(smp, ibdev); +} + +/** + * set_linkdowndefaultstate - set the default linkdown state + * @dd: the infinipath device + * @sleep: the new state + * + * Note that this will only take effect when the link state changes. + */ +static int set_linkdowndefaultstate(struct ipath_devdata *dd, int sleep) +{ + if (sleep) + dd->ipath_ibcctrl |= INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE; + else + dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, + dd->ipath_ibcctrl); + return 0; +} + +/** + * recv_subn_set_portinfo - set port information + * @smp: the incoming SM packet + * @ibdev: the infiniband device + * @port: the port on the device + * + * Set Portinfo (see ch. 14.2.5.6). + */ +static int recv_subn_set_portinfo(struct ib_smp *smp, + struct ib_device *ibdev, u8 port) +{ + struct ib_port_info *pip = (struct ib_port_info *)smp->data; + struct ib_event event; + struct ipath_ibdev *dev; + struct ipath_devdata *dd; + char clientrereg = 0; + u16 lid, smlid; + u8 lwe; + u8 lse; + u8 state; + u16 lstate; + u32 mtu; + int ret, ore; + + if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt) + goto err; + + dev = to_idev(ibdev); + dd = dev->dd; + event.device = ibdev; + event.element.port_num = port; + + dev->mkey = pip->mkey; + dev->gid_prefix = pip->gid_prefix; + dev->mkey_lease_period = be16_to_cpu(pip->mkey_lease_period); + + lid = be16_to_cpu(pip->lid); + if (dd->ipath_lid != lid || + dd->ipath_lmc != (pip->mkeyprot_resv_lmc & 7)) { + /* Must be a valid unicast LID address. */ + if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE) + goto err; + ipath_set_lid(dd, lid, pip->mkeyprot_resv_lmc & 7); + event.event = IB_EVENT_LID_CHANGE; + ib_dispatch_event(&event); + } + + smlid = be16_to_cpu(pip->sm_lid); + if (smlid != dev->sm_lid) { + /* Must be a valid unicast LID address. */ + if (smlid == 0 || smlid >= IPATH_MULTICAST_LID_BASE) + goto err; + dev->sm_lid = smlid; + event.event = IB_EVENT_SM_CHANGE; + ib_dispatch_event(&event); + } + + /* Allow 1x or 4x to be set (see 14.2.6.6). */ + lwe = pip->link_width_enabled; + if (lwe) { + if (lwe == 0xFF) + lwe = dd->ipath_link_width_supported; + else if (lwe >= 16 || (lwe & ~dd->ipath_link_width_supported)) + goto err; + set_link_width_enabled(dd, lwe); + } + + /* Allow 2.5 or 5.0 Gbs. */ + lse = pip->linkspeedactive_enabled & 0xF; + if (lse) { + if (lse == 15) + lse = dd->ipath_link_speed_supported; + else if (lse >= 8 || (lse & ~dd->ipath_link_speed_supported)) + goto err; + set_link_speed_enabled(dd, lse); + } + + /* Set link down default state. */ + switch (pip->portphysstate_linkdown & 0xF) { + case 0: /* NOP */ + break; + case 1: /* SLEEP */ + if (set_linkdowndefaultstate(dd, 1)) + goto err; + break; + case 2: /* POLL */ + if (set_linkdowndefaultstate(dd, 0)) + goto err; + break; + default: + goto err; + } + + dev->mkeyprot = pip->mkeyprot_resv_lmc >> 6; + dev->vl_high_limit = pip->vl_high_limit; + + switch ((pip->neighbormtu_mastersmsl >> 4) & 0xF) { + case IB_MTU_256: + mtu = 256; + break; + case IB_MTU_512: + mtu = 512; + break; + case IB_MTU_1024: + mtu = 1024; + break; + case IB_MTU_2048: + mtu = 2048; + break; + case IB_MTU_4096: + if (!ipath_mtu4096) + goto err; + mtu = 4096; + break; + default: + /* XXX We have already partially updated our state! */ + goto err; + } + ipath_set_mtu(dd, mtu); + + dev->sm_sl = pip->neighbormtu_mastersmsl & 0xF; + + /* We only support VL0 */ + if (((pip->operationalvl_pei_peo_fpi_fpo >> 4) & 0xF) > 1) + goto err; + + if (pip->mkey_violations == 0) + dev->mkey_violations = 0; + + /* + * Hardware counter can't be reset so snapshot and subtract + * later. + */ + if (pip->pkey_violations == 0) + dev->z_pkey_violations = ipath_get_cr_errpkey(dd); + + if (pip->qkey_violations == 0) + dev->qkey_violations = 0; + + ore = pip->localphyerrors_overrunerrors; + if (set_phyerrthreshold(dd, (ore >> 4) & 0xF)) + goto err; + + if (set_overrunthreshold(dd, (ore & 0xF))) + goto err; + + dev->subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F; + + if (pip->clientrereg_resv_subnetto & 0x80) { + clientrereg = 1; + event.event = IB_EVENT_CLIENT_REREGISTER; + ib_dispatch_event(&event); + } + + /* + * Do the port state change now that the other link parameters + * have been set. + * Changing the port physical state only makes sense if the link + * is down or is being set to down. + */ + state = pip->linkspeed_portstate & 0xF; + lstate = (pip->portphysstate_linkdown >> 4) & 0xF; + if (lstate && !(state == IB_PORT_DOWN || state == IB_PORT_NOP)) + goto err; + + /* + * Only state changes of DOWN, ARM, and ACTIVE are valid + * and must be in the correct state to take effect (see 7.2.6). + */ + switch (state) { + case IB_PORT_NOP: + if (lstate == 0) + break; + /* FALLTHROUGH */ + case IB_PORT_DOWN: + if (lstate == 0) + lstate = IPATH_IB_LINKDOWN_ONLY; + else if (lstate == 1) + lstate = IPATH_IB_LINKDOWN_SLEEP; + else if (lstate == 2) + lstate = IPATH_IB_LINKDOWN; + else if (lstate == 3) + lstate = IPATH_IB_LINKDOWN_DISABLE; + else + goto err; + ipath_set_linkstate(dd, lstate); + if (lstate == IPATH_IB_LINKDOWN_DISABLE) { + ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + goto done; + } + ipath_wait_linkstate(dd, IPATH_LINKINIT | IPATH_LINKARMED | + IPATH_LINKACTIVE, 1000); + break; + case IB_PORT_ARMED: + ipath_set_linkstate(dd, IPATH_IB_LINKARM); + break; + case IB_PORT_ACTIVE: + ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE); + break; + default: + /* XXX We have already partially updated our state! */ + goto err; + } + + ret = recv_subn_get_portinfo(smp, ibdev, port); + + if (clientrereg) + pip->clientrereg_resv_subnetto |= 0x80; + + goto done; + +err: + smp->status |= IB_SMP_INVALID_FIELD; + ret = recv_subn_get_portinfo(smp, ibdev, port); + +done: + return ret; +} + +/** + * rm_pkey - decrecment the reference count for the given PKEY + * @dd: the infinipath device + * @key: the PKEY index + * + * Return true if this was the last reference and the hardware table entry + * needs to be changed. + */ +static int rm_pkey(struct ipath_devdata *dd, u16 key) +{ + int i; + int ret; + + for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) { + if (dd->ipath_pkeys[i] != key) + continue; + if (atomic_dec_and_test(&dd->ipath_pkeyrefs[i])) { + dd->ipath_pkeys[i] = 0; + ret = 1; + goto bail; + } + break; + } + + ret = 0; + +bail: + return ret; +} + +/** + * add_pkey - add the given PKEY to the hardware table + * @dd: the infinipath device + * @key: the PKEY + * + * Return an error code if unable to add the entry, zero if no change, + * or 1 if the hardware PKEY register needs to be updated. + */ +static int add_pkey(struct ipath_devdata *dd, u16 key) +{ + int i; + u16 lkey = key & 0x7FFF; + int any = 0; + int ret; + + if (lkey == 0x7FFF) { + ret = 0; + goto bail; + } + + /* Look for an empty slot or a matching PKEY. */ + for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) { + if (!dd->ipath_pkeys[i]) { + any++; + continue; + } + /* If it matches exactly, try to increment the ref count */ + if (dd->ipath_pkeys[i] == key) { + if (atomic_inc_return(&dd->ipath_pkeyrefs[i]) > 1) { + ret = 0; + goto bail; + } + /* Lost the race. Look for an empty slot below. */ + atomic_dec(&dd->ipath_pkeyrefs[i]); + any++; + } + /* + * It makes no sense to have both the limited and unlimited + * PKEY set at the same time since the unlimited one will + * disable the limited one. + */ + if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) { + ret = -EEXIST; + goto bail; + } + } + if (!any) { + ret = -EBUSY; + goto bail; + } + for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) { + if (!dd->ipath_pkeys[i] && + atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) { + /* for ipathstats, etc. */ + ipath_stats.sps_pkeys[i] = lkey; + dd->ipath_pkeys[i] = key; + ret = 1; + goto bail; + } + } + ret = -EBUSY; + +bail: + return ret; +} + +/** + * set_pkeys - set the PKEY table for port 0 + * @dd: the infinipath device + * @pkeys: the PKEY table + */ +static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys, u8 port) +{ + struct ipath_portdata *pd; + int i; + int changed = 0; + + /* always a kernel port, no locking needed */ + pd = dd->ipath_pd[0]; + + for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) { + u16 key = pkeys[i]; + u16 okey = pd->port_pkeys[i]; + + if (key == okey) + continue; + /* + * The value of this PKEY table entry is changing. + * Remove the old entry in the hardware's array of PKEYs. + */ + if (okey & 0x7FFF) + changed |= rm_pkey(dd, okey); + if (key & 0x7FFF) { + int ret = add_pkey(dd, key); + + if (ret < 0) + key = 0; + else + changed |= ret; + } + pd->port_pkeys[i] = key; + } + if (changed) { + u64 pkey; + struct ib_event event; + + pkey = (u64) dd->ipath_pkeys[0] | + ((u64) dd->ipath_pkeys[1] << 16) | + ((u64) dd->ipath_pkeys[2] << 32) | + ((u64) dd->ipath_pkeys[3] << 48); + ipath_cdbg(VERBOSE, "p0 new pkey reg %llx\n", + (unsigned long long) pkey); + ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey, + pkey); + + event.event = IB_EVENT_PKEY_CHANGE; + event.device = &dd->verbs_dev->ibdev; + event.element.port_num = port; + ib_dispatch_event(&event); + } + return 0; +} + +static int recv_subn_set_pkeytable(struct ib_smp *smp, + struct ib_device *ibdev, u8 port) +{ + u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff); + __be16 *p = (__be16 *) smp->data; + u16 *q = (u16 *) smp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + unsigned i, n = ipath_get_npkeys(dev->dd); + + for (i = 0; i < n; i++) + q[i] = be16_to_cpu(p[i]); + + if (startpx != 0 || set_pkeys(dev->dd, q, port) != 0) + smp->status |= IB_SMP_INVALID_FIELD; + + return recv_subn_get_pkeytable(smp, ibdev); +} + +static int recv_pma_get_classportinfo(struct ib_pma_mad *pmp) +{ + struct ib_class_port_info *p = + (struct ib_class_port_info *)pmp->data; + + memset(pmp->data, 0, sizeof(pmp->data)); + + if (pmp->mad_hdr.attr_mod != 0) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + + /* Indicate AllPortSelect is valid (only one port anyway) */ + p->capability_mask = cpu_to_be16(1 << 8); + p->base_version = 1; + p->class_version = 1; + /* + * Expected response time is 4.096 usec. * 2^18 == 1.073741824 + * sec. + */ + p->resp_time_value = 18; + + return reply((struct ib_smp *) pmp); +} + +/* + * The PortSamplesControl.CounterMasks field is an array of 3 bit fields + * which specify the N'th counter's capabilities. See ch. 16.1.3.2. + * We support 5 counters which only count the mandatory quantities. + */ +#define COUNTER_MASK(q, n) (q << ((9 - n) * 3)) +#define COUNTER_MASK0_9 cpu_to_be32(COUNTER_MASK(1, 0) | \ + COUNTER_MASK(1, 1) | \ + COUNTER_MASK(1, 2) | \ + COUNTER_MASK(1, 3) | \ + COUNTER_MASK(1, 4)) + +static int recv_pma_get_portsamplescontrol(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portsamplescontrol *p = + (struct ib_pma_portsamplescontrol *)pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_cregs const *crp = dev->dd->ipath_cregs; + unsigned long flags; + u8 port_select = p->port_select; + + memset(pmp->data, 0, sizeof(pmp->data)); + + p->port_select = port_select; + if (pmp->mad_hdr.attr_mod != 0 || + (port_select != port && port_select != 0xFF)) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + /* + * Ticks are 10x the link transfer period which for 2.5Gbs is 4 + * nsec. 0 == 4 nsec., 1 == 8 nsec., ..., 255 == 1020 nsec. Sample + * intervals are counted in ticks. Since we use Linux timers, that + * count in jiffies, we can't sample for less than 1000 ticks if HZ + * == 1000 (4000 ticks if HZ is 250). link_speed_active returns 2 for + * DDR, 1 for SDR, set the tick to 1 for DDR, 0 for SDR on chips that + * have hardware support for delaying packets. + */ + if (crp->cr_psstat) + p->tick = dev->dd->ipath_link_speed_active - 1; + else + p->tick = 250; /* 1 usec. */ + p->counter_width = 4; /* 32 bit counters */ + p->counter_mask0_9 = COUNTER_MASK0_9; + spin_lock_irqsave(&dev->pending_lock, flags); + if (crp->cr_psstat) + p->sample_status = ipath_read_creg32(dev->dd, crp->cr_psstat); + else + p->sample_status = dev->pma_sample_status; + p->sample_start = cpu_to_be32(dev->pma_sample_start); + p->sample_interval = cpu_to_be32(dev->pma_sample_interval); + p->tag = cpu_to_be16(dev->pma_tag); + p->counter_select[0] = dev->pma_counter_select[0]; + p->counter_select[1] = dev->pma_counter_select[1]; + p->counter_select[2] = dev->pma_counter_select[2]; + p->counter_select[3] = dev->pma_counter_select[3]; + p->counter_select[4] = dev->pma_counter_select[4]; + spin_unlock_irqrestore(&dev->pending_lock, flags); + + return reply((struct ib_smp *) pmp); +} + +static int recv_pma_set_portsamplescontrol(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portsamplescontrol *p = + (struct ib_pma_portsamplescontrol *)pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_cregs const *crp = dev->dd->ipath_cregs; + unsigned long flags; + u8 status; + int ret; + + if (pmp->mad_hdr.attr_mod != 0 || + (p->port_select != port && p->port_select != 0xFF)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + spin_lock_irqsave(&dev->pending_lock, flags); + if (crp->cr_psstat) + status = ipath_read_creg32(dev->dd, crp->cr_psstat); + else + status = dev->pma_sample_status; + if (status == IB_PMA_SAMPLE_STATUS_DONE) { + dev->pma_sample_start = be32_to_cpu(p->sample_start); + dev->pma_sample_interval = be32_to_cpu(p->sample_interval); + dev->pma_tag = be16_to_cpu(p->tag); + dev->pma_counter_select[0] = p->counter_select[0]; + dev->pma_counter_select[1] = p->counter_select[1]; + dev->pma_counter_select[2] = p->counter_select[2]; + dev->pma_counter_select[3] = p->counter_select[3]; + dev->pma_counter_select[4] = p->counter_select[4]; + if (crp->cr_psstat) { + ipath_write_creg(dev->dd, crp->cr_psinterval, + dev->pma_sample_interval); + ipath_write_creg(dev->dd, crp->cr_psstart, + dev->pma_sample_start); + } else + dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_STARTED; + } + spin_unlock_irqrestore(&dev->pending_lock, flags); + + ret = recv_pma_get_portsamplescontrol(pmp, ibdev, port); + +bail: + return ret; +} + +static u64 get_counter(struct ipath_ibdev *dev, + struct ipath_cregs const *crp, + __be16 sel) +{ + u64 ret; + + switch (sel) { + case IB_PMA_PORT_XMIT_DATA: + ret = (crp->cr_psxmitdatacount) ? + ipath_read_creg32(dev->dd, crp->cr_psxmitdatacount) : + dev->ipath_sword; + break; + case IB_PMA_PORT_RCV_DATA: + ret = (crp->cr_psrcvdatacount) ? + ipath_read_creg32(dev->dd, crp->cr_psrcvdatacount) : + dev->ipath_rword; + break; + case IB_PMA_PORT_XMIT_PKTS: + ret = (crp->cr_psxmitpktscount) ? + ipath_read_creg32(dev->dd, crp->cr_psxmitpktscount) : + dev->ipath_spkts; + break; + case IB_PMA_PORT_RCV_PKTS: + ret = (crp->cr_psrcvpktscount) ? + ipath_read_creg32(dev->dd, crp->cr_psrcvpktscount) : + dev->ipath_rpkts; + break; + case IB_PMA_PORT_XMIT_WAIT: + ret = (crp->cr_psxmitwaitcount) ? + ipath_read_creg32(dev->dd, crp->cr_psxmitwaitcount) : + dev->ipath_xmit_wait; + break; + default: + ret = 0; + } + + return ret; +} + +static int recv_pma_get_portsamplesresult(struct ib_pma_mad *pmp, + struct ib_device *ibdev) +{ + struct ib_pma_portsamplesresult *p = + (struct ib_pma_portsamplesresult *)pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_cregs const *crp = dev->dd->ipath_cregs; + u8 status; + int i; + + memset(pmp->data, 0, sizeof(pmp->data)); + p->tag = cpu_to_be16(dev->pma_tag); + if (crp->cr_psstat) + status = ipath_read_creg32(dev->dd, crp->cr_psstat); + else + status = dev->pma_sample_status; + p->sample_status = cpu_to_be16(status); + for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++) + p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 : + cpu_to_be32( + get_counter(dev, crp, dev->pma_counter_select[i])); + + return reply((struct ib_smp *) pmp); +} + +static int recv_pma_get_portsamplesresult_ext(struct ib_pma_mad *pmp, + struct ib_device *ibdev) +{ + struct ib_pma_portsamplesresult_ext *p = + (struct ib_pma_portsamplesresult_ext *)pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_cregs const *crp = dev->dd->ipath_cregs; + u8 status; + int i; + + memset(pmp->data, 0, sizeof(pmp->data)); + p->tag = cpu_to_be16(dev->pma_tag); + if (crp->cr_psstat) + status = ipath_read_creg32(dev->dd, crp->cr_psstat); + else + status = dev->pma_sample_status; + p->sample_status = cpu_to_be16(status); + /* 64 bits */ + p->extended_width = cpu_to_be32(0x80000000); + for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++) + p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 : + cpu_to_be64( + get_counter(dev, crp, dev->pma_counter_select[i])); + + return reply((struct ib_smp *) pmp); +} + +static int recv_pma_get_portcounters(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_verbs_counters cntrs; + u8 port_select = p->port_select; + + ipath_get_counters(dev->dd, &cntrs); + + /* Adjust counters for any resets done. */ + cntrs.symbol_error_counter -= dev->z_symbol_error_counter; + cntrs.link_error_recovery_counter -= + dev->z_link_error_recovery_counter; + cntrs.link_downed_counter -= dev->z_link_downed_counter; + cntrs.port_rcv_errors += dev->rcv_errors; + cntrs.port_rcv_errors -= dev->z_port_rcv_errors; + cntrs.port_rcv_remphys_errors -= dev->z_port_rcv_remphys_errors; + cntrs.port_xmit_discards -= dev->z_port_xmit_discards; + cntrs.port_xmit_data -= dev->z_port_xmit_data; + cntrs.port_rcv_data -= dev->z_port_rcv_data; + cntrs.port_xmit_packets -= dev->z_port_xmit_packets; + cntrs.port_rcv_packets -= dev->z_port_rcv_packets; + cntrs.local_link_integrity_errors -= + dev->z_local_link_integrity_errors; + cntrs.excessive_buffer_overrun_errors -= + dev->z_excessive_buffer_overrun_errors; + cntrs.vl15_dropped -= dev->z_vl15_dropped; + cntrs.vl15_dropped += dev->n_vl15_dropped; + + memset(pmp->data, 0, sizeof(pmp->data)); + + p->port_select = port_select; + if (pmp->mad_hdr.attr_mod != 0 || + (port_select != port && port_select != 0xFF)) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + + if (cntrs.symbol_error_counter > 0xFFFFUL) + p->symbol_error_counter = cpu_to_be16(0xFFFF); + else + p->symbol_error_counter = + cpu_to_be16((u16)cntrs.symbol_error_counter); + if (cntrs.link_error_recovery_counter > 0xFFUL) + p->link_error_recovery_counter = 0xFF; + else + p->link_error_recovery_counter = + (u8)cntrs.link_error_recovery_counter; + if (cntrs.link_downed_counter > 0xFFUL) + p->link_downed_counter = 0xFF; + else + p->link_downed_counter = (u8)cntrs.link_downed_counter; + if (cntrs.port_rcv_errors > 0xFFFFUL) + p->port_rcv_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_errors = + cpu_to_be16((u16) cntrs.port_rcv_errors); + if (cntrs.port_rcv_remphys_errors > 0xFFFFUL) + p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_remphys_errors = + cpu_to_be16((u16)cntrs.port_rcv_remphys_errors); + if (cntrs.port_xmit_discards > 0xFFFFUL) + p->port_xmit_discards = cpu_to_be16(0xFFFF); + else + p->port_xmit_discards = + cpu_to_be16((u16)cntrs.port_xmit_discards); + if (cntrs.local_link_integrity_errors > 0xFUL) + cntrs.local_link_integrity_errors = 0xFUL; + if (cntrs.excessive_buffer_overrun_errors > 0xFUL) + cntrs.excessive_buffer_overrun_errors = 0xFUL; + p->link_overrun_errors = (cntrs.local_link_integrity_errors << 4) | + cntrs.excessive_buffer_overrun_errors; + if (cntrs.vl15_dropped > 0xFFFFUL) + p->vl15_dropped = cpu_to_be16(0xFFFF); + else + p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped); + if (cntrs.port_xmit_data > 0xFFFFFFFFUL) + p->port_xmit_data = cpu_to_be32(0xFFFFFFFF); + else + p->port_xmit_data = cpu_to_be32((u32)cntrs.port_xmit_data); + if (cntrs.port_rcv_data > 0xFFFFFFFFUL) + p->port_rcv_data = cpu_to_be32(0xFFFFFFFF); + else + p->port_rcv_data = cpu_to_be32((u32)cntrs.port_rcv_data); + if (cntrs.port_xmit_packets > 0xFFFFFFFFUL) + p->port_xmit_packets = cpu_to_be32(0xFFFFFFFF); + else + p->port_xmit_packets = + cpu_to_be32((u32)cntrs.port_xmit_packets); + if (cntrs.port_rcv_packets > 0xFFFFFFFFUL) + p->port_rcv_packets = cpu_to_be32(0xFFFFFFFF); + else + p->port_rcv_packets = + cpu_to_be32((u32) cntrs.port_rcv_packets); + + return reply((struct ib_smp *) pmp); +} + +static int recv_pma_get_portcounters_ext(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters_ext *p = + (struct ib_pma_portcounters_ext *)pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + u64 swords, rwords, spkts, rpkts, xwait; + u8 port_select = p->port_select; + + ipath_snapshot_counters(dev->dd, &swords, &rwords, &spkts, + &rpkts, &xwait); + + /* Adjust counters for any resets done. */ + swords -= dev->z_port_xmit_data; + rwords -= dev->z_port_rcv_data; + spkts -= dev->z_port_xmit_packets; + rpkts -= dev->z_port_rcv_packets; + + memset(pmp->data, 0, sizeof(pmp->data)); + + p->port_select = port_select; + if (pmp->mad_hdr.attr_mod != 0 || + (port_select != port && port_select != 0xFF)) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + + p->port_xmit_data = cpu_to_be64(swords); + p->port_rcv_data = cpu_to_be64(rwords); + p->port_xmit_packets = cpu_to_be64(spkts); + p->port_rcv_packets = cpu_to_be64(rpkts); + p->port_unicast_xmit_packets = cpu_to_be64(dev->n_unicast_xmit); + p->port_unicast_rcv_packets = cpu_to_be64(dev->n_unicast_rcv); + p->port_multicast_xmit_packets = cpu_to_be64(dev->n_multicast_xmit); + p->port_multicast_rcv_packets = cpu_to_be64(dev->n_multicast_rcv); + + return reply((struct ib_smp *) pmp); +} + +static int recv_pma_set_portcounters(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_verbs_counters cntrs; + + /* + * Since the HW doesn't support clearing counters, we save the + * current count and subtract it from future responses. + */ + ipath_get_counters(dev->dd, &cntrs); + + if (p->counter_select & IB_PMA_SEL_SYMBOL_ERROR) + dev->z_symbol_error_counter = cntrs.symbol_error_counter; + + if (p->counter_select & IB_PMA_SEL_LINK_ERROR_RECOVERY) + dev->z_link_error_recovery_counter = + cntrs.link_error_recovery_counter; + + if (p->counter_select & IB_PMA_SEL_LINK_DOWNED) + dev->z_link_downed_counter = cntrs.link_downed_counter; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_ERRORS) + dev->z_port_rcv_errors = + cntrs.port_rcv_errors + dev->rcv_errors; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS) + dev->z_port_rcv_remphys_errors = + cntrs.port_rcv_remphys_errors; + + if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DISCARDS) + dev->z_port_xmit_discards = cntrs.port_xmit_discards; + + if (p->counter_select & IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS) + dev->z_local_link_integrity_errors = + cntrs.local_link_integrity_errors; + + if (p->counter_select & IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS) + dev->z_excessive_buffer_overrun_errors = + cntrs.excessive_buffer_overrun_errors; + + if (p->counter_select & IB_PMA_SEL_PORT_VL15_DROPPED) { + dev->n_vl15_dropped = 0; + dev->z_vl15_dropped = cntrs.vl15_dropped; + } + + if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DATA) + dev->z_port_xmit_data = cntrs.port_xmit_data; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_DATA) + dev->z_port_rcv_data = cntrs.port_rcv_data; + + if (p->counter_select & IB_PMA_SEL_PORT_XMIT_PACKETS) + dev->z_port_xmit_packets = cntrs.port_xmit_packets; + + if (p->counter_select & IB_PMA_SEL_PORT_RCV_PACKETS) + dev->z_port_rcv_packets = cntrs.port_rcv_packets; + + return recv_pma_get_portcounters(pmp, ibdev, port); +} + +static int recv_pma_set_portcounters_ext(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + struct ipath_ibdev *dev = to_idev(ibdev); + u64 swords, rwords, spkts, rpkts, xwait; + + ipath_snapshot_counters(dev->dd, &swords, &rwords, &spkts, + &rpkts, &xwait); + + if (p->counter_select & IB_PMA_SELX_PORT_XMIT_DATA) + dev->z_port_xmit_data = swords; + + if (p->counter_select & IB_PMA_SELX_PORT_RCV_DATA) + dev->z_port_rcv_data = rwords; + + if (p->counter_select & IB_PMA_SELX_PORT_XMIT_PACKETS) + dev->z_port_xmit_packets = spkts; + + if (p->counter_select & IB_PMA_SELX_PORT_RCV_PACKETS) + dev->z_port_rcv_packets = rpkts; + + if (p->counter_select & IB_PMA_SELX_PORT_UNI_XMIT_PACKETS) + dev->n_unicast_xmit = 0; + + if (p->counter_select & IB_PMA_SELX_PORT_UNI_RCV_PACKETS) + dev->n_unicast_rcv = 0; + + if (p->counter_select & IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS) + dev->n_multicast_xmit = 0; + + if (p->counter_select & IB_PMA_SELX_PORT_MULTI_RCV_PACKETS) + dev->n_multicast_rcv = 0; + + return recv_pma_get_portcounters_ext(pmp, ibdev, port); +} + +static int process_subn(struct ib_device *ibdev, int mad_flags, + u8 port_num, const struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_smp *smp = (struct ib_smp *)out_mad; + struct ipath_ibdev *dev = to_idev(ibdev); + int ret; + + *out_mad = *in_mad; + if (smp->class_version != 1) { + smp->status |= IB_SMP_UNSUP_VERSION; + ret = reply(smp); + goto bail; + } + + /* Is the mkey in the process of expiring? */ + if (dev->mkey_lease_timeout && + time_after_eq(jiffies, dev->mkey_lease_timeout)) { + /* Clear timeout and mkey protection field. */ + dev->mkey_lease_timeout = 0; + dev->mkeyprot = 0; + } + + /* + * M_Key checking depends on + * Portinfo:M_Key_protect_bits + */ + if ((mad_flags & IB_MAD_IGNORE_MKEY) == 0 && dev->mkey != 0 && + dev->mkey != smp->mkey && + (smp->method == IB_MGMT_METHOD_SET || + (smp->method == IB_MGMT_METHOD_GET && + dev->mkeyprot >= 2))) { + if (dev->mkey_violations != 0xFFFF) + ++dev->mkey_violations; + if (dev->mkey_lease_timeout || + dev->mkey_lease_period == 0) { + ret = IB_MAD_RESULT_SUCCESS | + IB_MAD_RESULT_CONSUMED; + goto bail; + } + dev->mkey_lease_timeout = jiffies + + dev->mkey_lease_period * HZ; + /* Future: Generate a trap notice. */ + ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + goto bail; + } else if (dev->mkey_lease_timeout) + dev->mkey_lease_timeout = 0; + + switch (smp->method) { + case IB_MGMT_METHOD_GET: + switch (smp->attr_id) { + case IB_SMP_ATTR_NODE_DESC: + ret = recv_subn_get_nodedescription(smp, ibdev); + goto bail; + case IB_SMP_ATTR_NODE_INFO: + ret = recv_subn_get_nodeinfo(smp, ibdev, port_num); + goto bail; + case IB_SMP_ATTR_GUID_INFO: + ret = recv_subn_get_guidinfo(smp, ibdev); + goto bail; + case IB_SMP_ATTR_PORT_INFO: + ret = recv_subn_get_portinfo(smp, ibdev, port_num); + goto bail; + case IB_SMP_ATTR_PKEY_TABLE: + ret = recv_subn_get_pkeytable(smp, ibdev); + goto bail; + case IB_SMP_ATTR_SM_INFO: + if (dev->port_cap_flags & IB_PORT_SM_DISABLED) { + ret = IB_MAD_RESULT_SUCCESS | + IB_MAD_RESULT_CONSUMED; + goto bail; + } + if (dev->port_cap_flags & IB_PORT_SM) { + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + } + /* FALLTHROUGH */ + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply(smp); + goto bail; + } + + case IB_MGMT_METHOD_SET: + switch (smp->attr_id) { + case IB_SMP_ATTR_GUID_INFO: + ret = recv_subn_set_guidinfo(smp, ibdev); + goto bail; + case IB_SMP_ATTR_PORT_INFO: + ret = recv_subn_set_portinfo(smp, ibdev, port_num); + goto bail; + case IB_SMP_ATTR_PKEY_TABLE: + ret = recv_subn_set_pkeytable(smp, ibdev, port_num); + goto bail; + case IB_SMP_ATTR_SM_INFO: + if (dev->port_cap_flags & IB_PORT_SM_DISABLED) { + ret = IB_MAD_RESULT_SUCCESS | + IB_MAD_RESULT_CONSUMED; + goto bail; + } + if (dev->port_cap_flags & IB_PORT_SM) { + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + } + /* FALLTHROUGH */ + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply(smp); + goto bail; + } + + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_REPORT: + case IB_MGMT_METHOD_REPORT_RESP: + case IB_MGMT_METHOD_TRAP_REPRESS: + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. + */ + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + default: + smp->status |= IB_SMP_UNSUP_METHOD; + ret = reply(smp); + } + +bail: + return ret; +} + +static int process_perf(struct ib_device *ibdev, u8 port_num, + const struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad; + int ret; + + *out_mad = *in_mad; + if (pmp->mad_hdr.class_version != 1) { + pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + switch (pmp->mad_hdr.method) { + case IB_MGMT_METHOD_GET: + switch (pmp->mad_hdr.attr_id) { + case IB_PMA_CLASS_PORT_INFO: + ret = recv_pma_get_classportinfo(pmp); + goto bail; + case IB_PMA_PORT_SAMPLES_CONTROL: + ret = recv_pma_get_portsamplescontrol(pmp, ibdev, + port_num); + goto bail; + case IB_PMA_PORT_SAMPLES_RESULT: + ret = recv_pma_get_portsamplesresult(pmp, ibdev); + goto bail; + case IB_PMA_PORT_SAMPLES_RESULT_EXT: + ret = recv_pma_get_portsamplesresult_ext(pmp, + ibdev); + goto bail; + case IB_PMA_PORT_COUNTERS: + ret = recv_pma_get_portcounters(pmp, ibdev, + port_num); + goto bail; + case IB_PMA_PORT_COUNTERS_EXT: + ret = recv_pma_get_portcounters_ext(pmp, ibdev, + port_num); + goto bail; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + case IB_MGMT_METHOD_SET: + switch (pmp->mad_hdr.attr_id) { + case IB_PMA_PORT_SAMPLES_CONTROL: + ret = recv_pma_set_portsamplescontrol(pmp, ibdev, + port_num); + goto bail; + case IB_PMA_PORT_COUNTERS: + ret = recv_pma_set_portcounters(pmp, ibdev, + port_num); + goto bail; + case IB_PMA_PORT_COUNTERS_EXT: + ret = recv_pma_set_portcounters_ext(pmp, ibdev, + port_num); + goto bail; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_smp *) pmp); + goto bail; + } + + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. + */ + ret = IB_MAD_RESULT_SUCCESS; + goto bail; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD; + ret = reply((struct ib_smp *) pmp); + } + +bail: + return ret; +} + +/** + * ipath_process_mad - process an incoming MAD packet + * @ibdev: the infiniband device this packet came in on + * @mad_flags: MAD flags + * @port_num: the port number this packet came in on + * @in_wc: the work completion entry for this packet + * @in_grh: the global route header for this packet + * @in_mad: the incoming MAD + * @out_mad: any outgoing MAD reply + * + * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not + * interested in processing. + * + * Note that the verbs framework has already done the MAD sanity checks, + * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE + * MADs. + * + * This is called by the ib_mad module. + */ +int ipath_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + int ret; + const struct ib_mad *in_mad = (const struct ib_mad *)in; + struct ib_mad *out_mad = (struct ib_mad *)out; + + if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || + *out_mad_size != sizeof(*out_mad))) + return IB_MAD_RESULT_FAILURE; + + switch (in_mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + ret = process_subn(ibdev, mad_flags, port_num, + in_mad, out_mad); + goto bail; + case IB_MGMT_CLASS_PERF_MGMT: + ret = process_perf(ibdev, port_num, in_mad, out_mad); + goto bail; + default: + ret = IB_MAD_RESULT_SUCCESS; + } + +bail: + return ret; +} diff --git a/drivers/staging/rdma/ipath/ipath_mmap.c b/drivers/staging/rdma/ipath/ipath_mmap.c new file mode 100644 index 000000000..e73274229 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_mmap.c @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include "ipath_verbs.h" + +/** + * ipath_release_mmap_info - free mmap info structure + * @ref: a pointer to the kref within struct ipath_mmap_info + */ +void ipath_release_mmap_info(struct kref *ref) +{ + struct ipath_mmap_info *ip = + container_of(ref, struct ipath_mmap_info, ref); + struct ipath_ibdev *dev = to_idev(ip->context->device); + + spin_lock_irq(&dev->pending_lock); + list_del(&ip->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + + vfree(ip->obj); + kfree(ip); +} + +/* + * open and close keep track of how many times the CQ is mapped, + * to avoid releasing it. + */ +static void ipath_vma_open(struct vm_area_struct *vma) +{ + struct ipath_mmap_info *ip = vma->vm_private_data; + + kref_get(&ip->ref); +} + +static void ipath_vma_close(struct vm_area_struct *vma) +{ + struct ipath_mmap_info *ip = vma->vm_private_data; + + kref_put(&ip->ref, ipath_release_mmap_info); +} + +static const struct vm_operations_struct ipath_vm_ops = { + .open = ipath_vma_open, + .close = ipath_vma_close, +}; + +/** + * ipath_mmap - create a new mmap region + * @context: the IB user context of the process making the mmap() call + * @vma: the VMA to be initialized + * Return zero if the mmap is OK. Otherwise, return an errno. + */ +int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct ipath_ibdev *dev = to_idev(context->device); + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long size = vma->vm_end - vma->vm_start; + struct ipath_mmap_info *ip, *pp; + int ret = -EINVAL; + + /* + * Search the device's list of objects waiting for a mmap call. + * Normally, this list is very short since a call to create a + * CQ, QP, or SRQ is soon followed by a call to mmap(). + */ + spin_lock_irq(&dev->pending_lock); + list_for_each_entry_safe(ip, pp, &dev->pending_mmaps, + pending_mmaps) { + /* Only the creator is allowed to mmap the object */ + if (context != ip->context || (__u64) offset != ip->offset) + continue; + /* Don't allow a mmap larger than the object. */ + if (size > ip->size) + break; + + list_del_init(&ip->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + + ret = remap_vmalloc_range(vma, ip->obj, 0); + if (ret) + goto done; + vma->vm_ops = &ipath_vm_ops; + vma->vm_private_data = ip; + ipath_vma_open(vma); + goto done; + } + spin_unlock_irq(&dev->pending_lock); +done: + return ret; +} + +/* + * Allocate information for ipath_mmap + */ +struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev, + u32 size, + struct ib_ucontext *context, + void *obj) { + struct ipath_mmap_info *ip; + + ip = kmalloc(sizeof *ip, GFP_KERNEL); + if (!ip) + goto bail; + + size = PAGE_ALIGN(size); + + spin_lock_irq(&dev->mmap_offset_lock); + if (dev->mmap_offset == 0) + dev->mmap_offset = PAGE_SIZE; + ip->offset = dev->mmap_offset; + dev->mmap_offset += size; + spin_unlock_irq(&dev->mmap_offset_lock); + + INIT_LIST_HEAD(&ip->pending_mmaps); + ip->size = size; + ip->context = context; + ip->obj = obj; + kref_init(&ip->ref); + +bail: + return ip; +} + +void ipath_update_mmap_info(struct ipath_ibdev *dev, + struct ipath_mmap_info *ip, + u32 size, void *obj) { + size = PAGE_ALIGN(size); + + spin_lock_irq(&dev->mmap_offset_lock); + if (dev->mmap_offset == 0) + dev->mmap_offset = PAGE_SIZE; + ip->offset = dev->mmap_offset; + dev->mmap_offset += size; + spin_unlock_irq(&dev->mmap_offset_lock); + + ip->size = size; + ip->obj = obj; +} diff --git a/drivers/staging/rdma/ipath/ipath_mr.c b/drivers/staging/rdma/ipath/ipath_mr.c new file mode 100644 index 000000000..c7278f6a8 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_mr.c @@ -0,0 +1,425 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include +#include +#include + +#include "ipath_verbs.h" + +/* Fast memory region */ +struct ipath_fmr { + struct ib_fmr ibfmr; + u8 page_shift; + struct ipath_mregion mr; /* must be last */ +}; + +static inline struct ipath_fmr *to_ifmr(struct ib_fmr *ibfmr) +{ + return container_of(ibfmr, struct ipath_fmr, ibfmr); +} + +/** + * ipath_get_dma_mr - get a DMA memory region + * @pd: protection domain for this memory region + * @acc: access flags + * + * Returns the memory region on success, otherwise returns an errno. + * Note that all DMA addresses should be created via the + * struct ib_dma_mapping_ops functions (see ipath_dma.c). + */ +struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc) +{ + struct ipath_mr *mr; + struct ib_mr *ret; + + mr = kzalloc(sizeof *mr, GFP_KERNEL); + if (!mr) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + mr->mr.access_flags = acc; + ret = &mr->ibmr; + +bail: + return ret; +} + +static struct ipath_mr *alloc_mr(int count, + struct ipath_lkey_table *lk_table) +{ + struct ipath_mr *mr; + int m, i = 0; + + /* Allocate struct plus pointers to first level page tables. */ + m = (count + IPATH_SEGSZ - 1) / IPATH_SEGSZ; + mr = kmalloc(sizeof *mr + m * sizeof mr->mr.map[0], GFP_KERNEL); + if (!mr) + goto done; + + /* Allocate first level page tables. */ + for (; i < m; i++) { + mr->mr.map[i] = kmalloc(sizeof *mr->mr.map[0], GFP_KERNEL); + if (!mr->mr.map[i]) + goto bail; + } + mr->mr.mapsz = m; + + /* + * ib_reg_phys_mr() will initialize mr->ibmr except for + * lkey and rkey. + */ + if (!ipath_alloc_lkey(lk_table, &mr->mr)) + goto bail; + mr->ibmr.rkey = mr->ibmr.lkey = mr->mr.lkey; + + goto done; + +bail: + while (i) { + i--; + kfree(mr->mr.map[i]); + } + kfree(mr); + mr = NULL; + +done: + return mr; +} + +/** + * ipath_reg_phys_mr - register a physical memory region + * @pd: protection domain for this memory region + * @buffer_list: pointer to the list of physical buffers to register + * @num_phys_buf: the number of physical buffers to register + * @iova_start: the starting address passed over IB which maps to this MR + * + * Returns the memory region on success, otherwise returns an errno. + */ +struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, int acc, u64 *iova_start) +{ + struct ipath_mr *mr; + int n, m, i; + struct ib_mr *ret; + + mr = alloc_mr(num_phys_buf, &to_idev(pd->device)->lk_table); + if (mr == NULL) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + mr->mr.pd = pd; + mr->mr.user_base = *iova_start; + mr->mr.iova = *iova_start; + mr->mr.length = 0; + mr->mr.offset = 0; + mr->mr.access_flags = acc; + mr->mr.max_segs = num_phys_buf; + mr->umem = NULL; + + m = 0; + n = 0; + for (i = 0; i < num_phys_buf; i++) { + mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr; + mr->mr.map[m]->segs[n].length = buffer_list[i].size; + mr->mr.length += buffer_list[i].size; + n++; + if (n == IPATH_SEGSZ) { + m++; + n = 0; + } + } + + ret = &mr->ibmr; + +bail: + return ret; +} + +/** + * ipath_reg_user_mr - register a userspace memory region + * @pd: protection domain for this memory region + * @start: starting userspace address + * @length: length of region to register + * @virt_addr: virtual address to use (from HCA's point of view) + * @mr_access_flags: access flags for this memory region + * @udata: unused by the InfiniPath driver + * + * Returns the memory region on success, otherwise returns an errno. + */ +struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata) +{ + struct ipath_mr *mr; + struct ib_umem *umem; + int n, m, entry; + struct scatterlist *sg; + struct ib_mr *ret; + + if (length == 0) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + umem = ib_umem_get(pd->uobject->context, start, length, + mr_access_flags, 0); + if (IS_ERR(umem)) + return (void *) umem; + + n = umem->nmap; + mr = alloc_mr(n, &to_idev(pd->device)->lk_table); + if (!mr) { + ret = ERR_PTR(-ENOMEM); + ib_umem_release(umem); + goto bail; + } + + mr->mr.pd = pd; + mr->mr.user_base = start; + mr->mr.iova = virt_addr; + mr->mr.length = length; + mr->mr.offset = ib_umem_offset(umem); + mr->mr.access_flags = mr_access_flags; + mr->mr.max_segs = n; + mr->umem = umem; + + m = 0; + n = 0; + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + void *vaddr; + + vaddr = page_address(sg_page(sg)); + if (!vaddr) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + mr->mr.map[m]->segs[n].vaddr = vaddr; + mr->mr.map[m]->segs[n].length = umem->page_size; + n++; + if (n == IPATH_SEGSZ) { + m++; + n = 0; + } + } + ret = &mr->ibmr; + +bail: + return ret; +} + +/** + * ipath_dereg_mr - unregister and free a memory region + * @ibmr: the memory region to free + * + * Returns 0 on success. + * + * Note that this is called to free MRs created by ipath_get_dma_mr() + * or ipath_reg_user_mr(). + */ +int ipath_dereg_mr(struct ib_mr *ibmr) +{ + struct ipath_mr *mr = to_imr(ibmr); + int i; + + ipath_free_lkey(&to_idev(ibmr->device)->lk_table, ibmr->lkey); + i = mr->mr.mapsz; + while (i) { + i--; + kfree(mr->mr.map[i]); + } + + if (mr->umem) + ib_umem_release(mr->umem); + + kfree(mr); + return 0; +} + +/** + * ipath_alloc_fmr - allocate a fast memory region + * @pd: the protection domain for this memory region + * @mr_access_flags: access flags for this memory region + * @fmr_attr: fast memory region attributes + * + * Returns the memory region on success, otherwise returns an errno. + */ +struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr) +{ + struct ipath_fmr *fmr; + int m, i = 0; + struct ib_fmr *ret; + + /* Allocate struct plus pointers to first level page tables. */ + m = (fmr_attr->max_pages + IPATH_SEGSZ - 1) / IPATH_SEGSZ; + fmr = kmalloc(sizeof *fmr + m * sizeof fmr->mr.map[0], GFP_KERNEL); + if (!fmr) + goto bail; + + /* Allocate first level page tables. */ + for (; i < m; i++) { + fmr->mr.map[i] = kmalloc(sizeof *fmr->mr.map[0], + GFP_KERNEL); + if (!fmr->mr.map[i]) + goto bail; + } + fmr->mr.mapsz = m; + + /* + * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey & + * rkey. + */ + if (!ipath_alloc_lkey(&to_idev(pd->device)->lk_table, &fmr->mr)) + goto bail; + fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mr.lkey; + /* + * Resources are allocated but no valid mapping (RKEY can't be + * used). + */ + fmr->mr.pd = pd; + fmr->mr.user_base = 0; + fmr->mr.iova = 0; + fmr->mr.length = 0; + fmr->mr.offset = 0; + fmr->mr.access_flags = mr_access_flags; + fmr->mr.max_segs = fmr_attr->max_pages; + fmr->page_shift = fmr_attr->page_shift; + + ret = &fmr->ibfmr; + goto done; + +bail: + while (i) + kfree(fmr->mr.map[--i]); + kfree(fmr); + ret = ERR_PTR(-ENOMEM); + +done: + return ret; +} + +/** + * ipath_map_phys_fmr - set up a fast memory region + * @ibmfr: the fast memory region to set up + * @page_list: the list of pages to associate with the fast memory region + * @list_len: the number of pages to associate with the fast memory region + * @iova: the virtual address of the start of the fast memory region + * + * This may be called from interrupt context. + */ + +int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list, + int list_len, u64 iova) +{ + struct ipath_fmr *fmr = to_ifmr(ibfmr); + struct ipath_lkey_table *rkt; + unsigned long flags; + int m, n, i; + u32 ps; + int ret; + + if (list_len > fmr->mr.max_segs) { + ret = -EINVAL; + goto bail; + } + rkt = &to_idev(ibfmr->device)->lk_table; + spin_lock_irqsave(&rkt->lock, flags); + fmr->mr.user_base = iova; + fmr->mr.iova = iova; + ps = 1 << fmr->page_shift; + fmr->mr.length = list_len * ps; + m = 0; + n = 0; + ps = 1 << fmr->page_shift; + for (i = 0; i < list_len; i++) { + fmr->mr.map[m]->segs[n].vaddr = (void *) page_list[i]; + fmr->mr.map[m]->segs[n].length = ps; + if (++n == IPATH_SEGSZ) { + m++; + n = 0; + } + } + spin_unlock_irqrestore(&rkt->lock, flags); + ret = 0; + +bail: + return ret; +} + +/** + * ipath_unmap_fmr - unmap fast memory regions + * @fmr_list: the list of fast memory regions to unmap + * + * Returns 0 on success. + */ +int ipath_unmap_fmr(struct list_head *fmr_list) +{ + struct ipath_fmr *fmr; + struct ipath_lkey_table *rkt; + unsigned long flags; + + list_for_each_entry(fmr, fmr_list, ibfmr.list) { + rkt = &to_idev(fmr->ibfmr.device)->lk_table; + spin_lock_irqsave(&rkt->lock, flags); + fmr->mr.user_base = 0; + fmr->mr.iova = 0; + fmr->mr.length = 0; + spin_unlock_irqrestore(&rkt->lock, flags); + } + return 0; +} + +/** + * ipath_dealloc_fmr - deallocate a fast memory region + * @ibfmr: the fast memory region to deallocate + * + * Returns 0 on success. + */ +int ipath_dealloc_fmr(struct ib_fmr *ibfmr) +{ + struct ipath_fmr *fmr = to_ifmr(ibfmr); + int i; + + ipath_free_lkey(&to_idev(ibfmr->device)->lk_table, ibfmr->lkey); + i = fmr->mr.mapsz; + while (i) + kfree(fmr->mr.map[--i]); + kfree(fmr); + return 0; +} diff --git a/drivers/staging/rdma/ipath/ipath_qp.c b/drivers/staging/rdma/ipath/ipath_qp.c new file mode 100644 index 000000000..face87602 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_qp.c @@ -0,0 +1,1080 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "ipath_verbs.h" +#include "ipath_kernel.h" + +#define BITS_PER_PAGE (PAGE_SIZE*BITS_PER_BYTE) +#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) +#define mk_qpn(qpt, map, off) (((map) - (qpt)->map) * BITS_PER_PAGE + \ + (off)) +#define find_next_offset(map, off) find_next_zero_bit((map)->page, \ + BITS_PER_PAGE, off) + +/* + * Convert the AETH credit code into the number of credits. + */ +static u32 credit_table[31] = { + 0, /* 0 */ + 1, /* 1 */ + 2, /* 2 */ + 3, /* 3 */ + 4, /* 4 */ + 6, /* 5 */ + 8, /* 6 */ + 12, /* 7 */ + 16, /* 8 */ + 24, /* 9 */ + 32, /* A */ + 48, /* B */ + 64, /* C */ + 96, /* D */ + 128, /* E */ + 192, /* F */ + 256, /* 10 */ + 384, /* 11 */ + 512, /* 12 */ + 768, /* 13 */ + 1024, /* 14 */ + 1536, /* 15 */ + 2048, /* 16 */ + 3072, /* 17 */ + 4096, /* 18 */ + 6144, /* 19 */ + 8192, /* 1A */ + 12288, /* 1B */ + 16384, /* 1C */ + 24576, /* 1D */ + 32768 /* 1E */ +}; + + +static void get_map_page(struct ipath_qp_table *qpt, struct qpn_map *map) +{ + unsigned long page = get_zeroed_page(GFP_KERNEL); + unsigned long flags; + + /* + * Free the page if someone raced with us installing it. + */ + + spin_lock_irqsave(&qpt->lock, flags); + if (map->page) + free_page(page); + else + map->page = (void *)page; + spin_unlock_irqrestore(&qpt->lock, flags); +} + + +static int alloc_qpn(struct ipath_qp_table *qpt, enum ib_qp_type type) +{ + u32 i, offset, max_scan, qpn; + struct qpn_map *map; + u32 ret = -1; + + if (type == IB_QPT_SMI) + ret = 0; + else if (type == IB_QPT_GSI) + ret = 1; + + if (ret != -1) { + map = &qpt->map[0]; + if (unlikely(!map->page)) { + get_map_page(qpt, map); + if (unlikely(!map->page)) { + ret = -ENOMEM; + goto bail; + } + } + if (!test_and_set_bit(ret, map->page)) + atomic_dec(&map->n_free); + else + ret = -EBUSY; + goto bail; + } + + qpn = qpt->last + 1; + if (qpn >= QPN_MAX) + qpn = 2; + offset = qpn & BITS_PER_PAGE_MASK; + map = &qpt->map[qpn / BITS_PER_PAGE]; + max_scan = qpt->nmaps - !offset; + for (i = 0;;) { + if (unlikely(!map->page)) { + get_map_page(qpt, map); + if (unlikely(!map->page)) + break; + } + if (likely(atomic_read(&map->n_free))) { + do { + if (!test_and_set_bit(offset, map->page)) { + atomic_dec(&map->n_free); + qpt->last = qpn; + ret = qpn; + goto bail; + } + offset = find_next_offset(map, offset); + qpn = mk_qpn(qpt, map, offset); + /* + * This test differs from alloc_pidmap(). + * If find_next_offset() does find a zero + * bit, we don't need to check for QPN + * wrapping around past our starting QPN. + * We just need to be sure we don't loop + * forever. + */ + } while (offset < BITS_PER_PAGE && qpn < QPN_MAX); + } + /* + * In order to keep the number of pages allocated to a + * minimum, we scan the all existing pages before increasing + * the size of the bitmap table. + */ + if (++i > max_scan) { + if (qpt->nmaps == QPNMAP_ENTRIES) + break; + map = &qpt->map[qpt->nmaps++]; + offset = 0; + } else if (map < &qpt->map[qpt->nmaps]) { + ++map; + offset = 0; + } else { + map = &qpt->map[0]; + offset = 2; + } + qpn = mk_qpn(qpt, map, offset); + } + + ret = -ENOMEM; + +bail: + return ret; +} + +static void free_qpn(struct ipath_qp_table *qpt, u32 qpn) +{ + struct qpn_map *map; + + map = qpt->map + qpn / BITS_PER_PAGE; + if (map->page) + clear_bit(qpn & BITS_PER_PAGE_MASK, map->page); + atomic_inc(&map->n_free); +} + +/** + * ipath_alloc_qpn - allocate a QP number + * @qpt: the QP table + * @qp: the QP + * @type: the QP type (IB_QPT_SMI and IB_QPT_GSI are special) + * + * Allocate the next available QPN and put the QP into the hash table. + * The hash table holds a reference to the QP. + */ +static int ipath_alloc_qpn(struct ipath_qp_table *qpt, struct ipath_qp *qp, + enum ib_qp_type type) +{ + unsigned long flags; + int ret; + + ret = alloc_qpn(qpt, type); + if (ret < 0) + goto bail; + qp->ibqp.qp_num = ret; + + /* Add the QP to the hash table. */ + spin_lock_irqsave(&qpt->lock, flags); + + ret %= qpt->max; + qp->next = qpt->table[ret]; + qpt->table[ret] = qp; + atomic_inc(&qp->refcount); + + spin_unlock_irqrestore(&qpt->lock, flags); + ret = 0; + +bail: + return ret; +} + +/** + * ipath_free_qp - remove a QP from the QP table + * @qpt: the QP table + * @qp: the QP to remove + * + * Remove the QP from the table so it can't be found asynchronously by + * the receive interrupt routine. + */ +static void ipath_free_qp(struct ipath_qp_table *qpt, struct ipath_qp *qp) +{ + struct ipath_qp *q, **qpp; + unsigned long flags; + + spin_lock_irqsave(&qpt->lock, flags); + + /* Remove QP from the hash table. */ + qpp = &qpt->table[qp->ibqp.qp_num % qpt->max]; + for (; (q = *qpp) != NULL; qpp = &q->next) { + if (q == qp) { + *qpp = qp->next; + qp->next = NULL; + atomic_dec(&qp->refcount); + break; + } + } + + spin_unlock_irqrestore(&qpt->lock, flags); +} + +/** + * ipath_free_all_qps - check for QPs still in use + * @qpt: the QP table to empty + * + * There should not be any QPs still in use. + * Free memory for table. + */ +unsigned ipath_free_all_qps(struct ipath_qp_table *qpt) +{ + unsigned long flags; + struct ipath_qp *qp; + u32 n, qp_inuse = 0; + + spin_lock_irqsave(&qpt->lock, flags); + for (n = 0; n < qpt->max; n++) { + qp = qpt->table[n]; + qpt->table[n] = NULL; + + for (; qp; qp = qp->next) + qp_inuse++; + } + spin_unlock_irqrestore(&qpt->lock, flags); + + for (n = 0; n < ARRAY_SIZE(qpt->map); n++) + if (qpt->map[n].page) + free_page((unsigned long) qpt->map[n].page); + return qp_inuse; +} + +/** + * ipath_lookup_qpn - return the QP with the given QPN + * @qpt: the QP table + * @qpn: the QP number to look up + * + * The caller is responsible for decrementing the QP reference count + * when done. + */ +struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn) +{ + unsigned long flags; + struct ipath_qp *qp; + + spin_lock_irqsave(&qpt->lock, flags); + + for (qp = qpt->table[qpn % qpt->max]; qp; qp = qp->next) { + if (qp->ibqp.qp_num == qpn) { + atomic_inc(&qp->refcount); + break; + } + } + + spin_unlock_irqrestore(&qpt->lock, flags); + return qp; +} + +/** + * ipath_reset_qp - initialize the QP state to the reset state + * @qp: the QP to reset + * @type: the QP type + */ +static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type) +{ + qp->remote_qpn = 0; + qp->qkey = 0; + qp->qp_access_flags = 0; + atomic_set(&qp->s_dma_busy, 0); + qp->s_flags &= IPATH_S_SIGNAL_REQ_WR; + qp->s_hdrwords = 0; + qp->s_wqe = NULL; + qp->s_pkt_delay = 0; + qp->s_draining = 0; + qp->s_psn = 0; + qp->r_psn = 0; + qp->r_msn = 0; + if (type == IB_QPT_RC) { + qp->s_state = IB_OPCODE_RC_SEND_LAST; + qp->r_state = IB_OPCODE_RC_SEND_LAST; + } else { + qp->s_state = IB_OPCODE_UC_SEND_LAST; + qp->r_state = IB_OPCODE_UC_SEND_LAST; + } + qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE; + qp->r_nak_state = 0; + qp->r_aflags = 0; + qp->r_flags = 0; + qp->s_rnr_timeout = 0; + qp->s_head = 0; + qp->s_tail = 0; + qp->s_cur = 0; + qp->s_last = 0; + qp->s_ssn = 1; + qp->s_lsn = 0; + memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue)); + qp->r_head_ack_queue = 0; + qp->s_tail_ack_queue = 0; + qp->s_num_rd_atomic = 0; + if (qp->r_rq.wq) { + qp->r_rq.wq->head = 0; + qp->r_rq.wq->tail = 0; + } +} + +/** + * ipath_error_qp - put a QP into the error state + * @qp: the QP to put into the error state + * @err: the receive completion error to signal if a RWQE is active + * + * Flushes both send and receive work queues. + * Returns true if last WQE event should be generated. + * The QP s_lock should be held and interrupts disabled. + * If we are already in error state, just return. + */ + +int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ib_wc wc; + int ret = 0; + + if (qp->state == IB_QPS_ERR) + goto bail; + + qp->state = IB_QPS_ERR; + + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->timerwait)) + list_del_init(&qp->timerwait); + if (!list_empty(&qp->piowait)) + list_del_init(&qp->piowait); + spin_unlock(&dev->pending_lock); + + /* Schedule the sending tasklet to drain the send work queue. */ + if (qp->s_last != qp->s_head) + ipath_schedule_send(qp); + + memset(&wc, 0, sizeof(wc)); + wc.qp = &qp->ibqp; + wc.opcode = IB_WC_RECV; + + if (test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) { + wc.wr_id = qp->r_wr_id; + wc.status = err; + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); + } + wc.status = IB_WC_WR_FLUSH_ERR; + + if (qp->r_rq.wq) { + struct ipath_rwq *wq; + u32 head; + u32 tail; + + spin_lock(&qp->r_rq.lock); + + /* sanity check pointers before trusting them */ + wq = qp->r_rq.wq; + head = wq->head; + if (head >= qp->r_rq.size) + head = 0; + tail = wq->tail; + if (tail >= qp->r_rq.size) + tail = 0; + while (tail != head) { + wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id; + if (++tail >= qp->r_rq.size) + tail = 0; + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); + } + wq->tail = tail; + + spin_unlock(&qp->r_rq.lock); + } else if (qp->ibqp.event_handler) + ret = 1; + +bail: + return ret; +} + +/** + * ipath_modify_qp - modify the attributes of a queue pair + * @ibqp: the queue pair who's attributes we're modifying + * @attr: the new attributes + * @attr_mask: the mask of attributes to modify + * @udata: user data for ipathverbs.so + * + * Returns 0 on success, otherwise returns an errno. + */ +int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct ipath_ibdev *dev = to_idev(ibqp->device); + struct ipath_qp *qp = to_iqp(ibqp); + enum ib_qp_state cur_state, new_state; + int lastwqe = 0; + int ret; + + spin_lock_irq(&qp->s_lock); + + cur_state = attr_mask & IB_QP_CUR_STATE ? + attr->cur_qp_state : qp->state; + new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask, IB_LINK_LAYER_UNSPECIFIED)) + goto inval; + + if (attr_mask & IB_QP_AV) { + if (attr->ah_attr.dlid == 0 || + attr->ah_attr.dlid >= IPATH_MULTICAST_LID_BASE) + goto inval; + + if ((attr->ah_attr.ah_flags & IB_AH_GRH) && + (attr->ah_attr.grh.sgid_index > 1)) + goto inval; + } + + if (attr_mask & IB_QP_PKEY_INDEX) + if (attr->pkey_index >= ipath_get_npkeys(dev->dd)) + goto inval; + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + if (attr->min_rnr_timer > 31) + goto inval; + + if (attr_mask & IB_QP_PORT) + if (attr->port_num == 0 || + attr->port_num > ibqp->device->phys_port_cnt) + goto inval; + + /* + * don't allow invalid Path MTU values or greater than 2048 + * unless we are configured for a 4KB MTU + */ + if ((attr_mask & IB_QP_PATH_MTU) && + (ib_mtu_enum_to_int(attr->path_mtu) == -1 || + (attr->path_mtu > IB_MTU_2048 && !ipath_mtu4096))) + goto inval; + + if (attr_mask & IB_QP_PATH_MIG_STATE) + if (attr->path_mig_state != IB_MIG_MIGRATED && + attr->path_mig_state != IB_MIG_REARM) + goto inval; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + if (attr->max_dest_rd_atomic > IPATH_MAX_RDMA_ATOMIC) + goto inval; + + switch (new_state) { + case IB_QPS_RESET: + if (qp->state != IB_QPS_RESET) { + qp->state = IB_QPS_RESET; + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->timerwait)) + list_del_init(&qp->timerwait); + if (!list_empty(&qp->piowait)) + list_del_init(&qp->piowait); + spin_unlock(&dev->pending_lock); + qp->s_flags &= ~IPATH_S_ANY_WAIT; + spin_unlock_irq(&qp->s_lock); + /* Stop the sending tasklet */ + tasklet_kill(&qp->s_task); + wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy)); + spin_lock_irq(&qp->s_lock); + } + ipath_reset_qp(qp, ibqp->qp_type); + break; + + case IB_QPS_SQD: + qp->s_draining = qp->s_last != qp->s_cur; + qp->state = new_state; + break; + + case IB_QPS_SQE: + if (qp->ibqp.qp_type == IB_QPT_RC) + goto inval; + qp->state = new_state; + break; + + case IB_QPS_ERR: + lastwqe = ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR); + break; + + default: + qp->state = new_state; + break; + } + + if (attr_mask & IB_QP_PKEY_INDEX) + qp->s_pkey_index = attr->pkey_index; + + if (attr_mask & IB_QP_DEST_QPN) + qp->remote_qpn = attr->dest_qp_num; + + if (attr_mask & IB_QP_SQ_PSN) { + qp->s_psn = qp->s_next_psn = attr->sq_psn; + qp->s_last_psn = qp->s_next_psn - 1; + } + + if (attr_mask & IB_QP_RQ_PSN) + qp->r_psn = attr->rq_psn; + + if (attr_mask & IB_QP_ACCESS_FLAGS) + qp->qp_access_flags = attr->qp_access_flags; + + if (attr_mask & IB_QP_AV) { + qp->remote_ah_attr = attr->ah_attr; + qp->s_dmult = ipath_ib_rate_to_mult(attr->ah_attr.static_rate); + } + + if (attr_mask & IB_QP_PATH_MTU) + qp->path_mtu = attr->path_mtu; + + if (attr_mask & IB_QP_RETRY_CNT) + qp->s_retry = qp->s_retry_cnt = attr->retry_cnt; + + if (attr_mask & IB_QP_RNR_RETRY) { + qp->s_rnr_retry = attr->rnr_retry; + if (qp->s_rnr_retry > 7) + qp->s_rnr_retry = 7; + qp->s_rnr_retry_cnt = qp->s_rnr_retry; + } + + if (attr_mask & IB_QP_MIN_RNR_TIMER) + qp->r_min_rnr_timer = attr->min_rnr_timer; + + if (attr_mask & IB_QP_TIMEOUT) + qp->timeout = attr->timeout; + + if (attr_mask & IB_QP_QKEY) + qp->qkey = attr->qkey; + + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) + qp->r_max_rd_atomic = attr->max_dest_rd_atomic; + + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) + qp->s_max_rd_atomic = attr->max_rd_atomic; + + spin_unlock_irq(&qp->s_lock); + + if (lastwqe) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } + ret = 0; + goto bail; + +inval: + spin_unlock_irq(&qp->s_lock); + ret = -EINVAL; + +bail: + return ret; +} + +int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr) +{ + struct ipath_qp *qp = to_iqp(ibqp); + + attr->qp_state = qp->state; + attr->cur_qp_state = attr->qp_state; + attr->path_mtu = qp->path_mtu; + attr->path_mig_state = 0; + attr->qkey = qp->qkey; + attr->rq_psn = qp->r_psn; + attr->sq_psn = qp->s_next_psn; + attr->dest_qp_num = qp->remote_qpn; + attr->qp_access_flags = qp->qp_access_flags; + attr->cap.max_send_wr = qp->s_size - 1; + attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1; + attr->cap.max_send_sge = qp->s_max_sge; + attr->cap.max_recv_sge = qp->r_rq.max_sge; + attr->cap.max_inline_data = 0; + attr->ah_attr = qp->remote_ah_attr; + memset(&attr->alt_ah_attr, 0, sizeof(attr->alt_ah_attr)); + attr->pkey_index = qp->s_pkey_index; + attr->alt_pkey_index = 0; + attr->en_sqd_async_notify = 0; + attr->sq_draining = qp->s_draining; + attr->max_rd_atomic = qp->s_max_rd_atomic; + attr->max_dest_rd_atomic = qp->r_max_rd_atomic; + attr->min_rnr_timer = qp->r_min_rnr_timer; + attr->port_num = 1; + attr->timeout = qp->timeout; + attr->retry_cnt = qp->s_retry_cnt; + attr->rnr_retry = qp->s_rnr_retry_cnt; + attr->alt_port_num = 0; + attr->alt_timeout = 0; + + init_attr->event_handler = qp->ibqp.event_handler; + init_attr->qp_context = qp->ibqp.qp_context; + init_attr->send_cq = qp->ibqp.send_cq; + init_attr->recv_cq = qp->ibqp.recv_cq; + init_attr->srq = qp->ibqp.srq; + init_attr->cap = attr->cap; + if (qp->s_flags & IPATH_S_SIGNAL_REQ_WR) + init_attr->sq_sig_type = IB_SIGNAL_REQ_WR; + else + init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr->qp_type = qp->ibqp.qp_type; + init_attr->port_num = 1; + return 0; +} + +/** + * ipath_compute_aeth - compute the AETH (syndrome + MSN) + * @qp: the queue pair to compute the AETH for + * + * Returns the AETH. + */ +__be32 ipath_compute_aeth(struct ipath_qp *qp) +{ + u32 aeth = qp->r_msn & IPATH_MSN_MASK; + + if (qp->ibqp.srq) { + /* + * Shared receive queues don't generate credits. + * Set the credit field to the invalid value. + */ + aeth |= IPATH_AETH_CREDIT_INVAL << IPATH_AETH_CREDIT_SHIFT; + } else { + u32 min, max, x; + u32 credits; + struct ipath_rwq *wq = qp->r_rq.wq; + u32 head; + u32 tail; + + /* sanity check pointers before trusting them */ + head = wq->head; + if (head >= qp->r_rq.size) + head = 0; + tail = wq->tail; + if (tail >= qp->r_rq.size) + tail = 0; + /* + * Compute the number of credits available (RWQEs). + * XXX Not holding the r_rq.lock here so there is a small + * chance that the pair of reads are not atomic. + */ + credits = head - tail; + if ((int)credits < 0) + credits += qp->r_rq.size; + /* + * Binary search the credit table to find the code to + * use. + */ + min = 0; + max = 31; + for (;;) { + x = (min + max) / 2; + if (credit_table[x] == credits) + break; + if (credit_table[x] > credits) + max = x; + else if (min == x) + break; + else + min = x; + } + aeth |= x << IPATH_AETH_CREDIT_SHIFT; + } + return cpu_to_be32(aeth); +} + +/** + * ipath_create_qp - create a queue pair for a device + * @ibpd: the protection domain who's device we create the queue pair for + * @init_attr: the attributes of the queue pair + * @udata: unused by InfiniPath + * + * Returns the queue pair on success, otherwise returns an errno. + * + * Called by the ib_create_qp() core verbs function. + */ +struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct ipath_qp *qp; + int err; + struct ipath_swqe *swq = NULL; + struct ipath_ibdev *dev; + size_t sz; + size_t sg_list_sz; + struct ib_qp *ret; + + if (init_attr->create_flags) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + if (init_attr->cap.max_send_sge > ib_ipath_max_sges || + init_attr->cap.max_send_wr > ib_ipath_max_qp_wrs) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + /* Check receive queue parameters if no SRQ is specified. */ + if (!init_attr->srq) { + if (init_attr->cap.max_recv_sge > ib_ipath_max_sges || + init_attr->cap.max_recv_wr > ib_ipath_max_qp_wrs) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + if (init_attr->cap.max_send_sge + + init_attr->cap.max_send_wr + + init_attr->cap.max_recv_sge + + init_attr->cap.max_recv_wr == 0) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + } + + switch (init_attr->qp_type) { + case IB_QPT_UC: + case IB_QPT_RC: + case IB_QPT_UD: + case IB_QPT_SMI: + case IB_QPT_GSI: + sz = sizeof(struct ipath_sge) * + init_attr->cap.max_send_sge + + sizeof(struct ipath_swqe); + swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz); + if (swq == NULL) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + sz = sizeof(*qp); + sg_list_sz = 0; + if (init_attr->srq) { + struct ipath_srq *srq = to_isrq(init_attr->srq); + + if (srq->rq.max_sge > 1) + sg_list_sz = sizeof(*qp->r_sg_list) * + (srq->rq.max_sge - 1); + } else if (init_attr->cap.max_recv_sge > 1) + sg_list_sz = sizeof(*qp->r_sg_list) * + (init_attr->cap.max_recv_sge - 1); + qp = kmalloc(sz + sg_list_sz, GFP_KERNEL); + if (!qp) { + ret = ERR_PTR(-ENOMEM); + goto bail_swq; + } + if (sg_list_sz && (init_attr->qp_type == IB_QPT_UD || + init_attr->qp_type == IB_QPT_SMI || + init_attr->qp_type == IB_QPT_GSI)) { + qp->r_ud_sg_list = kmalloc(sg_list_sz, GFP_KERNEL); + if (!qp->r_ud_sg_list) { + ret = ERR_PTR(-ENOMEM); + goto bail_qp; + } + } else + qp->r_ud_sg_list = NULL; + if (init_attr->srq) { + sz = 0; + qp->r_rq.size = 0; + qp->r_rq.max_sge = 0; + qp->r_rq.wq = NULL; + init_attr->cap.max_recv_wr = 0; + init_attr->cap.max_recv_sge = 0; + } else { + qp->r_rq.size = init_attr->cap.max_recv_wr + 1; + qp->r_rq.max_sge = init_attr->cap.max_recv_sge; + sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) + + sizeof(struct ipath_rwqe); + qp->r_rq.wq = vmalloc_user(sizeof(struct ipath_rwq) + + qp->r_rq.size * sz); + if (!qp->r_rq.wq) { + ret = ERR_PTR(-ENOMEM); + goto bail_sg_list; + } + } + + /* + * ib_create_qp() will initialize qp->ibqp + * except for qp->ibqp.qp_num. + */ + spin_lock_init(&qp->s_lock); + spin_lock_init(&qp->r_rq.lock); + atomic_set(&qp->refcount, 0); + init_waitqueue_head(&qp->wait); + init_waitqueue_head(&qp->wait_dma); + tasklet_init(&qp->s_task, ipath_do_send, (unsigned long)qp); + INIT_LIST_HEAD(&qp->piowait); + INIT_LIST_HEAD(&qp->timerwait); + qp->state = IB_QPS_RESET; + qp->s_wq = swq; + qp->s_size = init_attr->cap.max_send_wr + 1; + qp->s_max_sge = init_attr->cap.max_send_sge; + if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR) + qp->s_flags = IPATH_S_SIGNAL_REQ_WR; + else + qp->s_flags = 0; + dev = to_idev(ibpd->device); + err = ipath_alloc_qpn(&dev->qp_table, qp, + init_attr->qp_type); + if (err) { + ret = ERR_PTR(err); + vfree(qp->r_rq.wq); + goto bail_sg_list; + } + qp->ip = NULL; + qp->s_tx = NULL; + ipath_reset_qp(qp, init_attr->qp_type); + break; + + default: + /* Don't support raw QPs */ + ret = ERR_PTR(-ENOSYS); + goto bail; + } + + init_attr->cap.max_inline_data = 0; + + /* + * Return the address of the RWQ as the offset to mmap. + * See ipath_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + if (!qp->r_rq.wq) { + __u64 offset = 0; + + err = ib_copy_to_udata(udata, &offset, + sizeof(offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } else { + u32 s = sizeof(struct ipath_rwq) + + qp->r_rq.size * sz; + + qp->ip = + ipath_create_mmap_info(dev, s, + ibpd->uobject->context, + qp->r_rq.wq); + if (!qp->ip) { + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + err = ib_copy_to_udata(udata, &(qp->ip->offset), + sizeof(qp->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } + } + + spin_lock(&dev->n_qps_lock); + if (dev->n_qps_allocated == ib_ipath_max_qps) { + spin_unlock(&dev->n_qps_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + dev->n_qps_allocated++; + spin_unlock(&dev->n_qps_lock); + + if (qp->ip) { + spin_lock_irq(&dev->pending_lock); + list_add(&qp->ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + ret = &qp->ibqp; + goto bail; + +bail_ip: + if (qp->ip) + kref_put(&qp->ip->ref, ipath_release_mmap_info); + else + vfree(qp->r_rq.wq); + ipath_free_qp(&dev->qp_table, qp); + free_qpn(&dev->qp_table, qp->ibqp.qp_num); +bail_sg_list: + kfree(qp->r_ud_sg_list); +bail_qp: + kfree(qp); +bail_swq: + vfree(swq); +bail: + return ret; +} + +/** + * ipath_destroy_qp - destroy a queue pair + * @ibqp: the queue pair to destroy + * + * Returns 0 on success. + * + * Note that this can be called while the QP is actively sending or + * receiving! + */ +int ipath_destroy_qp(struct ib_qp *ibqp) +{ + struct ipath_qp *qp = to_iqp(ibqp); + struct ipath_ibdev *dev = to_idev(ibqp->device); + + /* Make sure HW and driver activity is stopped. */ + spin_lock_irq(&qp->s_lock); + if (qp->state != IB_QPS_RESET) { + qp->state = IB_QPS_RESET; + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->timerwait)) + list_del_init(&qp->timerwait); + if (!list_empty(&qp->piowait)) + list_del_init(&qp->piowait); + spin_unlock(&dev->pending_lock); + qp->s_flags &= ~IPATH_S_ANY_WAIT; + spin_unlock_irq(&qp->s_lock); + /* Stop the sending tasklet */ + tasklet_kill(&qp->s_task); + wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy)); + } else + spin_unlock_irq(&qp->s_lock); + + ipath_free_qp(&dev->qp_table, qp); + + if (qp->s_tx) { + atomic_dec(&qp->refcount); + if (qp->s_tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF) + kfree(qp->s_tx->txreq.map_addr); + spin_lock_irq(&dev->pending_lock); + list_add(&qp->s_tx->txreq.list, &dev->txreq_free); + spin_unlock_irq(&dev->pending_lock); + qp->s_tx = NULL; + } + + wait_event(qp->wait, !atomic_read(&qp->refcount)); + + /* all user's cleaned up, mark it available */ + free_qpn(&dev->qp_table, qp->ibqp.qp_num); + spin_lock(&dev->n_qps_lock); + dev->n_qps_allocated--; + spin_unlock(&dev->n_qps_lock); + + if (qp->ip) + kref_put(&qp->ip->ref, ipath_release_mmap_info); + else + vfree(qp->r_rq.wq); + kfree(qp->r_ud_sg_list); + vfree(qp->s_wq); + kfree(qp); + return 0; +} + +/** + * ipath_init_qp_table - initialize the QP table for a device + * @idev: the device who's QP table we're initializing + * @size: the size of the QP table + * + * Returns 0 on success, otherwise returns an errno. + */ +int ipath_init_qp_table(struct ipath_ibdev *idev, int size) +{ + int i; + int ret; + + idev->qp_table.last = 1; /* QPN 0 and 1 are special. */ + idev->qp_table.max = size; + idev->qp_table.nmaps = 1; + idev->qp_table.table = kzalloc(size * sizeof(*idev->qp_table.table), + GFP_KERNEL); + if (idev->qp_table.table == NULL) { + ret = -ENOMEM; + goto bail; + } + + for (i = 0; i < ARRAY_SIZE(idev->qp_table.map); i++) { + atomic_set(&idev->qp_table.map[i].n_free, BITS_PER_PAGE); + idev->qp_table.map[i].page = NULL; + } + + ret = 0; + +bail: + return ret; +} + +/** + * ipath_get_credit - flush the send work queue of a QP + * @qp: the qp who's send work queue to flush + * @aeth: the Acknowledge Extended Transport Header + * + * The QP s_lock should be held. + */ +void ipath_get_credit(struct ipath_qp *qp, u32 aeth) +{ + u32 credit = (aeth >> IPATH_AETH_CREDIT_SHIFT) & IPATH_AETH_CREDIT_MASK; + + /* + * If the credit is invalid, we can send + * as many packets as we like. Otherwise, we have to + * honor the credit field. + */ + if (credit == IPATH_AETH_CREDIT_INVAL) + qp->s_lsn = (u32) -1; + else if (qp->s_lsn != (u32) -1) { + /* Compute new LSN (i.e., MSN + credit) */ + credit = (aeth + credit_table[credit]) & IPATH_MSN_MASK; + if (ipath_cmp24(credit, qp->s_lsn) > 0) + qp->s_lsn = credit; + } + + /* Restart sending if it was blocked due to lack of credits. */ + if ((qp->s_flags & IPATH_S_WAIT_SSN_CREDIT) && + qp->s_cur != qp->s_head && + (qp->s_lsn == (u32) -1 || + ipath_cmp24(get_swqe_ptr(qp, qp->s_cur)->ssn, + qp->s_lsn + 1) <= 0)) + ipath_schedule_send(qp); +} diff --git a/drivers/staging/rdma/ipath/ipath_rc.c b/drivers/staging/rdma/ipath/ipath_rc.c new file mode 100644 index 000000000..79b3dbc97 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_rc.c @@ -0,0 +1,1969 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "ipath_verbs.h" +#include "ipath_kernel.h" + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_RC_##x + +static u32 restart_sge(struct ipath_sge_state *ss, struct ipath_swqe *wqe, + u32 psn, u32 pmtu) +{ + u32 len; + + len = ((psn - wqe->psn) & IPATH_PSN_MASK) * pmtu; + ss->sge = wqe->sg_list[0]; + ss->sg_list = wqe->sg_list + 1; + ss->num_sge = wqe->wr.num_sge; + ipath_skip_sge(ss, len); + return wqe->length - len; +} + +/** + * ipath_init_restart- initialize the qp->s_sge after a restart + * @qp: the QP who's SGE we're restarting + * @wqe: the work queue to initialize the QP's SGE from + * + * The QP s_lock should be held and interrupts disabled. + */ +static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe) +{ + struct ipath_ibdev *dev; + + qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, + ib_mtu_enum_to_int(qp->path_mtu)); + dev = to_idev(qp->ibqp.device); + spin_lock(&dev->pending_lock); + if (list_empty(&qp->timerwait)) + list_add_tail(&qp->timerwait, + &dev->pending[dev->pending_index]); + spin_unlock(&dev->pending_lock); +} + +/** + * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read) + * @qp: a pointer to the QP + * @ohdr: a pointer to the IB header being constructed + * @pmtu: the path MTU + * + * Return 1 if constructed; otherwise, return 0. + * Note that we are in the responder's side of the QP context. + * Note the QP s_lock must be held. + */ +static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp, + struct ipath_other_headers *ohdr, u32 pmtu) +{ + struct ipath_ack_entry *e; + u32 hwords; + u32 len; + u32 bth0; + u32 bth2; + + /* Don't send an ACK if we aren't supposed to. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto bail; + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + + switch (qp->s_ack_state) { + case OP(RDMA_READ_RESPONSE_LAST): + case OP(RDMA_READ_RESPONSE_ONLY): + case OP(ATOMIC_ACKNOWLEDGE): + /* + * We can increment the tail pointer now that the last + * response has been sent instead of only being + * constructed. + */ + if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC) + qp->s_tail_ack_queue = 0; + /* FALLTHROUGH */ + case OP(SEND_ONLY): + case OP(ACKNOWLEDGE): + /* Check for no next entry in the queue. */ + if (qp->r_head_ack_queue == qp->s_tail_ack_queue) { + if (qp->s_flags & IPATH_S_ACK_PENDING) + goto normal; + qp->s_ack_state = OP(ACKNOWLEDGE); + goto bail; + } + + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + if (e->opcode == OP(RDMA_READ_REQUEST)) { + /* Copy SGE state in case we need to resend */ + qp->s_ack_rdma_sge = e->rdma_sge; + qp->s_cur_sge = &qp->s_ack_rdma_sge; + len = e->rdma_sge.sge.sge_length; + if (len > pmtu) { + len = pmtu; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST); + } else { + qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY); + e->sent = 1; + } + ohdr->u.aeth = ipath_compute_aeth(qp); + hwords++; + qp->s_ack_rdma_psn = e->psn; + bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK; + } else { + /* COMPARE_SWAP or FETCH_ADD */ + qp->s_cur_sge = NULL; + len = 0; + qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE); + ohdr->u.at.aeth = ipath_compute_aeth(qp); + ohdr->u.at.atomic_ack_eth[0] = + cpu_to_be32(e->atomic_data >> 32); + ohdr->u.at.atomic_ack_eth[1] = + cpu_to_be32(e->atomic_data); + hwords += sizeof(ohdr->u.at) / sizeof(u32); + bth2 = e->psn; + e->sent = 1; + } + bth0 = qp->s_ack_state << 24; + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_READ_RESPONSE_MIDDLE): + len = qp->s_ack_rdma_sge.sge.sge_length; + if (len > pmtu) + len = pmtu; + else { + ohdr->u.aeth = ipath_compute_aeth(qp); + hwords++; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + qp->s_ack_queue[qp->s_tail_ack_queue].sent = 1; + } + bth0 = qp->s_ack_state << 24; + bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK; + break; + + default: + normal: + /* + * Send a regular ACK. + * Set the s_ack_state so we wait until after sending + * the ACK before setting s_ack_state to ACKNOWLEDGE + * (see above). + */ + qp->s_ack_state = OP(SEND_ONLY); + qp->s_flags &= ~IPATH_S_ACK_PENDING; + qp->s_cur_sge = NULL; + if (qp->s_nak_state) + ohdr->u.aeth = + cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) | + (qp->s_nak_state << + IPATH_AETH_CREDIT_SHIFT)); + else + ohdr->u.aeth = ipath_compute_aeth(qp); + hwords++; + len = 0; + bth0 = OP(ACKNOWLEDGE) << 24; + bth2 = qp->s_ack_psn & IPATH_PSN_MASK; + } + qp->s_hdrwords = hwords; + qp->s_cur_size = len; + ipath_make_ruc_header(dev, qp, ohdr, bth0, bth2); + return 1; + +bail: + return 0; +} + +/** + * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC) + * @qp: a pointer to the QP + * + * Return 1 if constructed; otherwise, return 0. + */ +int ipath_make_rc_req(struct ipath_qp *qp) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_other_headers *ohdr; + struct ipath_sge_state *ss; + struct ipath_swqe *wqe; + u32 hwords; + u32 len; + u32 bth0; + u32 bth2; + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + char newreq; + unsigned long flags; + int ret = 0; + + ohdr = &qp->s_hdr.u.oth; + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + ohdr = &qp->s_hdr.u.l.oth; + + /* + * The lock is needed to synchronize between the sending tasklet, + * the receive interrupt handler, and timeout resends. + */ + spin_lock_irqsave(&qp->s_lock, flags); + + /* Sending responses has higher priority over sending requests. */ + if ((qp->r_head_ack_queue != qp->s_tail_ack_queue || + (qp->s_flags & IPATH_S_ACK_PENDING) || + qp->s_ack_state != OP(ACKNOWLEDGE)) && + ipath_make_rc_ack(dev, qp, ohdr, pmtu)) + goto done; + + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) { + if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= IPATH_S_WAIT_DMA; + goto bail; + } + wqe = get_swqe_ptr(qp, qp->s_last); + ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done; + } + + /* Leave BUSY set until RNR timeout. */ + if (qp->s_rnr_timeout) { + qp->s_flags |= IPATH_S_WAITING; + goto bail; + } + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + bth0 = 1 << 22; /* Set M bit */ + + /* Send a request. */ + wqe = get_swqe_ptr(qp, qp->s_cur); + switch (qp->s_state) { + default: + if (!(ib_ipath_state_ops[qp->state] & + IPATH_PROCESS_NEXT_SEND_OK)) + goto bail; + /* + * Resend an old request or start a new one. + * + * We keep track of the current SWQE so that + * we don't reset the "furthest progress" state + * if we need to back up. + */ + newreq = 0; + if (qp->s_cur == qp->s_tail) { + /* Check if send work queue is empty. */ + if (qp->s_tail == qp->s_head) + goto bail; + /* + * If a fence is requested, wait for previous + * RDMA read and atomic operations to finish. + */ + if ((wqe->wr.send_flags & IB_SEND_FENCE) && + qp->s_num_rd_atomic) { + qp->s_flags |= IPATH_S_FENCE_PENDING; + goto bail; + } + wqe->psn = qp->s_next_psn; + newreq = 1; + } + /* + * Note that we have to be careful not to modify the + * original work request since we may need to resend + * it. + */ + len = wqe->length; + ss = &qp->s_sge; + bth2 = 0; + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + /* If no credit, return. */ + if (qp->s_lsn != (u32) -1 && + ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) { + qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT; + goto bail; + } + wqe->lpsn = wqe->psn; + if (len > pmtu) { + wqe->lpsn += (len - 1) / pmtu; + qp->s_state = OP(SEND_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_ONLY); + else { + qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + bth2 = 1 << 31; /* Request ACK. */ + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_WRITE: + if (newreq && qp->s_lsn != (u32) -1) + qp->s_lsn++; + /* FALLTHROUGH */ + case IB_WR_RDMA_WRITE_WITH_IMM: + /* If no credit, return. */ + if (qp->s_lsn != (u32) -1 && + ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) { + qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT; + goto bail; + } + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + hwords += sizeof(struct ib_reth) / sizeof(u32); + wqe->lpsn = wqe->psn; + if (len > pmtu) { + wqe->lpsn += (len - 1) / pmtu; + qp->s_state = OP(RDMA_WRITE_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_ONLY); + else { + qp->s_state = + OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after RETH */ + ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + } + bth2 = 1 << 31; /* Request ACK. */ + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_READ: + /* + * Don't allow more operations to be started + * than the QP limits allow. + */ + if (newreq) { + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= IPATH_S_RDMAR_PENDING; + goto bail; + } + qp->s_num_rd_atomic++; + if (qp->s_lsn != (u32) -1) + qp->s_lsn++; + /* + * Adjust s_next_psn to count the + * expected number of responses. + */ + if (len > pmtu) + qp->s_next_psn += (len - 1) / pmtu; + wqe->lpsn = qp->s_next_psn++; + } + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); + ss = NULL; + len = 0; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + /* + * Don't allow more operations to be started + * than the QP limits allow. + */ + if (newreq) { + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= IPATH_S_RDMAR_PENDING; + goto bail; + } + qp->s_num_rd_atomic++; + if (qp->s_lsn != (u32) -1) + qp->s_lsn++; + wqe->lpsn = wqe->psn; + } + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + qp->s_state = OP(COMPARE_SWAP); + ohdr->u.atomic_eth.swap_data = cpu_to_be64( + wqe->wr.wr.atomic.swap); + ohdr->u.atomic_eth.compare_data = cpu_to_be64( + wqe->wr.wr.atomic.compare_add); + } else { + qp->s_state = OP(FETCH_ADD); + ohdr->u.atomic_eth.swap_data = cpu_to_be64( + wqe->wr.wr.atomic.compare_add); + ohdr->u.atomic_eth.compare_data = 0; + } + ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32( + wqe->wr.wr.atomic.remote_addr >> 32); + ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32( + wqe->wr.wr.atomic.remote_addr); + ohdr->u.atomic_eth.rkey = cpu_to_be32( + wqe->wr.wr.atomic.rkey); + hwords += sizeof(struct ib_atomic_eth) / sizeof(u32); + ss = NULL; + len = 0; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + default: + goto bail; + } + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_len = wqe->length; + if (newreq) { + qp->s_tail++; + if (qp->s_tail >= qp->s_size) + qp->s_tail = 0; + } + bth2 |= qp->s_psn & IPATH_PSN_MASK; + if (wqe->wr.opcode == IB_WR_RDMA_READ) + qp->s_psn = wqe->lpsn + 1; + else { + qp->s_psn++; + if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + } + /* + * Put the QP on the pending list so lost ACKs will cause + * a retry. More than one request can be pending so the + * QP may already be on the dev->pending list. + */ + spin_lock(&dev->pending_lock); + if (list_empty(&qp->timerwait)) + list_add_tail(&qp->timerwait, + &dev->pending[dev->pending_index]); + spin_unlock(&dev->pending_lock); + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + /* + * This case can only happen if a send is restarted. + * See ipath_restart_rc(). + */ + ipath_init_restart(qp, wqe); + /* FALLTHROUGH */ + case OP(SEND_FIRST): + qp->s_state = OP(SEND_MIDDLE); + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + bth2 = qp->s_psn++ & IPATH_PSN_MASK; + if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_LAST); + else { + qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + bth2 |= 1 << 31; /* Request ACK. */ + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_LAST): + /* + * This case can only happen if a RDMA write is restarted. + * See ipath_restart_rc(). + */ + ipath_init_restart(qp, wqe); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_FIRST): + qp->s_state = OP(RDMA_WRITE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + bth2 = qp->s_psn++ & IPATH_PSN_MASK; + if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0) + qp->s_next_psn = qp->s_psn; + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_LAST); + else { + qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + } + bth2 |= 1 << 31; /* Request ACK. */ + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_MIDDLE): + /* + * This case can only happen if a RDMA read is restarted. + * See ipath_restart_rc(). + */ + ipath_init_restart(qp, wqe); + len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) * pmtu; + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); + bth2 = qp->s_psn & IPATH_PSN_MASK; + qp->s_psn = wqe->lpsn + 1; + ss = NULL; + len = 0; + qp->s_cur++; + if (qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + } + if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT - 1) >= 0) + bth2 |= 1 << 31; /* Request ACK. */ + qp->s_len -= len; + qp->s_hdrwords = hwords; + qp->s_cur_sge = ss; + qp->s_cur_size = len; + ipath_make_ruc_header(dev, qp, ohdr, bth0 | (qp->s_state << 24), bth2); +done: + ret = 1; + goto unlock; + +bail: + qp->s_flags &= ~IPATH_S_BUSY; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +/** + * send_rc_ack - Construct an ACK packet and send it + * @qp: a pointer to the QP + * + * This is called from ipath_rc_rcv() and only uses the receive + * side QP state. + * Note that RDMA reads and atomics are handled in the + * send side QP state and tasklet. + */ +static void send_rc_ack(struct ipath_qp *qp) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_devdata *dd; + u16 lrh0; + u32 bth0; + u32 hwords; + u32 __iomem *piobuf; + struct ipath_ib_header hdr; + struct ipath_other_headers *ohdr; + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Don't send ACK or NAK if a RDMA read or atomic is pending. */ + if (qp->r_head_ack_queue != qp->s_tail_ack_queue || + (qp->s_flags & IPATH_S_ACK_PENDING) || + qp->s_ack_state != OP(ACKNOWLEDGE)) + goto queue_ack; + + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Don't try to send ACKs if the link isn't ACTIVE */ + dd = dev->dd; + if (!(dd->ipath_flags & IPATH_LINKACTIVE)) + goto done; + + piobuf = ipath_getpiobuf(dd, 0, NULL); + if (!piobuf) { + /* + * We are out of PIO buffers at the moment. + * Pass responsibility for sending the ACK to the + * send tasklet so that when a PIO buffer becomes + * available, the ACK is sent ahead of other outgoing + * packets. + */ + spin_lock_irqsave(&qp->s_lock, flags); + goto queue_ack; + } + + /* Construct the header. */ + ohdr = &hdr.u.oth; + lrh0 = IPATH_LRH_BTH; + /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */ + hwords = 6; + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { + hwords += ipath_make_grh(dev, &hdr.u.l.grh, + &qp->remote_ah_attr.grh, + hwords, 0); + ohdr = &hdr.u.l.oth; + lrh0 = IPATH_LRH_GRH; + } + /* read pkey_index w/o lock (its atomic) */ + bth0 = ipath_get_pkey(dd, qp->s_pkey_index) | + (OP(ACKNOWLEDGE) << 24) | (1 << 22); + if (qp->r_nak_state) + ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) | + (qp->r_nak_state << + IPATH_AETH_CREDIT_SHIFT)); + else + ohdr->u.aeth = ipath_compute_aeth(qp); + lrh0 |= qp->remote_ah_attr.sl << 4; + hdr.lrh[0] = cpu_to_be16(lrh0); + hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC); + hdr.lrh[3] = cpu_to_be16(dd->ipath_lid | + qp->remote_ah_attr.src_path_bits); + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); + ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & IPATH_PSN_MASK); + + writeq(hwords + 1, piobuf); + + if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) { + u32 *hdrp = (u32 *) &hdr; + + ipath_flush_wc(); + __iowrite32_copy(piobuf + 2, hdrp, hwords - 1); + ipath_flush_wc(); + __raw_writel(hdrp[hwords - 1], piobuf + hwords + 1); + } else + __iowrite32_copy(piobuf + 2, (u32 *) &hdr, hwords); + + ipath_flush_wc(); + + dev->n_unicast_xmit++; + goto done; + +queue_ack: + if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK) { + dev->n_rc_qacks++; + qp->s_flags |= IPATH_S_ACK_PENDING; + qp->s_nak_state = qp->r_nak_state; + qp->s_ack_psn = qp->r_ack_psn; + + /* Schedule the send tasklet. */ + ipath_schedule_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, flags); +done: + return; +} + +/** + * reset_psn - reset the QP state to send starting from PSN + * @qp: the QP + * @psn: the packet sequence number to restart at + * + * This is called from ipath_rc_rcv() to process an incoming RC ACK + * for the given QP. + * Called at interrupt level with the QP s_lock held. + */ +static void reset_psn(struct ipath_qp *qp, u32 psn) +{ + u32 n = qp->s_last; + struct ipath_swqe *wqe = get_swqe_ptr(qp, n); + u32 opcode; + + qp->s_cur = n; + + /* + * If we are starting the request from the beginning, + * let the normal send code handle initialization. + */ + if (ipath_cmp24(psn, wqe->psn) <= 0) { + qp->s_state = OP(SEND_LAST); + goto done; + } + + /* Find the work request opcode corresponding to the given PSN. */ + opcode = wqe->wr.opcode; + for (;;) { + int diff; + + if (++n == qp->s_size) + n = 0; + if (n == qp->s_tail) + break; + wqe = get_swqe_ptr(qp, n); + diff = ipath_cmp24(psn, wqe->psn); + if (diff < 0) + break; + qp->s_cur = n; + /* + * If we are starting the request from the beginning, + * let the normal send code handle initialization. + */ + if (diff == 0) { + qp->s_state = OP(SEND_LAST); + goto done; + } + opcode = wqe->wr.opcode; + } + + /* + * Set the state to restart in the middle of a request. + * Don't change the s_sge, s_cur_sge, or s_cur_size. + * See ipath_make_rc_req(). + */ + switch (opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_FIRST); + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_LAST); + break; + + case IB_WR_RDMA_READ: + qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); + break; + + default: + /* + * This case shouldn't happen since its only + * one PSN per req. + */ + qp->s_state = OP(SEND_LAST); + } +done: + qp->s_psn = psn; +} + +/** + * ipath_restart_rc - back up requester to resend the last un-ACKed request + * @qp: the QP to restart + * @psn: packet sequence number for the request + * @wc: the work completion request + * + * The QP s_lock should be held and interrupts disabled. + */ +void ipath_restart_rc(struct ipath_qp *qp, u32 psn) +{ + struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last); + struct ipath_ibdev *dev; + + if (qp->s_retry == 0) { + ipath_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); + ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR); + goto bail; + } + qp->s_retry--; + + /* + * Remove the QP from the timeout queue. + * Note: it may already have been removed by ipath_ib_timer(). + */ + dev = to_idev(qp->ibqp.device); + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->timerwait)) + list_del_init(&qp->timerwait); + if (!list_empty(&qp->piowait)) + list_del_init(&qp->piowait); + spin_unlock(&dev->pending_lock); + + if (wqe->wr.opcode == IB_WR_RDMA_READ) + dev->n_rc_resends++; + else + dev->n_rc_resends += (qp->s_psn - psn) & IPATH_PSN_MASK; + + reset_psn(qp, psn); + ipath_schedule_send(qp); + +bail: + return; +} + +static inline void update_last_psn(struct ipath_qp *qp, u32 psn) +{ + qp->s_last_psn = psn; +} + +/** + * do_rc_ack - process an incoming RC ACK + * @qp: the QP the ACK came in on + * @psn: the packet sequence number of the ACK + * @opcode: the opcode of the request that resulted in the ACK + * + * This is called from ipath_rc_rcv_resp() to process an incoming RC ACK + * for the given QP. + * Called at interrupt level with the QP s_lock held and interrupts disabled. + * Returns 1 if OK, 0 if current operation should be aborted (NAK). + */ +static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode, + u64 val) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ib_wc wc; + enum ib_wc_status status; + struct ipath_swqe *wqe; + int ret = 0; + u32 ack_psn; + int diff; + + /* + * Remove the QP from the timeout queue (or RNR timeout queue). + * If ipath_ib_timer() has already removed it, + * it's OK since we hold the QP s_lock and ipath_restart_rc() + * just won't find anything to restart if we ACK everything. + */ + spin_lock(&dev->pending_lock); + if (!list_empty(&qp->timerwait)) + list_del_init(&qp->timerwait); + spin_unlock(&dev->pending_lock); + + /* + * Note that NAKs implicitly ACK outstanding SEND and RDMA write + * requests and implicitly NAK RDMA read and atomic requests issued + * before the NAK'ed request. The MSN won't include the NAK'ed + * request but will include an ACK'ed request(s). + */ + ack_psn = psn; + if (aeth >> 29) + ack_psn--; + wqe = get_swqe_ptr(qp, qp->s_last); + + /* + * The MSN might be for a later WQE than the PSN indicates so + * only complete WQEs that the PSN finishes. + */ + while ((diff = ipath_cmp24(ack_psn, wqe->lpsn)) >= 0) { + /* + * RDMA_READ_RESPONSE_ONLY is a special case since + * we want to generate completion events for everything + * before the RDMA read, copy the data, then generate + * the completion for the read. + */ + if (wqe->wr.opcode == IB_WR_RDMA_READ && + opcode == OP(RDMA_READ_RESPONSE_ONLY) && + diff == 0) { + ret = 1; + goto bail; + } + /* + * If this request is a RDMA read or atomic, and the ACK is + * for a later operation, this ACK NAKs the RDMA read or + * atomic. In other words, only a RDMA_READ_LAST or ONLY + * can ACK a RDMA read and likewise for atomic ops. Note + * that the NAK case can only happen if relaxed ordering is + * used and requests are sent after an RDMA read or atomic + * is sent but before the response is received. + */ + if ((wqe->wr.opcode == IB_WR_RDMA_READ && + (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) || + ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && + (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) { + /* + * The last valid PSN seen is the previous + * request's. + */ + update_last_psn(qp, wqe->psn - 1); + /* Retry this request. */ + ipath_restart_rc(qp, wqe->psn); + /* + * No need to process the ACK/NAK since we are + * restarting an earlier request. + */ + goto bail; + } + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) + *(u64 *) wqe->sg_list[0].vaddr = val; + if (qp->s_num_rd_atomic && + (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) { + qp->s_num_rd_atomic--; + /* Restart sending task if fence is complete */ + if (((qp->s_flags & IPATH_S_FENCE_PENDING) && + !qp->s_num_rd_atomic) || + qp->s_flags & IPATH_S_RDMAR_PENDING) + ipath_schedule_send(qp); + } + /* Post a send completion queue entry if requested. */ + if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED)) { + memset(&wc, 0, sizeof wc); + wc.wr_id = wqe->wr.wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0); + } + qp->s_retry = qp->s_retry_cnt; + /* + * If we are completing a request which is in the process of + * being resent, we can stop resending it since we know the + * responder has already seen it. + */ + if (qp->s_last == qp->s_cur) { + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + qp->s_last = qp->s_cur; + if (qp->s_last == qp->s_tail) + break; + wqe = get_swqe_ptr(qp, qp->s_cur); + qp->s_state = OP(SEND_LAST); + qp->s_psn = wqe->psn; + } else { + if (++qp->s_last >= qp->s_size) + qp->s_last = 0; + if (qp->state == IB_QPS_SQD && qp->s_last == qp->s_cur) + qp->s_draining = 0; + if (qp->s_last == qp->s_tail) + break; + wqe = get_swqe_ptr(qp, qp->s_last); + } + } + + switch (aeth >> 29) { + case 0: /* ACK */ + dev->n_rc_acks++; + /* If this is a partial ACK, reset the retransmit timer. */ + if (qp->s_last != qp->s_tail) { + spin_lock(&dev->pending_lock); + if (list_empty(&qp->timerwait)) + list_add_tail(&qp->timerwait, + &dev->pending[dev->pending_index]); + spin_unlock(&dev->pending_lock); + /* + * If we get a partial ACK for a resent operation, + * we can stop resending the earlier packets and + * continue with the next packet the receiver wants. + */ + if (ipath_cmp24(qp->s_psn, psn) <= 0) { + reset_psn(qp, psn + 1); + ipath_schedule_send(qp); + } + } else if (ipath_cmp24(qp->s_psn, psn) <= 0) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = psn + 1; + } + ipath_get_credit(qp, aeth); + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + qp->s_retry = qp->s_retry_cnt; + update_last_psn(qp, psn); + ret = 1; + goto bail; + + case 1: /* RNR NAK */ + dev->n_rnr_naks++; + if (qp->s_last == qp->s_tail) + goto bail; + if (qp->s_rnr_retry == 0) { + status = IB_WC_RNR_RETRY_EXC_ERR; + goto class_b; + } + if (qp->s_rnr_retry_cnt < 7) + qp->s_rnr_retry--; + + /* The last valid PSN is the previous PSN. */ + update_last_psn(qp, psn - 1); + + if (wqe->wr.opcode == IB_WR_RDMA_READ) + dev->n_rc_resends++; + else + dev->n_rc_resends += + (qp->s_psn - psn) & IPATH_PSN_MASK; + + reset_psn(qp, psn); + + qp->s_rnr_timeout = + ib_ipath_rnr_table[(aeth >> IPATH_AETH_CREDIT_SHIFT) & + IPATH_AETH_CREDIT_MASK]; + ipath_insert_rnr_queue(qp); + ipath_schedule_send(qp); + goto bail; + + case 3: /* NAK */ + if (qp->s_last == qp->s_tail) + goto bail; + /* The last valid PSN is the previous PSN. */ + update_last_psn(qp, psn - 1); + switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) & + IPATH_AETH_CREDIT_MASK) { + case 0: /* PSN sequence error */ + dev->n_seq_naks++; + /* + * Back up to the responder's expected PSN. + * Note that we might get a NAK in the middle of an + * RDMA READ response which terminates the RDMA + * READ. + */ + ipath_restart_rc(qp, psn); + break; + + case 1: /* Invalid Request */ + status = IB_WC_REM_INV_REQ_ERR; + dev->n_other_naks++; + goto class_b; + + case 2: /* Remote Access Error */ + status = IB_WC_REM_ACCESS_ERR; + dev->n_other_naks++; + goto class_b; + + case 3: /* Remote Operation Error */ + status = IB_WC_REM_OP_ERR; + dev->n_other_naks++; + class_b: + ipath_send_complete(qp, wqe, status); + ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR); + break; + + default: + /* Ignore other reserved NAK error codes */ + goto reserved; + } + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + goto bail; + + default: /* 2: reserved */ + reserved: + /* Ignore reserved NAK codes. */ + goto bail; + } + +bail: + return ret; +} + +/** + * ipath_rc_rcv_resp - process an incoming RC response packet + * @dev: the device this packet came in on + * @ohdr: the other headers for this packet + * @data: the packet data + * @tlen: the packet length + * @qp: the QP for this packet + * @opcode: the opcode for this packet + * @psn: the packet sequence number for this packet + * @hdrsize: the header length + * @pmtu: the path MTU + * @header_in_data: true if part of the header data is in the data buffer + * + * This is called from ipath_rc_rcv() to process an incoming RC response + * packet for the given QP. + * Called at interrupt level. + */ +static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev, + struct ipath_other_headers *ohdr, + void *data, u32 tlen, + struct ipath_qp *qp, + u32 opcode, + u32 psn, u32 hdrsize, u32 pmtu, + int header_in_data) +{ + struct ipath_swqe *wqe; + enum ib_wc_status status; + unsigned long flags; + int diff; + u32 pad; + u32 aeth; + u64 val; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Double check we can process this now that we hold the s_lock. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto ack_done; + + /* Ignore invalid responses. */ + if (ipath_cmp24(psn, qp->s_next_psn) >= 0) + goto ack_done; + + /* Ignore duplicate responses. */ + diff = ipath_cmp24(psn, qp->s_last_psn); + if (unlikely(diff <= 0)) { + /* Update credits for "ghost" ACKs */ + if (diff == 0 && opcode == OP(ACKNOWLEDGE)) { + if (!header_in_data) + aeth = be32_to_cpu(ohdr->u.aeth); + else { + aeth = be32_to_cpu(((__be32 *) data)[0]); + data += sizeof(__be32); + } + if ((aeth >> 29) == 0) + ipath_get_credit(qp, aeth); + } + goto ack_done; + } + + if (unlikely(qp->s_last == qp->s_tail)) + goto ack_done; + wqe = get_swqe_ptr(qp, qp->s_last); + status = IB_WC_SUCCESS; + + switch (opcode) { + case OP(ACKNOWLEDGE): + case OP(ATOMIC_ACKNOWLEDGE): + case OP(RDMA_READ_RESPONSE_FIRST): + if (!header_in_data) + aeth = be32_to_cpu(ohdr->u.aeth); + else { + aeth = be32_to_cpu(((__be32 *) data)[0]); + data += sizeof(__be32); + } + if (opcode == OP(ATOMIC_ACKNOWLEDGE)) { + if (!header_in_data) { + __be32 *p = ohdr->u.at.atomic_ack_eth; + + val = ((u64) be32_to_cpu(p[0]) << 32) | + be32_to_cpu(p[1]); + } else + val = be64_to_cpu(((__be64 *) data)[0]); + } else + val = 0; + if (!do_rc_ack(qp, aeth, psn, opcode, val) || + opcode != OP(RDMA_READ_RESPONSE_FIRST)) + goto ack_done; + hdrsize += 4; + wqe = get_swqe_ptr(qp, qp->s_last); + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + qp->r_flags &= ~IPATH_R_RDMAR_SEQ; + /* + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. + */ + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_middle; + + case OP(RDMA_READ_RESPONSE_MIDDLE): + /* no AETH, no ACK */ + if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) { + dev->n_rdma_seq++; + if (qp->r_flags & IPATH_R_RDMAR_SEQ) + goto ack_done; + qp->r_flags |= IPATH_R_RDMAR_SEQ; + ipath_restart_rc(qp, qp->s_last_psn + 1); + goto ack_done; + } + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + read_middle: + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto ack_len_err; + if (unlikely(pmtu >= qp->s_rdma_read_len)) + goto ack_len_err; + + /* We got a response so update the timeout. */ + spin_lock(&dev->pending_lock); + if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait)) + list_move_tail(&qp->timerwait, + &dev->pending[dev->pending_index]); + spin_unlock(&dev->pending_lock); + + if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE)) + qp->s_retry = qp->s_retry_cnt; + + /* + * Update the RDMA receive state but do the copy w/o + * holding the locks and blocking interrupts. + */ + qp->s_rdma_read_len -= pmtu; + update_last_psn(qp, psn); + spin_unlock_irqrestore(&qp->s_lock, flags); + ipath_copy_sge(&qp->s_rdma_read_sge, data, pmtu); + goto bail; + + case OP(RDMA_READ_RESPONSE_ONLY): + if (!header_in_data) + aeth = be32_to_cpu(ohdr->u.aeth); + else + aeth = be32_to_cpu(((__be32 *) data)[0]); + if (!do_rc_ack(qp, aeth, psn, opcode, 0)) + goto ack_done; + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* + * Check that the data size is >= 0 && <= pmtu. + * Remember to account for the AETH header (4) and + * ICRC (4). + */ + if (unlikely(tlen < (hdrsize + pad + 8))) + goto ack_len_err; + /* + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. + */ + wqe = get_swqe_ptr(qp, qp->s_last); + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_last; + + case OP(RDMA_READ_RESPONSE_LAST): + /* ACKs READ req. */ + if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) { + dev->n_rdma_seq++; + if (qp->r_flags & IPATH_R_RDMAR_SEQ) + goto ack_done; + qp->r_flags |= IPATH_R_RDMAR_SEQ; + ipath_restart_rc(qp, qp->s_last_psn + 1); + goto ack_done; + } + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* + * Check that the data size is >= 1 && <= pmtu. + * Remember to account for the AETH header (4) and + * ICRC (4). + */ + if (unlikely(tlen <= (hdrsize + pad + 8))) + goto ack_len_err; + read_last: + tlen -= hdrsize + pad + 8; + if (unlikely(tlen != qp->s_rdma_read_len)) + goto ack_len_err; + if (!header_in_data) + aeth = be32_to_cpu(ohdr->u.aeth); + else { + aeth = be32_to_cpu(((__be32 *) data)[0]); + data += sizeof(__be32); + } + ipath_copy_sge(&qp->s_rdma_read_sge, data, tlen); + (void) do_rc_ack(qp, aeth, psn, + OP(RDMA_READ_RESPONSE_LAST), 0); + goto ack_done; + } + +ack_op_err: + status = IB_WC_LOC_QP_OP_ERR; + goto ack_err; + +ack_len_err: + status = IB_WC_LOC_LEN_ERR; +ack_err: + ipath_send_complete(qp, wqe, status); + ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR); +ack_done: + spin_unlock_irqrestore(&qp->s_lock, flags); +bail: + return; +} + +/** + * ipath_rc_rcv_error - process an incoming duplicate or error RC packet + * @dev: the device this packet came in on + * @ohdr: the other headers for this packet + * @data: the packet data + * @qp: the QP for this packet + * @opcode: the opcode for this packet + * @psn: the packet sequence number for this packet + * @diff: the difference between the PSN and the expected PSN + * @header_in_data: true if part of the header data is in the data buffer + * + * This is called from ipath_rc_rcv() to process an unexpected + * incoming RC packet for the given QP. + * Called at interrupt level. + * Return 1 if no more processing is needed; otherwise return 0 to + * schedule a response to be sent. + */ +static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev, + struct ipath_other_headers *ohdr, + void *data, + struct ipath_qp *qp, + u32 opcode, + u32 psn, + int diff, + int header_in_data) +{ + struct ipath_ack_entry *e; + u8 i, prev; + int old_req; + unsigned long flags; + + if (diff > 0) { + /* + * Packet sequence error. + * A NAK will ACK earlier sends and RDMA writes. + * Don't queue the NAK if we already sent one. + */ + if (!qp->r_nak_state) { + qp->r_nak_state = IB_NAK_PSN_ERROR; + /* Use the expected PSN. */ + qp->r_ack_psn = qp->r_psn; + goto send_ack; + } + goto done; + } + + /* + * Handle a duplicate request. Don't re-execute SEND, RDMA + * write or atomic op. Don't NAK errors, just silently drop + * the duplicate request. Note that r_sge, r_len, and + * r_rcv_len may be in use so don't modify them. + * + * We are supposed to ACK the earliest duplicate PSN but we + * can coalesce an outstanding duplicate ACK. We have to + * send the earliest so that RDMA reads can be restarted at + * the requester's expected PSN. + * + * First, find where this duplicate PSN falls within the + * ACKs previously sent. + */ + psn &= IPATH_PSN_MASK; + e = NULL; + old_req = 1; + + spin_lock_irqsave(&qp->s_lock, flags); + /* Double check we can process this now that we hold the s_lock. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto unlock_done; + + for (i = qp->r_head_ack_queue; ; i = prev) { + if (i == qp->s_tail_ack_queue) + old_req = 0; + if (i) + prev = i - 1; + else + prev = IPATH_MAX_RDMA_ATOMIC; + if (prev == qp->r_head_ack_queue) { + e = NULL; + break; + } + e = &qp->s_ack_queue[prev]; + if (!e->opcode) { + e = NULL; + break; + } + if (ipath_cmp24(psn, e->psn) >= 0) { + if (prev == qp->s_tail_ack_queue) + old_req = 0; + break; + } + } + switch (opcode) { + case OP(RDMA_READ_REQUEST): { + struct ib_reth *reth; + u32 offset; + u32 len; + + /* + * If we didn't find the RDMA read request in the ack queue, + * or the send tasklet is already backed up to send an + * earlier entry, we can ignore this request. + */ + if (!e || e->opcode != OP(RDMA_READ_REQUEST) || old_req) + goto unlock_done; + /* RETH comes after BTH */ + if (!header_in_data) + reth = &ohdr->u.rc.reth; + else { + reth = (struct ib_reth *)data; + data += sizeof(*reth); + } + /* + * Address range must be a subset of the original + * request and start on pmtu boundaries. + * We reuse the old ack_queue slot since the requester + * should not back up and request an earlier PSN for the + * same request. + */ + offset = ((psn - e->psn) & IPATH_PSN_MASK) * + ib_mtu_enum_to_int(qp->path_mtu); + len = be32_to_cpu(reth->length); + if (unlikely(offset + len > e->rdma_sge.sge.sge_length)) + goto unlock_done; + if (len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + ok = ipath_rkey_ok(qp, &e->rdma_sge, + len, vaddr, rkey, + IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto unlock_done; + } else { + e->rdma_sge.sg_list = NULL; + e->rdma_sge.num_sge = 0; + e->rdma_sge.sge.mr = NULL; + e->rdma_sge.sge.vaddr = NULL; + e->rdma_sge.sge.length = 0; + e->rdma_sge.sge.sge_length = 0; + } + e->psn = psn; + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_tail_ack_queue = prev; + break; + } + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): { + /* + * If we didn't find the atomic request in the ack queue + * or the send tasklet is already backed up to send an + * earlier entry, we can ignore this request. + */ + if (!e || e->opcode != (u8) opcode || old_req) + goto unlock_done; + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_tail_ack_queue = prev; + break; + } + + default: + if (old_req) + goto unlock_done; + /* + * Resend the most recent ACK if this request is + * after all the previous RDMA reads and atomics. + */ + if (i == qp->r_head_ack_queue) { + spin_unlock_irqrestore(&qp->s_lock, flags); + qp->r_nak_state = 0; + qp->r_ack_psn = qp->r_psn - 1; + goto send_ack; + } + /* + * Try to send a simple ACK to work around a Mellanox bug + * which doesn't accept a RDMA read response or atomic + * response as an ACK for earlier SENDs or RDMA writes. + */ + if (qp->r_head_ack_queue == qp->s_tail_ack_queue && + !(qp->s_flags & IPATH_S_ACK_PENDING) && + qp->s_ack_state == OP(ACKNOWLEDGE)) { + spin_unlock_irqrestore(&qp->s_lock, flags); + qp->r_nak_state = 0; + qp->r_ack_psn = qp->s_ack_queue[i].psn - 1; + goto send_ack; + } + /* + * Resend the RDMA read or atomic op which + * ACKs this duplicate request. + */ + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_tail_ack_queue = i; + break; + } + qp->r_nak_state = 0; + ipath_schedule_send(qp); + +unlock_done: + spin_unlock_irqrestore(&qp->s_lock, flags); +done: + return 1; + +send_ack: + return 0; +} + +void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err) +{ + unsigned long flags; + int lastwqe; + + spin_lock_irqsave(&qp->s_lock, flags); + lastwqe = ipath_error_qp(qp, err); + spin_unlock_irqrestore(&qp->s_lock, flags); + + if (lastwqe) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } +} + +static inline void ipath_update_ack_queue(struct ipath_qp *qp, unsigned n) +{ + unsigned next; + + next = n + 1; + if (next > IPATH_MAX_RDMA_ATOMIC) + next = 0; + if (n == qp->s_tail_ack_queue) { + qp->s_tail_ack_queue = next; + qp->s_ack_state = OP(ACKNOWLEDGE); + } +} + +/** + * ipath_rc_rcv - process an incoming RC packet + * @dev: the device this packet came in on + * @hdr: the header of this packet + * @has_grh: true if the header has a GRH + * @data: the packet data + * @tlen: the packet length + * @qp: the QP for this packet + * + * This is called from ipath_qp_rcv() to process an incoming RC packet + * for the given QP. + * Called at interrupt level. + */ +void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp) +{ + struct ipath_other_headers *ohdr; + u32 opcode; + u32 hdrsize; + u32 psn; + u32 pad; + struct ib_wc wc; + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + int diff; + struct ib_reth *reth; + int header_in_data; + unsigned long flags; + + /* Validate the SLID. See Ch. 9.6.1.5 */ + if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid)) + goto done; + + /* Check for GRH */ + if (!has_grh) { + ohdr = &hdr->u.oth; + hdrsize = 8 + 12; /* LRH + BTH */ + psn = be32_to_cpu(ohdr->bth[2]); + header_in_data = 0; + } else { + ohdr = &hdr->u.l.oth; + hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */ + /* + * The header with GRH is 60 bytes and the core driver sets + * the eager header buffer size to 56 bytes so the last 4 + * bytes of the BTH header (PSN) is in the data buffer. + */ + header_in_data = dev->dd->ipath_rcvhdrentsize == 16; + if (header_in_data) { + psn = be32_to_cpu(((__be32 *) data)[0]); + data += sizeof(__be32); + } else + psn = be32_to_cpu(ohdr->bth[2]); + } + + /* + * Process responses (ACKs) before anything else. Note that the + * packet sequence number will be for something in the send work + * queue rather than the expected receive packet sequence number. + * In other words, this QP is the requester. + */ + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && + opcode <= OP(ATOMIC_ACKNOWLEDGE)) { + ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn, + hdrsize, pmtu, header_in_data); + goto done; + } + + /* Compute 24 bits worth of difference. */ + diff = ipath_cmp24(psn, qp->r_psn); + if (unlikely(diff)) { + if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode, + psn, diff, header_in_data)) + goto done; + goto send_ack; + } + + /* Check for opcode sequence errors. */ + switch (qp->r_state) { + case OP(SEND_FIRST): + case OP(SEND_MIDDLE): + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE)) + break; + goto nack_inv; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_MIDDLE): + if (opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + break; + goto nack_inv; + + default: + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + goto nack_inv; + /* + * Note that it is up to the requester to not send a new + * RDMA read or atomic operation before receiving an ACK + * for the previous operation. + */ + break; + } + + memset(&wc, 0, sizeof wc); + + /* OK, process the packet. */ + switch (opcode) { + case OP(SEND_FIRST): + if (!ipath_get_rwqe(qp, 0)) + goto rnr_nak; + qp->r_rcv_len = 0; + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + case OP(RDMA_WRITE_MIDDLE): + send_middle: + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto nack_inv; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto nack_inv; + ipath_copy_sge(&qp->r_sge, data, pmtu); + break; + + case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): + /* consume RWQE */ + if (!ipath_get_rwqe(qp, 1)) + goto rnr_nak; + goto send_last_imm; + + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + if (!ipath_get_rwqe(qp, 0)) + goto rnr_nak; + qp->r_rcv_len = 0; + if (opcode == OP(SEND_ONLY)) + goto send_last; + /* FALLTHROUGH */ + case OP(SEND_LAST_WITH_IMMEDIATE): + send_last_imm: + if (header_in_data) { + wc.ex.imm_data = *(__be32 *) data; + data += sizeof(__be32); + } else { + /* Immediate data comes after BTH */ + wc.ex.imm_data = ohdr->u.imm_data; + } + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + /* FALLTHROUGH */ + case OP(SEND_LAST): + case OP(RDMA_WRITE_LAST): + send_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto nack_inv; + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + wc.byte_len = tlen + qp->r_rcv_len; + if (unlikely(wc.byte_len > qp->r_len)) + goto nack_inv; + ipath_copy_sge(&qp->r_sge, data, tlen); + qp->r_msn++; + if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) + break; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + /* Signal completion event if the solicited bit is set. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + cpu_to_be32(1 << 23)) != 0); + break; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_WRITE))) + goto nack_inv; + /* consume RWQE */ + /* RETH comes after BTH */ + if (!header_in_data) + reth = &ohdr->u.rc.reth; + else { + reth = (struct ib_reth *)data; + data += sizeof(*reth); + } + hdrsize += sizeof(*reth); + qp->r_len = be32_to_cpu(reth->length); + qp->r_rcv_len = 0; + if (qp->r_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey & NAK */ + ok = ipath_rkey_ok(qp, &qp->r_sge, + qp->r_len, vaddr, rkey, + IB_ACCESS_REMOTE_WRITE); + if (unlikely(!ok)) + goto nack_acc; + } else { + qp->r_sge.sg_list = NULL; + qp->r_sge.sge.mr = NULL; + qp->r_sge.sge.vaddr = NULL; + qp->r_sge.sge.length = 0; + qp->r_sge.sge.sge_length = 0; + } + if (opcode == OP(RDMA_WRITE_FIRST)) + goto send_middle; + else if (opcode == OP(RDMA_WRITE_ONLY)) + goto send_last; + if (!ipath_get_rwqe(qp, 1)) + goto rnr_nak; + goto send_last_imm; + + case OP(RDMA_READ_REQUEST): { + struct ipath_ack_entry *e; + u32 len; + u8 next; + + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_READ))) + goto nack_inv; + next = qp->r_head_ack_queue + 1; + if (next > IPATH_MAX_RDMA_ATOMIC) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + /* Double check we can process this while holding the s_lock. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto unlock; + if (unlikely(next == qp->s_tail_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlck; + ipath_update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + /* RETH comes after BTH */ + if (!header_in_data) + reth = &ohdr->u.rc.reth; + else { + reth = (struct ib_reth *)data; + data += sizeof(*reth); + } + len = be32_to_cpu(reth->length); + if (len) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey & NAK */ + ok = ipath_rkey_ok(qp, &e->rdma_sge, len, vaddr, + rkey, IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto nack_acc_unlck; + /* + * Update the next expected PSN. We add 1 later + * below, so only add the remainder here. + */ + if (len > pmtu) + qp->r_psn += (len - 1) / pmtu; + } else { + e->rdma_sge.sg_list = NULL; + e->rdma_sge.num_sge = 0; + e->rdma_sge.sge.mr = NULL; + e->rdma_sge.sge.vaddr = NULL; + e->rdma_sge.sge.length = 0; + e->rdma_sge.sge.sge_length = 0; + } + e->opcode = opcode; + e->sent = 0; + e->psn = psn; + /* + * We need to increment the MSN here instead of when we + * finish sending the result since a duplicate request would + * increment it more than once. + */ + qp->r_msn++; + qp->r_psn++; + qp->r_state = opcode; + qp->r_nak_state = 0; + qp->r_head_ack_queue = next; + + /* Schedule the send tasklet. */ + ipath_schedule_send(qp); + + goto unlock; + } + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): { + struct ib_atomic_eth *ateth; + struct ipath_ack_entry *e; + u64 vaddr; + atomic64_t *maddr; + u64 sdata; + u32 rkey; + u8 next; + + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_ATOMIC))) + goto nack_inv; + next = qp->r_head_ack_queue + 1; + if (next > IPATH_MAX_RDMA_ATOMIC) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + /* Double check we can process this while holding the s_lock. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) + goto unlock; + if (unlikely(next == qp->s_tail_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlck; + ipath_update_ack_queue(qp, next); + } + if (!header_in_data) + ateth = &ohdr->u.atomic_eth; + else + ateth = (struct ib_atomic_eth *)data; + vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) | + be32_to_cpu(ateth->vaddr[1]); + if (unlikely(vaddr & (sizeof(u64) - 1))) + goto nack_inv_unlck; + rkey = be32_to_cpu(ateth->rkey); + /* Check rkey & NAK */ + if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, + sizeof(u64), vaddr, rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto nack_acc_unlck; + /* Perform atomic OP and save result. */ + maddr = (atomic64_t *) qp->r_sge.sge.vaddr; + sdata = be64_to_cpu(ateth->swap_data); + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + e->atomic_data = (opcode == OP(FETCH_ADD)) ? + (u64) atomic64_add_return(sdata, maddr) - sdata : + (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, + be64_to_cpu(ateth->compare_data), + sdata); + e->opcode = opcode; + e->sent = 0; + e->psn = psn & IPATH_PSN_MASK; + qp->r_msn++; + qp->r_psn++; + qp->r_state = opcode; + qp->r_nak_state = 0; + qp->r_head_ack_queue = next; + + /* Schedule the send tasklet. */ + ipath_schedule_send(qp); + + goto unlock; + } + + default: + /* NAK unknown opcodes. */ + goto nack_inv; + } + qp->r_psn++; + qp->r_state = opcode; + qp->r_ack_psn = psn; + qp->r_nak_state = 0; + /* Send an ACK if requested or required. */ + if (psn & (1 << 31)) + goto send_ack; + goto done; + +rnr_nak: + qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer; + qp->r_ack_psn = qp->r_psn; + goto send_ack; + +nack_inv_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_inv: + ipath_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = IB_NAK_INVALID_REQUEST; + qp->r_ack_psn = qp->r_psn; + goto send_ack; + +nack_acc_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_acc: + ipath_rc_error(qp, IB_WC_LOC_PROT_ERR); + qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; + qp->r_ack_psn = qp->r_psn; +send_ack: + send_rc_ack(qp); + goto done; + +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); +done: + return; +} diff --git a/drivers/staging/rdma/ipath/ipath_registers.h b/drivers/staging/rdma/ipath/ipath_registers.h new file mode 100644 index 000000000..8f44d0cf3 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_registers.h @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _IPATH_REGISTERS_H +#define _IPATH_REGISTERS_H + +/* + * This file should only be included by kernel source, and by the diags. It + * defines the registers, and their contents, for InfiniPath chips. + */ + +/* + * These are the InfiniPath register and buffer bit definitions, + * that are visible to software, and needed only by the kernel + * and diag code. A few, that are visible to protocol and user + * code are in ipath_common.h. Some bits are specific + * to a given chip implementation, and have been moved to the + * chip-specific source file + */ + +/* kr_revision bits */ +#define INFINIPATH_R_CHIPREVMINOR_MASK 0xFF +#define INFINIPATH_R_CHIPREVMINOR_SHIFT 0 +#define INFINIPATH_R_CHIPREVMAJOR_MASK 0xFF +#define INFINIPATH_R_CHIPREVMAJOR_SHIFT 8 +#define INFINIPATH_R_ARCH_MASK 0xFF +#define INFINIPATH_R_ARCH_SHIFT 16 +#define INFINIPATH_R_SOFTWARE_MASK 0xFF +#define INFINIPATH_R_SOFTWARE_SHIFT 24 +#define INFINIPATH_R_BOARDID_MASK 0xFF +#define INFINIPATH_R_BOARDID_SHIFT 32 + +/* kr_control bits */ +#define INFINIPATH_C_FREEZEMODE 0x00000002 +#define INFINIPATH_C_LINKENABLE 0x00000004 + +/* kr_sendctrl bits */ +#define INFINIPATH_S_DISARMPIOBUF_SHIFT 16 +#define INFINIPATH_S_UPDTHRESH_SHIFT 24 +#define INFINIPATH_S_UPDTHRESH_MASK 0x1f + +#define IPATH_S_ABORT 0 +#define IPATH_S_PIOINTBUFAVAIL 1 +#define IPATH_S_PIOBUFAVAILUPD 2 +#define IPATH_S_PIOENABLE 3 +#define IPATH_S_SDMAINTENABLE 9 +#define IPATH_S_SDMASINGLEDESCRIPTOR 10 +#define IPATH_S_SDMAENABLE 11 +#define IPATH_S_SDMAHALT 12 +#define IPATH_S_DISARM 31 + +#define INFINIPATH_S_ABORT (1U << IPATH_S_ABORT) +#define INFINIPATH_S_PIOINTBUFAVAIL (1U << IPATH_S_PIOINTBUFAVAIL) +#define INFINIPATH_S_PIOBUFAVAILUPD (1U << IPATH_S_PIOBUFAVAILUPD) +#define INFINIPATH_S_PIOENABLE (1U << IPATH_S_PIOENABLE) +#define INFINIPATH_S_SDMAINTENABLE (1U << IPATH_S_SDMAINTENABLE) +#define INFINIPATH_S_SDMASINGLEDESCRIPTOR \ + (1U << IPATH_S_SDMASINGLEDESCRIPTOR) +#define INFINIPATH_S_SDMAENABLE (1U << IPATH_S_SDMAENABLE) +#define INFINIPATH_S_SDMAHALT (1U << IPATH_S_SDMAHALT) +#define INFINIPATH_S_DISARM (1U << IPATH_S_DISARM) + +/* kr_rcvctrl bits that are the same on multiple chips */ +#define INFINIPATH_R_PORTENABLE_SHIFT 0 +#define INFINIPATH_R_QPMAP_ENABLE (1ULL << 38) + +/* kr_intstatus, kr_intclear, kr_intmask bits */ +#define INFINIPATH_I_SDMAINT 0x8000000000000000ULL +#define INFINIPATH_I_SDMADISABLED 0x4000000000000000ULL +#define INFINIPATH_I_ERROR 0x0000000080000000ULL +#define INFINIPATH_I_SPIOSENT 0x0000000040000000ULL +#define INFINIPATH_I_SPIOBUFAVAIL 0x0000000020000000ULL +#define INFINIPATH_I_GPIO 0x0000000010000000ULL +#define INFINIPATH_I_JINT 0x0000000004000000ULL + +/* kr_errorstatus, kr_errorclear, kr_errormask bits */ +#define INFINIPATH_E_RFORMATERR 0x0000000000000001ULL +#define INFINIPATH_E_RVCRC 0x0000000000000002ULL +#define INFINIPATH_E_RICRC 0x0000000000000004ULL +#define INFINIPATH_E_RMINPKTLEN 0x0000000000000008ULL +#define INFINIPATH_E_RMAXPKTLEN 0x0000000000000010ULL +#define INFINIPATH_E_RLONGPKTLEN 0x0000000000000020ULL +#define INFINIPATH_E_RSHORTPKTLEN 0x0000000000000040ULL +#define INFINIPATH_E_RUNEXPCHAR 0x0000000000000080ULL +#define INFINIPATH_E_RUNSUPVL 0x0000000000000100ULL +#define INFINIPATH_E_REBP 0x0000000000000200ULL +#define INFINIPATH_E_RIBFLOW 0x0000000000000400ULL +#define INFINIPATH_E_RBADVERSION 0x0000000000000800ULL +#define INFINIPATH_E_RRCVEGRFULL 0x0000000000001000ULL +#define INFINIPATH_E_RRCVHDRFULL 0x0000000000002000ULL +#define INFINIPATH_E_RBADTID 0x0000000000004000ULL +#define INFINIPATH_E_RHDRLEN 0x0000000000008000ULL +#define INFINIPATH_E_RHDR 0x0000000000010000ULL +#define INFINIPATH_E_RIBLOSTLINK 0x0000000000020000ULL +#define INFINIPATH_E_SENDSPECIALTRIGGER 0x0000000008000000ULL +#define INFINIPATH_E_SDMADISABLED 0x0000000010000000ULL +#define INFINIPATH_E_SMINPKTLEN 0x0000000020000000ULL +#define INFINIPATH_E_SMAXPKTLEN 0x0000000040000000ULL +#define INFINIPATH_E_SUNDERRUN 0x0000000080000000ULL +#define INFINIPATH_E_SPKTLEN 0x0000000100000000ULL +#define INFINIPATH_E_SDROPPEDSMPPKT 0x0000000200000000ULL +#define INFINIPATH_E_SDROPPEDDATAPKT 0x0000000400000000ULL +#define INFINIPATH_E_SPIOARMLAUNCH 0x0000000800000000ULL +#define INFINIPATH_E_SUNEXPERRPKTNUM 0x0000001000000000ULL +#define INFINIPATH_E_SUNSUPVL 0x0000002000000000ULL +#define INFINIPATH_E_SENDBUFMISUSE 0x0000004000000000ULL +#define INFINIPATH_E_SDMAGENMISMATCH 0x0000008000000000ULL +#define INFINIPATH_E_SDMAOUTOFBOUND 0x0000010000000000ULL +#define INFINIPATH_E_SDMATAILOUTOFBOUND 0x0000020000000000ULL +#define INFINIPATH_E_SDMABASE 0x0000040000000000ULL +#define INFINIPATH_E_SDMA1STDESC 0x0000080000000000ULL +#define INFINIPATH_E_SDMARPYTAG 0x0000100000000000ULL +#define INFINIPATH_E_SDMADWEN 0x0000200000000000ULL +#define INFINIPATH_E_SDMAMISSINGDW 0x0000400000000000ULL +#define INFINIPATH_E_SDMAUNEXPDATA 0x0000800000000000ULL +#define INFINIPATH_E_IBSTATUSCHANGED 0x0001000000000000ULL +#define INFINIPATH_E_INVALIDADDR 0x0002000000000000ULL +#define INFINIPATH_E_RESET 0x0004000000000000ULL +#define INFINIPATH_E_HARDWARE 0x0008000000000000ULL +#define INFINIPATH_E_SDMADESCADDRMISALIGN 0x0010000000000000ULL +#define INFINIPATH_E_INVALIDEEPCMD 0x0020000000000000ULL + +/* + * this is used to print "common" packet errors only when the + * __IPATH_ERRPKTDBG bit is set in ipath_debug. + */ +#define INFINIPATH_E_PKTERRS ( INFINIPATH_E_SPKTLEN \ + | INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_RVCRC \ + | INFINIPATH_E_RICRC | INFINIPATH_E_RSHORTPKTLEN \ + | INFINIPATH_E_REBP ) + +/* Convenience for decoding Send DMA errors */ +#define INFINIPATH_E_SDMAERRS ( \ + INFINIPATH_E_SDMAGENMISMATCH | INFINIPATH_E_SDMAOUTOFBOUND | \ + INFINIPATH_E_SDMATAILOUTOFBOUND | INFINIPATH_E_SDMABASE | \ + INFINIPATH_E_SDMA1STDESC | INFINIPATH_E_SDMARPYTAG | \ + INFINIPATH_E_SDMADWEN | INFINIPATH_E_SDMAMISSINGDW | \ + INFINIPATH_E_SDMAUNEXPDATA | \ + INFINIPATH_E_SDMADESCADDRMISALIGN | \ + INFINIPATH_E_SDMADISABLED | \ + INFINIPATH_E_SENDBUFMISUSE) + +/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */ +/* TXEMEMPARITYERR bit 0: PIObuf, 1: PIOpbc, 2: launchfifo + * RXEMEMPARITYERR bit 0: rcvbuf, 1: lookupq, 2: expTID, 3: eagerTID + * bit 4: flag buffer, 5: datainfo, 6: header info */ +#define INFINIPATH_HWE_TXEMEMPARITYERR_MASK 0xFULL +#define INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT 40 +#define INFINIPATH_HWE_RXEMEMPARITYERR_MASK 0x7FULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT 44 +#define INFINIPATH_HWE_IBCBUSTOSPCPARITYERR 0x4000000000000000ULL +#define INFINIPATH_HWE_IBCBUSFRSPCPARITYERR 0x8000000000000000ULL +/* txe mem parity errors (shift by INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) */ +#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF 0x1ULL +#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC 0x2ULL +#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOLAUNCHFIFO 0x4ULL +/* rxe mem parity errors (shift by INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) */ +#define INFINIPATH_HWE_RXEMEMPARITYERR_RCVBUF 0x01ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_LOOKUPQ 0x02ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_EXPTID 0x04ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID 0x08ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_FLAGBUF 0x10ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_DATAINFO 0x20ULL +#define INFINIPATH_HWE_RXEMEMPARITYERR_HDRINFO 0x40ULL +/* waldo specific -- find the rest in ipath_6110.c */ +#define INFINIPATH_HWE_RXDSYNCMEMPARITYERR 0x0000000400000000ULL +/* 6120/7220 specific -- find the rest in ipath_6120.c and ipath_7220.c */ +#define INFINIPATH_HWE_MEMBISTFAILED 0x0040000000000000ULL + +/* kr_hwdiagctrl bits */ +#define INFINIPATH_DC_FORCETXEMEMPARITYERR_MASK 0xFULL +#define INFINIPATH_DC_FORCETXEMEMPARITYERR_SHIFT 40 +#define INFINIPATH_DC_FORCERXEMEMPARITYERR_MASK 0x7FULL +#define INFINIPATH_DC_FORCERXEMEMPARITYERR_SHIFT 44 +#define INFINIPATH_DC_FORCERXDSYNCMEMPARITYERR 0x0000000400000000ULL +#define INFINIPATH_DC_COUNTERDISABLE 0x1000000000000000ULL +#define INFINIPATH_DC_COUNTERWREN 0x2000000000000000ULL +#define INFINIPATH_DC_FORCEIBCBUSTOSPCPARITYERR 0x4000000000000000ULL +#define INFINIPATH_DC_FORCEIBCBUSFRSPCPARITYERR 0x8000000000000000ULL + +/* kr_ibcctrl bits */ +#define INFINIPATH_IBCC_FLOWCTRLPERIOD_MASK 0xFFULL +#define INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT 0 +#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_MASK 0xFFULL +#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT 8 +#define INFINIPATH_IBCC_LINKINITCMD_MASK 0x3ULL +#define INFINIPATH_IBCC_LINKINITCMD_DISABLE 1 +/* cycle through TS1/TS2 till OK */ +#define INFINIPATH_IBCC_LINKINITCMD_POLL 2 +/* wait for TS1, then go on */ +#define INFINIPATH_IBCC_LINKINITCMD_SLEEP 3 +#define INFINIPATH_IBCC_LINKINITCMD_SHIFT 16 +#define INFINIPATH_IBCC_LINKCMD_MASK 0x3ULL +#define INFINIPATH_IBCC_LINKCMD_DOWN 1 /* move to 0x11 */ +#define INFINIPATH_IBCC_LINKCMD_ARMED 2 /* move to 0x21 */ +#define INFINIPATH_IBCC_LINKCMD_ACTIVE 3 /* move to 0x31 */ +#define INFINIPATH_IBCC_LINKCMD_SHIFT 18 +#define INFINIPATH_IBCC_MAXPKTLEN_MASK 0x7FFULL +#define INFINIPATH_IBCC_MAXPKTLEN_SHIFT 20 +#define INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK 0xFULL +#define INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT 32 +#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK 0xFULL +#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT 36 +#define INFINIPATH_IBCC_CREDITSCALE_MASK 0x7ULL +#define INFINIPATH_IBCC_CREDITSCALE_SHIFT 40 +#define INFINIPATH_IBCC_LOOPBACK 0x8000000000000000ULL +#define INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE 0x4000000000000000ULL + +/* kr_ibcstatus bits */ +#define INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT 0 +#define INFINIPATH_IBCS_LINKSTATE_MASK 0x7 + +#define INFINIPATH_IBCS_TXREADY 0x40000000 +#define INFINIPATH_IBCS_TXCREDITOK 0x80000000 +/* link training states (shift by + INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) */ +#define INFINIPATH_IBCS_LT_STATE_DISABLED 0x00 +#define INFINIPATH_IBCS_LT_STATE_LINKUP 0x01 +#define INFINIPATH_IBCS_LT_STATE_POLLACTIVE 0x02 +#define INFINIPATH_IBCS_LT_STATE_POLLQUIET 0x03 +#define INFINIPATH_IBCS_LT_STATE_SLEEPDELAY 0x04 +#define INFINIPATH_IBCS_LT_STATE_SLEEPQUIET 0x05 +#define INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE 0x08 +#define INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG 0x09 +#define INFINIPATH_IBCS_LT_STATE_CFGWAITRMT 0x0a +#define INFINIPATH_IBCS_LT_STATE_CFGIDLE 0x0b +#define INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN 0x0c +#define INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT 0x0e +#define INFINIPATH_IBCS_LT_STATE_RECOVERIDLE 0x0f +/* link state machine states (shift by ibcs_ls_shift) */ +#define INFINIPATH_IBCS_L_STATE_DOWN 0x0 +#define INFINIPATH_IBCS_L_STATE_INIT 0x1 +#define INFINIPATH_IBCS_L_STATE_ARM 0x2 +#define INFINIPATH_IBCS_L_STATE_ACTIVE 0x3 +#define INFINIPATH_IBCS_L_STATE_ACT_DEFER 0x4 + + +/* kr_extstatus bits */ +#define INFINIPATH_EXTS_SERDESPLLLOCK 0x1 +#define INFINIPATH_EXTS_GPIOIN_MASK 0xFFFFULL +#define INFINIPATH_EXTS_GPIOIN_SHIFT 48 + +/* kr_extctrl bits */ +#define INFINIPATH_EXTC_GPIOINVERT_MASK 0xFFFFULL +#define INFINIPATH_EXTC_GPIOINVERT_SHIFT 32 +#define INFINIPATH_EXTC_GPIOOE_MASK 0xFFFFULL +#define INFINIPATH_EXTC_GPIOOE_SHIFT 48 +#define INFINIPATH_EXTC_SERDESENABLE 0x80000000ULL +#define INFINIPATH_EXTC_SERDESCONNECT 0x40000000ULL +#define INFINIPATH_EXTC_SERDESENTRUNKING 0x20000000ULL +#define INFINIPATH_EXTC_SERDESDISRXFIFO 0x10000000ULL +#define INFINIPATH_EXTC_SERDESENPLPBK1 0x08000000ULL +#define INFINIPATH_EXTC_SERDESENPLPBK2 0x04000000ULL +#define INFINIPATH_EXTC_SERDESENENCDEC 0x02000000ULL +#define INFINIPATH_EXTC_LED1SECPORT_ON 0x00000020ULL +#define INFINIPATH_EXTC_LED2SECPORT_ON 0x00000010ULL +#define INFINIPATH_EXTC_LED1PRIPORT_ON 0x00000008ULL +#define INFINIPATH_EXTC_LED2PRIPORT_ON 0x00000004ULL +#define INFINIPATH_EXTC_LEDGBLOK_ON 0x00000002ULL +#define INFINIPATH_EXTC_LEDGBLERR_OFF 0x00000001ULL + +/* kr_partitionkey bits */ +#define INFINIPATH_PKEY_SIZE 16 +#define INFINIPATH_PKEY_MASK 0xFFFF +#define INFINIPATH_PKEY_DEFAULT_PKEY 0xFFFF + +/* kr_serdesconfig0 bits */ +#define INFINIPATH_SERDC0_RESET_MASK 0xfULL /* overal reset bits */ +#define INFINIPATH_SERDC0_RESET_PLL 0x10000000ULL /* pll reset */ +/* tx idle enables (per lane) */ +#define INFINIPATH_SERDC0_TXIDLE 0xF000ULL +/* rx detect enables (per lane) */ +#define INFINIPATH_SERDC0_RXDETECT_EN 0xF0000ULL +/* L1 Power down; use with RXDETECT, Otherwise not used on IB side */ +#define INFINIPATH_SERDC0_L1PWR_DN 0xF0ULL + +/* common kr_xgxsconfig bits (or safe in all, even if not implemented) */ +#define INFINIPATH_XGXS_RX_POL_SHIFT 19 +#define INFINIPATH_XGXS_RX_POL_MASK 0xfULL + + +/* + * IPATH_PIO_MAXIBHDR is the max IB header size allowed for in our + * PIO send buffers. This is well beyond anything currently + * defined in the InfiniBand spec. + */ +#define IPATH_PIO_MAXIBHDR 128 + +typedef u64 ipath_err_t; + +/* The following change with the type of device, so + * need to be part of the ipath_devdata struct, or + * we could have problems plugging in devices of + * different types (e.g. one HT, one PCIE) + * in one system, to be managed by one driver. + * On the other hand, this file is may also be included + * by other code, so leave the declarations here + * temporarily. Minor footprint issue if common-model + * linker used, none if C89+ linker used. + */ + +/* mask of defined bits for various registers */ +extern u64 infinipath_i_bitsextant; +extern ipath_err_t infinipath_e_bitsextant, infinipath_hwe_bitsextant; + +/* masks that are different in various chips, or only exist in some chips */ +extern u32 infinipath_i_rcvavail_mask, infinipath_i_rcvurg_mask; + +/* + * These are the infinipath general register numbers (not offsets). + * The kernel registers are used directly, those beyond the kernel + * registers are calculated from one of the base registers. The use of + * an integer type doesn't allow type-checking as thorough as, say, + * an enum but allows for better hiding of chip differences. + */ +typedef const u16 ipath_kreg, /* infinipath general registers */ + ipath_creg, /* infinipath counter registers */ + ipath_sreg; /* kernel-only, infinipath send registers */ + +/* + * These are the chip registers common to all infinipath chips, and + * used both by the kernel and the diagnostics or other user code. + * They are all implemented such that 64 bit accesses work. + * Some implement no more than 32 bits. Because 64 bit reads + * require 2 HT cmds on opteron, we access those with 32 bit + * reads for efficiency (they are written as 64 bits, since + * the extra 32 bits are nearly free on writes, and it slightly reduces + * complexity). The rest are all accessed as 64 bits. + */ +struct ipath_kregs { + /* These are the 32 bit group */ + ipath_kreg kr_control; + ipath_kreg kr_counterregbase; + ipath_kreg kr_intmask; + ipath_kreg kr_intstatus; + ipath_kreg kr_pagealign; + ipath_kreg kr_portcnt; + ipath_kreg kr_rcvtidbase; + ipath_kreg kr_rcvtidcnt; + ipath_kreg kr_rcvegrbase; + ipath_kreg kr_rcvegrcnt; + ipath_kreg kr_scratch; + ipath_kreg kr_sendctrl; + ipath_kreg kr_sendpiobufbase; + ipath_kreg kr_sendpiobufcnt; + ipath_kreg kr_sendpiosize; + ipath_kreg kr_sendregbase; + ipath_kreg kr_userregbase; + /* These are the 64 bit group */ + ipath_kreg kr_debugport; + ipath_kreg kr_debugportselect; + ipath_kreg kr_errorclear; + ipath_kreg kr_errormask; + ipath_kreg kr_errorstatus; + ipath_kreg kr_extctrl; + ipath_kreg kr_extstatus; + ipath_kreg kr_gpio_clear; + ipath_kreg kr_gpio_mask; + ipath_kreg kr_gpio_out; + ipath_kreg kr_gpio_status; + ipath_kreg kr_hwdiagctrl; + ipath_kreg kr_hwerrclear; + ipath_kreg kr_hwerrmask; + ipath_kreg kr_hwerrstatus; + ipath_kreg kr_ibcctrl; + ipath_kreg kr_ibcstatus; + ipath_kreg kr_intblocked; + ipath_kreg kr_intclear; + ipath_kreg kr_interruptconfig; + ipath_kreg kr_mdio; + ipath_kreg kr_partitionkey; + ipath_kreg kr_rcvbthqp; + ipath_kreg kr_rcvbufbase; + ipath_kreg kr_rcvbufsize; + ipath_kreg kr_rcvctrl; + ipath_kreg kr_rcvhdrcnt; + ipath_kreg kr_rcvhdrentsize; + ipath_kreg kr_rcvhdrsize; + ipath_kreg kr_rcvintmembase; + ipath_kreg kr_rcvintmemsize; + ipath_kreg kr_revision; + ipath_kreg kr_sendbuffererror; + ipath_kreg kr_sendpioavailaddr; + ipath_kreg kr_serdesconfig0; + ipath_kreg kr_serdesconfig1; + ipath_kreg kr_serdesstatus; + ipath_kreg kr_txintmembase; + ipath_kreg kr_txintmemsize; + ipath_kreg kr_xgxsconfig; + ipath_kreg kr_ibpllcfg; + /* use these two (and the following N ports) only with + * ipath_k*_kreg64_port(); not *kreg64() */ + ipath_kreg kr_rcvhdraddr; + ipath_kreg kr_rcvhdrtailaddr; + + /* remaining registers are not present on all types of infinipath + chips */ + ipath_kreg kr_rcvpktledcnt; + ipath_kreg kr_pcierbuftestreg0; + ipath_kreg kr_pcierbuftestreg1; + ipath_kreg kr_pcieq0serdesconfig0; + ipath_kreg kr_pcieq0serdesconfig1; + ipath_kreg kr_pcieq0serdesstatus; + ipath_kreg kr_pcieq1serdesconfig0; + ipath_kreg kr_pcieq1serdesconfig1; + ipath_kreg kr_pcieq1serdesstatus; + ipath_kreg kr_hrtbt_guid; + ipath_kreg kr_ibcddrctrl; + ipath_kreg kr_ibcddrstatus; + ipath_kreg kr_jintreload; + + /* send dma related regs */ + ipath_kreg kr_senddmabase; + ipath_kreg kr_senddmalengen; + ipath_kreg kr_senddmatail; + ipath_kreg kr_senddmahead; + ipath_kreg kr_senddmaheadaddr; + ipath_kreg kr_senddmabufmask0; + ipath_kreg kr_senddmabufmask1; + ipath_kreg kr_senddmabufmask2; + ipath_kreg kr_senddmastatus; + + /* SerDes related regs (IBA7220-only) */ + ipath_kreg kr_ibserdesctrl; + ipath_kreg kr_ib_epbacc; + ipath_kreg kr_ib_epbtrans; + ipath_kreg kr_pcie_epbacc; + ipath_kreg kr_pcie_epbtrans; + ipath_kreg kr_ib_ddsrxeq; +}; + +struct ipath_cregs { + ipath_creg cr_badformatcnt; + ipath_creg cr_erricrccnt; + ipath_creg cr_errlinkcnt; + ipath_creg cr_errlpcrccnt; + ipath_creg cr_errpkey; + ipath_creg cr_errrcvflowctrlcnt; + ipath_creg cr_err_rlencnt; + ipath_creg cr_errslencnt; + ipath_creg cr_errtidfull; + ipath_creg cr_errtidvalid; + ipath_creg cr_errvcrccnt; + ipath_creg cr_ibstatuschange; + ipath_creg cr_intcnt; + ipath_creg cr_invalidrlencnt; + ipath_creg cr_invalidslencnt; + ipath_creg cr_lbflowstallcnt; + ipath_creg cr_iblinkdowncnt; + ipath_creg cr_iblinkerrrecovcnt; + ipath_creg cr_ibsymbolerrcnt; + ipath_creg cr_pktrcvcnt; + ipath_creg cr_pktrcvflowctrlcnt; + ipath_creg cr_pktsendcnt; + ipath_creg cr_pktsendflowcnt; + ipath_creg cr_portovflcnt; + ipath_creg cr_rcvebpcnt; + ipath_creg cr_rcvovflcnt; + ipath_creg cr_rxdroppktcnt; + ipath_creg cr_senddropped; + ipath_creg cr_sendstallcnt; + ipath_creg cr_sendunderruncnt; + ipath_creg cr_unsupvlcnt; + ipath_creg cr_wordrcvcnt; + ipath_creg cr_wordsendcnt; + ipath_creg cr_vl15droppedpktcnt; + ipath_creg cr_rxotherlocalphyerrcnt; + ipath_creg cr_excessbufferovflcnt; + ipath_creg cr_locallinkintegrityerrcnt; + ipath_creg cr_rxvlerrcnt; + ipath_creg cr_rxdlidfltrcnt; + ipath_creg cr_psstat; + ipath_creg cr_psstart; + ipath_creg cr_psinterval; + ipath_creg cr_psrcvdatacount; + ipath_creg cr_psrcvpktscount; + ipath_creg cr_psxmitdatacount; + ipath_creg cr_psxmitpktscount; + ipath_creg cr_psxmitwaitcount; +}; + +#endif /* _IPATH_REGISTERS_H */ diff --git a/drivers/staging/rdma/ipath/ipath_ruc.c b/drivers/staging/rdma/ipath/ipath_ruc.c new file mode 100644 index 000000000..1f95bbaf7 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_ruc.c @@ -0,0 +1,734 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "ipath_verbs.h" +#include "ipath_kernel.h" + +/* + * Convert the AETH RNR timeout code into the number of milliseconds. + */ +const u32 ib_ipath_rnr_table[32] = { + 656, /* 0 */ + 1, /* 1 */ + 1, /* 2 */ + 1, /* 3 */ + 1, /* 4 */ + 1, /* 5 */ + 1, /* 6 */ + 1, /* 7 */ + 1, /* 8 */ + 1, /* 9 */ + 1, /* A */ + 1, /* B */ + 1, /* C */ + 1, /* D */ + 2, /* E */ + 2, /* F */ + 3, /* 10 */ + 4, /* 11 */ + 6, /* 12 */ + 8, /* 13 */ + 11, /* 14 */ + 16, /* 15 */ + 21, /* 16 */ + 31, /* 17 */ + 41, /* 18 */ + 62, /* 19 */ + 82, /* 1A */ + 123, /* 1B */ + 164, /* 1C */ + 246, /* 1D */ + 328, /* 1E */ + 492 /* 1F */ +}; + +/** + * ipath_insert_rnr_queue - put QP on the RNR timeout list for the device + * @qp: the QP + * + * Called with the QP s_lock held and interrupts disabled. + * XXX Use a simple list for now. We might need a priority + * queue if we have lots of QPs waiting for RNR timeouts + * but that should be rare. + */ +void ipath_insert_rnr_queue(struct ipath_qp *qp) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + + /* We already did a spin_lock_irqsave(), so just use spin_lock */ + spin_lock(&dev->pending_lock); + if (list_empty(&dev->rnrwait)) + list_add(&qp->timerwait, &dev->rnrwait); + else { + struct list_head *l = &dev->rnrwait; + struct ipath_qp *nqp = list_entry(l->next, struct ipath_qp, + timerwait); + + while (qp->s_rnr_timeout >= nqp->s_rnr_timeout) { + qp->s_rnr_timeout -= nqp->s_rnr_timeout; + l = l->next; + if (l->next == &dev->rnrwait) { + nqp = NULL; + break; + } + nqp = list_entry(l->next, struct ipath_qp, + timerwait); + } + if (nqp) + nqp->s_rnr_timeout -= qp->s_rnr_timeout; + list_add(&qp->timerwait, l); + } + spin_unlock(&dev->pending_lock); +} + +/** + * ipath_init_sge - Validate a RWQE and fill in the SGE state + * @qp: the QP + * + * Return 1 if OK. + */ +int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe, + u32 *lengthp, struct ipath_sge_state *ss) +{ + int i, j, ret; + struct ib_wc wc; + + *lengthp = 0; + for (i = j = 0; i < wqe->num_sge; i++) { + if (wqe->sg_list[i].length == 0) + continue; + /* Check LKEY */ + if (!ipath_lkey_ok(qp, j ? &ss->sg_list[j - 1] : &ss->sge, + &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) + goto bad_lkey; + *lengthp += wqe->sg_list[i].length; + j++; + } + ss->num_sge = j; + ret = 1; + goto bail; + +bad_lkey: + memset(&wc, 0, sizeof(wc)); + wc.wr_id = wqe->wr_id; + wc.status = IB_WC_LOC_PROT_ERR; + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + /* Signal solicited completion event. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); + ret = 0; +bail: + return ret; +} + +/** + * ipath_get_rwqe - copy the next RWQE into the QP's RWQE + * @qp: the QP + * @wr_id_only: update qp->r_wr_id only, not qp->r_sge + * + * Return 0 if no RWQE is available, otherwise return 1. + * + * Can be called from interrupt level. + */ +int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only) +{ + unsigned long flags; + struct ipath_rq *rq; + struct ipath_rwq *wq; + struct ipath_srq *srq; + struct ipath_rwqe *wqe; + void (*handler)(struct ib_event *, void *); + u32 tail; + int ret; + + if (qp->ibqp.srq) { + srq = to_isrq(qp->ibqp.srq); + handler = srq->ibsrq.event_handler; + rq = &srq->rq; + } else { + srq = NULL; + handler = NULL; + rq = &qp->r_rq; + } + + spin_lock_irqsave(&rq->lock, flags); + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) { + ret = 0; + goto unlock; + } + + wq = rq->wq; + tail = wq->tail; + /* Validate tail before using it since it is user writable. */ + if (tail >= rq->size) + tail = 0; + do { + if (unlikely(tail == wq->head)) { + ret = 0; + goto unlock; + } + /* Make sure entry is read after head index is read. */ + smp_rmb(); + wqe = get_rwqe_ptr(rq, tail); + if (++tail >= rq->size) + tail = 0; + if (wr_id_only) + break; + qp->r_sge.sg_list = qp->r_sg_list; + } while (!ipath_init_sge(qp, wqe, &qp->r_len, &qp->r_sge)); + qp->r_wr_id = wqe->wr_id; + wq->tail = tail; + + ret = 1; + set_bit(IPATH_R_WRID_VALID, &qp->r_aflags); + if (handler) { + u32 n; + + /* + * validate head pointer value and compute + * the number of remaining WQEs. + */ + n = wq->head; + if (n >= rq->size) + n = 0; + if (n < tail) + n += rq->size - tail; + else + n -= tail; + if (n < srq->limit) { + struct ib_event ev; + + srq->limit = 0; + spin_unlock_irqrestore(&rq->lock, flags); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + handler(&ev, srq->ibsrq.srq_context); + goto bail; + } + } +unlock: + spin_unlock_irqrestore(&rq->lock, flags); +bail: + return ret; +} + +/** + * ipath_ruc_loopback - handle UC and RC lookback requests + * @sqp: the sending QP + * + * This is called from ipath_do_send() to + * forward a WQE addressed to the same HCA. + * Note that although we are single threaded due to the tasklet, we still + * have to protect against post_send(). We don't have to worry about + * receive interrupts since this is a connected protocol and all packets + * will pass through here. + */ +static void ipath_ruc_loopback(struct ipath_qp *sqp) +{ + struct ipath_ibdev *dev = to_idev(sqp->ibqp.device); + struct ipath_qp *qp; + struct ipath_swqe *wqe; + struct ipath_sge *sge; + unsigned long flags; + struct ib_wc wc; + u64 sdata; + atomic64_t *maddr; + enum ib_wc_status send_status; + + /* + * Note that we check the responder QP state after + * checking the requester's state. + */ + qp = ipath_lookup_qpn(&dev->qp_table, sqp->remote_qpn); + + spin_lock_irqsave(&sqp->s_lock, flags); + + /* Return if we are already busy processing a work request. */ + if ((sqp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) || + !(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) + goto unlock; + + sqp->s_flags |= IPATH_S_BUSY; + +again: + if (sqp->s_last == sqp->s_head) + goto clr_busy; + wqe = get_swqe_ptr(sqp, sqp->s_last); + + /* Return if it is not OK to start a new work reqeust. */ + if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_NEXT_SEND_OK)) { + if (!(ib_ipath_state_ops[sqp->state] & IPATH_FLUSH_SEND)) + goto clr_busy; + /* We are in the error state, flush the work request. */ + send_status = IB_WC_WR_FLUSH_ERR; + goto flush_send; + } + + /* + * We can rely on the entry not changing without the s_lock + * being held until we update s_last. + * We increment s_cur to indicate s_last is in progress. + */ + if (sqp->s_last == sqp->s_cur) { + if (++sqp->s_cur >= sqp->s_size) + sqp->s_cur = 0; + } + spin_unlock_irqrestore(&sqp->s_lock, flags); + + if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) { + dev->n_pkt_drops++; + /* + * For RC, the requester would timeout and retry so + * shortcut the timeouts and just signal too many retries. + */ + if (sqp->ibqp.qp_type == IB_QPT_RC) + send_status = IB_WC_RETRY_EXC_ERR; + else + send_status = IB_WC_SUCCESS; + goto serr; + } + + memset(&wc, 0, sizeof wc); + send_status = IB_WC_SUCCESS; + + sqp->s_sge.sge = wqe->sg_list[0]; + sqp->s_sge.sg_list = wqe->sg_list + 1; + sqp->s_sge.num_sge = wqe->wr.num_sge; + sqp->s_len = wqe->length; + switch (wqe->wr.opcode) { + case IB_WR_SEND_WITH_IMM: + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + /* FALLTHROUGH */ + case IB_WR_SEND: + if (!ipath_get_rwqe(qp, 0)) + goto rnr_nak; + break; + + case IB_WR_RDMA_WRITE_WITH_IMM: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + if (!ipath_get_rwqe(qp, 1)) + goto rnr_nak; + /* FALLTHROUGH */ + case IB_WR_RDMA_WRITE: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; + if (wqe->length == 0) + break; + if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, wqe->length, + wqe->wr.wr.rdma.remote_addr, + wqe->wr.wr.rdma.rkey, + IB_ACCESS_REMOTE_WRITE))) + goto acc_err; + break; + + case IB_WR_RDMA_READ: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto inv_err; + if (unlikely(!ipath_rkey_ok(qp, &sqp->s_sge, wqe->length, + wqe->wr.wr.rdma.remote_addr, + wqe->wr.wr.rdma.rkey, + IB_ACCESS_REMOTE_READ))) + goto acc_err; + qp->r_sge.sge = wqe->sg_list[0]; + qp->r_sge.sg_list = wqe->sg_list + 1; + qp->r_sge.num_sge = wqe->wr.num_sge; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + goto inv_err; + if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, sizeof(u64), + wqe->wr.wr.atomic.remote_addr, + wqe->wr.wr.atomic.rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto acc_err; + /* Perform atomic OP and save result. */ + maddr = (atomic64_t *) qp->r_sge.sge.vaddr; + sdata = wqe->wr.wr.atomic.compare_add; + *(u64 *) sqp->s_sge.sge.vaddr = + (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ? + (u64) atomic64_add_return(sdata, maddr) - sdata : + (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, + sdata, wqe->wr.wr.atomic.swap); + goto send_comp; + + default: + send_status = IB_WC_LOC_QP_OP_ERR; + goto serr; + } + + sge = &sqp->s_sge.sge; + while (sqp->s_len) { + u32 len = sqp->s_len; + + if (len > sge->length) + len = sge->length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + ipath_copy_sge(&qp->r_sge, sge->vaddr, len); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--sqp->s_sge.num_sge) + *sge = *sqp->s_sge.sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + sqp->s_len -= len; + } + + if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) + goto send_comp; + + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + wc.port_num = 1; + /* Signal completion event if the solicited bit is set. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + wqe->wr.send_flags & IB_SEND_SOLICITED); + +send_comp: + spin_lock_irqsave(&sqp->s_lock, flags); +flush_send: + sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; + ipath_send_complete(sqp, wqe, send_status); + goto again; + +rnr_nak: + /* Handle RNR NAK */ + if (qp->ibqp.qp_type == IB_QPT_UC) + goto send_comp; + /* + * Note: we don't need the s_lock held since the BUSY flag + * makes this single threaded. + */ + if (sqp->s_rnr_retry == 0) { + send_status = IB_WC_RNR_RETRY_EXC_ERR; + goto serr; + } + if (sqp->s_rnr_retry_cnt < 7) + sqp->s_rnr_retry--; + spin_lock_irqsave(&sqp->s_lock, flags); + if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_RECV_OK)) + goto clr_busy; + sqp->s_flags |= IPATH_S_WAITING; + dev->n_rnr_naks++; + sqp->s_rnr_timeout = ib_ipath_rnr_table[qp->r_min_rnr_timer]; + ipath_insert_rnr_queue(sqp); + goto clr_busy; + +inv_err: + send_status = IB_WC_REM_INV_REQ_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +acc_err: + send_status = IB_WC_REM_ACCESS_ERR; + wc.status = IB_WC_LOC_PROT_ERR; +err: + /* responder goes to error state */ + ipath_rc_error(qp, wc.status); + +serr: + spin_lock_irqsave(&sqp->s_lock, flags); + ipath_send_complete(sqp, wqe, send_status); + if (sqp->ibqp.qp_type == IB_QPT_RC) { + int lastwqe = ipath_error_qp(sqp, IB_WC_WR_FLUSH_ERR); + + sqp->s_flags &= ~IPATH_S_BUSY; + spin_unlock_irqrestore(&sqp->s_lock, flags); + if (lastwqe) { + struct ib_event ev; + + ev.device = sqp->ibqp.device; + ev.element.qp = &sqp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context); + } + goto done; + } +clr_busy: + sqp->s_flags &= ~IPATH_S_BUSY; +unlock: + spin_unlock_irqrestore(&sqp->s_lock, flags); +done: + if (qp && atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +} + +static void want_buffer(struct ipath_devdata *dd, struct ipath_qp *qp) +{ + if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA) || + qp->ibqp.qp_type == IB_QPT_SMI) { + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + } +} + +/** + * ipath_no_bufs_available - tell the layer driver we need buffers + * @qp: the QP that caused the problem + * @dev: the device we ran out of buffers on + * + * Called when we run out of PIO buffers. + * If we are now in the error state, return zero to flush the + * send work request. + */ +static int ipath_no_bufs_available(struct ipath_qp *qp, + struct ipath_ibdev *dev) +{ + unsigned long flags; + int ret = 1; + + /* + * Note that as soon as want_buffer() is called and + * possibly before it returns, ipath_ib_piobufavail() + * could be called. Therefore, put QP on the piowait list before + * enabling the PIO avail interrupt. + */ + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) { + dev->n_piowait++; + qp->s_flags |= IPATH_S_WAITING; + qp->s_flags &= ~IPATH_S_BUSY; + spin_lock(&dev->pending_lock); + if (list_empty(&qp->piowait)) + list_add_tail(&qp->piowait, &dev->piowait); + spin_unlock(&dev->pending_lock); + } else + ret = 0; + spin_unlock_irqrestore(&qp->s_lock, flags); + if (ret) + want_buffer(dev->dd, qp); + return ret; +} + +/** + * ipath_make_grh - construct a GRH header + * @dev: a pointer to the ipath device + * @hdr: a pointer to the GRH header being constructed + * @grh: the global route address to send to + * @hwords: the number of 32 bit words of header being sent + * @nwords: the number of 32 bit words of data being sent + * + * Return the size of the header in 32 bit words. + */ +u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr, + struct ib_global_route *grh, u32 hwords, u32 nwords) +{ + hdr->version_tclass_flow = + cpu_to_be32((6 << 28) | + (grh->traffic_class << 20) | + grh->flow_label); + hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2); + /* next_hdr is defined by C8-7 in ch. 8.4.1 */ + hdr->next_hdr = 0x1B; + hdr->hop_limit = grh->hop_limit; + /* The SGID is 32-bit aligned. */ + hdr->sgid.global.subnet_prefix = dev->gid_prefix; + hdr->sgid.global.interface_id = dev->dd->ipath_guid; + hdr->dgid = grh->dgid; + + /* GRH header size in 32-bit words. */ + return sizeof(struct ib_grh) / sizeof(u32); +} + +void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp, + struct ipath_other_headers *ohdr, + u32 bth0, u32 bth2) +{ + u16 lrh0; + u32 nwords; + u32 extra_bytes; + + /* Construct the header. */ + extra_bytes = -qp->s_cur_size & 3; + nwords = (qp->s_cur_size + extra_bytes) >> 2; + lrh0 = IPATH_LRH_BTH; + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { + qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh, + &qp->remote_ah_attr.grh, + qp->s_hdrwords, nwords); + lrh0 = IPATH_LRH_GRH; + } + lrh0 |= qp->remote_ah_attr.sl << 4; + qp->s_hdr.lrh[0] = cpu_to_be16(lrh0); + qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); + qp->s_hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid | + qp->remote_ah_attr.src_path_bits); + bth0 |= ipath_get_pkey(dev->dd, qp->s_pkey_index); + bth0 |= extra_bytes << 20; + ohdr->bth[0] = cpu_to_be32(bth0 | (1 << 22)); + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); + ohdr->bth[2] = cpu_to_be32(bth2); +} + +/** + * ipath_do_send - perform a send on a QP + * @data: contains a pointer to the QP + * + * Process entries in the send work queue until credit or queue is + * exhausted. Only allow one CPU to send a packet per QP (tasklet). + * Otherwise, two threads could send packets out of order. + */ +void ipath_do_send(unsigned long data) +{ + struct ipath_qp *qp = (struct ipath_qp *)data; + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + int (*make_req)(struct ipath_qp *qp); + unsigned long flags; + + if ((qp->ibqp.qp_type == IB_QPT_RC || + qp->ibqp.qp_type == IB_QPT_UC) && + qp->remote_ah_attr.dlid == dev->dd->ipath_lid) { + ipath_ruc_loopback(qp); + goto bail; + } + + if (qp->ibqp.qp_type == IB_QPT_RC) + make_req = ipath_make_rc_req; + else if (qp->ibqp.qp_type == IB_QPT_UC) + make_req = ipath_make_uc_req; + else + make_req = ipath_make_ud_req; + + spin_lock_irqsave(&qp->s_lock, flags); + + /* Return if we are already busy processing a work request. */ + if ((qp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) || + !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) { + spin_unlock_irqrestore(&qp->s_lock, flags); + goto bail; + } + + qp->s_flags |= IPATH_S_BUSY; + + spin_unlock_irqrestore(&qp->s_lock, flags); + +again: + /* Check for a constructed packet to be sent. */ + if (qp->s_hdrwords != 0) { + /* + * If no PIO bufs are available, return. An interrupt will + * call ipath_ib_piobufavail() when one is available. + */ + if (ipath_verbs_send(qp, &qp->s_hdr, qp->s_hdrwords, + qp->s_cur_sge, qp->s_cur_size)) { + if (ipath_no_bufs_available(qp, dev)) + goto bail; + } + dev->n_unicast_xmit++; + /* Record that we sent the packet and s_hdr is empty. */ + qp->s_hdrwords = 0; + } + + if (make_req(qp)) + goto again; + +bail:; +} + +/* + * This should be called with s_lock held. + */ +void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe, + enum ib_wc_status status) +{ + u32 old_last, last; + + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) + return; + + /* See ch. 11.2.4.1 and 10.7.3.1 */ + if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED) || + status != IB_WC_SUCCESS) { + struct ib_wc wc; + + memset(&wc, 0, sizeof wc); + wc.wr_id = wqe->wr.wr_id; + wc.status = status; + wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode]; + wc.qp = &qp->ibqp; + if (status == IB_WC_SUCCESS) + wc.byte_len = wqe->length; + ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, + status != IB_WC_SUCCESS); + } + + old_last = last = qp->s_last; + if (++last >= qp->s_size) + last = 0; + qp->s_last = last; + if (qp->s_cur == old_last) + qp->s_cur = last; + if (qp->s_tail == old_last) + qp->s_tail = last; + if (qp->state == IB_QPS_SQD && last == qp->s_cur) + qp->s_draining = 0; +} diff --git a/drivers/staging/rdma/ipath/ipath_sdma.c b/drivers/staging/rdma/ipath/ipath_sdma.c new file mode 100644 index 000000000..17a517766 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_sdma.c @@ -0,0 +1,818 @@ +/* + * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "ipath_kernel.h" +#include "ipath_verbs.h" +#include "ipath_common.h" + +#define SDMA_DESCQ_SZ PAGE_SIZE /* 256 entries per 4KB page */ + +static void vl15_watchdog_enq(struct ipath_devdata *dd) +{ + /* ipath_sdma_lock must already be held */ + if (atomic_inc_return(&dd->ipath_sdma_vl15_count) == 1) { + unsigned long interval = (HZ + 19) / 20; + dd->ipath_sdma_vl15_timer.expires = jiffies + interval; + add_timer(&dd->ipath_sdma_vl15_timer); + } +} + +static void vl15_watchdog_deq(struct ipath_devdata *dd) +{ + /* ipath_sdma_lock must already be held */ + if (atomic_dec_return(&dd->ipath_sdma_vl15_count) != 0) { + unsigned long interval = (HZ + 19) / 20; + mod_timer(&dd->ipath_sdma_vl15_timer, jiffies + interval); + } else { + del_timer(&dd->ipath_sdma_vl15_timer); + } +} + +static void vl15_watchdog_timeout(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *)opaque; + + if (atomic_read(&dd->ipath_sdma_vl15_count) != 0) { + ipath_dbg("vl15 watchdog timeout - clearing\n"); + ipath_cancel_sends(dd, 1); + ipath_hol_down(dd); + } else { + ipath_dbg("vl15 watchdog timeout - " + "condition already cleared\n"); + } +} + +static void unmap_desc(struct ipath_devdata *dd, unsigned head) +{ + __le64 *descqp = &dd->ipath_sdma_descq[head].qw[0]; + u64 desc[2]; + dma_addr_t addr; + size_t len; + + desc[0] = le64_to_cpu(descqp[0]); + desc[1] = le64_to_cpu(descqp[1]); + + addr = (desc[1] << 32) | (desc[0] >> 32); + len = (desc[0] >> 14) & (0x7ffULL << 2); + dma_unmap_single(&dd->pcidev->dev, addr, len, DMA_TO_DEVICE); +} + +/* + * ipath_sdma_lock should be locked before calling this. + */ +int ipath_sdma_make_progress(struct ipath_devdata *dd) +{ + struct list_head *lp = NULL; + struct ipath_sdma_txreq *txp = NULL; + u16 dmahead; + u16 start_idx = 0; + int progress = 0; + + if (!list_empty(&dd->ipath_sdma_activelist)) { + lp = dd->ipath_sdma_activelist.next; + txp = list_entry(lp, struct ipath_sdma_txreq, list); + start_idx = txp->start_idx; + } + + /* + * Read the SDMA head register in order to know that the + * interrupt clear has been written to the chip. + * Otherwise, we may not get an interrupt for the last + * descriptor in the queue. + */ + dmahead = (u16)ipath_read_kreg32(dd, dd->ipath_kregs->kr_senddmahead); + /* sanity check return value for error handling (chip reset, etc.) */ + if (dmahead >= dd->ipath_sdma_descq_cnt) + goto done; + + while (dd->ipath_sdma_descq_head != dmahead) { + if (txp && txp->flags & IPATH_SDMA_TXREQ_F_FREEDESC && + dd->ipath_sdma_descq_head == start_idx) { + unmap_desc(dd, dd->ipath_sdma_descq_head); + start_idx++; + if (start_idx == dd->ipath_sdma_descq_cnt) + start_idx = 0; + } + + /* increment free count and head */ + dd->ipath_sdma_descq_removed++; + if (++dd->ipath_sdma_descq_head == dd->ipath_sdma_descq_cnt) + dd->ipath_sdma_descq_head = 0; + + if (txp && txp->next_descq_idx == dd->ipath_sdma_descq_head) { + /* move to notify list */ + if (txp->flags & IPATH_SDMA_TXREQ_F_VL15) + vl15_watchdog_deq(dd); + list_move_tail(lp, &dd->ipath_sdma_notifylist); + if (!list_empty(&dd->ipath_sdma_activelist)) { + lp = dd->ipath_sdma_activelist.next; + txp = list_entry(lp, struct ipath_sdma_txreq, + list); + start_idx = txp->start_idx; + } else { + lp = NULL; + txp = NULL; + } + } + progress = 1; + } + + if (progress) + tasklet_hi_schedule(&dd->ipath_sdma_notify_task); + +done: + return progress; +} + +static void ipath_sdma_notify(struct ipath_devdata *dd, struct list_head *list) +{ + struct ipath_sdma_txreq *txp, *txp_next; + + list_for_each_entry_safe(txp, txp_next, list, list) { + list_del_init(&txp->list); + + if (txp->callback) + (*txp->callback)(txp->callback_cookie, + txp->callback_status); + } +} + +static void sdma_notify_taskbody(struct ipath_devdata *dd) +{ + unsigned long flags; + struct list_head list; + + INIT_LIST_HEAD(&list); + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + + list_splice_init(&dd->ipath_sdma_notifylist, &list); + + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + ipath_sdma_notify(dd, &list); + + /* + * The IB verbs layer needs to see the callback before getting + * the call to ipath_ib_piobufavail() because the callback + * handles releasing resources the next send will need. + * Otherwise, we could do these calls in + * ipath_sdma_make_progress(). + */ + ipath_ib_piobufavail(dd->verbs_dev); +} + +static void sdma_notify_task(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *)opaque; + + if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + sdma_notify_taskbody(dd); +} + +static void dump_sdma_state(struct ipath_devdata *dd) +{ + unsigned long reg; + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmastatus); + ipath_cdbg(VERBOSE, "kr_senddmastatus: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendctrl); + ipath_cdbg(VERBOSE, "kr_sendctrl: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask0); + ipath_cdbg(VERBOSE, "kr_senddmabufmask0: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask1); + ipath_cdbg(VERBOSE, "kr_senddmabufmask1: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask2); + ipath_cdbg(VERBOSE, "kr_senddmabufmask2: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail); + ipath_cdbg(VERBOSE, "kr_senddmatail: 0x%016lx\n", reg); + + reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead); + ipath_cdbg(VERBOSE, "kr_senddmahead: 0x%016lx\n", reg); +} + +static void sdma_abort_task(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *) opaque; + u64 status; + unsigned long flags; + + if (test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + return; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + + status = dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK; + + /* nothing to do */ + if (status == IPATH_SDMA_ABORT_NONE) + goto unlock; + + /* ipath_sdma_abort() is done, waiting for interrupt */ + if (status == IPATH_SDMA_ABORT_DISARMED) { + if (time_before(jiffies, dd->ipath_sdma_abort_intr_timeout)) + goto resched_noprint; + /* give up, intr got lost somewhere */ + ipath_dbg("give up waiting for SDMADISABLED intr\n"); + __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status); + status = IPATH_SDMA_ABORT_ABORTED; + } + + /* everything is stopped, time to clean up and restart */ + if (status == IPATH_SDMA_ABORT_ABORTED) { + struct ipath_sdma_txreq *txp, *txpnext; + u64 hwstatus; + int notify = 0; + + hwstatus = ipath_read_kreg64(dd, + dd->ipath_kregs->kr_senddmastatus); + + if ((hwstatus & (IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG | + IPATH_SDMA_STATUS_ABORT_IN_PROG | + IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE)) || + !(hwstatus & IPATH_SDMA_STATUS_SCB_EMPTY)) { + if (dd->ipath_sdma_reset_wait > 0) { + /* not done shutting down sdma */ + --dd->ipath_sdma_reset_wait; + goto resched; + } + ipath_cdbg(VERBOSE, "gave up waiting for quiescent " + "status after SDMA reset, continuing\n"); + dump_sdma_state(dd); + } + + /* dequeue all "sent" requests */ + list_for_each_entry_safe(txp, txpnext, + &dd->ipath_sdma_activelist, list) { + txp->callback_status = IPATH_SDMA_TXREQ_S_ABORTED; + if (txp->flags & IPATH_SDMA_TXREQ_F_VL15) + vl15_watchdog_deq(dd); + list_move_tail(&txp->list, &dd->ipath_sdma_notifylist); + notify = 1; + } + if (notify) + tasklet_hi_schedule(&dd->ipath_sdma_notify_task); + + /* reset our notion of head and tail */ + dd->ipath_sdma_descq_tail = 0; + dd->ipath_sdma_descq_head = 0; + dd->ipath_sdma_head_dma[0] = 0; + dd->ipath_sdma_generation = 0; + dd->ipath_sdma_descq_removed = dd->ipath_sdma_descq_added; + + /* Reset SendDmaLenGen */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, + (u64) dd->ipath_sdma_descq_cnt | (1ULL << 18)); + + /* done with sdma state for a bit */ + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + /* + * Don't restart sdma here (with the exception + * below). Wait until link is up to ACTIVE. VL15 MADs + * used to bring the link up use PIO, and multiple link + * transitions otherwise cause the sdma engine to be + * stopped and started multiple times. + * The disable is done here, including the shadow, + * so the state is kept consistent. + * See ipath_restart_sdma() for the actual starting + * of sdma. + */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + /* make sure I see next message */ + dd->ipath_sdma_abort_jiffies = 0; + + /* + * Not everything that takes SDMA offline is a link + * status change. If the link was up, restart SDMA. + */ + if (dd->ipath_flags & IPATH_LINKACTIVE) + ipath_restart_sdma(dd); + + goto done; + } + +resched: + /* + * for now, keep spinning + * JAG - this is bad to just have default be a loop without + * state change + */ + if (time_after(jiffies, dd->ipath_sdma_abort_jiffies)) { + ipath_dbg("looping with status 0x%08lx\n", + dd->ipath_sdma_status); + dd->ipath_sdma_abort_jiffies = jiffies + 5 * HZ; + } +resched_noprint: + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + tasklet_hi_schedule(&dd->ipath_sdma_abort_task); + return; + +unlock: + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); +done: + return; +} + +/* + * This is called from interrupt context. + */ +void ipath_sdma_intr(struct ipath_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + + (void) ipath_sdma_make_progress(dd); + + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); +} + +static int alloc_sdma(struct ipath_devdata *dd) +{ + int ret = 0; + + /* Allocate memory for SendDMA descriptor FIFO */ + dd->ipath_sdma_descq = dma_alloc_coherent(&dd->pcidev->dev, + SDMA_DESCQ_SZ, &dd->ipath_sdma_descq_phys, GFP_KERNEL); + + if (!dd->ipath_sdma_descq) { + ipath_dev_err(dd, "failed to allocate SendDMA descriptor " + "FIFO memory\n"); + ret = -ENOMEM; + goto done; + } + + dd->ipath_sdma_descq_cnt = + SDMA_DESCQ_SZ / sizeof(struct ipath_sdma_desc); + + /* Allocate memory for DMA of head register to memory */ + dd->ipath_sdma_head_dma = dma_alloc_coherent(&dd->pcidev->dev, + PAGE_SIZE, &dd->ipath_sdma_head_phys, GFP_KERNEL); + if (!dd->ipath_sdma_head_dma) { + ipath_dev_err(dd, "failed to allocate SendDMA head memory\n"); + ret = -ENOMEM; + goto cleanup_descq; + } + dd->ipath_sdma_head_dma[0] = 0; + + init_timer(&dd->ipath_sdma_vl15_timer); + dd->ipath_sdma_vl15_timer.function = vl15_watchdog_timeout; + dd->ipath_sdma_vl15_timer.data = (unsigned long)dd; + atomic_set(&dd->ipath_sdma_vl15_count, 0); + + goto done; + +cleanup_descq: + dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ, + (void *)dd->ipath_sdma_descq, dd->ipath_sdma_descq_phys); + dd->ipath_sdma_descq = NULL; + dd->ipath_sdma_descq_phys = 0; +done: + return ret; +} + +int setup_sdma(struct ipath_devdata *dd) +{ + int ret = 0; + unsigned i, n; + u64 tmp64; + u64 senddmabufmask[3] = { 0 }; + unsigned long flags; + + ret = alloc_sdma(dd); + if (ret) + goto done; + + if (!dd->ipath_sdma_descq) { + ipath_dev_err(dd, "SendDMA memory not allocated\n"); + goto done; + } + + /* + * Set initial status as if we had been up, then gone down. + * This lets initial start on transition to ACTIVE be the + * same as restart after link flap. + */ + dd->ipath_sdma_status = IPATH_SDMA_ABORT_ABORTED; + dd->ipath_sdma_abort_jiffies = 0; + dd->ipath_sdma_generation = 0; + dd->ipath_sdma_descq_tail = 0; + dd->ipath_sdma_descq_head = 0; + dd->ipath_sdma_descq_removed = 0; + dd->ipath_sdma_descq_added = 0; + + /* Set SendDmaBase */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase, + dd->ipath_sdma_descq_phys); + /* Set SendDmaLenGen */ + tmp64 = dd->ipath_sdma_descq_cnt; + tmp64 |= 1<<18; /* enable generation checking */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, tmp64); + /* Set SendDmaTail */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, + dd->ipath_sdma_descq_tail); + /* Set SendDmaHeadAddr */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr, + dd->ipath_sdma_head_phys); + + /* + * Reserve all the former "kernel" piobufs, using high number range + * so we get as many 4K buffers as possible + */ + n = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; + i = dd->ipath_lastport_piobuf + dd->ipath_pioreserved; + ipath_chg_pioavailkernel(dd, i, n - i , 0); + for (; i < n; ++i) { + unsigned word = i / 64; + unsigned bit = i & 63; + BUG_ON(word >= 3); + senddmabufmask[word] |= 1ULL << bit; + } + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0, + senddmabufmask[0]); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1, + senddmabufmask[1]); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2, + senddmabufmask[2]); + + INIT_LIST_HEAD(&dd->ipath_sdma_activelist); + INIT_LIST_HEAD(&dd->ipath_sdma_notifylist); + + tasklet_init(&dd->ipath_sdma_notify_task, sdma_notify_task, + (unsigned long) dd); + tasklet_init(&dd->ipath_sdma_abort_task, sdma_abort_task, + (unsigned long) dd); + + /* + * No use to turn on SDMA here, as link is probably not ACTIVE + * Just mark it RUNNING and enable the interrupt, and let the + * ipath_restart_sdma() on link transition to ACTIVE actually + * enable it. + */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl |= INFINIPATH_S_SDMAINTENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + __set_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + +done: + return ret; +} + +void teardown_sdma(struct ipath_devdata *dd) +{ + struct ipath_sdma_txreq *txp, *txpnext; + unsigned long flags; + dma_addr_t sdma_head_phys = 0; + dma_addr_t sdma_descq_phys = 0; + void *sdma_descq = NULL; + void *sdma_head_dma = NULL; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + __clear_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status); + __set_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status); + __set_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + tasklet_kill(&dd->ipath_sdma_abort_task); + tasklet_kill(&dd->ipath_sdma_notify_task); + + /* turn off sdma */ + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, + dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + /* dequeue all "sent" requests */ + list_for_each_entry_safe(txp, txpnext, &dd->ipath_sdma_activelist, + list) { + txp->callback_status = IPATH_SDMA_TXREQ_S_SHUTDOWN; + if (txp->flags & IPATH_SDMA_TXREQ_F_VL15) + vl15_watchdog_deq(dd); + list_move_tail(&txp->list, &dd->ipath_sdma_notifylist); + } + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + sdma_notify_taskbody(dd); + + del_timer_sync(&dd->ipath_sdma_vl15_timer); + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + + dd->ipath_sdma_abort_jiffies = 0; + + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1, 0); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2, 0); + + if (dd->ipath_sdma_head_dma) { + sdma_head_dma = (void *) dd->ipath_sdma_head_dma; + sdma_head_phys = dd->ipath_sdma_head_phys; + dd->ipath_sdma_head_dma = NULL; + dd->ipath_sdma_head_phys = 0; + } + + if (dd->ipath_sdma_descq) { + sdma_descq = dd->ipath_sdma_descq; + sdma_descq_phys = dd->ipath_sdma_descq_phys; + dd->ipath_sdma_descq = NULL; + dd->ipath_sdma_descq_phys = 0; + } + + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + if (sdma_head_dma) + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + sdma_head_dma, sdma_head_phys); + + if (sdma_descq) + dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ, + sdma_descq, sdma_descq_phys); +} + +/* + * [Re]start SDMA, if we use it, and it's not already OK. + * This is called on transition to link ACTIVE, either the first or + * subsequent times. + */ +void ipath_restart_sdma(struct ipath_devdata *dd) +{ + unsigned long flags; + int needed = 1; + + if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA)) + goto bail; + + /* + * First, make sure we should, which is to say, + * check that we are "RUNNING" (not in teardown) + * and not "SHUTDOWN" + */ + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + if (!test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status) + || test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status)) + needed = 0; + else { + __clear_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status); + __clear_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status); + __clear_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status); + } + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + if (!needed) { + ipath_dbg("invalid attempt to restart SDMA, status 0x%08lx\n", + dd->ipath_sdma_status); + goto bail; + } + spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags); + /* + * First clear, just to be safe. Enable is only done + * in chip on 0->1 transition + */ + dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + dd->ipath_sendctrl |= INFINIPATH_S_SDMAENABLE; + ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl); + ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch); + spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags); + + /* notify upper layers */ + ipath_ib_piobufavail(dd->verbs_dev); + +bail: + return; +} + +static inline void make_sdma_desc(struct ipath_devdata *dd, + u64 *sdmadesc, u64 addr, u64 dwlen, u64 dwoffset) +{ + WARN_ON(addr & 3); + /* SDmaPhyAddr[47:32] */ + sdmadesc[1] = addr >> 32; + /* SDmaPhyAddr[31:0] */ + sdmadesc[0] = (addr & 0xfffffffcULL) << 32; + /* SDmaGeneration[1:0] */ + sdmadesc[0] |= (dd->ipath_sdma_generation & 3ULL) << 30; + /* SDmaDwordCount[10:0] */ + sdmadesc[0] |= (dwlen & 0x7ffULL) << 16; + /* SDmaBufOffset[12:2] */ + sdmadesc[0] |= dwoffset & 0x7ffULL; +} + +/* + * This function queues one IB packet onto the send DMA queue per call. + * The caller is responsible for checking: + * 1) The number of send DMA descriptor entries is less than the size of + * the descriptor queue. + * 2) The IB SGE addresses and lengths are 32-bit aligned + * (except possibly the last SGE's length) + * 3) The SGE addresses are suitable for passing to dma_map_single(). + */ +int ipath_sdma_verbs_send(struct ipath_devdata *dd, + struct ipath_sge_state *ss, u32 dwords, + struct ipath_verbs_txreq *tx) +{ + + unsigned long flags; + struct ipath_sge *sge; + int ret = 0; + u16 tail; + __le64 *descqp; + u64 sdmadesc[2]; + u32 dwoffset; + dma_addr_t addr; + + if ((tx->map_len + (dwords<<2)) > dd->ipath_ibmaxlen) { + ipath_dbg("packet size %X > ibmax %X, fail\n", + tx->map_len + (dwords<<2), dd->ipath_ibmaxlen); + ret = -EMSGSIZE; + goto fail; + } + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + +retry: + if (unlikely(test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status))) { + ret = -EBUSY; + goto unlock; + } + + if (tx->txreq.sg_count > ipath_sdma_descq_freecnt(dd)) { + if (ipath_sdma_make_progress(dd)) + goto retry; + ret = -ENOBUFS; + goto unlock; + } + + addr = dma_map_single(&dd->pcidev->dev, tx->txreq.map_addr, + tx->map_len, DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, addr)) + goto ioerr; + + dwoffset = tx->map_len >> 2; + make_sdma_desc(dd, sdmadesc, (u64) addr, dwoffset, 0); + + /* SDmaFirstDesc */ + sdmadesc[0] |= 1ULL << 12; + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF) + sdmadesc[0] |= 1ULL << 14; /* SDmaUseLargeBuf */ + + /* write to the descq */ + tail = dd->ipath_sdma_descq_tail; + descqp = &dd->ipath_sdma_descq[tail].qw[0]; + *descqp++ = cpu_to_le64(sdmadesc[0]); + *descqp++ = cpu_to_le64(sdmadesc[1]); + + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEDESC) + tx->txreq.start_idx = tail; + + /* increment the tail */ + if (++tail == dd->ipath_sdma_descq_cnt) { + tail = 0; + descqp = &dd->ipath_sdma_descq[0].qw[0]; + ++dd->ipath_sdma_generation; + } + + sge = &ss->sge; + while (dwords) { + u32 dw; + u32 len; + + len = dwords << 2; + if (len > sge->length) + len = sge->length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + dw = (len + 3) >> 2; + addr = dma_map_single(&dd->pcidev->dev, sge->vaddr, dw << 2, + DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, addr)) + goto unmap; + make_sdma_desc(dd, sdmadesc, (u64) addr, dw, dwoffset); + /* SDmaUseLargeBuf has to be set in every descriptor */ + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF) + sdmadesc[0] |= 1ULL << 14; + /* write to the descq */ + *descqp++ = cpu_to_le64(sdmadesc[0]); + *descqp++ = cpu_to_le64(sdmadesc[1]); + + /* increment the tail */ + if (++tail == dd->ipath_sdma_descq_cnt) { + tail = 0; + descqp = &dd->ipath_sdma_descq[0].qw[0]; + ++dd->ipath_sdma_generation; + } + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + + dwoffset += dw; + dwords -= dw; + } + + if (!tail) + descqp = &dd->ipath_sdma_descq[dd->ipath_sdma_descq_cnt].qw[0]; + descqp -= 2; + /* SDmaLastDesc */ + descqp[0] |= cpu_to_le64(1ULL << 11); + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_INTREQ) { + /* SDmaIntReq */ + descqp[0] |= cpu_to_le64(1ULL << 15); + } + + /* Commit writes to memory and advance the tail on the chip */ + wmb(); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail); + + tx->txreq.next_descq_idx = tail; + tx->txreq.callback_status = IPATH_SDMA_TXREQ_S_OK; + dd->ipath_sdma_descq_tail = tail; + dd->ipath_sdma_descq_added += tx->txreq.sg_count; + list_add_tail(&tx->txreq.list, &dd->ipath_sdma_activelist); + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_VL15) + vl15_watchdog_enq(dd); + goto unlock; + +unmap: + while (tail != dd->ipath_sdma_descq_tail) { + if (!tail) + tail = dd->ipath_sdma_descq_cnt - 1; + else + tail--; + unmap_desc(dd, tail); + } +ioerr: + ret = -EIO; +unlock: + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); +fail: + return ret; +} diff --git a/drivers/staging/rdma/ipath/ipath_srq.c b/drivers/staging/rdma/ipath/ipath_srq.c new file mode 100644 index 000000000..26271984b --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_srq.c @@ -0,0 +1,380 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "ipath_verbs.h" + +/** + * ipath_post_srq_receive - post a receive on a shared receive queue + * @ibsrq: the SRQ to post the receive on + * @wr: the list of work requests to post + * @bad_wr: the first WR to cause a problem is put here + * + * This may be called from interrupt context. + */ +int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct ipath_srq *srq = to_isrq(ibsrq); + struct ipath_rwq *wq; + unsigned long flags; + int ret; + + for (; wr; wr = wr->next) { + struct ipath_rwqe *wqe; + u32 next; + int i; + + if ((unsigned) wr->num_sge > srq->rq.max_sge) { + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + + spin_lock_irqsave(&srq->rq.lock, flags); + wq = srq->rq.wq; + next = wq->head + 1; + if (next >= srq->rq.size) + next = 0; + if (next == wq->tail) { + spin_unlock_irqrestore(&srq->rq.lock, flags); + *bad_wr = wr; + ret = -ENOMEM; + goto bail; + } + + wqe = get_rwqe_ptr(&srq->rq, wq->head); + wqe->wr_id = wr->wr_id; + wqe->num_sge = wr->num_sge; + for (i = 0; i < wr->num_sge; i++) + wqe->sg_list[i] = wr->sg_list[i]; + /* Make sure queue entry is written before the head index. */ + smp_wmb(); + wq->head = next; + spin_unlock_irqrestore(&srq->rq.lock, flags); + } + ret = 0; + +bail: + return ret; +} + +/** + * ipath_create_srq - create a shared receive queue + * @ibpd: the protection domain of the SRQ to create + * @srq_init_attr: the attributes of the SRQ + * @udata: data from libipathverbs when creating a user SRQ + */ +struct ib_srq *ipath_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata) +{ + struct ipath_ibdev *dev = to_idev(ibpd->device); + struct ipath_srq *srq; + u32 sz; + struct ib_srq *ret; + + if (srq_init_attr->srq_type != IB_SRQT_BASIC) { + ret = ERR_PTR(-ENOSYS); + goto done; + } + + if (srq_init_attr->attr.max_wr == 0) { + ret = ERR_PTR(-EINVAL); + goto done; + } + + if ((srq_init_attr->attr.max_sge > ib_ipath_max_srq_sges) || + (srq_init_attr->attr.max_wr > ib_ipath_max_srq_wrs)) { + ret = ERR_PTR(-EINVAL); + goto done; + } + + srq = kmalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) { + ret = ERR_PTR(-ENOMEM); + goto done; + } + + /* + * Need to use vmalloc() if we want to support large #s of entries. + */ + srq->rq.size = srq_init_attr->attr.max_wr + 1; + srq->rq.max_sge = srq_init_attr->attr.max_sge; + sz = sizeof(struct ib_sge) * srq->rq.max_sge + + sizeof(struct ipath_rwqe); + srq->rq.wq = vmalloc_user(sizeof(struct ipath_rwq) + srq->rq.size * sz); + if (!srq->rq.wq) { + ret = ERR_PTR(-ENOMEM); + goto bail_srq; + } + + /* + * Return the address of the RWQ as the offset to mmap. + * See ipath_mmap() for details. + */ + if (udata && udata->outlen >= sizeof(__u64)) { + int err; + u32 s = sizeof(struct ipath_rwq) + srq->rq.size * sz; + + srq->ip = + ipath_create_mmap_info(dev, s, + ibpd->uobject->context, + srq->rq.wq); + if (!srq->ip) { + ret = ERR_PTR(-ENOMEM); + goto bail_wq; + } + + err = ib_copy_to_udata(udata, &srq->ip->offset, + sizeof(srq->ip->offset)); + if (err) { + ret = ERR_PTR(err); + goto bail_ip; + } + } else + srq->ip = NULL; + + /* + * ib_create_srq() will initialize srq->ibsrq. + */ + spin_lock_init(&srq->rq.lock); + srq->rq.wq->head = 0; + srq->rq.wq->tail = 0; + srq->limit = srq_init_attr->attr.srq_limit; + + spin_lock(&dev->n_srqs_lock); + if (dev->n_srqs_allocated == ib_ipath_max_srqs) { + spin_unlock(&dev->n_srqs_lock); + ret = ERR_PTR(-ENOMEM); + goto bail_ip; + } + + dev->n_srqs_allocated++; + spin_unlock(&dev->n_srqs_lock); + + if (srq->ip) { + spin_lock_irq(&dev->pending_lock); + list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + + ret = &srq->ibsrq; + goto done; + +bail_ip: + kfree(srq->ip); +bail_wq: + vfree(srq->rq.wq); +bail_srq: + kfree(srq); +done: + return ret; +} + +/** + * ipath_modify_srq - modify a shared receive queue + * @ibsrq: the SRQ to modify + * @attr: the new attributes of the SRQ + * @attr_mask: indicates which attributes to modify + * @udata: user data for ipathverbs.so + */ +int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, + struct ib_udata *udata) +{ + struct ipath_srq *srq = to_isrq(ibsrq); + struct ipath_rwq *wq; + int ret = 0; + + if (attr_mask & IB_SRQ_MAX_WR) { + struct ipath_rwq *owq; + struct ipath_rwqe *p; + u32 sz, size, n, head, tail; + + /* Check that the requested sizes are below the limits. */ + if ((attr->max_wr > ib_ipath_max_srq_wrs) || + ((attr_mask & IB_SRQ_LIMIT) ? + attr->srq_limit : srq->limit) > attr->max_wr) { + ret = -EINVAL; + goto bail; + } + + sz = sizeof(struct ipath_rwqe) + + srq->rq.max_sge * sizeof(struct ib_sge); + size = attr->max_wr + 1; + wq = vmalloc_user(sizeof(struct ipath_rwq) + size * sz); + if (!wq) { + ret = -ENOMEM; + goto bail; + } + + /* Check that we can write the offset to mmap. */ + if (udata && udata->inlen >= sizeof(__u64)) { + __u64 offset_addr; + __u64 offset = 0; + + ret = ib_copy_from_udata(&offset_addr, udata, + sizeof(offset_addr)); + if (ret) + goto bail_free; + udata->outbuf = + (void __user *) (unsigned long) offset_addr; + ret = ib_copy_to_udata(udata, &offset, + sizeof(offset)); + if (ret) + goto bail_free; + } + + spin_lock_irq(&srq->rq.lock); + /* + * validate head pointer value and compute + * the number of remaining WQEs. + */ + owq = srq->rq.wq; + head = owq->head; + if (head >= srq->rq.size) + head = 0; + tail = owq->tail; + if (tail >= srq->rq.size) + tail = 0; + n = head; + if (n < tail) + n += srq->rq.size - tail; + else + n -= tail; + if (size <= n) { + ret = -EINVAL; + goto bail_unlock; + } + n = 0; + p = wq->wq; + while (tail != head) { + struct ipath_rwqe *wqe; + int i; + + wqe = get_rwqe_ptr(&srq->rq, tail); + p->wr_id = wqe->wr_id; + p->num_sge = wqe->num_sge; + for (i = 0; i < wqe->num_sge; i++) + p->sg_list[i] = wqe->sg_list[i]; + n++; + p = (struct ipath_rwqe *)((char *) p + sz); + if (++tail >= srq->rq.size) + tail = 0; + } + srq->rq.wq = wq; + srq->rq.size = size; + wq->head = n; + wq->tail = 0; + if (attr_mask & IB_SRQ_LIMIT) + srq->limit = attr->srq_limit; + spin_unlock_irq(&srq->rq.lock); + + vfree(owq); + + if (srq->ip) { + struct ipath_mmap_info *ip = srq->ip; + struct ipath_ibdev *dev = to_idev(srq->ibsrq.device); + u32 s = sizeof(struct ipath_rwq) + size * sz; + + ipath_update_mmap_info(dev, ip, s, wq); + + /* + * Return the offset to mmap. + * See ipath_mmap() for details. + */ + if (udata && udata->inlen >= sizeof(__u64)) { + ret = ib_copy_to_udata(udata, &ip->offset, + sizeof(ip->offset)); + if (ret) + goto bail; + } + + spin_lock_irq(&dev->pending_lock); + if (list_empty(&ip->pending_mmaps)) + list_add(&ip->pending_mmaps, + &dev->pending_mmaps); + spin_unlock_irq(&dev->pending_lock); + } + } else if (attr_mask & IB_SRQ_LIMIT) { + spin_lock_irq(&srq->rq.lock); + if (attr->srq_limit >= srq->rq.size) + ret = -EINVAL; + else + srq->limit = attr->srq_limit; + spin_unlock_irq(&srq->rq.lock); + } + goto bail; + +bail_unlock: + spin_unlock_irq(&srq->rq.lock); +bail_free: + vfree(wq); +bail: + return ret; +} + +int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) +{ + struct ipath_srq *srq = to_isrq(ibsrq); + + attr->max_wr = srq->rq.size - 1; + attr->max_sge = srq->rq.max_sge; + attr->srq_limit = srq->limit; + return 0; +} + +/** + * ipath_destroy_srq - destroy a shared receive queue + * @ibsrq: the SRQ to destroy + */ +int ipath_destroy_srq(struct ib_srq *ibsrq) +{ + struct ipath_srq *srq = to_isrq(ibsrq); + struct ipath_ibdev *dev = to_idev(ibsrq->device); + + spin_lock(&dev->n_srqs_lock); + dev->n_srqs_allocated--; + spin_unlock(&dev->n_srqs_lock); + if (srq->ip) + kref_put(&srq->ip->ref, ipath_release_mmap_info); + else + vfree(srq->rq.wq); + kfree(srq); + + return 0; +} diff --git a/drivers/staging/rdma/ipath/ipath_stats.c b/drivers/staging/rdma/ipath/ipath_stats.c new file mode 100644 index 000000000..f63e143e3 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_stats.c @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ipath_kernel.h" + +struct infinipath_stats ipath_stats; + +/** + * ipath_snap_cntr - snapshot a chip counter + * @dd: the infinipath device + * @creg: the counter to snapshot + * + * called from add_timer and user counter read calls, to deal with + * counters that wrap in "human time". The words sent and received, and + * the packets sent and received are all that we worry about. For now, + * at least, we don't worry about error counters, because if they wrap + * that quickly, we probably don't care. We may eventually just make this + * handle all the counters. word counters can wrap in about 20 seconds + * of full bandwidth traffic, packet counters in a few hours. + */ + +u64 ipath_snap_cntr(struct ipath_devdata *dd, ipath_creg creg) +{ + u32 val, reg64 = 0; + u64 val64; + unsigned long t0, t1; + u64 ret; + + t0 = jiffies; + /* If fast increment counters are only 32 bits, snapshot them, + * and maintain them as 64bit values in the driver */ + if (!(dd->ipath_flags & IPATH_32BITCOUNTERS) && + (creg == dd->ipath_cregs->cr_wordsendcnt || + creg == dd->ipath_cregs->cr_wordrcvcnt || + creg == dd->ipath_cregs->cr_pktsendcnt || + creg == dd->ipath_cregs->cr_pktrcvcnt)) { + val64 = ipath_read_creg(dd, creg); + val = val64 == ~0ULL ? ~0U : 0; + reg64 = 1; + } else /* val64 just to keep gcc quiet... */ + val64 = val = ipath_read_creg32(dd, creg); + /* + * See if a second has passed. This is just a way to detect things + * that are quite broken. Normally this should take just a few + * cycles (the check is for long enough that we don't care if we get + * pre-empted.) An Opteron HT O read timeout is 4 seconds with + * normal NB values + */ + t1 = jiffies; + if (time_before(t0 + HZ, t1) && val == -1) { + ipath_dev_err(dd, "Error! Read counter 0x%x timed out\n", + creg); + ret = 0ULL; + goto bail; + } + if (reg64) { + ret = val64; + goto bail; + } + + if (creg == dd->ipath_cregs->cr_wordsendcnt) { + if (val != dd->ipath_lastsword) { + dd->ipath_sword += val - dd->ipath_lastsword; + dd->ipath_lastsword = val; + } + val64 = dd->ipath_sword; + } else if (creg == dd->ipath_cregs->cr_wordrcvcnt) { + if (val != dd->ipath_lastrword) { + dd->ipath_rword += val - dd->ipath_lastrword; + dd->ipath_lastrword = val; + } + val64 = dd->ipath_rword; + } else if (creg == dd->ipath_cregs->cr_pktsendcnt) { + if (val != dd->ipath_lastspkts) { + dd->ipath_spkts += val - dd->ipath_lastspkts; + dd->ipath_lastspkts = val; + } + val64 = dd->ipath_spkts; + } else if (creg == dd->ipath_cregs->cr_pktrcvcnt) { + if (val != dd->ipath_lastrpkts) { + dd->ipath_rpkts += val - dd->ipath_lastrpkts; + dd->ipath_lastrpkts = val; + } + val64 = dd->ipath_rpkts; + } else if (creg == dd->ipath_cregs->cr_ibsymbolerrcnt) { + if (dd->ibdeltainprog) + val64 -= val64 - dd->ibsymsnap; + val64 -= dd->ibsymdelta; + } else if (creg == dd->ipath_cregs->cr_iblinkerrrecovcnt) { + if (dd->ibdeltainprog) + val64 -= val64 - dd->iblnkerrsnap; + val64 -= dd->iblnkerrdelta; + } else + val64 = (u64) val; + + ret = val64; + +bail: + return ret; +} + +/** + * ipath_qcheck - print delta of egrfull/hdrqfull errors for kernel ports + * @dd: the infinipath device + * + * print the delta of egrfull/hdrqfull errors for kernel ports no more than + * every 5 seconds. User processes are printed at close, but kernel doesn't + * close, so... Separate routine so may call from other places someday, and + * so function name when printed by _IPATH_INFO is meaningfull + */ +static void ipath_qcheck(struct ipath_devdata *dd) +{ + static u64 last_tot_hdrqfull; + struct ipath_portdata *pd = dd->ipath_pd[0]; + size_t blen = 0; + char buf[128]; + u32 hdrqtail; + + *buf = 0; + if (pd->port_hdrqfull != dd->ipath_p0_hdrqfull) { + blen = snprintf(buf, sizeof buf, "port 0 hdrqfull %u", + pd->port_hdrqfull - + dd->ipath_p0_hdrqfull); + dd->ipath_p0_hdrqfull = pd->port_hdrqfull; + } + if (ipath_stats.sps_etidfull != dd->ipath_last_tidfull) { + blen += snprintf(buf + blen, sizeof buf - blen, + "%srcvegrfull %llu", + blen ? ", " : "", + (unsigned long long) + (ipath_stats.sps_etidfull - + dd->ipath_last_tidfull)); + dd->ipath_last_tidfull = ipath_stats.sps_etidfull; + } + + /* + * this is actually the number of hdrq full interrupts, not actual + * events, but at the moment that's mostly what I'm interested in. + * Actual count, etc. is in the counters, if needed. For production + * users this won't ordinarily be printed. + */ + + if ((ipath_debug & (__IPATH_PKTDBG | __IPATH_DBG)) && + ipath_stats.sps_hdrqfull != last_tot_hdrqfull) { + blen += snprintf(buf + blen, sizeof buf - blen, + "%shdrqfull %llu (all ports)", + blen ? ", " : "", + (unsigned long long) + (ipath_stats.sps_hdrqfull - + last_tot_hdrqfull)); + last_tot_hdrqfull = ipath_stats.sps_hdrqfull; + } + if (blen) + ipath_dbg("%s\n", buf); + + hdrqtail = ipath_get_hdrqtail(pd); + if (pd->port_head != hdrqtail) { + if (dd->ipath_lastport0rcv_cnt == + ipath_stats.sps_port0pkts) { + ipath_cdbg(PKT, "missing rcv interrupts? " + "port0 hd=%x tl=%x; port0pkts %llx; write" + " hd (w/intr)\n", + pd->port_head, hdrqtail, + (unsigned long long) + ipath_stats.sps_port0pkts); + ipath_write_ureg(dd, ur_rcvhdrhead, hdrqtail | + dd->ipath_rhdrhead_intr_off, pd->port_port); + } + dd->ipath_lastport0rcv_cnt = ipath_stats.sps_port0pkts; + } +} + +static void ipath_chk_errormask(struct ipath_devdata *dd) +{ + static u32 fixed; + u32 ctrl; + unsigned long errormask; + unsigned long hwerrs; + + if (!dd->ipath_errormask || !(dd->ipath_flags & IPATH_INITTED)) + return; + + errormask = ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask); + + if (errormask == dd->ipath_errormask) + return; + fixed++; + + hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus); + ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control); + + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); + + if ((hwerrs & dd->ipath_hwerrmask) || + (ctrl & INFINIPATH_C_FREEZEMODE)) { + /* force re-interrupt of pending events, just in case */ + ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, 0ULL); + ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL); + dev_info(&dd->pcidev->dev, + "errormask fixed(%u) %lx -> %lx, ctrl %x hwerr %lx\n", + fixed, errormask, (unsigned long)dd->ipath_errormask, + ctrl, hwerrs); + } else + ipath_dbg("errormask fixed(%u) %lx -> %lx, no freeze\n", + fixed, errormask, + (unsigned long)dd->ipath_errormask); +} + + +/** + * ipath_get_faststats - get word counters from chip before they overflow + * @opaque - contains a pointer to the infinipath device ipath_devdata + * + * called from add_timer + */ +void ipath_get_faststats(unsigned long opaque) +{ + struct ipath_devdata *dd = (struct ipath_devdata *) opaque; + int i; + static unsigned cnt; + unsigned long flags; + u64 traffic_wds; + + /* + * don't access the chip while running diags, or memory diags can + * fail + */ + if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_INITTED) || + ipath_diag_inuse) + /* but re-arm the timer, for diags case; won't hurt other */ + goto done; + + /* + * We now try to maintain a "active timer", based on traffic + * exceeding a threshold, so we need to check the word-counts + * even if they are 64-bit. + */ + traffic_wds = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt) + + ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt); + spin_lock_irqsave(&dd->ipath_eep_st_lock, flags); + traffic_wds -= dd->ipath_traffic_wds; + dd->ipath_traffic_wds += traffic_wds; + if (traffic_wds >= IPATH_TRAFFIC_ACTIVE_THRESHOLD) + atomic_add(5, &dd->ipath_active_time); /* S/B #define */ + spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags); + + if (dd->ipath_flags & IPATH_32BITCOUNTERS) { + ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt); + ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt); + } + + ipath_qcheck(dd); + + /* + * deal with repeat error suppression. Doesn't really matter if + * last error was almost a full interval ago, or just a few usecs + * ago; still won't get more than 2 per interval. We may want + * longer intervals for this eventually, could do with mod, counter + * or separate timer. Also see code in ipath_handle_errors() and + * ipath_handle_hwerrors(). + */ + + if (dd->ipath_lasterror) + dd->ipath_lasterror = 0; + if (dd->ipath_lasthwerror) + dd->ipath_lasthwerror = 0; + if (dd->ipath_maskederrs + && time_after(jiffies, dd->ipath_unmasktime)) { + char ebuf[256]; + int iserr; + iserr = ipath_decode_err(dd, ebuf, sizeof ebuf, + dd->ipath_maskederrs); + if (dd->ipath_maskederrs & + ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL | + INFINIPATH_E_PKTERRS)) + ipath_dev_err(dd, "Re-enabling masked errors " + "(%s)\n", ebuf); + else { + /* + * rcvegrfull and rcvhdrqfull are "normal", for some + * types of processes (mostly benchmarks) that send + * huge numbers of messages, while not processing + * them. So only complain about these at debug + * level. + */ + if (iserr) + ipath_dbg( + "Re-enabling queue full errors (%s)\n", + ebuf); + else + ipath_cdbg(ERRPKT, "Re-enabling packet" + " problem interrupt (%s)\n", ebuf); + } + + /* re-enable masked errors */ + dd->ipath_errormask |= dd->ipath_maskederrs; + ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, + dd->ipath_errormask); + dd->ipath_maskederrs = 0; + } + + /* limit qfull messages to ~one per minute per port */ + if ((++cnt & 0x10)) { + for (i = (int) dd->ipath_cfgports; --i >= 0; ) { + struct ipath_portdata *pd = dd->ipath_pd[i]; + + if (pd && pd->port_lastrcvhdrqtail != -1) + pd->port_lastrcvhdrqtail = -1; + } + } + + ipath_chk_errormask(dd); +done: + mod_timer(&dd->ipath_stats_timer, jiffies + HZ * 5); +} diff --git a/drivers/staging/rdma/ipath/ipath_sysfs.c b/drivers/staging/rdma/ipath/ipath_sysfs.c new file mode 100644 index 000000000..75558f33f --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_sysfs.c @@ -0,0 +1,1238 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "ipath_kernel.h" +#include "ipath_verbs.h" +#include "ipath_common.h" + +/** + * ipath_parse_ushort - parse an unsigned short value in an arbitrary base + * @str: the string containing the number + * @valp: where to put the result + * + * returns the number of bytes consumed, or negative value on error + */ +int ipath_parse_ushort(const char *str, unsigned short *valp) +{ + unsigned long val; + char *end; + int ret; + + if (!isdigit(str[0])) { + ret = -EINVAL; + goto bail; + } + + val = simple_strtoul(str, &end, 0); + + if (val > 0xffff) { + ret = -EINVAL; + goto bail; + } + + *valp = val; + + ret = end + 1 - str; + if (ret == 0) + ret = -EINVAL; + +bail: + return ret; +} + +static ssize_t show_version(struct device_driver *dev, char *buf) +{ + /* The string printed here is already newline-terminated. */ + return scnprintf(buf, PAGE_SIZE, "%s", ib_ipath_version); +} + +static ssize_t show_num_units(struct device_driver *dev, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%d\n", + ipath_count_units(NULL, NULL, NULL)); +} + +static ssize_t show_status(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + ssize_t ret; + + if (!dd->ipath_statusp) { + ret = -EINVAL; + goto bail; + } + + ret = scnprintf(buf, PAGE_SIZE, "0x%llx\n", + (unsigned long long) *(dd->ipath_statusp)); + +bail: + return ret; +} + +static const char *ipath_status_str[] = { + "Initted", + "Disabled", + "Admin_Disabled", + "", /* This used to be the old "OIB_SMA" status. */ + "", /* This used to be the old "SMA" status. */ + "Present", + "IB_link_up", + "IB_configured", + "NoIBcable", + "Fatal_Hardware_Error", + NULL, +}; + +static ssize_t show_status_str(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int i, any; + u64 s; + ssize_t ret; + + if (!dd->ipath_statusp) { + ret = -EINVAL; + goto bail; + } + + s = *(dd->ipath_statusp); + *buf = '\0'; + for (any = i = 0; s && ipath_status_str[i]; i++) { + if (s & 1) { + if (any && strlcat(buf, " ", PAGE_SIZE) >= + PAGE_SIZE) + /* overflow */ + break; + if (strlcat(buf, ipath_status_str[i], + PAGE_SIZE) >= PAGE_SIZE) + break; + any = 1; + } + s >>= 1; + } + if (any) + strlcat(buf, "\n", PAGE_SIZE); + + ret = strlen(buf); + +bail: + return ret; +} + +static ssize_t show_boardversion(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + /* The string printed here is already newline-terminated. */ + return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_boardversion); +} + +static ssize_t show_localbus_info(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + /* The string printed here is already newline-terminated. */ + return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_lbus_info); +} + +static ssize_t show_lmc(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_lmc); +} + +static ssize_t store_lmc(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u16 lmc = 0; + int ret; + + ret = ipath_parse_ushort(buf, &lmc); + if (ret < 0) + goto invalid; + + if (lmc > 7) { + ret = -EINVAL; + goto invalid; + } + + ipath_set_lid(dd, dd->ipath_lid, lmc); + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid LMC %u\n", lmc); +bail: + return ret; +} + +static ssize_t show_lid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_lid); +} + +static ssize_t store_lid(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u16 lid = 0; + int ret; + + ret = ipath_parse_ushort(buf, &lid); + if (ret < 0) + goto invalid; + + if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE) { + ret = -EINVAL; + goto invalid; + } + + ipath_set_lid(dd, lid, dd->ipath_lmc); + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid LID 0x%x\n", lid); +bail: + return ret; +} + +static ssize_t show_mlid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_mlid); +} + +static ssize_t store_mlid(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u16 mlid; + int ret; + + ret = ipath_parse_ushort(buf, &mlid); + if (ret < 0 || mlid < IPATH_MULTICAST_LID_BASE) + goto invalid; + + dd->ipath_mlid = mlid; + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid MLID\n"); +bail: + return ret; +} + +static ssize_t show_guid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u8 *guid; + + guid = (u8 *) & (dd->ipath_guid); + + return scnprintf(buf, PAGE_SIZE, + "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n", + guid[0], guid[1], guid[2], guid[3], + guid[4], guid[5], guid[6], guid[7]); +} + +static ssize_t store_guid(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + ssize_t ret; + unsigned short guid[8]; + __be64 new_guid; + u8 *ng; + int i; + + if (sscanf(buf, "%hx:%hx:%hx:%hx:%hx:%hx:%hx:%hx", + &guid[0], &guid[1], &guid[2], &guid[3], + &guid[4], &guid[5], &guid[6], &guid[7]) != 8) + goto invalid; + + ng = (u8 *) &new_guid; + + for (i = 0; i < 8; i++) { + if (guid[i] > 0xff) + goto invalid; + ng[i] = guid[i]; + } + + if (new_guid == 0) + goto invalid; + + dd->ipath_guid = new_guid; + dd->ipath_nguid = 1; + if (dd->verbs_dev) + dd->verbs_dev->ibdev.node_guid = new_guid; + + ret = strlen(buf); + goto bail; + +invalid: + ipath_dev_err(dd, "attempt to set invalid GUID\n"); + ret = -EINVAL; + +bail: + return ret; +} + +static ssize_t show_nguid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_nguid); +} + +static ssize_t show_nports(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + /* Return the number of user ports available. */ + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_cfgports - 1); +} + +static ssize_t show_serial(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + buf[sizeof dd->ipath_serial] = '\0'; + memcpy(buf, dd->ipath_serial, sizeof dd->ipath_serial); + strcat(buf, "\n"); + return strlen(buf); +} + +static ssize_t show_unit(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_unit); +} + +static ssize_t show_jint_max_packets(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_max_packets); +} + +static ssize_t store_jint_max_packets(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u16 v = 0; + int ret; + + ret = ipath_parse_ushort(buf, &v); + if (ret < 0) + ipath_dev_err(dd, "invalid jint_max_packets.\n"); + else + dd->ipath_f_config_jint(dd, dd->ipath_jint_idle_ticks, v); + + return ret; +} + +static ssize_t show_jint_idle_ticks(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_idle_ticks); +} + +static ssize_t store_jint_idle_ticks(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + u16 v = 0; + int ret; + + ret = ipath_parse_ushort(buf, &v); + if (ret < 0) + ipath_dev_err(dd, "invalid jint_idle_ticks.\n"); + else + dd->ipath_f_config_jint(dd, v, dd->ipath_jint_max_packets); + + return ret; +} + +#define DEVICE_COUNTER(name, attr) \ + static ssize_t show_counter_##name(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + struct ipath_devdata *dd = dev_get_drvdata(dev); \ + return scnprintf(\ + buf, PAGE_SIZE, "%llu\n", (unsigned long long) \ + ipath_snap_cntr( \ + dd, offsetof(struct infinipath_counters, \ + attr) / sizeof(u64))); \ + } \ + static DEVICE_ATTR(name, S_IRUGO, show_counter_##name, NULL); + +DEVICE_COUNTER(ib_link_downeds, IBLinkDownedCnt); +DEVICE_COUNTER(ib_link_err_recoveries, IBLinkErrRecoveryCnt); +DEVICE_COUNTER(ib_status_changes, IBStatusChangeCnt); +DEVICE_COUNTER(ib_symbol_errs, IBSymbolErrCnt); +DEVICE_COUNTER(lb_flow_stalls, LBFlowStallCnt); +DEVICE_COUNTER(lb_ints, LBIntCnt); +DEVICE_COUNTER(rx_bad_formats, RxBadFormatCnt); +DEVICE_COUNTER(rx_buf_ovfls, RxBufOvflCnt); +DEVICE_COUNTER(rx_data_pkts, RxDataPktCnt); +DEVICE_COUNTER(rx_dropped_pkts, RxDroppedPktCnt); +DEVICE_COUNTER(rx_dwords, RxDwordCnt); +DEVICE_COUNTER(rx_ebps, RxEBPCnt); +DEVICE_COUNTER(rx_flow_ctrl_errs, RxFlowCtrlErrCnt); +DEVICE_COUNTER(rx_flow_pkts, RxFlowPktCnt); +DEVICE_COUNTER(rx_icrc_errs, RxICRCErrCnt); +DEVICE_COUNTER(rx_len_errs, RxLenErrCnt); +DEVICE_COUNTER(rx_link_problems, RxLinkProblemCnt); +DEVICE_COUNTER(rx_lpcrc_errs, RxLPCRCErrCnt); +DEVICE_COUNTER(rx_max_min_len_errs, RxMaxMinLenErrCnt); +DEVICE_COUNTER(rx_p0_hdr_egr_ovfls, RxP0HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p1_hdr_egr_ovfls, RxP1HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p2_hdr_egr_ovfls, RxP2HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p3_hdr_egr_ovfls, RxP3HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p4_hdr_egr_ovfls, RxP4HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p5_hdr_egr_ovfls, RxP5HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p6_hdr_egr_ovfls, RxP6HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p7_hdr_egr_ovfls, RxP7HdrEgrOvflCnt); +DEVICE_COUNTER(rx_p8_hdr_egr_ovfls, RxP8HdrEgrOvflCnt); +DEVICE_COUNTER(rx_pkey_mismatches, RxPKeyMismatchCnt); +DEVICE_COUNTER(rx_tid_full_errs, RxTIDFullErrCnt); +DEVICE_COUNTER(rx_tid_valid_errs, RxTIDValidErrCnt); +DEVICE_COUNTER(rx_vcrc_errs, RxVCRCErrCnt); +DEVICE_COUNTER(tx_data_pkts, TxDataPktCnt); +DEVICE_COUNTER(tx_dropped_pkts, TxDroppedPktCnt); +DEVICE_COUNTER(tx_dwords, TxDwordCnt); +DEVICE_COUNTER(tx_flow_pkts, TxFlowPktCnt); +DEVICE_COUNTER(tx_flow_stalls, TxFlowStallCnt); +DEVICE_COUNTER(tx_len_errs, TxLenErrCnt); +DEVICE_COUNTER(tx_max_min_len_errs, TxMaxMinLenErrCnt); +DEVICE_COUNTER(tx_underruns, TxUnderrunCnt); +DEVICE_COUNTER(tx_unsup_vl_errs, TxUnsupVLErrCnt); + +static struct attribute *dev_counter_attributes[] = { + &dev_attr_ib_link_downeds.attr, + &dev_attr_ib_link_err_recoveries.attr, + &dev_attr_ib_status_changes.attr, + &dev_attr_ib_symbol_errs.attr, + &dev_attr_lb_flow_stalls.attr, + &dev_attr_lb_ints.attr, + &dev_attr_rx_bad_formats.attr, + &dev_attr_rx_buf_ovfls.attr, + &dev_attr_rx_data_pkts.attr, + &dev_attr_rx_dropped_pkts.attr, + &dev_attr_rx_dwords.attr, + &dev_attr_rx_ebps.attr, + &dev_attr_rx_flow_ctrl_errs.attr, + &dev_attr_rx_flow_pkts.attr, + &dev_attr_rx_icrc_errs.attr, + &dev_attr_rx_len_errs.attr, + &dev_attr_rx_link_problems.attr, + &dev_attr_rx_lpcrc_errs.attr, + &dev_attr_rx_max_min_len_errs.attr, + &dev_attr_rx_p0_hdr_egr_ovfls.attr, + &dev_attr_rx_p1_hdr_egr_ovfls.attr, + &dev_attr_rx_p2_hdr_egr_ovfls.attr, + &dev_attr_rx_p3_hdr_egr_ovfls.attr, + &dev_attr_rx_p4_hdr_egr_ovfls.attr, + &dev_attr_rx_p5_hdr_egr_ovfls.attr, + &dev_attr_rx_p6_hdr_egr_ovfls.attr, + &dev_attr_rx_p7_hdr_egr_ovfls.attr, + &dev_attr_rx_p8_hdr_egr_ovfls.attr, + &dev_attr_rx_pkey_mismatches.attr, + &dev_attr_rx_tid_full_errs.attr, + &dev_attr_rx_tid_valid_errs.attr, + &dev_attr_rx_vcrc_errs.attr, + &dev_attr_tx_data_pkts.attr, + &dev_attr_tx_dropped_pkts.attr, + &dev_attr_tx_dwords.attr, + &dev_attr_tx_flow_pkts.attr, + &dev_attr_tx_flow_stalls.attr, + &dev_attr_tx_len_errs.attr, + &dev_attr_tx_max_min_len_errs.attr, + &dev_attr_tx_underruns.attr, + &dev_attr_tx_unsup_vl_errs.attr, + NULL +}; + +static struct attribute_group dev_counter_attr_group = { + .name = "counters", + .attrs = dev_counter_attributes +}; + +static ssize_t store_reset(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + if (count < 5 || memcmp(buf, "reset", 5)) { + ret = -EINVAL; + goto bail; + } + + if (dd->ipath_flags & IPATH_DISABLED) { + /* + * post-reset init would re-enable interrupts, etc. + * so don't allow reset on disabled devices. Not + * perfect error, but about the best choice. + */ + dev_info(dev,"Unit %d is disabled, can't reset\n", + dd->ipath_unit); + ret = -EINVAL; + goto bail; + } + ret = ipath_reset_device(dd->ipath_unit); +bail: + return ret<0 ? ret : count; +} + +static ssize_t store_link_state(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 state; + + ret = ipath_parse_ushort(buf, &state); + if (ret < 0) + goto invalid; + + r = ipath_set_linkstate(dd, state); + if (r < 0) { + ret = r; + goto bail; + } + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid link state\n"); +bail: + return ret; +} + +static ssize_t show_mtu(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_ibmtu); +} + +static ssize_t store_mtu(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + ssize_t ret; + u16 mtu = 0; + int r; + + ret = ipath_parse_ushort(buf, &mtu); + if (ret < 0) + goto invalid; + + r = ipath_set_mtu(dd, mtu); + if (r < 0) + ret = r; + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid MTU\n"); +bail: + return ret; +} + +static ssize_t show_enabled(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + return scnprintf(buf, PAGE_SIZE, "%u\n", + (dd->ipath_flags & IPATH_DISABLED) ? 0 : 1); +} + +static ssize_t store_enabled(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + ssize_t ret; + u16 enable = 0; + + ret = ipath_parse_ushort(buf, &enable); + if (ret < 0) { + ipath_dev_err(dd, "attempt to use non-numeric on enable\n"); + goto bail; + } + + if (enable) { + if (!(dd->ipath_flags & IPATH_DISABLED)) + goto bail; + + dev_info(dev, "Enabling unit %d\n", dd->ipath_unit); + /* same as post-reset */ + ret = ipath_init_chip(dd, 1); + if (ret) + ipath_dev_err(dd, "Failed to enable unit %d\n", + dd->ipath_unit); + else { + dd->ipath_flags &= ~IPATH_DISABLED; + *dd->ipath_statusp &= ~IPATH_STATUS_ADMIN_DISABLED; + } + } + else if (!(dd->ipath_flags & IPATH_DISABLED)) { + dev_info(dev, "Disabling unit %d\n", dd->ipath_unit); + ipath_shutdown_device(dd); + dd->ipath_flags |= IPATH_DISABLED; + *dd->ipath_statusp |= IPATH_STATUS_ADMIN_DISABLED; + } + +bail: + return ret; +} + +static ssize_t store_rx_pol_inv(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret < 0) + goto invalid; + + r = ipath_set_rx_pol_inv(dd, val); + if (r < 0) { + ret = r; + goto bail; + } + + goto bail; +invalid: + ipath_dev_err(dd, "attempt to set invalid Rx Polarity invert\n"); +bail: + return ret; +} + +static ssize_t store_led_override(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret > 0) + ipath_set_led_override(dd, val); + else + ipath_dev_err(dd, "attempt to set invalid LED override\n"); + return ret; +} + +static ssize_t show_logged_errs(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int idx, count; + + /* force consistency with actual EEPROM */ + if (ipath_update_eeprom_log(dd) != 0) + return -ENXIO; + + count = 0; + for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) { + count += scnprintf(buf + count, PAGE_SIZE - count, "%d%c", + dd->ipath_eep_st_errs[idx], + idx == (IPATH_EEP_LOG_CNT - 1) ? '\n' : ' '); + } + + return count; +} + +/* + * New sysfs entries to control various IB config. These all turn into + * accesses via ipath_f_get/set_ib_cfg. + * + * Get/Set heartbeat enable. Or of 1=enabled, 2=auto + */ +static ssize_t show_hrtbt_enb(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_HRTBT); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +static ssize_t store_hrtbt_enb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret >= 0 && val > 3) + ret = -EINVAL; + if (ret < 0) { + ipath_dev_err(dd, "attempt to set invalid Heartbeat enable\n"); + goto bail; + } + + /* + * Set the "intentional" heartbeat enable per either of + * "Enable" and "Auto", as these are normally set together. + * This bit is consulted when leaving loopback mode, + * because entering loopback mode overrides it and automatically + * disables heartbeat. + */ + r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, val); + if (r < 0) + ret = r; + else if (val == IPATH_IB_HRTBT_OFF) + dd->ipath_flags |= IPATH_NO_HRTBT; + else + dd->ipath_flags &= ~IPATH_NO_HRTBT; + +bail: + return ret; +} + +/* + * Get/Set Link-widths enabled. Or of 1=1x, 2=4x (this is human/IB centric, + * _not_ the particular encoding of any given chip) + */ +static ssize_t show_lwid_enb(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +static ssize_t store_lwid_enb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret >= 0 && (val == 0 || val > 3)) + ret = -EINVAL; + if (ret < 0) { + ipath_dev_err(dd, + "attempt to set invalid Link Width (enable)\n"); + goto bail; + } + + r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, val); + if (r < 0) + ret = r; + +bail: + return ret; +} + +/* Get current link width */ +static ssize_t show_lwid(struct device *dev, + struct device_attribute *attr, + char *buf) + +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +/* + * Get/Set Link-speeds enabled. Or of 1=SDR 2=DDR. + */ +static ssize_t show_spd_enb(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +static ssize_t store_spd_enb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret >= 0 && (val == 0 || val > (IPATH_IB_SDR | IPATH_IB_DDR))) + ret = -EINVAL; + if (ret < 0) { + ipath_dev_err(dd, + "attempt to set invalid Link Speed (enable)\n"); + goto bail; + } + + r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, val); + if (r < 0) + ret = r; + +bail: + return ret; +} + +/* Get current link speed */ +static ssize_t show_spd(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +/* + * Get/Set RX polarity-invert enable. 0=no, 1=yes. + */ +static ssize_t show_rx_polinv_enb(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +static ssize_t store_rx_polinv_enb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret >= 0 && val > 1) { + ipath_dev_err(dd, + "attempt to set invalid Rx Polarity (enable)\n"); + ret = -EINVAL; + goto bail; + } + + r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB, val); + if (r < 0) + ret = r; + +bail: + return ret; +} + +/* + * Get/Set RX lane-reversal enable. 0=no, 1=yes. + */ +static ssize_t show_lanerev_enb(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + + ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB); + if (ret >= 0) + ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret); + return ret; +} + +static ssize_t store_lanerev_enb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, r; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret >= 0 && val > 1) { + ret = -EINVAL; + ipath_dev_err(dd, + "attempt to set invalid Lane reversal (enable)\n"); + goto bail; + } + + r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB, val); + if (r < 0) + ret = r; + +bail: + return ret; +} + +static DRIVER_ATTR(num_units, S_IRUGO, show_num_units, NULL); +static DRIVER_ATTR(version, S_IRUGO, show_version, NULL); + +static struct attribute *driver_attributes[] = { + &driver_attr_num_units.attr, + &driver_attr_version.attr, + NULL +}; + +static struct attribute_group driver_attr_group = { + .attrs = driver_attributes +}; + +static ssize_t store_tempsense(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret, stat; + u16 val; + + ret = ipath_parse_ushort(buf, &val); + if (ret <= 0) { + ipath_dev_err(dd, "attempt to set invalid tempsense config\n"); + goto bail; + } + /* If anything but the highest limit, enable T_CRIT_A "interrupt" */ + stat = ipath_tempsense_write(dd, 9, (val == 0x7f7f) ? 0x80 : 0); + if (stat) { + ipath_dev_err(dd, "Unable to set tempsense config\n"); + ret = -1; + goto bail; + } + stat = ipath_tempsense_write(dd, 0xB, (u8) (val & 0xFF)); + if (stat) { + ipath_dev_err(dd, "Unable to set local Tcrit\n"); + ret = -1; + goto bail; + } + stat = ipath_tempsense_write(dd, 0xD, (u8) (val >> 8)); + if (stat) { + ipath_dev_err(dd, "Unable to set remote Tcrit\n"); + ret = -1; + goto bail; + } + +bail: + return ret; +} + +/* + * dump tempsense regs. in decimal, to ease shell-scripts. + */ +static ssize_t show_tempsense(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ipath_devdata *dd = dev_get_drvdata(dev); + int ret; + int idx; + u8 regvals[8]; + + ret = -ENXIO; + for (idx = 0; idx < 8; ++idx) { + if (idx == 6) + continue; + ret = ipath_tempsense_read(dd, idx); + if (ret < 0) + break; + regvals[idx] = ret; + } + if (idx == 8) + ret = scnprintf(buf, PAGE_SIZE, "%d %d %02X %02X %d %d\n", + *(signed char *)(regvals), + *(signed char *)(regvals + 1), + regvals[2], regvals[3], + *(signed char *)(regvals + 5), + *(signed char *)(regvals + 7)); + return ret; +} + +const struct attribute_group *ipath_driver_attr_groups[] = { + &driver_attr_group, + NULL, +}; + +static DEVICE_ATTR(guid, S_IWUSR | S_IRUGO, show_guid, store_guid); +static DEVICE_ATTR(lmc, S_IWUSR | S_IRUGO, show_lmc, store_lmc); +static DEVICE_ATTR(lid, S_IWUSR | S_IRUGO, show_lid, store_lid); +static DEVICE_ATTR(link_state, S_IWUSR, NULL, store_link_state); +static DEVICE_ATTR(mlid, S_IWUSR | S_IRUGO, show_mlid, store_mlid); +static DEVICE_ATTR(mtu, S_IWUSR | S_IRUGO, show_mtu, store_mtu); +static DEVICE_ATTR(enabled, S_IWUSR | S_IRUGO, show_enabled, store_enabled); +static DEVICE_ATTR(nguid, S_IRUGO, show_nguid, NULL); +static DEVICE_ATTR(nports, S_IRUGO, show_nports, NULL); +static DEVICE_ATTR(reset, S_IWUSR, NULL, store_reset); +static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL); +static DEVICE_ATTR(status, S_IRUGO, show_status, NULL); +static DEVICE_ATTR(status_str, S_IRUGO, show_status_str, NULL); +static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL); +static DEVICE_ATTR(unit, S_IRUGO, show_unit, NULL); +static DEVICE_ATTR(rx_pol_inv, S_IWUSR, NULL, store_rx_pol_inv); +static DEVICE_ATTR(led_override, S_IWUSR, NULL, store_led_override); +static DEVICE_ATTR(logged_errors, S_IRUGO, show_logged_errs, NULL); +static DEVICE_ATTR(localbus_info, S_IRUGO, show_localbus_info, NULL); +static DEVICE_ATTR(jint_max_packets, S_IWUSR | S_IRUGO, + show_jint_max_packets, store_jint_max_packets); +static DEVICE_ATTR(jint_idle_ticks, S_IWUSR | S_IRUGO, + show_jint_idle_ticks, store_jint_idle_ticks); +static DEVICE_ATTR(tempsense, S_IWUSR | S_IRUGO, + show_tempsense, store_tempsense); + +static struct attribute *dev_attributes[] = { + &dev_attr_guid.attr, + &dev_attr_lmc.attr, + &dev_attr_lid.attr, + &dev_attr_link_state.attr, + &dev_attr_mlid.attr, + &dev_attr_mtu.attr, + &dev_attr_nguid.attr, + &dev_attr_nports.attr, + &dev_attr_serial.attr, + &dev_attr_status.attr, + &dev_attr_status_str.attr, + &dev_attr_boardversion.attr, + &dev_attr_unit.attr, + &dev_attr_enabled.attr, + &dev_attr_rx_pol_inv.attr, + &dev_attr_led_override.attr, + &dev_attr_logged_errors.attr, + &dev_attr_tempsense.attr, + &dev_attr_localbus_info.attr, + NULL +}; + +static struct attribute_group dev_attr_group = { + .attrs = dev_attributes +}; + +static DEVICE_ATTR(hrtbt_enable, S_IWUSR | S_IRUGO, show_hrtbt_enb, + store_hrtbt_enb); +static DEVICE_ATTR(link_width_enable, S_IWUSR | S_IRUGO, show_lwid_enb, + store_lwid_enb); +static DEVICE_ATTR(link_width, S_IRUGO, show_lwid, NULL); +static DEVICE_ATTR(link_speed_enable, S_IWUSR | S_IRUGO, show_spd_enb, + store_spd_enb); +static DEVICE_ATTR(link_speed, S_IRUGO, show_spd, NULL); +static DEVICE_ATTR(rx_pol_inv_enable, S_IWUSR | S_IRUGO, show_rx_polinv_enb, + store_rx_polinv_enb); +static DEVICE_ATTR(rx_lane_rev_enable, S_IWUSR | S_IRUGO, show_lanerev_enb, + store_lanerev_enb); + +static struct attribute *dev_ibcfg_attributes[] = { + &dev_attr_hrtbt_enable.attr, + &dev_attr_link_width_enable.attr, + &dev_attr_link_width.attr, + &dev_attr_link_speed_enable.attr, + &dev_attr_link_speed.attr, + &dev_attr_rx_pol_inv_enable.attr, + &dev_attr_rx_lane_rev_enable.attr, + NULL +}; + +static struct attribute_group dev_ibcfg_attr_group = { + .attrs = dev_ibcfg_attributes +}; + +/** + * ipath_expose_reset - create a device reset file + * @dev: the device structure + * + * Only expose a file that lets us reset the device after someone + * enters diag mode. A device reset is quite likely to crash the + * machine entirely, so we don't want to normally make it + * available. + * + * Called with ipath_mutex held. + */ +int ipath_expose_reset(struct device *dev) +{ + static int exposed; + int ret; + + if (!exposed) { + ret = device_create_file(dev, &dev_attr_reset); + exposed = 1; + } + else + ret = 0; + + return ret; +} + +int ipath_device_create_group(struct device *dev, struct ipath_devdata *dd) +{ + int ret; + + ret = sysfs_create_group(&dev->kobj, &dev_attr_group); + if (ret) + goto bail; + + ret = sysfs_create_group(&dev->kobj, &dev_counter_attr_group); + if (ret) + goto bail_attrs; + + if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) { + ret = device_create_file(dev, &dev_attr_jint_idle_ticks); + if (ret) + goto bail_counter; + ret = device_create_file(dev, &dev_attr_jint_max_packets); + if (ret) + goto bail_idle; + + ret = sysfs_create_group(&dev->kobj, &dev_ibcfg_attr_group); + if (ret) + goto bail_max; + } + + return 0; + +bail_max: + device_remove_file(dev, &dev_attr_jint_max_packets); +bail_idle: + device_remove_file(dev, &dev_attr_jint_idle_ticks); +bail_counter: + sysfs_remove_group(&dev->kobj, &dev_counter_attr_group); +bail_attrs: + sysfs_remove_group(&dev->kobj, &dev_attr_group); +bail: + return ret; +} + +void ipath_device_remove_group(struct device *dev, struct ipath_devdata *dd) +{ + sysfs_remove_group(&dev->kobj, &dev_counter_attr_group); + + if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) { + sysfs_remove_group(&dev->kobj, &dev_ibcfg_attr_group); + device_remove_file(dev, &dev_attr_jint_idle_ticks); + device_remove_file(dev, &dev_attr_jint_max_packets); + } + + sysfs_remove_group(&dev->kobj, &dev_attr_group); + + device_remove_file(dev, &dev_attr_reset); +} diff --git a/drivers/staging/rdma/ipath/ipath_uc.c b/drivers/staging/rdma/ipath/ipath_uc.c new file mode 100644 index 000000000..22e60998f --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_uc.c @@ -0,0 +1,547 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ipath_verbs.h" +#include "ipath_kernel.h" + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_UC_##x + +/** + * ipath_make_uc_req - construct a request packet (SEND, RDMA write) + * @qp: a pointer to the QP + * + * Return 1 if constructed; otherwise, return 0. + */ +int ipath_make_uc_req(struct ipath_qp *qp) +{ + struct ipath_other_headers *ohdr; + struct ipath_swqe *wqe; + unsigned long flags; + u32 hwords; + u32 bth0; + u32 len; + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + int ret = 0; + + spin_lock_irqsave(&qp->s_lock, flags); + + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) { + if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= IPATH_S_WAIT_DMA; + goto bail; + } + wqe = get_swqe_ptr(qp, qp->s_last); + ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done; + } + + ohdr = &qp->s_hdr.u.oth; + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + ohdr = &qp->s_hdr.u.l.oth; + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + bth0 = 1 << 22; /* Set M bit */ + + /* Get the next send request. */ + wqe = get_swqe_ptr(qp, qp->s_cur); + qp->s_wqe = NULL; + switch (qp->s_state) { + default: + if (!(ib_ipath_state_ops[qp->state] & + IPATH_PROCESS_NEXT_SEND_OK)) + goto bail; + /* Check if send work queue is empty. */ + if (qp->s_cur == qp->s_head) + goto bail; + /* + * Start a new request. + */ + qp->s_psn = wqe->psn = qp->s_next_psn; + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_len = len = wqe->length; + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + if (len > pmtu) { + qp->s_state = OP(SEND_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_ONLY); + else { + qp->s_state = + OP(SEND_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->wr.wr.rdma.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->wr.wr.rdma.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + hwords += sizeof(struct ib_reth) / 4; + if (len > pmtu) { + qp->s_state = OP(RDMA_WRITE_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_ONLY); + else { + qp->s_state = + OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the RETH */ + ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + default: + goto bail; + } + break; + + case OP(SEND_FIRST): + qp->s_state = OP(SEND_MIDDLE); + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) + qp->s_state = OP(SEND_LAST); + else { + qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_WRITE_FIRST): + qp->s_state = OP(RDMA_WRITE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) + qp->s_state = OP(RDMA_WRITE_LAST); + else { + qp->s_state = + OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + } + qp->s_len -= len; + qp->s_hdrwords = hwords; + qp->s_cur_sge = &qp->s_sge; + qp->s_cur_size = len; + ipath_make_ruc_header(to_idev(qp->ibqp.device), + qp, ohdr, bth0 | (qp->s_state << 24), + qp->s_next_psn++ & IPATH_PSN_MASK); +done: + ret = 1; + goto unlock; + +bail: + qp->s_flags &= ~IPATH_S_BUSY; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +/** + * ipath_uc_rcv - handle an incoming UC packet + * @dev: the device the packet came in on + * @hdr: the header of the packet + * @has_grh: true if the packet has a GRH + * @data: the packet data + * @tlen: the length of the packet + * @qp: the QP for this packet. + * + * This is called from ipath_qp_rcv() to process an incoming UC packet + * for the given QP. + * Called at interrupt level. + */ +void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp) +{ + struct ipath_other_headers *ohdr; + int opcode; + u32 hdrsize; + u32 psn; + u32 pad; + struct ib_wc wc; + u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu); + struct ib_reth *reth; + int header_in_data; + + /* Validate the SLID. See Ch. 9.6.1.5 */ + if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid)) + goto done; + + /* Check for GRH */ + if (!has_grh) { + ohdr = &hdr->u.oth; + hdrsize = 8 + 12; /* LRH + BTH */ + psn = be32_to_cpu(ohdr->bth[2]); + header_in_data = 0; + } else { + ohdr = &hdr->u.l.oth; + hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */ + /* + * The header with GRH is 60 bytes and the + * core driver sets the eager header buffer + * size to 56 bytes so the last 4 bytes of + * the BTH header (PSN) is in the data buffer. + */ + header_in_data = dev->dd->ipath_rcvhdrentsize == 16; + if (header_in_data) { + psn = be32_to_cpu(((__be32 *) data)[0]); + data += sizeof(__be32); + } else + psn = be32_to_cpu(ohdr->bth[2]); + } + /* + * The opcode is in the low byte when its in network order + * (top byte when in host order). + */ + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + + memset(&wc, 0, sizeof wc); + + /* Compare the PSN verses the expected PSN. */ + if (unlikely(ipath_cmp24(psn, qp->r_psn) != 0)) { + /* + * Handle a sequence error. + * Silently drop any current message. + */ + qp->r_psn = psn; + inv: + qp->r_state = OP(SEND_LAST); + switch (opcode) { + case OP(SEND_FIRST): + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + goto send_first; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): + goto rdma_first; + + default: + dev->n_pkt_drops++; + goto done; + } + } + + /* Check for opcode sequence errors. */ + switch (qp->r_state) { + case OP(SEND_FIRST): + case OP(SEND_MIDDLE): + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE)) + break; + goto inv; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_MIDDLE): + if (opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + break; + goto inv; + + default: + if (opcode == OP(SEND_FIRST) || + opcode == OP(SEND_ONLY) || + opcode == OP(SEND_ONLY_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_FIRST) || + opcode == OP(RDMA_WRITE_ONLY) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + break; + goto inv; + } + + /* OK, process the packet. */ + switch (opcode) { + case OP(SEND_FIRST): + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + send_first: + if (qp->r_flags & IPATH_R_REUSE_SGE) { + qp->r_flags &= ~IPATH_R_REUSE_SGE; + qp->r_sge = qp->s_rdma_read_sge; + } else if (!ipath_get_rwqe(qp, 0)) { + dev->n_pkt_drops++; + goto done; + } + /* Save the WQE so we can reuse it in case of an error. */ + qp->s_rdma_read_sge = qp->r_sge; + qp->r_rcv_len = 0; + if (opcode == OP(SEND_ONLY)) + goto send_last; + else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE)) + goto send_last_imm; + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) { + qp->r_flags |= IPATH_R_REUSE_SGE; + dev->n_pkt_drops++; + goto done; + } + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) { + qp->r_flags |= IPATH_R_REUSE_SGE; + dev->n_pkt_drops++; + goto done; + } + ipath_copy_sge(&qp->r_sge, data, pmtu); + break; + + case OP(SEND_LAST_WITH_IMMEDIATE): + send_last_imm: + if (header_in_data) { + wc.ex.imm_data = *(__be32 *) data; + data += sizeof(__be32); + } else { + /* Immediate data comes after BTH */ + wc.ex.imm_data = ohdr->u.imm_data; + } + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + /* FALLTHROUGH */ + case OP(SEND_LAST): + send_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) { + qp->r_flags |= IPATH_R_REUSE_SGE; + dev->n_pkt_drops++; + goto done; + } + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + wc.byte_len = tlen + qp->r_rcv_len; + if (unlikely(wc.byte_len > qp->r_len)) { + qp->r_flags |= IPATH_R_REUSE_SGE; + dev->n_pkt_drops++; + goto done; + } + wc.opcode = IB_WC_RECV; + last_imm: + ipath_copy_sge(&qp->r_sge, data, tlen); + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + /* Signal completion event if the solicited bit is set. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + cpu_to_be32(1 << 23)) != 0); + break; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */ + rdma_first: + /* RETH comes after BTH */ + if (!header_in_data) + reth = &ohdr->u.rc.reth; + else { + reth = (struct ib_reth *)data; + data += sizeof(*reth); + } + hdrsize += sizeof(*reth); + qp->r_len = be32_to_cpu(reth->length); + qp->r_rcv_len = 0; + if (qp->r_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey */ + ok = ipath_rkey_ok(qp, &qp->r_sge, qp->r_len, + vaddr, rkey, + IB_ACCESS_REMOTE_WRITE); + if (unlikely(!ok)) { + dev->n_pkt_drops++; + goto done; + } + } else { + qp->r_sge.sg_list = NULL; + qp->r_sge.sge.mr = NULL; + qp->r_sge.sge.vaddr = NULL; + qp->r_sge.sge.length = 0; + qp->r_sge.sge.sge_length = 0; + } + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_WRITE))) { + dev->n_pkt_drops++; + goto done; + } + if (opcode == OP(RDMA_WRITE_ONLY)) + goto rdma_last; + else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + goto rdma_last_imm; + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) { + dev->n_pkt_drops++; + goto done; + } + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) { + dev->n_pkt_drops++; + goto done; + } + ipath_copy_sge(&qp->r_sge, data, pmtu); + break; + + case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): + rdma_last_imm: + if (header_in_data) { + wc.ex.imm_data = *(__be32 *) data; + data += sizeof(__be32); + } else { + /* Immediate data comes after BTH */ + wc.ex.imm_data = ohdr->u.imm_data; + } + hdrsize += 4; + wc.wc_flags = IB_WC_WITH_IMM; + + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) { + dev->n_pkt_drops++; + goto done; + } + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) { + dev->n_pkt_drops++; + goto done; + } + if (qp->r_flags & IPATH_R_REUSE_SGE) + qp->r_flags &= ~IPATH_R_REUSE_SGE; + else if (!ipath_get_rwqe(qp, 1)) { + dev->n_pkt_drops++; + goto done; + } + wc.byte_len = qp->r_len; + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + goto last_imm; + + case OP(RDMA_WRITE_LAST): + rdma_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* XXX LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) { + dev->n_pkt_drops++; + goto done; + } + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) { + dev->n_pkt_drops++; + goto done; + } + ipath_copy_sge(&qp->r_sge, data, tlen); + break; + + default: + /* Drop packet for unknown opcodes. */ + dev->n_pkt_drops++; + goto done; + } + qp->r_psn++; + qp->r_state = opcode; +done: + return; +} diff --git a/drivers/staging/rdma/ipath/ipath_ud.c b/drivers/staging/rdma/ipath/ipath_ud.c new file mode 100644 index 000000000..e8a2a9152 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_ud.c @@ -0,0 +1,580 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include "ipath_verbs.h" +#include "ipath_kernel.h" + +/** + * ipath_ud_loopback - handle send on loopback QPs + * @sqp: the sending QP + * @swqe: the send work request + * + * This is called from ipath_make_ud_req() to forward a WQE addressed + * to the same HCA. + * Note that the receive interrupt handler may be calling ipath_ud_rcv() + * while this is being called. + */ +static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe) +{ + struct ipath_ibdev *dev = to_idev(sqp->ibqp.device); + struct ipath_qp *qp; + struct ib_ah_attr *ah_attr; + unsigned long flags; + struct ipath_rq *rq; + struct ipath_srq *srq; + struct ipath_sge_state rsge; + struct ipath_sge *sge; + struct ipath_rwq *wq; + struct ipath_rwqe *wqe; + void (*handler)(struct ib_event *, void *); + struct ib_wc wc; + u32 tail; + u32 rlen; + u32 length; + + qp = ipath_lookup_qpn(&dev->qp_table, swqe->wr.wr.ud.remote_qpn); + if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) { + dev->n_pkt_drops++; + goto done; + } + + /* + * Check that the qkey matches (except for QP0, see 9.6.1.4.1). + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + if (unlikely(qp->ibqp.qp_num && + ((int) swqe->wr.wr.ud.remote_qkey < 0 ? + sqp->qkey : swqe->wr.wr.ud.remote_qkey) != qp->qkey)) { + /* XXX OK to lose a count once in a while. */ + dev->qkey_violations++; + dev->n_pkt_drops++; + goto drop; + } + + /* + * A GRH is expected to precede the data even if not + * present on the wire. + */ + length = swqe->length; + memset(&wc, 0, sizeof wc); + wc.byte_len = length + sizeof(struct ib_grh); + + if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = swqe->wr.ex.imm_data; + } + + /* + * This would be a lot simpler if we could call ipath_get_rwqe() + * but that uses state that the receive interrupt handler uses + * so we would need to lock out receive interrupts while doing + * local loopback. + */ + if (qp->ibqp.srq) { + srq = to_isrq(qp->ibqp.srq); + handler = srq->ibsrq.event_handler; + rq = &srq->rq; + } else { + srq = NULL; + handler = NULL; + rq = &qp->r_rq; + } + + /* + * Get the next work request entry to find where to put the data. + * Note that it is safe to drop the lock after changing rq->tail + * since ipath_post_receive() won't fill the empty slot. + */ + spin_lock_irqsave(&rq->lock, flags); + wq = rq->wq; + tail = wq->tail; + /* Validate tail before using it since it is user writable. */ + if (tail >= rq->size) + tail = 0; + if (unlikely(tail == wq->head)) { + spin_unlock_irqrestore(&rq->lock, flags); + dev->n_pkt_drops++; + goto drop; + } + wqe = get_rwqe_ptr(rq, tail); + rsge.sg_list = qp->r_ud_sg_list; + if (!ipath_init_sge(qp, wqe, &rlen, &rsge)) { + spin_unlock_irqrestore(&rq->lock, flags); + dev->n_pkt_drops++; + goto drop; + } + /* Silently drop packets which are too big. */ + if (wc.byte_len > rlen) { + spin_unlock_irqrestore(&rq->lock, flags); + dev->n_pkt_drops++; + goto drop; + } + if (++tail >= rq->size) + tail = 0; + wq->tail = tail; + wc.wr_id = wqe->wr_id; + if (handler) { + u32 n; + + /* + * validate head pointer value and compute + * the number of remaining WQEs. + */ + n = wq->head; + if (n >= rq->size) + n = 0; + if (n < tail) + n += rq->size - tail; + else + n -= tail; + if (n < srq->limit) { + struct ib_event ev; + + srq->limit = 0; + spin_unlock_irqrestore(&rq->lock, flags); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + handler(&ev, srq->ibsrq.srq_context); + } else + spin_unlock_irqrestore(&rq->lock, flags); + } else + spin_unlock_irqrestore(&rq->lock, flags); + + ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr; + if (ah_attr->ah_flags & IB_AH_GRH) { + ipath_copy_sge(&rsge, &ah_attr->grh, sizeof(struct ib_grh)); + wc.wc_flags |= IB_WC_GRH; + } else + ipath_skip_sge(&rsge, sizeof(struct ib_grh)); + sge = swqe->sg_list; + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + ipath_copy_sge(&rsge, sge->vaddr, len); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--swqe->wr.num_sge) + sge++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + wc.src_qp = sqp->ibqp.qp_num; + /* XXX do we know which pkey matched? Only needed for GSI. */ + wc.pkey_index = 0; + wc.slid = dev->dd->ipath_lid | + (ah_attr->src_path_bits & + ((1 << dev->dd->ipath_lmc) - 1)); + wc.sl = ah_attr->sl; + wc.dlid_path_bits = + ah_attr->dlid & ((1 << dev->dd->ipath_lmc) - 1); + wc.port_num = 1; + /* Signal completion event if the solicited bit is set. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + swqe->wr.send_flags & IB_SEND_SOLICITED); +drop: + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +done:; +} + +/** + * ipath_make_ud_req - construct a UD request packet + * @qp: the QP + * + * Return 1 if constructed; otherwise, return 0. + */ +int ipath_make_ud_req(struct ipath_qp *qp) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_other_headers *ohdr; + struct ib_ah_attr *ah_attr; + struct ipath_swqe *wqe; + unsigned long flags; + u32 nwords; + u32 extra_bytes; + u32 bth0; + u16 lrh0; + u16 lid; + int ret = 0; + int next_cur; + + spin_lock_irqsave(&qp->s_lock, flags); + + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_NEXT_SEND_OK)) { + if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + if (qp->s_last == qp->s_head) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= IPATH_S_WAIT_DMA; + goto bail; + } + wqe = get_swqe_ptr(qp, qp->s_last); + ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done; + } + + if (qp->s_cur == qp->s_head) + goto bail; + + wqe = get_swqe_ptr(qp, qp->s_cur); + next_cur = qp->s_cur + 1; + if (next_cur >= qp->s_size) + next_cur = 0; + + /* Construct the header. */ + ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr; + if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE) { + if (ah_attr->dlid != IPATH_PERMISSIVE_LID) + dev->n_multicast_xmit++; + else + dev->n_unicast_xmit++; + } else { + dev->n_unicast_xmit++; + lid = ah_attr->dlid & ~((1 << dev->dd->ipath_lmc) - 1); + if (unlikely(lid == dev->dd->ipath_lid)) { + /* + * If DMAs are in progress, we can't generate + * a completion for the loopback packet since + * it would be out of order. + * XXX Instead of waiting, we could queue a + * zero length descriptor so we get a callback. + */ + if (atomic_read(&qp->s_dma_busy)) { + qp->s_flags |= IPATH_S_WAIT_DMA; + goto bail; + } + qp->s_cur = next_cur; + spin_unlock_irqrestore(&qp->s_lock, flags); + ipath_ud_loopback(qp, wqe); + spin_lock_irqsave(&qp->s_lock, flags); + ipath_send_complete(qp, wqe, IB_WC_SUCCESS); + goto done; + } + } + + qp->s_cur = next_cur; + extra_bytes = -wqe->length & 3; + nwords = (wqe->length + extra_bytes) >> 2; + + /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */ + qp->s_hdrwords = 7; + qp->s_cur_size = wqe->length; + qp->s_cur_sge = &qp->s_sge; + qp->s_dmult = ah_attr->static_rate; + qp->s_wqe = wqe; + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + + if (ah_attr->ah_flags & IB_AH_GRH) { + /* Header size in 32-bit words. */ + qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh, + &ah_attr->grh, + qp->s_hdrwords, nwords); + lrh0 = IPATH_LRH_GRH; + ohdr = &qp->s_hdr.u.l.oth; + /* + * Don't worry about sending to locally attached multicast + * QPs. It is unspecified by the spec. what happens. + */ + } else { + /* Header size in 32-bit words. */ + lrh0 = IPATH_LRH_BTH; + ohdr = &qp->s_hdr.u.oth; + } + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + qp->s_hdrwords++; + ohdr->u.ud.imm_data = wqe->wr.ex.imm_data; + bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24; + } else + bth0 = IB_OPCODE_UD_SEND_ONLY << 24; + lrh0 |= ah_attr->sl << 4; + if (qp->ibqp.qp_type == IB_QPT_SMI) + lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */ + qp->s_hdr.lrh[0] = cpu_to_be16(lrh0); + qp->s_hdr.lrh[1] = cpu_to_be16(ah_attr->dlid); /* DEST LID */ + qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + + SIZE_OF_CRC); + lid = dev->dd->ipath_lid; + if (lid) { + lid |= ah_attr->src_path_bits & + ((1 << dev->dd->ipath_lmc) - 1); + qp->s_hdr.lrh[3] = cpu_to_be16(lid); + } else + qp->s_hdr.lrh[3] = IB_LID_PERMISSIVE; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= 1 << 23; + bth0 |= extra_bytes << 20; + bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? IPATH_DEFAULT_P_KEY : + ipath_get_pkey(dev->dd, qp->s_pkey_index); + ohdr->bth[0] = cpu_to_be32(bth0); + /* + * Use the multicast QP if the destination LID is a multicast LID. + */ + ohdr->bth[1] = ah_attr->dlid >= IPATH_MULTICAST_LID_BASE && + ah_attr->dlid != IPATH_PERMISSIVE_LID ? + cpu_to_be32(IPATH_MULTICAST_QPN) : + cpu_to_be32(wqe->wr.wr.ud.remote_qpn); + ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & IPATH_PSN_MASK); + /* + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->wr.wr.ud.remote_qkey < 0 ? + qp->qkey : wqe->wr.wr.ud.remote_qkey); + ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); + +done: + ret = 1; + goto unlock; + +bail: + qp->s_flags &= ~IPATH_S_BUSY; +unlock: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +/** + * ipath_ud_rcv - receive an incoming UD packet + * @dev: the device the packet came in on + * @hdr: the packet header + * @has_grh: true if the packet has a GRH + * @data: the packet data + * @tlen: the packet length + * @qp: the QP the packet came on + * + * This is called from ipath_qp_rcv() to process an incoming UD packet + * for the given QP. + * Called at interrupt level. + */ +void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp) +{ + struct ipath_other_headers *ohdr; + int opcode; + u32 hdrsize; + u32 pad; + struct ib_wc wc; + u32 qkey; + u32 src_qp; + u16 dlid; + int header_in_data; + + /* Check for GRH */ + if (!has_grh) { + ohdr = &hdr->u.oth; + hdrsize = 8 + 12 + 8; /* LRH + BTH + DETH */ + qkey = be32_to_cpu(ohdr->u.ud.deth[0]); + src_qp = be32_to_cpu(ohdr->u.ud.deth[1]); + header_in_data = 0; + } else { + ohdr = &hdr->u.l.oth; + hdrsize = 8 + 40 + 12 + 8; /* LRH + GRH + BTH + DETH */ + /* + * The header with GRH is 68 bytes and the core driver sets + * the eager header buffer size to 56 bytes so the last 12 + * bytes of the IB header is in the data buffer. + */ + header_in_data = dev->dd->ipath_rcvhdrentsize == 16; + if (header_in_data) { + qkey = be32_to_cpu(((__be32 *) data)[1]); + src_qp = be32_to_cpu(((__be32 *) data)[2]); + data += 12; + } else { + qkey = be32_to_cpu(ohdr->u.ud.deth[0]); + src_qp = be32_to_cpu(ohdr->u.ud.deth[1]); + } + } + src_qp &= IPATH_QPN_MASK; + + /* + * Check that the permissive LID is only used on QP0 + * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1). + */ + if (qp->ibqp.qp_num) { + if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE || + hdr->lrh[3] == IB_LID_PERMISSIVE)) { + dev->n_pkt_drops++; + goto bail; + } + if (unlikely(qkey != qp->qkey)) { + /* XXX OK to lose a count once in a while. */ + dev->qkey_violations++; + dev->n_pkt_drops++; + goto bail; + } + } else if (hdr->lrh[1] == IB_LID_PERMISSIVE || + hdr->lrh[3] == IB_LID_PERMISSIVE) { + struct ib_smp *smp = (struct ib_smp *) data; + + if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + dev->n_pkt_drops++; + goto bail; + } + } + + /* + * The opcode is in the low byte when its in network order + * (top byte when in host order). + */ + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + if (qp->ibqp.qp_num > 1 && + opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) { + if (header_in_data) { + wc.ex.imm_data = *(__be32 *) data; + data += sizeof(__be32); + } else + wc.ex.imm_data = ohdr->u.ud.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + hdrsize += sizeof(u32); + } else if (opcode == IB_OPCODE_UD_SEND_ONLY) { + wc.ex.imm_data = 0; + wc.wc_flags = 0; + } else { + dev->n_pkt_drops++; + goto bail; + } + + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + if (unlikely(tlen < (hdrsize + pad + 4))) { + /* Drop incomplete packets. */ + dev->n_pkt_drops++; + goto bail; + } + tlen -= hdrsize + pad + 4; + + /* Drop invalid MAD packets (see 13.5.3.1). */ + if (unlikely((qp->ibqp.qp_num == 0 && + (tlen != 256 || + (be16_to_cpu(hdr->lrh[0]) >> 12) != 15)) || + (qp->ibqp.qp_num == 1 && + (tlen != 256 || + (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))) { + dev->n_pkt_drops++; + goto bail; + } + + /* + * A GRH is expected to precede the data even if not + * present on the wire. + */ + wc.byte_len = tlen + sizeof(struct ib_grh); + + /* + * Get the next work request entry to find where to put the data. + */ + if (qp->r_flags & IPATH_R_REUSE_SGE) + qp->r_flags &= ~IPATH_R_REUSE_SGE; + else if (!ipath_get_rwqe(qp, 0)) { + /* + * Count VL15 packets dropped due to no receive buffer. + * Otherwise, count them as buffer overruns since usually, + * the HW will be able to receive packets even if there are + * no QPs with posted receive buffers. + */ + if (qp->ibqp.qp_num == 0) + dev->n_vl15_dropped++; + else + dev->rcv_errors++; + goto bail; + } + /* Silently drop packets which are too big. */ + if (wc.byte_len > qp->r_len) { + qp->r_flags |= IPATH_R_REUSE_SGE; + dev->n_pkt_drops++; + goto bail; + } + if (has_grh) { + ipath_copy_sge(&qp->r_sge, &hdr->u.l.grh, + sizeof(struct ib_grh)); + wc.wc_flags |= IB_WC_GRH; + } else + ipath_skip_sge(&qp->r_sge, sizeof(struct ib_grh)); + ipath_copy_sge(&qp->r_sge, data, + wc.byte_len - sizeof(struct ib_grh)); + if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) + goto bail; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.vendor_err = 0; + wc.qp = &qp->ibqp; + wc.src_qp = src_qp; + /* XXX do we know which pkey matched? Only needed for GSI. */ + wc.pkey_index = 0; + wc.slid = be16_to_cpu(hdr->lrh[3]); + wc.sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF; + dlid = be16_to_cpu(hdr->lrh[1]); + /* + * Save the LMC lower bits if the destination LID is a unicast LID. + */ + wc.dlid_path_bits = dlid >= IPATH_MULTICAST_LID_BASE ? 0 : + dlid & ((1 << dev->dd->ipath_lmc) - 1); + wc.port_num = 1; + /* Signal completion event if the solicited bit is set. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + cpu_to_be32(1 << 23)) != 0); + +bail:; +} diff --git a/drivers/staging/rdma/ipath/ipath_user_pages.c b/drivers/staging/rdma/ipath/ipath_user_pages.c new file mode 100644 index 000000000..1da1252dc --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_user_pages.c @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "ipath_kernel.h" + +static void __ipath_release_user_pages(struct page **p, size_t num_pages, + int dirty) +{ + size_t i; + + for (i = 0; i < num_pages; i++) { + ipath_cdbg(MM, "%lu/%lu put_page %p\n", (unsigned long) i, + (unsigned long) num_pages, p[i]); + if (dirty) + set_page_dirty_lock(p[i]); + put_page(p[i]); + } +} + +/* call with current->mm->mmap_sem held */ +static int __ipath_get_user_pages(unsigned long start_page, size_t num_pages, + struct page **p) +{ + unsigned long lock_limit; + size_t got; + int ret; + + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if (num_pages > lock_limit) { + ret = -ENOMEM; + goto bail; + } + + ipath_cdbg(VERBOSE, "pin %lx pages from vaddr %lx\n", + (unsigned long) num_pages, start_page); + + for (got = 0; got < num_pages; got += ret) { + ret = get_user_pages(current, current->mm, + start_page + got * PAGE_SIZE, + num_pages - got, 1, 1, + p + got, NULL); + if (ret < 0) + goto bail_release; + } + + current->mm->pinned_vm += num_pages; + + ret = 0; + goto bail; + +bail_release: + __ipath_release_user_pages(p, got, 0); +bail: + return ret; +} + +/** + * ipath_map_page - a safety wrapper around pci_map_page() + * + * A dma_addr of all 0's is interpreted by the chip as "disabled". + * Unfortunately, it can also be a valid dma_addr returned on some + * architectures. + * + * The powerpc iommu assigns dma_addrs in ascending order, so we don't + * have to bother with retries or mapping a dummy page to insure we + * don't just get the same mapping again. + * + * I'm sure we won't be so lucky with other iommu's, so FIXME. + */ +dma_addr_t ipath_map_page(struct pci_dev *hwdev, struct page *page, + unsigned long offset, size_t size, int direction) +{ + dma_addr_t phys; + + phys = pci_map_page(hwdev, page, offset, size, direction); + + if (phys == 0) { + pci_unmap_page(hwdev, phys, size, direction); + phys = pci_map_page(hwdev, page, offset, size, direction); + /* + * FIXME: If we get 0 again, we should keep this page, + * map another, then free the 0 page. + */ + } + + return phys; +} + +/** + * ipath_map_single - a safety wrapper around pci_map_single() + * + * Same idea as ipath_map_page(). + */ +dma_addr_t ipath_map_single(struct pci_dev *hwdev, void *ptr, size_t size, + int direction) +{ + dma_addr_t phys; + + phys = pci_map_single(hwdev, ptr, size, direction); + + if (phys == 0) { + pci_unmap_single(hwdev, phys, size, direction); + phys = pci_map_single(hwdev, ptr, size, direction); + /* + * FIXME: If we get 0 again, we should keep this page, + * map another, then free the 0 page. + */ + } + + return phys; +} + +/** + * ipath_get_user_pages - lock user pages into memory + * @start_page: the start page + * @num_pages: the number of pages + * @p: the output page structures + * + * This function takes a given start page (page aligned user virtual + * address) and pins it and the following specified number of pages. For + * now, num_pages is always 1, but that will probably change at some point + * (because caller is doing expected sends on a single virtually contiguous + * buffer, so we can do all pages at once). + */ +int ipath_get_user_pages(unsigned long start_page, size_t num_pages, + struct page **p) +{ + int ret; + + down_write(¤t->mm->mmap_sem); + + ret = __ipath_get_user_pages(start_page, num_pages, p); + + up_write(¤t->mm->mmap_sem); + + return ret; +} + +void ipath_release_user_pages(struct page **p, size_t num_pages) +{ + down_write(¤t->mm->mmap_sem); + + __ipath_release_user_pages(p, num_pages, 1); + + current->mm->pinned_vm -= num_pages; + + up_write(¤t->mm->mmap_sem); +} + +struct ipath_user_pages_work { + struct work_struct work; + struct mm_struct *mm; + unsigned long num_pages; +}; + +static void user_pages_account(struct work_struct *_work) +{ + struct ipath_user_pages_work *work = + container_of(_work, struct ipath_user_pages_work, work); + + down_write(&work->mm->mmap_sem); + work->mm->pinned_vm -= work->num_pages; + up_write(&work->mm->mmap_sem); + mmput(work->mm); + kfree(work); +} + +void ipath_release_user_pages_on_close(struct page **p, size_t num_pages) +{ + struct ipath_user_pages_work *work; + struct mm_struct *mm; + + __ipath_release_user_pages(p, num_pages, 1); + + mm = get_task_mm(current); + if (!mm) + return; + + work = kmalloc(sizeof(*work), GFP_KERNEL); + if (!work) + goto bail_mm; + + INIT_WORK(&work->work, user_pages_account); + work->mm = mm; + work->num_pages = num_pages; + + queue_work(ib_wq, &work->work); + return; + +bail_mm: + mmput(mm); + return; +} diff --git a/drivers/staging/rdma/ipath/ipath_user_sdma.c b/drivers/staging/rdma/ipath/ipath_user_sdma.c new file mode 100644 index 000000000..cc04b7ba3 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_user_sdma.c @@ -0,0 +1,875 @@ +/* + * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" +#include "ipath_user_sdma.h" + +/* minimum size of header */ +#define IPATH_USER_SDMA_MIN_HEADER_LENGTH 64 +/* expected size of headers (for dma_pool) */ +#define IPATH_USER_SDMA_EXP_HEADER_LENGTH 64 +/* length mask in PBC (lower 11 bits) */ +#define IPATH_PBC_LENGTH_MASK ((1 << 11) - 1) + +struct ipath_user_sdma_pkt { + u8 naddr; /* dimension of addr (1..3) ... */ + u32 counter; /* sdma pkts queued counter for this entry */ + u64 added; /* global descq number of entries */ + + struct { + u32 offset; /* offset for kvaddr, addr */ + u32 length; /* length in page */ + u8 put_page; /* should we put_page? */ + u8 dma_mapped; /* is page dma_mapped? */ + struct page *page; /* may be NULL (coherent mem) */ + void *kvaddr; /* FIXME: only for pio hack */ + dma_addr_t addr; + } addr[4]; /* max pages, any more and we coalesce */ + struct list_head list; /* list element */ +}; + +struct ipath_user_sdma_queue { + /* + * pkts sent to dma engine are queued on this + * list head. the type of the elements of this + * list are struct ipath_user_sdma_pkt... + */ + struct list_head sent; + + /* headers with expected length are allocated from here... */ + char header_cache_name[64]; + struct dma_pool *header_cache; + + /* packets are allocated from the slab cache... */ + char pkt_slab_name[64]; + struct kmem_cache *pkt_slab; + + /* as packets go on the queued queue, they are counted... */ + u32 counter; + u32 sent_counter; + + /* dma page table */ + struct rb_root dma_pages_root; + + /* protect everything above... */ + struct mutex lock; +}; + +struct ipath_user_sdma_queue * +ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport) +{ + struct ipath_user_sdma_queue *pq = + kmalloc(sizeof(struct ipath_user_sdma_queue), GFP_KERNEL); + + if (!pq) + goto done; + + pq->counter = 0; + pq->sent_counter = 0; + INIT_LIST_HEAD(&pq->sent); + + mutex_init(&pq->lock); + + snprintf(pq->pkt_slab_name, sizeof(pq->pkt_slab_name), + "ipath-user-sdma-pkts-%u-%02u.%02u", unit, port, sport); + pq->pkt_slab = kmem_cache_create(pq->pkt_slab_name, + sizeof(struct ipath_user_sdma_pkt), + 0, 0, NULL); + + if (!pq->pkt_slab) + goto err_kfree; + + snprintf(pq->header_cache_name, sizeof(pq->header_cache_name), + "ipath-user-sdma-headers-%u-%02u.%02u", unit, port, sport); + pq->header_cache = dma_pool_create(pq->header_cache_name, + dev, + IPATH_USER_SDMA_EXP_HEADER_LENGTH, + 4, 0); + if (!pq->header_cache) + goto err_slab; + + pq->dma_pages_root = RB_ROOT; + + goto done; + +err_slab: + kmem_cache_destroy(pq->pkt_slab); +err_kfree: + kfree(pq); + pq = NULL; + +done: + return pq; +} + +static void ipath_user_sdma_init_frag(struct ipath_user_sdma_pkt *pkt, + int i, size_t offset, size_t len, + int put_page, int dma_mapped, + struct page *page, + void *kvaddr, dma_addr_t dma_addr) +{ + pkt->addr[i].offset = offset; + pkt->addr[i].length = len; + pkt->addr[i].put_page = put_page; + pkt->addr[i].dma_mapped = dma_mapped; + pkt->addr[i].page = page; + pkt->addr[i].kvaddr = kvaddr; + pkt->addr[i].addr = dma_addr; +} + +static void ipath_user_sdma_init_header(struct ipath_user_sdma_pkt *pkt, + u32 counter, size_t offset, + size_t len, int dma_mapped, + struct page *page, + void *kvaddr, dma_addr_t dma_addr) +{ + pkt->naddr = 1; + pkt->counter = counter; + ipath_user_sdma_init_frag(pkt, 0, offset, len, 0, dma_mapped, page, + kvaddr, dma_addr); +} + +/* we've too many pages in the iovec, coalesce to a single page */ +static int ipath_user_sdma_coalesce(const struct ipath_devdata *dd, + struct ipath_user_sdma_pkt *pkt, + const struct iovec *iov, + unsigned long niov) { + int ret = 0; + struct page *page = alloc_page(GFP_KERNEL); + void *mpage_save; + char *mpage; + int i; + int len = 0; + dma_addr_t dma_addr; + + if (!page) { + ret = -ENOMEM; + goto done; + } + + mpage = kmap(page); + mpage_save = mpage; + for (i = 0; i < niov; i++) { + int cfur; + + cfur = copy_from_user(mpage, + iov[i].iov_base, iov[i].iov_len); + if (cfur) { + ret = -EFAULT; + goto free_unmap; + } + + mpage += iov[i].iov_len; + len += iov[i].iov_len; + } + + dma_addr = dma_map_page(&dd->pcidev->dev, page, 0, len, + DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) { + ret = -ENOMEM; + goto free_unmap; + } + + ipath_user_sdma_init_frag(pkt, 1, 0, len, 0, 1, page, mpage_save, + dma_addr); + pkt->naddr = 2; + + goto done; + +free_unmap: + kunmap(page); + __free_page(page); +done: + return ret; +} + +/* how many pages in this iovec element? */ +static int ipath_user_sdma_num_pages(const struct iovec *iov) +{ + const unsigned long addr = (unsigned long) iov->iov_base; + const unsigned long len = iov->iov_len; + const unsigned long spage = addr & PAGE_MASK; + const unsigned long epage = (addr + len - 1) & PAGE_MASK; + + return 1 + ((epage - spage) >> PAGE_SHIFT); +} + +/* truncate length to page boundary */ +static int ipath_user_sdma_page_length(unsigned long addr, unsigned long len) +{ + const unsigned long offset = addr & ~PAGE_MASK; + + return ((offset + len) > PAGE_SIZE) ? (PAGE_SIZE - offset) : len; +} + +static void ipath_user_sdma_free_pkt_frag(struct device *dev, + struct ipath_user_sdma_queue *pq, + struct ipath_user_sdma_pkt *pkt, + int frag) +{ + const int i = frag; + + if (pkt->addr[i].page) { + if (pkt->addr[i].dma_mapped) + dma_unmap_page(dev, + pkt->addr[i].addr, + pkt->addr[i].length, + DMA_TO_DEVICE); + + if (pkt->addr[i].kvaddr) + kunmap(pkt->addr[i].page); + + if (pkt->addr[i].put_page) + put_page(pkt->addr[i].page); + else + __free_page(pkt->addr[i].page); + } else if (pkt->addr[i].kvaddr) + /* free coherent mem from cache... */ + dma_pool_free(pq->header_cache, + pkt->addr[i].kvaddr, pkt->addr[i].addr); +} + +/* return number of pages pinned... */ +static int ipath_user_sdma_pin_pages(const struct ipath_devdata *dd, + struct ipath_user_sdma_pkt *pkt, + unsigned long addr, int tlen, int npages) +{ + struct page *pages[2]; + int j; + int ret; + + ret = get_user_pages_fast(addr, npages, 0, pages); + if (ret != npages) { + int i; + + for (i = 0; i < ret; i++) + put_page(pages[i]); + + ret = -ENOMEM; + goto done; + } + + for (j = 0; j < npages; j++) { + /* map the pages... */ + const int flen = + ipath_user_sdma_page_length(addr, tlen); + dma_addr_t dma_addr = + dma_map_page(&dd->pcidev->dev, + pages[j], 0, flen, DMA_TO_DEVICE); + unsigned long fofs = addr & ~PAGE_MASK; + + if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) { + ret = -ENOMEM; + goto done; + } + + ipath_user_sdma_init_frag(pkt, pkt->naddr, fofs, flen, 1, 1, + pages[j], kmap(pages[j]), + dma_addr); + + pkt->naddr++; + addr += flen; + tlen -= flen; + } + +done: + return ret; +} + +static int ipath_user_sdma_pin_pkt(const struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + struct ipath_user_sdma_pkt *pkt, + const struct iovec *iov, + unsigned long niov) +{ + int ret = 0; + unsigned long idx; + + for (idx = 0; idx < niov; idx++) { + const int npages = ipath_user_sdma_num_pages(iov + idx); + const unsigned long addr = (unsigned long) iov[idx].iov_base; + + ret = ipath_user_sdma_pin_pages(dd, pkt, + addr, iov[idx].iov_len, + npages); + if (ret < 0) + goto free_pkt; + } + + goto done; + +free_pkt: + for (idx = 0; idx < pkt->naddr; idx++) + ipath_user_sdma_free_pkt_frag(&dd->pcidev->dev, pq, pkt, idx); + +done: + return ret; +} + +static int ipath_user_sdma_init_payload(const struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + struct ipath_user_sdma_pkt *pkt, + const struct iovec *iov, + unsigned long niov, int npages) +{ + int ret = 0; + + if (npages >= ARRAY_SIZE(pkt->addr)) + ret = ipath_user_sdma_coalesce(dd, pkt, iov, niov); + else + ret = ipath_user_sdma_pin_pkt(dd, pq, pkt, iov, niov); + + return ret; +} + +/* free a packet list -- return counter value of last packet */ +static void ipath_user_sdma_free_pkt_list(struct device *dev, + struct ipath_user_sdma_queue *pq, + struct list_head *list) +{ + struct ipath_user_sdma_pkt *pkt, *pkt_next; + + list_for_each_entry_safe(pkt, pkt_next, list, list) { + int i; + + for (i = 0; i < pkt->naddr; i++) + ipath_user_sdma_free_pkt_frag(dev, pq, pkt, i); + + kmem_cache_free(pq->pkt_slab, pkt); + } +} + +/* + * copy headers, coalesce etc -- pq->lock must be held + * + * we queue all the packets to list, returning the + * number of bytes total. list must be empty initially, + * as, if there is an error we clean it... + */ +static int ipath_user_sdma_queue_pkts(const struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + struct list_head *list, + const struct iovec *iov, + unsigned long niov, + int maxpkts) +{ + unsigned long idx = 0; + int ret = 0; + int npkts = 0; + struct page *page = NULL; + __le32 *pbc; + dma_addr_t dma_addr; + struct ipath_user_sdma_pkt *pkt = NULL; + size_t len; + size_t nw; + u32 counter = pq->counter; + int dma_mapped = 0; + + while (idx < niov && npkts < maxpkts) { + const unsigned long addr = (unsigned long) iov[idx].iov_base; + const unsigned long idx_save = idx; + unsigned pktnw; + unsigned pktnwc; + int nfrags = 0; + int npages = 0; + int cfur; + + dma_mapped = 0; + len = iov[idx].iov_len; + nw = len >> 2; + page = NULL; + + pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL); + if (!pkt) { + ret = -ENOMEM; + goto free_list; + } + + if (len < IPATH_USER_SDMA_MIN_HEADER_LENGTH || + len > PAGE_SIZE || len & 3 || addr & 3) { + ret = -EINVAL; + goto free_pkt; + } + + if (len == IPATH_USER_SDMA_EXP_HEADER_LENGTH) + pbc = dma_pool_alloc(pq->header_cache, GFP_KERNEL, + &dma_addr); + else + pbc = NULL; + + if (!pbc) { + page = alloc_page(GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + goto free_pkt; + } + pbc = kmap(page); + } + + cfur = copy_from_user(pbc, iov[idx].iov_base, len); + if (cfur) { + ret = -EFAULT; + goto free_pbc; + } + + /* + * this assignment is a bit strange. it's because the + * the pbc counts the number of 32 bit words in the full + * packet _except_ the first word of the pbc itself... + */ + pktnwc = nw - 1; + + /* + * pktnw computation yields the number of 32 bit words + * that the caller has indicated in the PBC. note that + * this is one less than the total number of words that + * goes to the send DMA engine as the first 32 bit word + * of the PBC itself is not counted. Armed with this count, + * we can verify that the packet is consistent with the + * iovec lengths. + */ + pktnw = le32_to_cpu(*pbc) & IPATH_PBC_LENGTH_MASK; + if (pktnw < pktnwc || pktnw > pktnwc + (PAGE_SIZE >> 2)) { + ret = -EINVAL; + goto free_pbc; + } + + + idx++; + while (pktnwc < pktnw && idx < niov) { + const size_t slen = iov[idx].iov_len; + const unsigned long faddr = + (unsigned long) iov[idx].iov_base; + + if (slen & 3 || faddr & 3 || !slen || + slen > PAGE_SIZE) { + ret = -EINVAL; + goto free_pbc; + } + + npages++; + if ((faddr & PAGE_MASK) != + ((faddr + slen - 1) & PAGE_MASK)) + npages++; + + pktnwc += slen >> 2; + idx++; + nfrags++; + } + + if (pktnwc != pktnw) { + ret = -EINVAL; + goto free_pbc; + } + + if (page) { + dma_addr = dma_map_page(&dd->pcidev->dev, + page, 0, len, DMA_TO_DEVICE); + if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) { + ret = -ENOMEM; + goto free_pbc; + } + + dma_mapped = 1; + } + + ipath_user_sdma_init_header(pkt, counter, 0, len, dma_mapped, + page, pbc, dma_addr); + + if (nfrags) { + ret = ipath_user_sdma_init_payload(dd, pq, pkt, + iov + idx_save + 1, + nfrags, npages); + if (ret < 0) + goto free_pbc_dma; + } + + counter++; + npkts++; + + list_add_tail(&pkt->list, list); + } + + ret = idx; + goto done; + +free_pbc_dma: + if (dma_mapped) + dma_unmap_page(&dd->pcidev->dev, dma_addr, len, DMA_TO_DEVICE); +free_pbc: + if (page) { + kunmap(page); + __free_page(page); + } else + dma_pool_free(pq->header_cache, pbc, dma_addr); +free_pkt: + kmem_cache_free(pq->pkt_slab, pkt); +free_list: + ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, list); +done: + return ret; +} + +static void ipath_user_sdma_set_complete_counter(struct ipath_user_sdma_queue *pq, + u32 c) +{ + pq->sent_counter = c; +} + +/* try to clean out queue -- needs pq->lock */ +static int ipath_user_sdma_queue_clean(const struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq) +{ + struct list_head free_list; + struct ipath_user_sdma_pkt *pkt; + struct ipath_user_sdma_pkt *pkt_prev; + int ret = 0; + + INIT_LIST_HEAD(&free_list); + + list_for_each_entry_safe(pkt, pkt_prev, &pq->sent, list) { + s64 descd = dd->ipath_sdma_descq_removed - pkt->added; + + if (descd < 0) + break; + + list_move_tail(&pkt->list, &free_list); + + /* one more packet cleaned */ + ret++; + } + + if (!list_empty(&free_list)) { + u32 counter; + + pkt = list_entry(free_list.prev, + struct ipath_user_sdma_pkt, list); + counter = pkt->counter; + + ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list); + ipath_user_sdma_set_complete_counter(pq, counter); + } + + return ret; +} + +void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq) +{ + if (!pq) + return; + + kmem_cache_destroy(pq->pkt_slab); + dma_pool_destroy(pq->header_cache); + kfree(pq); +} + +/* clean descriptor queue, returns > 0 if some elements cleaned */ +static int ipath_user_sdma_hwqueue_clean(struct ipath_devdata *dd) +{ + int ret; + unsigned long flags; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + ret = ipath_sdma_make_progress(dd); + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + return ret; +} + +/* we're in close, drain packets so that we can cleanup successfully... */ +void ipath_user_sdma_queue_drain(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq) +{ + int i; + + if (!pq) + return; + + for (i = 0; i < 100; i++) { + mutex_lock(&pq->lock); + if (list_empty(&pq->sent)) { + mutex_unlock(&pq->lock); + break; + } + ipath_user_sdma_hwqueue_clean(dd); + ipath_user_sdma_queue_clean(dd, pq); + mutex_unlock(&pq->lock); + msleep(10); + } + + if (!list_empty(&pq->sent)) { + struct list_head free_list; + + printk(KERN_INFO "drain: lists not empty: forcing!\n"); + INIT_LIST_HEAD(&free_list); + mutex_lock(&pq->lock); + list_splice_init(&pq->sent, &free_list); + ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list); + mutex_unlock(&pq->lock); + } +} + +static inline __le64 ipath_sdma_make_desc0(struct ipath_devdata *dd, + u64 addr, u64 dwlen, u64 dwoffset) +{ + return cpu_to_le64(/* SDmaPhyAddr[31:0] */ + ((addr & 0xfffffffcULL) << 32) | + /* SDmaGeneration[1:0] */ + ((dd->ipath_sdma_generation & 3ULL) << 30) | + /* SDmaDwordCount[10:0] */ + ((dwlen & 0x7ffULL) << 16) | + /* SDmaBufOffset[12:2] */ + (dwoffset & 0x7ffULL)); +} + +static inline __le64 ipath_sdma_make_first_desc0(__le64 descq) +{ + return descq | cpu_to_le64(1ULL << 12); +} + +static inline __le64 ipath_sdma_make_last_desc0(__le64 descq) +{ + /* last */ /* dma head */ + return descq | cpu_to_le64(1ULL << 11 | 1ULL << 13); +} + +static inline __le64 ipath_sdma_make_desc1(u64 addr) +{ + /* SDmaPhyAddr[47:32] */ + return cpu_to_le64(addr >> 32); +} + +static void ipath_user_sdma_send_frag(struct ipath_devdata *dd, + struct ipath_user_sdma_pkt *pkt, int idx, + unsigned ofs, u16 tail) +{ + const u64 addr = (u64) pkt->addr[idx].addr + + (u64) pkt->addr[idx].offset; + const u64 dwlen = (u64) pkt->addr[idx].length / 4; + __le64 *descqp; + __le64 descq0; + + descqp = &dd->ipath_sdma_descq[tail].qw[0]; + + descq0 = ipath_sdma_make_desc0(dd, addr, dwlen, ofs); + if (idx == 0) + descq0 = ipath_sdma_make_first_desc0(descq0); + if (idx == pkt->naddr - 1) + descq0 = ipath_sdma_make_last_desc0(descq0); + + descqp[0] = descq0; + descqp[1] = ipath_sdma_make_desc1(addr); +} + +/* pq->lock must be held, get packets on the wire... */ +static int ipath_user_sdma_push_pkts(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + struct list_head *pktlist) +{ + int ret = 0; + unsigned long flags; + u16 tail; + + if (list_empty(pktlist)) + return 0; + + if (unlikely(!(dd->ipath_flags & IPATH_LINKACTIVE))) + return -ECOMM; + + spin_lock_irqsave(&dd->ipath_sdma_lock, flags); + + if (unlikely(dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK)) { + ret = -ECOMM; + goto unlock; + } + + tail = dd->ipath_sdma_descq_tail; + while (!list_empty(pktlist)) { + struct ipath_user_sdma_pkt *pkt = + list_entry(pktlist->next, struct ipath_user_sdma_pkt, + list); + int i; + unsigned ofs = 0; + u16 dtail = tail; + + if (pkt->naddr > ipath_sdma_descq_freecnt(dd)) + goto unlock_check_tail; + + for (i = 0; i < pkt->naddr; i++) { + ipath_user_sdma_send_frag(dd, pkt, i, ofs, tail); + ofs += pkt->addr[i].length >> 2; + + if (++tail == dd->ipath_sdma_descq_cnt) { + tail = 0; + ++dd->ipath_sdma_generation; + } + } + + if ((ofs<<2) > dd->ipath_ibmaxlen) { + ipath_dbg("packet size %X > ibmax %X, fail\n", + ofs<<2, dd->ipath_ibmaxlen); + ret = -EMSGSIZE; + goto unlock; + } + + /* + * if the packet is >= 2KB mtu equivalent, we have to use + * the large buffers, and have to mark each descriptor as + * part of a large buffer packet. + */ + if (ofs >= IPATH_SMALLBUF_DWORDS) { + for (i = 0; i < pkt->naddr; i++) { + dd->ipath_sdma_descq[dtail].qw[0] |= + cpu_to_le64(1ULL << 14); + if (++dtail == dd->ipath_sdma_descq_cnt) + dtail = 0; + } + } + + dd->ipath_sdma_descq_added += pkt->naddr; + pkt->added = dd->ipath_sdma_descq_added; + list_move_tail(&pkt->list, &pq->sent); + ret++; + } + +unlock_check_tail: + /* advance the tail on the chip if necessary */ + if (dd->ipath_sdma_descq_tail != tail) { + wmb(); + ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail); + dd->ipath_sdma_descq_tail = tail; + } + +unlock: + spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags); + + return ret; +} + +int ipath_user_sdma_writev(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + const struct iovec *iov, + unsigned long dim) +{ + int ret = 0; + struct list_head list; + int npkts = 0; + + INIT_LIST_HEAD(&list); + + mutex_lock(&pq->lock); + + if (dd->ipath_sdma_descq_added != dd->ipath_sdma_descq_removed) { + ipath_user_sdma_hwqueue_clean(dd); + ipath_user_sdma_queue_clean(dd, pq); + } + + while (dim) { + const int mxp = 8; + + ret = ipath_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp); + if (ret <= 0) + goto done_unlock; + else { + dim -= ret; + iov += ret; + } + + /* force packets onto the sdma hw queue... */ + if (!list_empty(&list)) { + /* + * lazily clean hw queue. the 4 is a guess of about + * how many sdma descriptors a packet will take (it + * doesn't have to be perfect). + */ + if (ipath_sdma_descq_freecnt(dd) < ret * 4) { + ipath_user_sdma_hwqueue_clean(dd); + ipath_user_sdma_queue_clean(dd, pq); + } + + ret = ipath_user_sdma_push_pkts(dd, pq, &list); + if (ret < 0) + goto done_unlock; + else { + npkts += ret; + pq->counter += ret; + + if (!list_empty(&list)) + goto done_unlock; + } + } + } + +done_unlock: + if (!list_empty(&list)) + ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &list); + mutex_unlock(&pq->lock); + + return (ret < 0) ? ret : npkts; +} + +int ipath_user_sdma_make_progress(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq) +{ + int ret = 0; + + mutex_lock(&pq->lock); + ipath_user_sdma_hwqueue_clean(dd); + ret = ipath_user_sdma_queue_clean(dd, pq); + mutex_unlock(&pq->lock); + + return ret; +} + +u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq) +{ + return pq->sent_counter; +} + +u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq) +{ + return pq->counter; +} + diff --git a/drivers/staging/rdma/ipath/ipath_user_sdma.h b/drivers/staging/rdma/ipath/ipath_user_sdma.h new file mode 100644 index 000000000..fc76316c4 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_user_sdma.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include + +struct ipath_user_sdma_queue; + +struct ipath_user_sdma_queue * +ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport); +void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq); + +int ipath_user_sdma_writev(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq, + const struct iovec *iov, + unsigned long dim); + +int ipath_user_sdma_make_progress(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq); + +void ipath_user_sdma_queue_drain(struct ipath_devdata *dd, + struct ipath_user_sdma_queue *pq); + +u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq); +u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq); diff --git a/drivers/staging/rdma/ipath/ipath_verbs.c b/drivers/staging/rdma/ipath/ipath_verbs.c new file mode 100644 index 000000000..ed2bbc2f7 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_verbs.c @@ -0,0 +1,2365 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" +#include "ipath_verbs.h" +#include "ipath_common.h" + +static unsigned int ib_ipath_qp_table_size = 251; +module_param_named(qp_table_size, ib_ipath_qp_table_size, uint, S_IRUGO); +MODULE_PARM_DESC(qp_table_size, "QP table size"); + +unsigned int ib_ipath_lkey_table_size = 12; +module_param_named(lkey_table_size, ib_ipath_lkey_table_size, uint, + S_IRUGO); +MODULE_PARM_DESC(lkey_table_size, + "LKEY table size in bits (2^n, 1 <= n <= 23)"); + +static unsigned int ib_ipath_max_pds = 0xFFFF; +module_param_named(max_pds, ib_ipath_max_pds, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_pds, + "Maximum number of protection domains to support"); + +static unsigned int ib_ipath_max_ahs = 0xFFFF; +module_param_named(max_ahs, ib_ipath_max_ahs, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support"); + +unsigned int ib_ipath_max_cqes = 0x2FFFF; +module_param_named(max_cqes, ib_ipath_max_cqes, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_cqes, + "Maximum number of completion queue entries to support"); + +unsigned int ib_ipath_max_cqs = 0x1FFFF; +module_param_named(max_cqs, ib_ipath_max_cqs, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support"); + +unsigned int ib_ipath_max_qp_wrs = 0x3FFF; +module_param_named(max_qp_wrs, ib_ipath_max_qp_wrs, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support"); + +unsigned int ib_ipath_max_qps = 16384; +module_param_named(max_qps, ib_ipath_max_qps, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support"); + +unsigned int ib_ipath_max_sges = 0x60; +module_param_named(max_sges, ib_ipath_max_sges, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support"); + +unsigned int ib_ipath_max_mcast_grps = 16384; +module_param_named(max_mcast_grps, ib_ipath_max_mcast_grps, uint, + S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_mcast_grps, + "Maximum number of multicast groups to support"); + +unsigned int ib_ipath_max_mcast_qp_attached = 16; +module_param_named(max_mcast_qp_attached, ib_ipath_max_mcast_qp_attached, + uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_mcast_qp_attached, + "Maximum number of attached QPs to support"); + +unsigned int ib_ipath_max_srqs = 1024; +module_param_named(max_srqs, ib_ipath_max_srqs, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support"); + +unsigned int ib_ipath_max_srq_sges = 128; +module_param_named(max_srq_sges, ib_ipath_max_srq_sges, + uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support"); + +unsigned int ib_ipath_max_srq_wrs = 0x1FFFF; +module_param_named(max_srq_wrs, ib_ipath_max_srq_wrs, + uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support"); + +static unsigned int ib_ipath_disable_sma; +module_param_named(disable_sma, ib_ipath_disable_sma, uint, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(disable_sma, "Disable the SMA"); + +/* + * Note that it is OK to post send work requests in the SQE and ERR + * states; ipath_do_send() will process them and generate error + * completions as per IB 1.2 C10-96. + */ +const int ib_ipath_state_ops[IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = 0, + [IB_QPS_INIT] = IPATH_POST_RECV_OK, + [IB_QPS_RTR] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK, + [IB_QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK | + IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK | + IPATH_PROCESS_NEXT_SEND_OK, + [IB_QPS_SQD] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK | + IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK, + [IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK | + IPATH_POST_SEND_OK | IPATH_FLUSH_SEND, + [IB_QPS_ERR] = IPATH_POST_RECV_OK | IPATH_FLUSH_RECV | + IPATH_POST_SEND_OK | IPATH_FLUSH_SEND, +}; + +struct ipath_ucontext { + struct ib_ucontext ibucontext; +}; + +static inline struct ipath_ucontext *to_iucontext(struct ib_ucontext + *ibucontext) +{ + return container_of(ibucontext, struct ipath_ucontext, ibucontext); +} + +/* + * Translate ib_wr_opcode into ib_wc_opcode. + */ +const enum ib_wc_opcode ib_ipath_wc_opcode[] = { + [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE, + [IB_WR_SEND] = IB_WC_SEND, + [IB_WR_SEND_WITH_IMM] = IB_WC_SEND, + [IB_WR_RDMA_READ] = IB_WC_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP, + [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD +}; + +/* + * System image GUID. + */ +static __be64 sys_image_guid; + +/** + * ipath_copy_sge - copy data to SGE memory + * @ss: the SGE state + * @data: the data to copy + * @length: the length of the data + */ +void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length) +{ + struct ipath_sge *sge = &ss->sge; + + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + memcpy(sge->vaddr, data, len); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + data += len; + length -= len; + } +} + +/** + * ipath_skip_sge - skip over SGE memory - XXX almost dup of prev func + * @ss: the SGE state + * @length: the number of bytes to skip + */ +void ipath_skip_sge(struct ipath_sge_state *ss, u32 length) +{ + struct ipath_sge *sge = &ss->sge; + + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } +} + +/* + * Count the number of DMA descriptors needed to send length bytes of data. + * Don't modify the ipath_sge_state to get the count. + * Return zero if any of the segments is not aligned. + */ +static u32 ipath_count_sge(struct ipath_sge_state *ss, u32 length) +{ + struct ipath_sge *sg_list = ss->sg_list; + struct ipath_sge sge = ss->sge; + u8 num_sge = ss->num_sge; + u32 ndesc = 1; /* count the header */ + + while (length) { + u32 len = sge.length; + + if (len > length) + len = length; + if (len > sge.sge_length) + len = sge.sge_length; + BUG_ON(len == 0); + if (((long) sge.vaddr & (sizeof(u32) - 1)) || + (len != length && (len & (sizeof(u32) - 1)))) { + ndesc = 0; + break; + } + ndesc++; + sge.vaddr += len; + sge.length -= len; + sge.sge_length -= len; + if (sge.sge_length == 0) { + if (--num_sge) + sge = *sg_list++; + } else if (sge.length == 0 && sge.mr != NULL) { + if (++sge.n >= IPATH_SEGSZ) { + if (++sge.m >= sge.mr->mapsz) + break; + sge.n = 0; + } + sge.vaddr = + sge.mr->map[sge.m]->segs[sge.n].vaddr; + sge.length = + sge.mr->map[sge.m]->segs[sge.n].length; + } + length -= len; + } + return ndesc; +} + +/* + * Copy from the SGEs to the data buffer. + */ +static void ipath_copy_from_sge(void *data, struct ipath_sge_state *ss, + u32 length) +{ + struct ipath_sge *sge = &ss->sge; + + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + BUG_ON(len == 0); + memcpy(data, sge->vaddr, len); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + data += len; + length -= len; + } +} + +/** + * ipath_post_one_send - post one RC, UC, or UD send work request + * @qp: the QP to post on + * @wr: the work request to send + */ +static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr) +{ + struct ipath_swqe *wqe; + u32 next; + int i; + int j; + int acc; + int ret; + unsigned long flags; + struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd; + + spin_lock_irqsave(&qp->s_lock, flags); + + if (qp->ibqp.qp_type != IB_QPT_SMI && + !(dd->ipath_flags & IPATH_LINKACTIVE)) { + ret = -ENETDOWN; + goto bail; + } + + /* Check that state is OK to post send. */ + if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK))) + goto bail_inval; + + /* IB spec says that num_sge == 0 is OK. */ + if (wr->num_sge > qp->s_max_sge) + goto bail_inval; + + /* + * Don't allow RDMA reads or atomic operations on UC or + * undefined operations. + * Make sure buffer is large enough to hold the result for atomics. + */ + if (qp->ibqp.qp_type == IB_QPT_UC) { + if ((unsigned) wr->opcode >= IB_WR_RDMA_READ) + goto bail_inval; + } else if (qp->ibqp.qp_type == IB_QPT_UD) { + /* Check UD opcode */ + if (wr->opcode != IB_WR_SEND && + wr->opcode != IB_WR_SEND_WITH_IMM) + goto bail_inval; + /* Check UD destination address PD */ + if (qp->ibqp.pd != wr->wr.ud.ah->pd) + goto bail_inval; + } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) + goto bail_inval; + else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP && + (wr->num_sge == 0 || + wr->sg_list[0].length < sizeof(u64) || + wr->sg_list[0].addr & (sizeof(u64) - 1))) + goto bail_inval; + else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) + goto bail_inval; + + next = qp->s_head + 1; + if (next >= qp->s_size) + next = 0; + if (next == qp->s_last) { + ret = -ENOMEM; + goto bail; + } + + wqe = get_swqe_ptr(qp, qp->s_head); + wqe->wr = *wr; + wqe->length = 0; + if (wr->num_sge) { + acc = wr->opcode >= IB_WR_RDMA_READ ? + IB_ACCESS_LOCAL_WRITE : 0; + for (i = 0, j = 0; i < wr->num_sge; i++) { + u32 length = wr->sg_list[i].length; + int ok; + + if (length == 0) + continue; + ok = ipath_lkey_ok(qp, &wqe->sg_list[j], + &wr->sg_list[i], acc); + if (!ok) + goto bail_inval; + wqe->length += length; + j++; + } + wqe->wr.num_sge = j; + } + if (qp->ibqp.qp_type == IB_QPT_UC || + qp->ibqp.qp_type == IB_QPT_RC) { + if (wqe->length > 0x80000000U) + goto bail_inval; + } else if (wqe->length > to_idev(qp->ibqp.device)->dd->ipath_ibmtu) + goto bail_inval; + wqe->ssn = qp->s_ssn++; + qp->s_head = next; + + ret = 0; + goto bail; + +bail_inval: + ret = -EINVAL; +bail: + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +/** + * ipath_post_send - post a send on a QP + * @ibqp: the QP to post the send on + * @wr: the list of work requests to post + * @bad_wr: the first bad WR is put here + * + * This may be called from interrupt context. + */ +static int ipath_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct ib_send_wr **bad_wr) +{ + struct ipath_qp *qp = to_iqp(ibqp); + int err = 0; + + for (; wr; wr = wr->next) { + err = ipath_post_one_send(qp, wr); + if (err) { + *bad_wr = wr; + goto bail; + } + } + + /* Try to do the send work in the caller's context. */ + ipath_do_send((unsigned long) qp); + +bail: + return err; +} + +/** + * ipath_post_receive - post a receive on a QP + * @ibqp: the QP to post the receive on + * @wr: the WR to post + * @bad_wr: the first bad WR is put here + * + * This may be called from interrupt context. + */ +static int ipath_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct ipath_qp *qp = to_iqp(ibqp); + struct ipath_rwq *wq = qp->r_rq.wq; + unsigned long flags; + int ret; + + /* Check that state is OK to post receive. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_RECV_OK) || !wq) { + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + + for (; wr; wr = wr->next) { + struct ipath_rwqe *wqe; + u32 next; + int i; + + if ((unsigned) wr->num_sge > qp->r_rq.max_sge) { + *bad_wr = wr; + ret = -EINVAL; + goto bail; + } + + spin_lock_irqsave(&qp->r_rq.lock, flags); + next = wq->head + 1; + if (next >= qp->r_rq.size) + next = 0; + if (next == wq->tail) { + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + *bad_wr = wr; + ret = -ENOMEM; + goto bail; + } + + wqe = get_rwqe_ptr(&qp->r_rq, wq->head); + wqe->wr_id = wr->wr_id; + wqe->num_sge = wr->num_sge; + for (i = 0; i < wr->num_sge; i++) + wqe->sg_list[i] = wr->sg_list[i]; + /* Make sure queue entry is written before the head index. */ + smp_wmb(); + wq->head = next; + spin_unlock_irqrestore(&qp->r_rq.lock, flags); + } + ret = 0; + +bail: + return ret; +} + +/** + * ipath_qp_rcv - processing an incoming packet on a QP + * @dev: the device the packet came on + * @hdr: the packet header + * @has_grh: true if the packet has a GRH + * @data: the packet data + * @tlen: the packet length + * @qp: the QP the packet came on + * + * This is called from ipath_ib_rcv() to process an incoming packet + * for the given QP. + * Called at interrupt level. + */ +static void ipath_qp_rcv(struct ipath_ibdev *dev, + struct ipath_ib_header *hdr, int has_grh, + void *data, u32 tlen, struct ipath_qp *qp) +{ + /* Check for valid receive state. */ + if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) { + dev->n_pkt_drops++; + return; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_SMI: + case IB_QPT_GSI: + if (ib_ipath_disable_sma) + break; + /* FALLTHROUGH */ + case IB_QPT_UD: + ipath_ud_rcv(dev, hdr, has_grh, data, tlen, qp); + break; + + case IB_QPT_RC: + ipath_rc_rcv(dev, hdr, has_grh, data, tlen, qp); + break; + + case IB_QPT_UC: + ipath_uc_rcv(dev, hdr, has_grh, data, tlen, qp); + break; + + default: + break; + } +} + +/** + * ipath_ib_rcv - process an incoming packet + * @arg: the device pointer + * @rhdr: the header of the packet + * @data: the packet data + * @tlen: the packet length + * + * This is called from ipath_kreceive() to process an incoming packet at + * interrupt level. Tlen is the length of the header + data + CRC in bytes. + */ +void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data, + u32 tlen) +{ + struct ipath_ib_header *hdr = rhdr; + struct ipath_other_headers *ohdr; + struct ipath_qp *qp; + u32 qp_num; + int lnh; + u8 opcode; + u16 lid; + + if (unlikely(dev == NULL)) + goto bail; + + if (unlikely(tlen < 24)) { /* LRH+BTH+CRC */ + dev->rcv_errors++; + goto bail; + } + + /* Check for a valid destination LID (see ch. 7.11.1). */ + lid = be16_to_cpu(hdr->lrh[1]); + if (lid < IPATH_MULTICAST_LID_BASE) { + lid &= ~((1 << dev->dd->ipath_lmc) - 1); + if (unlikely(lid != dev->dd->ipath_lid)) { + dev->rcv_errors++; + goto bail; + } + } + + /* Check for GRH */ + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + if (lnh == IPATH_LRH_BTH) + ohdr = &hdr->u.oth; + else if (lnh == IPATH_LRH_GRH) + ohdr = &hdr->u.l.oth; + else { + dev->rcv_errors++; + goto bail; + } + + opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0x7f; + dev->opstats[opcode].n_bytes += tlen; + dev->opstats[opcode].n_packets++; + + /* Get the destination QP number. */ + qp_num = be32_to_cpu(ohdr->bth[1]) & IPATH_QPN_MASK; + if (qp_num == IPATH_MULTICAST_QPN) { + struct ipath_mcast *mcast; + struct ipath_mcast_qp *p; + + if (lnh != IPATH_LRH_GRH) { + dev->n_pkt_drops++; + goto bail; + } + mcast = ipath_mcast_find(&hdr->u.l.grh.dgid); + if (mcast == NULL) { + dev->n_pkt_drops++; + goto bail; + } + dev->n_multicast_rcv++; + list_for_each_entry_rcu(p, &mcast->qp_list, list) + ipath_qp_rcv(dev, hdr, 1, data, tlen, p->qp); + /* + * Notify ipath_multicast_detach() if it is waiting for us + * to finish. + */ + if (atomic_dec_return(&mcast->refcount) <= 1) + wake_up(&mcast->wait); + } else { + qp = ipath_lookup_qpn(&dev->qp_table, qp_num); + if (qp) { + dev->n_unicast_rcv++; + ipath_qp_rcv(dev, hdr, lnh == IPATH_LRH_GRH, data, + tlen, qp); + /* + * Notify ipath_destroy_qp() if it is waiting + * for us to finish. + */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } else + dev->n_pkt_drops++; + } + +bail:; +} + +/** + * ipath_ib_timer - verbs timer + * @arg: the device pointer + * + * This is called from ipath_do_rcv_timer() at interrupt level to check for + * QPs which need retransmits and to collect performance numbers. + */ +static void ipath_ib_timer(struct ipath_ibdev *dev) +{ + struct ipath_qp *resend = NULL; + struct ipath_qp *rnr = NULL; + struct list_head *last; + struct ipath_qp *qp; + unsigned long flags; + + if (dev == NULL) + return; + + spin_lock_irqsave(&dev->pending_lock, flags); + /* Start filling the next pending queue. */ + if (++dev->pending_index >= ARRAY_SIZE(dev->pending)) + dev->pending_index = 0; + /* Save any requests still in the new queue, they have timed out. */ + last = &dev->pending[dev->pending_index]; + while (!list_empty(last)) { + qp = list_entry(last->next, struct ipath_qp, timerwait); + list_del_init(&qp->timerwait); + qp->timer_next = resend; + resend = qp; + atomic_inc(&qp->refcount); + } + last = &dev->rnrwait; + if (!list_empty(last)) { + qp = list_entry(last->next, struct ipath_qp, timerwait); + if (--qp->s_rnr_timeout == 0) { + do { + list_del_init(&qp->timerwait); + qp->timer_next = rnr; + rnr = qp; + atomic_inc(&qp->refcount); + if (list_empty(last)) + break; + qp = list_entry(last->next, struct ipath_qp, + timerwait); + } while (qp->s_rnr_timeout == 0); + } + } + /* + * We should only be in the started state if pma_sample_start != 0 + */ + if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_STARTED && + --dev->pma_sample_start == 0) { + dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING; + ipath_snapshot_counters(dev->dd, &dev->ipath_sword, + &dev->ipath_rword, + &dev->ipath_spkts, + &dev->ipath_rpkts, + &dev->ipath_xmit_wait); + } + if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_RUNNING) { + if (dev->pma_sample_interval == 0) { + u64 ta, tb, tc, td, te; + + dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE; + ipath_snapshot_counters(dev->dd, &ta, &tb, + &tc, &td, &te); + + dev->ipath_sword = ta - dev->ipath_sword; + dev->ipath_rword = tb - dev->ipath_rword; + dev->ipath_spkts = tc - dev->ipath_spkts; + dev->ipath_rpkts = td - dev->ipath_rpkts; + dev->ipath_xmit_wait = te - dev->ipath_xmit_wait; + } + else + dev->pma_sample_interval--; + } + spin_unlock_irqrestore(&dev->pending_lock, flags); + + /* XXX What if timer fires again while this is running? */ + while (resend != NULL) { + qp = resend; + resend = qp->timer_next; + + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_last != qp->s_tail && + ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) { + dev->n_timeouts++; + ipath_restart_rc(qp, qp->s_last_psn + 1); + } + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Notify ipath_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } + while (rnr != NULL) { + qp = rnr; + rnr = qp->timer_next; + + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) + ipath_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Notify ipath_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } +} + +static void update_sge(struct ipath_sge_state *ss, u32 length) +{ + struct ipath_sge *sge = &ss->sge; + + sge->vaddr += length; + sge->length -= length; + sge->sge_length -= length; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr != NULL) { + if (++sge->n >= IPATH_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + return; + sge->n = 0; + } + sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = sge->mr->map[sge->m]->segs[sge->n].length; + } +} + +#ifdef __LITTLE_ENDIAN +static inline u32 get_upper_bits(u32 data, u32 shift) +{ + return data >> shift; +} + +static inline u32 set_upper_bits(u32 data, u32 shift) +{ + return data << shift; +} + +static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off) +{ + data <<= ((sizeof(u32) - n) * BITS_PER_BYTE); + data >>= ((sizeof(u32) - n - off) * BITS_PER_BYTE); + return data; +} +#else +static inline u32 get_upper_bits(u32 data, u32 shift) +{ + return data << shift; +} + +static inline u32 set_upper_bits(u32 data, u32 shift) +{ + return data >> shift; +} + +static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off) +{ + data >>= ((sizeof(u32) - n) * BITS_PER_BYTE); + data <<= ((sizeof(u32) - n - off) * BITS_PER_BYTE); + return data; +} +#endif + +static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss, + u32 length, unsigned flush_wc) +{ + u32 extra = 0; + u32 data = 0; + u32 last; + + while (1) { + u32 len = ss->sge.length; + u32 off; + + if (len > length) + len = length; + if (len > ss->sge.sge_length) + len = ss->sge.sge_length; + BUG_ON(len == 0); + /* If the source address is not aligned, try to align it. */ + off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1); + if (off) { + u32 *addr = (u32 *)((unsigned long)ss->sge.vaddr & + ~(sizeof(u32) - 1)); + u32 v = get_upper_bits(*addr, off * BITS_PER_BYTE); + u32 y; + + y = sizeof(u32) - off; + if (len > y) + len = y; + if (len + extra >= sizeof(u32)) { + data |= set_upper_bits(v, extra * + BITS_PER_BYTE); + len = sizeof(u32) - extra; + if (len == length) { + last = data; + break; + } + __raw_writel(data, piobuf); + piobuf++; + extra = 0; + data = 0; + } else { + /* Clear unused upper bytes */ + data |= clear_upper_bytes(v, len, extra); + if (len == length) { + last = data; + break; + } + extra += len; + } + } else if (extra) { + /* Source address is aligned. */ + u32 *addr = (u32 *) ss->sge.vaddr; + int shift = extra * BITS_PER_BYTE; + int ushift = 32 - shift; + u32 l = len; + + while (l >= sizeof(u32)) { + u32 v = *addr; + + data |= set_upper_bits(v, shift); + __raw_writel(data, piobuf); + data = get_upper_bits(v, ushift); + piobuf++; + addr++; + l -= sizeof(u32); + } + /* + * We still have 'extra' number of bytes leftover. + */ + if (l) { + u32 v = *addr; + + if (l + extra >= sizeof(u32)) { + data |= set_upper_bits(v, shift); + len -= l + extra - sizeof(u32); + if (len == length) { + last = data; + break; + } + __raw_writel(data, piobuf); + piobuf++; + extra = 0; + data = 0; + } else { + /* Clear unused upper bytes */ + data |= clear_upper_bytes(v, l, + extra); + if (len == length) { + last = data; + break; + } + extra += l; + } + } else if (len == length) { + last = data; + break; + } + } else if (len == length) { + u32 w; + + /* + * Need to round up for the last dword in the + * packet. + */ + w = (len + 3) >> 2; + __iowrite32_copy(piobuf, ss->sge.vaddr, w - 1); + piobuf += w - 1; + last = ((u32 *) ss->sge.vaddr)[w - 1]; + break; + } else { + u32 w = len >> 2; + + __iowrite32_copy(piobuf, ss->sge.vaddr, w); + piobuf += w; + + extra = len & (sizeof(u32) - 1); + if (extra) { + u32 v = ((u32 *) ss->sge.vaddr)[w]; + + /* Clear unused upper bytes */ + data = clear_upper_bytes(v, extra, 0); + } + } + update_sge(ss, len); + length -= len; + } + /* Update address before sending packet. */ + update_sge(ss, length); + if (flush_wc) { + /* must flush early everything before trigger word */ + ipath_flush_wc(); + __raw_writel(last, piobuf); + /* be sure trigger word is written */ + ipath_flush_wc(); + } else + __raw_writel(last, piobuf); +} + +/* + * Convert IB rate to delay multiplier. + */ +unsigned ipath_ib_rate_to_mult(enum ib_rate rate) +{ + switch (rate) { + case IB_RATE_2_5_GBPS: return 8; + case IB_RATE_5_GBPS: return 4; + case IB_RATE_10_GBPS: return 2; + case IB_RATE_20_GBPS: return 1; + default: return 0; + } +} + +/* + * Convert delay multiplier to IB rate + */ +static enum ib_rate ipath_mult_to_ib_rate(unsigned mult) +{ + switch (mult) { + case 8: return IB_RATE_2_5_GBPS; + case 4: return IB_RATE_5_GBPS; + case 2: return IB_RATE_10_GBPS; + case 1: return IB_RATE_20_GBPS; + default: return IB_RATE_PORT_CURRENT; + } +} + +static inline struct ipath_verbs_txreq *get_txreq(struct ipath_ibdev *dev) +{ + struct ipath_verbs_txreq *tx = NULL; + unsigned long flags; + + spin_lock_irqsave(&dev->pending_lock, flags); + if (!list_empty(&dev->txreq_free)) { + struct list_head *l = dev->txreq_free.next; + + list_del(l); + tx = list_entry(l, struct ipath_verbs_txreq, txreq.list); + } + spin_unlock_irqrestore(&dev->pending_lock, flags); + return tx; +} + +static inline void put_txreq(struct ipath_ibdev *dev, + struct ipath_verbs_txreq *tx) +{ + unsigned long flags; + + spin_lock_irqsave(&dev->pending_lock, flags); + list_add(&tx->txreq.list, &dev->txreq_free); + spin_unlock_irqrestore(&dev->pending_lock, flags); +} + +static void sdma_complete(void *cookie, int status) +{ + struct ipath_verbs_txreq *tx = cookie; + struct ipath_qp *qp = tx->qp; + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + unsigned long flags; + enum ib_wc_status ibs = status == IPATH_SDMA_TXREQ_S_OK ? + IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR; + + if (atomic_dec_and_test(&qp->s_dma_busy)) { + spin_lock_irqsave(&qp->s_lock, flags); + if (tx->wqe) + ipath_send_complete(qp, tx->wqe, ibs); + if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND && + qp->s_last != qp->s_head) || + (qp->s_flags & IPATH_S_WAIT_DMA)) + ipath_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + wake_up(&qp->wait_dma); + } else if (tx->wqe) { + spin_lock_irqsave(&qp->s_lock, flags); + ipath_send_complete(qp, tx->wqe, ibs); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + + if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF) + kfree(tx->txreq.map_addr); + put_txreq(dev, tx); + + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +} + +static void decrement_dma_busy(struct ipath_qp *qp) +{ + unsigned long flags; + + if (atomic_dec_and_test(&qp->s_dma_busy)) { + spin_lock_irqsave(&qp->s_lock, flags); + if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND && + qp->s_last != qp->s_head) || + (qp->s_flags & IPATH_S_WAIT_DMA)) + ipath_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + wake_up(&qp->wait_dma); + } +} + +/* + * Compute the number of clock cycles of delay before sending the next packet. + * The multipliers reflect the number of clocks for the fastest rate so + * one tick at 4xDDR is 8 ticks at 1xSDR. + * If the destination port will take longer to receive a packet than + * the outgoing link can send it, we need to delay sending the next packet + * by the difference in time it takes the receiver to receive and the sender + * to send this packet. + * Note that this delay is always correct for UC and RC but not always + * optimal for UD. For UD, the destination HCA can be different for each + * packet, in which case, we could send packets to a different destination + * while "waiting" for the delay. The overhead for doing this without + * HW support is more than just paying the cost of delaying some packets + * unnecessarily. + */ +static inline unsigned ipath_pkt_delay(u32 plen, u8 snd_mult, u8 rcv_mult) +{ + return (rcv_mult > snd_mult) ? + (plen * (rcv_mult - snd_mult) + 1) >> 1 : 0; +} + +static int ipath_verbs_send_dma(struct ipath_qp *qp, + struct ipath_ib_header *hdr, u32 hdrwords, + struct ipath_sge_state *ss, u32 len, + u32 plen, u32 dwords) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + struct ipath_devdata *dd = dev->dd; + struct ipath_verbs_txreq *tx; + u32 *piobuf; + u32 control; + u32 ndesc; + int ret; + + tx = qp->s_tx; + if (tx) { + qp->s_tx = NULL; + /* resend previously constructed packet */ + atomic_inc(&qp->s_dma_busy); + ret = ipath_sdma_verbs_send(dd, tx->ss, tx->len, tx); + if (ret) { + qp->s_tx = tx; + decrement_dma_busy(qp); + } + goto bail; + } + + tx = get_txreq(dev); + if (!tx) { + ret = -EBUSY; + goto bail; + } + + /* + * Get the saved delay count we computed for the previous packet + * and save the delay count for this packet to be used next time + * we get here. + */ + control = qp->s_pkt_delay; + qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult); + + tx->qp = qp; + atomic_inc(&qp->refcount); + tx->wqe = qp->s_wqe; + tx->txreq.callback = sdma_complete; + tx->txreq.callback_cookie = tx; + tx->txreq.flags = IPATH_SDMA_TXREQ_F_HEADTOHOST | + IPATH_SDMA_TXREQ_F_INTREQ | IPATH_SDMA_TXREQ_F_FREEDESC; + if (plen + 1 >= IPATH_SMALLBUF_DWORDS) + tx->txreq.flags |= IPATH_SDMA_TXREQ_F_USELARGEBUF; + + /* VL15 packets bypass credit check */ + if ((be16_to_cpu(hdr->lrh[0]) >> 12) == 15) { + control |= 1ULL << 31; + tx->txreq.flags |= IPATH_SDMA_TXREQ_F_VL15; + } + + if (len) { + /* + * Don't try to DMA if it takes more descriptors than + * the queue holds. + */ + ndesc = ipath_count_sge(ss, len); + if (ndesc >= dd->ipath_sdma_descq_cnt) + ndesc = 0; + } else + ndesc = 1; + if (ndesc) { + tx->hdr.pbc[0] = cpu_to_le32(plen); + tx->hdr.pbc[1] = cpu_to_le32(control); + memcpy(&tx->hdr.hdr, hdr, hdrwords << 2); + tx->txreq.sg_count = ndesc; + tx->map_len = (hdrwords + 2) << 2; + tx->txreq.map_addr = &tx->hdr; + atomic_inc(&qp->s_dma_busy); + ret = ipath_sdma_verbs_send(dd, ss, dwords, tx); + if (ret) { + /* save ss and length in dwords */ + tx->ss = ss; + tx->len = dwords; + qp->s_tx = tx; + decrement_dma_busy(qp); + } + goto bail; + } + + /* Allocate a buffer and copy the header and payload to it. */ + tx->map_len = (plen + 1) << 2; + piobuf = kmalloc(tx->map_len, GFP_ATOMIC); + if (unlikely(piobuf == NULL)) { + ret = -EBUSY; + goto err_tx; + } + tx->txreq.map_addr = piobuf; + tx->txreq.flags |= IPATH_SDMA_TXREQ_F_FREEBUF; + tx->txreq.sg_count = 1; + + *piobuf++ = (__force u32) cpu_to_le32(plen); + *piobuf++ = (__force u32) cpu_to_le32(control); + memcpy(piobuf, hdr, hdrwords << 2); + ipath_copy_from_sge(piobuf + hdrwords, ss, len); + + atomic_inc(&qp->s_dma_busy); + ret = ipath_sdma_verbs_send(dd, NULL, 0, tx); + /* + * If we couldn't queue the DMA request, save the info + * and try again later rather than destroying the + * buffer and undoing the side effects of the copy. + */ + if (ret) { + tx->ss = NULL; + tx->len = 0; + qp->s_tx = tx; + decrement_dma_busy(qp); + } + dev->n_unaligned++; + goto bail; + +err_tx: + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + put_txreq(dev, tx); +bail: + return ret; +} + +static int ipath_verbs_send_pio(struct ipath_qp *qp, + struct ipath_ib_header *ibhdr, u32 hdrwords, + struct ipath_sge_state *ss, u32 len, + u32 plen, u32 dwords) +{ + struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd; + u32 *hdr = (u32 *) ibhdr; + u32 __iomem *piobuf; + unsigned flush_wc; + u32 control; + int ret; + unsigned long flags; + + piobuf = ipath_getpiobuf(dd, plen, NULL); + if (unlikely(piobuf == NULL)) { + ret = -EBUSY; + goto bail; + } + + /* + * Get the saved delay count we computed for the previous packet + * and save the delay count for this packet to be used next time + * we get here. + */ + control = qp->s_pkt_delay; + qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult); + + /* VL15 packets bypass credit check */ + if ((be16_to_cpu(ibhdr->lrh[0]) >> 12) == 15) + control |= 1ULL << 31; + + /* + * Write the length to the control qword plus any needed flags. + * We have to flush after the PBC for correctness on some cpus + * or WC buffer can be written out of order. + */ + writeq(((u64) control << 32) | plen, piobuf); + piobuf += 2; + + flush_wc = dd->ipath_flags & IPATH_PIO_FLUSH_WC; + if (len == 0) { + /* + * If there is just the header portion, must flush before + * writing last word of header for correctness, and after + * the last header word (trigger word). + */ + if (flush_wc) { + ipath_flush_wc(); + __iowrite32_copy(piobuf, hdr, hdrwords - 1); + ipath_flush_wc(); + __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1); + ipath_flush_wc(); + } else + __iowrite32_copy(piobuf, hdr, hdrwords); + goto done; + } + + if (flush_wc) + ipath_flush_wc(); + __iowrite32_copy(piobuf, hdr, hdrwords); + piobuf += hdrwords; + + /* The common case is aligned and contained in one segment. */ + if (likely(ss->num_sge == 1 && len <= ss->sge.length && + !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) { + u32 *addr = (u32 *) ss->sge.vaddr; + + /* Update address before sending packet. */ + update_sge(ss, len); + if (flush_wc) { + __iowrite32_copy(piobuf, addr, dwords - 1); + /* must flush early everything before trigger word */ + ipath_flush_wc(); + __raw_writel(addr[dwords - 1], piobuf + dwords - 1); + /* be sure trigger word is written */ + ipath_flush_wc(); + } else + __iowrite32_copy(piobuf, addr, dwords); + goto done; + } + copy_io(piobuf, ss, len, flush_wc); +done: + if (qp->s_wqe) { + spin_lock_irqsave(&qp->s_lock, flags); + ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + ret = 0; +bail: + return ret; +} + +/** + * ipath_verbs_send - send a packet + * @qp: the QP to send on + * @hdr: the packet header + * @hdrwords: the number of 32-bit words in the header + * @ss: the SGE to send + * @len: the length of the packet in bytes + */ +int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr, + u32 hdrwords, struct ipath_sge_state *ss, u32 len) +{ + struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd; + u32 plen; + int ret; + u32 dwords = (len + 3) >> 2; + + /* + * Calculate the send buffer trigger address. + * The +1 counts for the pbc control dword following the pbc length. + */ + plen = hdrwords + dwords + 1; + + /* + * VL15 packets (IB_QPT_SMI) will always use PIO, so we + * can defer SDMA restart until link goes ACTIVE without + * worrying about just how we got there. + */ + if (qp->ibqp.qp_type == IB_QPT_SMI || + !(dd->ipath_flags & IPATH_HAS_SEND_DMA)) + ret = ipath_verbs_send_pio(qp, hdr, hdrwords, ss, len, + plen, dwords); + else + ret = ipath_verbs_send_dma(qp, hdr, hdrwords, ss, len, + plen, dwords); + + return ret; +} + +int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords, + u64 *rwords, u64 *spkts, u64 *rpkts, + u64 *xmit_wait) +{ + int ret; + + if (!(dd->ipath_flags & IPATH_INITTED)) { + /* no hardware, freeze, etc. */ + ret = -EINVAL; + goto bail; + } + *swords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt); + *rwords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt); + *spkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt); + *rpkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt); + *xmit_wait = ipath_snap_cntr(dd, dd->ipath_cregs->cr_sendstallcnt); + + ret = 0; + +bail: + return ret; +} + +/** + * ipath_get_counters - get various chip counters + * @dd: the infinipath device + * @cntrs: counters are placed here + * + * Return the counters needed by recv_pma_get_portcounters(). + */ +int ipath_get_counters(struct ipath_devdata *dd, + struct ipath_verbs_counters *cntrs) +{ + struct ipath_cregs const *crp = dd->ipath_cregs; + int ret; + + if (!(dd->ipath_flags & IPATH_INITTED)) { + /* no hardware, freeze, etc. */ + ret = -EINVAL; + goto bail; + } + cntrs->symbol_error_counter = + ipath_snap_cntr(dd, crp->cr_ibsymbolerrcnt); + cntrs->link_error_recovery_counter = + ipath_snap_cntr(dd, crp->cr_iblinkerrrecovcnt); + /* + * The link downed counter counts when the other side downs the + * connection. We add in the number of times we downed the link + * due to local link integrity errors to compensate. + */ + cntrs->link_downed_counter = + ipath_snap_cntr(dd, crp->cr_iblinkdowncnt); + cntrs->port_rcv_errors = + ipath_snap_cntr(dd, crp->cr_rxdroppktcnt) + + ipath_snap_cntr(dd, crp->cr_rcvovflcnt) + + ipath_snap_cntr(dd, crp->cr_portovflcnt) + + ipath_snap_cntr(dd, crp->cr_err_rlencnt) + + ipath_snap_cntr(dd, crp->cr_invalidrlencnt) + + ipath_snap_cntr(dd, crp->cr_errlinkcnt) + + ipath_snap_cntr(dd, crp->cr_erricrccnt) + + ipath_snap_cntr(dd, crp->cr_errvcrccnt) + + ipath_snap_cntr(dd, crp->cr_errlpcrccnt) + + ipath_snap_cntr(dd, crp->cr_badformatcnt) + + dd->ipath_rxfc_unsupvl_errs; + if (crp->cr_rxotherlocalphyerrcnt) + cntrs->port_rcv_errors += + ipath_snap_cntr(dd, crp->cr_rxotherlocalphyerrcnt); + if (crp->cr_rxvlerrcnt) + cntrs->port_rcv_errors += + ipath_snap_cntr(dd, crp->cr_rxvlerrcnt); + cntrs->port_rcv_remphys_errors = + ipath_snap_cntr(dd, crp->cr_rcvebpcnt); + cntrs->port_xmit_discards = ipath_snap_cntr(dd, crp->cr_unsupvlcnt); + cntrs->port_xmit_data = ipath_snap_cntr(dd, crp->cr_wordsendcnt); + cntrs->port_rcv_data = ipath_snap_cntr(dd, crp->cr_wordrcvcnt); + cntrs->port_xmit_packets = ipath_snap_cntr(dd, crp->cr_pktsendcnt); + cntrs->port_rcv_packets = ipath_snap_cntr(dd, crp->cr_pktrcvcnt); + cntrs->local_link_integrity_errors = + crp->cr_locallinkintegrityerrcnt ? + ipath_snap_cntr(dd, crp->cr_locallinkintegrityerrcnt) : + ((dd->ipath_flags & IPATH_GPIO_ERRINTRS) ? + dd->ipath_lli_errs : dd->ipath_lli_errors); + cntrs->excessive_buffer_overrun_errors = + crp->cr_excessbufferovflcnt ? + ipath_snap_cntr(dd, crp->cr_excessbufferovflcnt) : + dd->ipath_overrun_thresh_errs; + cntrs->vl15_dropped = crp->cr_vl15droppedpktcnt ? + ipath_snap_cntr(dd, crp->cr_vl15droppedpktcnt) : 0; + + ret = 0; + +bail: + return ret; +} + +/** + * ipath_ib_piobufavail - callback when a PIO buffer is available + * @arg: the device pointer + * + * This is called from ipath_intr() at interrupt level when a PIO buffer is + * available after ipath_verbs_send() returned an error that no buffers were + * available. Return 1 if we consumed all the PIO buffers and we still have + * QPs waiting for buffers (for now, just restart the send tasklet and + * return zero). + */ +int ipath_ib_piobufavail(struct ipath_ibdev *dev) +{ + struct list_head *list; + struct ipath_qp *qplist; + struct ipath_qp *qp; + unsigned long flags; + + if (dev == NULL) + goto bail; + + list = &dev->piowait; + qplist = NULL; + + spin_lock_irqsave(&dev->pending_lock, flags); + while (!list_empty(list)) { + qp = list_entry(list->next, struct ipath_qp, piowait); + list_del_init(&qp->piowait); + qp->pio_next = qplist; + qplist = qp; + atomic_inc(&qp->refcount); + } + spin_unlock_irqrestore(&dev->pending_lock, flags); + + while (qplist != NULL) { + qp = qplist; + qplist = qp->pio_next; + + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) + ipath_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + + /* Notify ipath_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } + +bail: + return 0; +} + +static int ipath_query_device(struct ib_device *ibdev, struct ib_device_attr *props, + struct ib_udata *uhw) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + + if (uhw->inlen || uhw->outlen) + return -EINVAL; + + memset(props, 0, sizeof(*props)); + + props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR | + IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT | + IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE; + props->page_size_cap = PAGE_SIZE; + props->vendor_id = + IPATH_SRC_OUI_1 << 16 | IPATH_SRC_OUI_2 << 8 | IPATH_SRC_OUI_3; + props->vendor_part_id = dev->dd->ipath_deviceid; + props->hw_ver = dev->dd->ipath_pcirev; + + props->sys_image_guid = dev->sys_image_guid; + + props->max_mr_size = ~0ull; + props->max_qp = ib_ipath_max_qps; + props->max_qp_wr = ib_ipath_max_qp_wrs; + props->max_sge = ib_ipath_max_sges; + props->max_sge_rd = ib_ipath_max_sges; + props->max_cq = ib_ipath_max_cqs; + props->max_ah = ib_ipath_max_ahs; + props->max_cqe = ib_ipath_max_cqes; + props->max_mr = dev->lk_table.max; + props->max_fmr = dev->lk_table.max; + props->max_map_per_fmr = 32767; + props->max_pd = ib_ipath_max_pds; + props->max_qp_rd_atom = IPATH_MAX_RDMA_ATOMIC; + props->max_qp_init_rd_atom = 255; + /* props->max_res_rd_atom */ + props->max_srq = ib_ipath_max_srqs; + props->max_srq_wr = ib_ipath_max_srq_wrs; + props->max_srq_sge = ib_ipath_max_srq_sges; + /* props->local_ca_ack_delay */ + props->atomic_cap = IB_ATOMIC_GLOB; + props->max_pkeys = ipath_get_npkeys(dev->dd); + props->max_mcast_grp = ib_ipath_max_mcast_grps; + props->max_mcast_qp_attach = ib_ipath_max_mcast_qp_attached; + props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * + props->max_mcast_grp; + + return 0; +} + +const u8 ipath_cvt_physportstate[32] = { + [INFINIPATH_IBCS_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED, + [INFINIPATH_IBCS_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP, + [INFINIPATH_IBCS_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL, + [INFINIPATH_IBCS_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL, + [INFINIPATH_IBCS_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP, + [INFINIPATH_IBCS_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP, + [INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [INFINIPATH_IBCS_LT_STATE_CFGWAITRMT] = + IB_PHYSPORTSTATE_CFG_TRAIN, + [INFINIPATH_IBCS_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN, + [INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [INFINIPATH_IBCS_LT_STATE_RECOVERIDLE] = + IB_PHYSPORTSTATE_LINK_ERR_RECOVER, + [0x10] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x11] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x12] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x13] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x14] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x15] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x16] = IB_PHYSPORTSTATE_CFG_TRAIN, + [0x17] = IB_PHYSPORTSTATE_CFG_TRAIN +}; + +u32 ipath_get_cr_errpkey(struct ipath_devdata *dd) +{ + return ipath_read_creg32(dd, dd->ipath_cregs->cr_errpkey); +} + +static int ipath_query_port(struct ib_device *ibdev, + u8 port, struct ib_port_attr *props) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_devdata *dd = dev->dd; + enum ib_mtu mtu; + u16 lid = dd->ipath_lid; + u64 ibcstat; + + memset(props, 0, sizeof(*props)); + props->lid = lid ? lid : be16_to_cpu(IB_LID_PERMISSIVE); + props->lmc = dd->ipath_lmc; + props->sm_lid = dev->sm_lid; + props->sm_sl = dev->sm_sl; + ibcstat = dd->ipath_lastibcstat; + /* map LinkState to IB portinfo values. */ + props->state = ipath_ib_linkstate(dd, ibcstat) + 1; + + /* See phys_state_show() */ + props->phys_state = /* MEA: assumes shift == 0 */ + ipath_cvt_physportstate[dd->ipath_lastibcstat & + dd->ibcs_lts_mask]; + props->port_cap_flags = dev->port_cap_flags; + props->gid_tbl_len = 1; + props->max_msg_sz = 0x80000000; + props->pkey_tbl_len = ipath_get_npkeys(dd); + props->bad_pkey_cntr = ipath_get_cr_errpkey(dd) - + dev->z_pkey_violations; + props->qkey_viol_cntr = dev->qkey_violations; + props->active_width = dd->ipath_link_width_active; + /* See rate_show() */ + props->active_speed = dd->ipath_link_speed_active; + props->max_vl_num = 1; /* VLCap = VL0 */ + props->init_type_reply = 0; + + props->max_mtu = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048; + switch (dd->ipath_ibmtu) { + case 4096: + mtu = IB_MTU_4096; + break; + case 2048: + mtu = IB_MTU_2048; + break; + case 1024: + mtu = IB_MTU_1024; + break; + case 512: + mtu = IB_MTU_512; + break; + case 256: + mtu = IB_MTU_256; + break; + default: + mtu = IB_MTU_2048; + } + props->active_mtu = mtu; + props->subnet_timeout = dev->subnet_timeout; + + return 0; +} + +static int ipath_modify_device(struct ib_device *device, + int device_modify_mask, + struct ib_device_modify *device_modify) +{ + int ret; + + if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID | + IB_DEVICE_MODIFY_NODE_DESC)) { + ret = -EOPNOTSUPP; + goto bail; + } + + if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) + memcpy(device->node_desc, device_modify->node_desc, 64); + + if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) + to_idev(device)->sys_image_guid = + cpu_to_be64(device_modify->sys_image_guid); + + ret = 0; + +bail: + return ret; +} + +static int ipath_modify_port(struct ib_device *ibdev, + u8 port, int port_modify_mask, + struct ib_port_modify *props) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + + dev->port_cap_flags |= props->set_port_cap_mask; + dev->port_cap_flags &= ~props->clr_port_cap_mask; + if (port_modify_mask & IB_PORT_SHUTDOWN) + ipath_set_linkstate(dev->dd, IPATH_IB_LINKDOWN); + if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR) + dev->qkey_violations = 0; + return 0; +} + +static int ipath_query_gid(struct ib_device *ibdev, u8 port, + int index, union ib_gid *gid) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + int ret; + + if (index >= 1) { + ret = -EINVAL; + goto bail; + } + gid->global.subnet_prefix = dev->gid_prefix; + gid->global.interface_id = dev->dd->ipath_guid; + + ret = 0; + +bail: + return ret; +} + +static struct ib_pd *ipath_alloc_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + struct ipath_pd *pd; + struct ib_pd *ret; + + /* + * This is actually totally arbitrary. Some correctness tests + * assume there's a maximum number of PDs that can be allocated. + * We don't actually have this limit, but we fail the test if + * we allow allocations of more than we report for this value. + */ + + pd = kmalloc(sizeof *pd, GFP_KERNEL); + if (!pd) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + spin_lock(&dev->n_pds_lock); + if (dev->n_pds_allocated == ib_ipath_max_pds) { + spin_unlock(&dev->n_pds_lock); + kfree(pd); + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + dev->n_pds_allocated++; + spin_unlock(&dev->n_pds_lock); + + /* ib_alloc_pd() will initialize pd->ibpd. */ + pd->user = udata != NULL; + + ret = &pd->ibpd; + +bail: + return ret; +} + +static int ipath_dealloc_pd(struct ib_pd *ibpd) +{ + struct ipath_pd *pd = to_ipd(ibpd); + struct ipath_ibdev *dev = to_idev(ibpd->device); + + spin_lock(&dev->n_pds_lock); + dev->n_pds_allocated--; + spin_unlock(&dev->n_pds_lock); + + kfree(pd); + + return 0; +} + +/** + * ipath_create_ah - create an address handle + * @pd: the protection domain + * @ah_attr: the attributes of the AH + * + * This may be called from interrupt context. + */ +static struct ib_ah *ipath_create_ah(struct ib_pd *pd, + struct ib_ah_attr *ah_attr) +{ + struct ipath_ah *ah; + struct ib_ah *ret; + struct ipath_ibdev *dev = to_idev(pd->device); + unsigned long flags; + + /* A multicast address requires a GRH (see ch. 8.4.1). */ + if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE && + ah_attr->dlid != IPATH_PERMISSIVE_LID && + !(ah_attr->ah_flags & IB_AH_GRH)) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + if (ah_attr->dlid == 0) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + if (ah_attr->port_num < 1 || + ah_attr->port_num > pd->device->phys_port_cnt) { + ret = ERR_PTR(-EINVAL); + goto bail; + } + + ah = kmalloc(sizeof *ah, GFP_ATOMIC); + if (!ah) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + spin_lock_irqsave(&dev->n_ahs_lock, flags); + if (dev->n_ahs_allocated == ib_ipath_max_ahs) { + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + kfree(ah); + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + dev->n_ahs_allocated++; + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + + /* ib_create_ah() will initialize ah->ibah. */ + ah->attr = *ah_attr; + ah->attr.static_rate = ipath_ib_rate_to_mult(ah_attr->static_rate); + + ret = &ah->ibah; + +bail: + return ret; +} + +/** + * ipath_destroy_ah - destroy an address handle + * @ibah: the AH to destroy + * + * This may be called from interrupt context. + */ +static int ipath_destroy_ah(struct ib_ah *ibah) +{ + struct ipath_ibdev *dev = to_idev(ibah->device); + struct ipath_ah *ah = to_iah(ibah); + unsigned long flags; + + spin_lock_irqsave(&dev->n_ahs_lock, flags); + dev->n_ahs_allocated--; + spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + + kfree(ah); + + return 0; +} + +static int ipath_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) +{ + struct ipath_ah *ah = to_iah(ibah); + + *ah_attr = ah->attr; + ah_attr->static_rate = ipath_mult_to_ib_rate(ah->attr.static_rate); + + return 0; +} + +/** + * ipath_get_npkeys - return the size of the PKEY table for port 0 + * @dd: the infinipath device + */ +unsigned ipath_get_npkeys(struct ipath_devdata *dd) +{ + return ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys); +} + +/** + * ipath_get_pkey - return the indexed PKEY from the port PKEY table + * @dd: the infinipath device + * @index: the PKEY index + */ +unsigned ipath_get_pkey(struct ipath_devdata *dd, unsigned index) +{ + unsigned ret; + + /* always a kernel port, no locking needed */ + if (index >= ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys)) + ret = 0; + else + ret = dd->ipath_pd[0]->port_pkeys[index]; + + return ret; +} + +static int ipath_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ + struct ipath_ibdev *dev = to_idev(ibdev); + int ret; + + if (index >= ipath_get_npkeys(dev->dd)) { + ret = -EINVAL; + goto bail; + } + + *pkey = ipath_get_pkey(dev->dd, index); + ret = 0; + +bail: + return ret; +} + +/** + * ipath_alloc_ucontext - allocate a ucontest + * @ibdev: the infiniband device + * @udata: not used by the InfiniPath driver + */ + +static struct ib_ucontext *ipath_alloc_ucontext(struct ib_device *ibdev, + struct ib_udata *udata) +{ + struct ipath_ucontext *context; + struct ib_ucontext *ret; + + context = kmalloc(sizeof *context, GFP_KERNEL); + if (!context) { + ret = ERR_PTR(-ENOMEM); + goto bail; + } + + ret = &context->ibucontext; + +bail: + return ret; +} + +static int ipath_dealloc_ucontext(struct ib_ucontext *context) +{ + kfree(to_iucontext(context)); + return 0; +} + +static int ipath_verbs_register_sysfs(struct ib_device *dev); + +static void __verbs_timer(unsigned long arg) +{ + struct ipath_devdata *dd = (struct ipath_devdata *) arg; + + /* Handle verbs layer timeouts. */ + ipath_ib_timer(dd->verbs_dev); + + mod_timer(&dd->verbs_timer, jiffies + 1); +} + +static int enable_timer(struct ipath_devdata *dd) +{ + /* + * Early chips had a design flaw where the chip and kernel idea + * of the tail register don't always agree, and therefore we won't + * get an interrupt on the next packet received. + * If the board supports per packet receive interrupts, use it. + * Otherwise, the timer function periodically checks for packets + * to cover this case. + * Either way, the timer is needed for verbs layer related + * processing. + */ + if (dd->ipath_flags & IPATH_GPIO_INTR) { + ipath_write_kreg(dd, dd->ipath_kregs->kr_debugportselect, + 0x2074076542310ULL); + /* Enable GPIO bit 2 interrupt */ + dd->ipath_gpio_mask |= (u64) (1 << IPATH_GPIO_PORT0_BIT); + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, + dd->ipath_gpio_mask); + } + + init_timer(&dd->verbs_timer); + dd->verbs_timer.function = __verbs_timer; + dd->verbs_timer.data = (unsigned long)dd; + dd->verbs_timer.expires = jiffies + 1; + add_timer(&dd->verbs_timer); + + return 0; +} + +static int disable_timer(struct ipath_devdata *dd) +{ + /* Disable GPIO bit 2 interrupt */ + if (dd->ipath_flags & IPATH_GPIO_INTR) { + /* Disable GPIO bit 2 interrupt */ + dd->ipath_gpio_mask &= ~((u64) (1 << IPATH_GPIO_PORT0_BIT)); + ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask, + dd->ipath_gpio_mask); + /* + * We might want to undo changes to debugportselect, + * but how? + */ + } + + del_timer_sync(&dd->verbs_timer); + + return 0; +} + +static int ipath_port_immutable(struct ib_device *ibdev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct ib_port_attr attr; + int err; + + err = ipath_query_port(ibdev, port_num, &attr); + if (err) + return err; + + immutable->pkey_tbl_len = attr.pkey_tbl_len; + immutable->gid_tbl_len = attr.gid_tbl_len; + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB; + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + + return 0; +} + +/** + * ipath_register_ib_device - register our device with the infiniband core + * @dd: the device data structure + * Return the allocated ipath_ibdev pointer or NULL on error. + */ +int ipath_register_ib_device(struct ipath_devdata *dd) +{ + struct ipath_verbs_counters cntrs; + struct ipath_ibdev *idev; + struct ib_device *dev; + struct ipath_verbs_txreq *tx; + unsigned i; + int ret; + + idev = (struct ipath_ibdev *)ib_alloc_device(sizeof *idev); + if (idev == NULL) { + ret = -ENOMEM; + goto bail; + } + + dev = &idev->ibdev; + + if (dd->ipath_sdma_descq_cnt) { + tx = kmalloc(dd->ipath_sdma_descq_cnt * sizeof *tx, + GFP_KERNEL); + if (tx == NULL) { + ret = -ENOMEM; + goto err_tx; + } + } else + tx = NULL; + idev->txreq_bufs = tx; + + /* Only need to initialize non-zero fields. */ + spin_lock_init(&idev->n_pds_lock); + spin_lock_init(&idev->n_ahs_lock); + spin_lock_init(&idev->n_cqs_lock); + spin_lock_init(&idev->n_qps_lock); + spin_lock_init(&idev->n_srqs_lock); + spin_lock_init(&idev->n_mcast_grps_lock); + + spin_lock_init(&idev->qp_table.lock); + spin_lock_init(&idev->lk_table.lock); + idev->sm_lid = be16_to_cpu(IB_LID_PERMISSIVE); + /* Set the prefix to the default value (see ch. 4.1.1) */ + idev->gid_prefix = cpu_to_be64(0xfe80000000000000ULL); + + ret = ipath_init_qp_table(idev, ib_ipath_qp_table_size); + if (ret) + goto err_qp; + + /* + * The top ib_ipath_lkey_table_size bits are used to index the + * table. The lower 8 bits can be owned by the user (copied from + * the LKEY). The remaining bits act as a generation number or tag. + */ + idev->lk_table.max = 1 << ib_ipath_lkey_table_size; + idev->lk_table.table = kzalloc(idev->lk_table.max * + sizeof(*idev->lk_table.table), + GFP_KERNEL); + if (idev->lk_table.table == NULL) { + ret = -ENOMEM; + goto err_lk; + } + INIT_LIST_HEAD(&idev->pending_mmaps); + spin_lock_init(&idev->pending_lock); + idev->mmap_offset = PAGE_SIZE; + spin_lock_init(&idev->mmap_offset_lock); + INIT_LIST_HEAD(&idev->pending[0]); + INIT_LIST_HEAD(&idev->pending[1]); + INIT_LIST_HEAD(&idev->pending[2]); + INIT_LIST_HEAD(&idev->piowait); + INIT_LIST_HEAD(&idev->rnrwait); + INIT_LIST_HEAD(&idev->txreq_free); + idev->pending_index = 0; + idev->port_cap_flags = + IB_PORT_SYS_IMAGE_GUID_SUP | IB_PORT_CLIENT_REG_SUP; + if (dd->ipath_flags & IPATH_HAS_LINK_LATENCY) + idev->port_cap_flags |= IB_PORT_LINK_LATENCY_SUP; + idev->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA; + idev->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA; + idev->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS; + idev->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS; + idev->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT; + + /* Snapshot current HW counters to "clear" them. */ + ipath_get_counters(dd, &cntrs); + idev->z_symbol_error_counter = cntrs.symbol_error_counter; + idev->z_link_error_recovery_counter = + cntrs.link_error_recovery_counter; + idev->z_link_downed_counter = cntrs.link_downed_counter; + idev->z_port_rcv_errors = cntrs.port_rcv_errors; + idev->z_port_rcv_remphys_errors = + cntrs.port_rcv_remphys_errors; + idev->z_port_xmit_discards = cntrs.port_xmit_discards; + idev->z_port_xmit_data = cntrs.port_xmit_data; + idev->z_port_rcv_data = cntrs.port_rcv_data; + idev->z_port_xmit_packets = cntrs.port_xmit_packets; + idev->z_port_rcv_packets = cntrs.port_rcv_packets; + idev->z_local_link_integrity_errors = + cntrs.local_link_integrity_errors; + idev->z_excessive_buffer_overrun_errors = + cntrs.excessive_buffer_overrun_errors; + idev->z_vl15_dropped = cntrs.vl15_dropped; + + for (i = 0; i < dd->ipath_sdma_descq_cnt; i++, tx++) + list_add(&tx->txreq.list, &idev->txreq_free); + + /* + * The system image GUID is supposed to be the same for all + * IB HCAs in a single system but since there can be other + * device types in the system, we can't be sure this is unique. + */ + if (!sys_image_guid) + sys_image_guid = dd->ipath_guid; + idev->sys_image_guid = sys_image_guid; + idev->ib_unit = dd->ipath_unit; + idev->dd = dd; + + strlcpy(dev->name, "ipath%d", IB_DEVICE_NAME_MAX); + dev->owner = THIS_MODULE; + dev->node_guid = dd->ipath_guid; + dev->uverbs_abi_ver = IPATH_UVERBS_ABI_VERSION; + dev->uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_CREATE_AH) | + (1ull << IB_USER_VERBS_CMD_DESTROY_AH) | + (1ull << IB_USER_VERBS_CMD_QUERY_AH) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV) | + (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV); + dev->node_type = RDMA_NODE_IB_CA; + dev->phys_port_cnt = 1; + dev->num_comp_vectors = 1; + dev->dma_device = &dd->pcidev->dev; + dev->query_device = ipath_query_device; + dev->modify_device = ipath_modify_device; + dev->query_port = ipath_query_port; + dev->modify_port = ipath_modify_port; + dev->query_pkey = ipath_query_pkey; + dev->query_gid = ipath_query_gid; + dev->alloc_ucontext = ipath_alloc_ucontext; + dev->dealloc_ucontext = ipath_dealloc_ucontext; + dev->alloc_pd = ipath_alloc_pd; + dev->dealloc_pd = ipath_dealloc_pd; + dev->create_ah = ipath_create_ah; + dev->destroy_ah = ipath_destroy_ah; + dev->query_ah = ipath_query_ah; + dev->create_srq = ipath_create_srq; + dev->modify_srq = ipath_modify_srq; + dev->query_srq = ipath_query_srq; + dev->destroy_srq = ipath_destroy_srq; + dev->create_qp = ipath_create_qp; + dev->modify_qp = ipath_modify_qp; + dev->query_qp = ipath_query_qp; + dev->destroy_qp = ipath_destroy_qp; + dev->post_send = ipath_post_send; + dev->post_recv = ipath_post_receive; + dev->post_srq_recv = ipath_post_srq_receive; + dev->create_cq = ipath_create_cq; + dev->destroy_cq = ipath_destroy_cq; + dev->resize_cq = ipath_resize_cq; + dev->poll_cq = ipath_poll_cq; + dev->req_notify_cq = ipath_req_notify_cq; + dev->get_dma_mr = ipath_get_dma_mr; + dev->reg_phys_mr = ipath_reg_phys_mr; + dev->reg_user_mr = ipath_reg_user_mr; + dev->dereg_mr = ipath_dereg_mr; + dev->alloc_fmr = ipath_alloc_fmr; + dev->map_phys_fmr = ipath_map_phys_fmr; + dev->unmap_fmr = ipath_unmap_fmr; + dev->dealloc_fmr = ipath_dealloc_fmr; + dev->attach_mcast = ipath_multicast_attach; + dev->detach_mcast = ipath_multicast_detach; + dev->process_mad = ipath_process_mad; + dev->mmap = ipath_mmap; + dev->dma_ops = &ipath_dma_mapping_ops; + dev->get_port_immutable = ipath_port_immutable; + + snprintf(dev->node_desc, sizeof(dev->node_desc), + IPATH_IDSTR " %s", init_utsname()->nodename); + + ret = ib_register_device(dev, NULL); + if (ret) + goto err_reg; + + ret = ipath_verbs_register_sysfs(dev); + if (ret) + goto err_class; + + enable_timer(dd); + + goto bail; + +err_class: + ib_unregister_device(dev); +err_reg: + kfree(idev->lk_table.table); +err_lk: + kfree(idev->qp_table.table); +err_qp: + kfree(idev->txreq_bufs); +err_tx: + ib_dealloc_device(dev); + ipath_dev_err(dd, "cannot register verbs: %d!\n", -ret); + idev = NULL; + +bail: + dd->verbs_dev = idev; + return ret; +} + +void ipath_unregister_ib_device(struct ipath_ibdev *dev) +{ + struct ib_device *ibdev = &dev->ibdev; + u32 qps_inuse; + + ib_unregister_device(ibdev); + + disable_timer(dev->dd); + + if (!list_empty(&dev->pending[0]) || + !list_empty(&dev->pending[1]) || + !list_empty(&dev->pending[2])) + ipath_dev_err(dev->dd, "pending list not empty!\n"); + if (!list_empty(&dev->piowait)) + ipath_dev_err(dev->dd, "piowait list not empty!\n"); + if (!list_empty(&dev->rnrwait)) + ipath_dev_err(dev->dd, "rnrwait list not empty!\n"); + if (!ipath_mcast_tree_empty()) + ipath_dev_err(dev->dd, "multicast table memory leak!\n"); + /* + * Note that ipath_unregister_ib_device() can be called before all + * the QPs are destroyed! + */ + qps_inuse = ipath_free_all_qps(&dev->qp_table); + if (qps_inuse) + ipath_dev_err(dev->dd, "QP memory leak! %u still in use\n", + qps_inuse); + kfree(dev->qp_table.table); + kfree(dev->lk_table.table); + kfree(dev->txreq_bufs); + ib_dealloc_device(ibdev); +} + +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ipath_ibdev *dev = + container_of(device, struct ipath_ibdev, ibdev.dev); + + return sprintf(buf, "%x\n", dev->dd->ipath_pcirev); +} + +static ssize_t show_hca(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ipath_ibdev *dev = + container_of(device, struct ipath_ibdev, ibdev.dev); + int ret; + + ret = dev->dd->ipath_f_get_boardname(dev->dd, buf, 128); + if (ret < 0) + goto bail; + strcat(buf, "\n"); + ret = strlen(buf); + +bail: + return ret; +} + +static ssize_t show_stats(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct ipath_ibdev *dev = + container_of(device, struct ipath_ibdev, ibdev.dev); + int i; + int len; + + len = sprintf(buf, + "RC resends %d\n" + "RC no QACK %d\n" + "RC ACKs %d\n" + "RC SEQ NAKs %d\n" + "RC RDMA seq %d\n" + "RC RNR NAKs %d\n" + "RC OTH NAKs %d\n" + "RC timeouts %d\n" + "RC RDMA dup %d\n" + "piobuf wait %d\n" + "unaligned %d\n" + "PKT drops %d\n" + "WQE errs %d\n", + dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks, + dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks, + dev->n_other_naks, dev->n_timeouts, + dev->n_rdma_dup_busy, dev->n_piowait, dev->n_unaligned, + dev->n_pkt_drops, dev->n_wqe_errs); + for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) { + const struct ipath_opcode_stats *si = &dev->opstats[i]; + + if (!si->n_packets && !si->n_bytes) + continue; + len += sprintf(buf + len, "%02x %llu/%llu\n", i, + (unsigned long long) si->n_packets, + (unsigned long long) si->n_bytes); + } + return len; +} + +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL); +static DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL); + +static struct device_attribute *ipath_class_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_hca_type, + &dev_attr_board_id, + &dev_attr_stats +}; + +static int ipath_verbs_register_sysfs(struct ib_device *dev) +{ + int i; + int ret; + + for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i) { + ret = device_create_file(&dev->dev, + ipath_class_attributes[i]); + if (ret) + goto bail; + } + return 0; +bail: + for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i) + device_remove_file(&dev->dev, ipath_class_attributes[i]); + return ret; +} diff --git a/drivers/staging/rdma/ipath/ipath_verbs.h b/drivers/staging/rdma/ipath/ipath_verbs.h new file mode 100644 index 000000000..ec167e545 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_verbs.h @@ -0,0 +1,939 @@ +/* + * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IPATH_VERBS_H +#define IPATH_VERBS_H + +#include +#include +#include +#include +#include +#include +#include + +#include "ipath_kernel.h" + +#define IPATH_MAX_RDMA_ATOMIC 4 + +#define QPN_MAX (1 << 24) +#define QPNMAP_ENTRIES (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE) + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define IPATH_UVERBS_ABI_VERSION 2 + +/* + * Define an ib_cq_notify value that is not valid so we know when CQ + * notifications are armed. + */ +#define IB_CQ_NONE (IB_CQ_NEXT_COMP + 1) + +/* AETH NAK opcode values */ +#define IB_RNR_NAK 0x20 +#define IB_NAK_PSN_ERROR 0x60 +#define IB_NAK_INVALID_REQUEST 0x61 +#define IB_NAK_REMOTE_ACCESS_ERROR 0x62 +#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63 +#define IB_NAK_INVALID_RD_REQUEST 0x64 + +/* Flags for checking QP state (see ib_ipath_state_ops[]) */ +#define IPATH_POST_SEND_OK 0x01 +#define IPATH_POST_RECV_OK 0x02 +#define IPATH_PROCESS_RECV_OK 0x04 +#define IPATH_PROCESS_SEND_OK 0x08 +#define IPATH_PROCESS_NEXT_SEND_OK 0x10 +#define IPATH_FLUSH_SEND 0x20 +#define IPATH_FLUSH_RECV 0x40 +#define IPATH_PROCESS_OR_FLUSH_SEND \ + (IPATH_PROCESS_SEND_OK | IPATH_FLUSH_SEND) + +/* IB Performance Manager status values */ +#define IB_PMA_SAMPLE_STATUS_DONE 0x00 +#define IB_PMA_SAMPLE_STATUS_STARTED 0x01 +#define IB_PMA_SAMPLE_STATUS_RUNNING 0x02 + +/* Mandatory IB performance counter select values. */ +#define IB_PMA_PORT_XMIT_DATA cpu_to_be16(0x0001) +#define IB_PMA_PORT_RCV_DATA cpu_to_be16(0x0002) +#define IB_PMA_PORT_XMIT_PKTS cpu_to_be16(0x0003) +#define IB_PMA_PORT_RCV_PKTS cpu_to_be16(0x0004) +#define IB_PMA_PORT_XMIT_WAIT cpu_to_be16(0x0005) + +struct ib_reth { + __be64 vaddr; + __be32 rkey; + __be32 length; +} __attribute__ ((packed)); + +struct ib_atomic_eth { + __be32 vaddr[2]; /* unaligned so access as 2 32-bit words */ + __be32 rkey; + __be64 swap_data; + __be64 compare_data; +} __attribute__ ((packed)); + +struct ipath_other_headers { + __be32 bth[3]; + union { + struct { + __be32 deth[2]; + __be32 imm_data; + } ud; + struct { + struct ib_reth reth; + __be32 imm_data; + } rc; + struct { + __be32 aeth; + __be32 atomic_ack_eth[2]; + } at; + __be32 imm_data; + __be32 aeth; + struct ib_atomic_eth atomic_eth; + } u; +} __attribute__ ((packed)); + +/* + * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes + * long (72 w/ imm_data). Only the first 56 bytes of the IB header + * will be in the eager header buffer. The remaining 12 or 16 bytes + * are in the data buffer. + */ +struct ipath_ib_header { + __be16 lrh[4]; + union { + struct { + struct ib_grh grh; + struct ipath_other_headers oth; + } l; + struct ipath_other_headers oth; + } u; +} __attribute__ ((packed)); + +struct ipath_pio_header { + __le32 pbc[2]; + struct ipath_ib_header hdr; +} __attribute__ ((packed)); + +/* + * There is one struct ipath_mcast for each multicast GID. + * All attached QPs are then stored as a list of + * struct ipath_mcast_qp. + */ +struct ipath_mcast_qp { + struct list_head list; + struct ipath_qp *qp; +}; + +struct ipath_mcast { + struct rb_node rb_node; + union ib_gid mgid; + struct list_head qp_list; + wait_queue_head_t wait; + atomic_t refcount; + int n_attached; +}; + +/* Protection domain */ +struct ipath_pd { + struct ib_pd ibpd; + int user; /* non-zero if created from user space */ +}; + +/* Address Handle */ +struct ipath_ah { + struct ib_ah ibah; + struct ib_ah_attr attr; +}; + +/* + * This structure is used by ipath_mmap() to validate an offset + * when an mmap() request is made. The vm_area_struct then uses + * this as its vm_private_data. + */ +struct ipath_mmap_info { + struct list_head pending_mmaps; + struct ib_ucontext *context; + void *obj; + __u64 offset; + struct kref ref; + unsigned size; +}; + +/* + * This structure is used to contain the head pointer, tail pointer, + * and completion queue entries as a single memory allocation so + * it can be mmap'ed into user space. + */ +struct ipath_cq_wc { + u32 head; /* index of next entry to fill */ + u32 tail; /* index of next ib_poll_cq() entry */ + union { + /* these are actually size ibcq.cqe + 1 */ + struct ib_uverbs_wc uqueue[0]; + struct ib_wc kqueue[0]; + }; +}; + +/* + * The completion queue structure. + */ +struct ipath_cq { + struct ib_cq ibcq; + struct tasklet_struct comptask; + spinlock_t lock; + u8 notify; + u8 triggered; + struct ipath_cq_wc *queue; + struct ipath_mmap_info *ip; +}; + +/* + * A segment is a linear region of low physical memory. + * XXX Maybe we should use phys addr here and kmap()/kunmap(). + * Used by the verbs layer. + */ +struct ipath_seg { + void *vaddr; + size_t length; +}; + +/* The number of ipath_segs that fit in a page. */ +#define IPATH_SEGSZ (PAGE_SIZE / sizeof (struct ipath_seg)) + +struct ipath_segarray { + struct ipath_seg segs[IPATH_SEGSZ]; +}; + +struct ipath_mregion { + struct ib_pd *pd; /* shares refcnt of ibmr.pd */ + u64 user_base; /* User's address for this region */ + u64 iova; /* IB start address of this region */ + size_t length; + u32 lkey; + u32 offset; /* offset (bytes) to start of region */ + int access_flags; + u32 max_segs; /* number of ipath_segs in all the arrays */ + u32 mapsz; /* size of the map array */ + struct ipath_segarray *map[0]; /* the segments */ +}; + +/* + * These keep track of the copy progress within a memory region. + * Used by the verbs layer. + */ +struct ipath_sge { + struct ipath_mregion *mr; + void *vaddr; /* kernel virtual address of segment */ + u32 sge_length; /* length of the SGE */ + u32 length; /* remaining length of the segment */ + u16 m; /* current index: mr->map[m] */ + u16 n; /* current index: mr->map[m]->segs[n] */ +}; + +/* Memory region */ +struct ipath_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct ipath_mregion mr; /* must be last */ +}; + +/* + * Send work request queue entry. + * The size of the sg_list is determined when the QP is created and stored + * in qp->s_max_sge. + */ +struct ipath_swqe { + struct ib_send_wr wr; /* don't use wr.sg_list */ + u32 psn; /* first packet sequence number */ + u32 lpsn; /* last packet sequence number */ + u32 ssn; /* send sequence number */ + u32 length; /* total length of data in sg_list */ + struct ipath_sge sg_list[0]; +}; + +/* + * Receive work request queue entry. + * The size of the sg_list is determined when the QP (or SRQ) is created + * and stored in qp->r_rq.max_sge (or srq->rq.max_sge). + */ +struct ipath_rwqe { + u64 wr_id; + u8 num_sge; + struct ib_sge sg_list[0]; +}; + +/* + * This structure is used to contain the head pointer, tail pointer, + * and receive work queue entries as a single memory allocation so + * it can be mmap'ed into user space. + * Note that the wq array elements are variable size so you can't + * just index into the array to get the N'th element; + * use get_rwqe_ptr() instead. + */ +struct ipath_rwq { + u32 head; /* new work requests posted to the head */ + u32 tail; /* receives pull requests from here. */ + struct ipath_rwqe wq[0]; +}; + +struct ipath_rq { + struct ipath_rwq *wq; + spinlock_t lock; + u32 size; /* size of RWQE array */ + u8 max_sge; +}; + +struct ipath_srq { + struct ib_srq ibsrq; + struct ipath_rq rq; + struct ipath_mmap_info *ip; + /* send signal when number of RWQEs < limit */ + u32 limit; +}; + +struct ipath_sge_state { + struct ipath_sge *sg_list; /* next SGE to be used if any */ + struct ipath_sge sge; /* progress state for the current SGE */ + u8 num_sge; + u8 static_rate; +}; + +/* + * This structure holds the information that the send tasklet needs + * to send a RDMA read response or atomic operation. + */ +struct ipath_ack_entry { + u8 opcode; + u8 sent; + u32 psn; + union { + struct ipath_sge_state rdma_sge; + u64 atomic_data; + }; +}; + +/* + * Variables prefixed with s_ are for the requester (sender). + * Variables prefixed with r_ are for the responder (receiver). + * Variables prefixed with ack_ are for responder replies. + * + * Common variables are protected by both r_rq.lock and s_lock in that order + * which only happens in modify_qp() or changing the QP 'state'. + */ +struct ipath_qp { + struct ib_qp ibqp; + struct ipath_qp *next; /* link list for QPN hash table */ + struct ipath_qp *timer_next; /* link list for ipath_ib_timer() */ + struct ipath_qp *pio_next; /* link for ipath_ib_piobufavail() */ + struct list_head piowait; /* link for wait PIO buf */ + struct list_head timerwait; /* link for waiting for timeouts */ + struct ib_ah_attr remote_ah_attr; + struct ipath_ib_header s_hdr; /* next packet header to send */ + atomic_t refcount; + wait_queue_head_t wait; + wait_queue_head_t wait_dma; + struct tasklet_struct s_task; + struct ipath_mmap_info *ip; + struct ipath_sge_state *s_cur_sge; + struct ipath_verbs_txreq *s_tx; + struct ipath_sge_state s_sge; /* current send request data */ + struct ipath_ack_entry s_ack_queue[IPATH_MAX_RDMA_ATOMIC + 1]; + struct ipath_sge_state s_ack_rdma_sge; + struct ipath_sge_state s_rdma_read_sge; + struct ipath_sge_state r_sge; /* current receive data */ + spinlock_t s_lock; + atomic_t s_dma_busy; + u16 s_pkt_delay; + u16 s_hdrwords; /* size of s_hdr in 32 bit words */ + u32 s_cur_size; /* size of send packet in bytes */ + u32 s_len; /* total length of s_sge */ + u32 s_rdma_read_len; /* total length of s_rdma_read_sge */ + u32 s_next_psn; /* PSN for next request */ + u32 s_last_psn; /* last response PSN processed */ + u32 s_psn; /* current packet sequence number */ + u32 s_ack_rdma_psn; /* PSN for sending RDMA read responses */ + u32 s_ack_psn; /* PSN for acking sends and RDMA writes */ + u32 s_rnr_timeout; /* number of milliseconds for RNR timeout */ + u32 r_ack_psn; /* PSN for next ACK or atomic ACK */ + u64 r_wr_id; /* ID for current receive WQE */ + unsigned long r_aflags; + u32 r_len; /* total length of r_sge */ + u32 r_rcv_len; /* receive data len processed */ + u32 r_psn; /* expected rcv packet sequence number */ + u32 r_msn; /* message sequence number */ + u8 state; /* QP state */ + u8 s_state; /* opcode of last packet sent */ + u8 s_ack_state; /* opcode of packet to ACK */ + u8 s_nak_state; /* non-zero if NAK is pending */ + u8 r_state; /* opcode of last packet received */ + u8 r_nak_state; /* non-zero if NAK is pending */ + u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */ + u8 r_flags; + u8 r_max_rd_atomic; /* max number of RDMA read/atomic to receive */ + u8 r_head_ack_queue; /* index into s_ack_queue[] */ + u8 qp_access_flags; + u8 s_max_sge; /* size of s_wq->sg_list */ + u8 s_retry_cnt; /* number of times to retry */ + u8 s_rnr_retry_cnt; + u8 s_retry; /* requester retry counter */ + u8 s_rnr_retry; /* requester RNR retry counter */ + u8 s_pkey_index; /* PKEY index to use */ + u8 s_max_rd_atomic; /* max number of RDMA read/atomic to send */ + u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */ + u8 s_tail_ack_queue; /* index into s_ack_queue[] */ + u8 s_flags; + u8 s_dmult; + u8 s_draining; + u8 timeout; /* Timeout for this QP */ + enum ib_mtu path_mtu; + u32 remote_qpn; + u32 qkey; /* QKEY for this QP (for UD or RD) */ + u32 s_size; /* send work queue size */ + u32 s_head; /* new entries added here */ + u32 s_tail; /* next entry to process */ + u32 s_cur; /* current work queue entry */ + u32 s_last; /* last un-ACK'ed entry */ + u32 s_ssn; /* SSN of tail entry */ + u32 s_lsn; /* limit sequence number (credit) */ + struct ipath_swqe *s_wq; /* send work queue */ + struct ipath_swqe *s_wqe; + struct ipath_sge *r_ud_sg_list; + struct ipath_rq r_rq; /* receive work queue */ + struct ipath_sge r_sg_list[0]; /* verified SGEs */ +}; + +/* + * Atomic bit definitions for r_aflags. + */ +#define IPATH_R_WRID_VALID 0 + +/* + * Bit definitions for r_flags. + */ +#define IPATH_R_REUSE_SGE 0x01 +#define IPATH_R_RDMAR_SEQ 0x02 + +/* + * Bit definitions for s_flags. + * + * IPATH_S_FENCE_PENDING - waiting for all prior RDMA read or atomic SWQEs + * before processing the next SWQE + * IPATH_S_RDMAR_PENDING - waiting for any RDMA read or atomic SWQEs + * before processing the next SWQE + * IPATH_S_WAITING - waiting for RNR timeout or send buffer available. + * IPATH_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE + * IPATH_S_WAIT_DMA - waiting for send DMA queue to drain before generating + * next send completion entry not via send DMA. + */ +#define IPATH_S_SIGNAL_REQ_WR 0x01 +#define IPATH_S_FENCE_PENDING 0x02 +#define IPATH_S_RDMAR_PENDING 0x04 +#define IPATH_S_ACK_PENDING 0x08 +#define IPATH_S_BUSY 0x10 +#define IPATH_S_WAITING 0x20 +#define IPATH_S_WAIT_SSN_CREDIT 0x40 +#define IPATH_S_WAIT_DMA 0x80 + +#define IPATH_S_ANY_WAIT (IPATH_S_FENCE_PENDING | IPATH_S_RDMAR_PENDING | \ + IPATH_S_WAITING | IPATH_S_WAIT_SSN_CREDIT | IPATH_S_WAIT_DMA) + +#define IPATH_PSN_CREDIT 512 + +/* + * Since struct ipath_swqe is not a fixed size, we can't simply index into + * struct ipath_qp.s_wq. This function does the array index computation. + */ +static inline struct ipath_swqe *get_swqe_ptr(struct ipath_qp *qp, + unsigned n) +{ + return (struct ipath_swqe *)((char *)qp->s_wq + + (sizeof(struct ipath_swqe) + + qp->s_max_sge * + sizeof(struct ipath_sge)) * n); +} + +/* + * Since struct ipath_rwqe is not a fixed size, we can't simply index into + * struct ipath_rwq.wq. This function does the array index computation. + */ +static inline struct ipath_rwqe *get_rwqe_ptr(struct ipath_rq *rq, + unsigned n) +{ + return (struct ipath_rwqe *) + ((char *) rq->wq->wq + + (sizeof(struct ipath_rwqe) + + rq->max_sge * sizeof(struct ib_sge)) * n); +} + +/* + * QPN-map pages start out as NULL, they get allocated upon + * first use and are never deallocated. This way, + * large bitmaps are not allocated unless large numbers of QPs are used. + */ +struct qpn_map { + atomic_t n_free; + void *page; +}; + +struct ipath_qp_table { + spinlock_t lock; + u32 last; /* last QP number allocated */ + u32 max; /* size of the hash table */ + u32 nmaps; /* size of the map table */ + struct ipath_qp **table; + /* bit map of free numbers */ + struct qpn_map map[QPNMAP_ENTRIES]; +}; + +struct ipath_lkey_table { + spinlock_t lock; + u32 next; /* next unused index (speeds search) */ + u32 gen; /* generation count */ + u32 max; /* size of the table */ + struct ipath_mregion **table; +}; + +struct ipath_opcode_stats { + u64 n_packets; /* number of packets */ + u64 n_bytes; /* total number of bytes */ +}; + +struct ipath_ibdev { + struct ib_device ibdev; + struct ipath_devdata *dd; + struct list_head pending_mmaps; + spinlock_t mmap_offset_lock; + u32 mmap_offset; + int ib_unit; /* This is the device number */ + u16 sm_lid; /* in host order */ + u8 sm_sl; + u8 mkeyprot; + /* non-zero when timer is set */ + unsigned long mkey_lease_timeout; + + /* The following fields are really per port. */ + struct ipath_qp_table qp_table; + struct ipath_lkey_table lk_table; + struct list_head pending[3]; /* FIFO of QPs waiting for ACKs */ + struct list_head piowait; /* list for wait PIO buf */ + struct list_head txreq_free; + void *txreq_bufs; + /* list of QPs waiting for RNR timer */ + struct list_head rnrwait; + spinlock_t pending_lock; + __be64 sys_image_guid; /* in network order */ + __be64 gid_prefix; /* in network order */ + __be64 mkey; + + u32 n_pds_allocated; /* number of PDs allocated for device */ + spinlock_t n_pds_lock; + u32 n_ahs_allocated; /* number of AHs allocated for device */ + spinlock_t n_ahs_lock; + u32 n_cqs_allocated; /* number of CQs allocated for device */ + spinlock_t n_cqs_lock; + u32 n_qps_allocated; /* number of QPs allocated for device */ + spinlock_t n_qps_lock; + u32 n_srqs_allocated; /* number of SRQs allocated for device */ + spinlock_t n_srqs_lock; + u32 n_mcast_grps_allocated; /* number of mcast groups allocated */ + spinlock_t n_mcast_grps_lock; + + u64 ipath_sword; /* total dwords sent (sample result) */ + u64 ipath_rword; /* total dwords received (sample result) */ + u64 ipath_spkts; /* total packets sent (sample result) */ + u64 ipath_rpkts; /* total packets received (sample result) */ + /* # of ticks no data sent (sample result) */ + u64 ipath_xmit_wait; + u64 rcv_errors; /* # of packets with SW detected rcv errs */ + u64 n_unicast_xmit; /* total unicast packets sent */ + u64 n_unicast_rcv; /* total unicast packets received */ + u64 n_multicast_xmit; /* total multicast packets sent */ + u64 n_multicast_rcv; /* total multicast packets received */ + u64 z_symbol_error_counter; /* starting count for PMA */ + u64 z_link_error_recovery_counter; /* starting count for PMA */ + u64 z_link_downed_counter; /* starting count for PMA */ + u64 z_port_rcv_errors; /* starting count for PMA */ + u64 z_port_rcv_remphys_errors; /* starting count for PMA */ + u64 z_port_xmit_discards; /* starting count for PMA */ + u64 z_port_xmit_data; /* starting count for PMA */ + u64 z_port_rcv_data; /* starting count for PMA */ + u64 z_port_xmit_packets; /* starting count for PMA */ + u64 z_port_rcv_packets; /* starting count for PMA */ + u32 z_pkey_violations; /* starting count for PMA */ + u32 z_local_link_integrity_errors; /* starting count for PMA */ + u32 z_excessive_buffer_overrun_errors; /* starting count for PMA */ + u32 z_vl15_dropped; /* starting count for PMA */ + u32 n_rc_resends; + u32 n_rc_acks; + u32 n_rc_qacks; + u32 n_seq_naks; + u32 n_rdma_seq; + u32 n_rnr_naks; + u32 n_other_naks; + u32 n_timeouts; + u32 n_pkt_drops; + u32 n_vl15_dropped; + u32 n_wqe_errs; + u32 n_rdma_dup_busy; + u32 n_piowait; + u32 n_unaligned; + u32 port_cap_flags; + u32 pma_sample_start; + u32 pma_sample_interval; + __be16 pma_counter_select[5]; + u16 pma_tag; + u16 qkey_violations; + u16 mkey_violations; + u16 mkey_lease_period; + u16 pending_index; /* which pending queue is active */ + u8 pma_sample_status; + u8 subnet_timeout; + u8 vl_high_limit; + struct ipath_opcode_stats opstats[128]; +}; + +struct ipath_verbs_counters { + u64 symbol_error_counter; + u64 link_error_recovery_counter; + u64 link_downed_counter; + u64 port_rcv_errors; + u64 port_rcv_remphys_errors; + u64 port_xmit_discards; + u64 port_xmit_data; + u64 port_rcv_data; + u64 port_xmit_packets; + u64 port_rcv_packets; + u32 local_link_integrity_errors; + u32 excessive_buffer_overrun_errors; + u32 vl15_dropped; +}; + +struct ipath_verbs_txreq { + struct ipath_qp *qp; + struct ipath_swqe *wqe; + u32 map_len; + u32 len; + struct ipath_sge_state *ss; + struct ipath_pio_header hdr; + struct ipath_sdma_txreq txreq; +}; + +static inline struct ipath_mr *to_imr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct ipath_mr, ibmr); +} + +static inline struct ipath_pd *to_ipd(struct ib_pd *ibpd) +{ + return container_of(ibpd, struct ipath_pd, ibpd); +} + +static inline struct ipath_ah *to_iah(struct ib_ah *ibah) +{ + return container_of(ibah, struct ipath_ah, ibah); +} + +static inline struct ipath_cq *to_icq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct ipath_cq, ibcq); +} + +static inline struct ipath_srq *to_isrq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct ipath_srq, ibsrq); +} + +static inline struct ipath_qp *to_iqp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct ipath_qp, ibqp); +} + +static inline struct ipath_ibdev *to_idev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct ipath_ibdev, ibdev); +} + +/* + * This must be called with s_lock held. + */ +static inline void ipath_schedule_send(struct ipath_qp *qp) +{ + if (qp->s_flags & IPATH_S_ANY_WAIT) + qp->s_flags &= ~IPATH_S_ANY_WAIT; + if (!(qp->s_flags & IPATH_S_BUSY)) + tasklet_hi_schedule(&qp->s_task); +} + +int ipath_process_mad(struct ib_device *ibdev, + int mad_flags, + u8 port_num, + const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad_hdr *in, size_t in_mad_size, + struct ib_mad_hdr *out, size_t *out_mad_size, + u16 *out_mad_pkey_index); + +/* + * Compare the lower 24 bits of the two values. + * Returns an integer <, ==, or > than zero. + */ +static inline int ipath_cmp24(u32 a, u32 b) +{ + return (((int) a) - ((int) b)) << 8; +} + +struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid); + +int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords, + u64 *rwords, u64 *spkts, u64 *rpkts, + u64 *xmit_wait); + +int ipath_get_counters(struct ipath_devdata *dd, + struct ipath_verbs_counters *cntrs); + +int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); + +int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid); + +int ipath_mcast_tree_empty(void); + +__be32 ipath_compute_aeth(struct ipath_qp *qp); + +struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn); + +struct ib_qp *ipath_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); + +int ipath_destroy_qp(struct ib_qp *ibqp); + +int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err); + +int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); + +int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_qp_init_attr *init_attr); + +unsigned ipath_free_all_qps(struct ipath_qp_table *qpt); + +int ipath_init_qp_table(struct ipath_ibdev *idev, int size); + +void ipath_get_credit(struct ipath_qp *qp, u32 aeth); + +unsigned ipath_ib_rate_to_mult(enum ib_rate rate); + +int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr, + u32 hdrwords, struct ipath_sge_state *ss, u32 len); + +void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length); + +void ipath_skip_sge(struct ipath_sge_state *ss, u32 length); + +void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp); + +void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp); + +void ipath_restart_rc(struct ipath_qp *qp, u32 psn); + +void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err); + +int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr); + +void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr, + int has_grh, void *data, u32 tlen, struct ipath_qp *qp); + +int ipath_alloc_lkey(struct ipath_lkey_table *rkt, + struct ipath_mregion *mr); + +void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey); + +int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge, + struct ib_sge *sge, int acc); + +int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss, + u32 len, u64 vaddr, u32 rkey, int acc); + +int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr); + +struct ib_srq *ipath_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata); + +int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, + struct ib_udata *udata); + +int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr); + +int ipath_destroy_srq(struct ib_srq *ibsrq); + +void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig); + +int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); + +struct ib_cq *ipath_create_cq(struct ib_device *ibdev, + const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata); + +int ipath_destroy_cq(struct ib_cq *ibcq); + +int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags); + +int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); + +struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc); + +struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd, + struct ib_phys_buf *buffer_list, + int num_phys_buf, int acc, u64 *iova_start); + +struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 virt_addr, int mr_access_flags, + struct ib_udata *udata); + +int ipath_dereg_mr(struct ib_mr *ibmr); + +struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags, + struct ib_fmr_attr *fmr_attr); + +int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list, + int list_len, u64 iova); + +int ipath_unmap_fmr(struct list_head *fmr_list); + +int ipath_dealloc_fmr(struct ib_fmr *ibfmr); + +void ipath_release_mmap_info(struct kref *ref); + +struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev, + u32 size, + struct ib_ucontext *context, + void *obj); + +void ipath_update_mmap_info(struct ipath_ibdev *dev, + struct ipath_mmap_info *ip, + u32 size, void *obj); + +int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); + +void ipath_insert_rnr_queue(struct ipath_qp *qp); + +int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe, + u32 *lengthp, struct ipath_sge_state *ss); + +int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only); + +u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr, + struct ib_global_route *grh, u32 hwords, u32 nwords); + +void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp, + struct ipath_other_headers *ohdr, + u32 bth0, u32 bth2); + +void ipath_do_send(unsigned long data); + +void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe, + enum ib_wc_status status); + +int ipath_make_rc_req(struct ipath_qp *qp); + +int ipath_make_uc_req(struct ipath_qp *qp); + +int ipath_make_ud_req(struct ipath_qp *qp); + +int ipath_register_ib_device(struct ipath_devdata *); + +void ipath_unregister_ib_device(struct ipath_ibdev *); + +void ipath_ib_rcv(struct ipath_ibdev *, void *, void *, u32); + +int ipath_ib_piobufavail(struct ipath_ibdev *); + +unsigned ipath_get_npkeys(struct ipath_devdata *); + +u32 ipath_get_cr_errpkey(struct ipath_devdata *); + +unsigned ipath_get_pkey(struct ipath_devdata *, unsigned); + +extern const enum ib_wc_opcode ib_ipath_wc_opcode[]; + +/* + * Below converts HCA-specific LinkTrainingState to IB PhysPortState + * values. + */ +extern const u8 ipath_cvt_physportstate[]; +#define IB_PHYSPORTSTATE_SLEEP 1 +#define IB_PHYSPORTSTATE_POLL 2 +#define IB_PHYSPORTSTATE_DISABLED 3 +#define IB_PHYSPORTSTATE_CFG_TRAIN 4 +#define IB_PHYSPORTSTATE_LINKUP 5 +#define IB_PHYSPORTSTATE_LINK_ERR_RECOVER 6 + +extern const int ib_ipath_state_ops[]; + +extern unsigned int ib_ipath_lkey_table_size; + +extern unsigned int ib_ipath_max_cqes; + +extern unsigned int ib_ipath_max_cqs; + +extern unsigned int ib_ipath_max_qp_wrs; + +extern unsigned int ib_ipath_max_qps; + +extern unsigned int ib_ipath_max_sges; + +extern unsigned int ib_ipath_max_mcast_grps; + +extern unsigned int ib_ipath_max_mcast_qp_attached; + +extern unsigned int ib_ipath_max_srqs; + +extern unsigned int ib_ipath_max_srq_sges; + +extern unsigned int ib_ipath_max_srq_wrs; + +extern const u32 ib_ipath_rnr_table[]; + +extern struct ib_dma_mapping_ops ipath_dma_mapping_ops; + +#endif /* IPATH_VERBS_H */ diff --git a/drivers/staging/rdma/ipath/ipath_verbs_mcast.c b/drivers/staging/rdma/ipath/ipath_verbs_mcast.c new file mode 100644 index 000000000..6216ea923 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_verbs_mcast.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#include "ipath_verbs.h" + +/* + * Global table of GID to attached QPs. + * The table is global to all ipath devices since a send from one QP/device + * needs to be locally routed to any locally attached QPs on the same + * or different device. + */ +static struct rb_root mcast_tree; +static DEFINE_SPINLOCK(mcast_lock); + +/** + * ipath_mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct + * @qp: the QP to link + */ +static struct ipath_mcast_qp *ipath_mcast_qp_alloc(struct ipath_qp *qp) +{ + struct ipath_mcast_qp *mqp; + + mqp = kmalloc(sizeof *mqp, GFP_KERNEL); + if (!mqp) + goto bail; + + mqp->qp = qp; + atomic_inc(&qp->refcount); + +bail: + return mqp; +} + +static void ipath_mcast_qp_free(struct ipath_mcast_qp *mqp) +{ + struct ipath_qp *qp = mqp->qp; + + /* Notify ipath_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + + kfree(mqp); +} + +/** + * ipath_mcast_alloc - allocate the multicast GID structure + * @mgid: the multicast GID + * + * A list of QPs will be attached to this structure. + */ +static struct ipath_mcast *ipath_mcast_alloc(union ib_gid *mgid) +{ + struct ipath_mcast *mcast; + + mcast = kmalloc(sizeof *mcast, GFP_KERNEL); + if (!mcast) + goto bail; + + mcast->mgid = *mgid; + INIT_LIST_HEAD(&mcast->qp_list); + init_waitqueue_head(&mcast->wait); + atomic_set(&mcast->refcount, 0); + mcast->n_attached = 0; + +bail: + return mcast; +} + +static void ipath_mcast_free(struct ipath_mcast *mcast) +{ + struct ipath_mcast_qp *p, *tmp; + + list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) + ipath_mcast_qp_free(p); + + kfree(mcast); +} + +/** + * ipath_mcast_find - search the global table for the given multicast GID + * @mgid: the multicast GID to search for + * + * Returns NULL if not found. + * + * The caller is responsible for decrementing the reference count if found. + */ +struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid) +{ + struct rb_node *n; + unsigned long flags; + struct ipath_mcast *mcast; + + spin_lock_irqsave(&mcast_lock, flags); + n = mcast_tree.rb_node; + while (n) { + int ret; + + mcast = rb_entry(n, struct ipath_mcast, rb_node); + + ret = memcmp(mgid->raw, mcast->mgid.raw, + sizeof(union ib_gid)); + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else { + atomic_inc(&mcast->refcount); + spin_unlock_irqrestore(&mcast_lock, flags); + goto bail; + } + } + spin_unlock_irqrestore(&mcast_lock, flags); + + mcast = NULL; + +bail: + return mcast; +} + +/** + * ipath_mcast_add - insert mcast GID into table and attach QP struct + * @mcast: the mcast GID table + * @mqp: the QP to attach + * + * Return zero if both were added. Return EEXIST if the GID was already in + * the table but the QP was added. Return ESRCH if the QP was already + * attached and neither structure was added. + */ +static int ipath_mcast_add(struct ipath_ibdev *dev, + struct ipath_mcast *mcast, + struct ipath_mcast_qp *mqp) +{ + struct rb_node **n = &mcast_tree.rb_node; + struct rb_node *pn = NULL; + int ret; + + spin_lock_irq(&mcast_lock); + + while (*n) { + struct ipath_mcast *tmcast; + struct ipath_mcast_qp *p; + + pn = *n; + tmcast = rb_entry(pn, struct ipath_mcast, rb_node); + + ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw, + sizeof(union ib_gid)); + if (ret < 0) { + n = &pn->rb_left; + continue; + } + if (ret > 0) { + n = &pn->rb_right; + continue; + } + + /* Search the QP list to see if this is already there. */ + list_for_each_entry_rcu(p, &tmcast->qp_list, list) { + if (p->qp == mqp->qp) { + ret = ESRCH; + goto bail; + } + } + if (tmcast->n_attached == ib_ipath_max_mcast_qp_attached) { + ret = ENOMEM; + goto bail; + } + + tmcast->n_attached++; + + list_add_tail_rcu(&mqp->list, &tmcast->qp_list); + ret = EEXIST; + goto bail; + } + + spin_lock(&dev->n_mcast_grps_lock); + if (dev->n_mcast_grps_allocated == ib_ipath_max_mcast_grps) { + spin_unlock(&dev->n_mcast_grps_lock); + ret = ENOMEM; + goto bail; + } + + dev->n_mcast_grps_allocated++; + spin_unlock(&dev->n_mcast_grps_lock); + + mcast->n_attached++; + + list_add_tail_rcu(&mqp->list, &mcast->qp_list); + + atomic_inc(&mcast->refcount); + rb_link_node(&mcast->rb_node, pn, n); + rb_insert_color(&mcast->rb_node, &mcast_tree); + + ret = 0; + +bail: + spin_unlock_irq(&mcast_lock); + + return ret; +} + +int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct ipath_qp *qp = to_iqp(ibqp); + struct ipath_ibdev *dev = to_idev(ibqp->device); + struct ipath_mcast *mcast; + struct ipath_mcast_qp *mqp; + int ret; + + /* + * Allocate data structures since its better to do this outside of + * spin locks and it will most likely be needed. + */ + mcast = ipath_mcast_alloc(gid); + if (mcast == NULL) { + ret = -ENOMEM; + goto bail; + } + mqp = ipath_mcast_qp_alloc(qp); + if (mqp == NULL) { + ipath_mcast_free(mcast); + ret = -ENOMEM; + goto bail; + } + switch (ipath_mcast_add(dev, mcast, mqp)) { + case ESRCH: + /* Neither was used: can't attach the same QP twice. */ + ipath_mcast_qp_free(mqp); + ipath_mcast_free(mcast); + ret = -EINVAL; + goto bail; + case EEXIST: /* The mcast wasn't used */ + ipath_mcast_free(mcast); + break; + case ENOMEM: + /* Exceeded the maximum number of mcast groups. */ + ipath_mcast_qp_free(mqp); + ipath_mcast_free(mcast); + ret = -ENOMEM; + goto bail; + default: + break; + } + + ret = 0; + +bail: + return ret; +} + +int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) +{ + struct ipath_qp *qp = to_iqp(ibqp); + struct ipath_ibdev *dev = to_idev(ibqp->device); + struct ipath_mcast *mcast = NULL; + struct ipath_mcast_qp *p, *tmp; + struct rb_node *n; + int last = 0; + int ret; + + spin_lock_irq(&mcast_lock); + + /* Find the GID in the mcast table. */ + n = mcast_tree.rb_node; + while (1) { + if (n == NULL) { + spin_unlock_irq(&mcast_lock); + ret = -EINVAL; + goto bail; + } + + mcast = rb_entry(n, struct ipath_mcast, rb_node); + ret = memcmp(gid->raw, mcast->mgid.raw, + sizeof(union ib_gid)); + if (ret < 0) + n = n->rb_left; + else if (ret > 0) + n = n->rb_right; + else + break; + } + + /* Search the QP list. */ + list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) { + if (p->qp != qp) + continue; + /* + * We found it, so remove it, but don't poison the forward + * link until we are sure there are no list walkers. + */ + list_del_rcu(&p->list); + mcast->n_attached--; + + /* If this was the last attached QP, remove the GID too. */ + if (list_empty(&mcast->qp_list)) { + rb_erase(&mcast->rb_node, &mcast_tree); + last = 1; + } + break; + } + + spin_unlock_irq(&mcast_lock); + + if (p) { + /* + * Wait for any list walkers to finish before freeing the + * list element. + */ + wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1); + ipath_mcast_qp_free(p); + } + if (last) { + atomic_dec(&mcast->refcount); + wait_event(mcast->wait, !atomic_read(&mcast->refcount)); + ipath_mcast_free(mcast); + spin_lock_irq(&dev->n_mcast_grps_lock); + dev->n_mcast_grps_allocated--; + spin_unlock_irq(&dev->n_mcast_grps_lock); + } + + ret = 0; + +bail: + return ret; +} + +int ipath_mcast_tree_empty(void) +{ + return mcast_tree.rb_node == NULL; +} diff --git a/drivers/staging/rdma/ipath/ipath_wc_ppc64.c b/drivers/staging/rdma/ipath/ipath_wc_ppc64.c new file mode 100644 index 000000000..1a7e20a75 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_wc_ppc64.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file is conditionally built on PowerPC only. Otherwise weak symbol + * versions of the functions exported from here are used. + */ + +#include "ipath_kernel.h" + +/** + * ipath_enable_wc - enable write combining for MMIO writes to the device + * @dd: infinipath device + * + * Nothing to do on PowerPC, so just return without error. + */ +int ipath_enable_wc(struct ipath_devdata *dd) +{ + return 0; +} diff --git a/drivers/staging/rdma/ipath/ipath_wc_x86_64.c b/drivers/staging/rdma/ipath/ipath_wc_x86_64.c new file mode 100644 index 000000000..7b6e4c843 --- /dev/null +++ b/drivers/staging/rdma/ipath/ipath_wc_x86_64.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved. + * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * This file is conditionally built on x86_64 only. Otherwise weak symbol + * versions of the functions exported from here are used. + */ + +#include +#include + +#include "ipath_kernel.h" + +/** + * ipath_enable_wc - enable write combining for MMIO writes to the device + * @dd: infinipath device + * + * This routine is x86_64-specific; it twiddles the CPU's MTRRs to enable + * write combining. + */ +int ipath_enable_wc(struct ipath_devdata *dd) +{ + int ret = 0; + u64 pioaddr, piolen; + unsigned bits; + const unsigned long addr = pci_resource_start(dd->pcidev, 0); + const size_t len = pci_resource_len(dd->pcidev, 0); + + /* + * Set the PIO buffers to be WCCOMB, so we get HT bursts to the + * chip. Linux (possibly the hardware) requires it to be on a power + * of 2 address matching the length (which has to be a power of 2). + * For rev1, that means the base address, for rev2, it will be just + * the PIO buffers themselves. + * For chips with two sets of buffers, the calculations are + * somewhat more complicated; we need to sum, and the piobufbase + * register has both offsets, 2K in low 32 bits, 4K in high 32 bits. + * The buffers are still packed, so a single range covers both. + */ + if (dd->ipath_piobcnt2k && dd->ipath_piobcnt4k) { /* 2 sizes */ + unsigned long pio2kbase, pio4kbase; + pio2kbase = dd->ipath_piobufbase & 0xffffffffUL; + pio4kbase = (dd->ipath_piobufbase >> 32) & 0xffffffffUL; + if (pio2kbase < pio4kbase) { /* all, for now */ + pioaddr = addr + pio2kbase; + piolen = pio4kbase - pio2kbase + + dd->ipath_piobcnt4k * dd->ipath_4kalign; + } else { + pioaddr = addr + pio4kbase; + piolen = pio2kbase - pio4kbase + + dd->ipath_piobcnt2k * dd->ipath_palign; + } + } else { /* single buffer size (2K, currently) */ + pioaddr = addr + dd->ipath_piobufbase; + piolen = dd->ipath_piobcnt2k * dd->ipath_palign + + dd->ipath_piobcnt4k * dd->ipath_4kalign; + } + + for (bits = 0; !(piolen & (1ULL << bits)); bits++) + /* do nothing */ ; + + if (piolen != (1ULL << bits)) { + piolen >>= bits; + while (piolen >>= 1) + bits++; + piolen = 1ULL << (bits + 1); + } + if (pioaddr & (piolen - 1)) { + u64 atmp; + ipath_dbg("pioaddr %llx not on right boundary for size " + "%llx, fixing\n", + (unsigned long long) pioaddr, + (unsigned long long) piolen); + atmp = pioaddr & ~(piolen - 1); + if (atmp < addr || (atmp + piolen) > (addr + len)) { + ipath_dev_err(dd, "No way to align address/size " + "(%llx/%llx), no WC mtrr\n", + (unsigned long long) atmp, + (unsigned long long) piolen << 1); + ret = -ENODEV; + } else { + ipath_dbg("changing WC base from %llx to %llx, " + "len from %llx to %llx\n", + (unsigned long long) pioaddr, + (unsigned long long) atmp, + (unsigned long long) piolen, + (unsigned long long) piolen << 1); + pioaddr = atmp; + piolen <<= 1; + } + } + + if (!ret) { + dd->wc_cookie = arch_phys_wc_add(pioaddr, piolen); + if (dd->wc_cookie < 0) { + ipath_dev_err(dd, "Seting mtrr failed on PIO buffers\n"); + ret = -ENODEV; + } else if (dd->wc_cookie == 0) + ipath_cdbg(VERBOSE, "Set mtrr for chip to WC not needed\n"); + else + ipath_cdbg(VERBOSE, "Set mtrr for chip to WC\n"); + } + + return ret; +} + +/** + * ipath_disable_wc - disable write combining for MMIO writes to the device + * @dd: infinipath device + */ +void ipath_disable_wc(struct ipath_devdata *dd) +{ + arch_phys_wc_del(dd->wc_cookie); +} -- cgit v1.2.3-54-g00ecf