diff options
Diffstat (limited to 'arch/powerpc/platforms/powernv')
-rw-r--r-- | arch/powerpc/platforms/powernv/Makefile | 3 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/eeh-powernv.c | 4 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/idle.c | 2 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/npu-dma.c | 348 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/opal-kmsg.c | 75 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/opal-prd.c | 1 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/opal-rtc.c | 5 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/opal-wrappers.S | 1 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/opal-xscom.c | 2 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/opal.c | 41 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci-ioda.c | 216 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci.c | 6 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/pci.h | 19 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/setup.c | 12 | ||||
-rw-r--r-- | arch/powerpc/platforms/powernv/smp.c | 74 |
15 files changed, 700 insertions, 109 deletions
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 1c8cdb625..f1516b5ec 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -2,9 +2,10 @@ obj-y += setup.o opal-wrappers.o opal.o opal-async.o idle.o obj-y += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o +obj-y += opal-kmsg.o obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o -obj-$(CONFIG_PCI) += pci.o pci-p5ioc2.o pci-ioda.o +obj-$(CONFIG_PCI) += pci.o pci-p5ioc2.o pci-ioda.o npu-dma.o obj-$(CONFIG_EEH) += eeh-powernv.o obj-$(CONFIG_PPC_SCOM) += opal-xscom.o obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 2ba602591..87f47e55a 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -48,8 +48,8 @@ static int pnv_eeh_init(void) struct pci_controller *hose; struct pnv_phb *phb; - if (!firmware_has_feature(FW_FEATURE_OPALv3)) { - pr_warn("%s: OPALv3 is required !\n", + if (!firmware_has_feature(FW_FEATURE_OPAL)) { + pr_warn("%s: OPAL is required !\n", __func__); return -EINVAL; } diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 59d735d2e..15bfbcd5d 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -242,7 +242,7 @@ static int __init pnv_init_idle_states(void) if (cpuidle_disable != IDLE_NO_OVERRIDE) goto out; - if (!firmware_has_feature(FW_FEATURE_OPALv3)) + if (!firmware_has_feature(FW_FEATURE_OPAL)) goto out; power_mgt = of_find_node_by_path("/ibm,opal/power-mgt"); diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c new file mode 100644 index 000000000..e85aa900f --- /dev/null +++ 
b/arch/powerpc/platforms/powernv/npu-dma.c @@ -0,0 +1,348 @@ +/* + * This file implements the DMA operations for NVLink devices. The NPU + * devices all point to the same iommu table as the parent PCI device. + * + * Copyright Alistair Popple, IBM Corporation 2015. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ + +#include <linux/export.h> +#include <linux/pci.h> +#include <linux/memblock.h> + +#include <asm/iommu.h> +#include <asm/pnv-pci.h> +#include <asm/msi_bitmap.h> +#include <asm/opal.h> + +#include "powernv.h" +#include "pci.h" + +/* + * Other types of TCE cache invalidation are not functional in the + * hardware. + */ +#define TCE_KILL_INVAL_ALL PPC_BIT(0) + +static struct pci_dev *get_pci_dev(struct device_node *dn) +{ + return PCI_DN(dn)->pcidev; +} + +/* Given a NPU device get the associated PCI device. */ +struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev) +{ + struct device_node *dn; + struct pci_dev *gpdev; + + /* Get associated PCI device */ + dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0); + if (!dn) + return NULL; + + gpdev = get_pci_dev(dn); + of_node_put(dn); + + return gpdev; +} +EXPORT_SYMBOL(pnv_pci_get_gpu_dev); + +/* Given the real PCI device get a linked NPU device. 
*/ +struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index) +{ + struct device_node *dn; + struct pci_dev *npdev; + + /* Get associated PCI device */ + dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index); + if (!dn) + return NULL; + + npdev = get_pci_dev(dn); + of_node_put(dn); + + return npdev; +} +EXPORT_SYMBOL(pnv_pci_get_npu_dev); + +#define NPU_DMA_OP_UNSUPPORTED() \ + dev_err_once(dev, "%s operation unsupported for NVLink devices\n", \ + __func__) + +static void *dma_npu_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag, + struct dma_attrs *attrs) +{ + NPU_DMA_OP_UNSUPPORTED(); + return NULL; +} + +static void dma_npu_free(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle, + struct dma_attrs *attrs) +{ + NPU_DMA_OP_UNSUPPORTED(); +} + +static dma_addr_t dma_npu_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + NPU_DMA_OP_UNSUPPORTED(); + return 0; +} + +static int dma_npu_map_sg(struct device *dev, struct scatterlist *sglist, + int nelems, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + NPU_DMA_OP_UNSUPPORTED(); + return 0; +} + +static int dma_npu_dma_supported(struct device *dev, u64 mask) +{ + NPU_DMA_OP_UNSUPPORTED(); + return 0; +} + +static u64 dma_npu_get_required_mask(struct device *dev) +{ + NPU_DMA_OP_UNSUPPORTED(); + return 0; +} + +struct dma_map_ops dma_npu_ops = { + .map_page = dma_npu_map_page, + .map_sg = dma_npu_map_sg, + .alloc = dma_npu_alloc, + .free = dma_npu_free, + .dma_supported = dma_npu_dma_supported, + .get_required_mask = dma_npu_get_required_mask, +}; + +/* + * Returns the PE associated with the PCI device of the given + * NPU. Returns the linked pci device if gpdev != NULL. 
+ */ +static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe, + struct pci_dev **gpdev) +{ + struct pnv_phb *phb; + struct pci_controller *hose; + struct pci_dev *pdev; + struct pnv_ioda_pe *pe; + struct pci_dn *pdn; + + if (npe->flags & PNV_IODA_PE_PEER) { + pe = npe->peers[0]; + pdev = pe->pdev; + } else { + pdev = pnv_pci_get_gpu_dev(npe->pdev); + if (!pdev) + return NULL; + + pdn = pci_get_pdn(pdev); + if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) + return NULL; + + hose = pci_bus_to_host(pdev->bus); + phb = hose->private_data; + pe = &phb->ioda.pe_array[pdn->pe_number]; + } + + if (gpdev) + *gpdev = pdev; + + return pe; +} + +void pnv_npu_tce_invalidate_entire(struct pnv_ioda_pe *npe) +{ + struct pnv_phb *phb = npe->phb; + + if (WARN_ON(phb->type != PNV_PHB_NPU || + !phb->ioda.tce_inval_reg || + !(npe->flags & PNV_IODA_PE_DEV))) + return; + + mb(); /* Ensure previous TCE table stores are visible */ + __raw_writeq(cpu_to_be64(TCE_KILL_INVAL_ALL), + phb->ioda.tce_inval_reg); +} + +void pnv_npu_tce_invalidate(struct pnv_ioda_pe *npe, + struct iommu_table *tbl, + unsigned long index, + unsigned long npages, + bool rm) +{ + struct pnv_phb *phb = npe->phb; + + /* We can only invalidate the whole cache on NPU */ + unsigned long val = TCE_KILL_INVAL_ALL; + + if (WARN_ON(phb->type != PNV_PHB_NPU || + !phb->ioda.tce_inval_reg || + !(npe->flags & PNV_IODA_PE_DEV))) + return; + + mb(); /* Ensure previous TCE table stores are visible */ + if (rm) + __raw_rm_writeq(cpu_to_be64(val), + (__be64 __iomem *) phb->ioda.tce_inval_reg_phys); + else + __raw_writeq(cpu_to_be64(val), + phb->ioda.tce_inval_reg); +} + +void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe) +{ + struct pnv_ioda_pe *gpe; + struct pci_dev *gpdev; + int i, avail = -1; + + if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV)) + return; + + gpe = get_gpu_pci_dev_and_pe(npe, &gpdev); + if (!gpe) + return; + + for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) { + /* Nothing to do if the PE is 
already connected. */ + if (gpe->peers[i] == npe) + return; + + if (!gpe->peers[i]) + avail = i; + } + + if (WARN_ON(avail < 0)) + return; + + gpe->peers[avail] = npe; + gpe->flags |= PNV_IODA_PE_PEER; + + /* + * We assume that the NPU devices only have a single peer PE + * (the GPU PCIe device PE). + */ + npe->peers[0] = gpe; + npe->flags |= PNV_IODA_PE_PEER; +} + +/* + * For the NPU we want to point the TCE table at the same table as the + * real PCI device. + */ +static void pnv_npu_disable_bypass(struct pnv_ioda_pe *npe) +{ + struct pnv_phb *phb = npe->phb; + struct pci_dev *gpdev; + struct pnv_ioda_pe *gpe; + void *addr; + unsigned int size; + int64_t rc; + + /* + * Find the associated PCI devices and get the dma window + * information from there. + */ + if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV)) + return; + + gpe = get_gpu_pci_dev_and_pe(npe, &gpdev); + if (!gpe) + return; + + addr = (void *)gpe->table_group.tables[0]->it_base; + size = gpe->table_group.tables[0]->it_size << 3; + rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number, + npe->pe_number, 1, __pa(addr), + size, 0x1000); + if (rc != OPAL_SUCCESS) + pr_warn("%s: Error %lld setting DMA window on PHB#%d-PE#%d\n", + __func__, rc, phb->hose->global_number, npe->pe_number); + + /* + * We don't initialise npu_pe->tce32_table as we always use + * dma_npu_ops which are nops. + */ + set_dma_ops(&npe->pdev->dev, &dma_npu_ops); +} + +/* + * Enable/disable bypass mode on the NPU. The NPU only supports one + * window per link, so bypass needs to be explicitly enabled or + * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be + * active at the same time. 
+ */ +int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe, bool enable) +{ + struct pnv_phb *phb = npe->phb; + int64_t rc = 0; + + if (phb->type != PNV_PHB_NPU || !npe->pdev) + return -EINVAL; + + if (enable) { + /* Enable the bypass window */ + phys_addr_t top = memblock_end_of_DRAM(); + + npe->tce_bypass_base = 0; + top = roundup_pow_of_two(top); + dev_info(&npe->pdev->dev, "Enabling bypass for PE %d\n", + npe->pe_number); + rc = opal_pci_map_pe_dma_window_real(phb->opal_id, + npe->pe_number, npe->pe_number, + npe->tce_bypass_base, top); + } else { + /* + * Disable the bypass window by replacing it with the + * TCE32 window. + */ + pnv_npu_disable_bypass(npe); + } + + return rc; +} + +int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask) +{ + struct pci_controller *hose = pci_bus_to_host(npdev->bus); + struct pnv_phb *phb = hose->private_data; + struct pci_dn *pdn = pci_get_pdn(npdev); + struct pnv_ioda_pe *npe, *gpe; + struct pci_dev *gpdev; + uint64_t top; + bool bypass = false; + + if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) + return -ENXIO; + + /* We only do bypass if it's enabled on the linked device */ + npe = &phb->ioda.pe_array[pdn->pe_number]; + gpe = get_gpu_pci_dev_and_pe(npe, &gpdev); + if (!gpe) + return -ENODEV; + + if (gpe->tce_bypass_enabled) { + top = gpe->tce_bypass_base + memblock_end_of_DRAM() - 1; + bypass = (dma_mask >= top); + } + + if (bypass) + dev_info(&npdev->dev, "Using 64-bit DMA iommu bypass\n"); + else + dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n"); + + pnv_npu_dma_set_bypass(npe, bypass); + *npdev->dev.dma_mask = dma_mask; + + return 0; +} diff --git a/arch/powerpc/platforms/powernv/opal-kmsg.c b/arch/powerpc/platforms/powernv/opal-kmsg.c new file mode 100644 index 000000000..6f1214d4d --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-kmsg.c @@ -0,0 +1,75 @@ +/* + * kmsg dumper that ensures the OPAL console fully flushes panic messages + * + * Author: Russell Currey <ruscur@russell.cc> + * + * 
Copyright 2015 IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include <linux/kmsg_dump.h> + +#include <asm/opal.h> +#include <asm/opal-api.h> + +/* + * Console output is controlled by OPAL firmware. The kernel regularly calls + * OPAL_POLL_EVENTS, which flushes some console output. In a panic state, + * however, the kernel no longer calls OPAL_POLL_EVENTS and the panic message + * may not be completely printed. This function does not actually dump the + * message, it just ensures that OPAL completely flushes the console buffer. + */ +static void force_opal_console_flush(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason) +{ + int i; + int64_t ret; + + /* + * Outside of a panic context the pollers will continue to run, + * so we don't need to do any special flushing. + */ + if (reason != KMSG_DUMP_PANIC) + return; + + if (opal_check_token(OPAL_CONSOLE_FLUSH)) { + ret = opal_console_flush(0); + + if (ret == OPAL_UNSUPPORTED || ret == OPAL_PARAMETER) + return; + + /* Incrementally flush until there's nothing left */ + while (opal_console_flush(0) != OPAL_SUCCESS); + } else { + /* + * If OPAL_CONSOLE_FLUSH is not implemented in the firmware, + * the console can still be flushed by calling the polling + * function enough times to flush the buffer. We don't know + * how much output still needs to be flushed, but we can be + * generous since the kernel is in panic and doesn't need + * to do much else. 
+ */ + printk(KERN_NOTICE "opal: OPAL_CONSOLE_FLUSH missing.\n"); + for (i = 0; i < 1024; i++) { + opal_poll_events(NULL); + } + } +} + +static struct kmsg_dumper opal_kmsg_dumper = { + .dump = force_opal_console_flush +}; + +void __init opal_kmsg_init(void) +{ + int rc; + + /* Add our dumper to the list */ + rc = kmsg_dump_register(&opal_kmsg_dumper); + if (rc != 0) + pr_err("opal: kmsg_dump_register failed; returned %d\n", rc); +} diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c index 4ece8e40d..e315e704c 100644 --- a/arch/powerpc/platforms/powernv/opal-prd.c +++ b/arch/powerpc/platforms/powernv/opal-prd.c @@ -434,7 +434,6 @@ static const struct of_device_id opal_prd_match[] = { static struct platform_driver opal_prd_driver = { .driver = { .name = "opal-prd", - .owner = THIS_MODULE, .of_match_table = opal_prd_match, }, .probe = opal_prd_probe, diff --git a/arch/powerpc/platforms/powernv/opal-rtc.c b/arch/powerpc/platforms/powernv/opal-rtc.c index 37dbee157..f8868864f 100644 --- a/arch/powerpc/platforms/powernv/opal-rtc.c +++ b/arch/powerpc/platforms/powernv/opal-rtc.c @@ -31,8 +31,7 @@ static void opal_to_tm(u32 y_m_d, u64 h_m_s_ms, struct rtc_time *tm) tm->tm_hour = bcd2bin((h_m_s_ms >> 56) & 0xff); tm->tm_min = bcd2bin((h_m_s_ms >> 48) & 0xff); tm->tm_sec = bcd2bin((h_m_s_ms >> 40) & 0xff); - - GregorianDay(tm); + tm->tm_wday = -1; } unsigned long __init opal_get_boot_time(void) @@ -51,7 +50,7 @@ unsigned long __init opal_get_boot_time(void) rc = opal_rtc_read(&__y_m_d, &__h_m_s_ms); if (rc == OPAL_BUSY_EVENT) opal_poll_events(NULL); - else + else if (rc == OPAL_BUSY) mdelay(10); } if (rc != OPAL_SUCCESS) diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index b7a464fef..e45b88a5d 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -301,3 +301,4 @@ OPAL_CALL(opal_flash_erase, 
OPAL_FLASH_ERASE); OPAL_CALL(opal_prd_msg, OPAL_PRD_MSG); OPAL_CALL(opal_leds_get_ind, OPAL_LEDS_GET_INDICATOR); OPAL_CALL(opal_leds_set_ind, OPAL_LEDS_SET_INDICATOR); +OPAL_CALL(opal_console_flush, OPAL_CONSOLE_FLUSH); diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c index 7634d1c62..d0ac535cf 100644 --- a/arch/powerpc/platforms/powernv/opal-xscom.c +++ b/arch/powerpc/platforms/powernv/opal-xscom.c @@ -126,7 +126,7 @@ static const struct scom_controller opal_scom_controller = { static int opal_xscom_init(void) { - if (firmware_has_feature(FW_FEATURE_OPALv3)) + if (firmware_has_feature(FW_FEATURE_OPAL)) scom_init(&opal_scom_controller); return 0; } diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 57cffb80b..4e0da5af9 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -98,16 +98,11 @@ int __init early_init_dt_scan_opal(unsigned long node, pr_debug("OPAL Entry = 0x%llx (sizep=%p runtimesz=%d)\n", opal.size, sizep, runtimesz); - powerpc_firmware_features |= FW_FEATURE_OPAL; if (of_flat_dt_is_compatible(node, "ibm,opal-v3")) { - powerpc_firmware_features |= FW_FEATURE_OPALv2; - powerpc_firmware_features |= FW_FEATURE_OPALv3; - pr_info("OPAL V3 detected !\n"); - } else if (of_flat_dt_is_compatible(node, "ibm,opal-v2")) { - powerpc_firmware_features |= FW_FEATURE_OPALv2; - pr_info("OPAL V2 detected !\n"); + powerpc_firmware_features |= FW_FEATURE_OPAL; + pr_info("OPAL detected !\n"); } else { - pr_info("OPAL V1 detected !\n"); + panic("OPAL != V3 detected, no longer supported.\n"); } /* Reinit all cores with the right endian */ @@ -352,17 +347,15 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len) * enough room and be done with it */ spin_lock_irqsave(&opal_write_lock, flags); - if (firmware_has_feature(FW_FEATURE_OPALv2)) { - rc = opal_console_write_buffer_space(vtermno, &olen); - len = 
be64_to_cpu(olen); - if (rc || len < total_len) { - spin_unlock_irqrestore(&opal_write_lock, flags); - /* Closed -> drop characters */ - if (rc) - return total_len; - opal_poll_events(NULL); - return -EAGAIN; - } + rc = opal_console_write_buffer_space(vtermno, &olen); + len = be64_to_cpu(olen); + if (rc || len < total_len) { + spin_unlock_irqrestore(&opal_write_lock, flags); + /* Closed -> drop characters */ + if (rc) + return total_len; + opal_poll_events(NULL); + return -EAGAIN; } /* We still try to handle partial completions, though they @@ -555,7 +548,7 @@ bool opal_mce_check_early_recovery(struct pt_regs *regs) goto out; if ((regs->nip >= opal.base) && - (regs->nip <= (opal.base + opal.size))) + (regs->nip < (opal.base + opal.size))) recover_addr = find_recovery_address(regs->nip); /* @@ -696,10 +689,7 @@ static int __init opal_init(void) } /* Register OPAL consoles if any ports */ - if (firmware_has_feature(FW_FEATURE_OPALv2)) - consoles = of_find_node_by_path("/ibm,opal/consoles"); - else - consoles = of_node_get(opal_node); + consoles = of_find_node_by_path("/ibm,opal/consoles"); if (consoles) { for_each_child_of_node(consoles, np) { if (strcmp(np->name, "serial")) @@ -758,6 +748,9 @@ static int __init opal_init(void) opal_pdev_init(opal_node, "ibm,opal-flash"); opal_pdev_init(opal_node, "ibm,opal-prd"); + /* Initialise OPAL kmsg dumper for flushing console on panic */ + opal_kmsg_init(); + return 0; } machine_subsys_initcall(powernv, opal_init); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index e40d07146..f90dc0439 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -116,16 +116,6 @@ static int __init iommu_setup(char *str) } early_param("iommu", iommu_setup); -/* - * stdcix is only supposed to be used in hypervisor real mode as per - * the architecture spec - */ -static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr) -{ - __asm__ 
__volatile__("stdcix %0,0,%1" - : : "r" (val), "r" (paddr) : "memory"); -} - static inline bool pnv_pci_is_mem_pref_64(unsigned long flags) { return ((flags & (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)) == @@ -344,7 +334,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) return; } - if (!firmware_has_feature(FW_FEATURE_OPALv3)) { + if (!firmware_has_feature(FW_FEATURE_OPAL)) { pr_info(" Firmware too old to support M64 window\n"); return; } @@ -357,6 +347,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) } res = &hose->mem_resources[1]; + res->name = dn->full_name; res->start = of_translate_address(dn, r + 2); res->end = res->start + of_read_number(r + 4, 2) - 1; res->flags = (IORESOURCE_MEM | IORESOURCE_MEM_64 | IORESOURCE_PREFETCH); @@ -780,8 +771,12 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) return -ENXIO; } - /* Configure PELTV */ - pnv_ioda_set_peltv(phb, pe, true); + /* + * Configure PELTV. NPUs don't have a PELTV table so skip + * configuration on them. 
+ */ + if (phb->type != PNV_PHB_NPU) + pnv_ioda_set_peltv(phb, pe, true); /* Setup reverse map */ for (rid = pe->rid; rid < rid_end; rid++) @@ -924,7 +919,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) } #endif /* CONFIG_PCI_IOV */ -#if 0 static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) { struct pci_controller *hose = pci_bus_to_host(dev->bus); @@ -941,11 +935,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) if (pdn->pe_number != IODA_INVALID_PE) return NULL; - /* PE#0 has been pre-set */ - if (dev->bus->number == 0) - pe_num = 0; - else - pe_num = pnv_ioda_alloc_pe(phb); + pe_num = pnv_ioda_alloc_pe(phb); if (pe_num == IODA_INVALID_PE) { pr_warning("%s: Not enough PE# available, disabling device\n", pci_name(dev)); @@ -963,6 +953,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) pci_dev_get(dev); pdn->pcidev = dev; pdn->pe_number = pe_num; + pe->flags = PNV_IODA_PE_DEV; pe->pdev = dev; pe->pbus = NULL; pe->tce32_seg = -1; @@ -993,7 +984,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) return pe; } -#endif /* Useful for SRIOV case */ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) { @@ -1007,6 +997,7 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) pci_name(dev)); continue; } + pdn->pcidev = dev; pdn->pe_number = pe->pe_number; pe->dma_weight += pnv_ioda_dma_weight(dev); if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) @@ -1083,6 +1074,77 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) pnv_ioda_link_pe_by_weight(phb, pe); } +static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev) +{ + int pe_num, found_pe = false, rc; + long rid; + struct pnv_ioda_pe *pe; + struct pci_dev *gpu_pdev; + struct pci_dn *npu_pdn; + struct pci_controller *hose = pci_bus_to_host(npu_pdev->bus); + struct pnv_phb *phb = hose->private_data; + + /* + * Due 
to a hardware errata PE#0 on the NPU is reserved for + * error handling. This means we only have three PEs remaining + * which need to be assigned to four links, implying some + * links must share PEs. + * + * To achieve this we assign PEs such that NPUs linking the + * same GPU get assigned the same PE. + */ + gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev); + for (pe_num = 0; pe_num < phb->ioda.total_pe; pe_num++) { + pe = &phb->ioda.pe_array[pe_num]; + if (!pe->pdev) + continue; + + if (pnv_pci_get_gpu_dev(pe->pdev) == gpu_pdev) { + /* + * This device has the same peer GPU so should + * be assigned the same PE as the existing + * peer NPU. + */ + dev_info(&npu_pdev->dev, + "Associating to existing PE %d\n", pe_num); + pci_dev_get(npu_pdev); + npu_pdn = pci_get_pdn(npu_pdev); + rid = npu_pdev->bus->number << 8 | npu_pdn->devfn; + npu_pdn->pcidev = npu_pdev; + npu_pdn->pe_number = pe_num; + pe->dma_weight += pnv_ioda_dma_weight(npu_pdev); + phb->ioda.pe_rmap[rid] = pe->pe_number; + + /* Map the PE to this link */ + rc = opal_pci_set_pe(phb->opal_id, pe_num, rid, + OpalPciBusAll, + OPAL_COMPARE_RID_DEVICE_NUMBER, + OPAL_COMPARE_RID_FUNCTION_NUMBER, + OPAL_MAP_PE); + WARN_ON(rc != OPAL_SUCCESS); + found_pe = true; + break; + } + } + + if (!found_pe) + /* + * Could not find an existing PE so allocate a new + * one. + */ + return pnv_ioda_setup_dev_PE(npu_pdev); + else + return pe; +} + +static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus) +{ + struct pci_dev *pdev; + + list_for_each_entry(pdev, &bus->devices, bus_list) + pnv_ioda_setup_npu_PE(pdev); +} + static void pnv_ioda_setup_PEs(struct pci_bus *bus) { struct pci_dev *dev; @@ -1119,7 +1181,17 @@ static void pnv_pci_ioda_setup_PEs(void) if (phb->reserve_m64_pe) phb->reserve_m64_pe(hose->bus, NULL, true); - pnv_ioda_setup_PEs(hose->bus); + /* + * On NPU PHB, we expect separate PEs for individual PCI + * functions. PCI bus dependent PEs are required for the + * remaining types of PHBs. 
+ */ + if (phb->type == PNV_PHB_NPU) { + /* PE#0 is needed for error reporting */ + pnv_ioda_reserve_pe(phb, 0); + pnv_ioda_setup_npu_PEs(hose->bus); + } else + pnv_ioda_setup_PEs(hose->bus); } } @@ -1578,6 +1650,8 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) struct pnv_ioda_pe *pe; uint64_t top; bool bypass = false; + struct pci_dev *linked_npu_dev; + int i; if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) return -ENODEV;; @@ -1596,6 +1670,18 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) set_dma_ops(&pdev->dev, &dma_iommu_ops); } *pdev->dev.dma_mask = dma_mask; + + /* Update peer npu devices */ + if (pe->flags & PNV_IODA_PE_PEER) + for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) { + if (!pe->peers[i]) + continue; + + linked_npu_dev = pe->peers[i]->pdev; + if (dma_get_mask(&linked_npu_dev->dev) != dma_mask) + dma_set_mask(&linked_npu_dev->dev, dma_mask); + } + return 0; } @@ -1740,12 +1826,23 @@ static inline void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe) /* 01xb - invalidate TCEs that match the specified PE# */ unsigned long val = (0x4ull << 60) | (pe->pe_number & 0xFF); struct pnv_phb *phb = pe->phb; + struct pnv_ioda_pe *npe; + int i; if (!phb->ioda.tce_inval_reg) return; mb(); /* Ensure above stores are visible */ __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg); + + if (pe->flags & PNV_IODA_PE_PEER) + for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) { + npe = pe->peers[i]; + if (!npe || npe->phb->type != PNV_PHB_NPU) + continue; + + pnv_npu_tce_invalidate_entire(npe); + } } static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm, @@ -1780,15 +1877,28 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, struct iommu_table_group_link *tgl; list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { + struct pnv_ioda_pe *npe; struct pnv_ioda_pe *pe = container_of(tgl->table_group, struct pnv_ioda_pe, table_group); __be64 __iomem *invalidate = rm ? 
(__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys : pe->phb->ioda.tce_inval_reg; + int i; pnv_pci_ioda2_do_tce_invalidate(pe->pe_number, rm, invalidate, tbl->it_page_shift, index, npages); + + if (pe->flags & PNV_IODA_PE_PEER) + /* Invalidate PEs using the same TCE table */ + for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) { + npe = pe->peers[i]; + if (!npe || npe->phb->type != PNV_PHB_NPU) + continue; + + pnv_npu_tce_invalidate(npe, tbl, index, + npages, rm); + } } } @@ -2436,10 +2546,17 @@ static void pnv_ioda_setup_dma(struct pnv_phb *phb) pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n", pe->dma_weight, segs); pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs); - } else { + } else if (phb->type == PNV_PHB_IODA2) { pe_info(pe, "Assign DMA32 space\n"); segs = 0; pnv_pci_ioda2_setup_dma_pe(phb, pe); + } else if (phb->type == PNV_PHB_NPU) { + /* + * We initialise the DMA space for an NPU PHB + * after setup of the PHB is complete as we + * point the NPU TVT to the same location + * as the PHB3 TVT. 
+ */ } remaining -= segs; @@ -2881,6 +2998,11 @@ static void pnv_pci_ioda_setup_seg(void) list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { phb = hose->private_data; + + /* NPU PHB does not support IO or MMIO segmentation */ + if (phb->type == PNV_PHB_NPU) + continue; + list_for_each_entry(pe, &phb->ioda.pe_list, list) { pnv_ioda_setup_pe_seg(hose, pe); } @@ -2920,6 +3042,27 @@ static void pnv_pci_ioda_create_dbgfs(void) #endif /* CONFIG_DEBUG_FS */ } +static void pnv_npu_ioda_fixup(void) +{ + bool enable_bypass; + struct pci_controller *hose, *tmp; + struct pnv_phb *phb; + struct pnv_ioda_pe *pe; + + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + phb = hose->private_data; + if (phb->type != PNV_PHB_NPU) + continue; + + list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) { + enable_bypass = dma_get_mask(&pe->pdev->dev) == + DMA_BIT_MASK(64); + pnv_npu_init_dma_pe(pe); + pnv_npu_dma_set_bypass(pe, enable_bypass); + } + } +} + static void pnv_pci_ioda_fixup(void) { pnv_pci_ioda_setup_PEs(); @@ -2932,6 +3075,9 @@ static void pnv_pci_ioda_fixup(void) eeh_init(); eeh_addr_cache_build(); #endif + + /* Link NPU IODA tables to their PCI devices. 
*/ + pnv_npu_ioda_fixup(); } /* @@ -3047,6 +3193,19 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { .shutdown = pnv_pci_ioda_shutdown, }; +static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { + .dma_dev_setup = pnv_pci_dma_dev_setup, +#ifdef CONFIG_PCI_MSI + .setup_msi_irqs = pnv_setup_msi_irqs, + .teardown_msi_irqs = pnv_teardown_msi_irqs, +#endif + .enable_device_hook = pnv_pci_enable_device_hook, + .window_alignment = pnv_pci_window_alignment, + .reset_secondary_bus = pnv_pci_reset_secondary_bus, + .dma_set_mask = pnv_npu_dma_set_mask, + .shutdown = pnv_pci_ioda_shutdown, +}; + static void __init pnv_pci_init_ioda_phb(struct device_node *np, u64 hub_id, int ioda_type) { @@ -3102,6 +3261,8 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->model = PNV_PHB_MODEL_P7IOC; else if (of_device_is_compatible(np, "ibm,power8-pciex")) phb->model = PNV_PHB_MODEL_PHB3; + else if (of_device_is_compatible(np, "ibm,power8-npu-pciex")) + phb->model = PNV_PHB_MODEL_NPU; else phb->model = PNV_PHB_MODEL_UNKNOWN; @@ -3202,7 +3363,11 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, * the child P2P bridges) can form individual PE. 
*/ ppc_md.pcibios_fixup = pnv_pci_ioda_fixup; - hose->controller_ops = pnv_pci_ioda_controller_ops; + + if (phb->type == PNV_PHB_NPU) + hose->controller_ops = pnv_npu_ioda_controller_ops; + else + hose->controller_ops = pnv_pci_ioda_controller_ops; #ifdef CONFIG_PCI_IOV ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources; @@ -3237,6 +3402,11 @@ void __init pnv_pci_init_ioda2_phb(struct device_node *np) pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2); } +void __init pnv_pci_init_npu_phb(struct device_node *np) +{ + pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU); +} + void __init pnv_pci_init_ioda_hub(struct device_node *np) { struct device_node *phbn; diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index ad8c3f4a5..b1ef84a6c 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -1,8 +1,6 @@ /* * Support PCI/PCIe on PowerNV platforms * - * Currently supports only P5IOC2 - * * Copyright 2011 Benjamin Herrenschmidt, IBM Corp. 
* * This program is free software; you can redistribute it and/or @@ -833,6 +831,10 @@ void __init pnv_pci_init(void) for_each_compatible_node(np, NULL, "ibm,ioda2-phb") pnv_pci_init_ioda2_phb(np); + /* Look for NPU PHBs */ + for_each_compatible_node(np, NULL, "ibm,ioda2-npu-phb") + pnv_pci_init_npu_phb(np); + /* Setup the linkage between OF nodes and PHBs */ pci_devs_phb_init(); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 36a99feab..00691a9b9 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -7,6 +7,7 @@ enum pnv_phb_type { PNV_PHB_P5IOC2 = 0, PNV_PHB_IODA1 = 1, PNV_PHB_IODA2 = 2, + PNV_PHB_NPU = 3, }; /* Precise PHB model for error management */ @@ -15,6 +16,7 @@ enum pnv_phb_model { PNV_PHB_MODEL_P5IOC2, PNV_PHB_MODEL_P7IOC, PNV_PHB_MODEL_PHB3, + PNV_PHB_MODEL_NPU, }; #define PNV_PCI_DIAG_BUF_SIZE 8192 @@ -24,6 +26,7 @@ enum pnv_phb_model { #define PNV_IODA_PE_MASTER (1 << 3) /* Master PE in compound case */ #define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */ #define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */ +#define PNV_IODA_PE_PEER (1 << 6) /* PE has peers */ /* Data associated with a PE, including IOMMU tracking etc.. */ struct pnv_phb; @@ -31,6 +34,9 @@ struct pnv_ioda_pe { unsigned long flags; struct pnv_phb *phb; +#define PNV_IODA_MAX_PEER_PES 8 + struct pnv_ioda_pe *peers[PNV_IODA_MAX_PEER_PES]; + /* A PE can be associated with a single device or an * entire bus (& children). In the former case, pdev * is populated, in the later case, pbus is. 
@@ -229,6 +235,7 @@ extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, extern void pnv_pci_init_p5ioc2_hub(struct device_node *np); extern void pnv_pci_init_ioda_hub(struct device_node *np); extern void pnv_pci_init_ioda2_phb(struct device_node *np); +extern void pnv_pci_init_npu_phb(struct device_node *np); extern void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl, __be64 *startp, __be64 *endp, bool rm); extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev); @@ -239,4 +246,16 @@ extern void pnv_pci_dma_bus_setup(struct pci_bus *bus); extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type); extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); +/* Nvlink functions */ +extern void pnv_npu_tce_invalidate_entire(struct pnv_ioda_pe *npe); +extern void pnv_npu_tce_invalidate(struct pnv_ioda_pe *npe, + struct iommu_table *tbl, + unsigned long index, + unsigned long npages, + bool rm); +extern void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe); +extern void pnv_npu_setup_dma_pe(struct pnv_ioda_pe *npe); +extern int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe, bool enabled); +extern int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask); + #endif /* __POWERNV_PCI_H */ diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index a9a8fa37a..1acb0c72d 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -90,12 +90,8 @@ static void pnv_show_cpuinfo(struct seq_file *m) if (root) model = of_get_property(root, "model", NULL); seq_printf(m, "machine\t\t: PowerNV %s\n", model); - if (firmware_has_feature(FW_FEATURE_OPALv3)) - seq_printf(m, "firmware\t: OPAL v3\n"); - else if (firmware_has_feature(FW_FEATURE_OPALv2)) - seq_printf(m, "firmware\t: OPAL v2\n"); - else if (firmware_has_feature(FW_FEATURE_OPAL)) - seq_printf(m, "firmware\t: OPAL v1\n"); + if (firmware_has_feature(FW_FEATURE_OPAL)) + seq_printf(m, "firmware\t: OPAL\n"); else 
seq_printf(m, "firmware\t: BML\n"); of_node_put(root); @@ -224,9 +220,9 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) { xics_kexec_teardown_cpu(secondary); - /* On OPAL v3, we return all CPUs to firmware */ + /* On OPAL, we return all CPUs to firmware */ - if (!firmware_has_feature(FW_FEATURE_OPALv3)) + if (!firmware_has_feature(FW_FEATURE_OPAL)) return; if (secondary) { diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index ca264833e..ad7b1a3db 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -61,14 +61,15 @@ static int pnv_smp_kick_cpu(int nr) unsigned long start_here = __pa(ppc_function_entry(generic_secondary_smp_init)); long rc; + uint8_t status; BUG_ON(nr < 0 || nr >= NR_CPUS); /* - * If we already started or OPALv2 is not supported, we just + * If we already started or OPAL is not supported, we just * kick the CPU via the PACA */ - if (paca[nr].cpu_start || !firmware_has_feature(FW_FEATURE_OPALv2)) + if (paca[nr].cpu_start || !firmware_has_feature(FW_FEATURE_OPAL)) goto kick; /* @@ -77,55 +78,42 @@ static int pnv_smp_kick_cpu(int nr) * first time. 
OPAL v3 allows us to query OPAL to know if it * has the CPUs, so we do that */ - if (firmware_has_feature(FW_FEATURE_OPALv3)) { - uint8_t status; - - rc = opal_query_cpu_status(pcpu, &status); - if (rc != OPAL_SUCCESS) { - pr_warn("OPAL Error %ld querying CPU %d state\n", - rc, nr); - return -ENODEV; - } + rc = opal_query_cpu_status(pcpu, &status); + if (rc != OPAL_SUCCESS) { + pr_warn("OPAL Error %ld querying CPU %d state\n", rc, nr); + return -ENODEV; + } - /* - * Already started, just kick it, probably coming from - * kexec and spinning - */ - if (status == OPAL_THREAD_STARTED) - goto kick; + /* + * Already started, just kick it, probably coming from + * kexec and spinning + */ + if (status == OPAL_THREAD_STARTED) + goto kick; - /* - * Available/inactive, let's kick it - */ - if (status == OPAL_THREAD_INACTIVE) { - pr_devel("OPAL: Starting CPU %d (HW 0x%x)...\n", - nr, pcpu); - rc = opal_start_cpu(pcpu, start_here); - if (rc != OPAL_SUCCESS) { - pr_warn("OPAL Error %ld starting CPU %d\n", - rc, nr); - return -ENODEV; - } - } else { - /* - * An unavailable CPU (or any other unknown status) - * shouldn't be started. It should also - * not be in the possible map but currently it can - * happen - */ - pr_devel("OPAL: CPU %d (HW 0x%x) is unavailable" - " (status %d)...\n", nr, pcpu, status); + /* + * Available/inactive, let's kick it + */ + if (status == OPAL_THREAD_INACTIVE) { + pr_devel("OPAL: Starting CPU %d (HW 0x%x)...\n", nr, pcpu); + rc = opal_start_cpu(pcpu, start_here); + if (rc != OPAL_SUCCESS) { + pr_warn("OPAL Error %ld starting CPU %d\n", rc, nr); return -ENODEV; } } else { /* - * On OPAL v2, we just kick it and hope for the best, - * we must not test the error from opal_start_cpu() or - * we would fail to get CPUs from kexec. + * An unavailable CPU (or any other unknown status) + * shouldn't be started. 
It should also + * not be in the possible map but currently it can + * happen */ - opal_start_cpu(pcpu, start_here); + pr_devel("OPAL: CPU %d (HW 0x%x) is unavailable" + " (status %d)...\n", nr, pcpu, status); + return -ENODEV; } - kick: + +kick: return smp_generic_kick_cpu(nr); } |