From d0b2f91bede3bd5e3d24dd6803e56eee959c1797 Mon Sep 17 00:00:00 2001 From: André Fabian Silva Delgado Date: Thu, 20 Oct 2016 00:10:27 -0300 Subject: Linux-libre 4.8.2-gnu --- arch/powerpc/platforms/powernv/Makefile | 1 + arch/powerpc/platforms/powernv/eeh-powernv.c | 49 +- arch/powerpc/platforms/powernv/idle.c | 188 +++- arch/powerpc/platforms/powernv/npu-dma.c | 16 +- arch/powerpc/platforms/powernv/opal-async.c | 5 + arch/powerpc/platforms/powernv/opal-irqchip.c | 3 +- .../powerpc/platforms/powernv/opal-memory-errors.c | 2 +- arch/powerpc/platforms/powernv/opal-sensor.c | 2 +- arch/powerpc/platforms/powernv/opal-sysparam.c | 4 +- arch/powerpc/platforms/powernv/opal-tracepoints.c | 1 + arch/powerpc/platforms/powernv/opal-wrappers.S | 74 +- arch/powerpc/platforms/powernv/opal.c | 31 +- arch/powerpc/platforms/powernv/pci-cxl.c | 385 ++++++++ arch/powerpc/platforms/powernv/pci-ioda.c | 963 ++++++++++++--------- arch/powerpc/platforms/powernv/pci.c | 134 ++- arch/powerpc/platforms/powernv/pci.h | 45 +- arch/powerpc/platforms/powernv/powernv.h | 1 + arch/powerpc/platforms/powernv/setup.c | 16 +- arch/powerpc/platforms/powernv/smp.c | 4 +- 19 files changed, 1395 insertions(+), 529 deletions(-) create mode 100644 arch/powerpc/platforms/powernv/pci-cxl.c (limited to 'arch/powerpc/platforms/powernv') diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index cd9711e72..b5d98cb3f 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -6,6 +6,7 @@ obj-y += opal-kmsg.o obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o +obj-$(CONFIG_CXL_BASE) += pci-cxl.o obj-$(CONFIG_EEH) += eeh-powernv.o obj-$(CONFIG_PPC_SCOM) += opal-xscom.o obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 9226df11b..86544ea85 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "powernv.h" #include "pci.h" @@ -717,12 +718,12 @@ static int pnv_eeh_get_state(struct eeh_pe *pe, int *delay) return ret; } -static s64 pnv_eeh_phb_poll(struct pnv_phb *phb) +static s64 pnv_eeh_poll(unsigned long id) { s64 rc = OPAL_HARDWARE; while (1) { - rc = opal_pci_poll(phb->opal_id); + rc = opal_pci_poll(id); if (rc <= 0) break; @@ -762,7 +763,7 @@ int pnv_eeh_phb_reset(struct pci_controller *hose, int option) * reset followed by hot reset on root bus. So we also * need the PCI bus settlement delay. */ - rc = pnv_eeh_phb_poll(phb); + rc = pnv_eeh_poll(phb->opal_id); if (option == EEH_RESET_DEACTIVATE) { if (system_state < SYSTEM_RUNNING) udelay(1000 * EEH_PE_RST_SETTLE_TIME); @@ -805,7 +806,7 @@ static int pnv_eeh_root_reset(struct pci_controller *hose, int option) goto out; /* Poll state of the PHB until the request is done */ - rc = pnv_eeh_phb_poll(phb); + rc = pnv_eeh_poll(phb->opal_id); if (option == EEH_RESET_DEACTIVATE) msleep(EEH_PE_RST_SETTLE_TIME); out: @@ -815,7 +816,7 @@ out: return 0; } -static int pnv_eeh_bridge_reset(struct pci_dev *dev, int option) +static int __pnv_eeh_bridge_reset(struct pci_dev *dev, int option) { struct pci_dn *pdn = pci_get_pdn_by_devfn(dev->bus, dev->devfn); struct eeh_dev *edev = pdn_to_eeh_dev(pdn); @@ -866,6 +867,44 @@ static int pnv_eeh_bridge_reset(struct pci_dev *dev, int option) return 0; } +static int pnv_eeh_bridge_reset(struct pci_dev *pdev, int option) +{ + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + struct pnv_phb *phb = hose->private_data; + struct device_node *dn = pci_device_to_OF_node(pdev); + uint64_t id = PCI_SLOT_ID(phb->opal_id, + (pdev->bus->number << 8) | pdev->devfn); + uint8_t scope; + int64_t rc; + + /* Hot reset to the bus if firmware cannot handle */ + if (!dn || !of_get_property(dn, "ibm,reset-by-firmware", NULL)) + return __pnv_eeh_bridge_reset(pdev, option); + + switch (option) { + case EEH_RESET_FUNDAMENTAL: + scope = OPAL_RESET_PCI_FUNDAMENTAL; + break; + case EEH_RESET_HOT: + scope = OPAL_RESET_PCI_HOT; + break; + case EEH_RESET_DEACTIVATE: + return 0; + default: + dev_dbg(&pdev->dev, "%s: Unsupported reset %d\n", + __func__, option); + return -EINVAL; + } + + rc = opal_pci_reset(id, scope, OPAL_ASSERT_RESET); + if (rc <= OPAL_SUCCESS) + goto out; + + rc = pnv_eeh_poll(id); +out: + return (rc == OPAL_SUCCESS) ? 0 : -EIO; +} + void pnv_pci_reset_secondary_bus(struct pci_dev *dev) { struct pci_controller *hose; diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index fcc8b6861..479c25601 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -27,9 +27,12 @@ #include "powernv.h" #include "subcore.h" +/* Power ISA 3.0 allows for stop states 0x0 - 0xF */ +#define MAX_STOP_STATE 0xF + static u32 supported_cpuidle_states; -int pnv_save_sprs_for_winkle(void) +static int pnv_save_sprs_for_deep_states(void) { int cpu; int rc; @@ -50,15 +53,19 @@ int pnv_save_sprs_for_winkle(void) uint64_t pir = get_hard_smp_processor_id(cpu); uint64_t hsprg0_val = (uint64_t)&paca[cpu]; - /* - * HSPRG0 is used to store the cpu's pointer to paca. Hence last - * 3 bits are guaranteed to be 0. Program slw to restore HSPRG0 - * with 63rd bit set, so that when a thread wakes up at 0x100 we - * can use this bit to distinguish between fastsleep and - * deep winkle. - */ - hsprg0_val |= 1; - + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + /* + * HSPRG0 is used to store the cpu's pointer to paca. + * Hence last 3 bits are guaranteed to be 0. Program + * slw to restore HSPRG0 with 63rd bit set, so that + * when a thread wakes up at 0x100 we can use this bit + * to distinguish between fastsleep and deep winkle. + * This is not necessary with stop/psscr since PLS + * field of psscr indicates which state we are waking + * up from. + */ + hsprg0_val |= 1; + } rc = opal_slw_set_reg(pir, SPRN_HSPRG0, hsprg0_val); if (rc != 0) return rc; @@ -130,8 +137,8 @@ static void pnv_alloc_idle_core_states(void) update_subcore_sibling_mask(); - if (supported_cpuidle_states & OPAL_PM_WINKLE_ENABLED) - pnv_save_sprs_for_winkle(); + if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) + pnv_save_sprs_for_deep_states(); } u32 pnv_get_supported_cpuidle_states(void) @@ -230,43 +237,162 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600, show_fastsleep_workaround_applyonce, store_fastsleep_workaround_applyonce); -static int __init pnv_init_idle_states(void) + +/* + * Used for ppc_md.power_save which needs a function with no parameters + */ +static void power9_idle(void) { - struct device_node *power_mgt; - int dt_idle_states; - u32 *flags; - int i; + /* Requesting stop state 0 */ + power9_idle_stop(0); +} +/* + * First deep stop state. Used to figure out when to save/restore + * hypervisor context. + */ +u64 pnv_first_deep_stop_state = MAX_STOP_STATE; - supported_cpuidle_states = 0; +/* + * Deepest stop idle state. Used when a cpu is offlined + */ +u64 pnv_deepest_stop_state; - if (cpuidle_disable != IDLE_NO_OVERRIDE) - goto out; +/* + * Power ISA 3.0 idle initialization. + * + * POWER ISA 3.0 defines a new SPR Processor stop Status and Control + * Register (PSSCR) to control idle behavior. + * + * PSSCR layout: + * ---------------------------------------------------------- + * | PLS | /// | SD | ESL | EC | PSLL | /// | TR | MTL | RL | + * ---------------------------------------------------------- + * 0 4 41 42 43 44 48 54 56 60 + * + * PSSCR key fields: + * Bits 0:3 - Power-Saving Level Status (PLS). This field indicates the + * lowest power-saving state the thread entered since stop instruction was + * last executed. + * + * Bit 41 - Status Disable(SD) + * 0 - Shows PLS entries + * 1 - PLS entries are all 0 + * + * Bit 42 - Enable State Loss + * 0 - No state is lost irrespective of other fields + * 1 - Allows state loss + * + * Bit 43 - Exit Criterion + * 0 - Exit from power-save mode on any interrupt + * 1 - Exit from power-save mode controlled by LPCR's PECE bits + * + * Bits 44:47 - Power-Saving Level Limit + * This limits the power-saving level that can be entered into. + * + * Bits 60:63 - Requested Level + * Used to specify which power-saving level must be entered on executing + * stop instruction + * + * @np: /ibm,opal/power-mgt device node + * @flags: cpu-idle-state-flags array + * @dt_idle_states: Number of idle state entries + * Returns 0 on success + */ +static int __init pnv_arch300_idle_init(struct device_node *np, u32 *flags, + int dt_idle_states) +{ + u64 *psscr_val = NULL; + int rc = 0, i; - if (!firmware_has_feature(FW_FEATURE_OPAL)) + psscr_val = kcalloc(dt_idle_states, sizeof(*psscr_val), + GFP_KERNEL); + if (!psscr_val) { + rc = -1; + goto out; + } + if (of_property_read_u64_array(np, + "ibm,cpu-idle-state-psscr", + psscr_val, dt_idle_states)) { + pr_warn("cpuidle-powernv: missing ibm,cpu-idle-states-psscr in DT\n"); + rc = -1; goto out; + } - power_mgt = of_find_node_by_path("/ibm,opal/power-mgt"); - if (!power_mgt) { + /* + * Set pnv_first_deep_stop_state and pnv_deepest_stop_state. + * pnv_first_deep_stop_state should be set to the first stop + * level to cause hypervisor state loss. + * pnv_deepest_stop_state should be set to the deepest stop + * stop state. + */ + pnv_first_deep_stop_state = MAX_STOP_STATE; + for (i = 0; i < dt_idle_states; i++) { + u64 psscr_rl = psscr_val[i] & PSSCR_RL_MASK; + + if ((flags[i] & OPAL_PM_LOSE_FULL_CONTEXT) && + (pnv_first_deep_stop_state > psscr_rl)) + pnv_first_deep_stop_state = psscr_rl; + + if (pnv_deepest_stop_state < psscr_rl) + pnv_deepest_stop_state = psscr_rl; + } + +out: + kfree(psscr_val); + return rc; +} + +/* + * Probe device tree for supported idle states + */ +static void __init pnv_probe_idle_states(void) +{ + struct device_node *np; + int dt_idle_states; + u32 *flags = NULL; + int i; + + np = of_find_node_by_path("/ibm,opal/power-mgt"); + if (!np) { pr_warn("opal: PowerMgmt Node not found\n"); goto out; } - dt_idle_states = of_property_count_u32_elems(power_mgt, + dt_idle_states = of_property_count_u32_elems(np, "ibm,cpu-idle-state-flags"); if (dt_idle_states < 0) { pr_warn("cpuidle-powernv: no idle states found in the DT\n"); goto out; } - flags = kzalloc(sizeof(*flags) * dt_idle_states, GFP_KERNEL); - if (of_property_read_u32_array(power_mgt, + flags = kcalloc(dt_idle_states, sizeof(*flags), GFP_KERNEL); + + if (of_property_read_u32_array(np, "ibm,cpu-idle-state-flags", flags, dt_idle_states)) { pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-flags in DT\n"); - goto out_free; + goto out; + } + + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (pnv_arch300_idle_init(np, flags, dt_idle_states)) + goto out; } for (i = 0; i < dt_idle_states; i++) supported_cpuidle_states |= flags[i]; +out: + kfree(flags); +} +static int __init pnv_init_idle_states(void) +{ + + supported_cpuidle_states = 0; + + if (cpuidle_disable != IDLE_NO_OVERRIDE) + goto out; + + pnv_probe_idle_states(); + if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { patch_instruction( (unsigned int *)pnv_fastsleep_workaround_at_entry, @@ -285,8 +411,12 @@ static int __init pnv_init_idle_states(void) } pnv_alloc_idle_core_states(); -out_free: - kfree(flags); + + if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED) + ppc_md.power_save = power7_idle; + else if (supported_cpuidle_states & OPAL_PM_STOP_INST_FAST) + ppc_md.power_save = power9_idle; + out: return 0; } diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 0459e100b..00e1a0195 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -73,7 +73,7 @@ EXPORT_SYMBOL(pnv_pci_get_npu_dev); static void *dma_npu_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, - struct dma_attrs *attrs) + unsigned long attrs) { NPU_DMA_OP_UNSUPPORTED(); return NULL; @@ -81,7 +81,7 @@ static void *dma_npu_alloc(struct device *dev, size_t size, static void dma_npu_free(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, - struct dma_attrs *attrs) + unsigned long attrs) { NPU_DMA_OP_UNSUPPORTED(); } @@ -89,7 +89,7 @@ static void dma_npu_free(struct device *dev, size_t size, static dma_addr_t dma_npu_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction direction, - struct dma_attrs *attrs) + unsigned long attrs) { NPU_DMA_OP_UNSUPPORTED(); return 0; @@ -97,7 +97,7 @@ static dma_addr_t dma_npu_map_page(struct device *dev, struct page *page, static int dma_npu_map_sg(struct device *dev, struct scatterlist *sglist, int nelems, enum dma_data_direction direction, - struct dma_attrs *attrs) + unsigned long attrs) { NPU_DMA_OP_UNSUPPORTED(); return 0; @@ -180,7 +180,7 @@ long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, pe_err(npe, "Failed to configure TCE table, err %lld\n", rc); return rc; } - pnv_pci_ioda2_tce_invalidate_entire(phb, false); + pnv_pci_phb3_tce_invalidate_entire(phb, false); /* Add the table to the list so its TCE cache will get invalidated */ pnv_pci_link_table_and_group(phb->hose->node, num, @@ -204,7 +204,7 @@ long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num) pe_err(npe, "Unmapping failed, ret = %lld\n", rc); return rc; } - pnv_pci_ioda2_tce_invalidate_entire(phb, false); + pnv_pci_phb3_tce_invalidate_entire(phb, false); pnv_pci_unlink_table_and_group(npe->table_group.tables[num], &npe->table_group); @@ -270,7 +270,7 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe) 0 /* bypass base */, top); if (rc == OPAL_SUCCESS) - pnv_pci_ioda2_tce_invalidate_entire(phb, false); + pnv_pci_phb3_tce_invalidate_entire(phb, false); return rc; } @@ -334,7 +334,7 @@ void pnv_npu_take_ownership(struct pnv_ioda_pe *npe) pe_err(npe, "Failed to disable bypass, err %lld\n", rc); return; } - pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false); + pnv_pci_phb3_tce_invalidate_entire(npe->phb, false); } struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe) diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c index bdc8c0c71..83bebeec0 100644 --- a/arch/powerpc/platforms/powernv/opal-async.c +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -117,6 +117,11 @@ int opal_async_wait_response(uint64_t token, struct opal_msg *msg) return -EINVAL; } + /* Wakeup the poller before we wait for events to speed things + * up on platforms or simulators where the interrupts aren't + * functional. + */ + opal_wake_poller(); wait_event(opal_async_wait, test_bit(token, opal_async_complete_map)); memcpy(msg, &opal_async_responses[token], sizeof(*msg)); diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c index e505223b4..ed8bba68a 100644 --- a/arch/powerpc/platforms/powernv/opal-irqchip.c +++ b/arch/powerpc/platforms/powernv/opal-irqchip.c @@ -228,7 +228,8 @@ int __init opal_event_init(void) } /* Install interrupt handler */ - rc = request_irq(virq, opal_interrupt, 0, "opal", NULL); + rc = request_irq(virq, opal_interrupt, IRQF_TRIGGER_LOW, + "opal", NULL); if (rc) { irq_dispose_mapping(virq); pr_warn("Error %d requesting irq %d (0x%x)\n", diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c index 00a29432b..4495f428b 100644 --- a/arch/powerpc/platforms/powernv/opal-memory-errors.c +++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c @@ -44,7 +44,7 @@ static void handle_memory_error_event(struct OpalMemoryErrorData *merr_evt) { uint64_t paddr_start, paddr_end; - pr_debug("%s: Retrived memory error event, type: 0x%x\n", + pr_debug("%s: Retrieved memory error event, type: 0x%x\n", __func__, merr_evt->type); switch (merr_evt->type) { case OPAL_MEM_ERR_TYPE_RESILIENCE: diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c index a06059df9..308efd170 100644 --- a/arch/powerpc/platforms/powernv/opal-sensor.c +++ b/arch/powerpc/platforms/powernv/opal-sensor.c @@ -55,7 +55,7 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data) goto out_token; } - ret = opal_error_code(be64_to_cpu(msg.params[1])); + ret = opal_error_code(opal_get_async_rc(msg)); *sensor_data = be32_to_cpu(data); break; diff --git a/arch/powerpc/platforms/powernv/opal-sysparam.c b/arch/powerpc/platforms/powernv/opal-sysparam.c index afe66c576..23fb6647d 100644 --- a/arch/powerpc/platforms/powernv/opal-sysparam.c +++ b/arch/powerpc/platforms/powernv/opal-sysparam.c @@ -67,7 +67,7 @@ static ssize_t opal_get_sys_param(u32 param_id, u32 length, void *buffer) goto out_token; } - ret = opal_error_code(be64_to_cpu(msg.params[1])); + ret = opal_error_code(opal_get_async_rc(msg)); out_token: opal_async_release_token(token); @@ -103,7 +103,7 @@ static int opal_set_sys_param(u32 param_id, u32 length, void *buffer) goto out_token; } - ret = opal_error_code(be64_to_cpu(msg.params[1])); + ret = opal_error_code(opal_get_async_rc(msg)); out_token: opal_async_release_token(token); diff --git a/arch/powerpc/platforms/powernv/opal-tracepoints.c b/arch/powerpc/platforms/powernv/opal-tracepoints.c index e11273b23..1e496b780 100644 --- a/arch/powerpc/platforms/powernv/opal-tracepoints.c +++ b/arch/powerpc/platforms/powernv/opal-tracepoints.c @@ -1,6 +1,7 @@ #include #include #include +#include #ifdef HAVE_JUMP_LABEL struct static_key opal_tracepoint_key = STATIC_KEY_INIT; diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index e45b88a5d..3d29d40eb 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -59,12 +59,11 @@ END_FTR_SECTION(0, 1); \ #define OPAL_CALL(name, token) \ _GLOBAL_TOC(name); \ mflr r0; \ - std r0,16(r1); \ + std r0,PPC_LR_STKOFF(r1); \ li r0,token; \ OPAL_BRANCH(opal_tracepoint_entry) \ mfcr r12; \ stw r12,8(r1); \ - std r1,PACAR1(r13); \ li r11,0; \ mfmsr r12; \ ori r11,r11,MSR_EE; \ @@ -92,7 +91,7 @@ opal_return: FIXUP_ENDIAN ld r2,PACATOC(r13); lwz r4,8(r1); - ld r5,16(r1); + ld r5,PPC_LR_STKOFF(r1); ld r6,PACASAVEDMSR(r13); mtspr SPRN_SRR0,r5; mtspr SPRN_SRR1,r6; @@ -127,7 +126,6 @@ opal_tracepoint_entry: mfcr r12 std r11,16(r1) stw r12,8(r1) - std r1,PACAR1(r13) li r11,0 mfmsr r12 ori r11,r11,MSR_EE @@ -157,43 +155,37 @@ opal_tracepoint_return: blr #endif -/* - * Make opal call in realmode. This is a generic function to be called - * from realmode. It handles endianness. - * - * r13 - paca pointer - * r1 - stack pointer - * r0 - opal token - */ -_GLOBAL(opal_call_realmode) - mflr r12 - std r12,PPC_LR_STKOFF(r1) - ld r2,PACATOC(r13) - /* Set opal return address */ - LOAD_REG_ADDR(r12,return_from_opal_call) - mtlr r12 - - mfmsr r12 -#ifdef __LITTLE_ENDIAN__ - /* Handle endian-ness */ - li r11,MSR_LE - andc r12,r12,r11 -#endif - mtspr SPRN_HSRR1,r12 - LOAD_REG_ADDR(r11,opal) - ld r12,8(r11) - ld r2,0(r11) - mtspr SPRN_HSRR0,r12 +#define OPAL_CALL_REAL(name, token) \ + _GLOBAL_TOC(name); \ + mflr r0; \ + std r0,PPC_LR_STKOFF(r1); \ + li r0,token; \ + mfcr r12; \ + stw r12,8(r1); \ + \ + /* Set opal return address */ \ + LOAD_REG_ADDR(r11, opal_return_realmode); \ + mtlr r11; \ + mfmsr r12; \ + li r11,MSR_LE; \ + andc r12,r12,r11; \ + mtspr SPRN_HSRR1,r12; \ + LOAD_REG_ADDR(r11,opal); \ + ld r12,8(r11); \ + ld r2,0(r11); \ + mtspr SPRN_HSRR0,r12; \ hrfid -return_from_opal_call: -#ifdef __LITTLE_ENDIAN__ +opal_return_realmode: FIXUP_ENDIAN -#endif + ld r2,PACATOC(r13); + lwz r11,8(r1); ld r12,PPC_LR_STKOFF(r1) + mtcr r11; mtlr r12 blr + OPAL_CALL(opal_invalid_call, OPAL_INVALID_CALL); OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE); OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ); @@ -271,6 +263,7 @@ OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE); OPAL_CALL(opal_manage_flash, OPAL_FLASH_MANAGE); OPAL_CALL(opal_update_flash, OPAL_FLASH_UPDATE); OPAL_CALL(opal_resync_timebase, OPAL_RESYNC_TIMEBASE); +OPAL_CALL_REAL(opal_rm_resync_timebase, OPAL_RESYNC_TIMEBASE); OPAL_CALL(opal_check_token, OPAL_CHECK_TOKEN); OPAL_CALL(opal_dump_init, OPAL_DUMP_INIT); OPAL_CALL(opal_dump_info, OPAL_DUMP_INFO); @@ -278,6 +271,7 @@ OPAL_CALL(opal_dump_info2, OPAL_DUMP_INFO2); OPAL_CALL(opal_dump_read, OPAL_DUMP_READ); OPAL_CALL(opal_dump_ack, OPAL_DUMP_ACK); OPAL_CALL(opal_get_msg, OPAL_GET_MSG); +OPAL_CALL(opal_write_oppanel_async, OPAL_WRITE_OPPANEL_ASYNC); OPAL_CALL(opal_check_completion, OPAL_CHECK_ASYNC_COMPLETION); OPAL_CALL(opal_dump_resend_notification, OPAL_DUMP_RESEND); OPAL_CALL(opal_sync_host_reboot, OPAL_SYNC_HOST_REBOOT); @@ -285,7 +279,9 @@ OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ); OPAL_CALL(opal_get_param, OPAL_GET_PARAM); OPAL_CALL(opal_set_param, OPAL_SET_PARAM); OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI); +OPAL_CALL_REAL(opal_rm_handle_hmi, OPAL_HANDLE_HMI); OPAL_CALL(opal_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE); +OPAL_CALL_REAL(opal_rm_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE); OPAL_CALL(opal_slw_set_reg, OPAL_SLW_SET_REG); OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION); OPAL_CALL(opal_unregister_dump_region, OPAL_UNREGISTER_DUMP_REGION); @@ -302,3 +298,13 @@ OPAL_CALL(opal_prd_msg, OPAL_PRD_MSG); OPAL_CALL(opal_leds_get_ind, OPAL_LEDS_GET_INDICATOR); OPAL_CALL(opal_leds_set_ind, OPAL_LEDS_SET_INDICATOR); OPAL_CALL(opal_console_flush, OPAL_CONSOLE_FLUSH); +OPAL_CALL(opal_get_device_tree, OPAL_GET_DEVICE_TREE); +OPAL_CALL(opal_pci_get_presence_state, OPAL_PCI_GET_PRESENCE_STATE); +OPAL_CALL(opal_pci_get_power_state, OPAL_PCI_GET_POWER_STATE); +OPAL_CALL(opal_pci_set_power_state, OPAL_PCI_SET_POWER_STATE); +OPAL_CALL(opal_int_get_xirr, OPAL_INT_GET_XIRR); +OPAL_CALL(opal_int_set_cppr, OPAL_INT_SET_CPPR); +OPAL_CALL(opal_int_eoi, OPAL_INT_EOI); +OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR); +OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL); +OPAL_CALL_REAL(opal_rm_pci_tce_kill, OPAL_PCI_TCE_KILL); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 0256d0729..6c9a65b52 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -55,8 +55,9 @@ struct device_node *opal_node; static DEFINE_SPINLOCK(opal_write_lock); static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX]; static uint32_t opal_heartbeat; +static struct task_struct *kopald_tsk; -static void opal_reinit_cores(void) +void opal_configure_cores(void) { /* Do the actual re-init, This will clobber all FPRs, VRs, etc... * @@ -69,6 +70,10 @@ static void opal_reinit_cores(void) #else opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_LE); #endif + + /* Restore some bits */ + if (cur_cpu_spec->cpu_restore) + cur_cpu_spec->cpu_restore(); } int __init early_init_dt_scan_opal(unsigned long node, @@ -105,13 +110,6 @@ int __init early_init_dt_scan_opal(unsigned long node, panic("OPAL != V3 detected, no longer supported.\n"); } - /* Reinit all cores with the right endian */ - opal_reinit_cores(); - - /* Restore some bits */ - if (cur_cpu_spec->cpu_restore) - cur_cpu_spec->cpu_restore(); - return 1; } @@ -401,6 +399,7 @@ static int opal_recover_mce(struct pt_regs *regs, if (!(regs->msr & MSR_RI)) { /* If MSR_RI isn't set, we cannot recover */ + pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); recovered = 0; } else if (evt->disposition == MCE_DISPOSITION_RECOVERED) { /* Platform corrected itself */ @@ -653,6 +652,7 @@ static void opal_i2c_create_devs(void) static int kopald(void *unused) { + unsigned long timeout = msecs_to_jiffies(opal_heartbeat) + 1; __be64 events; set_freezable(); @@ -660,12 +660,18 @@ static int kopald(void *unused) try_to_freeze(); opal_poll_events(&events); opal_handle_events(be64_to_cpu(events)); - msleep_interruptible(opal_heartbeat); + schedule_timeout_interruptible(timeout); } while (!kthread_should_stop()); return 0; } +void opal_wake_poller(void) +{ + if (kopald_tsk) + wake_up_process(kopald_tsk); +} + static void opal_init_heartbeat(void) { /* Old firwmware, we assume the HVC heartbeat is sufficient */ @@ -674,7 +680,7 @@ static void opal_init_heartbeat(void) opal_heartbeat = 0; if (opal_heartbeat) - kthread_run(kopald, NULL, "kopald"); + kopald_tsk = kthread_run(kopald, NULL, "kopald"); } static int __init opal_init(void) @@ -751,6 +757,9 @@ static int __init opal_init(void) opal_pdev_init(opal_node, "ibm,opal-flash"); opal_pdev_init(opal_node, "ibm,opal-prd"); + /* Initialise platform device: oppanel interface */ + opal_pdev_init(opal_node, "ibm,opal-oppanel"); + /* Initialise OPAL kmsg dumper for flushing console on panic */ opal_kmsg_init(); @@ -885,3 +894,5 @@ EXPORT_SYMBOL_GPL(opal_i2c_request); /* Export these symbols for PowerNV LED class driver */ EXPORT_SYMBOL_GPL(opal_leds_get_ind); EXPORT_SYMBOL_GPL(opal_leds_set_ind); +/* Export this symbol for PowerNV Operator Panel class driver */ +EXPORT_SYMBOL_GPL(opal_write_oppanel_async); diff --git a/arch/powerpc/platforms/powernv/pci-cxl.c b/arch/powerpc/platforms/powernv/pci-cxl.c new file mode 100644 index 000000000..1349a099c --- /dev/null +++ b/arch/powerpc/platforms/powernv/pci-cxl.c @@ -0,0 +1,385 @@ +/* + * Copyright 2014-2016 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +#include "pci.h" + +struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + + return of_node_get(hose->dn); +} +EXPORT_SYMBOL(pnv_pci_get_phb_node); + +int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct pnv_ioda_pe *pe; + int rc; + + pe = pnv_ioda_get_pe(dev); + if (!pe) + return -ENODEV; + + pe_info(pe, "Switching PHB to CXL\n"); + + rc = opal_pci_set_phb_cxl_mode(phb->opal_id, mode, pe->pe_number); + if (rc == OPAL_UNSUPPORTED) + dev_err(&dev->dev, "Required cxl mode not supported by firmware - update skiboot\n"); + else if (rc) + dev_err(&dev->dev, "opal_pci_set_phb_cxl_mode failed: %i\n", rc); + + return rc; +} +EXPORT_SYMBOL(pnv_phb_to_cxl_mode); + +/* Find PHB for cxl dev and allocate MSI hwirqs? + * Returns the absolute hardware IRQ number + */ +int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + int hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, num); + + if (hwirq < 0) { + dev_warn(&dev->dev, "Failed to find a free MSI\n"); + return -ENOSPC; + } + + return phb->msi_base + hwirq; +} +EXPORT_SYMBOL(pnv_cxl_alloc_hwirqs); + +void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + + msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, num); +} +EXPORT_SYMBOL(pnv_cxl_release_hwirqs); + +void pnv_cxl_release_hwirq_ranges(struct cxl_irq_ranges *irqs, + struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + int i, hwirq; + + for (i = 1; i < CXL_IRQ_RANGES; i++) { + if (!irqs->range[i]) + continue; + pr_devel("cxl release irq range 0x%x: offset: 0x%lx limit: %ld\n", + i, irqs->offset[i], + irqs->range[i]); + hwirq = irqs->offset[i] - phb->msi_base; + msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, + irqs->range[i]); + } +} +EXPORT_SYMBOL(pnv_cxl_release_hwirq_ranges); + +int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs, + struct pci_dev *dev, int num) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + int i, hwirq, try; + + memset(irqs, 0, sizeof(struct cxl_irq_ranges)); + + /* 0 is reserved for the multiplexed PSL DSI interrupt */ + for (i = 1; i < CXL_IRQ_RANGES && num; i++) { + try = num; + while (try) { + hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, try); + if (hwirq >= 0) + break; + try /= 2; + } + if (!try) + goto fail; + + irqs->offset[i] = phb->msi_base + hwirq; + irqs->range[i] = try; + pr_devel("cxl alloc irq range 0x%x: offset: 0x%lx limit: %li\n", + i, irqs->offset[i], irqs->range[i]); + num -= try; + } + if (num) + goto fail; + + return 0; +fail: + pnv_cxl_release_hwirq_ranges(irqs, dev); + return -ENOSPC; +} +EXPORT_SYMBOL(pnv_cxl_alloc_hwirq_ranges); + +int pnv_cxl_get_irq_count(struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + + return phb->msi_bmp.irq_count; +} +EXPORT_SYMBOL(pnv_cxl_get_irq_count); + +int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq, + unsigned int virq) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + unsigned int xive_num = hwirq - phb->msi_base; + struct pnv_ioda_pe *pe; + int rc; + + if (!(pe = pnv_ioda_get_pe(dev))) + return -ENODEV; + + /* Assign XIVE to PE */ + rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num); + if (rc) { + pe_warn(pe, "%s: OPAL error %d setting msi_base 0x%x " + "hwirq 0x%x XIVE 0x%x PE\n", + pci_name(dev), rc, phb->msi_base, hwirq, xive_num); + return -EIO; + } + pnv_set_msi_irq_chip(phb, virq); + + return 0; +} +EXPORT_SYMBOL(pnv_cxl_ioda_msi_setup); + +#if IS_MODULE(CONFIG_CXL) +static inline int get_cxl_module(void) +{ + struct module *cxl_module; + + mutex_lock(&module_mutex); + + cxl_module = find_module("cxl"); + if (cxl_module) + __module_get(cxl_module); + + mutex_unlock(&module_mutex); + + if (!cxl_module) + return -ENODEV; + + return 0; +} +#else +static inline int get_cxl_module(void) { return 0; } +#endif + +/* + * Sets flags and switches the controller ops to enable the cxl kernel api. + * Originally the cxl kernel API operated on a virtual PHB, but certain cards + * such as the Mellanox CX4 use a peer model instead and for these cards the + * cxl kernel api will operate on the real PHB. + */ +int pnv_cxl_enable_phb_kernel_api(struct pci_controller *hose, bool enable) +{ + struct pnv_phb *phb = hose->private_data; + int rc; + + if (!enable) { + /* + * Once cxl mode is enabled on the PHB, there is currently no + * known safe method to disable it again, and trying risks a + * checkstop. If we can find a way to safely disable cxl mode + * in the future we can revisit this, but for now the only sane + * thing to do is to refuse to disable cxl mode: + */ + return -EPERM; + } + + /* + * Hold a reference to the cxl module since several PHB operations now + * depend on it, and it would be insane to allow it to be removed so + * long as we are in this mode (and since we can't safely disable this + * mode once enabled...). + */ + rc = get_cxl_module(); + if (rc) + return rc; + + phb->flags |= PNV_PHB_FLAG_CXL; + hose->controller_ops = pnv_cxl_cx4_ioda_controller_ops; + + return 0; +} +EXPORT_SYMBOL_GPL(pnv_cxl_enable_phb_kernel_api); + +bool pnv_pci_on_cxl_phb(struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + + return !!(phb->flags & PNV_PHB_FLAG_CXL); +} +EXPORT_SYMBOL_GPL(pnv_pci_on_cxl_phb); + +struct cxl_afu *pnv_cxl_phb_to_afu(struct pci_controller *hose) +{ + struct pnv_phb *phb = hose->private_data; + + return (struct cxl_afu *)phb->cxl_afu; +} +EXPORT_SYMBOL_GPL(pnv_cxl_phb_to_afu); + +void pnv_cxl_phb_set_peer_afu(struct pci_dev *dev, struct cxl_afu *afu) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + + phb->cxl_afu = afu; +} +EXPORT_SYMBOL_GPL(pnv_cxl_phb_set_peer_afu); + +/* + * In the peer cxl model, the XSL/PSL is physical function 0, and will be used + * by other functions on the device for memory access and interrupts. When the + * other functions are enabled we explicitly take a reference on the cxl + * function since they will use it, and allocate a default context associated + * with that function just like the vPHB model of the cxl kernel API. + */ +bool pnv_cxl_enable_device_hook(struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct cxl_afu *afu = phb->cxl_afu; + + if (!pnv_pci_enable_device_hook(dev)) + return false; + + + /* No special handling for the cxl function, which is always PF 0 */ + if (PCI_FUNC(dev->devfn) == 0) + return true; + + if (!afu) { + dev_WARN(&dev->dev, "Attempted to enable function > 0 on CXL PHB without a peer AFU\n"); + return false; + } + + dev_info(&dev->dev, "Enabling function on CXL enabled PHB with peer AFU\n"); + + /* Make sure the peer AFU can't go away while this device is active */ + cxl_afu_get(afu); + + return cxl_pci_associate_default_context(dev, afu); +} + +void pnv_cxl_disable_device(struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct cxl_afu *afu = phb->cxl_afu; + + /* No special handling for cxl function: */ + if (PCI_FUNC(dev->devfn) == 0) + return; + + cxl_pci_disable_device(dev); + cxl_afu_put(afu); +} + +/* + * This is a special version of pnv_setup_msi_irqs for cards in cxl mode. This + * function handles setting up the IVTE entries for the XSL to use. + * + * We are currently not filling out the MSIX table, since the only currently + * supported adapter (CX4) uses a custom MSIX table format in cxl mode and it + * is up to their driver to fill that out. In the future we may fill out the + * MSIX table (and change the IVTE entries to be an index to the MSIX table) + * for adapters implementing the Full MSI-X mode described in the CAIA. + */ +int pnv_cxl_cx4_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) +{ + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + struct pnv_phb *phb = hose->private_data; + struct msi_desc *entry; + struct cxl_context *ctx = NULL; + unsigned int virq; + int hwirq; + int afu_irq = 0; + int rc; + + if (WARN_ON(!phb) || !phb->msi_bmp.bitmap) + return -ENODEV; + + if (pdev->no_64bit_msi && !phb->msi32_support) + return -ENODEV; + + rc = cxl_cx4_setup_msi_irqs(pdev, nvec, type); + if (rc) + return rc; + + for_each_pci_msi_entry(entry, pdev) { + if (!entry->msi_attrib.is_64 && !phb->msi32_support) { + pr_warn("%s: Supports only 64-bit MSIs\n", + pci_name(pdev)); + return -ENXIO; + } + + hwirq = cxl_next_msi_hwirq(pdev, &ctx, &afu_irq); + if (WARN_ON(hwirq <= 0)) + return (hwirq ? hwirq : -ENOMEM); + + virq = irq_create_mapping(NULL, hwirq); + if (virq == NO_IRQ) { + pr_warn("%s: Failed to map cxl mode MSI to linux irq\n", + pci_name(pdev)); + return -ENOMEM; + } + + rc = pnv_cxl_ioda_msi_setup(pdev, hwirq, virq); + if (rc) { + pr_warn("%s: Failed to setup cxl mode MSI\n", pci_name(pdev)); + irq_dispose_mapping(virq); + return rc; + } + + irq_set_msi_desc(virq, entry); + } + + return 0; +} + +void pnv_cxl_cx4_teardown_msi_irqs(struct pci_dev *pdev) +{ + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + struct pnv_phb *phb = hose->private_data; + struct msi_desc *entry; + irq_hw_number_t hwirq; + + if (WARN_ON(!phb)) + return; + + for_each_pci_msi_entry(entry, pdev) { + if (entry->irq == NO_IRQ) + continue; + hwirq = virq_to_hw(entry->irq); + irq_set_msi_desc(entry->irq, NULL); + irq_dispose_mapping(entry->irq); + } + + cxl_cx4_teardown_msi_irqs(pdev); +} diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 9e160fa74..38a5c657f 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -55,6 +55,7 @@ #define POWERNV_IOMMU_DEFAULT_LEVELS 1 #define POWERNV_IOMMU_MAX_LEVELS 5 +static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU" }; static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl); void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, @@ -110,10 +111,24 @@ static int __init iommu_setup(char *str) } early_param("iommu", iommu_setup); -static inline bool pnv_pci_is_mem_pref_64(unsigned long flags) +static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r) { - return ((flags & (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)) == - (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)); + /* + * WARNING: We cannot rely on the resource flags. The Linux PCI + * allocation code sometimes decides to put a 64-bit prefetchable + * BAR in the 32-bit window, so we have to compare the addresses. + * + * For simplicity we only test resource start. + */ + return (r->start >= phb->ioda.m64_base && + r->start < (phb->ioda.m64_base + phb->ioda.m64_size)); +} + +static inline bool pnv_pci_is_m64_flags(unsigned long resource_flags) +{ + unsigned long flags = (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH); + + return (resource_flags & flags) == flags; } static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no) @@ -141,16 +156,14 @@ static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no) static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb) { - unsigned long pe; + long pe; - do { - pe = find_next_zero_bit(phb->ioda.pe_alloc, - phb->ioda.total_pe_num, 0); - if (pe >= phb->ioda.total_pe_num) - return NULL; - } while(test_and_set_bit(pe, phb->ioda.pe_alloc)); + for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) { + if (!test_and_set_bit(pe, phb->ioda.pe_alloc)) + return pnv_ioda_init_pe(phb, pe); + } - return pnv_ioda_init_pe(phb, pe); + return NULL; } static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe) @@ -193,18 +206,15 @@ static int pnv_ioda2_init_m64(struct pnv_phb *phb) goto fail; } - /* Mark the M64 BAR assigned */ - set_bit(phb->ioda.m64_bar_idx, &phb->ioda.m64_bar_alloc); - /* - * Strip off the segment used by the reserved PE, which is - * expected to be 0 or last one of PE capabicity. + * Exclude the segments for reserved and root bus PE, which + * are first or last two PEs. */ r = &phb->hose->mem_resources[1]; if (phb->ioda.reserved_pe_idx == 0) - r->start += phb->ioda.m64_segsize; + r->start += (2 * phb->ioda.m64_segsize); else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1)) - r->end -= phb->ioda.m64_segsize; + r->end -= (2 * phb->ioda.m64_segsize); else pr_warn(" Cannot strip M64 segment for reserved PE#%d\n", phb->ioda.reserved_pe_idx); @@ -234,7 +244,7 @@ static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev, sgsz = phb->ioda.m64_segsize; for (i = 0; i <= PCI_ROM_RESOURCE; i++) { r = &pdev->resource[i]; - if (!r->parent || !pnv_pci_is_mem_pref_64(r->flags)) + if (!r->parent || !pnv_pci_is_m64(phb, r)) continue; start = _ALIGN_DOWN(r->start - base, sgsz); @@ -284,14 +294,14 @@ static int pnv_ioda1_init_m64(struct pnv_phb *phb) } /* - * Exclude the segment used by the reserved PE, which - * is expected to be 0 or last supported PE#. + * Exclude the segments for reserved and root bus PE, which + * are first or last two PEs. */ r = &phb->hose->mem_resources[1]; if (phb->ioda.reserved_pe_idx == 0) - r->start += phb->ioda.m64_segsize; + r->start += (2 * phb->ioda.m64_segsize); else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1)) - r->end -= phb->ioda.m64_segsize; + r->end -= (2 * phb->ioda.m64_segsize); else WARN(1, "Wrong reserved PE#%d on PHB#%d\n", phb->ioda.reserved_pe_idx, phb->hose->global_number); @@ -406,6 +416,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) struct pci_controller *hose = phb->hose; struct device_node *dn = hose->dn; struct resource *res; + u32 m64_range[2], i; const u32 *r; u64 pci_addr; @@ -426,6 +437,30 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) return; } + /* + * Find the available M64 BAR range and pickup the last one for + * covering the whole 64-bits space. We support only one range. + */ + if (of_property_read_u32_array(dn, "ibm,opal-available-m64-ranges", + m64_range, 2)) { + /* In absence of the property, assume 0..15 */ + m64_range[0] = 0; + m64_range[1] = 16; + } + /* We only support 64 bits in our allocator */ + if (m64_range[1] > 63) { + pr_warn("%s: Limiting M64 range to 63 (from %d) on PHB#%x\n", + __func__, m64_range[1], phb->hose->global_number); + m64_range[1] = 63; + } + /* Empty range, no m64 */ + if (m64_range[1] <= m64_range[0]) { + pr_warn("%s: M64 empty, disabling M64 usage on PHB#%x\n", + __func__, phb->hose->global_number); + return; + } + + /* Configure M64 informations */ res = &hose->mem_resources[1]; res->name = dn->full_name; res->start = of_translate_address(dn, r + 2); @@ -438,11 +473,28 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe_num; phb->ioda.m64_base = pci_addr; - pr_info(" MEM64 0x%016llx..0x%016llx -> 0x%016llx\n", - res->start, res->end, pci_addr); + /* This lines up nicely with the display from processing OF ranges */ + pr_info(" MEM 0x%016llx..0x%016llx -> 0x%016llx (M64 #%d..%d)\n", + res->start, res->end, pci_addr, m64_range[0], + m64_range[0] + m64_range[1] - 1); + + /* Mark all M64 used up by default */ + phb->ioda.m64_bar_alloc = (unsigned long)-1; /* Use last M64 BAR to cover M64 window */ - phb->ioda.m64_bar_idx = 15; + m64_range[1]--; + phb->ioda.m64_bar_idx = m64_range[0] + m64_range[1]; + + pr_info(" Using M64 #%d as default window\n", phb->ioda.m64_bar_idx); + + /* Mark remaining ones free */ + for (i = m64_range[0]; i < m64_range[1]; i++) + clear_bit(i, &phb->ioda.m64_bar_alloc); + + /* + * Setup init functions for M64 based on IODA version, IODA3 uses + * the IODA2 code. + */ if (phb->type == PNV_PHB_IODA1) phb->init_m64 = pnv_ioda1_init_m64; else @@ -597,7 +649,7 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no) * but in the meantime, we need to protect them to avoid warnings */ #ifdef CONFIG_PCI_MSI -static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) +struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) { struct pci_controller *hose = pci_bus_to_host(dev->bus); struct pnv_phb *phb = hose->private_data; @@ -715,7 +767,6 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb, return 0; } -#ifdef CONFIG_PCI_IOV static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { struct pci_dev *parent; @@ -750,9 +801,11 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) } rid_end = pe->rid + (count << 8); } else { +#ifdef CONFIG_PCI_IOV if (pe->flags & PNV_IODA_PE_VF) parent = pe->parent_dev; else +#endif parent = pe->pdev->bus->self; bcomp = OpalPciBusAll; dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; @@ -762,7 +815,7 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) /* Clear the reverse map */ for (rid = pe->rid; rid < rid_end; rid++) - phb->ioda.pe_rmap[rid] = 0; + phb->ioda.pe_rmap[rid] = IODA_INVALID_PE; /* Release from all parents PELT-V */ while (parent) { @@ -790,11 +843,12 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) pe->pbus = NULL; pe->pdev = NULL; +#ifdef CONFIG_PCI_IOV pe->parent_dev = NULL; +#endif return 0; } -#endif /* CONFIG_PCI_IOV */ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { @@ -1025,6 +1079,16 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) pci_name(dev)); continue; } + + /* + * In partial hotplug case, the PCI device might be still + * associated with the PE and needn't attach it to the PE + * again. + */ + if (pdn->pe_number != IODA_INVALID_PE) + continue; + + pe->device_count++; pdn->pcidev = dev; pdn->pe_number = pe->pe_number; if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) @@ -1043,9 +1107,26 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) struct pci_controller *hose = pci_bus_to_host(bus); struct pnv_phb *phb = hose->private_data; struct pnv_ioda_pe *pe = NULL; + unsigned int pe_num; + + /* + * In partial hotplug case, the PE instance might be still alive. + * We should reuse it instead of allocating a new one. + */ + pe_num = phb->ioda.pe_rmap[bus->number << 8]; + if (pe_num != IODA_INVALID_PE) { + pe = &phb->ioda.pe_array[pe_num]; + pnv_ioda_setup_same_PE(bus, pe); + return NULL; + } + + /* PE number for root bus should have been reserved */ + if (pci_is_root_bus(bus) && + phb->ioda.root_pe_idx != IODA_INVALID_PE) + pe = &phb->ioda.pe_array[phb->ioda.root_pe_idx]; /* Check if PE is determined by M64 */ - if (phb->pick_m64_pe) + if (!pe && phb->pick_m64_pe) pe = phb->pick_m64_pe(bus, all); /* The PE number isn't pinned by M64 */ @@ -1157,30 +1238,6 @@ static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus) pnv_ioda_setup_npu_PE(pdev); } -static void pnv_ioda_setup_PEs(struct pci_bus *bus) -{ - struct pci_dev *dev; - - pnv_ioda_setup_bus_PE(bus, false); - - list_for_each_entry(dev, &bus->devices, bus_list) { - if (dev->subordinate) { - if (pci_pcie_type(dev) == PCI_EXP_TYPE_PCI_BRIDGE) - pnv_ioda_setup_bus_PE(dev->subordinate, true); - else - pnv_ioda_setup_PEs(dev->subordinate); - } - } -} - -/* - * Configure PEs so that the downstream PCI buses and devices - * could have their associated PE#. Unfortunately, we didn't - * figure out the way to identify the PLX bridge yet. So we - * simply put the PCI bus and the subordinate behind the root - * port to PE# here. The game rule here is expected to be changed - * as soon as we can detected PLX bridge correctly. - */ static void pnv_pci_ioda_setup_PEs(void) { struct pci_controller *hose, *tmp; @@ -1188,22 +1245,11 @@ static void pnv_pci_ioda_setup_PEs(void) list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { phb = hose->private_data; - - /* M64 layout might affect PE allocation */ - if (phb->reserve_m64_pe) - phb->reserve_m64_pe(hose->bus, NULL, true); - - /* - * On NPU PHB, we expect separate PEs for individual PCI - * functions. PCI bus dependent PEs are required for the - * remaining types of PHBs. - */ if (phb->type == PNV_PHB_NPU) { /* PE#0 is needed for error reporting */ pnv_ioda_reserve_pe(phb, 0); pnv_ioda_setup_npu_PEs(hose->bus); - } else - pnv_ioda_setup_PEs(hose->bus); + } } } @@ -1729,7 +1775,14 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, } } -static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl, +static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb, + bool real_mode) +{ + return real_mode ? (__be64 __iomem *)(phb->regs_phys + 0x210) : + (phb->regs + 0x210); +} + +static void pnv_pci_p7ioc_tce_invalidate(struct iommu_table *tbl, unsigned long index, unsigned long npages, bool rm) { struct iommu_table_group_link *tgl = list_first_entry_or_null( @@ -1737,33 +1790,17 @@ static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl, next); struct pnv_ioda_pe *pe = container_of(tgl->table_group, struct pnv_ioda_pe, table_group); - __be64 __iomem *invalidate = rm ? - (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys : - pe->phb->ioda.tce_inval_reg; + __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm); unsigned long start, end, inc; - const unsigned shift = tbl->it_page_shift; start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset); end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset + npages - 1); - /* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */ - if (tbl->it_busno) { - start <<= shift; - end <<= shift; - inc = 128ull << shift; - start |= tbl->it_busno; - end |= tbl->it_busno; - } else if (tbl->it_type & TCE_PCI_SWINV_PAIR) { - /* p7ioc-style invalidation, 2 TCEs per write */ - start |= (1ull << 63); - end |= (1ull << 63); - inc = 16; - } else { - /* Default (older HW) */ - inc = 128; - } - + /* p7ioc-style invalidation, 2 TCEs per write */ + start |= (1ull << 63); + end |= (1ull << 63); + inc = 16; end |= inc - 1; /* round up end to be different than start */ mb(); /* Ensure above stores are visible */ @@ -1784,13 +1821,13 @@ static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl, static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, - struct dma_attrs *attrs) + unsigned long attrs) { int ret = pnv_tce_build(tbl, index, npages, uaddr, direction, attrs); - if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE)) - pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false); + if (!ret) + pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false); return ret; } @@ -1801,9 +1838,8 @@ static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index, { long ret = pnv_tce_xchg(tbl, index, hpa, direction); - if (!ret && (tbl->it_type & - (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE))) - pnv_pci_ioda1_tce_invalidate(tbl, index, 1, false); + if (!ret) + pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false); return ret; } @@ -1814,8 +1850,7 @@ static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index, { pnv_tce_free(tbl, index, npages); - if (tbl->it_type & TCE_PCI_SWINV_FREE) - pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false); + pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false); } static struct iommu_table_ops pnv_ioda1_iommu_ops = { @@ -1827,45 +1862,42 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = { .get = pnv_tce_get, }; -#define TCE_KILL_INVAL_ALL PPC_BIT(0) -#define TCE_KILL_INVAL_PE PPC_BIT(1) -#define TCE_KILL_INVAL_TCE PPC_BIT(2) +#define PHB3_TCE_KILL_INVAL_ALL PPC_BIT(0) +#define PHB3_TCE_KILL_INVAL_PE PPC_BIT(1) +#define PHB3_TCE_KILL_INVAL_ONE PPC_BIT(2) -void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm) +void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm) { - const unsigned long val = TCE_KILL_INVAL_ALL; + __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm); + const unsigned long val = PHB3_TCE_KILL_INVAL_ALL; mb(); /* Ensure previous TCE table stores are visible */ if (rm) - __raw_rm_writeq(cpu_to_be64(val), - (__be64 __iomem *) - phb->ioda.tce_inval_reg_phys); + __raw_rm_writeq(cpu_to_be64(val), invalidate); else - __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg); + __raw_writeq(cpu_to_be64(val), invalidate); } -static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe) +static inline void pnv_pci_phb3_tce_invalidate_pe(struct pnv_ioda_pe *pe) { /* 01xb - invalidate TCEs that match the specified PE# */ - unsigned long val = TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF); - struct pnv_phb *phb = pe->phb; - - if (!phb->ioda.tce_inval_reg) - return; + __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, false); + unsigned long val = PHB3_TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF); mb(); /* Ensure above stores are visible */ - __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg); + __raw_writeq(cpu_to_be64(val), invalidate); } -static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm, - __be64 __iomem *invalidate, unsigned shift, - unsigned long index, unsigned long npages) +static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm, + unsigned shift, unsigned long index, + unsigned long npages) { + __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm); unsigned long start, end, inc; /* We'll invalidate DMA address in PE scope */ - start = TCE_KILL_INVAL_TCE; - start |= (pe_number & 0xFF); + start = PHB3_TCE_KILL_INVAL_ONE; + start |= (pe->pe_number & 0xFF); end = start; /* Figure out the start, end and step */ @@ -1883,6 +1915,17 @@ static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm, } } +static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe) +{ + struct pnv_phb *phb = pe->phb; + + if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs) + pnv_pci_phb3_tce_invalidate_pe(pe); + else + opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL_PE, + pe->pe_number, 0, 0, 0); +} + static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, unsigned long index, unsigned long npages, bool rm) { @@ -1891,34 +1934,43 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { struct pnv_ioda_pe *pe = container_of(tgl->table_group, struct pnv_ioda_pe, table_group); - __be64 __iomem *invalidate = rm ? - (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys : - pe->phb->ioda.tce_inval_reg; + struct pnv_phb *phb = pe->phb; + unsigned int shift = tbl->it_page_shift; - if (pe->phb->type == PNV_PHB_NPU) { + if (phb->type == PNV_PHB_NPU) { /* * The NVLink hardware does not support TCE kill * per TCE entry so we have to invalidate * the entire cache for it. */ - pnv_pci_ioda2_tce_invalidate_entire(pe->phb, rm); + pnv_pci_phb3_tce_invalidate_entire(phb, rm); continue; } - pnv_pci_ioda2_do_tce_invalidate(pe->pe_number, rm, - invalidate, tbl->it_page_shift, - index, npages); + if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs) + pnv_pci_phb3_tce_invalidate(pe, rm, shift, + index, npages); + else if (rm) + opal_rm_pci_tce_kill(phb->opal_id, + OPAL_PCI_TCE_KILL_PAGES, + pe->pe_number, 1u << shift, + index << shift, npages); + else + opal_pci_tce_kill(phb->opal_id, + OPAL_PCI_TCE_KILL_PAGES, + pe->pe_number, 1u << shift, + index << shift, npages); } } static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, - struct dma_attrs *attrs) + unsigned long attrs) { int ret = pnv_tce_build(tbl, index, npages, uaddr, direction, attrs); - if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE)) + if (!ret) pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false); return ret; @@ -1930,8 +1982,7 @@ static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index, { long ret = pnv_tce_xchg(tbl, index, hpa, direction); - if (!ret && (tbl->it_type & - (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE))) + if (!ret) pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false); return ret; @@ -1943,8 +1994,7 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, { pnv_tce_free(tbl, index, npages); - if (tbl->it_type & TCE_PCI_SWINV_FREE) - pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false); + pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false); } static void pnv_ioda2_table_free(struct iommu_table *tbl) @@ -2113,12 +2163,6 @@ found: base * PNV_IODA1_DMA32_SEGSIZE, IOMMU_PAGE_SHIFT_4K); - /* OPAL variant of P7IOC SW invalidated TCEs */ - if (phb->ioda.tce_inval_reg) - tbl->it_type |= (TCE_PCI_SWINV_CREATE | - TCE_PCI_SWINV_FREE | - TCE_PCI_SWINV_PAIR); - tbl->it_ops = &pnv_ioda1_iommu_ops; pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift; pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift; @@ -2241,8 +2285,6 @@ static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group, } tbl->it_ops = &pnv_ioda2_iommu_ops; - if (pe->phb->ioda.tce_inval_reg) - tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); *ptbl = tbl; @@ -2291,10 +2333,6 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) if (!pnv_iommu_bypass_disabled) pnv_pci_ioda2_set_bypass(pe, true); - /* OPAL variant of PHB3 invalidated TCEs */ - if (pe->phb->ioda.tce_inval_reg) - tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE); - /* * Setting table base here only for carrying iommu_group * further down to let iommu_add_device() do the job. @@ -2505,19 +2543,6 @@ static void pnv_pci_ioda_setup_iommu_api(void) static void pnv_pci_ioda_setup_iommu_api(void) { }; #endif -static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb) -{ - const __be64 *swinvp; - - /* OPAL variant of PHB3 invalidated TCEs */ - swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL); - if (!swinvp) - return; - - phb->ioda.tce_inval_reg_phys = be64_to_cpup(swinvp); - phb->ioda.tce_inval_reg = ioremap(phb->ioda.tce_inval_reg_phys, 8); -} - static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift, unsigned levels, unsigned long limit, unsigned long *current_offset, unsigned long *total_allocated) @@ -2658,6 +2683,9 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, { int64_t rc; + if (!pnv_pci_ioda_pe_dma_weight(pe)) + return; + /* TVE #1 is selected by PCI address bit 59 */ pe->tce_bypass_base = 1ull << 59; @@ -2689,49 +2717,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, pnv_ioda_setup_bus_dma(pe, pe->pbus); } -static void pnv_ioda_setup_dma(struct pnv_phb *phb) -{ - struct pci_controller *hose = phb->hose; - struct pnv_ioda_pe *pe; - unsigned int weight; - - /* If we have more PE# than segments available, hand out one - * per PE until we run out and let the rest fail. If not, - * then we assign at least one segment per PE, plus more based - * on the amount of devices under that PE - */ - pr_info("PCI: Domain %04x has %d available 32-bit DMA segments\n", - hose->global_number, phb->ioda.dma32_count); - - pnv_pci_ioda_setup_opal_tce_kill(phb); - - /* Walk our PE list and configure their DMA segments */ - list_for_each_entry(pe, &phb->ioda.pe_list, list) { - weight = pnv_pci_ioda_pe_dma_weight(pe); - if (!weight) - continue; - - /* - * For IODA2 compliant PHB3, we needn't care about the weight. - * The all available 32-bits DMA space will be assigned to - * the specific PE. - */ - if (phb->type == PNV_PHB_IODA1) { - pnv_pci_ioda1_setup_dma_pe(phb, pe); - } else if (phb->type == PNV_PHB_IODA2) { - pe_info(pe, "Assign DMA32 space\n"); - pnv_pci_ioda2_setup_dma_pe(phb, pe); - } else if (phb->type == PNV_PHB_NPU) { - /* - * We initialise the DMA space for an NPU PHB - * after setup of the PHB is complete as we - * point the NPU TVT to the the same location - * as the PHB3 TVT. - */ - } - } -} - #ifdef CONFIG_PCI_MSI static void pnv_ioda2_msi_eoi(struct irq_data *d) { @@ -2748,12 +2733,13 @@ static void pnv_ioda2_msi_eoi(struct irq_data *d) } -static void set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq) +void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq) { struct irq_data *idata; struct irq_chip *ichip; - if (phb->type != PNV_PHB_IODA2) + /* The MSI EOI OPAL call is only needed on PHB3 */ + if (phb->model != PNV_PHB_MODEL_PHB3) return; if (!phb->ioda.irq_chip_init) { @@ -2770,157 +2756,6 @@ static void set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq) irq_set_chip(virq, &phb->ioda.irq_chip); } -#ifdef CONFIG_CXL_BASE - -struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - - return of_node_get(hose->dn); -} -EXPORT_SYMBOL(pnv_pci_get_phb_node); - -int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; - struct pnv_ioda_pe *pe; - int rc; - - pe = pnv_ioda_get_pe(dev); - if (!pe) - return -ENODEV; - - pe_info(pe, "Switching PHB to CXL\n"); - - rc = opal_pci_set_phb_cxl_mode(phb->opal_id, mode, pe->pe_number); - if (rc) - dev_err(&dev->dev, "opal_pci_set_phb_cxl_mode failed: %i\n", rc); - - return rc; -} -EXPORT_SYMBOL(pnv_phb_to_cxl_mode); - -/* Find PHB for cxl dev and allocate MSI hwirqs? - * Returns the absolute hardware IRQ number - */ -int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; - int hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, num); - - if (hwirq < 0) { - dev_warn(&dev->dev, "Failed to find a free MSI\n"); - return -ENOSPC; - } - - return phb->msi_base + hwirq; -} -EXPORT_SYMBOL(pnv_cxl_alloc_hwirqs); - -void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; - - msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, num); -} -EXPORT_SYMBOL(pnv_cxl_release_hwirqs); - -void pnv_cxl_release_hwirq_ranges(struct cxl_irq_ranges *irqs, - struct pci_dev *dev) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; - int i, hwirq; - - for (i = 1; i < CXL_IRQ_RANGES; i++) { - if (!irqs->range[i]) - continue; - pr_devel("cxl release irq range 0x%x: offset: 0x%lx limit: %ld\n", - i, irqs->offset[i], - irqs->range[i]); - hwirq = irqs->offset[i] - phb->msi_base; - msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, - irqs->range[i]); - } -} -EXPORT_SYMBOL(pnv_cxl_release_hwirq_ranges); - -int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs, - struct pci_dev *dev, int num) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; - int i, hwirq, try; - - memset(irqs, 0, sizeof(struct cxl_irq_ranges)); - - /* 0 is reserved for the multiplexed PSL DSI interrupt */ - for (i = 1; i < CXL_IRQ_RANGES && num; i++) { - try = num; - while (try) { - hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, try); - if (hwirq >= 0) - break; - try /= 2; - } - if (!try) - goto fail; - - irqs->offset[i] = phb->msi_base + hwirq; - irqs->range[i] = try; - pr_devel("cxl alloc irq range 0x%x: offset: 0x%lx limit: %li\n", - i, irqs->offset[i], irqs->range[i]); - num -= try; - } - if (num) - goto fail; - - return 0; -fail: - pnv_cxl_release_hwirq_ranges(irqs, dev); - return -ENOSPC; -} -EXPORT_SYMBOL(pnv_cxl_alloc_hwirq_ranges); - -int pnv_cxl_get_irq_count(struct pci_dev *dev) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; - - return phb->msi_bmp.irq_count; -} -EXPORT_SYMBOL(pnv_cxl_get_irq_count); - -int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq, - unsigned int virq) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; - unsigned int xive_num = hwirq - phb->msi_base; - struct pnv_ioda_pe *pe; - int rc; - - if (!(pe = pnv_ioda_get_pe(dev))) - return -ENODEV; - - /* Assign XIVE to PE */ - rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num); - if (rc) { - pe_warn(pe, "%s: OPAL error %d setting msi_base 0x%x " - "hwirq 0x%x XIVE 0x%x PE\n", - pci_name(dev), rc, phb->msi_base, hwirq, xive_num); - return -EIO; - } - set_msi_irq_chip(phb, virq); - - return 0; -} -EXPORT_SYMBOL(pnv_cxl_ioda_msi_setup); -#endif - static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, unsigned int hwirq, unsigned int virq, unsigned int is_64, struct msi_msg *msg) @@ -2977,7 +2812,7 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, } msg->data = be32_to_cpu(data); - set_msi_irq_chip(phb, virq); + pnv_set_msi_irq_chip(phb, virq); pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d)," " address=%x_%08x data=%x PE# %d\n", @@ -3043,7 +2878,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) res = &pdev->resource[i + PCI_IOV_RESOURCES]; if (!res->flags || res->parent) continue; - if (!pnv_pci_is_mem_pref_64(res->flags)) { + if (!pnv_pci_is_m64_flags(res->flags)) { dev_warn(&pdev->dev, "Don't support SR-IOV with" " non M64 VF BAR%d: %pR. \n", i, res); @@ -3138,7 +2973,7 @@ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, index++; } } else if ((res->flags & IORESOURCE_MEM) && - !pnv_pci_is_mem_pref_64(res->flags)) { + !pnv_pci_is_m64(phb, res)) { region.start = res->start - phb->hose->mem_offset[0] - phb->ioda.m32_pci_base; @@ -3198,41 +3033,6 @@ static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe) } } -static void pnv_pci_ioda_setup_seg(void) -{ - struct pci_controller *tmp, *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe; - - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { - phb = hose->private_data; - - /* NPU PHB does not support IO or MMIO segmentation */ - if (phb->type == PNV_PHB_NPU) - continue; - - list_for_each_entry(pe, &phb->ioda.pe_list, list) { - pnv_ioda_setup_pe_seg(pe); - } - } -} - -static void pnv_pci_ioda_setup_DMA(void) -{ - struct pci_controller *hose, *tmp; - struct pnv_phb *phb; - - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { - pnv_ioda_setup_dma(hose->private_data); - - /* Mark the PHB initialization done */ - phb = hose->private_data; - phb->initialized = 1; - } - - pnv_pci_ioda_setup_iommu_api(); -} - static void pnv_pci_ioda_create_dbgfs(void) { #ifdef CONFIG_DEBUG_FS @@ -3243,6 +3043,9 @@ static void pnv_pci_ioda_create_dbgfs(void) list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { phb = hose->private_data; + /* Notify initialization of PHB done */ + phb->initialized = 1; + sprintf(name, "PCI%04x", hose->global_number); phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root); if (!phb->dbgfs) @@ -3255,9 +3058,7 @@ static void pnv_pci_ioda_create_dbgfs(void) static void pnv_pci_ioda_fixup(void) { pnv_pci_ioda_setup_PEs(); - pnv_pci_ioda_setup_seg(); - pnv_pci_ioda_setup_DMA(); - + pnv_pci_ioda_setup_iommu_api(); pnv_pci_ioda_create_dbgfs(); #ifdef CONFIG_EEH @@ -3297,9 +3098,12 @@ static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus, bridge = bridge->bus->self; } - /* We fail back to M32 if M64 isn't supported */ - if (phb->ioda.m64_segsize && - pnv_pci_is_mem_pref_64(type)) + /* + * We fall back to M32 if M64 isn't supported. We enforce the M64 + * alignment for any 64-bit resource, PCIe doesn't care and + * bridges only do 64-bit prefetchable anyway. + */ + if (phb->ioda.m64_segsize && pnv_pci_is_m64_flags(type)) return phb->ioda.m64_segsize; if (type & IORESOURCE_MEM) return phb->ioda.m32_segsize; @@ -3307,6 +3111,115 @@ static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus, return phb->ioda.io_segsize; } +/* + * We are updating root port or the upstream port of the + * bridge behind the root port with PHB's windows in order + * to accommodate the changes on required resources during + * PCI (slot) hotplug, which is connected to either root + * port or the downstream ports of PCIe switch behind the + * root port. + */ +static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus, + unsigned long type) +{ + struct pci_controller *hose = pci_bus_to_host(bus); + struct pnv_phb *phb = hose->private_data; + struct pci_dev *bridge = bus->self; + struct resource *r, *w; + bool msi_region = false; + int i; + + /* Check if we need apply fixup to the bridge's windows */ + if (!pci_is_root_bus(bridge->bus) && + !pci_is_root_bus(bridge->bus->self->bus)) + return; + + /* Fixup the resources */ + for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) { + r = &bridge->resource[PCI_BRIDGE_RESOURCES + i]; + if (!r->flags || !r->parent) + continue; + + w = NULL; + if (r->flags & type & IORESOURCE_IO) + w = &hose->io_resource; + else if (pnv_pci_is_m64(phb, r) && + (type & IORESOURCE_PREFETCH) && + phb->ioda.m64_segsize) + w = &hose->mem_resources[1]; + else if (r->flags & type & IORESOURCE_MEM) { + w = &hose->mem_resources[0]; + msi_region = true; + } + + r->start = w->start; + r->end = w->end; + + /* The 64KB 32-bits MSI region shouldn't be included in + * the 32-bits bridge window. Otherwise, we can see strange + * issues. One of them is EEH error observed on Garrison. + * + * Exclude top 1MB region which is the minimal alignment of + * 32-bits bridge window. + */ + if (msi_region) { + r->end += 0x10000; + r->end -= 0x100000; + } + } +} + +static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type) +{ + struct pci_controller *hose = pci_bus_to_host(bus); + struct pnv_phb *phb = hose->private_data; + struct pci_dev *bridge = bus->self; + struct pnv_ioda_pe *pe; + bool all = (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE); + + /* Extend bridge's windows if necessary */ + pnv_pci_fixup_bridge_resources(bus, type); + + /* The PE for root bus should be realized before any one else */ + if (!phb->ioda.root_pe_populated) { + pe = pnv_ioda_setup_bus_PE(phb->hose->bus, false); + if (pe) { + phb->ioda.root_pe_idx = pe->pe_number; + phb->ioda.root_pe_populated = true; + } + } + + /* Don't assign PE to PCI bus, which doesn't have subordinate devices */ + if (list_empty(&bus->devices)) + return; + + /* Reserve PEs according to used M64 resources */ + if (phb->reserve_m64_pe) + phb->reserve_m64_pe(bus, NULL, all); + + /* + * Assign PE. We might run here because of partial hotplug. + * For the case, we just pick up the existing PE and should + * not allocate resources again. + */ + pe = pnv_ioda_setup_bus_PE(bus, all); + if (!pe) + return; + + pnv_ioda_setup_pe_seg(pe); + switch (phb->type) { + case PNV_PHB_IODA1: + pnv_pci_ioda1_setup_dma_pe(phb, pe); + break; + case PNV_PHB_IODA2: + pnv_pci_ioda2_setup_dma_pe(phb, pe); + break; + default: + pr_warn("%s: No DMA for PHB#%d (type %d)\n", + __func__, phb->hose->global_number, phb->type); + } +} + #ifdef CONFIG_PCI_IOV static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, int resno) @@ -3346,7 +3259,7 @@ static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, /* Prevent enabling devices for which we couldn't properly * assign a PE */ -static bool pnv_pci_enable_device_hook(struct pci_dev *dev) +bool pnv_pci_enable_device_hook(struct pci_dev *dev) { struct pci_controller *hose = pci_bus_to_host(dev->bus); struct pnv_phb *phb = hose->private_data; @@ -3367,6 +3280,201 @@ static bool pnv_pci_enable_device_hook(struct pci_dev *dev) return true; } +static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group, + int num) +{ + struct pnv_ioda_pe *pe = container_of(table_group, + struct pnv_ioda_pe, table_group); + struct pnv_phb *phb = pe->phb; + unsigned int idx; + long rc; + + pe_info(pe, "Removing DMA window #%d\n", num); + for (idx = 0; idx < phb->ioda.dma32_count; idx++) { + if (phb->ioda.dma32_segmap[idx] != pe->pe_number) + continue; + + rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, + idx, 0, 0ul, 0ul, 0ul); + if (rc != OPAL_SUCCESS) { + pe_warn(pe, "Failure %ld unmapping DMA32 segment#%d\n", + rc, idx); + return rc; + } + + phb->ioda.dma32_segmap[idx] = IODA_INVALID_PE; + } + + pnv_pci_unlink_table_and_group(table_group->tables[num], table_group); + return OPAL_SUCCESS; +} + +static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe) +{ + unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe); + struct iommu_table *tbl = pe->table_group.tables[0]; + int64_t rc; + + if (!weight) + return; + + rc = pnv_pci_ioda1_unset_window(&pe->table_group, 0); + if (rc != OPAL_SUCCESS) + return; + + pnv_pci_p7ioc_tce_invalidate(tbl, tbl->it_offset, tbl->it_size, false); + if (pe->table_group.group) { + iommu_group_put(pe->table_group.group); + WARN_ON(pe->table_group.group); + } + + free_pages(tbl->it_base, get_order(tbl->it_size << 3)); + iommu_free_table(tbl, "pnv"); +} + +static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe) +{ + struct iommu_table *tbl = pe->table_group.tables[0]; + unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe); +#ifdef CONFIG_IOMMU_API + int64_t rc; +#endif + + if (!weight) + return; + +#ifdef CONFIG_IOMMU_API + rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0); + if (rc) + pe_warn(pe, "OPAL error %ld release DMA window\n", rc); +#endif + + pnv_pci_ioda2_set_bypass(pe, false); + if (pe->table_group.group) { + iommu_group_put(pe->table_group.group); + WARN_ON(pe->table_group.group); + } + + pnv_pci_ioda2_table_free_pages(tbl); + iommu_free_table(tbl, "pnv"); +} + +static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe, + unsigned short win, + unsigned int *map) +{ + struct pnv_phb *phb = pe->phb; + int idx; + int64_t rc; + + for (idx = 0; idx < phb->ioda.total_pe_num; idx++) { + if (map[idx] != pe->pe_number) + continue; + + if (win == OPAL_M64_WINDOW_TYPE) + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + phb->ioda.reserved_pe_idx, win, + idx / PNV_IODA1_M64_SEGS, + idx % PNV_IODA1_M64_SEGS); + else + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + phb->ioda.reserved_pe_idx, win, 0, idx); + + if (rc != OPAL_SUCCESS) + pe_warn(pe, "Error %ld unmapping (%d) segment#%d\n", + rc, win, idx); + + map[idx] = IODA_INVALID_PE; + } +} + +static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe) +{ + struct pnv_phb *phb = pe->phb; + + if (phb->type == PNV_PHB_IODA1) { + pnv_ioda_free_pe_seg(pe, OPAL_IO_WINDOW_TYPE, + phb->ioda.io_segmap); + pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE, + phb->ioda.m32_segmap); + pnv_ioda_free_pe_seg(pe, OPAL_M64_WINDOW_TYPE, + phb->ioda.m64_segmap); + } else if (phb->type == PNV_PHB_IODA2) { + pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE, + phb->ioda.m32_segmap); + } +} + +static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe) +{ + struct pnv_phb *phb = pe->phb; + struct pnv_ioda_pe *slave, *tmp; + + list_del(&pe->list); + switch (phb->type) { + case PNV_PHB_IODA1: + pnv_pci_ioda1_release_pe_dma(pe); + break; + case PNV_PHB_IODA2: + pnv_pci_ioda2_release_pe_dma(pe); + break; + default: + WARN_ON(1); + } + + pnv_ioda_release_pe_seg(pe); + pnv_ioda_deconfigure_pe(pe->phb, pe); + + /* Release slave PEs in the compound PE */ + if (pe->flags & PNV_IODA_PE_MASTER) { + list_for_each_entry_safe(slave, tmp, &pe->slaves, list) { + list_del(&slave->list); + pnv_ioda_free_pe(slave); + } + } + + /* + * The PE for root bus can be removed because of hotplug in EEH + * recovery for fenced PHB error. We need to mark the PE dead so + * that it can be populated again in PCI hot add path. The PE + * shouldn't be destroyed as it's the global reserved resource. + */ + if (phb->ioda.root_pe_populated && + phb->ioda.root_pe_idx == pe->pe_number) + phb->ioda.root_pe_populated = false; + else + pnv_ioda_free_pe(pe); +} + +static void pnv_pci_release_device(struct pci_dev *pdev) +{ + struct pci_controller *hose = pci_bus_to_host(pdev->bus); + struct pnv_phb *phb = hose->private_data; + struct pci_dn *pdn = pci_get_pdn(pdev); + struct pnv_ioda_pe *pe; + + if (pdev->is_virtfn) + return; + + if (!pdn || pdn->pe_number == IODA_INVALID_PE) + return; + + /* + * PCI hotplug can happen as part of EEH error recovery. The @pdn + * isn't removed and added afterwards in this scenario. We should + * set the PE number in @pdn to an invalid one. Otherwise, the PE's + * device count is decreased on removing devices while failing to + * be increased on adding devices. It leads to unbalanced PE's device + * count and eventually make normal PCI hotplug path broken. + */ + pe = &phb->ioda.pe_array[pdn->pe_number]; + pdn->pe_number = IODA_INVALID_PE; + + WARN_ON(--pe->device_count < 0); + if (pe->device_count == 0) + pnv_ioda_release_pe(pe); +} + static void pnv_pci_ioda_shutdown(struct pci_controller *hose) { struct pnv_phb *phb = hose->private_data; @@ -3383,7 +3491,9 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { .teardown_msi_irqs = pnv_teardown_msi_irqs, #endif .enable_device_hook = pnv_pci_enable_device_hook, + .release_device = pnv_pci_release_device, .window_alignment = pnv_pci_window_alignment, + .setup_bridge = pnv_pci_setup_bridge, .reset_secondary_bus = pnv_pci_reset_secondary_bus, .dma_set_mask = pnv_pci_ioda_dma_set_mask, .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask, @@ -3411,6 +3521,26 @@ static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { .shutdown = pnv_pci_ioda_shutdown, }; +#ifdef CONFIG_CXL_BASE +const struct pci_controller_ops pnv_cxl_cx4_ioda_controller_ops = { + .dma_dev_setup = pnv_pci_dma_dev_setup, + .dma_bus_setup = pnv_pci_dma_bus_setup, +#ifdef CONFIG_PCI_MSI + .setup_msi_irqs = pnv_cxl_cx4_setup_msi_irqs, + .teardown_msi_irqs = pnv_cxl_cx4_teardown_msi_irqs, +#endif + .enable_device_hook = pnv_cxl_enable_device_hook, + .disable_device = pnv_cxl_disable_device, + .release_device = pnv_pci_release_device, + .window_alignment = pnv_pci_window_alignment, + .setup_bridge = pnv_pci_setup_bridge, + .reset_secondary_bus = pnv_pci_reset_secondary_bus, + .dma_set_mask = pnv_pci_ioda_dma_set_mask, + .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask, + .shutdown = pnv_pci_ioda_shutdown, +}; +#endif + static void __init pnv_pci_init_ioda_phb(struct device_node *np, u64 hub_id, int ioda_type) { @@ -3418,6 +3548,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, struct pnv_phb *phb; unsigned long size, m64map_off, m32map_off, pemap_off; unsigned long iomap_off = 0, dma32map_off = 0; + struct resource r; const __be64 *prop64; const __be32 *prop32; int len; @@ -3426,7 +3557,11 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, void *aux; long rc; - pr_info("Initializing IODA%d OPAL PHB %s\n", ioda_type, np->full_name); + if (!of_device_is_available(np)) + return; + + pr_info("Initializing %s PHB (%s)\n", + pnv_phb_names[ioda_type], of_node_full_name(np)); prop64 = of_get_property(np, "ibm,opal-phbid", NULL); if (!prop64) { @@ -3477,9 +3612,12 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, pci_process_bridge_OF_ranges(hose, np, !hose->global_number); /* Get registers */ - phb->regs = of_iomap(np, 0); - if (phb->regs == NULL) - pr_err(" Failed to map registers !\n"); + if (!of_address_to_resource(np, 0, &r)) { + phb->regs_phys = r.start; + phb->regs = ioremap(r.start, resource_size(&r)); + if (phb->regs == NULL) + pr_err(" Failed to map registers !\n"); + } /* Initialize more IODA stuff */ phb->ioda.total_pe_num = 1; @@ -3490,6 +3628,10 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, if (prop32) phb->ioda.reserved_pe_idx = be32_to_cpup(prop32); + /* Invalidate RID to PE# mapping */ + for (segno = 0; segno < ARRAY_SIZE(phb->ioda.pe_rmap); segno++) + phb->ioda.pe_rmap[segno] = IODA_INVALID_PE; + /* Parse 64-bit MMIO range */ pnv_ioda_parse_m64_window(phb); @@ -3541,7 +3683,22 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->ioda.dma32_segmap[segno] = IODA_INVALID_PE; } phb->ioda.pe_array = aux + pemap_off; - set_bit(phb->ioda.reserved_pe_idx, phb->ioda.pe_alloc); + + /* + * Choose PE number for root bus, which shouldn't have + * M64 resources consumed by its child devices. To pick + * the PE number adjacent to the reserved one if possible. + */ + pnv_ioda_reserve_pe(phb, phb->ioda.reserved_pe_idx); + if (phb->ioda.reserved_pe_idx == 0) { + phb->ioda.root_pe_idx = 1; + pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx); + } else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1)) { + phb->ioda.root_pe_idx = phb->ioda.reserved_pe_idx - 1; + pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx); + } else { + phb->ioda.root_pe_idx = IODA_INVALID_PE; + } INIT_LIST_HEAD(&phb->ioda.pe_list); mutex_init(&phb->ioda.pe_list_mutex); diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 1d92bd93b..a21d831c1 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -36,8 +37,124 @@ #include "powernv.h" #include "pci.h" -/* Delay in usec */ -#define PCI_RESET_DELAY_US 3000000 +int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id) +{ + struct device_node *parent = np; + u32 bdfn; + u64 phbid; + int ret; + + ret = of_property_read_u32(np, "reg", &bdfn); + if (ret) + return -ENXIO; + + bdfn = ((bdfn & 0x00ffff00) >> 8); + while ((parent = of_get_parent(parent))) { + if (!PCI_DN(parent)) { + of_node_put(parent); + break; + } + + if (!of_device_is_compatible(parent, "ibm,ioda2-phb")) { + of_node_put(parent); + continue; + } + + ret = of_property_read_u64(parent, "ibm,opal-phbid", &phbid); + if (ret) { + of_node_put(parent); + return -ENXIO; + } + + *id = PCI_SLOT_ID(phbid, bdfn); + return 0; + } + + return -ENODEV; +} +EXPORT_SYMBOL_GPL(pnv_pci_get_slot_id); + +int pnv_pci_get_device_tree(uint32_t phandle, void *buf, uint64_t len) +{ + int64_t rc; + + if (!opal_check_token(OPAL_GET_DEVICE_TREE)) + return -ENXIO; + + rc = opal_get_device_tree(phandle, (uint64_t)buf, len); + if (rc < OPAL_SUCCESS) + return -EIO; + + return rc; +} +EXPORT_SYMBOL_GPL(pnv_pci_get_device_tree); + +int pnv_pci_get_presence_state(uint64_t id, uint8_t *state) +{ + int64_t rc; + + if (!opal_check_token(OPAL_PCI_GET_PRESENCE_STATE)) + return -ENXIO; + + rc = opal_pci_get_presence_state(id, (uint64_t)state); + if (rc != OPAL_SUCCESS) + return -EIO; + + return 0; +} +EXPORT_SYMBOL_GPL(pnv_pci_get_presence_state); + +int pnv_pci_get_power_state(uint64_t id, uint8_t *state) +{ + int64_t rc; + + if (!opal_check_token(OPAL_PCI_GET_POWER_STATE)) + return -ENXIO; + + rc = opal_pci_get_power_state(id, (uint64_t)state); + if (rc != OPAL_SUCCESS) + return -EIO; + + return 0; +} +EXPORT_SYMBOL_GPL(pnv_pci_get_power_state); + +int pnv_pci_set_power_state(uint64_t id, uint8_t state, struct opal_msg *msg) +{ + struct opal_msg m; + int token, ret; + int64_t rc; + + if (!opal_check_token(OPAL_PCI_SET_POWER_STATE)) + return -ENXIO; + + token = opal_async_get_token_interruptible(); + if (unlikely(token < 0)) + return token; + + rc = opal_pci_set_power_state(token, id, (uint64_t)&state); + if (rc == OPAL_SUCCESS) { + ret = 0; + goto exit; + } else if (rc != OPAL_ASYNC_COMPLETION) { + ret = -EIO; + goto exit; + } + + ret = opal_async_wait_response(token, &m); + if (ret < 0) + goto exit; + + if (msg) { + ret = 1; + memcpy(msg, &m, sizeof(m)); + } + +exit: + opal_async_release_token(token); + return ret; +} +EXPORT_SYMBOL_GPL(pnv_pci_set_power_state); #ifdef CONFIG_PCI_MSI int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) @@ -587,7 +704,7 @@ static __be64 *pnv_tce(struct iommu_table *tbl, long idx) int pnv_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, - struct dma_attrs *attrs) + unsigned long attrs) { u64 proto_tce = iommu_direction_to_tce_perm(direction); u64 rpn = __pa(uaddr) >> tbl->it_page_shift; @@ -620,8 +737,8 @@ int pnv_tce_xchg(struct iommu_table *tbl, long index, if (newtce & TCE_PCI_WRITE) newtce |= TCE_PCI_READ; - oldtce = xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce)); - *hpa = be64_to_cpu(oldtce) & ~(TCE_PCI_READ | TCE_PCI_WRITE); + oldtce = be64_to_cpu(xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce))); + *hpa = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE); *direction = iommu_tce_direction(oldtce); return 0; @@ -815,13 +932,14 @@ void __init pnv_pci_init(void) for_each_compatible_node(np, NULL, "ibm,ioda2-phb") pnv_pci_init_ioda2_phb(np); + /* Look for ioda3 built-in PHB4's, we treat them as IODA2 */ + for_each_compatible_node(np, NULL, "ibm,ioda3-phb") + pnv_pci_init_ioda2_phb(np); + /* Look for NPU PHBs */ for_each_compatible_node(np, NULL, "ibm,ioda2-npu-phb") pnv_pci_init_npu_phb(np); - /* Setup the linkage between OF nodes and PHBs */ - pci_devs_phb_init(); - /* Configure IOMMU DMA hooks */ set_pci_dma_ops(&dma_iommu_ops); } diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 7dee25e30..e64df7894 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -1,6 +1,10 @@ #ifndef __POWERNV_PCI_H #define __POWERNV_PCI_H +#include +#include +#include + struct pci_dn; enum pnv_phb_type { @@ -30,6 +34,7 @@ struct pnv_phb; struct pnv_ioda_pe { unsigned long flags; struct pnv_phb *phb; + int device_count; /* A PE can be associated with a single device or an * entire bus (& children). In the former case, pdev @@ -71,6 +76,7 @@ struct pnv_ioda_pe { }; #define PNV_PHB_FLAG_EEH (1 << 0) +#define PNV_PHB_FLAG_CXL (1 << 1) /* Real PHB supporting the cxl kernel API */ struct pnv_phb { struct pci_controller *hose; @@ -80,6 +86,7 @@ struct pnv_phb { u64 opal_id; int flags; void __iomem *regs; + u64 regs_phys; int initialized; spinlock_t lock; @@ -110,6 +117,8 @@ struct pnv_phb { /* Global bridge info */ unsigned int total_pe_num; unsigned int reserved_pe_idx; + unsigned int root_pe_idx; + bool root_pe_populated; /* 32-bit MMIO window */ unsigned int m32_size; @@ -152,17 +161,8 @@ struct pnv_phb { struct list_head pe_list; struct mutex pe_list_mutex; - /* Reverse map of PEs, will have to extend if - * we are to support more than 256 PEs, indexed - * bus { bus, devfn } - */ - unsigned char pe_rmap[0x10000]; - - /* TCE cache invalidate registers (physical and - * remapped) - */ - phys_addr_t tce_inval_reg_phys; - __be64 __iomem *tce_inval_reg; + /* Reverse map of PEs, indexed by {bus, devfn} */ + unsigned int pe_rmap[0x10000]; } ioda; /* PHB and hub status structure */ @@ -173,12 +173,15 @@ struct pnv_phb { struct OpalIoP7IOCErrorData hub_diag; } diag; +#ifdef CONFIG_CXL_BASE + struct cxl_afu *cxl_afu; +#endif }; extern struct pci_ops pnv_pci_ops; extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, - struct dma_attrs *attrs); + unsigned long attrs); extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); extern int pnv_tce_xchg(struct iommu_table *tbl, long index, unsigned long *hpa, enum dma_data_direction *direction); @@ -203,8 +206,6 @@ extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, extern void pnv_pci_init_ioda_hub(struct device_node *np); extern void pnv_pci_init_ioda2_phb(struct device_node *np); extern void pnv_pci_init_npu_phb(struct device_node *np); -extern void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl, - __be64 *startp, __be64 *endp, bool rm); extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev); extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option); @@ -212,6 +213,9 @@ extern void pnv_pci_dma_dev_setup(struct pci_dev *pdev); extern void pnv_pci_dma_bus_setup(struct pci_bus *bus); extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type); extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); +extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev); +extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq); +extern bool pnv_pci_enable_device_hook(struct pci_dev *dev); extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...); @@ -224,7 +228,7 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, /* Nvlink functions */ extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass); -extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm); +extern void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm); extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe); extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, struct iommu_table *tbl); @@ -232,4 +236,15 @@ extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num); extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe); extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe); + +/* cxl functions */ +extern bool pnv_cxl_enable_device_hook(struct pci_dev *dev); +extern void pnv_cxl_disable_device(struct pci_dev *dev); +extern int pnv_cxl_cx4_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type); +extern void pnv_cxl_cx4_teardown_msi_irqs(struct pci_dev *pdev); + + +/* phb ops (cxl switches these when enabling the kernel api on the phb) */ +extern const struct pci_controller_ops pnv_cxl_cx4_ioda_controller_ops; + #endif /* __POWERNV_PCI_H */ diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h index 6dbc0a1da..da7c843ac 100644 --- a/arch/powerpc/platforms/powernv/powernv.h +++ b/arch/powerpc/platforms/powernv/powernv.h @@ -18,6 +18,7 @@ static inline void pnv_pci_shutdown(void) { } #endif extern u32 pnv_get_supported_cpuidle_states(void); +extern u64 pnv_deepest_stop_state; extern void pnv_lpc_init(void); diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index ee6430bed..efe8b6bb1 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -58,7 +58,7 @@ static void __init pnv_setup_arch(void) /* XXX PMCS */ } -static void __init pnv_init_early(void) +static void __init pnv_init(void) { /* * Initialize the LPC bus now so that legacy serial @@ -268,21 +268,16 @@ static void __init pnv_setup_machdep_opal(void) static int __init pnv_probe(void) { - unsigned long root = of_get_flat_dt_root(); - - if (!of_flat_dt_is_compatible(root, "ibm,powernv")) + if (!of_machine_is_compatible("ibm,powernv")) return 0; - if (IS_ENABLED(CONFIG_PPC_RADIX_MMU) && radix_enabled()) - radix_init_native(); - else if (IS_ENABLED(CONFIG_PPC_STD_MMU_64)) - hpte_init_native(); - if (firmware_has_feature(FW_FEATURE_OPAL)) pnv_setup_machdep_opal(); pr_debug("PowerNV detected !\n"); + pnv_init(); + return 1; } @@ -308,14 +303,13 @@ static unsigned long pnv_get_proc_freq(unsigned int cpu) define_machine(powernv) { .name = "PowerNV", .probe = pnv_probe, - .init_early = pnv_init_early, .setup_arch = pnv_setup_arch, .init_IRQ = pnv_init_IRQ, .show_cpuinfo = pnv_show_cpuinfo, .get_proc_freq = pnv_get_proc_freq, .progress = pnv_progress, .machine_shutdown = pnv_shutdown, - .power_save = power7_idle, + .power_save = NULL, .calibrate_decr = generic_calibrate_decr, #ifdef CONFIG_KEXEC .kexec_cpu_down = pnv_kexec_cpu_down, diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index ad7b1a3db..c789258ae 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -182,7 +182,9 @@ static void pnv_smp_cpu_kill_self(void) ppc64_runlatch_off(); - if (idle_states & OPAL_PM_WINKLE_ENABLED) + if (cpu_has_feature(CPU_FTR_ARCH_300)) + srr1 = power9_idle_stop(pnv_deepest_stop_state); + else if (idle_states & OPAL_PM_WINKLE_ENABLED) srr1 = power7_winkle(); else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) -- cgit v1.2.3-54-g00ecf