diff options
Diffstat (limited to 'arch/powerpc')
179 files changed, 6456 insertions, 3164 deletions
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7cd32c038..0a9d439bc 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -116,6 +116,8 @@ config PPC select GENERIC_ATOMIC64 if PPC32 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select HAVE_PERF_EVENTS + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64 select ARCH_WANT_IPC_PARSE_VERSION @@ -126,7 +128,7 @@ config PPC select IRQ_FORCED_THREADING select HAVE_RCU_TABLE_FREE if SMP select HAVE_SYSCALL_TRACEPOINTS - select HAVE_BPF_JIT + select HAVE_CBPF_JIT if CPU_BIG_ENDIAN select HAVE_ARCH_JUMP_LABEL select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_HAS_GCOV_PROFILE_ALL @@ -153,6 +155,7 @@ config PPC select NO_BOOTMEM select HAVE_GENERIC_RCU_GUP select HAVE_PERF_EVENTS_NMI if PPC64 + select HAVE_NMI if PERF_EVENTS select EDAC_SUPPORT select EDAC_ATOMIC_SCRUB select ARCH_HAS_DMA_SET_COHERENT_MASK @@ -160,6 +163,7 @@ config PPC select HAVE_ARCH_SECCOMP_FILTER select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT + select HAVE_LIVEPATCH if HAVE_DYNAMIC_FTRACE_WITH_REGS config GENERIC_CSUM def_bool CPU_LITTLE_ENDIAN @@ -605,9 +609,9 @@ endchoice config FORCE_MAX_ZONEORDER int "Maximum zone order" - range 9 64 if PPC64 && PPC_64K_PAGES + range 8 9 if PPC64 && PPC_64K_PAGES default "9" if PPC64 && PPC_64K_PAGES - range 13 64 if PPC64 && !PPC_64K_PAGES + range 9 13 if PPC64 && !PPC_64K_PAGES default "13" if PPC64 && !PPC_64K_PAGES range 9 64 if PPC32 && PPC_16K_PAGES default "9" if PPC32 && PPC_16K_PAGES @@ -794,7 +798,6 @@ config 4xx_SOC config FSL_LBC bool "Freescale Local Bus support" - depends on FSL_SOC help Enables reporting of errors from the Freescale local bus controller. Also contains some common code used by @@ -1107,3 +1110,5 @@ config PPC_LIB_RHEAP bool source "arch/powerpc/kvm/Kconfig" + +source "kernel/livepatch/Kconfig" diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 638f9ce74..d3fcf7e64 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -19,14 +19,6 @@ config PPC_WERROR depends on !PPC_DISABLE_WERROR default y -config STRICT_MM_TYPECHECKS - bool "Do extra type checking on mm types" - default n - help - This option turns on extra type checking for some mm related types. - - If you don't know what this means, say N. - config PRINT_STACK_DEPTH int "Stack depth to print" if DEBUG_KERNEL default 64 diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 611651013..8fe78a3ef 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -362,9 +362,6 @@ $(obj)/cuImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits) $(obj)/cuImage.%: vmlinux $(obj)/%.dtb $(wrapperbits) $(call if_changed,wrap,cuboot-$*,,$(obj)/$*.dtb) -$(obj)/cuImage.%: vmlinux $(obj)/fsl/%.dtb $(wrapperbits) - $(call if_changed,wrap,cuboot-$*,,$(obj)/fsl/$*.dtb) - $(obj)/simpleImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits) $(call if_changed,wrap,simpleboot-$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz) @@ -381,6 +378,9 @@ $(obj)/treeImage.%: vmlinux $(obj)/%.dtb $(wrapperbits) $(obj)/%.dtb: $(src)/dts/%.dts FORCE $(call if_changed_dep,dtc) +$(obj)/%.dtb: $(src)/dts/fsl/%.dts FORCE + $(call if_changed_dep,dtc) + # If there isn't a platform selected then just strip the vmlinux. ifeq (,$(image-y)) image-y := vmlinux.strip diff --git a/arch/powerpc/boot/dts/canyonlands.dts b/arch/powerpc/boot/dts/canyonlands.dts index 3dc75deaf..549c24c4c 100644 --- a/arch/powerpc/boot/dts/canyonlands.dts +++ b/arch/powerpc/boot/dts/canyonlands.dts @@ -190,12 +190,21 @@ /* DMA */ 0x2 &UIC0 0xc 0x4>; }; + AHBDMA: dma@bffd0800 { + compatible = "snps,dma-spear1340"; + reg = <4 0xbffd0800 0x400>; + interrupt-parent = <&UIC3>; + interrupts = <0x5 0x4>; + #dma-cells = <3>; + }; + SATA0: sata@bffd1000 { compatible = "amcc,sata-460ex"; - reg = <4 0xbffd1000 0x800 4 0xbffd0800 0x400>; + reg = <4 0xbffd1000 0x800>; interrupt-parent = <&UIC3>; - interrupts = <0x0 0x4 /* SATA */ - 0x5 0x4>; /* AHBDMA */ + interrupts = <0x0 0x4>; + dmas = <&AHBDMA 0 1 0>; + dma-names = "sata-dma"; }; POB0: opb { diff --git a/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts b/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts index 0424fc2bd..c88d4ef9e 100644 --- a/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts +++ b/arch/powerpc/boot/dts/fsl/gef_ppc9a.dts @@ -211,6 +211,10 @@ 0x0 0x00400000>; }; }; + + pci1: pcie@fef09000 { + status = "disabled"; + }; }; /include/ "mpc8641si-post.dtsi" diff --git a/arch/powerpc/boot/dts/fsl/gef_sbc310.dts b/arch/powerpc/boot/dts/fsl/gef_sbc310.dts index 84b3d38f8..838515798 100644 --- a/arch/powerpc/boot/dts/fsl/gef_sbc310.dts +++ b/arch/powerpc/boot/dts/fsl/gef_sbc310.dts @@ -24,10 +24,6 @@ model = "GEF_SBC310"; compatible = "gef,sbc310"; - aliases { - pci1 = &pci1; - }; - memory { device_type = "memory"; reg = <0x0 0x40000000>; // set by uboot @@ -223,29 +219,11 @@ }; pci1: pcie@fef09000 { - compatible = "fsl,mpc8641-pcie"; - device_type = "pci"; - #size-cells = <2>; - #address-cells = <3>; reg = <0xfef09000 0x1000>; - bus-range = <0x0 0xff>; ranges = <0x02000000 0x0 0xc0000000 0xc0000000 0x0 0x20000000 0x01000000 0x0 0x00000000 0xfe400000 0x0 0x00400000>; - clock-frequency = <100000000>; - interrupts = <0x19 0x2 0 0>; - interrupt-map-mask = <0xf800 0x0 0x0 0x7>; - interrupt-map = < - 0x0000 0x0 0x0 0x1 &mpic 0x4 0x2 - 0x0000 0x0 0x0 0x2 &mpic 0x5 0x2 - 0x0000 0x0 0x0 0x3 &mpic 0x6 0x2 - 0x0000 0x0 0x0 0x4 &mpic 0x7 0x2 - >; pcie@0 { - reg = <0 0 0 0 0>; - #size-cells = <2>; - #address-cells = <3>; - device_type = "pci"; ranges = <0x02000000 0x0 0xc0000000 0x02000000 0x0 0xc0000000 0x0 0x20000000 diff --git a/arch/powerpc/boot/dts/fsl/gef_sbc610.dts b/arch/powerpc/boot/dts/fsl/gef_sbc610.dts index 974446acc..ff423ab42 100644 --- a/arch/powerpc/boot/dts/fsl/gef_sbc610.dts +++ b/arch/powerpc/boot/dts/fsl/gef_sbc610.dts @@ -209,6 +209,10 @@ 0x0 0x00400000>; }; }; + + pci1: pcie@fef09000 { + status = "disabled"; + }; }; /include/ "mpc8641si-post.dtsi" diff --git a/arch/powerpc/boot/dts/fsl/mpc8641_hpcn.dts b/arch/powerpc/boot/dts/fsl/mpc8641_hpcn.dts index 554001f2e..11bea3e6a 100644 --- a/arch/powerpc/boot/dts/fsl/mpc8641_hpcn.dts +++ b/arch/powerpc/boot/dts/fsl/mpc8641_hpcn.dts @@ -15,10 +15,6 @@ model = "MPC8641HPCN"; compatible = "fsl,mpc8641hpcn"; - aliases { - pci1 = &pci1; - }; - memory { device_type = "memory"; reg = <0x00000000 0x40000000>; // 1G at 0x0 @@ -359,29 +355,11 @@ }; pci1: pcie@ffe09000 { - compatible = "fsl,mpc8641-pcie"; - device_type = "pci"; - #size-cells = <2>; - #address-cells = <3>; reg = <0xffe09000 0x1000>; - bus-range = <0 0xff>; ranges = <0x02000000 0x0 0xa0000000 0xa0000000 0x0 0x20000000 0x01000000 0x0 0x00000000 0xffc10000 0x0 0x00010000>; - clock-frequency = <100000000>; - interrupts = <25 2 0 0>; - interrupt-map-mask = <0xf800 0 0 7>; - interrupt-map = < - /* IDSEL 0x0 */ - 0x0000 0 0 1 &mpic 4 1 - 0x0000 0 0 2 &mpic 5 1 - 0x0000 0 0 3 &mpic 6 1 - 0x0000 0 0 4 &mpic 7 1 - >; + pcie@0 { - reg = <0 0 0 0 0>; - #size-cells = <2>; - #address-cells = <3>; - device_type = "pci"; ranges = <0x02000000 0x0 0xa0000000 0x02000000 0x0 0xa0000000 0x0 0x20000000 diff --git a/arch/powerpc/boot/dts/fsl/mpc8641_hpcn_36b.dts b/arch/powerpc/boot/dts/fsl/mpc8641_hpcn_36b.dts index fec58671a..7ff62046a 100644 --- a/arch/powerpc/boot/dts/fsl/mpc8641_hpcn_36b.dts +++ b/arch/powerpc/boot/dts/fsl/mpc8641_hpcn_36b.dts @@ -17,10 +17,6 @@ #address-cells = <2>; #size-cells = <2>; - aliases { - pci1 = &pci1; - }; - memory { device_type = "memory"; reg = <0x0 0x00000000 0x0 0x40000000>; // 1G at 0x0 @@ -326,29 +322,11 @@ }; pci1: pcie@fffe09000 { - compatible = "fsl,mpc8641-pcie"; - device_type = "pci"; - #size-cells = <2>; - #address-cells = <3>; reg = <0x0f 0xffe09000 0x0 0x1000>; - bus-range = <0x0 0xff>; ranges = <0x02000000 0x0 0xe0000000 0x0c 0x20000000 0x0 0x20000000 0x01000000 0x0 0x00000000 0x0f 0xffc10000 0x0 0x00010000>; - clock-frequency = <100000000>; - interrupts = <25 2 0 0>; - interrupt-map-mask = <0xf800 0 0 7>; - interrupt-map = < - /* IDSEL 0x0 */ - 0x0000 0 0 1 &mpic 4 1 - 0x0000 0 0 2 &mpic 5 1 - 0x0000 0 0 3 &mpic 6 1 - 0x0000 0 0 4 &mpic 7 1 - >; + pcie@0 { - reg = <0 0 0 0 0>; - #size-cells = <2>; - #address-cells = <3>; - device_type = "pci"; ranges = <0x02000000 0x0 0xe0000000 0x02000000 0x0 0xe0000000 0x0 0x20000000 diff --git a/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi b/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi index 70889d8e8..eeb7c65d5 100644 --- a/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi @@ -102,19 +102,46 @@ bus-range = <0x0 0xff>; clock-frequency = <100000000>; interrupts = <24 2 0 0>; - interrupt-map-mask = <0xf800 0x0 0x0 0x7>; - interrupt-map = < - 0x0000 0x0 0x0 0x1 &mpic 0x0 0x1 - 0x0000 0x0 0x0 0x2 &mpic 0x1 0x1 - 0x0000 0x0 0x0 0x3 &mpic 0x2 0x1 - 0x0000 0x0 0x0 0x4 &mpic 0x3 0x1 - >; + pcie@0 { + reg = <0 0 0 0 0>; + #interrupt-cells = <1>; + #size-cells = <2>; + #address-cells = <3>; + device_type = "pci"; + interrupts = <24 2 0 0>; + interrupt-map-mask = <0xf800 0x0 0x0 0x7>; + interrupt-map = < + 0x0000 0x0 0x0 0x1 &mpic 0x0 0x1 0x0 0x0 + 0x0000 0x0 0x0 0x2 &mpic 0x1 0x1 0x0 0x0 + 0x0000 0x0 0x0 0x3 &mpic 0x2 0x1 0x0 0x0 + 0x0000 0x0 0x0 0x4 &mpic 0x3 0x1 0x0 0x0 + >; + }; +}; + +&pci1 { + compatible = "fsl,mpc8641-pcie"; + device_type = "pci"; + #size-cells = <2>; + #address-cells = <3>; + bus-range = <0x0 0xff>; + clock-frequency = <100000000>; + interrupts = <25 2 0 0>; pcie@0 { reg = <0 0 0 0 0>; + #interrupt-cells = <1>; #size-cells = <2>; #address-cells = <3>; device_type = "pci"; + interrupts = <25 2 0 0>; + interrupt-map-mask = <0xf800 0x0 0x0 0x7>; + interrupt-map = < + 0x0000 0x0 0x0 0x1 &mpic 0x4 0x1 0x0 0x0 + 0x0000 0x0 0x0 0x2 &mpic 0x5 0x1 0x0 0x0 + 0x0000 0x0 0x0 0x3 &mpic 0x6 0x1 0x0 0x0 + 0x0000 0x0 0x0 0x4 &mpic 0x7 0x1 0x0 0x0 + >; }; }; diff --git a/arch/powerpc/boot/dts/fsl/mpc8641si-pre.dtsi b/arch/powerpc/boot/dts/fsl/mpc8641si-pre.dtsi index 9e0332856..7c6db6f7c 100644 --- a/arch/powerpc/boot/dts/fsl/mpc8641si-pre.dtsi +++ b/arch/powerpc/boot/dts/fsl/mpc8641si-pre.dtsi @@ -25,6 +25,7 @@ serial0 = &serial0; serial1 = &serial1; pci0 = &pci0; + pci1 = &pci1; }; cpus { diff --git a/arch/powerpc/boot/dts/fsl/sbc8641d.dts b/arch/powerpc/boot/dts/fsl/sbc8641d.dts index 0a9733cd4..75870a124 100644 --- a/arch/powerpc/boot/dts/fsl/sbc8641d.dts +++ b/arch/powerpc/boot/dts/fsl/sbc8641d.dts @@ -19,10 +19,6 @@ model = "SBC8641D"; compatible = "wind,sbc8641"; - aliases { - pci1 = &pci1; - }; - memory { device_type = "memory"; reg = <0x00000000 0x20000000>; // 512M at 0x0 @@ -165,30 +161,11 @@ }; pci1: pcie@f8009000 { - compatible = "fsl,mpc8641-pcie"; - device_type = "pci"; - #size-cells = <2>; - #address-cells = <3>; reg = <0xf8009000 0x1000>; - bus-range = <0 0xff>; ranges = <0x02000000 0x0 0xa0000000 0xa0000000 0x0 0x20000000 0x01000000 0x0 0x00000000 0xe3000000 0x0 0x00100000>; - clock-frequency = <100000000>; - interrupts = <25 2 0 0>; - interrupt-map-mask = <0xf800 0 0 7>; - interrupt-map = < - /* IDSEL 0x0 */ - 0x0000 0 0 1 &mpic 4 1 - 0x0000 0 0 2 &mpic 5 1 - 0x0000 0 0 3 &mpic 6 1 - 0x0000 0 0 4 &mpic 7 1 - >; pcie@0 { - reg = <0 0 0 0 0>; - #size-cells = <2>; - #address-cells = <3>; - device_type = "pci"; ranges = <0x02000000 0x0 0xa0000000 0x02000000 0x0 0xa0000000 0x0 0x20000000 diff --git a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi index 99e421df7..6e0b4892a 100644 --- a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi @@ -263,7 +263,7 @@ }; rcpm: global-utilities@e2000 { - compatible = "fsl,t1023-rcpm", "fsl,qoriq-rcpm-2.0"; + compatible = "fsl,t1023-rcpm", "fsl,qoriq-rcpm-2.1"; reg = <0xe2000 0x1000>; }; diff --git a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi index e0f4da554..507649ece 100644 --- a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi @@ -472,7 +472,7 @@ }; rcpm: global-utilities@e2000 { - compatible = "fsl,t1040-rcpm", "fsl,qoriq-rcpm-2.0"; + compatible = "fsl,t1040-rcpm", "fsl,qoriq-rcpm-2.1"; reg = <0xe2000 0x1000>; }; diff --git a/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi b/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi index 72691ef10..7c4afdb44 100644 --- a/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi +++ b/arch/powerpc/boot/dts/fsl/t104xrdb.dtsi @@ -109,7 +109,7 @@ flash@0 { #address-cells = <1>; #size-cells = <1>; - compatible = "micron,n25q512a", "jedec,spi-nor"; + compatible = "micron,n25q512ax3", "jedec,spi-nor"; reg = <0>; spi-max-frequency = <10000000>; /* input clock */ }; diff --git a/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi b/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi index dc9326875..ff87e67c7 100644 --- a/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi +++ b/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi @@ -113,7 +113,7 @@ flash@0 { #address-cells = <1>; #size-cells = <1>; - compatible = "micron,n25q512a", "jedec,spi-nor"; + compatible = "micron,n25q512ax3", "jedec,spi-nor"; reg = <0>; spi-max-frequency = <10000000>; /* input clock */ }; diff --git a/arch/powerpc/include/asm/book3s/32/hash.h b/arch/powerpc/include/asm/book3s/32/hash.h index 264b754d6..880db13a2 100644 --- a/arch/powerpc/include/asm/book3s/32/hash.h +++ b/arch/powerpc/include/asm/book3s/32/hash.h @@ -39,8 +39,5 @@ #define _PMD_PRESENT_MASK (PAGE_MASK) #define _PMD_BAD (~PAGE_MASK) -/* Hash table based platforms need atomic updates of the linux PTE */ -#define PTE_ATOMIC_UPDATES 1 - #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_BOOK3S_32_HASH_H */ diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h index 16f513e5c..b82e06349 100644 --- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h @@ -1,5 +1,5 @@ -#ifndef _ASM_POWERPC_MMU_HASH32_H_ -#define _ASM_POWERPC_MMU_HASH32_H_ +#ifndef _ASM_POWERPC_BOOK3S_32_MMU_HASH_H_ +#define _ASM_POWERPC_BOOK3S_32_MMU_HASH_H_ /* * 32-bit hash table MMU support */ @@ -90,4 +90,4 @@ typedef struct { #define mmu_virtual_psize MMU_PAGE_4K #define mmu_linear_psize MMU_PAGE_256M -#endif /* _ASM_POWERPC_MMU_HASH32_H_ */ +#endif /* _ASM_POWERPC_BOOK3S_32_MMU_HASH_H_ */ diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h new file mode 100644 index 000000000..8e21bb492 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h @@ -0,0 +1,108 @@ +#ifndef _ASM_POWERPC_BOOK3S_32_PGALLOC_H +#define _ASM_POWERPC_BOOK3S_32_PGALLOC_H + +#include <linux/threads.h> + +/* For 32-bit, all levels of page tables are just drawn from get_free_page() */ +#define MAX_PGTABLE_INDEX_SIZE 0 + +extern void __bad_pte(pmd_t *pmd); + +extern pgd_t *pgd_alloc(struct mm_struct *mm); +extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); + +/* + * We don't have any real pmd's, and this code never triggers because + * the pgd will always be present.. + */ +/* #define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) */ +#define pmd_free(mm, x) do { } while (0) +#define __pmd_free_tlb(tlb,x,a) do { } while (0) +/* #define pgd_populate(mm, pmd, pte) BUG() */ + +#ifndef CONFIG_BOOKE + +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, + pte_t *pte) +{ + *pmdp = __pmd(__pa(pte) | _PMD_PRESENT); +} + +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pte_page) +{ + *pmdp = __pmd((page_to_pfn(pte_page) << PAGE_SHIFT) | _PMD_PRESENT); +} + +#define pmd_pgtable(pmd) pmd_page(pmd) +#else + +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, + pte_t *pte) +{ + *pmdp = __pmd((unsigned long)pte | _PMD_PRESENT); +} + +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pte_page) +{ + *pmdp = __pmd((unsigned long)lowmem_page_address(pte_page) | _PMD_PRESENT); +} + +#define pmd_pgtable(pmd) pmd_page(pmd) +#endif + +extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr); +extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr); + +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ + free_page((unsigned long)pte); +} + +static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) +{ + pgtable_page_dtor(ptepage); + __free_page(ptepage); +} + +static inline void pgtable_free(void *table, unsigned index_size) +{ + BUG_ON(index_size); /* 32-bit doesn't use this */ + free_page((unsigned long)table); +} + +#define check_pgt_cache() do { } while (0) + +#ifdef CONFIG_SMP +static inline void pgtable_free_tlb(struct mmu_gather *tlb, + void *table, int shift) +{ + unsigned long pgf = (unsigned long)table; + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); + pgf |= shift; + tlb_remove_table(tlb, (void *)pgf); +} + +static inline void __tlb_remove_table(void *_table) +{ + void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); + unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; + + pgtable_free(table, shift); +} +#else +static inline void pgtable_free_tlb(struct mmu_gather *tlb, + void *table, int shift) +{ + pgtable_free(table, shift); +} +#endif + +static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, + unsigned long address) +{ + pgtable_page_dtor(table); + pgtable_free_tlb(tlb, page_address(table), 0); +} +#endif /* _ASM_POWERPC_BOOK3S_32_PGALLOC_H */ diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 5f08a0832..1af837c56 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -5,58 +5,31 @@ * for each page table entry. The PMD and PGD level use a 32b record for * each entry by assuming that each entry is page aligned. */ -#define PTE_INDEX_SIZE 9 -#define PMD_INDEX_SIZE 7 -#define PUD_INDEX_SIZE 9 -#define PGD_INDEX_SIZE 9 +#define H_PTE_INDEX_SIZE 9 +#define H_PMD_INDEX_SIZE 7 +#define H_PUD_INDEX_SIZE 9 +#define H_PGD_INDEX_SIZE 9 #ifndef __ASSEMBLY__ -#define PTE_TABLE_SIZE (sizeof(pte_t) << PTE_INDEX_SIZE) -#define PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE) -#define PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE) -#define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE) -#endif /* __ASSEMBLY__ */ - -#define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) -#define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) -#define PTRS_PER_PUD (1 << PUD_INDEX_SIZE) -#define PTRS_PER_PGD (1 << PGD_INDEX_SIZE) - -/* PMD_SHIFT determines what a second-level page table entry can map */ -#define PMD_SHIFT (PAGE_SHIFT + PTE_INDEX_SIZE) -#define PMD_SIZE (1UL << PMD_SHIFT) -#define PMD_MASK (~(PMD_SIZE-1)) +#define H_PTE_TABLE_SIZE (sizeof(pte_t) << H_PTE_INDEX_SIZE) +#define H_PMD_TABLE_SIZE (sizeof(pmd_t) << H_PMD_INDEX_SIZE) +#define H_PUD_TABLE_SIZE (sizeof(pud_t) << H_PUD_INDEX_SIZE) +#define H_PGD_TABLE_SIZE (sizeof(pgd_t) << H_PGD_INDEX_SIZE) /* With 4k base page size, hugepage PTEs go at the PMD level */ #define MIN_HUGEPTE_SHIFT PMD_SHIFT -/* PUD_SHIFT determines what a third-level page table entry can map */ -#define PUD_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE) -#define PUD_SIZE (1UL << PUD_SHIFT) -#define PUD_MASK (~(PUD_SIZE-1)) - -/* PGDIR_SHIFT determines what a fourth-level page table entry can map */ -#define PGDIR_SHIFT (PUD_SHIFT + PUD_INDEX_SIZE) -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE-1)) - -/* Bits to mask out from a PMD to get to the PTE page */ -#define PMD_MASKED_BITS 0 -/* Bits to mask out from a PUD to get to the PMD page */ -#define PUD_MASKED_BITS 0 -/* Bits to mask out from a PGD to get to the PUD page */ -#define PGD_MASKED_BITS 0 - /* PTE flags to conserve for HPTE identification */ -#define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_HASHPTE | \ - _PAGE_F_SECOND | _PAGE_F_GIX) - -/* shift to put page number into pte */ -#define PTE_RPN_SHIFT (12) -#define PTE_RPN_SIZE (45) /* gives 57-bit real addresses */ - -#define _PAGE_4K_PFN 0 -#ifndef __ASSEMBLY__ +#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE | \ + H_PAGE_F_SECOND | H_PAGE_F_GIX) +/* + * Not supported by 4k linux page size + */ +#define H_PAGE_4K_PFN 0x0 +#define H_PAGE_THP_HUGE 0x0 +#define H_PAGE_COMBO 0x0 +#define H_PTE_FRAG_NR 0 +#define H_PTE_FRAG_SIZE_SHIFT 0 /* * On all 4K setups, remap_4k_pfn() equates to remap_pfn_range() */ @@ -64,37 +37,76 @@ remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, (prot)) #ifdef CONFIG_HUGETLB_PAGE -/* - * For 4k page size, we support explicit hugepage via hugepd - */ -static inline int pmd_huge(pmd_t pmd) +static inline int hash__hugepd_ok(hugepd_t hpd) +{ + /* + * if it is not a pte and have hugepd shift mask + * set, then it is a hugepd directory pointer + */ + if (!(hpd.pd & _PAGE_PTE) && + ((hpd.pd & HUGEPD_SHIFT_MASK) != 0)) + return true; + return false; +} +#endif + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +static inline char *get_hpte_slot_array(pmd_t *pmdp) +{ + BUG(); + return NULL; +} + +static inline unsigned int hpte_valid(unsigned char *hpte_slot_array, int index) { + BUG(); return 0; } -static inline int pud_huge(pud_t pud) +static inline unsigned int hpte_hash_index(unsigned char *hpte_slot_array, + int index) { + BUG(); return 0; } -static inline int pgd_huge(pgd_t pgd) +static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array, + unsigned int index, unsigned int hidx) +{ + BUG(); +} + +static inline int hash__pmd_trans_huge(pmd_t pmd) { return 0; } -#define pgd_huge pgd_huge -static inline int hugepd_ok(hugepd_t hpd) +static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b) { - /* - * if it is not a pte and have hugepd shift mask - * set, then it is a hugepd directory pointer - */ - if (!(hpd.pd & _PAGE_PTE) && - ((hpd.pd & HUGEPD_SHIFT_MASK) != 0)) - return true; - return false; + BUG(); + return 0; } -#define is_hugepd(hpd) (hugepd_ok(hpd)) + +static inline pmd_t hash__pmd_mkhuge(pmd_t pmd) +{ + BUG(); + return pmd; +} + +extern unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp, + unsigned long clr, unsigned long set); +extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); +extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable); +extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); +extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); +extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp); +extern int hash__has_transparent_hugepage(void); #endif #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index 0a7956a80..5aae4f530 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -1,73 +1,44 @@ #ifndef _ASM_POWERPC_BOOK3S_64_HASH_64K_H #define _ASM_POWERPC_BOOK3S_64_HASH_64K_H -#define PTE_INDEX_SIZE 8 -#define PMD_INDEX_SIZE 5 -#define PUD_INDEX_SIZE 5 -#define PGD_INDEX_SIZE 12 - -#define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) -#define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) -#define PTRS_PER_PUD (1 << PUD_INDEX_SIZE) -#define PTRS_PER_PGD (1 << PGD_INDEX_SIZE) +#define H_PTE_INDEX_SIZE 8 +#define H_PMD_INDEX_SIZE 5 +#define H_PUD_INDEX_SIZE 5 +#define H_PGD_INDEX_SIZE 12 /* With 4k base page size, hugepage PTEs go at the PMD level */ #define MIN_HUGEPTE_SHIFT PAGE_SHIFT -/* PMD_SHIFT determines what a second-level page table entry can map */ -#define PMD_SHIFT (PAGE_SHIFT + PTE_INDEX_SIZE) -#define PMD_SIZE (1UL << PMD_SHIFT) -#define PMD_MASK (~(PMD_SIZE-1)) - -/* PUD_SHIFT determines what a third-level page table entry can map */ -#define PUD_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE) -#define PUD_SIZE (1UL << PUD_SHIFT) -#define PUD_MASK (~(PUD_SIZE-1)) - -/* PGDIR_SHIFT determines what a fourth-level page table entry can map */ -#define PGDIR_SHIFT (PUD_SHIFT + PUD_INDEX_SIZE) -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE-1)) - -#define _PAGE_COMBO 0x00001000 /* this is a combo 4k page */ -#define _PAGE_4K_PFN 0x00002000 /* PFN is for a single 4k page */ +#define H_PAGE_COMBO 0x00001000 /* this is a combo 4k page */ +#define H_PAGE_4K_PFN 0x00002000 /* PFN is for a single 4k page */ /* - * Used to track subpage group valid if _PAGE_COMBO is set - * This overloads _PAGE_F_GIX and _PAGE_F_SECOND + * We need to differentiate between explicit huge page and THP huge + * page, since THP huge page also need to track real subpage details */ -#define _PAGE_COMBO_VALID (_PAGE_F_GIX | _PAGE_F_SECOND) +#define H_PAGE_THP_HUGE H_PAGE_4K_PFN -/* PTE flags to conserve for HPTE identification */ -#define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_F_SECOND | \ - _PAGE_F_GIX | _PAGE_HASHPTE | _PAGE_COMBO) - -/* Shift to put page number into pte. - * - * That gives us a max RPN of 41 bits, which means a max of 57 bits - * of addressable physical space, or 53 bits for the special 4k PFNs. +/* + * Used to track subpage group valid if H_PAGE_COMBO is set + * This overloads H_PAGE_F_GIX and H_PAGE_F_SECOND */ -#define PTE_RPN_SHIFT (16) -#define PTE_RPN_SIZE (41) +#define H_PAGE_COMBO_VALID (H_PAGE_F_GIX | H_PAGE_F_SECOND) +/* PTE flags to conserve for HPTE identification */ +#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_F_SECOND | \ + H_PAGE_F_GIX | H_PAGE_HASHPTE | H_PAGE_COMBO) /* * we support 16 fragments per PTE page of 64K size. */ -#define PTE_FRAG_NR 16 +#define H_PTE_FRAG_NR 16 /* * We use a 2K PTE page fragment and another 2K for storing * real_pte_t hash index */ -#define PTE_FRAG_SIZE_SHIFT 12 +#define H_PTE_FRAG_SIZE_SHIFT 12 #define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT) -/* Bits to mask out from a PMD to get to the PTE page */ -#define PMD_MASKED_BITS 0xc0000000000000ffUL -/* Bits to mask out from a PUD to get to the PMD page */ -#define PUD_MASKED_BITS 0xc0000000000000ffUL -/* Bits to mask out from a PGD to get to the PUD page */ -#define PGD_MASKED_BITS 0xc0000000000000ffUL - #ifndef __ASSEMBLY__ +#include <asm/errno.h> /* * With 64K pages on hash table, we have a special PTE format that @@ -83,9 +54,9 @@ static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep) rpte.pte = pte; rpte.hidx = 0; - if (pte_val(pte) & _PAGE_COMBO) { + if (pte_val(pte) & H_PAGE_COMBO) { /* - * Make sure we order the hidx load against the _PAGE_COMBO + * Make sure we order the hidx load against the H_PAGE_COMBO * check. The store side ordering is done in __hash_page_4K */ smp_rmb(); @@ -97,9 +68,9 @@ static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep) static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index) { - if ((pte_val(rpte.pte) & _PAGE_COMBO)) + if ((pte_val(rpte.pte) & H_PAGE_COMBO)) return (rpte.hidx >> (index<<2)) & 0xf; - return (pte_val(rpte.pte) >> _PAGE_F_GIX_SHIFT) & 0xf; + return (pte_val(rpte.pte) >> H_PAGE_F_GIX_SHIFT) & 0xf; } #define __rpte_to_pte(r) ((r).pte) @@ -122,79 +93,32 @@ extern bool __rpte_sub_valid(real_pte_t rpte, unsigned long index); #define pte_iterate_hashed_end() } while(0); } } while(0) #define pte_pagesize_index(mm, addr, pte) \ - (((pte) & _PAGE_COMBO)? MMU_PAGE_4K: MMU_PAGE_64K) - -#define remap_4k_pfn(vma, addr, pfn, prot) \ - (WARN_ON(((pfn) >= (1UL << PTE_RPN_SIZE))) ? -EINVAL : \ - remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, \ - __pgprot(pgprot_val((prot)) | _PAGE_4K_PFN))) - -#define PTE_TABLE_SIZE PTE_FRAG_SIZE -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define PMD_TABLE_SIZE ((sizeof(pmd_t) << PMD_INDEX_SIZE) + (sizeof(unsigned long) << PMD_INDEX_SIZE)) -#else -#define PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE) -#endif -#define PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE) -#define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE) - -#ifdef CONFIG_HUGETLB_PAGE -/* - * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have - * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD; - * - * Defined in such a way that we can optimize away code block at build time - * if CONFIG_HUGETLB_PAGE=n. - */ -static inline int pmd_huge(pmd_t pmd) -{ - /* - * leaf pte for huge page - */ - return !!(pmd_val(pmd) & _PAGE_PTE); -} - -static inline int pud_huge(pud_t pud) -{ - /* - * leaf pte for huge page - */ - return !!(pud_val(pud) & _PAGE_PTE); -} + (((pte) & H_PAGE_COMBO)? MMU_PAGE_4K: MMU_PAGE_64K) -static inline int pgd_huge(pgd_t pgd) +extern int remap_pfn_range(struct vm_area_struct *, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t); +static inline int hash__remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t prot) { - /* - * leaf pte for huge page - */ - return !!(pgd_val(pgd) & _PAGE_PTE); + if (pfn > (PTE_RPN_MASK >> PAGE_SHIFT)) { + WARN(1, "remap_4k_pfn called with wrong pfn value\n"); + return -EINVAL; + } + return remap_pfn_range(vma, addr, pfn, PAGE_SIZE, + __pgprot(pgprot_val(prot) | H_PAGE_4K_PFN)); } -#define pgd_huge pgd_huge -#ifdef CONFIG_DEBUG_VM -extern int hugepd_ok(hugepd_t hpd); -#define is_hugepd(hpd) (hugepd_ok(hpd)) +#define H_PTE_TABLE_SIZE PTE_FRAG_SIZE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define H_PMD_TABLE_SIZE ((sizeof(pmd_t) << PMD_INDEX_SIZE) + \ + (sizeof(unsigned long) << PMD_INDEX_SIZE)) #else -/* - * With 64k page size, we have hugepage ptes in the pgd and pmd entries. We don't - * need to setup hugepage directory for them. Our pte and page directory format - * enable us to have this enabled. - */ -static inline int hugepd_ok(hugepd_t hpd) -{ - return 0; -} -#define is_hugepd(pdep) 0 -#endif /* CONFIG_DEBUG_VM */ - -#endif /* CONFIG_HUGETLB_PAGE */ +#define H_PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE) +#endif +#define H_PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE) +#define H_PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE) #ifdef CONFIG_TRANSPARENT_HUGEPAGE -extern unsigned long pmd_hugepage_update(struct mm_struct *mm, - unsigned long addr, - pmd_t *pmdp, - unsigned long clr, - unsigned long set); static inline char *get_hpte_slot_array(pmd_t *pmdp) { /* @@ -253,50 +177,35 @@ static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array, * that for explicit huge pages. * */ -static inline int pmd_trans_huge(pmd_t pmd) +static inline int hash__pmd_trans_huge(pmd_t pmd) { - return !!((pmd_val(pmd) & (_PAGE_PTE | _PAGE_THP_HUGE)) == - (_PAGE_PTE | _PAGE_THP_HUGE)); + return !!((pmd_val(pmd) & (_PAGE_PTE | H_PAGE_THP_HUGE)) == + (_PAGE_PTE | H_PAGE_THP_HUGE)); } -static inline int pmd_large(pmd_t pmd) +static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b) { - return !!(pmd_val(pmd) & _PAGE_PTE); + return (((pmd_raw(pmd_a) ^ pmd_raw(pmd_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0); } -static inline pmd_t pmd_mknotpresent(pmd_t pmd) +static inline pmd_t hash__pmd_mkhuge(pmd_t pmd) { - return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT); -} - -#define __HAVE_ARCH_PMD_SAME -static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) -{ - return (((pmd_val(pmd_a) ^ pmd_val(pmd_b)) & ~_PAGE_HPTEFLAGS) == 0); -} - -static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) -{ - unsigned long old; - - if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0) - return 0; - old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0); - return ((old & _PAGE_ACCESSED) != 0); -} - -#define __HAVE_ARCH_PMDP_SET_WRPROTECT -static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - - if ((pmd_val(*pmdp) & _PAGE_RW) == 0) - return; - - pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW, 0); + return __pmd(pmd_val(pmd) | (_PAGE_PTE | H_PAGE_THP_HUGE)); } +extern unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp, + unsigned long clr, unsigned long set); +extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); +extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable); +extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); +extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); +extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp); +extern int hash__has_transparent_hugepage(void); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index d0ee6fcef..f61cad3de 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -13,48 +13,12 @@ * We could create separate kernel read-only if we used the 3 PP bits * combinations that newer processors provide but we currently don't. */ -#define _PAGE_BIT_SWAP_TYPE 0 - -#define _PAGE_EXEC 0x00001 /* execute permission */ -#define _PAGE_RW 0x00002 /* read & write access allowed */ -#define _PAGE_READ 0x00004 /* read access allowed */ -#define _PAGE_USER 0x00008 /* page may be accessed by userspace */ -#define _PAGE_GUARDED 0x00010 /* G: guarded (side-effect) page */ -/* M (memory coherence) is always set in the HPTE, so we don't need it here */ -#define _PAGE_COHERENT 0x0 -#define _PAGE_NO_CACHE 0x00020 /* I: cache inhibit */ -#define _PAGE_WRITETHRU 0x00040 /* W: cache write-through */ -#define _PAGE_DIRTY 0x00080 /* C: page changed */ -#define _PAGE_ACCESSED 0x00100 /* R: page referenced */ -#define _PAGE_SPECIAL 0x00400 /* software: special page */ -#define _PAGE_BUSY 0x00800 /* software: PTE & hash are busy */ - -#ifdef CONFIG_MEM_SOFT_DIRTY -#define _PAGE_SOFT_DIRTY 0x200 /* software: software dirty tracking */ -#else -#define _PAGE_SOFT_DIRTY 0x000 -#endif - -#define _PAGE_F_GIX_SHIFT 57 -#define _PAGE_F_GIX (7ul << 57) /* HPTE index within HPTEG */ -#define _PAGE_F_SECOND (1ul << 60) /* HPTE is in 2ndary HPTEG */ -#define _PAGE_HASHPTE (1ul << 61) /* PTE has associated HPTE */ -#define _PAGE_PTE (1ul << 62) /* distinguishes PTEs from pointers */ -#define _PAGE_PRESENT (1ul << 63) /* pte contains a translation */ - -/* - * We need to differentiate between explicit huge page and THP huge - * page, since THP huge page also need to track real subpage details - */ -#define _PAGE_THP_HUGE _PAGE_4K_PFN - -/* - * set of bits not changed in pmd_modify. - */ -#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ - _PAGE_ACCESSED | _PAGE_THP_HUGE | _PAGE_PTE | \ - _PAGE_SOFT_DIRTY) - +#define H_PAGE_BUSY 0x00800 /* software: PTE & hash are busy */ +#define H_PTE_NONE_MASK _PAGE_HPTEFLAGS +#define H_PAGE_F_GIX_SHIFT 57 +#define H_PAGE_F_GIX (7ul << 57) /* HPTE index within HPTEG */ +#define H_PAGE_F_SECOND (1ul << 60) /* HPTE is in 2ndary HPTEG */ +#define H_PAGE_HASHPTE (1ul << 61) /* PTE has associated HPTE */ #ifdef CONFIG_PPC_64K_PAGES #include <asm/book3s/64/hash-64k.h> @@ -65,29 +29,33 @@ /* * Size of EA range mapped by our pagetables. */ -#define PGTABLE_EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \ - PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT) -#define PGTABLE_RANGE (ASM_CONST(1) << PGTABLE_EADDR_SIZE) +#define H_PGTABLE_EADDR_SIZE (H_PTE_INDEX_SIZE + H_PMD_INDEX_SIZE + \ + H_PUD_INDEX_SIZE + H_PGD_INDEX_SIZE + PAGE_SHIFT) +#define H_PGTABLE_RANGE (ASM_CONST(1) << H_PGTABLE_EADDR_SIZE) #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define PMD_CACHE_INDEX (PMD_INDEX_SIZE + 1) +/* + * only with hash we need to use the second half of pmd page table + * to store pointer to deposited pgtable_t + */ +#define H_PMD_CACHE_INDEX (H_PMD_INDEX_SIZE + 1) #else -#define PMD_CACHE_INDEX PMD_INDEX_SIZE +#define H_PMD_CACHE_INDEX H_PMD_INDEX_SIZE #endif /* * Define the address range of the kernel non-linear virtual area */ -#define KERN_VIRT_START ASM_CONST(0xD000000000000000) -#define KERN_VIRT_SIZE ASM_CONST(0x0000100000000000) +#define H_KERN_VIRT_START ASM_CONST(0xD000000000000000) +#define H_KERN_VIRT_SIZE ASM_CONST(0x0000100000000000) /* * The vmalloc space starts at the beginning of that region, and * occupies half of it on hash CPUs and a quarter of it on Book3E * (we keep a quarter for the virtual memmap) */ -#define VMALLOC_START KERN_VIRT_START -#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) -#define VMALLOC_END (VMALLOC_START + VMALLOC_SIZE) +#define H_VMALLOC_START H_KERN_VIRT_START +#define H_VMALLOC_SIZE (H_KERN_VIRT_SIZE >> 1) +#define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE) /* * Region IDs @@ -96,7 +64,7 @@ #define REGION_MASK (0xfUL << REGION_SHIFT) #define REGION_ID(ea) (((unsigned long)(ea)) >> REGION_SHIFT) -#define VMALLOC_REGION_ID (REGION_ID(VMALLOC_START)) +#define VMALLOC_REGION_ID (REGION_ID(H_VMALLOC_START)) #define KERNEL_REGION_ID (REGION_ID(PAGE_OFFSET)) #define VMEMMAP_REGION_ID (0xfUL) /* Server only */ #define USER_REGION_ID (0UL) @@ -105,381 +73,97 @@ * Defines the address of the vmemap area, in its own region on * hash table CPUs. */ -#define VMEMMAP_BASE (VMEMMAP_REGION_ID << REGION_SHIFT) +#define H_VMEMMAP_BASE (VMEMMAP_REGION_ID << REGION_SHIFT) #ifdef CONFIG_PPC_MM_SLICES #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN #endif /* CONFIG_PPC_MM_SLICES */ -/* No separate kernel read-only */ -#define _PAGE_KERNEL_RW (_PAGE_RW | _PAGE_DIRTY) /* user access blocked by key */ -#define _PAGE_KERNEL_RO _PAGE_KERNEL_RW -#define _PAGE_KERNEL_RWX (_PAGE_DIRTY | _PAGE_RW | _PAGE_EXEC) - -/* Strong Access Ordering */ -#define _PAGE_SAO (_PAGE_WRITETHRU | _PAGE_NO_CACHE | _PAGE_COHERENT) - -/* No page size encoding in the linux PTE */ -#define _PAGE_PSIZE 0 /* PTEIDX nibble */ #define _PTEIDX_SECONDARY 0x8 #define _PTEIDX_GROUP_IX 0x7 -/* Hash table based platforms need atomic updates of the linux PTE */ -#define PTE_ATOMIC_UPDATES 1 -#define _PTE_NONE_MASK _PAGE_HPTEFLAGS -/* - * The mask convered by the RPN must be a ULL on 32-bit platforms with - * 64-bit PTEs - */ -#define PTE_RPN_MASK (((1UL << PTE_RPN_SIZE) - 1) << PTE_RPN_SHIFT) -/* - * _PAGE_CHG_MASK masks of bits that are to be preserved across - * pgprot changes - */ -#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ - _PAGE_ACCESSED | _PAGE_SPECIAL | _PAGE_PTE | \ - _PAGE_SOFT_DIRTY) -/* - * Mask of bits returned by pte_pgprot() - */ -#define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \ - _PAGE_WRITETHRU | _PAGE_4K_PFN | \ - _PAGE_USER | _PAGE_ACCESSED | \ - _PAGE_RW | _PAGE_DIRTY | _PAGE_EXEC | \ - _PAGE_SOFT_DIRTY) -/* - * We define 2 sets of base prot bits, one for basic pages (ie, - * cacheable kernel and user pages) and one for non cacheable - * pages. We always set _PAGE_COHERENT when SMP is enabled or - * the processor might need it for DMA coherency. - */ -#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE) -#define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT) - -/* Permission masks used to generate the __P and __S table, - * - * Note:__pgprot is defined in arch/powerpc/include/asm/page.h - * - * Write permissions imply read permissions for now (we could make write-only - * pages on BookE but we don't bother for now). Execute permission control is - * possible on platforms that define _PAGE_EXEC - * - * Note due to the way vm flags are laid out, the bits are XWR - */ -#define PAGE_NONE __pgprot(_PAGE_BASE) -#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) -#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | \ - _PAGE_EXEC) -#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER ) -#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) -#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER ) -#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) - -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY_X -#define __P101 PAGE_READONLY_X -#define __P110 PAGE_COPY_X -#define __P111 PAGE_COPY_X - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY_X -#define __S101 PAGE_READONLY_X -#define __S110 PAGE_SHARED_X -#define __S111 PAGE_SHARED_X - -/* Permission masks used for kernel mappings */ -#define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) -#define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ - _PAGE_NO_CACHE) -#define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ - _PAGE_NO_CACHE | _PAGE_GUARDED) -#define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX) -#define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) -#define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) - -/* Protection used for kernel text. We want the debuggers to be able to - * set breakpoints anywhere, so don't write protect the kernel text - * on platforms where such control is possible. - */ -#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) ||\ - defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE) -#define PAGE_KERNEL_TEXT PAGE_KERNEL_X -#else -#define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX -#endif - -/* Make modules code happy. We don't set RO yet */ -#define PAGE_KERNEL_EXEC PAGE_KERNEL_X -#define PAGE_AGP (PAGE_KERNEL_NC) - -#define PMD_BAD_BITS (PTE_TABLE_SIZE-1) -#define PUD_BAD_BITS (PMD_TABLE_SIZE-1) +#define H_PMD_BAD_BITS (PTE_TABLE_SIZE-1) +#define H_PUD_BAD_BITS (PMD_TABLE_SIZE-1) #ifndef __ASSEMBLY__ -#define pmd_bad(pmd) (pmd_val(pmd) & PMD_BAD_BITS) -#define pmd_page_vaddr(pmd) __va(pmd_val(pmd) & ~PMD_MASKED_BITS) - -#define pud_bad(pud) (pud_val(pud) & PUD_BAD_BITS) -#define pud_page_vaddr(pud) __va(pud_val(pud) & ~PUD_MASKED_BITS) - -/* Pointers in the page table tree are physical addresses */ -#define __pgtable_ptr_val(ptr) __pa(ptr) - -#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1)) -#define pud_index(address) (((address) >> (PUD_SHIFT)) & (PTRS_PER_PUD - 1)) -#define pmd_index(address) (((address) >> (PMD_SHIFT)) & (PTRS_PER_PMD - 1)) -#define pte_index(address) (((address) >> (PAGE_SHIFT)) & (PTRS_PER_PTE - 1)) +#define hash__pmd_bad(pmd) (pmd_val(pmd) & H_PMD_BAD_BITS) +#define hash__pud_bad(pud) (pud_val(pud) & H_PUD_BAD_BITS) +static inline int hash__pgd_bad(pgd_t pgd) +{ + return (pgd_val(pgd) == 0); +} extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long pte, int huge); extern unsigned long htab_convert_pte_flags(unsigned long pteflags); /* Atomic PTE updates */ -static inline unsigned long pte_update(struct mm_struct *mm, - unsigned long addr, - pte_t *ptep, unsigned long clr, - unsigned long set, - int huge) +static inline unsigned long hash__pte_update(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, unsigned long clr, + unsigned long set, + int huge) { - unsigned long old, tmp; + __be64 old_be, tmp_be; + unsigned long old; __asm__ __volatile__( "1: ldarx %0,0,%3 # pte_update\n\ - andi. %1,%0,%6\n\ + and. %1,%0,%6\n\ bne- 1b \n\ andc %1,%0,%4 \n\ or %1,%1,%7\n\ stdcx. %1,0,%3 \n\ bne- 1b" - : "=&r" (old), "=&r" (tmp), "=m" (*ptep) - : "r" (ptep), "r" (clr), "m" (*ptep), "i" (_PAGE_BUSY), "r" (set) + : "=&r" (old_be), "=&r" (tmp_be), "=m" (*ptep) + : "r" (ptep), "r" (cpu_to_be64(clr)), "m" (*ptep), + "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set)) : "cc" ); /* huge pages use the old page table lock */ if (!huge) assert_pte_locked(mm, addr); - if (old & _PAGE_HASHPTE) + old = be64_to_cpu(old_be); + if (old & H_PAGE_HASHPTE) hpte_need_flush(mm, addr, ptep, old, huge); return old; } -static inline int __ptep_test_and_clear_young(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - unsigned long old; - - if ((pte_val(*ptep) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0) - return 0; - old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0); - return (old & _PAGE_ACCESSED) != 0; -} -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -#define ptep_test_and_clear_young(__vma, __addr, __ptep) \ -({ \ - int __r; \ - __r = __ptep_test_and_clear_young((__vma)->vm_mm, __addr, __ptep); \ - __r; \ -}) - -#define __HAVE_ARCH_PTEP_SET_WRPROTECT -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - - if ((pte_val(*ptep) & _PAGE_RW) == 0) - return; - - pte_update(mm, addr, ptep, _PAGE_RW, 0, 0); -} - -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - if ((pte_val(*ptep) & _PAGE_RW) == 0) - return; - - pte_update(mm, addr, ptep, _PAGE_RW, 0, 1); -} - -/* - * We currently remove entries from the hashtable regardless of whether - * the entry was young or dirty. The generic routines only flush if the - * entry was young or dirty which is not good enough. - * - * We should be more intelligent about this but for the moment we override - * these functions and force a tlb flush unconditionally - */ -#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -#define ptep_clear_flush_young(__vma, __address, __ptep) \ -({ \ - int __young = __ptep_test_and_clear_young((__vma)->vm_mm, __address, \ - __ptep); \ - __young; \ -}) - -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0); - return __pte(old); -} - -static inline void pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t * ptep) -{ - pte_update(mm, addr, ptep, ~0UL, 0, 0); -} - - /* Set the dirty and/or accessed bits atomically in a linux PTE, this * function doesn't need to flush the hash entry */ -static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) +static inline void hash__ptep_set_access_flags(pte_t *ptep, pte_t entry) { - unsigned long bits = pte_val(entry) & - (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC | - _PAGE_SOFT_DIRTY); + __be64 old, tmp, val, mask; - unsigned long old, tmp; + mask = cpu_to_be64(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_READ | _PAGE_WRITE | + _PAGE_EXEC | _PAGE_SOFT_DIRTY); + + val = pte_raw(entry) & mask; __asm__ __volatile__( "1: ldarx %0,0,%4\n\ - andi. %1,%0,%6\n\ + and. %1,%0,%6\n\ bne- 1b \n\ or %0,%3,%0\n\ stdcx. %0,0,%4\n\ bne- 1b" :"=&r" (old), "=&r" (tmp), "=m" (*ptep) - :"r" (bits), "r" (ptep), "m" (*ptep), "i" (_PAGE_BUSY) + :"r" (val), "r" (ptep), "m" (*ptep), "r" (cpu_to_be64(H_PAGE_BUSY)) :"cc"); } -static inline int pgd_bad(pgd_t pgd) -{ - return (pgd_val(pgd) == 0); -} - -#define __HAVE_ARCH_PTE_SAME -#define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0) -static inline unsigned long pgd_page_vaddr(pgd_t pgd) -{ - return (unsigned long)__va(pgd_val(pgd) & ~PGD_MASKED_BITS); -} - - -/* Generic accessors to PTE bits */ -static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_RW);} -static inline int pte_dirty(pte_t pte) { return !!(pte_val(pte) & _PAGE_DIRTY); } -static inline int pte_young(pte_t pte) { return !!(pte_val(pte) & _PAGE_ACCESSED); } -static inline int pte_special(pte_t pte) { return !!(pte_val(pte) & _PAGE_SPECIAL); } -static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; } -static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } - -#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY -static inline bool pte_soft_dirty(pte_t pte) -{ - return !!(pte_val(pte) & _PAGE_SOFT_DIRTY); -} -static inline pte_t pte_mksoft_dirty(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY); -} - -static inline pte_t pte_clear_soft_dirty(pte_t pte) -{ - return __pte(pte_val(pte) & ~_PAGE_SOFT_DIRTY); -} -#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ - -#ifdef CONFIG_NUMA_BALANCING -/* - * These work without NUMA balancing but the kernel does not care. See the - * comment in include/asm-generic/pgtable.h . On powerpc, this will only - * work for user pages and always return true for kernel pages. - */ -static inline int pte_protnone(pte_t pte) -{ - return (pte_val(pte) & - (_PAGE_PRESENT | _PAGE_USER)) == _PAGE_PRESENT; -} -#endif /* CONFIG_NUMA_BALANCING */ - -static inline int pte_present(pte_t pte) -{ - return !!(pte_val(pte) & _PAGE_PRESENT); -} - -/* Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - * - * Even if PTEs can be unsigned long long, a PFN is always an unsigned - * long for now. - */ -static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) -{ - return __pte((((pte_basic_t)(pfn) << PTE_RPN_SHIFT) & PTE_RPN_MASK) | - pgprot_val(pgprot)); -} - -static inline unsigned long pte_pfn(pte_t pte) -{ - return (pte_val(pte) & PTE_RPN_MASK) >> PTE_RPN_SHIFT; -} - -/* Generic modifiers for PTE bits */ -static inline pte_t pte_wrprotect(pte_t pte) -{ - return __pte(pte_val(pte) & ~_PAGE_RW); -} - -static inline pte_t pte_mkclean(pte_t pte) -{ - return __pte(pte_val(pte) & ~_PAGE_DIRTY); -} - -static inline pte_t pte_mkold(pte_t pte) -{ - return __pte(pte_val(pte) & ~_PAGE_ACCESSED); -} - -static inline pte_t pte_mkwrite(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_RW); -} - -static inline pte_t pte_mkdirty(pte_t pte) +static inline int hash__pte_same(pte_t pte_a, pte_t pte_b) { - return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY); + return (((pte_raw(pte_a) ^ pte_raw(pte_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0); } -static inline pte_t pte_mkyoung(pte_t pte) +static inline int hash__pte_none(pte_t pte) { - return __pte(pte_val(pte) | _PAGE_ACCESSED); -} - -static inline pte_t pte_mkspecial(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_SPECIAL); -} - -static inline pte_t pte_mkhuge(pte_t pte) -{ - return pte; -} - -static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) -{ - return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); + return (pte_val(pte) & ~H_PTE_NONE_MASK) == 0; } /* This low level function performs the actual PTE insertion @@ -487,8 +171,8 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) * an horrible mess that I'm not going to try to clean up now but * I'm keeping it in one place rather than spread around */ -static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte, int percpu) +static inline void hash__set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, int percpu) { /* * Anything else just stores the PTE normally. That covers all 64-bit @@ -497,53 +181,6 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, *ptep = pte; } -/* - * Macro to mark a page protection value as "uncacheable". - */ - -#define _PAGE_CACHE_CTL (_PAGE_COHERENT | _PAGE_GUARDED | _PAGE_NO_CACHE | \ - _PAGE_WRITETHRU) - -#define pgprot_noncached pgprot_noncached -static inline pgprot_t pgprot_noncached(pgprot_t prot) -{ - return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | - _PAGE_NO_CACHE | _PAGE_GUARDED); -} - -#define pgprot_noncached_wc pgprot_noncached_wc -static inline pgprot_t pgprot_noncached_wc(pgprot_t prot) -{ - return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | - _PAGE_NO_CACHE); -} - -#define pgprot_cached pgprot_cached -static inline pgprot_t pgprot_cached(pgprot_t prot) -{ - return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | - _PAGE_COHERENT); -} - -#define pgprot_cached_wthru pgprot_cached_wthru -static inline pgprot_t pgprot_cached_wthru(pgprot_t prot) -{ - return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | - _PAGE_COHERENT | _PAGE_WRITETHRU); -} - -#define pgprot_cached_noncoherent pgprot_cached_noncoherent -static inline pgprot_t pgprot_cached_noncoherent(pgprot_t prot) -{ - return __pgprot(pgprot_val(prot) & ~_PAGE_CACHE_CTL); -} - -#define pgprot_writecombine pgprot_writecombine -static inline pgprot_t pgprot_writecombine(pgprot_t prot) -{ - return pgprot_noncached_wc(prot); -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, unsigned long old_pmd); @@ -556,6 +193,14 @@ static inline void hpte_do_hugepage_flush(struct mm_struct *mm, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +extern int hash__map_kernel_page(unsigned long ea, unsigned long pa, + unsigned long flags); +extern int __meminit hash__vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys); +extern void hash__vmemmap_remove_mapping(unsigned long start, + unsigned long page_size); #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_BOOK3S_64_HASH_H */ diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h b/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h new file mode 100644 index 000000000..60f476493 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h @@ -0,0 +1,14 @@ +#ifndef _ASM_POWERPC_BOOK3S_64_HUGETLB_RADIX_H +#define _ASM_POWERPC_BOOK3S_64_HUGETLB_RADIX_H +/* + * For radix we want generic code to handle hugetlb. But then if we want + * both hash and radix to be enabled together we need to workaround the + * limitations. + */ +void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); +void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr); +extern unsigned long +radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); +#endif diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 0cea4807e..74839f24f 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -1,5 +1,5 @@ -#ifndef _ASM_POWERPC_MMU_HASH64_H_ -#define _ASM_POWERPC_MMU_HASH64_H_ +#ifndef _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_ +#define _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_ /* * PowerPC64 memory management structures * @@ -78,12 +78,17 @@ #define HPTE_V_SECONDARY ASM_CONST(0x0000000000000002) #define HPTE_V_VALID ASM_CONST(0x0000000000000001) +/* + * ISA 3.0 have a different HPTE format. + */ +#define HPTE_R_3_0_SSIZE_SHIFT 58 #define HPTE_R_PP0 ASM_CONST(0x8000000000000000) #define HPTE_R_TS ASM_CONST(0x4000000000000000) #define HPTE_R_KEY_HI ASM_CONST(0x3000000000000000) #define HPTE_R_RPN_SHIFT 12 #define HPTE_R_RPN ASM_CONST(0x0ffffffffffff000) #define HPTE_R_PP ASM_CONST(0x0000000000000003) +#define HPTE_R_PPP ASM_CONST(0x8000000000000003) #define HPTE_R_N ASM_CONST(0x0000000000000004) #define HPTE_R_G ASM_CONST(0x0000000000000008) #define HPTE_R_M ASM_CONST(0x0000000000000010) @@ -115,6 +120,7 @@ #define POWER7_TLB_SETS 128 /* # sets in POWER7 TLB */ #define POWER8_TLB_SETS 512 /* # sets in POWER8 TLB */ #define POWER9_TLB_SETS_HASH 256 /* # sets in POWER9 TLB Hash mode */ +#define POWER9_TLB_SETS_RADIX 128 /* # sets in POWER9 TLB Radix mode */ #ifndef __ASSEMBLY__ @@ -127,24 +133,6 @@ extern struct hash_pte *htab_address; extern unsigned long htab_size_bytes; extern unsigned long htab_hash_mask; -/* - * Page size definition - * - * shift : is the "PAGE_SHIFT" value for that page size - * sllp : is a bit mask with the value of SLB L || LP to be or'ed - * directly to a slbmte "vsid" value - * penc : is the HPTE encoding mask for the "LP" field: - * - */ -struct mmu_psize_def -{ - unsigned int shift; /* number of bits */ - int penc[MMU_PAGE_COUNT]; /* HPTE encoding */ - unsigned int tlbiel; /* tlbiel supported for that page size */ - unsigned long avpnm; /* bits to mask out in AVPN in the HPTE */ - unsigned long sllp; /* SLB L||LP (exact mask to use in slbmte) */ -}; -extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; static inline int shift_to_mmu_psize(unsigned int shift) { @@ -210,11 +198,6 @@ static inline int segment_shift(int ssize) /* * The current system page and segment sizes */ -extern int mmu_linear_psize; -extern int mmu_virtual_psize; -extern int mmu_vmalloc_psize; -extern int mmu_vmemmap_psize; -extern int mmu_io_psize; extern int mmu_kernel_ssize; extern int mmu_highuser_ssize; extern u16 mmu_slb_size; @@ -247,7 +230,8 @@ static inline unsigned long hpte_encode_avpn(unsigned long vpn, int psize, */ v = (vpn >> (23 - VPN_SHIFT)) & ~(mmu_psize_defs[psize].avpnm); v <<= HPTE_V_AVPN_SHIFT; - v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT; + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT; return v; } @@ -271,8 +255,12 @@ static inline unsigned long hpte_encode_v(unsigned long vpn, int base_psize, * aligned for the requested page size */ static inline unsigned long hpte_encode_r(unsigned long pa, int base_psize, - int actual_psize) + int actual_psize, int ssize) { + + if (cpu_has_feature(CPU_FTR_ARCH_300)) + pa |= ((unsigned long) ssize) << HPTE_R_3_0_SSIZE_SHIFT; + /* A 4K page needs no special encoding */ if (actual_psize == MMU_PAGE_4K) return pa & HPTE_R_RPN; @@ -476,7 +464,7 @@ extern void slb_set_size(u16 size); add rt,rt,rx /* 4 bits per slice and we have one slice per 1TB */ -#define SLICE_ARRAY_SIZE (PGTABLE_RANGE >> 41) +#define SLICE_ARRAY_SIZE (H_PGTABLE_RANGE >> 41) #ifndef __ASSEMBLY__ @@ -512,38 +500,6 @@ static inline void subpage_prot_free(struct mm_struct *mm) {} static inline void subpage_prot_init_new_context(struct mm_struct *mm) { } #endif /* CONFIG_PPC_SUBPAGE_PROT */ -typedef unsigned long mm_context_id_t; -struct spinlock; - -typedef struct { - mm_context_id_t id; - u16 user_psize; /* page size index */ - -#ifdef CONFIG_PPC_MM_SLICES - u64 low_slices_psize; /* SLB page size encodings */ - unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; -#else - u16 sllp; /* SLB page size encoding */ -#endif - unsigned long vdso_base; -#ifdef CONFIG_PPC_SUBPAGE_PROT - struct subpage_prot_table spt; -#endif /* CONFIG_PPC_SUBPAGE_PROT */ -#ifdef CONFIG_PPC_ICSWX - struct spinlock *cop_lockp; /* guard acop and cop_pid */ - unsigned long acop; /* mask of enabled coprocessor types */ - unsigned int cop_pid; /* pid value used with coprocessors */ -#endif /* CONFIG_PPC_ICSWX */ -#ifdef CONFIG_PPC_64K_PAGES - /* for 4K PTE fragment support */ - void *pte_frag; -#endif -#ifdef CONFIG_SPAPR_TCE_IOMMU - struct list_head iommu_group_mem_list; -#endif -} mm_context_t; - - #if 0 /* * The code below is equivalent to this function for arguments @@ -579,7 +535,7 @@ static inline unsigned long get_vsid(unsigned long context, unsigned long ea, /* * Bad address. We return VSID 0 for that */ - if ((ea & ~REGION_MASK) >= PGTABLE_RANGE) + if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE) return 0; if (ssize == MMU_SEGSIZE_256M) @@ -613,4 +569,4 @@ unsigned htab_shift_for_mem_size(unsigned long mem_size); #endif /* __ASSEMBLY__ */ -#endif /* _ASM_POWERPC_MMU_HASH64_H_ */ +#endif /* _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_ */ diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h new file mode 100644 index 000000000..5854263d4 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -0,0 +1,137 @@ +#ifndef _ASM_POWERPC_BOOK3S_64_MMU_H_ +#define _ASM_POWERPC_BOOK3S_64_MMU_H_ + +#ifndef __ASSEMBLY__ +/* + * Page size definition + * + * shift : is the "PAGE_SHIFT" value for that page size + * sllp : is a bit mask with the value of SLB L || LP to be or'ed + * directly to a slbmte "vsid" value + * penc : is the HPTE encoding mask for the "LP" field: + * + */ +struct mmu_psize_def { + unsigned int shift; /* number of bits */ + int penc[MMU_PAGE_COUNT]; /* HPTE encoding */ + unsigned int tlbiel; /* tlbiel supported for that page size */ + unsigned long avpnm; /* bits to mask out in AVPN in the HPTE */ + union { + unsigned long sllp; /* SLB L||LP (exact mask to use in slbmte) */ + unsigned long ap; /* Ap encoding used by PowerISA 3.0 */ + }; +}; +extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; + +#define radix_enabled() mmu_has_feature(MMU_FTR_RADIX) + +#endif /* __ASSEMBLY__ */ + +/* 64-bit classic hash table MMU */ +#include <asm/book3s/64/mmu-hash.h> + +#ifndef __ASSEMBLY__ +/* + * ISA 3.0 partiton and process table entry format + */ +struct prtb_entry { + __be64 prtb0; + __be64 prtb1; +}; +extern struct prtb_entry *process_tb; + +struct patb_entry { + __be64 patb0; + __be64 patb1; +}; +extern struct patb_entry *partition_tb; + +#define PATB_HR (1UL << 63) +#define PATB_GR (1UL << 63) +#define RPDB_MASK 0x0ffffffffffff00fUL +#define RPDB_SHIFT (1UL << 8) +/* + * Limit process table to PAGE_SIZE table. This + * also limit the max pid we can support. + * MAX_USER_CONTEXT * 16 bytes of space. + */ +#define PRTB_SIZE_SHIFT (CONTEXT_BITS + 4) +/* + * Power9 currently only support 64K partition table size. + */ +#define PATB_SIZE_SHIFT 16 + +typedef unsigned long mm_context_id_t; +struct spinlock; + +typedef struct { + mm_context_id_t id; + u16 user_psize; /* page size index */ + +#ifdef CONFIG_PPC_MM_SLICES + u64 low_slices_psize; /* SLB page size encodings */ + unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; +#else + u16 sllp; /* SLB page size encoding */ +#endif + unsigned long vdso_base; +#ifdef CONFIG_PPC_SUBPAGE_PROT + struct subpage_prot_table spt; +#endif /* CONFIG_PPC_SUBPAGE_PROT */ +#ifdef CONFIG_PPC_ICSWX + struct spinlock *cop_lockp; /* guard acop and cop_pid */ + unsigned long acop; /* mask of enabled coprocessor types */ + unsigned int cop_pid; /* pid value used with coprocessors */ +#endif /* CONFIG_PPC_ICSWX */ +#ifdef CONFIG_PPC_64K_PAGES + /* for 4K PTE fragment support */ + void *pte_frag; +#endif +#ifdef CONFIG_SPAPR_TCE_IOMMU + struct list_head iommu_group_mem_list; +#endif +} mm_context_t; + +/* + * The current system page and segment sizes + */ +extern int mmu_linear_psize; +extern int mmu_virtual_psize; +extern int mmu_vmalloc_psize; +extern int mmu_vmemmap_psize; +extern int mmu_io_psize; + +/* MMU initialization */ +extern void radix_init_native(void); +extern void hash__early_init_mmu(void); +extern void radix__early_init_mmu(void); +static inline void early_init_mmu(void) +{ + if (radix_enabled()) + return radix__early_init_mmu(); + return hash__early_init_mmu(); +} +extern void hash__early_init_mmu_secondary(void); +extern void radix__early_init_mmu_secondary(void); +static inline void early_init_mmu_secondary(void) +{ + if (radix_enabled()) + return radix__early_init_mmu_secondary(); + return hash__early_init_mmu_secondary(); +} + +extern void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size); +extern void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size); +static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + if (radix_enabled()) + return radix__setup_initial_memory_limit(first_memblock_base, + first_memblock_size); + return hash__setup_initial_memory_limit(first_memblock_base, + first_memblock_size); +} +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */ diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h new file mode 100644 index 000000000..cd5e7aa8c --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -0,0 +1,219 @@ +#ifndef _ASM_POWERPC_BOOK3S_64_PGALLOC_H +#define _ASM_POWERPC_BOOK3S_64_PGALLOC_H +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/slab.h> +#include <linux/cpumask.h> +#include <linux/percpu.h> + +struct vmemmap_backing { + struct vmemmap_backing *list; + unsigned long phys; + unsigned long virt_addr; +}; +extern struct vmemmap_backing *vmemmap_list; + +/* + * Functions that deal with pagetables that could be at any level of + * the table need to be passed an "index_size" so they know how to + * handle allocation. For PTE pages (which are linked to a struct + * page for now, and drawn from the main get_free_pages() pool), the + * allocation size will be (2^index_size * sizeof(pointer)) and + * allocations are drawn from the kmem_cache in PGT_CACHE(index_size). + * + * The maximum index size needs to be big enough to allow any + * pagetable sizes we need, but small enough to fit in the low bits of + * any page table pointer. In other words all pagetables, even tiny + * ones, must be aligned to allow at least enough low 0 bits to + * contain this value. This value is also used as a mask, so it must + * be one less than a power of two. + */ +#define MAX_PGTABLE_INDEX_SIZE 0xf + +extern struct kmem_cache *pgtable_cache[]; +#define PGT_CACHE(shift) ({ \ + BUG_ON(!(shift)); \ + pgtable_cache[(shift) - 1]; \ + }) + +#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO + +extern pte_t *pte_fragment_alloc(struct mm_struct *, unsigned long, int); +extern void pte_fragment_free(unsigned long *, int); +extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); +#ifdef CONFIG_SMP +extern void __tlb_remove_table(void *_table); +#endif + +static inline pgd_t *radix__pgd_alloc(struct mm_struct *mm) +{ +#ifdef CONFIG_PPC_64K_PAGES + return (pgd_t *)__get_free_page(PGALLOC_GFP); +#else + struct page *page; + page = alloc_pages(PGALLOC_GFP | __GFP_REPEAT, 4); + if (!page) + return NULL; + return (pgd_t *) page_address(page); +#endif +} + +static inline void radix__pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ +#ifdef CONFIG_PPC_64K_PAGES + free_page((unsigned long)pgd); +#else + free_pages((unsigned long)pgd, 4); +#endif +} + +static inline pgd_t *pgd_alloc(struct mm_struct *mm) +{ + if (radix_enabled()) + return radix__pgd_alloc(mm); + return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL); +} + +static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + if (radix_enabled()) + return radix__pgd_free(mm, pgd); + kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd); +} + +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +{ + pgd_set(pgd, __pgtable_ptr_val(pud) | PGD_VAL_BITS); +} + +static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), GFP_KERNEL); +} + +static inline void pud_free(struct mm_struct *mm, pud_t *pud) +{ + kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud); +} + +static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +{ + pud_set(pud, __pgtable_ptr_val(pmd) | PUD_VAL_BITS); +} + +static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, + unsigned long address) +{ + /* + * By now all the pud entries should be none entries. So go + * ahead and flush the page walk cache + */ + flush_tlb_pgtable(tlb, address); + pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE); +} + +static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX), GFP_KERNEL); +} + +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) +{ + kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd); +} + +static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long address) +{ + /* + * By now all the pud entries should be none entries. So go + * ahead and flush the page walk cache + */ + flush_tlb_pgtable(tlb, address); + return pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX); +} + +static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, + pte_t *pte) +{ + pmd_set(pmd, __pgtable_ptr_val(pte) | PMD_VAL_BITS); +} + +static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, + pgtable_t pte_page) +{ + pmd_set(pmd, __pgtable_ptr_val(pte_page) | PMD_VAL_BITS); +} + +static inline pgtable_t pmd_pgtable(pmd_t pmd) +{ + return (pgtable_t)pmd_page_vaddr(pmd); +} + +#ifdef CONFIG_PPC_4K_PAGES +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, + unsigned long address) +{ + return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); +} + +static inline pgtable_t pte_alloc_one(struct mm_struct *mm, + unsigned long address) +{ + struct page *page; + pte_t *pte; + + pte = pte_alloc_one_kernel(mm, address); + if (!pte) + return NULL; + page = virt_to_page(pte); + if (!pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } + return pte; +} +#else /* if CONFIG_PPC_64K_PAGES */ + +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, + unsigned long address) +{ + return (pte_t *)pte_fragment_alloc(mm, address, 1); +} + +static inline pgtable_t pte_alloc_one(struct mm_struct *mm, + unsigned long address) +{ + return (pgtable_t)pte_fragment_alloc(mm, address, 0); +} +#endif + +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ + pte_fragment_free((unsigned long *)pte, 1); +} + +static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) +{ + pte_fragment_free((unsigned long *)ptepage, 0); +} + +static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, + unsigned long address) +{ + /* + * By now all the pud entries should be none entries. So go + * ahead and flush the page walk cache + */ + flush_tlb_pgtable(tlb, address); + pgtable_free_tlb(tlb, table, 0); +} + +#define check_pgt_cache() do { } while (0) + +#endif /* _ASM_POWERPC_BOOK3S_64_PGALLOC_H */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h new file mode 100644 index 000000000..71e9abced --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h @@ -0,0 +1,53 @@ +#ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H +#define _ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H +/* + * hash 4k can't share hugetlb and also doesn't support THP + */ +#ifndef __ASSEMBLY__ +#ifdef CONFIG_HUGETLB_PAGE +static inline int pmd_huge(pmd_t pmd) +{ + /* + * leaf pte for huge page + */ + if (radix_enabled()) + return !!(pmd_val(pmd) & _PAGE_PTE); + return 0; +} + +static inline int pud_huge(pud_t pud) +{ + /* + * leaf pte for huge page + */ + if (radix_enabled()) + return !!(pud_val(pud) & _PAGE_PTE); + return 0; +} + +static inline int pgd_huge(pgd_t pgd) +{ + /* + * leaf pte for huge page + */ + if (radix_enabled()) + return !!(pgd_val(pgd) & _PAGE_PTE); + return 0; +} +#define pgd_huge pgd_huge +/* + * With radix , we have hugepage ptes in the pud and pmd entries. We don't + * need to setup hugepage directory for them. Our pte and page directory format + * enable us to have this enabled. + */ +static inline int hugepd_ok(hugepd_t hpd) +{ + if (radix_enabled()) + return 0; + return hash__hugepd_ok(hpd); +} +#define is_hugepd(hpd) (hugepd_ok(hpd)) +#endif /* CONFIG_HUGETLB_PAGE */ +#endif /* __ASSEMBLY__ */ + +#endif /*_ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h new file mode 100644 index 000000000..cb2d0a5fa --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h @@ -0,0 +1,64 @@ +#ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H +#define _ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H + +#ifndef __ASSEMBLY__ +#ifdef CONFIG_HUGETLB_PAGE +/* + * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have + * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD; + * + * Defined in such a way that we can optimize away code block at build time + * if CONFIG_HUGETLB_PAGE=n. + */ +static inline int pmd_huge(pmd_t pmd) +{ + /* + * leaf pte for huge page + */ + return !!(pmd_val(pmd) & _PAGE_PTE); +} + +static inline int pud_huge(pud_t pud) +{ + /* + * leaf pte for huge page + */ + return !!(pud_val(pud) & _PAGE_PTE); +} + +static inline int pgd_huge(pgd_t pgd) +{ + /* + * leaf pte for huge page + */ + return !!(pgd_val(pgd) & _PAGE_PTE); +} +#define pgd_huge pgd_huge + +#ifdef CONFIG_DEBUG_VM +extern int hugepd_ok(hugepd_t hpd); +#define is_hugepd(hpd) (hugepd_ok(hpd)) +#else +/* + * With 64k page size, we have hugepage ptes in the pgd and pmd entries. We don't + * need to setup hugepage directory for them. Our pte and page directory format + * enable us to have this enabled. + */ +static inline int hugepd_ok(hugepd_t hpd) +{ + return 0; +} +#define is_hugepd(pdep) 0 +#endif /* CONFIG_DEBUG_VM */ + +#endif /* CONFIG_HUGETLB_PAGE */ + +static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t prot) +{ + if (radix_enabled()) + BUG(); + return hash__remap_4k_pfn(vma, addr, pfn, prot); +} +#endif /* __ASSEMBLY__ */ +#endif /*_ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 77d3ce057..ab84c89c9 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1,13 +1,248 @@ #ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ #define _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ + +/* + * Common bits between hash and Radix page table + */ +#define _PAGE_BIT_SWAP_TYPE 0 + +#define _PAGE_EXEC 0x00001 /* execute permission */ +#define _PAGE_WRITE 0x00002 /* write access allowed */ +#define _PAGE_READ 0x00004 /* read access allowed */ +#define _PAGE_RW (_PAGE_READ | _PAGE_WRITE) +#define _PAGE_RWX (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC) +#define _PAGE_PRIVILEGED 0x00008 /* kernel access only */ +#define _PAGE_SAO 0x00010 /* Strong access order */ +#define _PAGE_NON_IDEMPOTENT 0x00020 /* non idempotent memory */ +#define _PAGE_TOLERANT 0x00030 /* tolerant memory, cache inhibited */ +#define _PAGE_DIRTY 0x00080 /* C: page changed */ +#define _PAGE_ACCESSED 0x00100 /* R: page referenced */ /* - * This file contains the functions and defines necessary to modify and use - * the ppc64 hashed page table. + * Software bits */ +#define _RPAGE_SW0 0x2000000000000000UL +#define _RPAGE_SW1 0x00800 +#define _RPAGE_SW2 0x00400 +#define _RPAGE_SW3 0x00200 +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _PAGE_SOFT_DIRTY _RPAGE_SW3 /* software: software dirty tracking */ +#else +#define _PAGE_SOFT_DIRTY 0x00000 +#endif +#define _PAGE_SPECIAL _RPAGE_SW2 /* software: special page */ + + +#define _PAGE_PTE (1ul << 62) /* distinguishes PTEs from pointers */ +#define _PAGE_PRESENT (1ul << 63) /* pte contains a translation */ +/* + * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE + * Instead of fixing all of them, add an alternate define which + * maps CI pte mapping. + */ +#define _PAGE_NO_CACHE _PAGE_TOLERANT +/* + * We support 57 bit real address in pte. Clear everything above 57, and + * every thing below PAGE_SHIFT; + */ +#define PTE_RPN_MASK (((1UL << 57) - 1) & (PAGE_MASK)) +/* + * set of bits not changed in pmd_modify. Even though we have hash specific bits + * in here, on radix we expect them to be zero. + */ +#define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ + _PAGE_ACCESSED | H_PAGE_THP_HUGE | _PAGE_PTE | \ + _PAGE_SOFT_DIRTY) +/* + * user access blocked by key + */ +#define _PAGE_KERNEL_RW (_PAGE_PRIVILEGED | _PAGE_RW | _PAGE_DIRTY) +#define _PAGE_KERNEL_RO (_PAGE_PRIVILEGED | _PAGE_READ) +#define _PAGE_KERNEL_RWX (_PAGE_PRIVILEGED | _PAGE_DIRTY | \ + _PAGE_RW | _PAGE_EXEC) +/* + * No page size encoding in the linux PTE + */ +#define _PAGE_PSIZE 0 +/* + * _PAGE_CHG_MASK masks of bits that are to be preserved across + * pgprot changes + */ +#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ + _PAGE_ACCESSED | _PAGE_SPECIAL | _PAGE_PTE | \ + _PAGE_SOFT_DIRTY) +/* + * Mask of bits returned by pte_pgprot() + */ +#define PAGE_PROT_BITS (_PAGE_SAO | _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT | \ + H_PAGE_4K_PFN | _PAGE_PRIVILEGED | _PAGE_ACCESSED | \ + _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_EXEC | \ + _PAGE_SOFT_DIRTY) +/* + * We define 2 sets of base prot bits, one for basic pages (ie, + * cacheable kernel and user pages) and one for non cacheable + * pages. We always set _PAGE_COHERENT when SMP is enabled or + * the processor might need it for DMA coherency. + */ +#define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE) +#define _PAGE_BASE (_PAGE_BASE_NC) + +/* Permission masks used to generate the __P and __S table, + * + * Note:__pgprot is defined in arch/powerpc/include/asm/page.h + * + * Write permissions imply read permissions for now (we could make write-only + * pages on BookE but we don't bother for now). Execute permission control is + * possible on platforms that define _PAGE_EXEC + * + * Note due to the way vm flags are laid out, the bits are XWR + */ +#define PAGE_NONE __pgprot(_PAGE_BASE | _PAGE_PRIVILEGED) +#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_RW) +#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_RW | _PAGE_EXEC) +#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_READ) +#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC) +#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_READ) +#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC) + +#define __P000 PAGE_NONE +#define __P001 PAGE_READONLY +#define __P010 PAGE_COPY +#define __P011 PAGE_COPY +#define __P100 PAGE_READONLY_X +#define __P101 PAGE_READONLY_X +#define __P110 PAGE_COPY_X +#define __P111 PAGE_COPY_X + +#define __S000 PAGE_NONE +#define __S001 PAGE_READONLY +#define __S010 PAGE_SHARED +#define __S011 PAGE_SHARED +#define __S100 PAGE_READONLY_X +#define __S101 PAGE_READONLY_X +#define __S110 PAGE_SHARED_X +#define __S111 PAGE_SHARED_X + +/* Permission masks used for kernel mappings */ +#define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) +#define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ + _PAGE_TOLERANT) +#define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ + _PAGE_NON_IDEMPOTENT) +#define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX) +#define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) +#define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) + +/* + * Protection used for kernel text. We want the debuggers to be able to + * set breakpoints anywhere, so don't write protect the kernel text + * on platforms where such control is possible. + */ +#if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) || \ + defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE) +#define PAGE_KERNEL_TEXT PAGE_KERNEL_X +#else +#define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX +#endif + +/* Make modules code happy. We don't set RO yet */ +#define PAGE_KERNEL_EXEC PAGE_KERNEL_X +#define PAGE_AGP (PAGE_KERNEL_NC) + +#ifndef __ASSEMBLY__ +/* + * page table defines + */ +extern unsigned long __pte_index_size; +extern unsigned long __pmd_index_size; +extern unsigned long __pud_index_size; +extern unsigned long __pgd_index_size; +extern unsigned long __pmd_cache_index; +#define PTE_INDEX_SIZE __pte_index_size +#define PMD_INDEX_SIZE __pmd_index_size +#define PUD_INDEX_SIZE __pud_index_size +#define PGD_INDEX_SIZE __pgd_index_size +#define PMD_CACHE_INDEX __pmd_cache_index +/* + * Because of use of pte fragments and THP, size of page table + * are not always derived out of index size above. + */ +extern unsigned long __pte_table_size; +extern unsigned long __pmd_table_size; +extern unsigned long __pud_table_size; +extern unsigned long __pgd_table_size; +#define PTE_TABLE_SIZE __pte_table_size +#define PMD_TABLE_SIZE __pmd_table_size +#define PUD_TABLE_SIZE __pud_table_size +#define PGD_TABLE_SIZE __pgd_table_size + +extern unsigned long __pmd_val_bits; +extern unsigned long __pud_val_bits; +extern unsigned long __pgd_val_bits; +#define PMD_VAL_BITS __pmd_val_bits +#define PUD_VAL_BITS __pud_val_bits +#define PGD_VAL_BITS __pgd_val_bits + +extern unsigned long __pte_frag_nr; +#define PTE_FRAG_NR __pte_frag_nr +extern unsigned long __pte_frag_size_shift; +#define PTE_FRAG_SIZE_SHIFT __pte_frag_size_shift +#define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT) +/* + * Pgtable size used by swapper, init in asm code + */ +#define MAX_PGD_TABLE_SIZE (sizeof(pgd_t) << RADIX_PGD_INDEX_SIZE) + +#define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) +#define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) +#define PTRS_PER_PUD (1 << PUD_INDEX_SIZE) +#define PTRS_PER_PGD (1 << PGD_INDEX_SIZE) + +/* PMD_SHIFT determines what a second-level page table entry can map */ +#define PMD_SHIFT (PAGE_SHIFT + PTE_INDEX_SIZE) +#define PMD_SIZE (1UL << PMD_SHIFT) +#define PMD_MASK (~(PMD_SIZE-1)) + +/* PUD_SHIFT determines what a third-level page table entry can map */ +#define PUD_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE) +#define PUD_SIZE (1UL << PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE-1)) + +/* PGDIR_SHIFT determines what a fourth-level page table entry can map */ +#define PGDIR_SHIFT (PUD_SHIFT + PUD_INDEX_SIZE) +#define PGDIR_SIZE (1UL << PGDIR_SHIFT) +#define PGDIR_MASK (~(PGDIR_SIZE-1)) + +/* Bits to mask out from a PMD to get to the PTE page */ +#define PMD_MASKED_BITS 0xc0000000000000ffUL +/* Bits to mask out from a PUD to get to the PMD page */ +#define PUD_MASKED_BITS 0xc0000000000000ffUL +/* Bits to mask out from a PGD to get to the PUD page */ +#define PGD_MASKED_BITS 0xc0000000000000ffUL + +extern unsigned long __vmalloc_start; +extern unsigned long __vmalloc_end; +#define VMALLOC_START __vmalloc_start +#define VMALLOC_END __vmalloc_end + +extern unsigned long __kernel_virt_start; +extern unsigned long __kernel_virt_size; +#define KERN_VIRT_START __kernel_virt_start +#define KERN_VIRT_SIZE __kernel_virt_size +extern struct page *vmemmap; +extern unsigned long ioremap_bot; +extern unsigned long pci_io_base; +#endif /* __ASSEMBLY__ */ #include <asm/book3s/64/hash.h> -#include <asm/barrier.h> +#include <asm/book3s/64/radix.h> +#ifdef CONFIG_PPC_64K_PAGES +#include <asm/book3s/64/pgtable-64k.h> +#else +#include <asm/book3s/64/pgtable-4k.h> +#endif + +#include <asm/barrier.h> /* * The second half of the kernel virtual space is used for IO mappings, * it's itself carved into the PIO region (ISA and PHB IO space) and @@ -26,8 +261,6 @@ #define IOREMAP_BASE (PHB_IO_END) #define IOREMAP_END (KERN_VIRT_START + KERN_VIRT_SIZE) -#define vmemmap ((struct page *)VMEMMAP_BASE) - /* Advertise special mapping type for AGP */ #define HAVE_PAGE_AGP @@ -45,7 +278,7 @@ #define __real_pte(e,p) ((real_pte_t){(e)}) #define __rpte_to_pte(r) ((r).pte) -#define __rpte_to_hidx(r,index) (pte_val(__rpte_to_pte(r)) >>_PAGE_F_GIX_SHIFT) +#define __rpte_to_hidx(r,index) (pte_val(__rpte_to_pte(r)) >> H_PAGE_F_GIX_SHIFT) #define pte_iterate_hashed_subpages(rpte, psize, va, index, shift) \ do { \ @@ -62,6 +295,327 @@ #endif /* __real_pte */ +static inline unsigned long pte_update(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long clr, + unsigned long set, int huge) +{ + if (radix_enabled()) + return radix__pte_update(mm, addr, ptep, clr, set, huge); + return hash__pte_update(mm, addr, ptep, clr, set, huge); +} +/* + * For hash even if we have _PAGE_ACCESSED = 0, we do a pte_update. + * We currently remove entries from the hashtable regardless of whether + * the entry was young or dirty. + * + * We should be more intelligent about this but for the moment we override + * these functions and force a tlb flush unconditionally + * For radix: H_PAGE_HASHPTE should be zero. Hence we can use the same + * function for both hash and radix. + */ +static inline int __ptep_test_and_clear_young(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + unsigned long old; + + if ((pte_val(*ptep) & (_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) + return 0; + old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0); + return (old & _PAGE_ACCESSED) != 0; +} + +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +#define ptep_test_and_clear_young(__vma, __addr, __ptep) \ +({ \ + int __r; \ + __r = __ptep_test_and_clear_young((__vma)->vm_mm, __addr, __ptep); \ + __r; \ +}) + +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + + if ((pte_val(*ptep) & _PAGE_WRITE) == 0) + return; + + pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 0); +} + +static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + if ((pte_val(*ptep) & _PAGE_WRITE) == 0) + return; + + pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 1); +} + +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0); + return __pte(old); +} + +static inline void pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t * ptep) +{ + pte_update(mm, addr, ptep, ~0UL, 0, 0); +} +static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_WRITE);} +static inline int pte_dirty(pte_t pte) { return !!(pte_val(pte) & _PAGE_DIRTY); } +static inline int pte_young(pte_t pte) { return !!(pte_val(pte) & _PAGE_ACCESSED); } +static inline int pte_special(pte_t pte) { return !!(pte_val(pte) & _PAGE_SPECIAL); } +static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } + +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY +static inline bool pte_soft_dirty(pte_t pte) +{ + return !!(pte_val(pte) & _PAGE_SOFT_DIRTY); +} +static inline pte_t pte_mksoft_dirty(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY); +} + +static inline pte_t pte_clear_soft_dirty(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_SOFT_DIRTY); +} +#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ + +#ifdef CONFIG_NUMA_BALANCING +/* + * These work without NUMA balancing but the kernel does not care. See the + * comment in include/asm-generic/pgtable.h . On powerpc, this will only + * work for user pages and always return true for kernel pages. + */ +static inline int pte_protnone(pte_t pte) +{ + return (pte_val(pte) & (_PAGE_PRESENT | _PAGE_PRIVILEGED)) == + (_PAGE_PRESENT | _PAGE_PRIVILEGED); +} +#endif /* CONFIG_NUMA_BALANCING */ + +static inline int pte_present(pte_t pte) +{ + return !!(pte_val(pte) & _PAGE_PRESENT); +} +/* + * Conversion functions: convert a page and protection to a page entry, + * and a page entry and page directory to the page they refer to. + * + * Even if PTEs can be unsigned long long, a PFN is always an unsigned + * long for now. + */ +static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) +{ + return __pte((((pte_basic_t)(pfn) << PAGE_SHIFT) & PTE_RPN_MASK) | + pgprot_val(pgprot)); +} + +static inline unsigned long pte_pfn(pte_t pte) +{ + return (pte_val(pte) & PTE_RPN_MASK) >> PAGE_SHIFT; +} + +/* Generic modifiers for PTE bits */ +static inline pte_t pte_wrprotect(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_WRITE); +} + +static inline pte_t pte_mkclean(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_DIRTY); +} + +static inline pte_t pte_mkold(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_ACCESSED); +} + +static inline pte_t pte_mkwrite(pte_t pte) +{ + /* + * write implies read, hence set both + */ + return __pte(pte_val(pte) | _PAGE_RW); +} + +static inline pte_t pte_mkdirty(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY); +} + +static inline pte_t pte_mkyoung(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_ACCESSED); +} + +static inline pte_t pte_mkspecial(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SPECIAL); +} + +static inline pte_t pte_mkhuge(pte_t pte) +{ + return pte; +} + +static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) +{ + /* FIXME!! check whether this need to be a conditional */ + return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); +} + +static inline bool pte_user(pte_t pte) +{ + return !(pte_val(pte) & _PAGE_PRIVILEGED); +} + +/* Encode and de-code a swap entry */ +#define MAX_SWAPFILES_CHECK() do { \ + BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \ + /* \ + * Don't have overlapping bits with _PAGE_HPTEFLAGS \ + * We filter HPTEFLAGS on set_pte. \ + */ \ + BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \ + BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY); \ + } while (0) +/* + * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT; + */ +#define SWP_TYPE_BITS 5 +#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \ + & ((1UL << SWP_TYPE_BITS) - 1)) +#define __swp_offset(x) (((x).val & PTE_RPN_MASK) >> PAGE_SHIFT) +#define __swp_entry(type, offset) ((swp_entry_t) { \ + ((type) << _PAGE_BIT_SWAP_TYPE) \ + | (((offset) << PAGE_SHIFT) & PTE_RPN_MASK)}) +/* + * swp_entry_t must be independent of pte bits. We build a swp_entry_t from + * swap type and offset we get from swap and convert that to pte to find a + * matching pte in linux page table. + * Clear bits not found in swap entries here. + */ +#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) & ~_PAGE_PTE }) +#define __swp_entry_to_pte(x) __pte((x).val | _PAGE_PTE) + +#ifdef CONFIG_MEM_SOFT_DIRTY +#define _PAGE_SWP_SOFT_DIRTY (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE)) +#else +#define _PAGE_SWP_SOFT_DIRTY 0UL +#endif /* CONFIG_MEM_SOFT_DIRTY */ + +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY); +} +static inline bool pte_swp_soft_dirty(pte_t pte) +{ + return !!(pte_val(pte) & _PAGE_SWP_SOFT_DIRTY); +} +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_SWP_SOFT_DIRTY); +} +#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ + +static inline bool check_pte_access(unsigned long access, unsigned long ptev) +{ + /* + * This check for _PAGE_RWX and _PAGE_PRESENT bits + */ + if (access & ~ptev) + return false; + /* + * This check for access to privilege space + */ + if ((access & _PAGE_PRIVILEGED) != (ptev & _PAGE_PRIVILEGED)) + return false; + + return true; +} +/* + * Generic functions with hash/radix callbacks + */ + +static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) +{ + if (radix_enabled()) + return radix__ptep_set_access_flags(ptep, entry); + return hash__ptep_set_access_flags(ptep, entry); +} + +#define __HAVE_ARCH_PTE_SAME +static inline int pte_same(pte_t pte_a, pte_t pte_b) +{ + if (radix_enabled()) + return radix__pte_same(pte_a, pte_b); + return hash__pte_same(pte_a, pte_b); +} + +static inline int pte_none(pte_t pte) +{ + if (radix_enabled()) + return radix__pte_none(pte); + return hash__pte_none(pte); +} + +static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, int percpu) +{ + if (radix_enabled()) + return radix__set_pte_at(mm, addr, ptep, pte, percpu); + return hash__set_pte_at(mm, addr, ptep, pte, percpu); +} + +#define _PAGE_CACHE_CTL (_PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT) + +#define pgprot_noncached pgprot_noncached +static inline pgprot_t pgprot_noncached(pgprot_t prot) +{ + return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | + _PAGE_NON_IDEMPOTENT); +} + +#define pgprot_noncached_wc pgprot_noncached_wc +static inline pgprot_t pgprot_noncached_wc(pgprot_t prot) +{ + return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | + _PAGE_TOLERANT); +} + +#define pgprot_cached pgprot_cached +static inline pgprot_t pgprot_cached(pgprot_t prot) +{ + return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL)); +} + +#define pgprot_writecombine pgprot_writecombine +static inline pgprot_t pgprot_writecombine(pgprot_t prot) +{ + return pgprot_noncached_wc(prot); +} +/* + * check a pte mapping have cache inhibited property + */ +static inline bool pte_ci(pte_t pte) +{ + unsigned long pte_v = pte_val(pte); + + if (((pte_v & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) || + ((pte_v & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT)) + return true; + return false; +} + static inline void pmd_set(pmd_t *pmdp, unsigned long val) { *pmdp = __pmd(val); @@ -75,6 +629,13 @@ static inline void pmd_clear(pmd_t *pmdp) #define pmd_none(pmd) (!pmd_val(pmd)) #define pmd_present(pmd) (!pmd_none(pmd)) +static inline int pmd_bad(pmd_t pmd) +{ + if (radix_enabled()) + return radix__pmd_bad(pmd); + return hash__pmd_bad(pmd); +} + static inline void pud_set(pud_t *pudp, unsigned long val) { *pudp = __pud(val); @@ -100,6 +661,15 @@ static inline pud_t pte_pud(pte_t pte) return __pud(pte_val(pte)); } #define pud_write(pud) pte_write(pud_pte(pud)) + +static inline int pud_bad(pud_t pud) +{ + if (radix_enabled()) + return radix__pud_bad(pud); + return hash__pud_bad(pud); +} + + #define pgd_write(pgd) pte_write(pgd_pte(pgd)) static inline void pgd_set(pgd_t *pgdp, unsigned long val) { @@ -124,8 +694,27 @@ static inline pgd_t pte_pgd(pte_t pte) return __pgd(pte_val(pte)); } +static inline int pgd_bad(pgd_t pgd) +{ + if (radix_enabled()) + return radix__pgd_bad(pgd); + return hash__pgd_bad(pgd); +} + extern struct page *pgd_page(pgd_t pgd); +/* Pointers in the page table tree are physical addresses */ +#define __pgtable_ptr_val(ptr) __pa(ptr) + +#define pmd_page_vaddr(pmd) __va(pmd_val(pmd) & ~PMD_MASKED_BITS) +#define pud_page_vaddr(pud) __va(pud_val(pud) & ~PUD_MASKED_BITS) +#define pgd_page_vaddr(pgd) __va(pgd_val(pgd) & ~PGD_MASKED_BITS) + +#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1)) +#define pud_index(address) (((address) >> (PUD_SHIFT)) & (PTRS_PER_PUD - 1)) +#define pmd_index(address) (((address) >> (PMD_SHIFT)) & (PTRS_PER_PMD - 1)) +#define pte_index(address) (((address) >> (PAGE_SHIFT)) & (PTRS_PER_PTE - 1)) + /* * Find an entry in a page-table-directory. We combine the address region * (the high order N bits) and the pgd portion of the address. @@ -156,73 +745,42 @@ extern struct page *pgd_page(pgd_t pgd); #define pgd_ERROR(e) \ pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) -/* Encode and de-code a swap entry */ -#define MAX_SWAPFILES_CHECK() do { \ - BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \ - /* \ - * Don't have overlapping bits with _PAGE_HPTEFLAGS \ - * We filter HPTEFLAGS on set_pte. \ - */ \ - BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \ - BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY); \ - } while (0) -/* - * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT; - */ -#define SWP_TYPE_BITS 5 -#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \ - & ((1UL << SWP_TYPE_BITS) - 1)) -#define __swp_offset(x) (((x).val & PTE_RPN_MASK) >> PTE_RPN_SHIFT) -#define __swp_entry(type, offset) ((swp_entry_t) { \ - ((type) << _PAGE_BIT_SWAP_TYPE) \ - | (((offset) << PTE_RPN_SHIFT) & PTE_RPN_MASK)}) -/* - * swp_entry_t must be independent of pte bits. We build a swp_entry_t from - * swap type and offset we get from swap and convert that to pte to find a - * matching pte in linux page table. - * Clear bits not found in swap entries here. - */ -#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) & ~_PAGE_PTE }) -#define __swp_entry_to_pte(x) __pte((x).val | _PAGE_PTE) - -#ifdef CONFIG_MEM_SOFT_DIRTY -#define _PAGE_SWP_SOFT_DIRTY (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE)) -#else -#define _PAGE_SWP_SOFT_DIRTY 0UL -#endif /* CONFIG_MEM_SOFT_DIRTY */ +void pgtable_cache_add(unsigned shift, void (*ctor)(void *)); +void pgtable_cache_init(void); -#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY -static inline pte_t pte_swp_mksoft_dirty(pte_t pte) +static inline int map_kernel_page(unsigned long ea, unsigned long pa, + unsigned long flags) { - return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY); + if (radix_enabled()) { +#if defined(CONFIG_PPC_RADIX_MMU) && defined(DEBUG_VM) + unsigned long page_size = 1 << mmu_psize_defs[mmu_io_psize].shift; + WARN((page_size != PAGE_SIZE), "I/O page size != PAGE_SIZE"); +#endif + return radix__map_kernel_page(ea, pa, __pgprot(flags), PAGE_SIZE); + } + return hash__map_kernel_page(ea, pa, flags); } -static inline bool pte_swp_soft_dirty(pte_t pte) + +static inline int __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) { - return !!(pte_val(pte) & _PAGE_SWP_SOFT_DIRTY); + if (radix_enabled()) + return radix__vmemmap_create_mapping(start, page_size, phys); + return hash__vmemmap_create_mapping(start, page_size, phys); } -static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) + +#ifdef CONFIG_MEMORY_HOTPLUG +static inline void vmemmap_remove_mapping(unsigned long start, + unsigned long page_size) { - return __pte(pte_val(pte) & ~_PAGE_SWP_SOFT_DIRTY); + if (radix_enabled()) + return radix__vmemmap_remove_mapping(start, page_size); + return hash__vmemmap_remove_mapping(start, page_size); } -#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ - -void pgtable_cache_add(unsigned shift, void (*ctor)(void *)); -void pgtable_cache_init(void); - +#endif struct page *realmode_pfn_to_page(unsigned long pfn); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot); -extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot); -extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); -extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd); -extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd); -extern int has_transparent_hugepage(void); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - - static inline pte_t pmd_pte(pmd_t pmd) { return __pte(pmd_val(pmd)); @@ -237,7 +795,6 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd) { return (pte_t *)pmd; } - #define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd)) #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) #define pmd_young(pmd) pte_young(pmd_pte(pmd)) @@ -264,9 +821,87 @@ static inline int pmd_protnone(pmd_t pmd) #define __HAVE_ARCH_PMD_WRITE #define pmd_write(pmd) pte_write(pmd_pte(pmd)) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot); +extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot); +extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); +extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd); +extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd); +extern int hash__has_transparent_hugepage(void); +static inline int has_transparent_hugepage(void) +{ + if (radix_enabled()) + return radix__has_transparent_hugepage(); + return hash__has_transparent_hugepage(); +} +#define has_transparent_hugepage has_transparent_hugepage + +static inline unsigned long +pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, + unsigned long clr, unsigned long set) +{ + if (radix_enabled()) + return radix__pmd_hugepage_update(mm, addr, pmdp, clr, set); + return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set); +} + +static inline int pmd_large(pmd_t pmd) +{ + return !!(pmd_val(pmd) & _PAGE_PTE); +} + +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT); +} +/* + * For radix we should always find H_PAGE_HASHPTE zero. Hence + * the below will work for radix too + */ +static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + unsigned long old; + + if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) + return 0; + old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0); + return ((old & _PAGE_ACCESSED) != 0); +} + +#define __HAVE_ARCH_PMDP_SET_WRPROTECT +static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + + if ((pmd_val(*pmdp) & _PAGE_WRITE) == 0) + return; + + pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0); +} + +static inline int pmd_trans_huge(pmd_t pmd) +{ + if (radix_enabled()) + return radix__pmd_trans_huge(pmd); + return hash__pmd_trans_huge(pmd); +} + +#define __HAVE_ARCH_PMD_SAME +static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) +{ + if (radix_enabled()) + return radix__pmd_same(pmd_a, pmd_b); + return hash__pmd_same(pmd_a, pmd_b); +} + static inline pmd_t pmd_mkhuge(pmd_t pmd) { - return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_THP_HUGE)); + if (radix_enabled()) + return radix__pmd_mkhuge(pmd); + return hash__pmd_mkhuge(pmd); } #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS @@ -277,37 +912,63 @@ extern int pmdp_set_access_flags(struct vm_area_struct *vma, #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); -#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH -extern int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR -extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp); +static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + if (radix_enabled()) + return radix__pmdp_huge_get_and_clear(mm, addr, pmdp); + return hash__pmdp_huge_get_and_clear(mm, addr, pmdp); +} -extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); +static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + if (radix_enabled()) + return radix__pmdp_collapse_flush(vma, address, pmdp); + return hash__pmdp_collapse_flush(vma, address, pmdp); +} #define pmdp_collapse_flush pmdp_collapse_flush #define __HAVE_ARCH_PGTABLE_DEPOSIT -extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pgtable); +static inline void pgtable_trans_huge_deposit(struct mm_struct *mm, + pmd_t *pmdp, pgtable_t pgtable) +{ + if (radix_enabled()) + return radix__pgtable_trans_huge_deposit(mm, pmdp, pgtable); + return hash__pgtable_trans_huge_deposit(mm, pmdp, pgtable); +} + #define __HAVE_ARCH_PGTABLE_WITHDRAW -extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); +static inline pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, + pmd_t *pmdp) +{ + if (radix_enabled()) + return radix__pgtable_trans_huge_withdraw(mm, pmdp); + return hash__pgtable_trans_huge_withdraw(mm, pmdp); +} #define __HAVE_ARCH_PMDP_INVALIDATE extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #define __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE -extern void pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); +static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + if (radix_enabled()) + return radix__pmdp_huge_split_prepare(vma, address, pmdp); + return hash__pmdp_huge_split_prepare(vma, address, pmdp); +} #define pmd_move_must_withdraw pmd_move_must_withdraw struct spinlock; static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, struct spinlock *old_pmd_ptl) { + if (radix_enabled()) + return false; /* * Archs like ppc64 use pgtable to store per pmd * specific information. So when we switch the pmd, @@ -315,5 +976,6 @@ static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, */ return true; } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */ diff --git a/arch/powerpc/include/asm/book3s/64/radix-4k.h b/arch/powerpc/include/asm/book3s/64/radix-4k.h new file mode 100644 index 000000000..7c3b1fe16 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/radix-4k.h @@ -0,0 +1,12 @@ +#ifndef _ASM_POWERPC_PGTABLE_RADIX_4K_H +#define _ASM_POWERPC_PGTABLE_RADIX_4K_H + +/* + * For 4K page size supported index is 13/9/9/9 + */ +#define RADIX_PTE_INDEX_SIZE 9 /* 2MB huge page */ +#define RADIX_PMD_INDEX_SIZE 9 /* 1G huge page */ +#define RADIX_PUD_INDEX_SIZE 9 +#define RADIX_PGD_INDEX_SIZE 13 + +#endif /* _ASM_POWERPC_PGTABLE_RADIX_4K_H */ diff --git a/arch/powerpc/include/asm/book3s/64/radix-64k.h b/arch/powerpc/include/asm/book3s/64/radix-64k.h new file mode 100644 index 000000000..82dc355f0 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/radix-64k.h @@ -0,0 +1,12 @@ +#ifndef _ASM_POWERPC_PGTABLE_RADIX_64K_H +#define _ASM_POWERPC_PGTABLE_RADIX_64K_H + +/* + * For 64K page size supported index is 13/9/9/5 + */ +#define RADIX_PTE_INDEX_SIZE 5 /* 2MB huge page */ +#define RADIX_PMD_INDEX_SIZE 9 /* 1G huge page */ +#define RADIX_PUD_INDEX_SIZE 9 +#define RADIX_PGD_INDEX_SIZE 13 + +#endif /* _ASM_POWERPC_PGTABLE_RADIX_64K_H */ diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h new file mode 100644 index 000000000..df294224e --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -0,0 +1,247 @@ +#ifndef _ASM_POWERPC_PGTABLE_RADIX_H +#define _ASM_POWERPC_PGTABLE_RADIX_H + +#ifndef __ASSEMBLY__ +#include <asm/cmpxchg.h> +#endif + +#ifdef CONFIG_PPC_64K_PAGES +#include <asm/book3s/64/radix-64k.h> +#else +#include <asm/book3s/64/radix-4k.h> +#endif + +/* An empty PTE can still have a R or C writeback */ +#define RADIX_PTE_NONE_MASK (_PAGE_DIRTY | _PAGE_ACCESSED) + +/* Bits to set in a RPMD/RPUD/RPGD */ +#define RADIX_PMD_VAL_BITS (0x8000000000000000UL | RADIX_PTE_INDEX_SIZE) +#define RADIX_PUD_VAL_BITS (0x8000000000000000UL | RADIX_PMD_INDEX_SIZE) +#define RADIX_PGD_VAL_BITS (0x8000000000000000UL | RADIX_PUD_INDEX_SIZE) + +/* Don't have anything in the reserved bits and leaf bits */ +#define RADIX_PMD_BAD_BITS 0x60000000000000e0UL +#define RADIX_PUD_BAD_BITS 0x60000000000000e0UL +#define RADIX_PGD_BAD_BITS 0x60000000000000e0UL + +/* + * Size of EA range mapped by our pagetables. + */ +#define RADIX_PGTABLE_EADDR_SIZE (RADIX_PTE_INDEX_SIZE + RADIX_PMD_INDEX_SIZE + \ + RADIX_PUD_INDEX_SIZE + RADIX_PGD_INDEX_SIZE + PAGE_SHIFT) +#define RADIX_PGTABLE_RANGE (ASM_CONST(1) << RADIX_PGTABLE_EADDR_SIZE) + +/* + * We support 52 bit address space, Use top bit for kernel + * virtual mapping. Also make sure kernel fit in the top + * quadrant. + * + * +------------------+ + * +------------------+ Kernel virtual map (0xc008000000000000) + * | | + * | | + * | | + * 0b11......+------------------+ Kernel linear map (0xc....) + * | | + * | 2 quadrant | + * | | + * 0b10......+------------------+ + * | | + * | 1 quadrant | + * | | + * 0b01......+------------------+ + * | | + * | 0 quadrant | + * | | + * 0b00......+------------------+ + * + * + * 3rd quadrant expanded: + * +------------------------------+ + * | | + * | | + * | | + * +------------------------------+ Kernel IO map end (0xc010000000000000) + * | | + * | | + * | 1/2 of virtual map | + * | | + * | | + * +------------------------------+ Kernel IO map start + * | | + * | 1/4 of virtual map | + * | | + * +------------------------------+ Kernel vmemap start + * | | + * | 1/4 of virtual map | + * | | + * +------------------------------+ Kernel virt start (0xc008000000000000) + * | | + * | | + * | | + * +------------------------------+ Kernel linear (0xc.....) + */ + +#define RADIX_KERN_VIRT_START ASM_CONST(0xc008000000000000) +#define RADIX_KERN_VIRT_SIZE ASM_CONST(0x0008000000000000) + +/* + * The vmalloc space starts at the beginning of that region, and + * occupies a quarter of it on radix config. + * (we keep a quarter for the virtual memmap) + */ +#define RADIX_VMALLOC_START RADIX_KERN_VIRT_START +#define RADIX_VMALLOC_SIZE (RADIX_KERN_VIRT_SIZE >> 2) +#define RADIX_VMALLOC_END (RADIX_VMALLOC_START + RADIX_VMALLOC_SIZE) +/* + * Defines the address of the vmemap area, in its own region on + * hash table CPUs. + */ +#define RADIX_VMEMMAP_BASE (RADIX_VMALLOC_END) + +#ifndef __ASSEMBLY__ +#define RADIX_PTE_TABLE_SIZE (sizeof(pte_t) << RADIX_PTE_INDEX_SIZE) +#define RADIX_PMD_TABLE_SIZE (sizeof(pmd_t) << RADIX_PMD_INDEX_SIZE) +#define RADIX_PUD_TABLE_SIZE (sizeof(pud_t) << RADIX_PUD_INDEX_SIZE) +#define RADIX_PGD_TABLE_SIZE (sizeof(pgd_t) << RADIX_PGD_INDEX_SIZE) + +static inline unsigned long radix__pte_update(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, unsigned long clr, + unsigned long set, + int huge) +{ + pte_t pte; + unsigned long old_pte, new_pte; + + do { + pte = READ_ONCE(*ptep); + old_pte = pte_val(pte); + new_pte = (old_pte | set) & ~clr; + + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + + /* We already do a sync in cmpxchg, is ptesync needed ?*/ + asm volatile("ptesync" : : : "memory"); + /* huge pages use the old page table lock */ + if (!huge) + assert_pte_locked(mm, addr); + + return old_pte; +} + +/* + * Set the dirty and/or accessed bits atomically in a linux PTE, this + * function doesn't need to invalidate tlb. + */ +static inline void radix__ptep_set_access_flags(pte_t *ptep, pte_t entry) +{ + pte_t pte; + unsigned long old_pte, new_pte; + unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED | + _PAGE_RW | _PAGE_EXEC); + do { + pte = READ_ONCE(*ptep); + old_pte = pte_val(pte); + new_pte = old_pte | set; + + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + + /* We already do a sync in cmpxchg, is ptesync needed ?*/ + asm volatile("ptesync" : : : "memory"); +} + +static inline int radix__pte_same(pte_t pte_a, pte_t pte_b) +{ + return ((pte_raw(pte_a) ^ pte_raw(pte_b)) == 0); +} + +static inline int radix__pte_none(pte_t pte) +{ + return (pte_val(pte) & ~RADIX_PTE_NONE_MASK) == 0; +} + +static inline void radix__set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, int percpu) +{ + *ptep = pte; + asm volatile("ptesync" : : : "memory"); +} + +static inline int radix__pmd_bad(pmd_t pmd) +{ + return !!(pmd_val(pmd) & RADIX_PMD_BAD_BITS); +} + +static inline int radix__pmd_same(pmd_t pmd_a, pmd_t pmd_b) +{ + return ((pmd_raw(pmd_a) ^ pmd_raw(pmd_b)) == 0); +} + +static inline int radix__pud_bad(pud_t pud) +{ + return !!(pud_val(pud) & RADIX_PUD_BAD_BITS); +} + + +static inline int radix__pgd_bad(pgd_t pgd) +{ + return !!(pgd_val(pgd) & RADIX_PGD_BAD_BITS); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +static inline int radix__pmd_trans_huge(pmd_t pmd) +{ + return !!(pmd_val(pmd) & _PAGE_PTE); +} + +static inline pmd_t radix__pmd_mkhuge(pmd_t pmd) +{ + return __pmd(pmd_val(pmd) | _PAGE_PTE); +} +static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + /* Nothing to do for radix. */ + return; +} + +extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long clr, + unsigned long set); +extern pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); +extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable); +extern pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); +extern pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp); +extern int radix__has_transparent_hugepage(void); +#endif + +extern int __meminit radix__vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys); +extern void radix__vmemmap_remove_mapping(unsigned long start, + unsigned long page_size); + +extern int radix__map_kernel_page(unsigned long ea, unsigned long pa, + pgprot_t flags, unsigned int psz); + +static inline unsigned long radix__get_tree_size(void) +{ + unsigned long rts_field; + /* + * we support 52 bits, hence 52-31 = 21, 0b10101 + * RTS encoding details + * bits 0 - 3 of rts -> bits 6 - 8 unsigned long + * bits 4 - 5 of rts -> bits 62 - 63 of unsigned long + */ + rts_field = (0x5UL << 5); /* 6 - 8 bits */ + rts_field |= (0x2UL << 61); + + return rts_field; +} +#endif /* __ASSEMBLY__ */ +#endif diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index 1b753f96b..f12ddf5e8 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -1,8 +1,6 @@ #ifndef _ASM_POWERPC_BOOK3S_64_TLBFLUSH_HASH_H #define _ASM_POWERPC_BOOK3S_64_TLBFLUSH_HASH_H -#define MMU_NO_CONTEXT 0 - /* * TLB flushing for 64-bit hash-MMU CPUs */ @@ -29,14 +27,21 @@ extern void __flush_tlb_pending(struct ppc64_tlb_batch *batch); static inline void arch_enter_lazy_mmu_mode(void) { - struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch); + struct ppc64_tlb_batch *batch; + if (radix_enabled()) + return; + batch = this_cpu_ptr(&ppc64_tlb_batch); batch->active = 1; } static inline void arch_leave_lazy_mmu_mode(void) { - struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch); + struct ppc64_tlb_batch *batch; + + if (radix_enabled()) + return; + batch = this_cpu_ptr(&ppc64_tlb_batch); if (batch->index) __flush_tlb_pending(batch); @@ -52,40 +57,42 @@ extern void flush_hash_range(unsigned long number, int local); extern void flush_hash_hugepage(unsigned long vsid, unsigned long addr, pmd_t *pmdp, unsigned int psize, int ssize, unsigned long flags); - -static inline void local_flush_tlb_mm(struct mm_struct *mm) +static inline void hash__local_flush_tlb_mm(struct mm_struct *mm) { } -static inline void flush_tlb_mm(struct mm_struct *mm) +static inline void hash__flush_tlb_mm(struct mm_struct *mm) { } -static inline void local_flush_tlb_page(struct vm_area_struct *vma, - unsigned long vmaddr) +static inline void hash__local_flush_tlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) { } -static inline void flush_tlb_page(struct vm_area_struct *vma, - unsigned long vmaddr) +static inline void hash__flush_tlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) { } -static inline void flush_tlb_page_nohash(struct vm_area_struct *vma, - unsigned long vmaddr) +static inline void hash__flush_tlb_page_nohash(struct vm_area_struct *vma, + unsigned long vmaddr) { } -static inline void flush_tlb_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) +static inline void hash__flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) { } -static inline void flush_tlb_kernel_range(unsigned long start, - unsigned long end) +static inline void hash__flush_tlb_kernel_range(unsigned long start, + unsigned long end) { } + +struct mmu_gather; +extern void hash__tlb_flush(struct mmu_gather *tlb); /* Private function for use by PCI IO mapping code */ extern void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, unsigned long end); diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h new file mode 100644 index 000000000..3fa94fcac --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h @@ -0,0 +1,36 @@ +#ifndef _ASM_POWERPC_TLBFLUSH_RADIX_H +#define _ASM_POWERPC_TLBFLUSH_RADIX_H + +struct vm_area_struct; +struct mm_struct; +struct mmu_gather; + +static inline int mmu_get_ap(int psize) +{ + return mmu_psize_defs[psize].ap; +} + +extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end); +extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end); + +extern void radix__local_flush_tlb_mm(struct mm_struct *mm); +extern void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); +extern void radix___local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + unsigned long ap, int nid); +extern void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr); +extern void radix__tlb_flush(struct mmu_gather *tlb); +#ifdef CONFIG_SMP +extern void radix__flush_tlb_mm(struct mm_struct *mm); +extern void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); +extern void radix___flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + unsigned long ap, int nid); +extern void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr); +#else +#define radix__flush_tlb_mm(mm) radix__local_flush_tlb_mm(mm) +#define radix__flush_tlb_page(vma,addr) radix__local_flush_tlb_page(vma,addr) +#define radix___flush_tlb_page(mm,addr,p,i) radix___local_flush_tlb_page(mm,addr,p,i) +#define radix__flush_tlb_pwc(tlb, addr) radix__local_flush_tlb_pwc(tlb, addr) +#endif + +#endif diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h new file mode 100644 index 000000000..96e5769b1 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -0,0 +1,90 @@ +#ifndef _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H +#define _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H + +#define MMU_NO_CONTEXT ~0UL + + +#include <asm/book3s/64/tlbflush-hash.h> +#include <asm/book3s/64/tlbflush-radix.h> + +static inline void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + if (radix_enabled()) + return radix__flush_tlb_range(vma, start, end); + return hash__flush_tlb_range(vma, start, end); +} + +static inline void flush_tlb_kernel_range(unsigned long start, + unsigned long end) +{ + if (radix_enabled()) + return radix__flush_tlb_kernel_range(start, end); + return hash__flush_tlb_kernel_range(start, end); +} + +static inline void local_flush_tlb_mm(struct mm_struct *mm) +{ + if (radix_enabled()) + return radix__local_flush_tlb_mm(mm); + return hash__local_flush_tlb_mm(mm); +} + +static inline void local_flush_tlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) +{ + if (radix_enabled()) + return radix__local_flush_tlb_page(vma, vmaddr); + return hash__local_flush_tlb_page(vma, vmaddr); +} + +static inline void flush_tlb_page_nohash(struct vm_area_struct *vma, + unsigned long vmaddr) +{ + if (radix_enabled()) + return radix__flush_tlb_page(vma, vmaddr); + return hash__flush_tlb_page_nohash(vma, vmaddr); +} + +static inline void tlb_flush(struct mmu_gather *tlb) +{ + if (radix_enabled()) + return radix__tlb_flush(tlb); + return hash__tlb_flush(tlb); +} + +#ifdef CONFIG_SMP +static inline void flush_tlb_mm(struct mm_struct *mm) +{ + if (radix_enabled()) + return radix__flush_tlb_mm(mm); + return hash__flush_tlb_mm(mm); +} + +static inline void flush_tlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) +{ + if (radix_enabled()) + return radix__flush_tlb_page(vma, vmaddr); + return hash__flush_tlb_page(vma, vmaddr); +} +#else +#define flush_tlb_mm(mm) local_flush_tlb_mm(mm) +#define flush_tlb_page(vma, addr) local_flush_tlb_page(vma, addr) +#endif /* CONFIG_SMP */ +/* + * flush the page walk cache for the address + */ +static inline void flush_tlb_pgtable(struct mmu_gather *tlb, unsigned long address) +{ + /* + * Flush the page table walk cache on freeing a page table. We already + * have marked the upper/higher level page table entry none by now. + * So it is safe to flush PWC here. + */ + if (!radix_enabled()) + return; + + radix__flush_tlb_pwc(tlb, address); +} +#endif /* _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H */ diff --git a/arch/powerpc/include/asm/book3s/pgalloc.h b/arch/powerpc/include/asm/book3s/pgalloc.h new file mode 100644 index 000000000..c0a69ae92 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/pgalloc.h @@ -0,0 +1,14 @@ +#ifndef _ASM_POWERPC_BOOK3S_PGALLOC_H +#define _ASM_POWERPC_BOOK3S_PGALLOC_H + +#include <linux/mm.h> + +extern void tlb_remove_table(struct mmu_gather *tlb, void *table); + +#ifdef CONFIG_PPC64 +#include <asm/book3s/64/pgalloc.h> +#else +#include <asm/book3s/32/pgalloc.h> +#endif + +#endif /* _ASM_POWERPC_BOOK3S_PGALLOC_H */ diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index 42814f056..e2d9f4996 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -8,6 +8,8 @@ extern struct kmem_cache *hugepte_cache; #ifdef CONFIG_PPC_BOOK3S_64 + +#include <asm/book3s/64/hugetlb-radix.h> /* * This should work for other subarchs too. But right now we use the * new format only for 64bit book3s @@ -31,7 +33,19 @@ static inline unsigned int hugepd_shift(hugepd_t hpd) { return mmu_psize_to_shift(hugepd_mmu_psize(hpd)); } +static inline void flush_hugetlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) +{ + if (radix_enabled()) + return radix__flush_hugetlb_page(vma, vmaddr); +} +static inline void __local_flush_hugetlb_page(struct vm_area_struct *vma, + unsigned long vmaddr) +{ + if (radix_enabled()) + return radix__local_flush_hugetlb_page(vma, vmaddr); +} #else static inline pte_t *hugepd_page(hugepd_t hpd) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 7529aab06..1f4497fb5 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -276,19 +276,24 @@ static inline unsigned long hpte_make_readonly(unsigned long ptel) return ptel; } -static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type) +static inline bool hpte_cache_flags_ok(unsigned long hptel, bool is_ci) { - unsigned int wimg = ptel & HPTE_R_WIMG; + unsigned int wimg = hptel & HPTE_R_WIMG; /* Handle SAO */ if (wimg == (HPTE_R_W | HPTE_R_I | HPTE_R_M) && cpu_has_feature(CPU_FTR_ARCH_206)) wimg = HPTE_R_M; - if (!io_type) + if (!is_ci) return wimg == HPTE_R_M; - - return (wimg & (HPTE_R_W | HPTE_R_I)) == io_type; + /* + * if host is mapped cache inhibited, make sure hptel also have + * cache inhibited. + */ + if (wimg & HPTE_R_W) /* FIXME!! is this ok for all guest. ? */ + return false; + return !!(wimg & HPTE_R_I); } /* @@ -305,9 +310,9 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing) */ old_pte = READ_ONCE(*ptep); /* - * wait until _PAGE_BUSY is clear then set it atomically + * wait until H_PAGE_BUSY is clear then set it atomically */ - if (unlikely(pte_val(old_pte) & _PAGE_BUSY)) { + if (unlikely(pte_val(old_pte) & H_PAGE_BUSY)) { cpu_relax(); continue; } @@ -319,27 +324,12 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing) if (writing && pte_write(old_pte)) new_pte = pte_mkdirty(new_pte); - if (pte_val(old_pte) == __cmpxchg_u64((unsigned long *)ptep, - pte_val(old_pte), - pte_val(new_pte))) { + if (pte_xchg(ptep, old_pte, new_pte)) break; - } } return new_pte; } - -/* Return HPTE cache control bits corresponding to Linux pte bits */ -static inline unsigned long hpte_cache_bits(unsigned long pte_val) -{ -#if _PAGE_NO_CACHE == HPTE_R_I && _PAGE_WRITETHRU == HPTE_R_W - return pte_val & (HPTE_R_W | HPTE_R_I); -#else - return ((pte_val & _PAGE_NO_CACHE) ? HPTE_R_I : 0) + - ((pte_val & _PAGE_WRITETHRU) ? HPTE_R_W : 0); -#endif -} - static inline bool hpte_read_permission(unsigned long pp, unsigned long key) { if (key) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index d7b343170..ec35af34a 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -40,6 +40,9 @@ #define KVM_MAX_VCORES NR_CPUS #define KVM_USER_MEM_SLOTS 512 +#include <asm/cputhreads.h> +#define KVM_MAX_VCPU_ID (threads_per_subcore * KVM_MAX_VCORES) + #ifdef CONFIG_KVM_MMIO #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #endif @@ -113,6 +116,7 @@ struct kvm_vcpu_stat { u32 ext_intr_exits; u32 halt_successful_poll; u32 halt_attempted_poll; + u32 halt_poll_invalid; u32 halt_wakeup; u32 dbell_exits; u32 gdbell_exits; @@ -724,5 +728,6 @@ static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_exit(void) {} static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} #endif /* __POWERPC_KVM_HOST_H__ */ diff --git a/arch/powerpc/include/asm/livepatch.h b/arch/powerpc/include/asm/livepatch.h new file mode 100644 index 000000000..a402f7f94 --- /dev/null +++ b/arch/powerpc/include/asm/livepatch.h @@ -0,0 +1,62 @@ +/* + * livepatch.h - powerpc-specific Kernel Live Patching Core + * + * Copyright (C) 2015-2016, SUSE, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ +#ifndef _ASM_POWERPC_LIVEPATCH_H +#define _ASM_POWERPC_LIVEPATCH_H + +#include <linux/module.h> +#include <linux/ftrace.h> + +#ifdef CONFIG_LIVEPATCH +static inline int klp_check_compiler_support(void) +{ + return 0; +} + +static inline int klp_write_module_reloc(struct module *mod, unsigned long + type, unsigned long loc, unsigned long value) +{ + /* This requires infrastructure changes; we need the loadinfos. */ + return -ENOSYS; +} + +static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +{ + regs->nip = ip; +} + +#define klp_get_ftrace_location klp_get_ftrace_location +static inline unsigned long klp_get_ftrace_location(unsigned long faddr) +{ + /* + * Live patch works only with -mprofile-kernel on PPC. In this case, + * the ftrace location is always within the first 16 bytes. + */ + return ftrace_location_range(faddr, faddr + 16); +} + +static inline void klp_init_thread_info(struct thread_info *ti) +{ + /* + 1 to account for STACK_END_MAGIC */ + ti->livepatch_sp = (unsigned long *)(ti + 1) + 1; +} +#else +static void klp_init_thread_info(struct thread_info *ti) { } +#endif /* CONFIG_LIVEPATCH */ + +#endif /* _ASM_POWERPC_LIVEPATCH_H */ diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index fd22442d3..6bdcd0da9 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -256,6 +256,7 @@ struct machdep_calls { #ifdef CONFIG_ARCH_RANDOM int (*get_random_seed)(unsigned long *v); #endif + int (*update_partition_table)(u64); }; extern void e500_idle(void); diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 8ca1c983b..e53ebebff 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -88,6 +88,11 @@ */ #define MMU_FTR_1T_SEGMENT ASM_CONST(0x40000000) +/* + * Radix page table available + */ +#define MMU_FTR_RADIX ASM_CONST(0x80000000) + /* MMU feature bit sets for various CPUs */ #define MMU_FTRS_DEFAULT_HPTE_ARCH_V2 \ MMU_FTR_HPTE_TABLE | MMU_FTR_PPCAS_ARCH_V2 @@ -110,9 +115,25 @@ DECLARE_PER_CPU(int, next_tlbcam_idx); #endif +enum { + MMU_FTRS_POSSIBLE = MMU_FTR_HPTE_TABLE | MMU_FTR_TYPE_8xx | + MMU_FTR_TYPE_40x | MMU_FTR_TYPE_44x | MMU_FTR_TYPE_FSL_E | + MMU_FTR_TYPE_47x | MMU_FTR_USE_HIGH_BATS | MMU_FTR_BIG_PHYS | + MMU_FTR_USE_TLBIVAX_BCAST | MMU_FTR_USE_TLBILX | + MMU_FTR_LOCK_BCAST_INVAL | MMU_FTR_NEED_DTLB_SW_LRU | + MMU_FTR_USE_TLBRSRV | MMU_FTR_USE_PAIRED_MAS | + MMU_FTR_NO_SLBIE_B | MMU_FTR_16M_PAGE | MMU_FTR_TLBIEL | + MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_CI_LARGE_PAGE | + MMU_FTR_1T_SEGMENT | +#ifdef CONFIG_PPC_RADIX_MMU + MMU_FTR_RADIX | +#endif + 0, +}; + static inline int mmu_has_feature(unsigned long feature) { - return (cur_cpu_spec->mmu_features & feature); + return (MMU_FTRS_POSSIBLE & cur_cpu_spec->mmu_features & feature); } static inline void mmu_clear_feature(unsigned long feature) @@ -122,13 +143,6 @@ static inline void mmu_clear_feature(unsigned long feature) extern unsigned int __start___mmu_ftr_fixup, __stop___mmu_ftr_fixup; -/* MMU initialization */ -extern void early_init_mmu(void); -extern void early_init_mmu_secondary(void); - -extern void setup_initial_memory_limit(phys_addr_t first_memblock_base, - phys_addr_t first_memblock_size); - #ifdef CONFIG_PPC64 /* This is our real memory area size on ppc64 server, on embedded, we * make it match the size our of bolted TLB area @@ -181,10 +195,20 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr) #define MMU_PAGE_COUNT 15 -#if defined(CONFIG_PPC_STD_MMU_64) -/* 64-bit classic hash table MMU */ -#include <asm/book3s/64/mmu-hash.h> -#elif defined(CONFIG_PPC_STD_MMU_32) +#ifdef CONFIG_PPC_BOOK3S_64 +#include <asm/book3s/64/mmu.h> +#else /* CONFIG_PPC_BOOK3S_64 */ + +#ifndef __ASSEMBLY__ +/* MMU initialization */ +extern void early_init_mmu(void); +extern void early_init_mmu_secondary(void); +extern void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size); +#endif /* __ASSEMBLY__ */ +#endif + +#if defined(CONFIG_PPC_STD_MMU_32) /* 32-bit classic hash table MMU */ #include <asm/book3s/32/mmu-hash.h> #elif defined(CONFIG_40x) @@ -201,6 +225,9 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr) # include <asm/mmu-8xx.h> #endif +#ifndef radix_enabled +#define radix_enabled() (0) +#endif #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_MMU_H_ */ diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 4eaab40e3..9d2cd0c36 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -33,16 +33,27 @@ extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem); extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem); #endif - -extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next); extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm); extern void set_context(unsigned long id, pgd_t *pgd); #ifdef CONFIG_PPC_BOOK3S_64 +extern void radix__switch_mmu_context(struct mm_struct *prev, + struct mm_struct *next); +static inline void switch_mmu_context(struct mm_struct *prev, + struct mm_struct *next, + struct task_struct *tsk) +{ + if (radix_enabled()) + return radix__switch_mmu_context(prev, next); + return switch_slb(tsk, next); +} + extern int __init_new_context(void); extern void __destroy_context(int context_id); static inline void mmu_context_init(void) { } #else +extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk); extern unsigned long __init_new_context(void); extern void __destroy_context(unsigned long context_id); extern void mmu_context_init(void); @@ -88,17 +99,11 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, if (cpu_has_feature(CPU_FTR_ALTIVEC)) asm volatile ("dssall"); #endif /* CONFIG_ALTIVEC */ - - /* The actual HW switching method differs between the various - * sub architectures. + /* + * The actual HW switching method differs between the various + * sub architectures. Out of line for now */ -#ifdef CONFIG_PPC_STD_MMU_64 - switch_slb(tsk, next); -#else - /* Out of line for now */ - switch_mmu_context(prev, next); -#endif - + switch_mmu_context(prev, next, tsk); } #define deactivate_mm(tsk,mm) do { } while (0) diff --git a/arch/powerpc/include/asm/pgalloc-32.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h index 76d6b9e0c..76d6b9e0c 100644 --- a/arch/powerpc/include/asm/pgalloc-32.h +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index 8d5fc3ac4..897d2e1c8 100644 --- a/arch/powerpc/include/asm/pgalloc-64.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -53,12 +53,11 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) #ifndef CONFIG_PPC_64K_PAGES -#define pgd_populate(MM, PGD, PUD) pgd_set(PGD, __pgtable_ptr_val(PUD)) +#define pgd_populate(MM, PGD, PUD) pgd_set(PGD, (unsigned long)PUD) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), - GFP_KERNEL|__GFP_REPEAT); + return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), GFP_KERNEL); } static inline void pud_free(struct mm_struct *mm, pud_t *pud) @@ -68,19 +67,19 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) { - pud_set(pud, __pgtable_ptr_val(pmd)); + pud_set(pud, (unsigned long)pmd); } static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { - pmd_set(pmd, __pgtable_ptr_val(pte)); + pmd_set(pmd, (unsigned long)pte); } static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_page) { - pmd_set(pmd, __pgtable_ptr_val(page_address(pte_page))); + pmd_set(pmd, (unsigned long)page_address(pte_page)); } #define pmd_pgtable(pmd) pmd_page(pmd) @@ -88,7 +87,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO); + return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); } static inline pgtable_t pte_alloc_one(struct mm_struct *mm, @@ -119,119 +118,65 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) __free_page(ptepage); } -static inline void pgtable_free(void *table, unsigned index_size) -{ - if (!index_size) - free_page((unsigned long)table); - else { - BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE); - kmem_cache_free(PGT_CACHE(index_size), table); - } -} - +extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); #ifdef CONFIG_SMP -static inline void pgtable_free_tlb(struct mmu_gather *tlb, - void *table, int shift) -{ - unsigned long pgf = (unsigned long)table; - BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); - pgf |= shift; - tlb_remove_table(tlb, (void *)pgf); -} - -static inline void __tlb_remove_table(void *_table) -{ - void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); - unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; - - pgtable_free(table, shift); -} -#else /* !CONFIG_SMP */ -static inline void pgtable_free_tlb(struct mmu_gather *tlb, - void *table, int shift) -{ - pgtable_free(table, shift); -} -#endif /* CONFIG_SMP */ - +extern void __tlb_remove_table(void *_table); +#endif static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, unsigned long address) { tlb_flush_pgtable(tlb, address); - pgtable_page_dtor(table); pgtable_free_tlb(tlb, page_address(table), 0); } #else /* if CONFIG_PPC_64K_PAGES */ -extern pte_t *page_table_alloc(struct mm_struct *, unsigned long, int); -extern void page_table_free(struct mm_struct *, unsigned long *, int); +extern pte_t *pte_fragment_alloc(struct mm_struct *, unsigned long, int); +extern void pte_fragment_free(unsigned long *, int); extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); #ifdef CONFIG_SMP extern void __tlb_remove_table(void *_table); #endif -#ifndef __PAGETABLE_PUD_FOLDED -/* book3s 64 is 4 level page table */ -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) -{ - pgd_set(pgd, __pgtable_ptr_val(pud)); -} - -static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) -{ - return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), - GFP_KERNEL|__GFP_REPEAT); -} - -static inline void pud_free(struct mm_struct *mm, pud_t *pud) -{ - kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud); -} -#endif - -static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) -{ - pud_set(pud, __pgtable_ptr_val(pmd)); -} +#define pud_populate(mm, pud, pmd) pud_set(pud, (unsigned long)pmd) static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { - pmd_set(pmd, __pgtable_ptr_val(pte)); + pmd_set(pmd, (unsigned long)pte); } static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t pte_page) { - pmd_set(pmd, __pgtable_ptr_val(pte_page)); + pmd_set(pmd, (unsigned long)pte_page); } static inline pgtable_t pmd_pgtable(pmd_t pmd) { - return (pgtable_t)pmd_page_vaddr(pmd); + return (pgtable_t)(pmd_val(pmd) & ~PMD_MASKED_BITS); } static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return (pte_t *)page_table_alloc(mm, address, 1); + return (pte_t *)pte_fragment_alloc(mm, address, 1); } static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { - return (pgtable_t)page_table_alloc(mm, address, 0); + return (pgtable_t)pte_fragment_alloc(mm, address, 0); } static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { - page_table_free(mm, (unsigned long *)pte, 1); + pte_fragment_free((unsigned long *)pte, 1); } static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) { - page_table_free(mm, (unsigned long *)ptepage, 0); + pte_fragment_free((unsigned long *)ptepage, 0); } static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, @@ -244,8 +189,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX), - GFP_KERNEL|__GFP_REPEAT); + return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX), GFP_KERNEL); } static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) @@ -255,11 +199,11 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) #define __pmd_free_tlb(tlb, pmd, addr) \ pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX) -#ifndef __PAGETABLE_PUD_FOLDED +#ifndef CONFIG_PPC_64K_PAGES #define __pud_free_tlb(tlb, pud, addr) \ pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE) -#endif /* __PAGETABLE_PUD_FOLDED */ +#endif /* CONFIG_PPC_64K_PAGES */ #define check_pgt_cache() do { } while (0) diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 10debb93c..d4d808cf9 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -108,9 +108,6 @@ #ifndef __ASSEMBLY__ /* pte_clear moved to later in this file */ -/* Pointers in the page table tree are virtual addresses */ -#define __pgtable_ptr_val(ptr) ((unsigned long)(ptr)) - #define PMD_BAD_BITS (PTE_TABLE_SIZE-1) #define PUD_BAD_BITS (PMD_TABLE_SIZE-1) @@ -362,6 +359,13 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) void pgtable_cache_add(unsigned shift, void (*ctor)(void *)); void pgtable_cache_init(void); +extern int map_kernel_page(unsigned long ea, unsigned long pa, + unsigned long flags); +extern int __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys); +extern void vmemmap_remove_mapping(unsigned long start, + unsigned long page_size); #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_NOHASH_64_PGTABLE_H */ diff --git a/arch/powerpc/include/asm/nohash/pgalloc.h b/arch/powerpc/include/asm/nohash/pgalloc.h new file mode 100644 index 000000000..b39ec956d --- /dev/null +++ b/arch/powerpc/include/asm/nohash/pgalloc.h @@ -0,0 +1,23 @@ +#ifndef _ASM_POWERPC_NOHASH_PGALLOC_H +#define _ASM_POWERPC_NOHASH_PGALLOC_H + +#include <linux/mm.h> + +extern void tlb_remove_table(struct mmu_gather *tlb, void *table); +#ifdef CONFIG_PPC64 +extern void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address); +#else +/* 44x etc which is BOOKE not BOOK3E */ +static inline void tlb_flush_pgtable(struct mmu_gather *tlb, + unsigned long address) +{ + +} +#endif /* !CONFIG_PPC_BOOK3E */ + +#ifdef CONFIG_PPC64 +#include <asm/nohash/64/pgalloc.h> +#else +#include <asm/nohash/32/pgalloc.h> +#endif +#endif /* _ASM_POWERPC_NOHASH_PGALLOC_H */ diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index f8faaaeec..9bb8ddf0b 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -368,16 +368,16 @@ enum OpalLPCAddressType { }; enum opal_msg_type { - OPAL_MSG_ASYNC_COMP = 0, /* params[0] = token, params[1] = rc, + OPAL_MSG_ASYNC_COMP = 0, /* params[0] = token, params[1] = rc, * additional params function-specific */ - OPAL_MSG_MEM_ERR, - OPAL_MSG_EPOW, - OPAL_MSG_SHUTDOWN, /* params[0] = 1 reboot, 0 shutdown */ - OPAL_MSG_HMI_EVT, - OPAL_MSG_DPO, - OPAL_MSG_PRD, - OPAL_MSG_OCC, + OPAL_MSG_MEM_ERR = 1, + OPAL_MSG_EPOW = 2, + OPAL_MSG_SHUTDOWN = 3, /* params[0] = 1 reboot, 0 shutdown */ + OPAL_MSG_HMI_EVT = 4, + OPAL_MSG_DPO = 5, + OPAL_MSG_PRD = 6, + OPAL_MSG_OCC = 7, OPAL_MSG_TYPE_MAX, }; diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index ab3d8977b..51db3a37b 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -288,7 +288,11 @@ extern long long virt_phys_offset; #ifndef __ASSEMBLY__ +#ifdef CONFIG_PPC_BOOK3S_64 +#include <asm/pgtable-be-types.h> +#else #include <asm/pgtable-types.h> +#endif typedef struct { signed long pd; } hugepd_t; @@ -312,12 +316,20 @@ void arch_free_page(struct page *page, int order); #endif struct vm_area_struct; - +#ifdef CONFIG_PPC_BOOK3S_64 +/* + * For BOOK3s 64 with 4k and 64K linux page size + * we want to use pointers, because the page table + * actually store pfn + */ +typedef pte_t *pgtable_t; +#else #if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC64) typedef pte_t *pgtable_t; #else typedef struct page *pgtable_t; #endif +#endif #include <asm-generic/memory_model.h> #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index d908a46d0..dd5f0712a 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -93,7 +93,7 @@ extern u64 ppc64_pft_size; #define SLICE_LOW_TOP (0x100000000ul) #define SLICE_NUM_LOW (SLICE_LOW_TOP >> SLICE_LOW_SHIFT) -#define SLICE_NUM_HIGH (PGTABLE_RANGE >> SLICE_HIGH_SHIFT) +#define SLICE_NUM_HIGH (H_PGTABLE_RANGE >> SLICE_HIGH_SHIFT) #define GET_LOW_SLICE_INDEX(addr) ((addr) >> SLICE_LOW_SHIFT) #define GET_HIGH_SLICE_INDEX(addr) ((addr) >> SLICE_HIGH_SHIFT) @@ -128,8 +128,6 @@ extern void slice_set_user_psize(struct mm_struct *mm, unsigned int psize); extern void slice_set_range_psize(struct mm_struct *mm, unsigned long start, unsigned long len, unsigned int psize); -#define slice_mm_new_context(mm) ((mm)->context.id == MMU_NO_CONTEXT) - #endif /* __ASSEMBLY__ */ #else #define slice_init() @@ -151,7 +149,6 @@ do { \ #define slice_set_range_psize(mm, start, len, psize) \ slice_set_user_psize((mm), (psize)) -#define slice_mm_new_context(mm) 1 #endif /* CONFIG_PPC_MM_SLICES */ #ifdef CONFIG_HUGETLB_PAGE diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index f5056e339..467c0b05b 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -17,33 +17,34 @@ struct device_node; * PCI controller operations */ struct pci_controller_ops { - void (*dma_dev_setup)(struct pci_dev *dev); + void (*dma_dev_setup)(struct pci_dev *pdev); void (*dma_bus_setup)(struct pci_bus *bus); - int (*probe_mode)(struct pci_bus *); + int (*probe_mode)(struct pci_bus *bus); /* Called when pci_enable_device() is called. Returns true to * allow assignment/enabling of the device. */ - bool (*enable_device_hook)(struct pci_dev *); + bool (*enable_device_hook)(struct pci_dev *pdev); - void (*disable_device)(struct pci_dev *); + void (*disable_device)(struct pci_dev *pdev); - void (*release_device)(struct pci_dev *); + void (*release_device)(struct pci_dev *pdev); /* Called during PCI resource reassignment */ - resource_size_t (*window_alignment)(struct pci_bus *, unsigned long type); - void (*reset_secondary_bus)(struct pci_dev *dev); + resource_size_t (*window_alignment)(struct pci_bus *bus, + unsigned long type); + void (*reset_secondary_bus)(struct pci_dev *pdev); #ifdef CONFIG_PCI_MSI - int (*setup_msi_irqs)(struct pci_dev *dev, + int (*setup_msi_irqs)(struct pci_dev *pdev, int nvec, int type); - void (*teardown_msi_irqs)(struct pci_dev *dev); + void (*teardown_msi_irqs)(struct pci_dev *pdev); #endif - int (*dma_set_mask)(struct pci_dev *dev, u64 dma_mask); - u64 (*dma_get_required_mask)(struct pci_dev *dev); + int (*dma_set_mask)(struct pci_dev *pdev, u64 dma_mask); + u64 (*dma_get_required_mask)(struct pci_dev *pdev); - void (*shutdown)(struct pci_controller *); + void (*shutdown)(struct pci_controller *hose); }; /* @@ -208,14 +209,14 @@ struct pci_dn { #ifdef CONFIG_EEH struct eeh_dev *edev; /* eeh device */ #endif -#define IODA_INVALID_PE (-1) +#define IODA_INVALID_PE 0xFFFFFFFF #ifdef CONFIG_PPC_POWERNV - int pe_number; + unsigned int pe_number; int vf_index; /* VF index in the PF */ #ifdef CONFIG_PCI_IOV u16 vfs_expanded; /* number of VFs IOV BAR expanded */ u16 num_vfs; /* number of VFs enabled*/ - int *pe_num_map; /* PE# for the first VF PE or array */ + unsigned int *pe_num_map; /* PE# for the first VF PE or array */ bool m64_single_mode; /* Use M64 BAR in Single Mode */ #define IODA_INVALID_M64 (-1) int (*m64_map)[PCI_SRIOV_NUM_BARS]; @@ -234,7 +235,9 @@ extern struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus, extern struct pci_dn *pci_get_pdn(struct pci_dev *pdev); extern struct pci_dn *add_dev_pci_data(struct pci_dev *pdev); extern void remove_dev_pci_data(struct pci_dev *pdev); -extern void *update_dn_pci_info(struct device_node *dn, void *data); +extern struct pci_dn *pci_add_device_node_info(struct pci_controller *hose, + struct device_node *dn); +extern void pci_remove_device_node_info(struct device_node *dn); static inline int pci_device_from_OF_node(struct device_node *np, u8 *bus, u8 *devfn) @@ -256,13 +259,13 @@ static inline struct eeh_dev *pdn_to_eeh_dev(struct pci_dn *pdn) #endif /** Find the bus corresponding to the indicated device node */ -extern struct pci_bus *pcibios_find_pci_bus(struct device_node *dn); +extern struct pci_bus *pci_find_bus_by_node(struct device_node *dn); /** Remove all of the PCI devices under this bus */ -extern void pcibios_remove_pci_devices(struct pci_bus *bus); +extern void pci_hp_remove_devices(struct pci_bus *bus); /** Discover new pci devices under this bus, and add them */ -extern void pcibios_add_pci_devices(struct pci_bus *bus); +extern void pci_hp_add_devices(struct pci_bus *bus); extern void isa_bridge_find_early(struct pci_controller *hose); diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index fc3ee06ea..0413457ba 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -1,25 +1,12 @@ #ifndef _ASM_POWERPC_PGALLOC_H #define _ASM_POWERPC_PGALLOC_H -#ifdef __KERNEL__ #include <linux/mm.h> -#ifdef CONFIG_PPC_BOOK3E -extern void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address); -#else /* CONFIG_PPC_BOOK3E */ -static inline void tlb_flush_pgtable(struct mmu_gather *tlb, - unsigned long address) -{ -} -#endif /* !CONFIG_PPC_BOOK3E */ - -extern void tlb_remove_table(struct mmu_gather *tlb, void *table); - -#ifdef CONFIG_PPC64 -#include <asm/pgalloc-64.h> +#ifdef CONFIG_PPC_BOOK3S +#include <asm/book3s/pgalloc.h> #else -#include <asm/pgalloc-32.h> +#include <asm/nohash/pgalloc.h> #endif -#endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_PGALLOC_H */ diff --git a/arch/powerpc/include/asm/pgtable-be-types.h b/arch/powerpc/include/asm/pgtable-be-types.h new file mode 100644 index 000000000..e2bf20860 --- /dev/null +++ b/arch/powerpc/include/asm/pgtable-be-types.h @@ -0,0 +1,92 @@ +#ifndef _ASM_POWERPC_PGTABLE_BE_TYPES_H +#define _ASM_POWERPC_PGTABLE_BE_TYPES_H + +#include <asm/cmpxchg.h> + +/* PTE level */ +typedef struct { __be64 pte; } pte_t; +#define __pte(x) ((pte_t) { cpu_to_be64(x) }) +static inline unsigned long pte_val(pte_t x) +{ + return be64_to_cpu(x.pte); +} + +static inline __be64 pte_raw(pte_t x) +{ + return x.pte; +} + +/* PMD level */ +#ifdef CONFIG_PPC64 +typedef struct { __be64 pmd; } pmd_t; +#define __pmd(x) ((pmd_t) { cpu_to_be64(x) }) +static inline unsigned long pmd_val(pmd_t x) +{ + return be64_to_cpu(x.pmd); +} + +static inline __be64 pmd_raw(pmd_t x) +{ + return x.pmd; +} + +/* + * 64 bit hash always use 4 level table. Everybody else use 4 level + * only for 4K page size. + */ +#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES) +typedef struct { __be64 pud; } pud_t; +#define __pud(x) ((pud_t) { cpu_to_be64(x) }) +static inline unsigned long pud_val(pud_t x) +{ + return be64_to_cpu(x.pud); +} +#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */ +#endif /* CONFIG_PPC64 */ + +/* PGD level */ +typedef struct { __be64 pgd; } pgd_t; +#define __pgd(x) ((pgd_t) { cpu_to_be64(x) }) +static inline unsigned long pgd_val(pgd_t x) +{ + return be64_to_cpu(x.pgd); +} + +/* Page protection bits */ +typedef struct { unsigned long pgprot; } pgprot_t; +#define pgprot_val(x) ((x).pgprot) +#define __pgprot(x) ((pgprot_t) { (x) }) + +/* + * With hash config 64k pages additionally define a bigger "real PTE" type that + * gathers the "second half" part of the PTE for pseudo 64k pages + */ +#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_STD_MMU_64) +typedef struct { pte_t pte; unsigned long hidx; } real_pte_t; +#else +typedef struct { pte_t pte; } real_pte_t; +#endif + +static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new) +{ + unsigned long *p = (unsigned long *)ptep; + __be64 prev; + + prev = (__force __be64)__cmpxchg_u64(p, (__force unsigned long)pte_raw(old), + (__force unsigned long)pte_raw(new)); + + return pte_raw(old) == prev; +} + +static inline bool pmd_xchg(pmd_t *pmdp, pmd_t old, pmd_t new) +{ + unsigned long *p = (unsigned long *)pmdp; + __be64 prev; + + prev = (__force __be64)__cmpxchg_u64(p, (__force unsigned long)pmd_raw(old), + (__force unsigned long)pmd_raw(new)); + + return pmd_raw(old) == prev; +} + +#endif /* _ASM_POWERPC_PGTABLE_BE_TYPES_H */ diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h index 43140f8b0..e7f4f3e0f 100644 --- a/arch/powerpc/include/asm/pgtable-types.h +++ b/arch/powerpc/include/asm/pgtable-types.h @@ -1,9 +1,6 @@ #ifndef _ASM_POWERPC_PGTABLE_TYPES_H #define _ASM_POWERPC_PGTABLE_TYPES_H -#ifdef CONFIG_STRICT_MM_TYPECHECKS -/* These are used to make use of C type-checking. */ - /* PTE level */ typedef struct { pte_basic_t pte; } pte_t; #define __pte(x) ((pte_t) { (x) }) @@ -48,49 +45,6 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define pgprot_val(x) ((x).pgprot) #define __pgprot(x) ((pgprot_t) { (x) }) -#else - -/* - * .. while these make it easier on the compiler - */ - -typedef pte_basic_t pte_t; -#define __pte(x) (x) -static inline pte_basic_t pte_val(pte_t pte) -{ - return pte; -} - -#ifdef CONFIG_PPC64 -typedef unsigned long pmd_t; -#define __pmd(x) (x) -static inline unsigned long pmd_val(pmd_t pmd) -{ - return pmd; -} - -#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES) -typedef unsigned long pud_t; -#define __pud(x) (x) -static inline unsigned long pud_val(pud_t pud) -{ - return pud; -} -#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */ -#endif /* CONFIG_PPC64 */ - -typedef unsigned long pgd_t; -#define __pgd(x) (x) -static inline unsigned long pgd_val(pgd_t pgd) -{ - return pgd; -} - -typedef unsigned long pgprot_t; -#define pgprot_val(x) (x) -#define __pgprot(x) (x) - -#endif /* CONFIG_STRICT_MM_TYPECHECKS */ /* * With hash config 64k pages additionally define a bigger "real PTE" type that * gathers the "second half" part of the PTE for pseudo 64k pages @@ -100,4 +54,16 @@ typedef struct { pte_t pte; unsigned long hidx; } real_pte_t; #else typedef struct { pte_t pte; } real_pte_t; #endif + +#ifdef CONFIG_PPC_STD_MMU_64 +#include <asm/cmpxchg.h> + +static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new) +{ + unsigned long *p = (unsigned long *)ptep; + + return pte_val(old) == __cmpxchg_u64(p, pte_val(old), pte_val(new)); +} +#endif + #endif /* _ASM_POWERPC_PGTABLE_TYPES_H */ diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 47897a309..ee09e9909 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -65,7 +65,6 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, struct page **pages, int *nr); #ifndef CONFIG_TRANSPARENT_HUGEPAGE #define pmd_large(pmd) 0 -#define has_transparent_hugepage() 0 #endif pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, bool *is_thp, unsigned *shift); diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 7ab04fc59..1d035c1cc 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -131,6 +131,7 @@ /* sorted alphabetically */ #define PPC_INST_BHRBE 0x7c00025c #define PPC_INST_CLRBHRB 0x7c00035c +#define PPC_INST_CP_ABORT 0x7c00068c #define PPC_INST_DCBA 0x7c0005ec #define PPC_INST_DCBA_MASK 0xfc0007fe #define PPC_INST_DCBAL 0x7c2005ec @@ -285,6 +286,7 @@ #endif /* Deal with instructions that older assemblers aren't aware of */ +#define PPC_CP_ABORT stringify_in_c(.long PPC_INST_CP_ABORT) #define PPC_DCBAL(a, b) stringify_in_c(.long PPC_INST_DCBAL | \ __PPC_RA(a) | __PPC_RB(b)) #define PPC_DCBZL(a, b) stringify_in_c(.long PPC_INST_DCBZL | \ diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index ca0c5bff7..8753e4eb9 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -33,9 +33,9 @@ extern struct pci_dev *isa_bridge_pcidev; /* may be NULL if no ISA bus */ struct device_node; struct pci_dn; -typedef void *(*traverse_func)(struct device_node *me, void *data); -void *traverse_pci_devices(struct device_node *start, traverse_func pre, - void *data); +void *pci_traverse_device_nodes(struct device_node *start, + void *(*fn)(struct device_node *, void *), + void *data); void *traverse_pci_dn(struct pci_dn *root, void *(*fn)(struct pci_dn *, void *), void *data); diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 499d9f894..2b3163237 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -427,7 +427,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601) li r4,1024; \ mtctr r4; \ lis r4,KERNELBASE@h; \ + .machine push; \ + .machine "power4"; \ 0: tlbie r4; \ + .machine pop; \ addi r4,r4,0x1000; \ bdnz 0b #endif diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index 1ec67b043..2eeaf80d4 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -76,6 +76,16 @@ */ #ifndef __ASSEMBLY__ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void); + +/* + * Don't just check for any non zero bits in __PAGE_USER, since for book3e + * and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in + * _PAGE_USER. Need to explicitly match _PAGE_BAP_UR bit in that case too. + */ +static inline bool pte_user(pte_t pte) +{ + return (pte_val(pte) & _PAGE_USER) == _PAGE_USER; +} #endif /* __ASSEMBLY__ */ /* Location of the PFN in the PTE. Most 32-bit platforms use the same @@ -184,13 +194,6 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void); /* Make modules code happy. We don't set RO yet */ #define PAGE_KERNEL_EXEC PAGE_KERNEL_X -/* - * Don't just check for any non zero bits in __PAGE_USER, since for book3e - * and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in - * _PAGE_USER. Need to explicitly match _PAGE_BAP_UR bit in that case too. - */ -#define pte_user(val) ((val & _PAGE_USER) == _PAGE_USER) - /* Advertise special mapping type for AGP */ #define PAGE_AGP (PAGE_KERNEL_NC) #define HAVE_PAGE_AGP @@ -198,3 +201,12 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void); /* Advertise support for _PAGE_SPECIAL */ #define __HAVE_ARCH_PTE_SPECIAL +#ifndef _PAGE_READ +/* if not defined, we should not find _PAGE_WRITE too */ +#define _PAGE_READ 0 +#define _PAGE_WRITE _PAGE_RW +#endif + +#ifndef H_PAGE_4K_PFN +#define H_PAGE_4K_PFN 0 +#endif diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 166d86317..a0948f40b 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -347,6 +347,7 @@ #define LPCR_LPES_SH 2 #define LPCR_RMI 0x00000002 /* real mode is cache inhibit */ #define LPCR_HDICE 0x00000001 /* Hyp Decr enable (HV,PR,EE) */ +#define LPCR_UPRT 0x00400000 /* Use Process Table (ISA 3) */ #ifndef SPRN_LPID #define SPRN_LPID 0x13F /* Logical Partition Identifier */ #endif @@ -587,6 +588,7 @@ #define SPRN_PIR 0x3FF /* Processor Identification Register */ #endif #define SPRN_TIR 0x1BE /* Thread Identification Register */ +#define SPRN_PTCR 0x1D0 /* Partition table control Register */ #define SPRN_PSPB 0x09F /* Problem State Priority Boost reg */ #define SPRN_PTEHI 0x3D5 /* 981 7450 PTE HI word (S/W TLB load) */ #define SPRN_PTELO 0x3D6 /* 982 7450 PTE LO word (S/W TLB load) */ @@ -1182,6 +1184,7 @@ #define PVR_970GX 0x0045 #define PVR_POWER7p 0x004A #define PVR_POWER8E 0x004B +#define PVR_POWER8NVL 0x004C #define PVR_POWER8 0x004D #define PVR_BE 0x0070 #define PVR_PA6T 0x0090 diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 7efee4a32..8febc3f66 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -43,7 +43,9 @@ struct thread_info { int preempt_count; /* 0 => preemptable, <0 => BUG */ unsigned long local_flags; /* private flags for thread */ - +#ifdef CONFIG_LIVEPATCH + unsigned long *livepatch_sp; +#endif /* low level flags - has atomic operations done on it */ unsigned long flags ____cacheline_aligned_in_smp; }; diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h index 9f77f85e3..1b38eea28 100644 --- a/arch/powerpc/include/asm/tlbflush.h +++ b/arch/powerpc/include/asm/tlbflush.h @@ -58,6 +58,7 @@ extern void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, #elif defined(CONFIG_PPC_STD_MMU_32) +#define MMU_NO_CONTEXT (0) /* * TLB flushing for "classic" hash-MMU 32-bit CPUs, 6xx, 7xx, 7xxx */ @@ -78,7 +79,7 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm) } #elif defined(CONFIG_PPC_STD_MMU_64) -#include <asm/book3s/64/tlbflush-hash.h> +#include <asm/book3s/64/tlbflush.h> #else #error Unsupported MMU type #endif diff --git a/arch/powerpc/include/uapi/asm/perf_regs.h b/arch/powerpc/include/uapi/asm/perf_regs.h new file mode 100644 index 000000000..6a9320974 --- /dev/null +++ b/arch/powerpc/include/uapi/asm/perf_regs.h @@ -0,0 +1,50 @@ +#ifndef _UAPI_ASM_POWERPC_PERF_REGS_H +#define _UAPI_ASM_POWERPC_PERF_REGS_H + +enum perf_event_powerpc_regs { + PERF_REG_POWERPC_R0, + PERF_REG_POWERPC_R1, + PERF_REG_POWERPC_R2, + PERF_REG_POWERPC_R3, + PERF_REG_POWERPC_R4, + PERF_REG_POWERPC_R5, + PERF_REG_POWERPC_R6, + PERF_REG_POWERPC_R7, + PERF_REG_POWERPC_R8, + PERF_REG_POWERPC_R9, + PERF_REG_POWERPC_R10, + PERF_REG_POWERPC_R11, + PERF_REG_POWERPC_R12, + PERF_REG_POWERPC_R13, + PERF_REG_POWERPC_R14, + PERF_REG_POWERPC_R15, + PERF_REG_POWERPC_R16, + PERF_REG_POWERPC_R17, + PERF_REG_POWERPC_R18, + PERF_REG_POWERPC_R19, + PERF_REG_POWERPC_R20, + PERF_REG_POWERPC_R21, + PERF_REG_POWERPC_R22, + PERF_REG_POWERPC_R23, + PERF_REG_POWERPC_R24, + PERF_REG_POWERPC_R25, + PERF_REG_POWERPC_R26, + PERF_REG_POWERPC_R27, + PERF_REG_POWERPC_R28, + PERF_REG_POWERPC_R29, + PERF_REG_POWERPC_R30, + PERF_REG_POWERPC_R31, + PERF_REG_POWERPC_NIP, + PERF_REG_POWERPC_MSR, + PERF_REG_POWERPC_ORIG_R3, + PERF_REG_POWERPC_CTR, + PERF_REG_POWERPC_LINK, + PERF_REG_POWERPC_XER, + PERF_REG_POWERPC_CCR, + PERF_REG_POWERPC_SOFTE, + PERF_REG_POWERPC_TRAP, + PERF_REG_POWERPC_DAR, + PERF_REG_POWERPC_DSISR, + PERF_REG_POWERPC_MAX, +}; +#endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 0d0183d31..9ea09551a 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -86,6 +86,10 @@ int main(void) DEFINE(KSP_LIMIT, offsetof(struct thread_struct, ksp_limit)); #endif /* CONFIG_PPC64 */ +#ifdef CONFIG_LIVEPATCH + DEFINE(TI_livepatch_sp, offsetof(struct thread_info, livepatch_sp)); +#endif + DEFINE(KSP, offsetof(struct thread_struct, ksp)); DEFINE(PT_REGS, offsetof(struct thread_struct, regs)); #ifdef CONFIG_BOOKE @@ -434,7 +438,11 @@ int main(void) DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry)); #endif +#ifdef MAX_PGD_TABLE_SIZE + DEFINE(PGD_TABLE_SIZE, MAX_PGD_TABLE_SIZE); +#else DEFINE(PGD_TABLE_SIZE, PGD_TABLE_SIZE); +#endif DEFINE(PTE_SIZE, sizeof(pte_t)); #ifdef CONFIG_KVM diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c index 41c011cb6..8275858a4 100644 --- a/arch/powerpc/kernel/btext.c +++ b/arch/powerpc/kernel/btext.c @@ -162,7 +162,7 @@ void btext_map(void) offset = ((unsigned long) dispDeviceBase) - base; size = dispDeviceRowBytes * dispDeviceRect[3] + offset + dispDeviceRect[0]; - vbase = __ioremap(base, size, _PAGE_NO_CACHE); + vbase = __ioremap(base, size, pgprot_val(pgprot_noncached_wc(__pgprot(0)))); if (vbase == 0) return; logicalDisplayBase = vbase + offset; diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 6c662b8de..eeeacf623 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -63,7 +63,6 @@ extern void __setup_cpu_745x(unsigned long offset, struct cpu_spec* spec); extern void __setup_cpu_ppc970(unsigned long offset, struct cpu_spec* spec); extern void __setup_cpu_ppc970MP(unsigned long offset, struct cpu_spec* spec); extern void __setup_cpu_pa6t(unsigned long offset, struct cpu_spec* spec); -extern void __setup_cpu_a2(unsigned long offset, struct cpu_spec* spec); extern void __restore_cpu_pa6t(void); extern void __restore_cpu_ppc970(void); extern void __setup_cpu_power7(unsigned long offset, struct cpu_spec* spec); @@ -72,7 +71,6 @@ extern void __setup_cpu_power8(unsigned long offset, struct cpu_spec* spec); extern void __restore_cpu_power8(void); extern void __setup_cpu_power9(unsigned long offset, struct cpu_spec* spec); extern void __restore_cpu_power9(void); -extern void __restore_cpu_a2(void); extern void __flush_tlb_power7(unsigned int action); extern void __flush_tlb_power8(unsigned int action); extern void __flush_tlb_power9(unsigned int action); diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 675848479..c9bc78e9c 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -48,7 +48,7 @@ /** Overview: - * EEH, or "Extended Error Handling" is a PCI bridge technology for + * EEH, or "Enhanced Error Handling" is a PCI bridge technology for * dealing with PCI bus errors that can't be dealt with within the * usual PCI framework, except by check-stopping the CPU. Systems * that are designed for high-availability/reliability cannot afford @@ -1336,14 +1336,11 @@ static int eeh_pe_change_owner(struct eeh_pe *pe) id->subdevice != pdev->subsystem_device) continue; - goto reset; + return eeh_pe_reset_and_recover(pe); } } return eeh_unfreeze_pe(pe, true); - -reset: - return eeh_pe_reset_and_recover(pe); } /** diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 31e4c7e1a..d70101e1e 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -575,7 +575,7 @@ static int eeh_clear_pe_frozen_state(struct eeh_pe *pe, int eeh_pe_reset_and_recover(struct eeh_pe *pe) { - int result, ret; + int ret; /* Bail if the PE is being recovered */ if (pe->state & EEH_PE_RECOVERING) @@ -601,15 +601,9 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe) return ret; } - /* Notify completion of reset */ - eeh_pe_dev_traverse(pe, eeh_report_reset, &result); - /* Restore device state */ eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL); - /* Resume */ - eeh_pe_dev_traverse(pe, eeh_report_resume, NULL); - /* Clear recovery mode */ eeh_pe_state_clear(pe, EEH_PE_RECOVERING); @@ -641,20 +635,19 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, * We don't remove the corresponding PE instances because * we need the information afterwords. The attached EEH * devices are expected to be attached soon when calling - * into pcibios_add_pci_devices(). + * into pci_hp_add_devices(). */ eeh_pe_state_mark(pe, EEH_PE_KEEP); if (bus) { if (pe->type & EEH_PE_VF) { eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL); } else { - eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); pci_lock_rescan_remove(); - pcibios_remove_pci_devices(bus); + pci_hp_remove_devices(bus); pci_unlock_rescan_remove(); } } else if (frozen_bus) { - eeh_pe_dev_traverse(pe, eeh_rmv_device, &rmv_data); + eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data); } /* @@ -698,10 +691,12 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, */ edev = list_first_entry(&pe->edevs, struct eeh_dev, list); eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL); - if (pe->type & EEH_PE_VF) + if (pe->type & EEH_PE_VF) { eeh_add_virt_device(edev, NULL); - else - pcibios_add_pci_devices(bus); + } else { + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); + pci_hp_add_devices(bus); + } } else if (frozen_bus && rmv_data->removed) { pr_info("EEH: Sleep 5s ahead of partial hotplug\n"); ssleep(5); @@ -711,7 +706,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, if (pe->type & EEH_PE_VF) eeh_add_virt_device(edev, NULL); else - pcibios_add_pci_devices(frozen_bus); + pci_hp_add_devices(frozen_bus); } eeh_pe_state_clear(pe, EEH_PE_KEEP); @@ -916,7 +911,7 @@ perm_error: eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); pci_lock_rescan_remove(); - pcibios_remove_pci_devices(frozen_bus); + pci_hp_remove_devices(frozen_bus); pci_unlock_rescan_remove(); } } @@ -1001,7 +996,7 @@ static void eeh_handle_special_event(void) bus = eeh_pe_bus_get(phb_pe); eeh_pe_dev_traverse(pe, eeh_report_failure, NULL); - pcibios_remove_pci_devices(bus); + pci_hp_remove_devices(bus); } pci_unlock_rescan_remove(); } diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c index 4eefb6e34..82e7327e3 100644 --- a/arch/powerpc/kernel/eeh_event.c +++ b/arch/powerpc/kernel/eeh_event.c @@ -36,7 +36,7 @@ static DEFINE_SPINLOCK(eeh_eventlist_lock); static struct semaphore eeh_eventlist_sem; -LIST_HEAD(eeh_eventlist); +static LIST_HEAD(eeh_eventlist); /** * eeh_event_handler - Dispatch EEH events. diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index eea48d8ba..f0520da85 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -249,7 +249,7 @@ static void *__eeh_pe_get(void *data, void *flag) } else { if (edev->pe_config_addr && (edev->pe_config_addr == pe->addr)) - return pe; + return pe; } /* Try BDF address */ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 9916d150b..73e461a3d 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -20,6 +20,7 @@ #include <linux/errno.h> #include <linux/err.h> +#include <linux/magic.h> #include <asm/unistd.h> #include <asm/processor.h> #include <asm/page.h> @@ -36,6 +37,7 @@ #include <asm/hw_irq.h> #include <asm/context_tracking.h> #include <asm/tm.h> +#include <asm/ppc-opcode.h> /* * System calls. @@ -508,6 +510,14 @@ BEGIN_FTR_SECTION ldarx r6,0,r1 END_FTR_SECTION_IFSET(CPU_FTR_STCX_CHECKS_ADDRESS) +BEGIN_FTR_SECTION +/* + * A cp_abort (copy paste abort) here ensures that when context switching, a + * copy from one process can't leak into the paste of another. + */ + PPC_CP_ABORT +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + #ifdef CONFIG_PPC_BOOK3S /* Cancel all explict user streams as they will have no use after context * switch and will stop the HW from creating streams itself @@ -519,7 +529,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_STCX_CHECKS_ADDRESS) std r6,PACACURRENT(r13) /* Set new 'current' */ ld r8,KSP(r4) /* new stack pointer */ -#ifdef CONFIG_PPC_BOOK3S +#ifdef CONFIG_PPC_STD_MMU_64 +BEGIN_MMU_FTR_SECTION + b 2f +END_MMU_FTR_SECTION_IFSET(MMU_FTR_RADIX) BEGIN_FTR_SECTION clrrdi r6,r8,28 /* get its ESID */ clrrdi r9,r1,28 /* get current sp ESID */ @@ -565,7 +578,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) slbmte r7,r0 isync 2: -#endif /* !CONFIG_PPC_BOOK3S */ +#endif /* CONFIG_PPC_STD_MMU_64 */ CURRENT_THREAD_INFO(r7, r8) /* base of new stack */ /* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE @@ -1248,6 +1261,9 @@ _GLOBAL(ftrace_caller) addi r3,r3,function_trace_op@toc@l ld r5,0(r3) +#ifdef CONFIG_LIVEPATCH + mr r14,r7 /* remember old NIP */ +#endif /* Calculate ip from nip-4 into r3 for call below */ subi r3, r7, MCOUNT_INSN_SIZE @@ -1272,6 +1288,9 @@ ftrace_call: /* Load ctr with the possibly modified NIP */ ld r3, _NIP(r1) mtctr r3 +#ifdef CONFIG_LIVEPATCH + cmpd r14,r3 /* has NIP been altered? */ +#endif /* Restore gprs */ REST_8GPRS(0,r1) @@ -1289,6 +1308,11 @@ ftrace_call: ld r0, LRSAVE(r1) mtlr r0 +#ifdef CONFIG_LIVEPATCH + /* Based on the cmpd above, if the NIP was altered handle livepatch */ + bne- livepatch_handler +#endif + #ifdef CONFIG_FUNCTION_GRAPH_TRACER stdu r1, -112(r1) .globl ftrace_graph_call @@ -1305,6 +1329,91 @@ _GLOBAL(ftrace_graph_stub) _GLOBAL(ftrace_stub) blr + +#ifdef CONFIG_LIVEPATCH + /* + * This function runs in the mcount context, between two functions. As + * such it can only clobber registers which are volatile and used in + * function linkage. + * + * We get here when a function A, calls another function B, but B has + * been live patched with a new function C. + * + * On entry: + * - we have no stack frame and can not allocate one + * - LR points back to the original caller (in A) + * - CTR holds the new NIP in C + * - r0 & r12 are free + * + * r0 can't be used as the base register for a DS-form load or store, so + * we temporarily shuffle r1 (stack pointer) into r0 and then put it back. + */ +livepatch_handler: + CURRENT_THREAD_INFO(r12, r1) + + /* Save stack pointer into r0 */ + mr r0, r1 + + /* Allocate 3 x 8 bytes */ + ld r1, TI_livepatch_sp(r12) + addi r1, r1, 24 + std r1, TI_livepatch_sp(r12) + + /* Save toc & real LR on livepatch stack */ + std r2, -24(r1) + mflr r12 + std r12, -16(r1) + + /* Store stack end marker */ + lis r12, STACK_END_MAGIC@h + ori r12, r12, STACK_END_MAGIC@l + std r12, -8(r1) + + /* Restore real stack pointer */ + mr r1, r0 + + /* Put ctr in r12 for global entry and branch there */ + mfctr r12 + bctrl + + /* + * Now we are returning from the patched function to the original + * caller A. We are free to use r0 and r12, and we can use r2 until we + * restore it. + */ + + CURRENT_THREAD_INFO(r12, r1) + + /* Save stack pointer into r0 */ + mr r0, r1 + + ld r1, TI_livepatch_sp(r12) + + /* Check stack marker hasn't been trashed */ + lis r2, STACK_END_MAGIC@h + ori r2, r2, STACK_END_MAGIC@l + ld r12, -8(r1) +1: tdne r12, r2 + EMIT_BUG_ENTRY 1b, __FILE__, __LINE__ - 1, 0 + + /* Restore LR & toc from livepatch stack */ + ld r12, -16(r1) + mtlr r12 + ld r2, -24(r1) + + /* Pop livepatch stack frame */ + CURRENT_THREAD_INFO(r12, r0) + subi r1, r1, 24 + std r1, TI_livepatch_sp(r12) + + /* Restore real stack pointer */ + mr r1, r0 + + /* Return to original caller of live patched function */ + blr +#endif + + #else _GLOBAL_TOC(_mcount) /* Taken from output of objdump from lib64/glibc */ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 2b66f25d4..8bcc1b457 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -189,7 +189,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #endif /* CONFIG_PPC_P7_NAP */ EXCEPTION_PROLOG_0(PACA_EXMC) BEGIN_FTR_SECTION - b machine_check_pSeries_early + b machine_check_powernv_early FTR_SECTION_ELSE b machine_check_pSeries_0 ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) @@ -209,11 +209,6 @@ data_access_slb_pSeries: EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380) std r3,PACA_EXSLB+EX_R3(r13) mfspr r3,SPRN_DAR -#ifdef __DISABLED__ - /* Keep that around for when we re-implement dynamic VSIDs */ - cmpdi r3,0 - bge slb_miss_user_pseries -#endif /* __DISABLED__ */ mfspr r12,SPRN_SRR1 #ifndef CONFIG_RELOCATABLE b slb_miss_realmode @@ -240,11 +235,6 @@ instruction_access_slb_pSeries: EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x480) std r3,PACA_EXSLB+EX_R3(r13) mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ -#ifdef __DISABLED__ - /* Keep that around for when we re-implement dynamic VSIDs */ - cmpdi r3,0 - bge slb_miss_user_pseries -#endif /* __DISABLED__ */ mfspr r12,SPRN_SRR1 #ifndef CONFIG_RELOCATABLE b slb_miss_realmode @@ -443,7 +433,7 @@ denorm_exception_hv: .align 7 /* moved from 0x200 */ -machine_check_pSeries_early: +machine_check_powernv_early: BEGIN_FTR_SECTION EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200) /* @@ -709,34 +699,6 @@ system_reset_fwnmi: #endif /* CONFIG_PPC_PSERIES */ -#ifdef __DISABLED__ -/* - * This is used for when the SLB miss handler has to go virtual, - * which doesn't happen for now anymore but will once we re-implement - * dynamic VSIDs for shared page tables - */ -slb_miss_user_pseries: - std r10,PACA_EXGEN+EX_R10(r13) - std r11,PACA_EXGEN+EX_R11(r13) - std r12,PACA_EXGEN+EX_R12(r13) - GET_SCRATCH0(r10) - ld r11,PACA_EXSLB+EX_R9(r13) - ld r12,PACA_EXSLB+EX_R3(r13) - std r10,PACA_EXGEN+EX_R13(r13) - std r11,PACA_EXGEN+EX_R9(r13) - std r12,PACA_EXGEN+EX_R3(r13) - clrrdi r12,r13,32 - mfmsr r10 - mfspr r11,SRR0 /* save SRR0 */ - ori r12,r12,slb_miss_user_common@l /* virt addr of handler */ - ori r10,r10,MSR_IR|MSR_DR|MSR_RI - mtspr SRR0,r12 - mfspr r12,SRR1 /* and SRR1 */ - mtspr SRR1,r10 - rfid - b . /* prevent spec. execution */ -#endif /* __DISABLED__ */ - #ifdef CONFIG_KVM_BOOK3S_64_HANDLER kvmppc_skip_interrupt: /* @@ -764,11 +726,10 @@ kvmppc_skip_Hinterrupt: #endif /* - * Code from here down to __end_handlers is invoked from the - * exception prologs above. Because the prologs assemble the - * addresses of these handlers using the LOAD_HANDLER macro, - * which uses an ori instruction, these handlers must be in - * the first 64k of the kernel image. + * Ensure that any handlers that get invoked from the exception prologs + * above are below the first 64KB (0x10000) of the kernel image because + * the prologs assemble the addresses of these handlers using the + * LOAD_HANDLER macro, which uses an ori instruction. */ /*** Common interrupt handlers ***/ @@ -978,7 +939,13 @@ data_access_common: ld r3,PACA_EXGEN+EX_DAR(r13) lwz r4,PACA_EXGEN+EX_DSISR(r13) li r5,0x300 + std r3,_DAR(r1) + std r4,_DSISR(r1) +BEGIN_MMU_FTR_SECTION b do_hash_page /* Try to handle as hpte fault */ +MMU_FTR_SECTION_ELSE + b handle_page_fault +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_RADIX) .align 7 .globl h_data_storage_common @@ -1003,73 +970,15 @@ instruction_access_common: ld r3,_NIP(r1) andis. r4,r12,0x5820 li r5,0x400 + std r3,_DAR(r1) + std r4,_DSISR(r1) +BEGIN_MMU_FTR_SECTION b do_hash_page /* Try to handle as hpte fault */ - - STD_EXCEPTION_COMMON(0xe20, h_instr_storage, unknown_exception) - -/* - * Here is the common SLB miss user that is used when going to virtual - * mode for SLB misses, that is currently not used - */ -#ifdef __DISABLED__ - .align 7 - .globl slb_miss_user_common -slb_miss_user_common: - mflr r10 - std r3,PACA_EXGEN+EX_DAR(r13) - stw r9,PACA_EXGEN+EX_CCR(r13) - std r10,PACA_EXGEN+EX_LR(r13) - std r11,PACA_EXGEN+EX_SRR0(r13) - bl slb_allocate_user - - ld r10,PACA_EXGEN+EX_LR(r13) - ld r3,PACA_EXGEN+EX_R3(r13) - lwz r9,PACA_EXGEN+EX_CCR(r13) - ld r11,PACA_EXGEN+EX_SRR0(r13) - mtlr r10 - beq- slb_miss_fault - - andi. r10,r12,MSR_RI /* check for unrecoverable exception */ - beq- unrecov_user_slb - mfmsr r10 - -.machine push -.machine "power4" - mtcrf 0x80,r9 -.machine pop - - clrrdi r10,r10,2 /* clear RI before setting SRR0/1 */ - mtmsrd r10,1 - - mtspr SRR0,r11 - mtspr SRR1,r12 - - ld r9,PACA_EXGEN+EX_R9(r13) - ld r10,PACA_EXGEN+EX_R10(r13) - ld r11,PACA_EXGEN+EX_R11(r13) - ld r12,PACA_EXGEN+EX_R12(r13) - ld r13,PACA_EXGEN+EX_R13(r13) - rfid - b . - -slb_miss_fault: - EXCEPTION_PROLOG_COMMON(0x380, PACA_EXGEN) - ld r4,PACA_EXGEN+EX_DAR(r13) - li r5,0 - std r4,_DAR(r1) - std r5,_DSISR(r1) +MMU_FTR_SECTION_ELSE b handle_page_fault +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_RADIX) -unrecov_user_slb: - EXCEPTION_PROLOG_COMMON(0x4200, PACA_EXGEN) - RECONCILE_IRQ_STATE(r10, r11) - bl save_nvgprs -1: addi r3,r1,STACK_FRAME_OVERHEAD - bl unrecoverable_exception - b 1b - -#endif /* __DISABLED__ */ - + STD_EXCEPTION_COMMON(0xe20, h_instr_storage, unknown_exception) /* * Machine check is different because we use a different @@ -1225,10 +1134,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception) STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception) - .align 7 - .globl __end_handlers -__end_handlers: - /* Equivalents to the above handlers for relocation-on interrupt vectors */ STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell) @@ -1482,8 +1387,11 @@ slb_miss_realmode: stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ +#ifdef CONFIG_PPC_STD_MMU_64 +BEGIN_MMU_FTR_SECTION bl slb_allocate_realmode - +END_MMU_FTR_SECTION_IFCLR(MMU_FTR_RADIX) +#endif /* All done -- return from exception. */ ld r10,PACA_EXSLB+EX_LR(r13) @@ -1491,9 +1399,12 @@ slb_miss_realmode: lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ mtlr r10 - andi. r10,r12,MSR_RI /* check for unrecoverable exception */ +BEGIN_MMU_FTR_SECTION beq- 2f +FTR_SECTION_ELSE + b 2f +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_RADIX) .machine push .machine "power4" @@ -1542,9 +1453,7 @@ power4_fixup_nap: */ .align 7 do_hash_page: - std r3,_DAR(r1) - std r4,_DSISR(r1) - +#ifdef CONFIG_PPC_STD_MMU_64 andis. r0,r4,0xa410 /* weird error? */ bne- handle_page_fault /* if not, try to insert a HPTE */ andis. r0,r4,DSISR_DABRMATCH@h @@ -1572,6 +1481,7 @@ do_hash_page: /* Error */ blt- 13f +#endif /* CONFIG_PPC_STD_MMU_64 */ /* Here we have a page fault that hash_page can't handle. */ handle_page_fault: @@ -1598,6 +1508,7 @@ handle_dabr_fault: 12: b ret_from_except_lite +#ifdef CONFIG_PPC_STD_MMU_64 /* We have a page fault that hash_page could handle but HV refused * the PTE insertion */ @@ -1607,6 +1518,7 @@ handle_dabr_fault: ld r4,_DAR(r1) bl low_hash_fault b ret_from_except +#endif /* * We come here as a result of a DSI at a point where we don't want diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index 9dac18dab..1123a4d8d 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -607,3 +607,13 @@ unsigned long __init arch_syscall_addr(int nr) return sys_call_table[nr*2]; } #endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_PPC64 */ + +#if defined(CONFIG_PPC64) && (!defined(_CALL_ELF) || _CALL_ELF != 2) +char *arch_ftrace_match_adjust(char *str, const char *search) +{ + if (str[0] == '.' && search[0] != '.') + return str + 1; + else + return str; +} +#endif /* defined(CONFIG_PPC64) && (!defined(_CALL_ELF) || _CALL_ELF != 2) */ diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 4286775cb..2d14774af 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -973,13 +973,16 @@ start_here_common: * This stuff goes at the beginning of the bss, which is page-aligned. */ .section ".bss" +/* + * pgd dir should be aligned to PGD_TABLE_SIZE which is 64K. + * We will need to find a better way to fix this + */ + .align 16 - .align PAGE_SHIFT + .globl swapper_pg_dir +swapper_pg_dir: + .space PGD_TABLE_SIZE .globl empty_zero_page empty_zero_page: .space PAGE_SIZE - - .globl swapper_pg_dir -swapper_pg_dir: - .space PGD_TABLE_SIZE diff --git a/arch/powerpc/kernel/ibmebus.c b/arch/powerpc/kernel/ibmebus.c index ac86c53e2..a89f4f7a6 100644 --- a/arch/powerpc/kernel/ibmebus.c +++ b/arch/powerpc/kernel/ibmebus.c @@ -408,7 +408,7 @@ static ssize_t modalias_show(struct device *dev, return len+1; } -struct device_attribute ibmebus_bus_device_attrs[] = { +static struct device_attribute ibmebus_bus_device_attrs[] = { __ATTR_RO(devspec), __ATTR_RO(name), __ATTR_RO(modalias), diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 290559df1..3cb46a3b1 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -66,6 +66,7 @@ #include <asm/udbg.h> #include <asm/smp.h> #include <asm/debug.h> +#include <asm/livepatch.h> #ifdef CONFIG_PPC64 #include <asm/paca.h> @@ -607,10 +608,12 @@ void irq_ctx_init(void) memset((void *)softirq_ctx[i], 0, THREAD_SIZE); tp = softirq_ctx[i]; tp->cpu = i; + klp_init_thread_info(tp); memset((void *)hardirq_ctx[i], 0, THREAD_SIZE); tp = hardirq_ctx[i]; tp->cpu = i; + klp_init_thread_info(tp); } } diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c index 0f1997097..ae1316106 100644 --- a/arch/powerpc/kernel/isa-bridge.c +++ b/arch/powerpc/kernel/isa-bridge.c @@ -109,14 +109,14 @@ static void pci_process_ISA_OF_ranges(struct device_node *isa_node, size = 0x10000; __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, - size, _PAGE_NO_CACHE|_PAGE_GUARDED); + size, pgprot_val(pgprot_noncached(__pgprot(0)))); return; inval_range: printk(KERN_ERR "no ISA IO ranges or unexpected isa range, " "mapping 64k\n"); __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, - 0x10000, _PAGE_NO_CACHE|_PAGE_GUARDED); + 0x10000, pgprot_val(pgprot_noncached(__pgprot(0)))); } diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c index 015ae55c1..2694d0787 100644 --- a/arch/powerpc/kernel/machine_kexec.c +++ b/arch/powerpc/kernel/machine_kexec.c @@ -228,17 +228,12 @@ static struct property memory_limit_prop = { static void __init export_crashk_values(struct device_node *node) { - struct property *prop; - /* There might be existing crash kernel properties, but we can't * be sure what's in them, so remove them. */ - prop = of_find_property(node, "linux,crashkernel-base", NULL); - if (prop) - of_remove_property(node, prop); - - prop = of_find_property(node, "linux,crashkernel-size", NULL); - if (prop) - of_remove_property(node, prop); + of_remove_property(node, of_find_property(node, + "linux,crashkernel-base", NULL)); + of_remove_property(node, of_find_property(node, + "linux,crashkernel-size", NULL)); if (crashk_res.start != 0) { crashk_base = cpu_to_be_ulong(crashk_res.start), @@ -258,16 +253,13 @@ static void __init export_crashk_values(struct device_node *node) static int __init kexec_setup(void) { struct device_node *node; - struct property *prop; node = of_find_node_by_path("/chosen"); if (!node) return -ENOENT; /* remove any stale properties so ours can be found */ - prop = of_find_property(node, kernel_end_prop.name, NULL); - if (prop) - of_remove_property(node, prop); + of_remove_property(node, of_find_property(node, kernel_end_prop.name, NULL)); /* information needed by userspace when using default_machine_kexec */ kernel_end = cpu_to_be_ulong(__pa(_end)); diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 0fbd75d18..b8c202d63 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -76,6 +76,7 @@ int default_machine_kexec_prepare(struct kimage *image) * end of the blocked region (begin >= high). Use the * boolean identity !(a || b) === (!a && !b). */ +#ifdef CONFIG_PPC_STD_MMU_64 if (htab_address) { low = __pa(htab_address); high = low + htab_size_bytes; @@ -88,6 +89,7 @@ int default_machine_kexec_prepare(struct kimage *image) return -ETXTBSY; } } +#endif /* CONFIG_PPC_STD_MMU_64 */ /* We also should not overwrite the tce tables */ for_each_node_by_type(node, "pci") { @@ -381,7 +383,7 @@ void default_machine_kexec(struct kimage *image) /* NOTREACHED */ } -#ifndef CONFIG_PPC_BOOK3E +#ifdef CONFIG_PPC_STD_MMU_64 /* Values we need to export to the second kernel via the device tree. */ static unsigned long htab_base; static unsigned long htab_size; @@ -401,7 +403,6 @@ static struct property htab_size_prop = { static int __init export_htab_values(void) { struct device_node *node; - struct property *prop; /* On machines with no htab htab_address is NULL */ if (!htab_address) @@ -412,12 +413,8 @@ static int __init export_htab_values(void) return -ENODEV; /* remove any stale propertys so ours can be found */ - prop = of_find_property(node, htab_base_prop.name, NULL); - if (prop) - of_remove_property(node, prop); - prop = of_find_property(node, htab_size_prop.name, NULL); - if (prop) - of_remove_property(node, prop); + of_remove_property(node, of_find_property(node, htab_base_prop.name, NULL)); + of_remove_property(node, of_find_property(node, htab_size_prop.name, NULL)); htab_base = cpu_to_be64(__pa(htab_address)); of_add_property(node, &htab_base_prop); @@ -428,4 +425,4 @@ static int __init export_htab_values(void) return 0; } late_initcall(export_htab_values); -#endif /* !CONFIG_PPC_BOOK3E */ +#endif /* CONFIG_PPC_STD_MMU_64 */ diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index b2eb4686b..ef267fd9d 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -37,7 +37,7 @@ static DEFINE_PER_CPU(int, mce_queue_count); static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); static void machine_check_process_queued_event(struct irq_work *work); -struct irq_work mce_event_process_work = { +static struct irq_work mce_event_process_work = { .func = machine_check_process_queued_event, }; @@ -284,7 +284,7 @@ void machine_check_print_event_info(struct machine_check_event *evt) printk("%s Effective address: %016llx\n", level, evt->u.ue_error.effective_address); if (evt->u.ue_error.physical_address_provided) - printk("%s Physial address: %016llx\n", + printk("%s Physical address: %016llx\n", level, evt->u.ue_error.physical_address); break; case MCE_ERROR_TYPE_SLB: diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index ee62b1975..7353991c4 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -72,11 +72,15 @@ void __flush_tlb_power8(unsigned int action) void __flush_tlb_power9(unsigned int action) { + if (radix_enabled()) + flush_tlb_206(POWER9_TLB_SETS_RADIX, action); + flush_tlb_206(POWER9_TLB_SETS_HASH, action); } /* flush SLBs and reload */ +#ifdef CONFIG_PPC_STD_MMU_64 static void flush_and_reload_slb(void) { struct slb_shadow *slb; @@ -110,6 +114,7 @@ static void flush_and_reload_slb(void) asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb)); } } +#endif static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits) { @@ -120,6 +125,7 @@ static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits) * reset the error bits whenever we handle them so that at the end * we can check whether we handled all of them or not. * */ +#ifdef CONFIG_PPC_STD_MMU_64 if (dsisr & slb_error_bits) { flush_and_reload_slb(); /* reset error bits */ @@ -131,6 +137,7 @@ static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits) /* reset error bits */ dsisr &= ~P7_DSISR_MC_TLB_MULTIHIT_MFTLB; } +#endif /* Any other errors we don't understand? */ if (dsisr & 0xffffffffUL) handled = 0; @@ -150,6 +157,7 @@ static long mce_handle_common_ierror(uint64_t srr1) switch (P7_SRR1_MC_IFETCH(srr1)) { case 0: break; +#ifdef CONFIG_PPC_STD_MMU_64 case P7_SRR1_MC_IFETCH_SLB_PARITY: case P7_SRR1_MC_IFETCH_SLB_MULTIHIT: /* flush and reload SLBs for SLB errors. */ @@ -162,6 +170,7 @@ static long mce_handle_common_ierror(uint64_t srr1) handled = 1; } break; +#endif default: break; } @@ -175,10 +184,12 @@ static long mce_handle_ierror_p7(uint64_t srr1) handled = mce_handle_common_ierror(srr1); +#ifdef CONFIG_PPC_STD_MMU_64 if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) { flush_and_reload_slb(); handled = 1; } +#endif return handled; } @@ -321,10 +332,12 @@ static long mce_handle_ierror_p8(uint64_t srr1) handled = mce_handle_common_ierror(srr1); +#ifdef CONFIG_PPC_STD_MMU_64 if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) { flush_and_reload_slb(); handled = 1; } +#endif return handled; } diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index bf5160fbf..285ca8c6c 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -599,12 +599,6 @@ _GLOBAL(__bswapdi2) mr r4,r10 blr -_GLOBAL(abs) - srawi r4,r3,31 - xor r3,r3,r4 - sub r3,r3,r4 - blr - #ifdef CONFIG_SMP _GLOBAL(start_secondary_resume) /* Reset stack */ diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 0cab9e8c3..856f9a794 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -15,8 +15,6 @@ * parsing code. */ -#include <linux/module.h> - #include <linux/types.h> #include <linux/errno.h> #include <linux/fs.h> @@ -1231,12 +1229,4 @@ static int __init nvram_init(void) return rc; } - -static void __exit nvram_cleanup(void) -{ - misc_deregister( &nvram_dev ); -} - -module_init(nvram_init); -module_exit(nvram_cleanup); -MODULE_LICENSE("GPL"); +device_initcall(nvram_init); diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c index 59c436189..2d71269e7 100644 --- a/arch/powerpc/kernel/pci-hotplug.c +++ b/arch/powerpc/kernel/pci-hotplug.c @@ -21,6 +21,35 @@ #include <asm/firmware.h> #include <asm/eeh.h> +static struct pci_bus *find_bus_among_children(struct pci_bus *bus, + struct device_node *dn) +{ + struct pci_bus *child = NULL; + struct pci_bus *tmp; + + if (pci_bus_to_OF_node(bus) == dn) + return bus; + + list_for_each_entry(tmp, &bus->children, node) { + child = find_bus_among_children(tmp, dn); + if (child) + break; + } + + return child; +} + +struct pci_bus *pci_find_bus_by_node(struct device_node *dn) +{ + struct pci_dn *pdn = PCI_DN(dn); + + if (!pdn || !pdn->phb || !pdn->phb->bus) + return NULL; + + return find_bus_among_children(pdn->phb->bus, dn); +} +EXPORT_SYMBOL_GPL(pci_find_bus_by_node); + /** * pcibios_release_device - release PCI device * @dev: PCI device @@ -38,20 +67,20 @@ void pcibios_release_device(struct pci_dev *dev) } /** - * pcibios_remove_pci_devices - remove all devices under this bus + * pci_hp_remove_devices - remove all devices under this bus * @bus: the indicated PCI bus * * Remove all of the PCI devices under this bus both from the * linux pci device tree, and from the powerpc EEH address cache. */ -void pcibios_remove_pci_devices(struct pci_bus *bus) +void pci_hp_remove_devices(struct pci_bus *bus) { struct pci_dev *dev, *tmp; struct pci_bus *child_bus; /* First go down child busses */ list_for_each_entry(child_bus, &bus->children, node) - pcibios_remove_pci_devices(child_bus); + pci_hp_remove_devices(child_bus); pr_debug("PCI: Removing devices on bus %04x:%02x\n", pci_domain_nr(bus), bus->number); @@ -60,11 +89,10 @@ void pcibios_remove_pci_devices(struct pci_bus *bus) pci_stop_and_remove_bus_device(dev); } } - -EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices); +EXPORT_SYMBOL_GPL(pci_hp_remove_devices); /** - * pcibios_add_pci_devices - adds new pci devices to bus + * pci_hp_add_devices - adds new pci devices to bus * @bus: the indicated PCI bus * * This routine will find and fixup new pci devices under @@ -74,7 +102,7 @@ EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices); * is how this routine differs from other, similar pcibios * routines.) */ -void pcibios_add_pci_devices(struct pci_bus * bus) +void pci_hp_add_devices(struct pci_bus *bus) { int slotno, mode, pass, max; struct pci_dev *dev; @@ -92,7 +120,8 @@ void pcibios_add_pci_devices(struct pci_bus * bus) if (mode == PCI_PROBE_DEVTREE) { /* use ofdt-based probe */ of_rescan_bus(dn, bus); - } else if (mode == PCI_PROBE_NORMAL) { + } else if (mode == PCI_PROBE_NORMAL && + dn->child && PCI_DN(dn->child)) { /* * Use legacy probe. In the partial hotplug case, we * probably have grandchildren devices unplugged. So @@ -114,4 +143,4 @@ void pcibios_add_pci_devices(struct pci_bus * bus) } pcibios_finish_adding_to_bus(bus); } -EXPORT_SYMBOL_GPL(pcibios_add_pci_devices); +EXPORT_SYMBOL_GPL(pci_hp_add_devices); diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 60bb187cb..a5ae49a2d 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -38,7 +38,7 @@ * ISA drivers use hard coded offsets. If no ISA bus exists nothing * is mapped on the first 64K of IO space */ -unsigned long pci_io_base = ISA_IO_BASE; +unsigned long pci_io_base; EXPORT_SYMBOL(pci_io_base); static int __init pcibios_init(void) @@ -159,7 +159,7 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose) /* Establish the mapping */ if (__ioremap_at(phys_page, area->addr, size_page, - _PAGE_NO_CACHE | _PAGE_GUARDED) == NULL) + pgprot_val(pgprot_noncached(__pgprot(0)))) == NULL) return -ENOMEM; /* Fixup hose IO resource */ diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index 38102cb9b..ecdccce78 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -282,13 +282,9 @@ void remove_dev_pci_data(struct pci_dev *pdev) #endif /* CONFIG_PCI_IOV */ } -/* - * Traverse_func that inits the PCI fields of the device node. - * NOTE: this *must* be done before read/write config to the device. - */ -void *update_dn_pci_info(struct device_node *dn, void *data) +struct pci_dn *pci_add_device_node_info(struct pci_controller *hose, + struct device_node *dn) { - struct pci_controller *phb = data; const __be32 *type = of_get_property(dn, "ibm,pci-config-space-type", NULL); const __be32 *regs; struct device_node *parent; @@ -299,7 +295,7 @@ void *update_dn_pci_info(struct device_node *dn, void *data) return NULL; dn->data = pdn; pdn->node = dn; - pdn->phb = phb; + pdn->phb = hose; #ifdef CONFIG_PPC_POWERNV pdn->pe_number = IODA_INVALID_PE; #endif @@ -331,8 +327,32 @@ void *update_dn_pci_info(struct device_node *dn, void *data) if (pdn->parent) list_add_tail(&pdn->list, &pdn->parent->child_list); - return NULL; + return pdn; } +EXPORT_SYMBOL_GPL(pci_add_device_node_info); + +void pci_remove_device_node_info(struct device_node *dn) +{ + struct pci_dn *pdn = dn ? PCI_DN(dn) : NULL; +#ifdef CONFIG_EEH + struct eeh_dev *edev = pdn_to_eeh_dev(pdn); + + if (edev) + edev->pdn = NULL; +#endif + + if (!pdn) + return; + + WARN_ON(!list_empty(&pdn->child_list)); + list_del(&pdn->list); + if (pdn->parent) + of_node_put(pdn->parent->node); + + dn->data = NULL; + kfree(pdn); +} +EXPORT_SYMBOL_GPL(pci_remove_device_node_info); /* * Traverse a device tree stopping each PCI device in the tree. @@ -352,8 +372,9 @@ void *update_dn_pci_info(struct device_node *dn, void *data) * one of these nodes we also assume its siblings are non-pci for * performance. */ -void *traverse_pci_devices(struct device_node *start, traverse_func pre, - void *data) +void *pci_traverse_device_nodes(struct device_node *start, + void *(*fn)(struct device_node *, void *), + void *data) { struct device_node *dn, *nextdn; void *ret; @@ -368,8 +389,11 @@ void *traverse_pci_devices(struct device_node *start, traverse_func pre, if (classp) class = of_read_number(classp, 1); - if (pre && ((ret = pre(dn, data)) != NULL)) - return ret; + if (fn) { + ret = fn(dn, data); + if (ret) + return ret; + } /* If we are a PCI bridge, go down */ if (dn->child && ((class >> 8) == PCI_CLASS_BRIDGE_PCI || @@ -391,6 +415,7 @@ void *traverse_pci_devices(struct device_node *start, traverse_func pre, } return NULL; } +EXPORT_SYMBOL_GPL(pci_traverse_device_nodes); static struct pci_dn *pci_dn_next_one(struct pci_dn *root, struct pci_dn *pdn) @@ -432,6 +457,18 @@ void *traverse_pci_dn(struct pci_dn *root, return NULL; } +static void *add_pdn(struct device_node *dn, void *data) +{ + struct pci_controller *hose = data; + struct pci_dn *pdn; + + pdn = pci_add_device_node_info(hose, dn); + if (!pdn) + return ERR_PTR(-ENOMEM); + + return NULL; +} + /** * pci_devs_phb_init_dynamic - setup pci devices under this PHB * phb: pci-to-host bridge (top-level bridge connecting to cpu) @@ -446,8 +483,7 @@ void pci_devs_phb_init_dynamic(struct pci_controller *phb) struct pci_dn *pdn; /* PHB nodes themselves must not match */ - update_dn_pci_info(dn, phb); - pdn = dn->data; + pdn = pci_add_device_node_info(phb, dn); if (pdn) { pdn->devfn = pdn->busno = -1; pdn->vendor_id = pdn->device_id = pdn->class_code = 0; @@ -456,7 +492,7 @@ void pci_devs_phb_init_dynamic(struct pci_controller *phb) } /* Update dn->phb ptrs for new phb and children devices */ - traverse_pci_devices(dn, update_dn_pci_info, phb); + pci_traverse_device_nodes(dn, add_pdn, phb); } /** diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index bec85055f..0b9389342 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -38,6 +38,7 @@ #include <linux/random.h> #include <linux/hw_breakpoint.h> #include <linux/uaccess.h> +#include <linux/elf-randomize.h> #include <asm/pgtable.h> #include <asm/io.h> @@ -55,6 +56,9 @@ #include <asm/firmware.h> #endif #include <asm/code-patching.h> +#include <asm/exec.h> +#include <asm/livepatch.h> + #include <linux/kprobes.h> #include <linux/kdebug.h> @@ -1075,7 +1079,7 @@ struct task_struct *__switch_to(struct task_struct *prev, } #endif /* CONFIG_PPC64 */ -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_STD_MMU_64 batch = this_cpu_ptr(&ppc64_tlb_batch); if (batch->active) { current_thread_info()->local_flags |= _TLF_LAZY_MMU; @@ -1083,7 +1087,7 @@ struct task_struct *__switch_to(struct task_struct *prev, __flush_tlb_pending(batch); batch->active = 0; } -#endif /* CONFIG_PPC_BOOK3S_64 */ +#endif /* CONFIG_PPC_STD_MMU_64 */ #ifdef CONFIG_PPC_ADV_DEBUG_REGS switch_booke_debug_regs(&new->thread.debug); @@ -1129,7 +1133,7 @@ struct task_struct *__switch_to(struct task_struct *prev, last = _switch(old_thread, new_thread); -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_STD_MMU_64 if (current_thread_info()->local_flags & _TLF_LAZY_MMU) { current_thread_info()->local_flags &= ~_TLF_LAZY_MMU; batch = this_cpu_ptr(&ppc64_tlb_batch); @@ -1138,8 +1142,7 @@ struct task_struct *__switch_to(struct task_struct *prev, if (current_thread_info()->task->thread.regs) restore_math(current_thread_info()->task->thread.regs); - -#endif /* CONFIG_PPC_BOOK3S_64 */ +#endif /* CONFIG_PPC_STD_MMU_64 */ return last; } @@ -1326,10 +1329,6 @@ void show_regs(struct pt_regs * regs) show_instructions(regs); } -void exit_thread(void) -{ -} - void flush_thread(void) { #ifdef CONFIG_HAVE_HW_BREAKPOINT @@ -1374,6 +1373,9 @@ static void setup_ksp_vsid(struct task_struct *p, unsigned long sp) unsigned long sp_vsid; unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp; + if (radix_enabled()) + return; + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) sp_vsid = get_kernel_vsid(sp, MMU_SEGSIZE_1T) << SLB_VSID_SHIFT_1T; @@ -1400,13 +1402,15 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, extern void ret_from_kernel_thread(void); void (*f)(void); unsigned long sp = (unsigned long)task_stack_page(p) + THREAD_SIZE; + struct thread_info *ti = task_thread_info(p); + + klp_init_thread_info(ti); /* Copy registers */ sp -= sizeof(struct pt_regs); childregs = (struct pt_regs *) sp; if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ - struct thread_info *ti = (void *)task_stack_page(p); memset(childregs, 0, sizeof(struct pt_regs)); childregs->gpr[1] = sp + sizeof(struct pt_regs); /* function */ @@ -1930,7 +1934,8 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) * the heap, we can put it above 1TB so it is backed by a 1TB * segment. Otherwise the heap will be in the bottom 1TB * which always uses 256MB segments and this may result in a - * performance penalty. + * performance penalty. We don't need to worry about radix. For + * radix, mmu_highuser_ssize remains unchanged from 256MB. */ if (!is_32bit_task() && (mmu_highuser_ssize == MMU_SEGSIZE_1T)) base = max_t(unsigned long, mm->brk, 1UL << SID_SHIFT_1T); diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index a15fe1d4e..946e34ffe 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -34,6 +34,7 @@ #include <linux/of.h> #include <linux/of_fdt.h> #include <linux/libfdt.h> +#include <linux/cpu.h> #include <asm/prom.h> #include <asm/rtas.h> @@ -167,6 +168,7 @@ static struct ibm_pa_feature { */ {CPU_FTR_TM_COMP, 0, 0, PPC_FEATURE2_HTM_COMP|PPC_FEATURE2_HTM_NOSC_COMP, 22, 0, 0}, + {0, MMU_FTR_RADIX, 0, 0, 40, 0, 0}, }; static void __init scan_features(unsigned long node, const unsigned char *ftrs, diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 30a03c03f..060b140f0 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -377,7 +377,7 @@ static int fpr_get(struct task_struct *target, const struct user_regset *regset, #else BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) != - offsetof(struct thread_fp_state, fpr[32][0])); + offsetof(struct thread_fp_state, fpr[32])); return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &target->thread.fp_state, 0, -1); @@ -405,7 +405,7 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset, return 0; #else BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) != - offsetof(struct thread_fp_state, fpr[32][0])); + offsetof(struct thread_fp_state, fpr[32])); return user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.fp_state, 0, -1); diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index aa610ce87..c638e2487 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -442,7 +442,7 @@ static void do_event_scan(void) } static void rtas_event_scan(struct work_struct *w); -DECLARE_DELAYED_WORK(event_scan_work, rtas_event_scan); +static DECLARE_DELAYED_WORK(event_scan_work, rtas_event_scan); /* * Delay should be at least one second since some machines have problems if diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 44c8d0355..8ca79b750 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -128,9 +128,7 @@ void machine_restart(char *cmd) machine_shutdown(); if (ppc_md.restart) ppc_md.restart(cmd); -#ifdef CONFIG_SMP smp_send_stop(); -#endif printk(KERN_EMERG "System Halted, OK to turn off power\n"); local_irq_disable(); while (1) ; @@ -141,9 +139,7 @@ void machine_power_off(void) machine_shutdown(); if (pm_power_off) pm_power_off(); -#ifdef CONFIG_SMP smp_send_stop(); -#endif printk(KERN_EMERG "System Halted, OK to turn off power\n"); local_irq_disable(); while (1) ; @@ -159,9 +155,7 @@ void machine_halt(void) machine_shutdown(); if (ppc_md.halt) ppc_md.halt(); -#ifdef CONFIG_SMP smp_send_stop(); -#endif printk(KERN_EMERG "System Halted, OK to turn off power\n"); local_irq_disable(); while (1) ; diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index f98be8383..96d4a2b23 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -69,6 +69,7 @@ #include <asm/kvm_ppc.h> #include <asm/hugetlb.h> #include <asm/epapr_hcalls.h> +#include <asm/livepatch.h> #ifdef DEBUG #define DBG(fmt...) udbg_printf(fmt) @@ -667,16 +668,16 @@ static void __init emergency_stack_init(void) limit = min(safe_stack_limit(), ppc64_rma_size); for_each_possible_cpu(i) { - unsigned long sp; - sp = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit); - sp += THREAD_SIZE; - paca[i].emergency_sp = __va(sp); + struct thread_info *ti; + ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit)); + klp_init_thread_info(ti); + paca[i].emergency_sp = (void *)ti + THREAD_SIZE; #ifdef CONFIG_PPC_BOOK3S_64 /* emergency stack for machine check exception handling. */ - sp = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit); - sp += THREAD_SIZE; - paca[i].mc_emergency_sp = __va(sp); + ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit)); + klp_init_thread_info(ti); + paca[i].mc_emergency_sp = (void *)ti + THREAD_SIZE; #endif } } @@ -700,6 +701,8 @@ void __init setup_arch(char **cmdline_p) if (ppc_md.panic) setup_panic(); + klp_init_thread_info(&init_thread_info); + init_mm.start_code = (unsigned long)_stext; init_mm.end_code = (unsigned long) _etext; init_mm.end_data = (unsigned long) _edata; diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 8cac1eb41..55c924b65 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -565,7 +565,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) smp_ops->give_timebase(); /* Wait until cpu puts itself in the online & active maps */ - while (!cpu_online(cpu) || !cpu_active(cpu)) + while (!cpu_online(cpu)) cpu_relax(); return 0; diff --git a/arch/powerpc/kernel/swsusp.c b/arch/powerpc/kernel/swsusp.c index 6669b1752..6ae9bd508 100644 --- a/arch/powerpc/kernel/swsusp.c +++ b/arch/powerpc/kernel/swsusp.c @@ -31,6 +31,6 @@ void save_processor_state(void) void restore_processor_state(void) { #ifdef CONFIG_PPC32 - switch_mmu_context(current->active_mm, current->active_mm); + switch_mmu_context(current->active_mm, current->active_mm, NULL); #endif } diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 81b0900a3..3ed9a5a21 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -55,6 +55,7 @@ #include <linux/delay.h> #include <linux/irq_work.h> #include <linux/clk-provider.h> +#include <linux/suspend.h> #include <asm/trace.h> #include <asm/io.h> diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index bf8f34a58..b7019b559 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -110,17 +110,11 @@ _GLOBAL(tm_reclaim) std r3, STK_PARAM(R3)(r1) SAVE_NVGPRS(r1) - /* We need to setup MSR for VSX register save instructions. Here we - * also clear the MSR RI since when we do the treclaim, we won't have a - * valid kernel pointer for a while. We clear RI here as it avoids - * adding another mtmsr closer to the treclaim. This makes the region - * maked as non-recoverable wider than it needs to be but it saves on - * inserting another mtmsrd later. - */ + /* We need to setup MSR for VSX register save instructions. */ mfmsr r14 mr r15, r14 ori r15, r15, MSR_FP - li r16, MSR_RI + li r16, 0 ori r16, r16, MSR_EE /* IRQs hard off */ andc r15, r15, r16 oris r15, r15, MSR_VEC@h @@ -176,7 +170,17 @@ dont_backup_fp: 1: tdeqi r6, 0 EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0 - /* The moment we treclaim, ALL of our GPRs will switch + /* Clear MSR RI since we are about to change r1, EE is already off. */ + li r4, 0 + mtmsrd r4, 1 + + /* + * BE CAREFUL HERE: + * At this point we can't take an SLB miss since we have MSR_RI + * off. Load only to/from the stack/paca which are in SLB bolted regions + * until we turn MSR RI back on. + * + * The moment we treclaim, ALL of our GPRs will switch * to user register state. (FPRs, CCR etc. also!) * Use an sprg and a tm_scratch in the PACA to shuffle. */ @@ -197,6 +201,11 @@ dont_backup_fp: /* Store the PPR in r11 and reset to decent value */ std r11, GPR11(r1) /* Temporary stash */ + + /* Reset MSR RI so we can take SLB faults again */ + li r11, MSR_RI + mtmsrd r11, 1 + mfspr r11, SPRN_PPR HMT_MEDIUM @@ -397,11 +406,6 @@ restore_gprs: ld r5, THREAD_TM_DSCR(r3) ld r6, THREAD_TM_PPR(r3) - /* Clear the MSR RI since we are about to change R1. EE is already off - */ - li r4, 0 - mtmsrd r4, 1 - REST_GPR(0, r7) /* GPR0 */ REST_2GPRS(2, r7) /* GPR2-3 */ REST_GPR(4, r7) /* GPR4 */ @@ -439,10 +443,33 @@ restore_gprs: ld r6, _CCR(r7) mtcr r6 - REST_GPR(1, r7) /* GPR1 */ - REST_GPR(5, r7) /* GPR5-7 */ REST_GPR(6, r7) - ld r7, GPR7(r7) + + /* + * Store r1 and r5 on the stack so that we can access them + * after we clear MSR RI. + */ + + REST_GPR(5, r7) + std r5, -8(r1) + ld r5, GPR1(r7) + std r5, -16(r1) + + REST_GPR(7, r7) + + /* Clear MSR RI since we are about to change r1. EE is already off */ + li r5, 0 + mtmsrd r5, 1 + + /* + * BE CAREFUL HERE: + * At this point we can't take an SLB miss since we have MSR_RI + * off. Load only to/from the stack/paca which are in SLB bolted regions + * until we turn MSR RI back on. + */ + + ld r5, -8(r1) + ld r1, -16(r1) /* Commit register state as checkpointed state: */ TRECHKPT diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index def1b8b5e..6767605ea 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -195,7 +195,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) * and end up putting it elsewhere. * Add enough to the size so that the result can be aligned. */ - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; vdso_base = get_unmapped_area(NULL, vdso_base, (vdso_pages << PAGE_SHIFT) + ((VDSO_ALIGNMENT - 1) & PAGE_MASK), diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index 5f8dcdaa2..8d7358f3a 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -87,7 +87,7 @@ struct vio_cmo_dev_entry { * @curr: bytes currently allocated * @high: high water mark for IO data usage */ -struct vio_cmo { +static struct vio_cmo { spinlock_t lock; struct delayed_work balance_q; struct list_head device_list; @@ -615,7 +615,7 @@ static u64 vio_dma_get_required_mask(struct device *dev) return dma_iommu_ops.get_required_mask(dev); } -struct dma_map_ops vio_dma_mapping_ops = { +static struct dma_map_ops vio_dma_mapping_ops = { .alloc = vio_dma_iommu_alloc_coherent, .free = vio_dma_iommu_free_coherent, .mmap = dma_direct_mmap_coherent, diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index b34220d2a..47018fcbf 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -54,6 +54,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "queue_intr", VCPU_STAT(queue_intr) }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll), }, { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), }, + { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "pf_storage", VCPU_STAT(pf_storage) }, { "sp_storage", VCPU_STAT(sp_storage) }, diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index c7b78d833..05f09ae82 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -447,7 +447,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, struct revmap_entry *rev; struct page *page, *pages[1]; long index, ret, npages; - unsigned long is_io; + bool is_ci; unsigned int writing, write_ok; struct vm_area_struct *vma; unsigned long rcbits; @@ -503,7 +503,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, smp_rmb(); ret = -EFAULT; - is_io = 0; + is_ci = false; pfn = 0; page = NULL; pte_size = PAGE_SIZE; @@ -521,7 +521,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, pfn = vma->vm_pgoff + ((hva - vma->vm_start) >> PAGE_SHIFT); pte_size = psize; - is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot)); + is_ci = pte_ci(__pte((pgprot_val(vma->vm_page_prot)))); write_ok = vma->vm_flags & VM_WRITE; } up_read(¤t->mm->mmap_sem); @@ -558,10 +558,9 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, goto out_put; /* Check WIMG vs. the actual page we're accessing */ - if (!hpte_cache_flags_ok(r, is_io)) { - if (is_io) + if (!hpte_cache_flags_ok(r, is_ci)) { + if (is_ci) goto out_put; - /* * Allow guest to map emulated device memory as * uncacheable, but actually make it cacheable. diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 93243554c..e20beae5c 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3272,6 +3272,12 @@ static int kvmppc_core_check_processor_compat_hv(void) if (!cpu_has_feature(CPU_FTR_HVMODE) || !cpu_has_feature(CPU_FTR_ARCH_206)) return -EIO; + /* + * Disable KVM for Power9, untill the required bits merged. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + return -EIO; + return 0; } diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 4cb8db05f..99b4e9d5d 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -175,7 +175,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, unsigned long g_ptel; struct kvm_memory_slot *memslot; unsigned hpage_shift; - unsigned long is_io; + bool is_ci; unsigned long *rmap; pte_t *ptep; unsigned int writing; @@ -199,7 +199,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, gfn = gpa >> PAGE_SHIFT; memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); pa = 0; - is_io = ~0ul; + is_ci = false; rmap = NULL; if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) { /* Emulated MMIO - mark this with key=31 */ @@ -250,7 +250,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, if (writing && !pte_write(pte)) /* make the actual HPTE be read-only */ ptel = hpte_make_readonly(ptel); - is_io = hpte_cache_bits(pte_val(pte)); + is_ci = pte_ci(pte); pa = pte_pfn(pte) << PAGE_SHIFT; pa |= hva & (host_pte_size - 1); pa |= gpa & ~PAGE_MASK; @@ -267,9 +267,9 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, else pteh |= HPTE_V_ABSENT; - /* Check WIMG */ - if (is_io != ~0ul && !hpte_cache_flags_ok(ptel, is_io)) { - if (is_io) + /*If we had host pte mapping then Check WIMG */ + if (ptep && !hpte_cache_flags_ok(ptel, is_ci)) { + if (is_ci) return H_PARAMETER; /* * Allow guest to map emulated device memory as diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 95bceca8f..8e4f64f0b 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -882,6 +882,24 @@ void kvmppc_set_fscr(struct kvm_vcpu *vcpu, u64 fscr) } #endif +static void kvmppc_setup_debug(struct kvm_vcpu *vcpu) +{ + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + u64 msr = kvmppc_get_msr(vcpu); + + kvmppc_set_msr(vcpu, msr | MSR_SE); + } +} + +static void kvmppc_clear_debug(struct kvm_vcpu *vcpu) +{ + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + u64 msr = kvmppc_get_msr(vcpu); + + kvmppc_set_msr(vcpu, msr & ~MSR_SE); + } +} + int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int exit_nr) { @@ -1207,10 +1225,18 @@ program_interrupt: break; #endif case BOOK3S_INTERRUPT_MACHINE_CHECK: - case BOOK3S_INTERRUPT_TRACE: kvmppc_book3s_queue_irqprio(vcpu, exit_nr); r = RESUME_GUEST; break; + case BOOK3S_INTERRUPT_TRACE: + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + run->exit_reason = KVM_EXIT_DEBUG; + r = RESUME_HOST; + } else { + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + r = RESUME_GUEST; + } + break; default: { ulong shadow_srr1 = vcpu->arch.shadow_srr1; @@ -1479,6 +1505,8 @@ static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) goto out; } + kvmppc_setup_debug(vcpu); + /* * Interrupts could be timers for the guest which we have to inject * again, so let's postpone them until we're in the guest and if we @@ -1501,6 +1529,8 @@ static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) ret = __kvmppc_vcpu_run(kvm_run, vcpu); + kvmppc_clear_debug(vcpu); + /* No need for kvm_guest_exit. It's done in handle_exit. We also get here with interrupts enabled. */ @@ -1683,7 +1713,11 @@ static void kvmppc_core_destroy_vm_pr(struct kvm *kvm) static int kvmppc_core_check_processor_compat_pr(void) { - /* we are always compatible */ + /* + * Disable KVM for Power9 untill the required bits merged. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + return -EIO; return 0; } diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 46871d554..a75ba38a2 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -92,7 +92,7 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level) * we are the only setter, thus concurrent access is undefined * to begin with. */ - if (level == 1 || level == KVM_INTERRUPT_SET_LEVEL) + if ((level == 1 && state->lsi) || level == KVM_INTERRUPT_SET_LEVEL) state->asserted = 1; else if (level == 0 || level == KVM_INTERRUPT_UNSET) { state->asserted = 0; @@ -280,7 +280,7 @@ static inline bool icp_try_update(struct kvmppc_icp *icp, if (!success) goto bail; - XICS_DBG("UPD [%04x] - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n", + XICS_DBG("UPD [%04lx] - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n", icp->server_num, old.cppr, old.mfrr, old.pending_pri, old.xisr, old.need_resend, old.out_ee); @@ -336,7 +336,7 @@ static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority, union kvmppc_icp_state old_state, new_state; bool success; - XICS_DBG("try deliver %#x(P:%#x) to server %#x\n", irq, priority, + XICS_DBG("try deliver %#x(P:%#x) to server %#lx\n", irq, priority, icp->server_num); do { @@ -1174,9 +1174,11 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr) prio = irqp->saved_priority; } val |= prio << KVM_XICS_PRIORITY_SHIFT; - if (irqp->asserted) - val |= KVM_XICS_LEVEL_SENSITIVE | KVM_XICS_PENDING; - else if (irqp->masked_pending || irqp->resend) + if (irqp->lsi) { + val |= KVM_XICS_LEVEL_SENSITIVE; + if (irqp->asserted) + val |= KVM_XICS_PENDING; + } else if (irqp->masked_pending || irqp->resend) val |= KVM_XICS_PENDING; ret = 0; } @@ -1228,9 +1230,13 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr) irqp->priority = prio; irqp->resend = 0; irqp->masked_pending = 0; + irqp->lsi = 0; irqp->asserted = 0; - if ((val & KVM_XICS_PENDING) && (val & KVM_XICS_LEVEL_SENSITIVE)) - irqp->asserted = 1; + if (val & KVM_XICS_LEVEL_SENSITIVE) { + irqp->lsi = 1; + if (val & KVM_XICS_PENDING) + irqp->asserted = 1; + } irqp->exists = 1; arch_spin_unlock(&ics->lock); local_irq_restore(flags); @@ -1249,11 +1255,10 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, return ics_deliver_irq(xics, irq, level); } -int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, - int irq_source_id, int level, bool line_status) +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *irq_entry, + struct kvm *kvm, int irq_source_id, + int level, bool line_status) { - if (!level) - return -1; return kvm_set_irq(kvm, irq_source_id, irq_entry->gsi, level, line_status); } diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h index 56ea44f98..a46b95405 100644 --- a/arch/powerpc/kvm/book3s_xics.h +++ b/arch/powerpc/kvm/book3s_xics.h @@ -39,6 +39,7 @@ struct ics_irq_state { u8 saved_priority; u8 resend; u8 masked_pending; + u8 lsi; /* level-sensitive interrupt */ u8 asserted; /* Only for LSI */ u8 exists; }; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 4d66f44a1..4afae6958 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -64,6 +64,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "ext_intr", VCPU_STAT(ext_intr_exits) }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, + { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "doorbell", VCPU_STAT(dbell_exits) }, { "guest doorbell", VCPU_STAT(gdbell_exits) }, diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 6a6873077..02416fea7 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -800,9 +800,9 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, } } -int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned int rt, unsigned int bytes, - int is_default_endian) +static int __kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int rt, unsigned int bytes, + int is_default_endian, int sign_extend) { int idx, ret; bool host_swabbed; @@ -827,7 +827,7 @@ int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->arch.mmio_host_swabbed = host_swabbed; vcpu->mmio_needed = 1; vcpu->mmio_is_write = 0; - vcpu->arch.mmio_sign_extend = 0; + vcpu->arch.mmio_sign_extend = sign_extend; idx = srcu_read_lock(&vcpu->kvm->srcu); @@ -844,6 +844,13 @@ int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, return EMULATE_DO_MMIO; } + +int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int rt, unsigned int bytes, + int is_default_endian) +{ + return __kvmppc_handle_load(run, vcpu, rt, bytes, is_default_endian, 0); +} EXPORT_SYMBOL_GPL(kvmppc_handle_load); /* Same as above, but sign extends */ @@ -851,12 +858,7 @@ int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int rt, unsigned int bytes, int is_default_endian) { - int r; - - vcpu->arch.mmio_sign_extend = 1; - r = kvmppc_handle_load(run, vcpu, rt, bytes, is_default_endian); - - return r; + return __kvmppc_handle_load(run, vcpu, rt, bytes, is_default_endian, 1); } int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S index c44df2dbe..99f37f241 100644 --- a/arch/powerpc/lib/copy_32.S +++ b/arch/powerpc/lib/copy_32.S @@ -217,7 +217,7 @@ _GLOBAL(memcpy) bdnz 40b 65: blr -_GLOBAL(generic_memcpy) +generic_memcpy: srwi. r7,r5,3 addi r6,r3,-4 addi r4,r4,-4 diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index dc885b30f..3362299b1 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -925,6 +925,7 @@ int __kprobes analyse_instr(struct instruction_op *op, struct pt_regs *regs, } } #endif + break; /* illegal instruction */ case 31: switch ((instr >> 1) & 0x3ff) { @@ -1818,9 +1819,11 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) case 4: __get_user_asmx(val, op.ea, err, "lwarx"); break; +#ifdef __powerpc64__ case 8: __get_user_asmx(val, op.ea, err, "ldarx"); break; +#endif default: return 0; } @@ -1841,9 +1844,11 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) case 4: __put_user_asmx(op.val, op.ea, err, "stwcx.", cr); break; +#ifdef __powerpc64__ case 8: __put_user_asmx(op.val, op.ea, err, "stdcx.", cr); break; +#endif default: return 0; } diff --git a/arch/powerpc/lib/xor_vmx.c b/arch/powerpc/lib/xor_vmx.c index 07f49f156..f9de69a04 100644 --- a/arch/powerpc/lib/xor_vmx.c +++ b/arch/powerpc/lib/xor_vmx.c @@ -17,7 +17,17 @@ * * Author: Anton Blanchard <anton@au.ibm.com> */ + +/* + * Sparse (as at v0.5.0) gets very, very confused by this file. + * Make it a bit simpler for it. + */ +#if !defined(__CHECKER__) #include <altivec.h> +#else +#define vec_xor(a, b) a ^ b +#define vector __attribute__((vector_size(16))) +#endif #include <linux/preempt.h> #include <linux/export.h> diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index adfee3f1a..f2cea6d5e 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -13,10 +13,11 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ tlb_nohash_low.o obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o -obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o slb_low.o slb.o $(hash64-y) -obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o -obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(CONFIG_WORD_SIZE).o \ - mmu_context_hash$(CONFIG_WORD_SIZE).o +obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o +obj-$(CONFIG_PPC_STD_MMU_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o +obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o +obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o +obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(CONFIG_WORD_SIZE).o ifeq ($(CONFIG_PPC_STD_MMU_64),y) obj-$(CONFIG_PPC_4K_PAGES) += hash64_4k.o obj-$(CONFIG_PPC_64K_PAGES) += hash64_64k.o @@ -33,6 +34,7 @@ obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-y += hugetlbpage.o ifeq ($(CONFIG_HUGETLB_PAGE),y) obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o +obj-$(CONFIG_PPC_RADIX_MMU) += hugetlbpage-radix.o obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o endif obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c index a1b2713f6..139dec421 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -135,7 +135,7 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys, TLBCAM[index].MAS7 = (u64)phys >> 32; /* Below is unlikely -- only for large user pages or similar */ - if (pte_user(flags)) { + if (pte_user(__pte(flags))) { TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR; TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0); } diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c index 47d1b26ef..6333b273d 100644 --- a/arch/powerpc/mm/hash64_4k.c +++ b/arch/powerpc/mm/hash64_4k.c @@ -34,21 +34,21 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, old_pte = pte_val(pte); /* If PTE busy, retry the access */ - if (unlikely(old_pte & _PAGE_BUSY)) + if (unlikely(old_pte & H_PAGE_BUSY)) return 0; /* If PTE permissions don't match, take page fault */ - if (unlikely(access & ~old_pte)) + if (unlikely(!check_pte_access(access, old_pte))) return 1; /* * Try to lock the PTE, add ACCESSED and DIRTY if it was * a write access. Since this is 4K insert of 64K page size - * also add _PAGE_COMBO + * also add H_PAGE_COMBO */ - new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_RW) + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) new_pte |= _PAGE_DIRTY; - } while (old_pte != __cmpxchg_u64((unsigned long *)ptep, - old_pte, new_pte)); + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + /* * PP bits. _PAGE_USER is already PP bit 0x2, so we only * need to add in 0x1 if it's a read-only user page @@ -60,22 +60,22 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); vpn = hpt_vpn(ea, vsid, ssize); - if (unlikely(old_pte & _PAGE_HASHPTE)) { + if (unlikely(old_pte & H_PAGE_HASHPTE)) { /* * There MIGHT be an HPTE for this pte */ hash = hpt_hash(vpn, shift, ssize); - if (old_pte & _PAGE_F_SECOND) + if (old_pte & H_PAGE_F_SECOND) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT; + slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT; if (ppc_md.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_4K, MMU_PAGE_4K, ssize, flags) == -1) old_pte &= ~_PAGE_HPTEFLAGS; } - if (likely(!(old_pte & _PAGE_HASHPTE))) { + if (likely(!(old_pte & H_PAGE_HASHPTE))) { pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; hash = hpt_hash(vpn, shift, ssize); @@ -115,9 +115,10 @@ repeat: MMU_PAGE_4K, MMU_PAGE_4K, old_pte); return -1; } - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; - new_pte |= (slot << _PAGE_F_GIX_SHIFT) & (_PAGE_F_SECOND | _PAGE_F_GIX); + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; + new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & + (H_PAGE_F_SECOND | H_PAGE_F_GIX); } - *ptep = __pte(new_pte & ~_PAGE_BUSY); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c index b2d659cf5..16644e1f4 100644 --- a/arch/powerpc/mm/hash64_64k.c +++ b/arch/powerpc/mm/hash64_64k.c @@ -23,7 +23,7 @@ bool __rpte_sub_valid(real_pte_t rpte, unsigned long index) unsigned long g_idx; unsigned long ptev = pte_val(rpte.pte); - g_idx = (ptev & _PAGE_COMBO_VALID) >> _PAGE_F_GIX_SHIFT; + g_idx = (ptev & H_PAGE_COMBO_VALID) >> H_PAGE_F_GIX_SHIFT; index = index >> 2; if (g_idx & (0x1 << index)) return true; @@ -37,12 +37,12 @@ static unsigned long mark_subptegroup_valid(unsigned long ptev, unsigned long in { unsigned long g_idx; - if (!(ptev & _PAGE_COMBO)) + if (!(ptev & H_PAGE_COMBO)) return ptev; index = index >> 2; g_idx = 0x1 << index; - return ptev | (g_idx << _PAGE_F_GIX_SHIFT); + return ptev | (g_idx << H_PAGE_F_GIX_SHIFT); } int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, @@ -66,21 +66,21 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, old_pte = pte_val(pte); /* If PTE busy, retry the access */ - if (unlikely(old_pte & _PAGE_BUSY)) + if (unlikely(old_pte & H_PAGE_BUSY)) return 0; /* If PTE permissions don't match, take page fault */ - if (unlikely(access & ~old_pte)) + if (unlikely(!check_pte_access(access, old_pte))) return 1; /* * Try to lock the PTE, add ACCESSED and DIRTY if it was * a write access. Since this is 4K insert of 64K page size - * also add _PAGE_COMBO + * also add H_PAGE_COMBO */ - new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED | _PAGE_COMBO; - if (access & _PAGE_RW) + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED | H_PAGE_COMBO; + if (access & _PAGE_WRITE) new_pte |= _PAGE_DIRTY; - } while (old_pte != __cmpxchg_u64((unsigned long *)ptep, - old_pte, new_pte)); + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + /* * Handle the subpage protection bits */ @@ -103,21 +103,21 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, /* *None of the sub 4k page is hashed */ - if (!(old_pte & _PAGE_HASHPTE)) + if (!(old_pte & H_PAGE_HASHPTE)) goto htab_insert_hpte; /* * Check if the pte was already inserted into the hash table * as a 64k HW page, and invalidate the 64k HPTE if so. */ - if (!(old_pte & _PAGE_COMBO)) { + if (!(old_pte & H_PAGE_COMBO)) { flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags); /* * clear the old slot details from the old and new pte. * On hash insert failure we use old pte value and we don't * want slot information there if we have a insert failure. */ - old_pte &= ~(_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND); - new_pte &= ~(_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND); + old_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND); + new_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND); goto htab_insert_hpte; } /* @@ -143,15 +143,15 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, if (ret == -1) goto htab_insert_hpte; - *ptep = __pte(new_pte & ~_PAGE_BUSY); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } htab_insert_hpte: /* - * handle _PAGE_4K_PFN case + * handle H_PAGE_4K_PFN case */ - if (old_pte & _PAGE_4K_PFN) { + if (old_pte & H_PAGE_4K_PFN) { /* * All the sub 4k page have the same * physical address. @@ -199,20 +199,20 @@ repeat: } /* * Insert slot number & secondary bit in PTE second half, - * clear _PAGE_BUSY and set appropriate HPTE slot bit - * Since we have _PAGE_BUSY set on ptep, we can be sure + * clear H_PAGE_BUSY and set appropriate HPTE slot bit + * Since we have H_PAGE_BUSY set on ptep, we can be sure * nobody is undating hidx. */ hidxp = (unsigned long *)(ptep + PTRS_PER_PTE); rpte.hidx &= ~(0xfUL << (subpg_index << 2)); *hidxp = rpte.hidx | (slot << (subpg_index << 2)); new_pte = mark_subptegroup_valid(new_pte, subpg_index); - new_pte |= _PAGE_HASHPTE; + new_pte |= H_PAGE_HASHPTE; /* * check __real_pte for details on matching smp_rmb() */ smp_wmb(); - *ptep = __pte(new_pte & ~_PAGE_BUSY); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } @@ -220,7 +220,6 @@ int __hash_page_64K(unsigned long ea, unsigned long access, unsigned long vsid, pte_t *ptep, unsigned long trap, unsigned long flags, int ssize) { - unsigned long hpte_group; unsigned long rflags, pa; unsigned long old_pte, new_pte; @@ -235,27 +234,26 @@ int __hash_page_64K(unsigned long ea, unsigned long access, old_pte = pte_val(pte); /* If PTE busy, retry the access */ - if (unlikely(old_pte & _PAGE_BUSY)) + if (unlikely(old_pte & H_PAGE_BUSY)) return 0; /* If PTE permissions don't match, take page fault */ - if (unlikely(access & ~old_pte)) + if (unlikely(!check_pte_access(access, old_pte))) return 1; /* * Check if PTE has the cache-inhibit bit set * If so, bail out and refault as a 4k page */ if (!mmu_has_feature(MMU_FTR_CI_LARGE_PAGE) && - unlikely(old_pte & _PAGE_NO_CACHE)) + unlikely(pte_ci(pte))) return 0; /* * Try to lock the PTE, add ACCESSED and DIRTY if it was * a write access. */ - new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_RW) + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) new_pte |= _PAGE_DIRTY; - } while (old_pte != __cmpxchg_u64((unsigned long *)ptep, - old_pte, new_pte)); + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); rflags = htab_convert_pte_flags(new_pte); @@ -264,22 +262,22 @@ int __hash_page_64K(unsigned long ea, unsigned long access, rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); vpn = hpt_vpn(ea, vsid, ssize); - if (unlikely(old_pte & _PAGE_HASHPTE)) { + if (unlikely(old_pte & H_PAGE_HASHPTE)) { /* * There MIGHT be an HPTE for this pte */ hash = hpt_hash(vpn, shift, ssize); - if (old_pte & _PAGE_F_SECOND) + if (old_pte & H_PAGE_F_SECOND) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT; + slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT; if (ppc_md.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_64K, MMU_PAGE_64K, ssize, flags) == -1) old_pte &= ~_PAGE_HPTEFLAGS; } - if (likely(!(old_pte & _PAGE_HASHPTE))) { + if (likely(!(old_pte & H_PAGE_HASHPTE))) { pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; hash = hpt_hash(vpn, shift, ssize); @@ -319,9 +317,10 @@ repeat: MMU_PAGE_64K, MMU_PAGE_64K, old_pte); return -1; } - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; - new_pte |= (slot << _PAGE_F_GIX_SHIFT) & (_PAGE_F_SECOND | _PAGE_F_GIX); + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; + new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & + (H_PAGE_F_SECOND | H_PAGE_F_GIX); } - *ptep = __pte(new_pte & ~_PAGE_BUSY); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 8eaac8134..f8a871a72 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -221,7 +221,7 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, return -1; hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; - hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; + hpte_r = hpte_encode_r(pa, psize, apsize, ssize) | rflags; if (!(vflags & HPTE_V_BOLTED)) { DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n", @@ -316,8 +316,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, DBG_LOW(" -> hit\n"); /* Update the HPTE */ hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & - ~(HPTE_R_PP | HPTE_R_N)) | - (newpp & (HPTE_R_PP | HPTE_R_N | + ~(HPTE_R_PPP | HPTE_R_N)) | + (newpp & (HPTE_R_PPP | HPTE_R_N | HPTE_R_C))); } native_unlock_hpte(hptep); @@ -385,8 +385,8 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, /* Update the HPTE */ hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & - ~(HPTE_R_PP | HPTE_R_N)) | - (newpp & (HPTE_R_PP | HPTE_R_N))); + ~(HPTE_R_PPP | HPTE_R_N)) | + (newpp & (HPTE_R_PPP | HPTE_R_N))); /* * Ensure it is out of the tlb too. Bolted entries base and * actual page size will be same. @@ -550,7 +550,11 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot, } } /* This works for all page sizes, and for 256M and 1T segments */ - *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT; + if (cpu_has_feature(CPU_FTR_ARCH_300)) + *ssize = hpte_r >> HPTE_R_3_0_SSIZE_SHIFT; + else + *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT; + shift = mmu_psize_defs[size].shift; avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm); @@ -719,6 +723,12 @@ static void native_flush_hash_range(unsigned long number, int local) local_irq_restore(flags); } +static int native_update_partition_table(u64 patb1) +{ + partition_tb->patb1 = cpu_to_be64(patb1); + return 0; +} + void __init hpte_init_native(void) { ppc_md.hpte_invalidate = native_hpte_invalidate; @@ -729,4 +739,7 @@ void __init hpte_init_native(void) ppc_md.hpte_clear_all = native_hpte_clear; ppc_md.flush_hash_range = native_flush_hash_range; ppc_md.hugepage_invalidate = native_hugepage_invalidate; + + if (cpu_has_feature(CPU_FTR_ARCH_300)) + ppc_md.update_partition_table = native_update_partition_table; } diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index f4acba25f..2971ea18c 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -180,36 +180,47 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags) if ((pteflags & _PAGE_EXEC) == 0) rflags |= HPTE_R_N; /* - * PP bits: + * PPP bits: * Linux uses slb key 0 for kernel and 1 for user. - * kernel areas are mapped with PP=00 - * and there is no kernel RO (_PAGE_KERNEL_RO). - * User area is mapped with PP=0x2 for read/write - * or PP=0x3 for read-only (including writeable but clean pages). + * kernel RW areas are mapped with PPP=0b000 + * User area is mapped with PPP=0b010 for read/write + * or PPP=0b011 for read-only (including writeable but clean pages). */ - if (pteflags & _PAGE_USER) { - rflags |= 0x2; - if (!((pteflags & _PAGE_RW) && (pteflags & _PAGE_DIRTY))) + if (pteflags & _PAGE_PRIVILEGED) { + /* + * Kernel read only mapped with ppp bits 0b110 + */ + if (!(pteflags & _PAGE_WRITE)) + rflags |= (HPTE_R_PP0 | 0x2); + } else { + if (pteflags & _PAGE_RWX) + rflags |= 0x2; + if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY))) rflags |= 0x1; } /* * We can't allow hardware to update hpte bits. Hence always * set 'R' bit and set 'C' if it is a write fault - * Memory coherence is always enabled */ - rflags |= HPTE_R_R | HPTE_R_M; + rflags |= HPTE_R_R; if (pteflags & _PAGE_DIRTY) rflags |= HPTE_R_C; /* * Add in WIG bits */ - if (pteflags & _PAGE_WRITETHRU) - rflags |= HPTE_R_W; - if (pteflags & _PAGE_NO_CACHE) + + if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) rflags |= HPTE_R_I; - if (pteflags & _PAGE_GUARDED) - rflags |= HPTE_R_G; + else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT) + rflags |= (HPTE_R_I | HPTE_R_G); + else if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO) + rflags |= (HPTE_R_W | HPTE_R_I | HPTE_R_M); + else + /* + * Add memory coherence if cache inhibited is not set + */ + rflags |= HPTE_R_M; return rflags; } @@ -687,6 +698,41 @@ int remove_section_mapping(unsigned long start, unsigned long end) } #endif /* CONFIG_MEMORY_HOTPLUG */ +static void __init hash_init_partition_table(phys_addr_t hash_table, + unsigned long pteg_count) +{ + unsigned long ps_field; + unsigned long htab_size; + unsigned long patb_size = 1UL << PATB_SIZE_SHIFT; + + /* + * slb llp encoding for the page size used in VPM real mode. + * We can ignore that for lpid 0 + */ + ps_field = 0; + htab_size = __ilog2(pteg_count) - 11; + + BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large."); + partition_tb = __va(memblock_alloc_base(patb_size, patb_size, + MEMBLOCK_ALLOC_ANYWHERE)); + + /* Initialize the Partition Table with no entries */ + memset((void *)partition_tb, 0, patb_size); + partition_tb->patb0 = cpu_to_be64(ps_field | hash_table | htab_size); + /* + * FIXME!! This should be done via update_partition table + * For now UPRT is 0 for us. + */ + partition_tb->patb1 = 0; + DBG("Partition table %p\n", partition_tb); + /* + * update partition table control register, + * 64 K size. + */ + mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + +} + static void __init htab_initialize(void) { unsigned long table; @@ -755,8 +801,11 @@ static void __init htab_initialize(void) /* Initialize the HPT with no entries */ memset((void *)table, 0, htab_size_bytes); - /* Set SDR1 */ - mtspr(SPRN_SDR1, _SDR1); + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + /* Set SDR1 */ + mtspr(SPRN_SDR1, _SDR1); + else + hash_init_partition_table(table, pteg_count); } prot = pgprot_val(PAGE_KERNEL); @@ -841,8 +890,42 @@ static void __init htab_initialize(void) #undef KB #undef MB -void __init early_init_mmu(void) +void __init hash__early_init_mmu(void) { + /* + * initialize page table size + */ + __pte_frag_nr = H_PTE_FRAG_NR; + __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT; + + __pte_index_size = H_PTE_INDEX_SIZE; + __pmd_index_size = H_PMD_INDEX_SIZE; + __pud_index_size = H_PUD_INDEX_SIZE; + __pgd_index_size = H_PGD_INDEX_SIZE; + __pmd_cache_index = H_PMD_CACHE_INDEX; + __pte_table_size = H_PTE_TABLE_SIZE; + __pmd_table_size = H_PMD_TABLE_SIZE; + __pud_table_size = H_PUD_TABLE_SIZE; + __pgd_table_size = H_PGD_TABLE_SIZE; + /* + * 4k use hugepd format, so for hash set then to + * zero + */ + __pmd_val_bits = 0; + __pud_val_bits = 0; + __pgd_val_bits = 0; + + __kernel_virt_start = H_KERN_VIRT_START; + __kernel_virt_size = H_KERN_VIRT_SIZE; + __vmalloc_start = H_VMALLOC_START; + __vmalloc_end = H_VMALLOC_END; + vmemmap = (struct page *)H_VMEMMAP_BASE; + ioremap_bot = IOREMAP_BASE; + +#ifdef CONFIG_PCI + pci_io_base = ISA_IO_BASE; +#endif + /* Initialize the MMU Hash table and create the linear mapping * of memory. Has to be done before SLB initialization as this is * currently where the page size encoding is obtained. @@ -854,12 +937,16 @@ void __init early_init_mmu(void) } #ifdef CONFIG_SMP -void early_init_mmu_secondary(void) +void hash__early_init_mmu_secondary(void) { /* Initialize hash table for that CPU */ - if (!firmware_has_feature(FW_FEATURE_LPAR)) - mtspr(SPRN_SDR1, _SDR1); - + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + mtspr(SPRN_SDR1, _SDR1); + else + mtspr(SPRN_PTCR, + __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + } /* Initialize SLB */ slb_initialize(); } @@ -938,7 +1025,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr) * Userspace sets the subpage permissions using the subpage_prot system call. * * Result is 0: full permissions, _PAGE_RW: read-only, - * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access. + * _PAGE_RWX: no access. */ static int subpage_protection(struct mm_struct *mm, unsigned long ea) { @@ -964,8 +1051,13 @@ static int subpage_protection(struct mm_struct *mm, unsigned long ea) /* extract 2-bit bitfield for this 4k subpage */ spp >>= 30 - 2 * ((ea >> 12) & 0xf); - /* turn 0,1,2,3 into combination of _PAGE_USER and _PAGE_RW */ - spp = ((spp & 2) ? _PAGE_USER : 0) | ((spp & 1) ? _PAGE_RW : 0); + /* + * 0 -> full premission + * 1 -> Read only + * 2 -> no access. + * We return the flag that need to be cleared. + */ + spp = ((spp & 2) ? _PAGE_RWX : 0) | ((spp & 1) ? _PAGE_WRITE : 0); return spp; } @@ -1102,7 +1194,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, /* Pre-check access permissions (will be re-checked atomically * in __hash_page_XX but this pre-check is a fast path */ - if (access & ~pte_val(*ptep)) { + if (!check_pte_access(access, pte_val(*ptep))) { DBG_LOW(" no access !\n"); rc = 1; goto bail; @@ -1140,8 +1232,8 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, #endif /* Do actual hashing */ #ifdef CONFIG_PPC_64K_PAGES - /* If _PAGE_4K_PFN is set, make sure this is a 4k segment */ - if ((pte_val(*ptep) & _PAGE_4K_PFN) && psize == MMU_PAGE_64K) { + /* If H_PAGE_4K_PFN is set, make sure this is a 4k segment */ + if ((pte_val(*ptep) & H_PAGE_4K_PFN) && psize == MMU_PAGE_64K) { demote_segment_4k(mm, ea); psize = MMU_PAGE_4K; } @@ -1149,8 +1241,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, /* If this PTE is non-cacheable and we have restrictions on * using non cacheable large pages, then we switch to 4k */ - if (mmu_ci_restrictions && psize == MMU_PAGE_64K && - (pte_val(*ptep) & _PAGE_NO_CACHE)) { + if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) { if (user_region) { demote_segment_4k(mm, ea); psize = MMU_PAGE_4K; @@ -1227,7 +1318,7 @@ EXPORT_SYMBOL_GPL(hash_page); int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, unsigned long dsisr) { - unsigned long access = _PAGE_PRESENT; + unsigned long access = _PAGE_PRESENT | _PAGE_READ; unsigned long flags = 0; struct mm_struct *mm = current->mm; @@ -1238,14 +1329,18 @@ int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, flags |= HPTE_NOHPTE_UPDATE; if (dsisr & DSISR_ISSTORE) - access |= _PAGE_RW; + access |= _PAGE_WRITE; /* - * We need to set the _PAGE_USER bit if MSR_PR is set or if we are - * accessing a userspace segment (even from the kernel). We assume - * kernel addresses always have the high bit set. + * We set _PAGE_PRIVILEGED only when + * kernel mode access kernel space. + * + * _PAGE_PRIVILEGED is NOT set + * 1) when kernel mode access user space + * 2) user space access kernel space. */ + access |= _PAGE_PRIVILEGED; if ((msr & MSR_PR) || (REGION_ID(ea) == USER_REGION_ID)) - access |= _PAGE_USER; + access &= ~_PAGE_PRIVILEGED; if (trap == 0x400) access |= _PAGE_EXEC; @@ -1253,6 +1348,30 @@ int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap, return hash_page_mm(mm, ea, access, trap, flags); } +#ifdef CONFIG_PPC_MM_SLICES +static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) +{ + int psize = get_slice_psize(mm, ea); + + /* We only prefault standard pages for now */ + if (unlikely(psize != mm->context.user_psize)) + return false; + + /* + * Don't prefault if subpage protection is enabled for the EA. + */ + if (unlikely((psize == MMU_PAGE_4K) && subpage_protection(mm, ea))) + return false; + + return true; +} +#else +static bool should_hash_preload(struct mm_struct *mm, unsigned long ea) +{ + return true; +} +#endif + void hash_preload(struct mm_struct *mm, unsigned long ea, unsigned long access, unsigned long trap) { @@ -1265,11 +1384,8 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, BUG_ON(REGION_ID(ea) != USER_REGION_ID); -#ifdef CONFIG_PPC_MM_SLICES - /* We only prefault standard pages for now */ - if (unlikely(get_slice_psize(mm, ea) != mm->context.user_psize)) + if (!should_hash_preload(mm, ea)) return; -#endif DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx," " trap=%lx\n", mm, mm->pgd, ea, access, trap); @@ -1300,13 +1416,13 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, WARN_ON(hugepage_shift); #ifdef CONFIG_PPC_64K_PAGES - /* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on + /* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on * a 64K kernel), then we don't preload, hash_page() will take * care of it once we actually try to access the page. * That way we don't have to duplicate all of the logic for segment * page size demotion here */ - if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE)) + if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep)) goto out_exit; #endif /* CONFIG_PPC_64K_PAGES */ @@ -1588,7 +1704,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) } #endif /* CONFIG_DEBUG_PAGEALLOC */ -void setup_initial_memory_limit(phys_addr_t first_memblock_base, +void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, phys_addr_t first_memblock_size) { /* We don't currently support the first MEMBLOCK not mapping 0 diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c index eb2accdd7..ba3fc2294 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -37,20 +37,20 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, old_pmd = pmd_val(pmd); /* If PMD busy, retry the access */ - if (unlikely(old_pmd & _PAGE_BUSY)) + if (unlikely(old_pmd & H_PAGE_BUSY)) return 0; /* If PMD permissions don't match, take page fault */ - if (unlikely(access & ~old_pmd)) + if (unlikely(!check_pte_access(access, old_pmd))) return 1; /* * Try to lock the PTE, add ACCESSED and DIRTY if it was * a write access */ - new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_RW) + new_pmd = old_pmd | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) new_pmd |= _PAGE_DIRTY; - } while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp, - old_pmd, new_pmd)); + } while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd))); + rflags = htab_convert_pte_flags(new_pmd); #if 0 @@ -78,7 +78,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, * base page size. This is because demote_segment won't flush * hash page table entries. */ - if ((old_pmd & _PAGE_HASHPTE) && !(old_pmd & _PAGE_COMBO)) { + if ((old_pmd & H_PAGE_HASHPTE) && !(old_pmd & H_PAGE_COMBO)) { flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K, ssize, flags); /* @@ -125,7 +125,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, hash = hpt_hash(vpn, shift, ssize); /* insert new entry */ pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT; - new_pmd |= _PAGE_HASHPTE; + new_pmd |= H_PAGE_HASHPTE; repeat: hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; @@ -169,17 +169,17 @@ repeat: mark_hpte_slot_valid(hpte_slot_array, index, slot); } /* - * Mark the pte with _PAGE_COMBO, if we are trying to hash it with + * Mark the pte with H_PAGE_COMBO, if we are trying to hash it with * base page size 4k. */ if (psize == MMU_PAGE_4K) - new_pmd |= _PAGE_COMBO; + new_pmd |= H_PAGE_COMBO; /* * The hpte valid is stored in the pgtable whose address is in the * second half of the PMD. Order this against clearing of the busy bit in * huge pmd. */ smp_wmb(); - *pmdp = __pmd(new_pmd & ~_PAGE_BUSY); + *pmdp = __pmd(new_pmd & ~H_PAGE_BUSY); return 0; } diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c index 8555fce90..3058560b6 100644 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -47,18 +47,19 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, do { old_pte = pte_val(*ptep); /* If PTE busy, retry the access */ - if (unlikely(old_pte & _PAGE_BUSY)) + if (unlikely(old_pte & H_PAGE_BUSY)) return 0; /* If PTE permissions don't match, take page fault */ - if (unlikely(access & ~old_pte)) + if (unlikely(!check_pte_access(access, old_pte))) return 1; + /* Try to lock the PTE, add ACCESSED and DIRTY if it was * a write access */ - new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; - if (access & _PAGE_RW) + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_WRITE) new_pte |= _PAGE_DIRTY; - } while(old_pte != __cmpxchg_u64((unsigned long *)ptep, - old_pte, new_pte)); + } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); + rflags = htab_convert_pte_flags(new_pte); sz = ((1UL) << shift); @@ -68,28 +69,28 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); /* Check if pte already has an hpte (case 2) */ - if (unlikely(old_pte & _PAGE_HASHPTE)) { + if (unlikely(old_pte & H_PAGE_HASHPTE)) { /* There MIGHT be an HPTE for this pte */ unsigned long hash, slot; hash = hpt_hash(vpn, shift, ssize); - if (old_pte & _PAGE_F_SECOND) + if (old_pte & H_PAGE_F_SECOND) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT; + slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT; if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize, mmu_psize, ssize, flags) == -1) old_pte &= ~_PAGE_HPTEFLAGS; } - if (likely(!(old_pte & _PAGE_HASHPTE))) { + if (likely(!(old_pte & H_PAGE_HASHPTE))) { unsigned long hash = hpt_hash(vpn, shift, ssize); pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; /* clear HPTE slot informations in new PTE */ - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0, mmu_psize, ssize); @@ -105,14 +106,14 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, return -1; } - new_pte |= (slot << _PAGE_F_GIX_SHIFT) & - (_PAGE_F_SECOND | _PAGE_F_GIX); + new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & + (H_PAGE_F_SECOND | H_PAGE_F_GIX); } /* * No need to use ldarx/stdcx here */ - *ptep = __pte(new_pte & ~_PAGE_BUSY); + *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c new file mode 100644 index 000000000..1e11559e1 --- /dev/null +++ b/arch/powerpc/mm/hugetlbpage-radix.c @@ -0,0 +1,87 @@ +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/cacheflush.h> +#include <asm/machdep.h> +#include <asm/mman.h> + +void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + unsigned long ap, shift; + struct hstate *hstate = hstate_file(vma->vm_file); + + shift = huge_page_shift(hstate); + if (shift == mmu_psize_defs[MMU_PAGE_2M].shift) + ap = mmu_get_ap(MMU_PAGE_2M); + else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift) + ap = mmu_get_ap(MMU_PAGE_1G); + else { + WARN(1, "Wrong huge page shift\n"); + return ; + } + radix___flush_tlb_page(vma->vm_mm, vmaddr, ap, 0); +} + +void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + unsigned long ap, shift; + struct hstate *hstate = hstate_file(vma->vm_file); + + shift = huge_page_shift(hstate); + if (shift == mmu_psize_defs[MMU_PAGE_2M].shift) + ap = mmu_get_ap(MMU_PAGE_2M); + else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift) + ap = mmu_get_ap(MMU_PAGE_1G); + else { + WARN(1, "Wrong huge page shift\n"); + return ; + } + radix___local_flush_tlb_page(vma->vm_mm, vmaddr, ap, 0); +} + +/* + * A vairant of hugetlb_get_unmapped_area doing topdown search + * FIXME!! should we do as x86 does or non hugetlb area does ? + * ie, use topdown or not based on mmap_is_legacy check ? + */ +unsigned long +radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; + + if (len & ~huge_page_mask(h)) + return -EINVAL; + if (len > TASK_SIZE) + return -ENOMEM; + + if (flags & MAP_FIXED) { + if (prepare_hugepage_range(file, addr, len)) + return -EINVAL; + return addr; + } + + if (addr) { + addr = ALIGN(addr, huge_page_size(h)); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + /* + * We are always doing an topdown search here. Slice code + * does that too. + */ + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = current->mm->mmap_base; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + return vm_unmapped_area(&info); +} diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index d991b9e80..119d18611 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -73,7 +73,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, cachep = PGT_CACHE(pdshift - pshift); #endif - new = kmem_cache_zalloc(cachep, GFP_KERNEL|__GFP_REPEAT); + new = kmem_cache_zalloc(cachep, GFP_KERNEL); BUG_ON(pshift > HUGEPD_SHIFT_MASK); BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); @@ -711,6 +711,9 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct hstate *hstate = hstate_file(file); int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); + if (radix_enabled()) + return radix__hugetlb_get_unmapped_area(file, addr, len, + pgoff, flags); return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1); } #endif @@ -719,14 +722,14 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { #ifdef CONFIG_PPC_MM_SLICES unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); - - return 1UL << mmu_psize_to_shift(psize); -#else + /* With radix we don't use slice, so derive it from vma*/ + if (!radix_enabled()) + return 1UL << mmu_psize_to_shift(psize); +#endif if (!is_vm_hugetlb_page(vma)) return PAGE_SIZE; return huge_page_size(hstate_vma(vma)); -#endif } static inline bool is_power_of_4(unsigned long x) @@ -772,8 +775,10 @@ static int __init hugepage_setup_sz(char *str) size = memparse(str, &str); - if (add_huge_page_size(size) != 0) - printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size); + if (add_huge_page_size(size) != 0) { + hugetlb_bad_size(); + pr_err("Invalid huge page size specified(%llu)\n", size); + } return 1; } @@ -823,7 +828,7 @@ static int __init hugetlbpage_init(void) { int psize; - if (!mmu_has_feature(MMU_FTR_16M_PAGE)) + if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE)) return -ENODEV; for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { @@ -863,6 +868,9 @@ static int __init hugetlbpage_init(void) HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift; else if (mmu_psize_defs[MMU_PAGE_1M].shift) HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift; + else if (mmu_psize_defs[MMU_PAGE_2M].shift) + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift; + return 0; } @@ -1003,9 +1011,9 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, end = pte_end; pte = READ_ONCE(*ptep); - mask = _PAGE_PRESENT | _PAGE_USER; + mask = _PAGE_PRESENT | _PAGE_READ; if (write) - mask |= _PAGE_RW; + mask |= _PAGE_WRITE; if ((pte_val(pte) & mask) != mask) return 0; diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index ba6556661..33709bdb0 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -66,11 +66,11 @@ #include "mmu_decl.h" #ifdef CONFIG_PPC_STD_MMU_64 -#if PGTABLE_RANGE > USER_VSID_RANGE +#if H_PGTABLE_RANGE > USER_VSID_RANGE #warning Limited user VSID range means pagetable space is wasted #endif -#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) +#if (TASK_SIZE_USER64 < H_PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) #warning TASK_SIZE is smaller than it needs to be. #endif #endif /* CONFIG_PPC_STD_MMU_64 */ @@ -189,75 +189,6 @@ static int __meminit vmemmap_populated(unsigned long start, int page_size) return 0; } -/* On hash-based CPUs, the vmemmap is bolted in the hash table. - * - * On Book3E CPUs, the vmemmap is currently mapped in the top half of - * the vmalloc space using normal page tables, though the size of - * pages encoded in the PTEs can be different - */ - -#ifdef CONFIG_PPC_BOOK3E -static int __meminit vmemmap_create_mapping(unsigned long start, - unsigned long page_size, - unsigned long phys) -{ - /* Create a PTE encoding without page size */ - unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED | - _PAGE_KERNEL_RW; - - /* PTEs only contain page size encodings up to 32M */ - BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf); - - /* Encode the size in the PTE */ - flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8; - - /* For each PTE for that area, map things. Note that we don't - * increment phys because all PTEs are of the large size and - * thus must have the low bits clear - */ - for (i = 0; i < page_size; i += PAGE_SIZE) - BUG_ON(map_kernel_page(start + i, phys, flags)); - - return 0; -} - -#ifdef CONFIG_MEMORY_HOTPLUG -static void vmemmap_remove_mapping(unsigned long start, - unsigned long page_size) -{ -} -#endif -#else /* CONFIG_PPC_BOOK3E */ -static int __meminit vmemmap_create_mapping(unsigned long start, - unsigned long page_size, - unsigned long phys) -{ - int rc = htab_bolt_mapping(start, start + page_size, phys, - pgprot_val(PAGE_KERNEL), - mmu_vmemmap_psize, mmu_kernel_ssize); - if (rc < 0) { - int rc2 = htab_remove_mapping(start, start + page_size, - mmu_vmemmap_psize, - mmu_kernel_ssize); - BUG_ON(rc2 && (rc2 != -ENOENT)); - } - return rc; -} - -#ifdef CONFIG_MEMORY_HOTPLUG -static void vmemmap_remove_mapping(unsigned long start, - unsigned long page_size) -{ - int rc = htab_remove_mapping(start, start + page_size, - mmu_vmemmap_psize, - mmu_kernel_ssize); - BUG_ON((rc < 0) && (rc != -ENOENT)); - WARN_ON(rc == -ENOENT); -} -#endif - -#endif /* CONFIG_PPC_BOOK3E */ - struct vmemmap_backing *vmemmap_list; static struct vmemmap_backing *next; static int num_left; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index ac79dbde1..2fd57fa48 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -68,12 +68,15 @@ pte_t *kmap_pte; EXPORT_SYMBOL(kmap_pte); pgprot_t kmap_prot; EXPORT_SYMBOL(kmap_prot); +#define TOP_ZONE ZONE_HIGHMEM static inline pte_t *virt_to_kpte(unsigned long vaddr) { return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), vaddr), vaddr); } +#else +#define TOP_ZONE ZONE_NORMAL #endif int page_is_ram(unsigned long pfn) @@ -267,14 +270,9 @@ void __init limit_zone_pfn(enum zone_type zone, unsigned long pfn_limit) */ int dma_pfn_limit_to_zone(u64 pfn_limit) { - enum zone_type top_zone = ZONE_NORMAL; int i; -#ifdef CONFIG_HIGHMEM - top_zone = ZONE_HIGHMEM; -#endif - - for (i = top_zone; i >= 0; i--) { + for (i = TOP_ZONE; i >= 0; i--) { if (max_zone_pfns[i] <= pfn_limit) return i; } @@ -289,7 +287,6 @@ void __init paging_init(void) { unsigned long long total_ram = memblock_phys_mem_size(); phys_addr_t top_of_ram = memblock_end_of_DRAM(); - enum zone_type top_zone; #ifdef CONFIG_PPC32 unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1); @@ -313,13 +310,9 @@ void __init paging_init(void) (long int)((top_of_ram - total_ram) >> 20)); #ifdef CONFIG_HIGHMEM - top_zone = ZONE_HIGHMEM; limit_zone_pfn(ZONE_NORMAL, lowmem_end_addr >> PAGE_SHIFT); -#else - top_zone = ZONE_NORMAL; #endif - - limit_zone_pfn(top_zone, top_of_ram >> PAGE_SHIFT); + limit_zone_pfn(TOP_ZONE, top_of_ram >> PAGE_SHIFT); zone_limits_final = true; free_area_init_nodes(max_zone_pfns); @@ -498,7 +491,10 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, * We don't need to worry about _PAGE_PRESENT here because we are * called with either mm->page_table_lock held or ptl lock held */ - unsigned long access = 0, trap; + unsigned long access, trap; + + if (radix_enabled()) + return; /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ if (!pte_young(*ptep) || address >= TASK_SIZE) @@ -511,13 +507,19 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, * * We also avoid filling the hash if not coming from a fault */ - if (current->thread.regs == NULL) - return; - trap = TRAP(current->thread.regs); - if (trap == 0x400) - access |= _PAGE_EXEC; - else if (trap != 0x300) + + trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL; + switch (trap) { + case 0x300: + access = 0UL; + break; + case 0x400: + access = _PAGE_EXEC; + break; + default: return; + } + hash_preload(vma->vm_mm, address, access, trap); #endif /* CONFIG_PPC_STD_MMU */ #if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \ diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index 4087705ba..2f1e44362 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -26,6 +26,9 @@ #include <linux/mm.h> #include <linux/random.h> #include <linux/sched.h> +#include <linux/elf-randomize.h> +#include <linux/security.h> +#include <linux/mman.h> /* * Top of mmap area (just below the process stack). @@ -78,6 +81,111 @@ static inline unsigned long mmap_base(unsigned long rnd) return PAGE_ALIGN(TASK_SIZE - gap - rnd); } +#ifdef CONFIG_PPC_RADIX_MMU +/* + * Same function as generic code used only for radix, because we don't need to overload + * the generic one. But we will have to duplicate, because hash select + * HAVE_ARCH_UNMAPPED_AREA + */ +static unsigned long +radix__arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct vm_unmapped_area_info info; + + if (len > TASK_SIZE - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) + return addr; + + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + + info.flags = 0; + info.length = len; + info.low_limit = mm->mmap_base; + info.high_limit = TASK_SIZE; + info.align_mask = 0; + return vm_unmapped_area(&info); +} + +static unsigned long +radix__arch_get_unmapped_area_topdown(struct file *filp, + const unsigned long addr0, + const unsigned long len, + const unsigned long pgoff, + const unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + unsigned long addr = addr0; + struct vm_unmapped_area_info info; + + /* requested length too big for entire address space */ + if (len > TASK_SIZE - mmap_min_addr) + return -ENOMEM; + + if (flags & MAP_FIXED) + return addr; + + /* requesting a specific address */ + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.high_limit = mm->mmap_base; + info.align_mask = 0; + addr = vm_unmapped_area(&info); + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + if (addr & ~PAGE_MASK) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + addr = vm_unmapped_area(&info); + } + + return addr; +} + +static void radix__arch_pick_mmap_layout(struct mm_struct *mm, + unsigned long random_factor) +{ + if (mmap_is_legacy()) { + mm->mmap_base = TASK_UNMAPPED_BASE; + mm->get_unmapped_area = radix__arch_get_unmapped_area; + } else { + mm->mmap_base = mmap_base(random_factor); + mm->get_unmapped_area = radix__arch_get_unmapped_area_topdown; + } +} +#else +/* dummy */ +extern void radix__arch_pick_mmap_layout(struct mm_struct *mm, + unsigned long random_factor); +#endif /* * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: @@ -89,6 +197,8 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); + if (radix_enabled()) + return radix__arch_pick_mmap_layout(mm, random_factor); /* * Fall back to the standard layout if the personality * bit is set, or if the expected stack growth is unlimited: diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_book3s64.c index 9ca6fe16c..196222227 100644 --- a/arch/powerpc/mm/mmu_context_hash64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -58,6 +58,17 @@ again: return index; } EXPORT_SYMBOL_GPL(__init_new_context); +static int radix__init_new_context(struct mm_struct *mm, int index) +{ + unsigned long rts_field; + + /* + * set the process table entry, + */ + rts_field = radix__get_tree_size(); + process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE); + return 0; +} int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { @@ -67,13 +78,27 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) if (index < 0) return index; - /* The old code would re-promote on fork, we don't do that - * when using slices as it could cause problem promoting slices - * that have been forced down to 4K - */ - if (slice_mm_new_context(mm)) - slice_set_user_psize(mm, mmu_virtual_psize); - subpage_prot_init_new_context(mm); + if (radix_enabled()) { + radix__init_new_context(mm, index); + } else { + + /* The old code would re-promote on fork, we don't do that + * when using slices as it could cause problem promoting slices + * that have been forced down to 4K + * + * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check + * explicitly against context.id == 0. This ensures that we + * properly initialize context slice details for newly allocated + * mm's (which will have id == 0) and don't alter context slice + * inherited via fork (which will have id != 0). + * + * We should not be calling init_new_context() on init_mm. Hence a + * check against 0 is ok. + */ + if (mm->context.id == 0) + slice_set_user_psize(mm, mmu_virtual_psize); + subpage_prot_init_new_context(mm); + } mm->context.id = index; #ifdef CONFIG_PPC_ICSWX mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL); @@ -144,8 +169,19 @@ void destroy_context(struct mm_struct *mm) mm->context.cop_lockp = NULL; #endif /* CONFIG_PPC_ICSWX */ + if (radix_enabled()) + process_tb[mm->context.id].prtb1 = 0; + else + subpage_prot_free(mm); destroy_pagetable_page(mm); __destroy_context(mm->context.id); - subpage_prot_free(mm); mm->context.id = MMU_NO_CONTEXT; } + +#ifdef CONFIG_PPC_RADIX_MMU +void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) +{ + mtspr(SPRN_PID, next->context.id); + asm volatile("isync": : :"memory"); +} +#endif diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index 986afbc22..7d95bc402 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -226,7 +226,8 @@ static void context_check_map(void) static void context_check_map(void) { } #endif -void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) +void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) { unsigned int i, id, cpu = smp_processor_id(); unsigned long *map; @@ -334,8 +335,7 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm) mm->context.active = 0; #ifdef CONFIG_PPC_MM_SLICES - if (slice_mm_new_context(mm)) - slice_set_user_psize(mm, mmu_virtual_psize); + slice_set_user_psize(mm, mmu_virtual_psize); #endif return 0; diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index bfb7c0bca..6af65327c 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -108,11 +108,6 @@ extern unsigned long Hash_size, Hash_mask; #endif /* CONFIG_PPC32 */ -#ifdef CONFIG_PPC64 -extern int map_kernel_page(unsigned long ea, unsigned long pa, - unsigned long flags); -#endif /* CONFIG_PPC64 */ - extern unsigned long ioremap_bot; extern unsigned long __max_low_memory; extern phys_addr_t __initial_memory_limit_addr; diff --git a/arch/powerpc/mm/pgtable-book3e.c b/arch/powerpc/mm/pgtable-book3e.c new file mode 100644 index 000000000..a2298930f --- /dev/null +++ b/arch/powerpc/mm/pgtable-book3e.c @@ -0,0 +1,122 @@ +/* + * Copyright 2005, Paul Mackerras, IBM Corporation. + * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation. + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/sched.h> +#include <linux/memblock.h> +#include <asm/pgalloc.h> +#include <asm/tlb.h> +#include <asm/dma.h> + +#include "mmu_decl.h" + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* + * On Book3E CPUs, the vmemmap is currently mapped in the top half of + * the vmalloc space using normal page tables, though the size of + * pages encoded in the PTEs can be different + */ +int __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + /* Create a PTE encoding without page size */ + unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED | + _PAGE_KERNEL_RW; + + /* PTEs only contain page size encodings up to 32M */ + BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf); + + /* Encode the size in the PTE */ + flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8; + + /* For each PTE for that area, map things. Note that we don't + * increment phys because all PTEs are of the large size and + * thus must have the low bits clear + */ + for (i = 0; i < page_size; i += PAGE_SIZE) + BUG_ON(map_kernel_page(start + i, phys, flags)); + + return 0; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void vmemmap_remove_mapping(unsigned long start, + unsigned long page_size) +{ +} +#endif +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + +static __ref void *early_alloc_pgtable(unsigned long size) +{ + void *pt; + + pt = __va(memblock_alloc_base(size, size, __pa(MAX_DMA_ADDRESS))); + memset(pt, 0, size); + + return pt; +} + +/* + * map_kernel_page currently only called by __ioremap + * map_kernel_page adds an entry to the ioremap page table + * and adds an entry to the HPT, possibly bolting it + */ +int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) +{ + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + BUILD_BUG_ON(TASK_SIZE_USER64 > PGTABLE_RANGE); + if (slab_is_available()) { + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, + __pgprot(flags))); + } else { + pgdp = pgd_offset_k(ea); +#ifndef __PAGETABLE_PUD_FOLDED + if (pgd_none(*pgdp)) { + pudp = early_alloc_pgtable(PUD_TABLE_SIZE); + BUG_ON(pudp == NULL); + pgd_populate(&init_mm, pgdp, pudp); + } +#endif /* !__PAGETABLE_PUD_FOLDED */ + pudp = pud_offset(pgdp, ea); + if (pud_none(*pudp)) { + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); + BUG_ON(pmdp == NULL); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (!pmd_present(*pmdp)) { + ptep = early_alloc_pgtable(PAGE_SIZE); + BUG_ON(ptep == NULL); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, + __pgprot(flags))); + } + + smp_wmb(); + return 0; +} diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c new file mode 100644 index 000000000..670318766 --- /dev/null +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -0,0 +1,115 @@ +/* + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/sched.h> +#include <asm/pgalloc.h> +#include <asm/tlb.h> + +#include "mmu_decl.h" +#include <trace/events/thp.h> + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * This is called when relaxing access to a hugepage. It's also called in the page + * fault path when we don't hit any of the major fault cases, ie, a minor + * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have + * handled those two for us, we additionally deal with missing execute + * permission here on some processors + */ +int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp, pmd_t entry, int dirty) +{ + int changed; +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pmd_trans_huge(*pmdp)); + assert_spin_locked(&vma->vm_mm->page_table_lock); +#endif + changed = !pmd_same(*(pmdp), entry); + if (changed) { + __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry)); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + } + return changed; +} + +int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); +} +/* + * set a new huge pmd. We should not be called for updating + * an existing pmd entry. That should go via pmd_hugepage_update. + */ +void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ +#ifdef CONFIG_DEBUG_VM + WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); + assert_spin_locked(&mm->page_table_lock); + WARN_ON(!pmd_trans_huge(pmd)); +#endif + trace_hugepage_set_pmd(addr, pmd_val(pmd)); + return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); +} +/* + * We use this to invalidate a pmdp entry before switching from a + * hugepte to regular pmd entry. + */ +void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + /* + * This ensures that generic code that rely on IRQ disabling + * to prevent a parallel THP split work as expected. + */ + kick_all_cpus_sync(); +} + +static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) +{ + return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); +} + +pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) +{ + unsigned long pmdv; + + pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; + return pmd_set_protbits(__pmd(pmdv), pgprot); +} + +pmd_t mk_pmd(struct page *page, pgprot_t pgprot) +{ + return pfn_pmd(page_to_pfn(page), pgprot); +} + +pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + unsigned long pmdv; + + pmdv = pmd_val(pmd); + pmdv &= _HPAGE_CHG_MASK; + return pmd_set_protbits(__pmd(pmdv), newprot); +} + +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a HUGE PMD entry in the linux page tables. + * We use it to preload an HPTE into the hash table corresponding to + * the updated linux HUGE PMD entry. + */ +void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd) +{ + return; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c new file mode 100644 index 000000000..c23e286a6 --- /dev/null +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -0,0 +1,342 @@ +/* + * Copyright 2005, Paul Mackerras, IBM Corporation. + * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation. + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/sched.h> +#include <asm/pgalloc.h> +#include <asm/tlb.h> + +#include "mmu_decl.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/thp.h> + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* + * On hash-based CPUs, the vmemmap is bolted in the hash table. + * + */ +int __meminit hash__vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + int rc = htab_bolt_mapping(start, start + page_size, phys, + pgprot_val(PAGE_KERNEL), + mmu_vmemmap_psize, mmu_kernel_ssize); + if (rc < 0) { + int rc2 = htab_remove_mapping(start, start + page_size, + mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON(rc2 && (rc2 != -ENOENT)); + } + return rc; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void hash__vmemmap_remove_mapping(unsigned long start, + unsigned long page_size) +{ + int rc = htab_remove_mapping(start, start + page_size, + mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON((rc < 0) && (rc != -ENOENT)); + WARN_ON(rc == -ENOENT); +} +#endif +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + +/* + * map_kernel_page currently only called by __ioremap + * map_kernel_page adds an entry to the ioremap page table + * and adds an entry to the HPT, possibly bolting it + */ +int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) +{ + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE); + if (slab_is_available()) { + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, + __pgprot(flags))); + } else { + /* + * If the mm subsystem is not fully up, we cannot create a + * linux page table entry for this mapping. Simply bolt an + * entry in the hardware page table. + * + */ + if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags, + mmu_io_psize, mmu_kernel_ssize)) { + printk(KERN_ERR "Failed to do bolted mapping IO " + "memory at %016lx !\n", pa); + return -ENOMEM; + } + } + + smp_wmb(); + return 0; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long clr, + unsigned long set) +{ + __be64 old_be, tmp; + unsigned long old; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pmd_trans_huge(*pmdp)); + assert_spin_locked(&mm->page_table_lock); +#endif + + __asm__ __volatile__( + "1: ldarx %0,0,%3\n\ + and. %1,%0,%6\n\ + bne- 1b \n\ + andc %1,%0,%4 \n\ + or %1,%1,%7\n\ + stdcx. %1,0,%3 \n\ + bne- 1b" + : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp) + : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp), + "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set)) + : "cc" ); + + old = be64_to_cpu(old_be); + + trace_hugepage_update(addr, old, clr, set); + if (old & H_PAGE_HASHPTE) + hpte_do_hugepage_flush(mm, addr, pmdp, old); + return old; +} + +pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(pmd_trans_huge(*pmdp)); + + pmd = *pmdp; + pmd_clear(pmdp); + /* + * Wait for all pending hash_page to finish. This is needed + * in case of subpage collapse. When we collapse normal pages + * to hugepage, we first clear the pmd, then invalidate all + * the PTE entries. The assumption here is that any low level + * page fault will see a none pmd and take the slow path that + * will wait on mmap_sem. But we could very well be in a + * hash_page with local ptep pointer value. Such a hash page + * can result in adding new HPTE entries for normal subpages. + * That means we could be modifying the page content as we + * copy them to a huge page. So wait for parallel hash_page + * to finish before invalidating HPTE entries. We can do this + * by sending an IPI to all the cpus and executing a dummy + * function there. + */ + kick_all_cpus_sync(); + /* + * Now invalidate the hpte entries in the range + * covered by pmd. This make sure we take a + * fault and will find the pmd as none, which will + * result in a major fault which takes mmap_sem and + * hence wait for collapse to complete. Without this + * the __collapse_huge_page_copy can result in copying + * the old content. + */ + flush_tlb_pmd_range(vma->vm_mm, &pmd, address); + return pmd; +} + +/* + * We want to put the pgtable in pmd and use pgtable for tracking + * the base page size hptes + */ +void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + pgtable_t *pgtable_slot; + assert_spin_locked(&mm->page_table_lock); + /* + * we store the pgtable in the second half of PMD + */ + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + *pgtable_slot = pgtable; + /* + * expose the deposited pgtable to other cpus. + * before we set the hugepage PTE at pmd level + * hash fault code looks at the deposted pgtable + * to store hash index values. + */ + smp_wmb(); +} + +pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) +{ + pgtable_t pgtable; + pgtable_t *pgtable_slot; + + assert_spin_locked(&mm->page_table_lock); + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + pgtable = *pgtable_slot; + /* + * Once we withdraw, mark the entry NULL. + */ + *pgtable_slot = NULL; + /* + * We store HPTE information in the deposited PTE fragment. + * zero out the content on withdraw. + */ + memset(pgtable, 0, PTE_FRAG_SIZE); + return pgtable; +} + +void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(REGION_ID(address) != USER_REGION_ID); + + /* + * We can't mark the pmd none here, because that will cause a race + * against exit_mmap. We need to continue mark pmd TRANS HUGE, while + * we spilt, but at the same time we wan't rest of the ppc64 code + * not to insert hash pte on this, because we will be modifying + * the deposited pgtable in the caller of this function. Hence + * clear the _PAGE_USER so that we move the fault handling to + * higher level function and that will serialize against ptl. + * We need to flush existing hash pte entries here even though, + * the translation is still valid, because we will withdraw + * pgtable_t after this. + */ + pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED); +} + +/* + * A linux hugepage PMD was changed and the corresponding hash table entries + * neesd to be flushed. + */ +void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long old_pmd) +{ + int ssize; + unsigned int psize; + unsigned long vsid; + unsigned long flags = 0; + const struct cpumask *tmp; + + /* get the base page size,vsid and segment size */ +#ifdef CONFIG_DEBUG_VM + psize = get_slice_psize(mm, addr); + BUG_ON(psize == MMU_PAGE_16M); +#endif + if (old_pmd & H_PAGE_COMBO) + psize = MMU_PAGE_4K; + else + psize = MMU_PAGE_64K; + + if (!is_kernel_addr(addr)) { + ssize = user_segment_size(addr); + vsid = get_vsid(mm->context.id, addr, ssize); + WARN_ON(vsid == 0); + } else { + vsid = get_kernel_vsid(addr, mmu_kernel_ssize); + ssize = mmu_kernel_ssize; + } + + tmp = cpumask_of(smp_processor_id()); + if (cpumask_equal(mm_cpumask(mm), tmp)) + flags |= HPTE_LOCAL_UPDATE; + + return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags); +} + +pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old_pmd; + pgtable_t pgtable; + unsigned long old; + pgtable_t *pgtable_slot; + + old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); + old_pmd = __pmd(old); + /* + * We have pmd == none and we are holding page_table_lock. + * So we can safely go and clear the pgtable hash + * index info. + */ + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + pgtable = *pgtable_slot; + /* + * Let's zero out old valid and hash index details + * hash fault look at them. + */ + memset(pgtable, 0, PTE_FRAG_SIZE); + /* + * Serialize against find_linux_pte_or_hugepte which does lock-less + * lookup in page tables with local interrupts disabled. For huge pages + * it casts pmd_t to pte_t. Since format of pte_t is different from + * pmd_t we want to prevent transit from pmd pointing to page table + * to pmd pointing to huge page (and back) while interrupts are disabled. + * We clear pmd to possibly replace it with page table pointer in + * different code paths. So make sure we wait for the parallel + * find_linux_pte_or_hugepage to finish. + */ + kick_all_cpus_sync(); + return old_pmd; +} + +int hash__has_transparent_hugepage(void) +{ + + if (!mmu_has_feature(MMU_FTR_16M_PAGE)) + return 0; + /* + * We support THP only if PMD_SIZE is 16MB. + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) + return 0; + /* + * We need to make sure that we support 16MB hugepage in a segement + * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE + * of 64K. + */ + /* + * If we have 64K HPTE, we will be using that by default + */ + if (mmu_psize_defs[MMU_PAGE_64K].shift && + (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1)) + return 0; + /* + * Ok we only have 4K HPTE + */ + if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1) + return 0; + + return 1; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c new file mode 100644 index 000000000..7931e1496 --- /dev/null +++ b/arch/powerpc/mm/pgtable-radix.c @@ -0,0 +1,525 @@ +/* + * Page table handling routines for radix page table. + * + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/sched.h> +#include <linux/memblock.h> +#include <linux/of_fdt.h> + +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/dma.h> +#include <asm/machdep.h> +#include <asm/mmu.h> +#include <asm/firmware.h> + +#include <trace/events/thp.h> + +static int native_update_partition_table(u64 patb1) +{ + partition_tb->patb1 = cpu_to_be64(patb1); + return 0; +} + +static __ref void *early_alloc_pgtable(unsigned long size) +{ + void *pt; + + pt = __va(memblock_alloc_base(size, size, MEMBLOCK_ALLOC_ANYWHERE)); + memset(pt, 0, size); + + return pt; +} + +int radix__map_kernel_page(unsigned long ea, unsigned long pa, + pgprot_t flags, + unsigned int map_page_size) +{ + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + /* + * Make sure task size is correct as per the max adddr + */ + BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE); + if (slab_is_available()) { + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + if (map_page_size == PUD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + if (map_page_size == PMD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + } else { + pgdp = pgd_offset_k(ea); + if (pgd_none(*pgdp)) { + pudp = early_alloc_pgtable(PUD_TABLE_SIZE); + BUG_ON(pudp == NULL); + pgd_populate(&init_mm, pgdp, pudp); + } + pudp = pud_offset(pgdp, ea); + if (map_page_size == PUD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + if (pud_none(*pudp)) { + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); + BUG_ON(pmdp == NULL); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (map_page_size == PMD_SIZE) { + ptep = (pte_t *)pudp; + goto set_the_pte; + } + if (!pmd_present(*pmdp)) { + ptep = early_alloc_pgtable(PAGE_SIZE); + BUG_ON(ptep == NULL); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + } + +set_the_pte: + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, flags)); + smp_wmb(); + return 0; +} + +static void __init radix_init_pgtable(void) +{ + int loop_count; + u64 base, end, start_addr; + unsigned long rts_field; + struct memblock_region *reg; + unsigned long linear_page_size; + + /* We don't support slb for radix */ + mmu_slb_size = 0; + /* + * Create the linear mapping, using standard page size for now + */ + loop_count = 0; + for_each_memblock(memory, reg) { + + start_addr = reg->base; + +redo: + if (loop_count < 1 && mmu_psize_defs[MMU_PAGE_1G].shift) + linear_page_size = PUD_SIZE; + else if (loop_count < 2 && mmu_psize_defs[MMU_PAGE_2M].shift) + linear_page_size = PMD_SIZE; + else + linear_page_size = PAGE_SIZE; + + base = _ALIGN_UP(start_addr, linear_page_size); + end = _ALIGN_DOWN(reg->base + reg->size, linear_page_size); + + pr_info("Mapping range 0x%lx - 0x%lx with 0x%lx\n", + (unsigned long)base, (unsigned long)end, + linear_page_size); + + while (base < end) { + radix__map_kernel_page((unsigned long)__va(base), + base, PAGE_KERNEL_X, + linear_page_size); + base += linear_page_size; + } + /* + * map the rest using lower page size + */ + if (end < reg->base + reg->size) { + start_addr = end; + loop_count++; + goto redo; + } + } + /* + * Allocate Partition table and process table for the + * host. + */ + BUILD_BUG_ON_MSG((PRTB_SIZE_SHIFT > 23), "Process table size too large."); + process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT); + /* + * Fill in the process table. + */ + rts_field = radix__get_tree_size(); + process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE); + /* + * Fill in the partition table. We are suppose to use effective address + * of process table here. But our linear mapping also enable us to use + * physical address here. + */ + ppc_md.update_partition_table(__pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR); + pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd); +} + +static void __init radix_init_partition_table(void) +{ + unsigned long rts_field; + + rts_field = radix__get_tree_size(); + + BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large."); + partition_tb = early_alloc_pgtable(1UL << PATB_SIZE_SHIFT); + partition_tb->patb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | + RADIX_PGD_INDEX_SIZE | PATB_HR); + printk("Partition table %p\n", partition_tb); + + memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); + /* + * update partition table control register, + * 64 K size. + */ + mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); +} + +void __init radix_init_native(void) +{ + ppc_md.update_partition_table = native_update_partition_table; +} + +static int __init get_idx_from_shift(unsigned int shift) +{ + int idx = -1; + + switch (shift) { + case 0xc: + idx = MMU_PAGE_4K; + break; + case 0x10: + idx = MMU_PAGE_64K; + break; + case 0x15: + idx = MMU_PAGE_2M; + break; + case 0x1e: + idx = MMU_PAGE_1G; + break; + } + return idx; +} + +static int __init radix_dt_scan_page_sizes(unsigned long node, + const char *uname, int depth, + void *data) +{ + int size = 0; + int shift, idx; + unsigned int ap; + const __be32 *prop; + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size); + if (!prop) + return 0; + + pr_info("Page sizes from device-tree:\n"); + for (; size >= 4; size -= 4, ++prop) { + + struct mmu_psize_def *def; + + /* top 3 bit is AP encoding */ + shift = be32_to_cpu(prop[0]) & ~(0xe << 28); + ap = be32_to_cpu(prop[0]) >> 29; + pr_info("Page size sift = %d AP=0x%x\n", shift, ap); + + idx = get_idx_from_shift(shift); + if (idx < 0) + continue; + + def = &mmu_psize_defs[idx]; + def->shift = shift; + def->ap = ap; + } + + /* needed ? */ + cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; + return 1; +} + +static void __init radix_init_page_sizes(void) +{ + int rc; + + /* + * Try to find the available page sizes in the device-tree + */ + rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL); + if (rc != 0) /* Found */ + goto found; + /* + * let's assume we have page 4k and 64k support + */ + mmu_psize_defs[MMU_PAGE_4K].shift = 12; + mmu_psize_defs[MMU_PAGE_4K].ap = 0x0; + + mmu_psize_defs[MMU_PAGE_64K].shift = 16; + mmu_psize_defs[MMU_PAGE_64K].ap = 0x5; +found: +#ifdef CONFIG_SPARSEMEM_VMEMMAP + if (mmu_psize_defs[MMU_PAGE_2M].shift) { + /* + * map vmemmap using 2M if available + */ + mmu_vmemmap_psize = MMU_PAGE_2M; + } +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + return; +} + +void __init radix__early_init_mmu(void) +{ + unsigned long lpcr; + +#ifdef CONFIG_PPC_64K_PAGES + /* PAGE_SIZE mappings */ + mmu_virtual_psize = MMU_PAGE_64K; +#else + mmu_virtual_psize = MMU_PAGE_4K; +#endif + +#ifdef CONFIG_SPARSEMEM_VMEMMAP + /* vmemmap mapping */ + mmu_vmemmap_psize = mmu_virtual_psize; +#endif + /* + * initialize page table size + */ + __pte_index_size = RADIX_PTE_INDEX_SIZE; + __pmd_index_size = RADIX_PMD_INDEX_SIZE; + __pud_index_size = RADIX_PUD_INDEX_SIZE; + __pgd_index_size = RADIX_PGD_INDEX_SIZE; + __pmd_cache_index = RADIX_PMD_INDEX_SIZE; + __pte_table_size = RADIX_PTE_TABLE_SIZE; + __pmd_table_size = RADIX_PMD_TABLE_SIZE; + __pud_table_size = RADIX_PUD_TABLE_SIZE; + __pgd_table_size = RADIX_PGD_TABLE_SIZE; + + __pmd_val_bits = RADIX_PMD_VAL_BITS; + __pud_val_bits = RADIX_PUD_VAL_BITS; + __pgd_val_bits = RADIX_PGD_VAL_BITS; + + __kernel_virt_start = RADIX_KERN_VIRT_START; + __kernel_virt_size = RADIX_KERN_VIRT_SIZE; + __vmalloc_start = RADIX_VMALLOC_START; + __vmalloc_end = RADIX_VMALLOC_END; + vmemmap = (struct page *)RADIX_VMEMMAP_BASE; + ioremap_bot = IOREMAP_BASE; + +#ifdef CONFIG_PCI + pci_io_base = ISA_IO_BASE; +#endif + + /* + * For now radix also use the same frag size + */ + __pte_frag_nr = H_PTE_FRAG_NR; + __pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT; + + radix_init_page_sizes(); + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr | LPCR_UPRT); + radix_init_partition_table(); + } + + radix_init_pgtable(); +} + +void radix__early_init_mmu_secondary(void) +{ + unsigned long lpcr; + /* + * update partition table control register and UPRT + */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) { + lpcr = mfspr(SPRN_LPCR); + mtspr(SPRN_LPCR, lpcr | LPCR_UPRT); + + mtspr(SPRN_PTCR, + __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + } +} + +void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + /* + * We limit the allocation that depend on ppc64_rma_size + * to first_memblock_size. We also clamp it to 1GB to + * avoid some funky things such as RTAS bugs. + * + * On radix config we really don't have a limitation + * on real mode access. But keeping it as above works + * well enough. + */ + ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000); + /* + * Finally limit subsequent allocations. We really don't want + * to limit the memblock allocations to rma_size. FIXME!! should + * we even limit at all ? + */ + memblock_set_current_limit(first_memblock_base + first_memblock_size); +} + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +int __meminit radix__vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + /* Create a PTE encoding */ + unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW; + + BUG_ON(radix__map_kernel_page(start, phys, __pgprot(flags), page_size)); + return 0; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) +{ + /* FIXME!! intel does more. We should free page tables mapping vmemmap ? */ +} +#endif +#endif + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long clr, + unsigned long set) +{ + unsigned long old; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!radix__pmd_trans_huge(*pmdp)); + assert_spin_locked(&mm->page_table_lock); +#endif + + old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1); + trace_hugepage_update(addr, old, clr, set); + + return old; +} + +pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) + +{ + pmd_t pmd; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(radix__pmd_trans_huge(*pmdp)); + /* + * khugepaged calls this for normal pmd + */ + pmd = *pmdp; + pmd_clear(pmdp); + /*FIXME!! Verify whether we need this kick below */ + kick_all_cpus_sync(); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return pmd; +} + +/* + * For us pgtable_t is pte_t *. Inorder to save the deposisted + * page table, we consider the allocated page table as a list + * head. On withdraw we need to make sure we zero out the used + * list_head memory area. + */ +void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + struct list_head *lh = (struct list_head *) pgtable; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + if (!pmd_huge_pte(mm, pmdp)) + INIT_LIST_HEAD(lh); + else + list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); + pmd_huge_pte(mm, pmdp) = pgtable; +} + +pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) +{ + pte_t *ptep; + pgtable_t pgtable; + struct list_head *lh; + + assert_spin_locked(pmd_lockptr(mm, pmdp)); + + /* FIFO */ + pgtable = pmd_huge_pte(mm, pmdp); + lh = (struct list_head *) pgtable; + if (list_empty(lh)) + pmd_huge_pte(mm, pmdp) = NULL; + else { + pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; + list_del(lh); + } + ptep = (pte_t *) pgtable; + *ptep = __pte(0); + ptep++; + *ptep = __pte(0); + return pgtable; +} + + +pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old_pmd; + unsigned long old; + + old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); + old_pmd = __pmd(old); + /* + * Serialize against find_linux_pte_or_hugepte which does lock-less + * lookup in page tables with local interrupts disabled. For huge pages + * it casts pmd_t to pte_t. Since format of pte_t is different from + * pmd_t we want to prevent transit from pmd pointing to page table + * to pmd pointing to huge page (and back) while interrupts are disabled. + * We clear pmd to possibly replace it with page table pointer in + * different code paths. So make sure we wait for the parallel + * find_linux_pte_or_hugepage to finish. + */ + kick_all_cpus_sync(); + return old_pmd; +} + +int radix__has_transparent_hugepage(void) +{ + /* For radix 2M at PMD level means thp */ + if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT) + return 1; + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index de37ff445..88a307504 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -38,14 +38,25 @@ static inline int is_exec_fault(void) /* We only try to do i/d cache coherency on stuff that looks like * reasonably "normal" PTEs. We currently require a PTE to be present - * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE. We also only do that + * and we avoid _PAGE_SPECIAL and cache inhibited pte. We also only do that * on userspace PTEs */ static inline int pte_looks_normal(pte_t pte) { + +#if defined(CONFIG_PPC_BOOK3S_64) + if ((pte_val(pte) & (_PAGE_PRESENT | _PAGE_SPECIAL)) == _PAGE_PRESENT) { + if (pte_ci(pte)) + return 0; + if (pte_user(pte)) + return 1; + } + return 0; +#else return (pte_val(pte) & - (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) == - (_PAGE_PRESENT | _PAGE_USER); + (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) == + (_PAGE_PRESENT | _PAGE_USER); +#endif } static struct page *maybe_pte_to_page(pte_t pte) @@ -71,6 +82,9 @@ static struct page *maybe_pte_to_page(pte_t pte) static pte_t set_pte_filter(pte_t pte) { + if (radix_enabled()) + return pte; + pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || cpu_has_feature(CPU_FTR_NOEXECUTE))) { @@ -177,8 +191,8 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, * _PAGE_PRESENT, but we can be sure that it is not in hpte. * Hence we can use set_pte_at for them. */ - VM_WARN_ON((pte_val(*ptep) & (_PAGE_PRESENT | _PAGE_USER)) == - (_PAGE_PRESENT | _PAGE_USER)); + VM_WARN_ON(pte_present(*ptep) && !pte_protnone(*ptep)); + /* * Add the pte bit when tryint set a pte */ diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index bf7bf32b5..7f922f557 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -84,7 +84,7 @@ __init_refok pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long add pte_t *pte; if (slab_is_available()) { - pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); + pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); } else { pte = __va(memblock_alloc(PAGE_SIZE, PAGE_SIZE)); if (pte) @@ -97,7 +97,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { struct page *ptepage; - gfp_t flags = GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO; + gfp_t flags = GFP_KERNEL | __GFP_ZERO; ptepage = alloc_pages(flags, 0); if (!ptepage) diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 347106080..f5e8d4edb 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -55,104 +55,63 @@ #include "mmu_decl.h" -#define CREATE_TRACE_POINTS -#include <trace/events/thp.h> - -/* Some sanity checking */ -#if TASK_SIZE_USER64 > PGTABLE_RANGE -#error TASK_SIZE_USER64 exceeds pagetable range -#endif - #ifdef CONFIG_PPC_STD_MMU_64 #if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT)) #error TASK_SIZE_USER64 exceeds user VSID range #endif #endif -unsigned long ioremap_bot = IOREMAP_BASE; - -#ifdef CONFIG_PPC_MMU_NOHASH -static __ref void *early_alloc_pgtable(unsigned long size) -{ - void *pt; - - pt = __va(memblock_alloc_base(size, size, __pa(MAX_DMA_ADDRESS))); - memset(pt, 0, size); - - return pt; -} -#endif /* CONFIG_PPC_MMU_NOHASH */ - +#ifdef CONFIG_PPC_BOOK3S_64 /* - * map_kernel_page currently only called by __ioremap - * map_kernel_page adds an entry to the ioremap page table - * and adds an entry to the HPT, possibly bolting it + * partition table and process table for ISA 3.0 */ -int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) -{ - pgd_t *pgdp; - pud_t *pudp; - pmd_t *pmdp; - pte_t *ptep; - - if (slab_is_available()) { - pgdp = pgd_offset_k(ea); - pudp = pud_alloc(&init_mm, pgdp, ea); - if (!pudp) - return -ENOMEM; - pmdp = pmd_alloc(&init_mm, pudp, ea); - if (!pmdp) - return -ENOMEM; - ptep = pte_alloc_kernel(pmdp, ea); - if (!ptep) - return -ENOMEM; - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); - } else { -#ifdef CONFIG_PPC_MMU_NOHASH - pgdp = pgd_offset_k(ea); -#ifdef PUD_TABLE_SIZE - if (pgd_none(*pgdp)) { - pudp = early_alloc_pgtable(PUD_TABLE_SIZE); - BUG_ON(pudp == NULL); - pgd_populate(&init_mm, pgdp, pudp); - } -#endif /* PUD_TABLE_SIZE */ - pudp = pud_offset(pgdp, ea); - if (pud_none(*pudp)) { - pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); - BUG_ON(pmdp == NULL); - pud_populate(&init_mm, pudp, pmdp); - } - pmdp = pmd_offset(pudp, ea); - if (!pmd_present(*pmdp)) { - ptep = early_alloc_pgtable(PAGE_SIZE); - BUG_ON(ptep == NULL); - pmd_populate_kernel(&init_mm, pmdp, ptep); - } - ptep = pte_offset_kernel(pmdp, ea); - set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, - __pgprot(flags))); -#else /* CONFIG_PPC_MMU_NOHASH */ - /* - * If the mm subsystem is not fully up, we cannot create a - * linux page table entry for this mapping. Simply bolt an - * entry in the hardware page table. - * - */ - if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags, - mmu_io_psize, mmu_kernel_ssize)) { - printk(KERN_ERR "Failed to do bolted mapping IO " - "memory at %016lx !\n", pa); - return -ENOMEM; - } -#endif /* !CONFIG_PPC_MMU_NOHASH */ - } - - smp_wmb(); - return 0; -} - +struct prtb_entry *process_tb; +struct patb_entry *partition_tb; +/* + * page table size + */ +unsigned long __pte_index_size; +EXPORT_SYMBOL(__pte_index_size); +unsigned long __pmd_index_size; +EXPORT_SYMBOL(__pmd_index_size); +unsigned long __pud_index_size; +EXPORT_SYMBOL(__pud_index_size); +unsigned long __pgd_index_size; +EXPORT_SYMBOL(__pgd_index_size); +unsigned long __pmd_cache_index; +EXPORT_SYMBOL(__pmd_cache_index); +unsigned long __pte_table_size; +EXPORT_SYMBOL(__pte_table_size); +unsigned long __pmd_table_size; +EXPORT_SYMBOL(__pmd_table_size); +unsigned long __pud_table_size; +EXPORT_SYMBOL(__pud_table_size); +unsigned long __pgd_table_size; +EXPORT_SYMBOL(__pgd_table_size); +unsigned long __pmd_val_bits; +EXPORT_SYMBOL(__pmd_val_bits); +unsigned long __pud_val_bits; +EXPORT_SYMBOL(__pud_val_bits); +unsigned long __pgd_val_bits; +EXPORT_SYMBOL(__pgd_val_bits); +unsigned long __kernel_virt_start; +EXPORT_SYMBOL(__kernel_virt_start); +unsigned long __kernel_virt_size; +EXPORT_SYMBOL(__kernel_virt_size); +unsigned long __vmalloc_start; +EXPORT_SYMBOL(__vmalloc_start); +unsigned long __vmalloc_end; +EXPORT_SYMBOL(__vmalloc_end); +struct page *vmemmap; +EXPORT_SYMBOL(vmemmap); +unsigned long __pte_frag_nr; +EXPORT_SYMBOL(__pte_frag_nr); +unsigned long __pte_frag_size_shift; +EXPORT_SYMBOL(__pte_frag_size_shift); +unsigned long ioremap_bot; +#else /* !CONFIG_PPC_BOOK3S_64 */ +unsigned long ioremap_bot = IOREMAP_BASE; +#endif /** * __ioremap_at - Low level function to establish the page tables @@ -167,12 +126,8 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size, if ((flags & _PAGE_PRESENT) == 0) flags |= pgprot_val(PAGE_KERNEL); - /* Non-cacheable page cannot be coherent */ - if (flags & _PAGE_NO_CACHE) - flags &= ~_PAGE_COHERENT; - /* We don't support the 4K PFN hack with ioremap */ - if (flags & _PAGE_4K_PFN) + if (flags & H_PAGE_4K_PFN) return NULL; WARN_ON(pa & ~PAGE_MASK); @@ -253,7 +208,7 @@ void __iomem * __ioremap(phys_addr_t addr, unsigned long size, void __iomem * ioremap(phys_addr_t addr, unsigned long size) { - unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED; + unsigned long flags = pgprot_val(pgprot_noncached(__pgprot(0))); void *caller = __builtin_return_address(0); if (ppc_md.ioremap) @@ -263,7 +218,7 @@ void __iomem * ioremap(phys_addr_t addr, unsigned long size) void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size) { - unsigned long flags = _PAGE_NO_CACHE; + unsigned long flags = pgprot_val(pgprot_noncached_wc(__pgprot(0))); void *caller = __builtin_return_address(0); if (ppc_md.ioremap) @@ -277,11 +232,20 @@ void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, void *caller = __builtin_return_address(0); /* writeable implies dirty for kernel addresses */ - if (flags & _PAGE_RW) + if (flags & _PAGE_WRITE) flags |= _PAGE_DIRTY; - /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ - flags &= ~(_PAGE_USER | _PAGE_EXEC); + /* we don't want to let _PAGE_EXEC leak out */ + flags &= ~_PAGE_EXEC; + /* + * Force kernel mapping. + */ +#if defined(CONFIG_PPC_BOOK3S_64) + flags |= _PAGE_PRIVILEGED; +#else + flags &= ~_PAGE_USER; +#endif + #ifdef _PAGE_BAP_SR /* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format @@ -386,8 +350,7 @@ static pte_t *get_from_cache(struct mm_struct *mm) static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel) { void *ret = NULL; - struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | - __GFP_REPEAT | __GFP_ZERO); + struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); if (!page) return NULL; if (!kernel && !pgtable_page_ctor(page)) { @@ -411,7 +374,7 @@ static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel) return (pte_t *)ret; } -pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel) +pte_t *pte_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel) { pte_t *pte; @@ -421,8 +384,9 @@ pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel) return __alloc_for_cache(mm, kernel); } +#endif /* CONFIG_PPC_64K_PAGES */ -void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel) +void pte_fragment_free(unsigned long *table, int kernel) { struct page *page = virt_to_page(table); if (put_page_testzero(page)) { @@ -433,15 +397,6 @@ void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel) } #ifdef CONFIG_SMP -static void page_table_free_rcu(void *table) -{ - struct page *page = virt_to_page(table); - if (put_page_testzero(page)) { - pgtable_page_dtor(page); - free_hot_cold_page(page, 0); - } -} - void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) { unsigned long pgf = (unsigned long)table; @@ -458,7 +413,7 @@ void __tlb_remove_table(void *_table) if (!shift) /* PTE page needs special handling */ - page_table_free_rcu(table); + pte_fragment_free(table, 0); else { BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); kmem_cache_free(PGT_CACHE(shift), table); @@ -469,385 +424,10 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) { if (!shift) { /* PTE page needs special handling */ - struct page *page = virt_to_page(table); - if (put_page_testzero(page)) { - pgtable_page_dtor(page); - free_hot_cold_page(page, 0); - } + pte_fragment_free(table, 0); } else { BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); kmem_cache_free(PGT_CACHE(shift), table); } } #endif -#endif /* CONFIG_PPC_64K_PAGES */ - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - -/* - * This is called when relaxing access to a hugepage. It's also called in the page - * fault path when we don't hit any of the major fault cases, ie, a minor - * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have - * handled those two for us, we additionally deal with missing execute - * permission here on some processors - */ -int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp, pmd_t entry, int dirty) -{ - int changed; -#ifdef CONFIG_DEBUG_VM - WARN_ON(!pmd_trans_huge(*pmdp)); - assert_spin_locked(&vma->vm_mm->page_table_lock); -#endif - changed = !pmd_same(*(pmdp), entry); - if (changed) { - __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry)); - /* - * Since we are not supporting SW TLB systems, we don't - * have any thing similar to flush_tlb_page_nohash() - */ - } - return changed; -} - -unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, unsigned long clr, - unsigned long set) -{ - - unsigned long old, tmp; - -#ifdef CONFIG_DEBUG_VM - WARN_ON(!pmd_trans_huge(*pmdp)); - assert_spin_locked(&mm->page_table_lock); -#endif - -#ifdef PTE_ATOMIC_UPDATES - __asm__ __volatile__( - "1: ldarx %0,0,%3\n\ - andi. %1,%0,%6\n\ - bne- 1b \n\ - andc %1,%0,%4 \n\ - or %1,%1,%7\n\ - stdcx. %1,0,%3 \n\ - bne- 1b" - : "=&r" (old), "=&r" (tmp), "=m" (*pmdp) - : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY), "r" (set) - : "cc" ); -#else - old = pmd_val(*pmdp); - *pmdp = __pmd((old & ~clr) | set); -#endif - trace_hugepage_update(addr, old, clr, set); - if (old & _PAGE_HASHPTE) - hpte_do_hugepage_flush(mm, addr, pmdp, old); - return old; -} - -pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) -{ - pmd_t pmd; - - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(pmd_trans_huge(*pmdp)); - - pmd = *pmdp; - pmd_clear(pmdp); - /* - * Wait for all pending hash_page to finish. This is needed - * in case of subpage collapse. When we collapse normal pages - * to hugepage, we first clear the pmd, then invalidate all - * the PTE entries. The assumption here is that any low level - * page fault will see a none pmd and take the slow path that - * will wait on mmap_sem. But we could very well be in a - * hash_page with local ptep pointer value. Such a hash page - * can result in adding new HPTE entries for normal subpages. - * That means we could be modifying the page content as we - * copy them to a huge page. So wait for parallel hash_page - * to finish before invalidating HPTE entries. We can do this - * by sending an IPI to all the cpus and executing a dummy - * function there. - */ - kick_all_cpus_sync(); - /* - * Now invalidate the hpte entries in the range - * covered by pmd. This make sure we take a - * fault and will find the pmd as none, which will - * result in a major fault which takes mmap_sem and - * hence wait for collapse to complete. Without this - * the __collapse_huge_page_copy can result in copying - * the old content. - */ - flush_tlb_pmd_range(vma->vm_mm, &pmd, address); - return pmd; -} - -int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); -} - -/* - * We currently remove entries from the hashtable regardless of whether - * the entry was young or dirty. The generic routines only flush if the - * entry was young or dirty which is not good enough. - * - * We should be more intelligent about this but for the moment we override - * these functions and force a tlb flush unconditionally - */ -int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); -} - -/* - * We want to put the pgtable in pmd and use pgtable for tracking - * the base page size hptes - */ -void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, - pgtable_t pgtable) -{ - pgtable_t *pgtable_slot; - assert_spin_locked(&mm->page_table_lock); - /* - * we store the pgtable in the second half of PMD - */ - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - *pgtable_slot = pgtable; - /* - * expose the deposited pgtable to other cpus. - * before we set the hugepage PTE at pmd level - * hash fault code looks at the deposted pgtable - * to store hash index values. - */ - smp_wmb(); -} - -pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) -{ - pgtable_t pgtable; - pgtable_t *pgtable_slot; - - assert_spin_locked(&mm->page_table_lock); - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - pgtable = *pgtable_slot; - /* - * Once we withdraw, mark the entry NULL. - */ - *pgtable_slot = NULL; - /* - * We store HPTE information in the deposited PTE fragment. - * zero out the content on withdraw. - */ - memset(pgtable, 0, PTE_FRAG_SIZE); - return pgtable; -} - -void pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(REGION_ID(address) != USER_REGION_ID); - - /* - * We can't mark the pmd none here, because that will cause a race - * against exit_mmap. We need to continue mark pmd TRANS HUGE, while - * we spilt, but at the same time we wan't rest of the ppc64 code - * not to insert hash pte on this, because we will be modifying - * the deposited pgtable in the caller of this function. Hence - * clear the _PAGE_USER so that we move the fault handling to - * higher level function and that will serialize against ptl. - * We need to flush existing hash pte entries here even though, - * the translation is still valid, because we will withdraw - * pgtable_t after this. - */ - pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_USER, 0); -} - - -/* - * set a new huge pmd. We should not be called for updating - * an existing pmd entry. That should go via pmd_hugepage_update. - */ -void set_pmd_at(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd) -{ -#ifdef CONFIG_DEBUG_VM - WARN_ON((pmd_val(*pmdp) & (_PAGE_PRESENT | _PAGE_USER)) == - (_PAGE_PRESENT | _PAGE_USER)); - assert_spin_locked(&mm->page_table_lock); - WARN_ON(!pmd_trans_huge(pmd)); -#endif - trace_hugepage_set_pmd(addr, pmd_val(pmd)); - return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); -} - -/* - * We use this to invalidate a pmdp entry before switching from a - * hugepte to regular pmd entry. - */ -void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) -{ - pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); - - /* - * This ensures that generic code that rely on IRQ disabling - * to prevent a parallel THP split work as expected. - */ - kick_all_cpus_sync(); -} - -/* - * A linux hugepage PMD was changed and the corresponding hash table entries - * neesd to be flushed. - */ -void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, unsigned long old_pmd) -{ - int ssize; - unsigned int psize; - unsigned long vsid; - unsigned long flags = 0; - const struct cpumask *tmp; - - /* get the base page size,vsid and segment size */ -#ifdef CONFIG_DEBUG_VM - psize = get_slice_psize(mm, addr); - BUG_ON(psize == MMU_PAGE_16M); -#endif - if (old_pmd & _PAGE_COMBO) - psize = MMU_PAGE_4K; - else - psize = MMU_PAGE_64K; - - if (!is_kernel_addr(addr)) { - ssize = user_segment_size(addr); - vsid = get_vsid(mm->context.id, addr, ssize); - WARN_ON(vsid == 0); - } else { - vsid = get_kernel_vsid(addr, mmu_kernel_ssize); - ssize = mmu_kernel_ssize; - } - - tmp = cpumask_of(smp_processor_id()); - if (cpumask_equal(mm_cpumask(mm), tmp)) - flags |= HPTE_LOCAL_UPDATE; - - return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags); -} - -static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) -{ - return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); -} - -pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) -{ - unsigned long pmdv; - - pmdv = (pfn << PTE_RPN_SHIFT) & PTE_RPN_MASK; - return pmd_set_protbits(__pmd(pmdv), pgprot); -} - -pmd_t mk_pmd(struct page *page, pgprot_t pgprot) -{ - return pfn_pmd(page_to_pfn(page), pgprot); -} - -pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) -{ - unsigned long pmdv; - - pmdv = pmd_val(pmd); - pmdv &= _HPAGE_CHG_MASK; - return pmd_set_protbits(__pmd(pmdv), newprot); -} - -/* - * This is called at the end of handling a user page fault, when the - * fault has been handled by updating a HUGE PMD entry in the linux page tables. - * We use it to preload an HPTE into the hash table corresponding to - * the updated linux HUGE PMD entry. - */ -void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd) -{ - return; -} - -pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) -{ - pmd_t old_pmd; - pgtable_t pgtable; - unsigned long old; - pgtable_t *pgtable_slot; - - old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); - old_pmd = __pmd(old); - /* - * We have pmd == none and we are holding page_table_lock. - * So we can safely go and clear the pgtable hash - * index info. - */ - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; - pgtable = *pgtable_slot; - /* - * Let's zero out old valid and hash index details - * hash fault look at them. - */ - memset(pgtable, 0, PTE_FRAG_SIZE); - /* - * Serialize against find_linux_pte_or_hugepte which does lock-less - * lookup in page tables with local interrupts disabled. For huge pages - * it casts pmd_t to pte_t. Since format of pte_t is different from - * pmd_t we want to prevent transit from pmd pointing to page table - * to pmd pointing to huge page (and back) while interrupts are disabled. - * We clear pmd to possibly replace it with page table pointer in - * different code paths. So make sure we wait for the parallel - * find_linux_pte_or_hugepage to finish. - */ - kick_all_cpus_sync(); - return old_pmd; -} - -int has_transparent_hugepage(void) -{ - - BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) >= MAX_ORDER, - "hugepages can't be allocated by the buddy allocator"); - - BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) < 2, - "We need more than 2 pages to do deferred thp split"); - - if (!mmu_has_feature(MMU_FTR_16M_PAGE)) - return 0; - /* - * We support THP only if PMD_SIZE is 16MB. - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) - return 0; - /* - * We need to make sure that we support 16MB hugepage in a segement - * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE - * of 64K. - */ - /* - * If we have 64K HPTE, we will be using that by default - */ - if (mmu_psize_defs[MMU_PAGE_64K].shift && - (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1)) - return 0; - /* - * Ok we only have 4K HPTE - */ - if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1) - return 0; - - return 1; -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 825b68733..48fc28bab 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -32,7 +32,6 @@ enum slb_index { }; extern void slb_allocate_realmode(unsigned long ea); -extern void slb_allocate_user(unsigned long ea); static void slb_allocate(unsigned long ea) { diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index 736d18b3c..dfdb90cb4 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -35,7 +35,7 @@ _GLOBAL(slb_allocate_realmode) * check for bad kernel/user address * (ea & ~REGION_MASK) >= PGTABLE_RANGE */ - rldicr. r9,r3,4,(63 - PGTABLE_EADDR_SIZE - 4) + rldicr. r9,r3,4,(63 - H_PGTABLE_EADDR_SIZE - 4) bne- 8f srdi r9,r3,60 /* get region */ @@ -91,7 +91,7 @@ slb_miss_kernel_load_vmemmap: * can be demoted from 64K -> 4K dynamically on some machines */ clrldi r11,r10,48 - cmpldi r11,(VMALLOC_SIZE >> 28) - 1 + cmpldi r11,(H_VMALLOC_SIZE >> 28) - 1 bgt 5f lhz r11,PACAVMALLOCSLLP(r13) b 6f @@ -179,56 +179,6 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) li r11,SLB_VSID_USER /* flags don't much matter */ b slb_finish_load -#ifdef __DISABLED__ - -/* void slb_allocate_user(unsigned long ea); - * - * Create an SLB entry for the given EA (user or kernel). - * r3 = faulting address, r13 = PACA - * r9, r10, r11 are clobbered by this function - * No other registers are examined or changed. - * - * It is called with translation enabled in order to be able to walk the - * page tables. This is not currently used. - */ -_GLOBAL(slb_allocate_user) - /* r3 = faulting address */ - srdi r10,r3,28 /* get esid */ - - crset 4*cr7+lt /* set "user" flag for later */ - - /* check if we fit in the range covered by the pagetables*/ - srdi. r9,r3,PGTABLE_EADDR_SIZE - crnot 4*cr0+eq,4*cr0+eq - beqlr - - /* now we need to get to the page tables in order to get the page - * size encoding from the PMD. In the future, we'll be able to deal - * with 1T segments too by getting the encoding from the PGD instead - */ - ld r9,PACAPGDIR(r13) - cmpldi cr0,r9,0 - beqlr - rlwinm r11,r10,8,25,28 - ldx r9,r9,r11 /* get pgd_t */ - cmpldi cr0,r9,0 - beqlr - rlwinm r11,r10,3,17,28 - ldx r9,r9,r11 /* get pmd_t */ - cmpldi cr0,r9,0 - beqlr - - /* build vsid flags */ - andi. r11,r9,SLB_VSID_LLP - ori r11,r11,SLB_VSID_USER - - /* get context to calculate proto-VSID */ - ld r9,PACACONTEXTID(r13) - /* fall through slb_finish_load */ - -#endif /* __DISABLED__ */ - - /* * Finish loading of an SLB entry and return * diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 42954f0b4..2b2745890 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -37,8 +37,8 @@ #include <asm/hugetlb.h> /* some sanity checks */ -#if (PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE -#error PGTABLE_RANGE exceeds slice_mask high_slices size +#if (H_PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE +#error H_PGTABLE_RANGE exceeds slice_mask high_slices size #endif static DEFINE_SPINLOCK(slice_convert_lock); @@ -395,6 +395,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* Sanity checks */ BUG_ON(mm->task_size == 0); + VM_BUG_ON(radix_enabled()); slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize); slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n", @@ -568,6 +569,16 @@ unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) unsigned char *hpsizes; int index, mask_index; + /* + * Radix doesn't use slice, but can get enabled along with MMU_SLICE + */ + if (radix_enabled()) { +#ifdef CONFIG_PPC_64K_PAGES + return MMU_PAGE_64K; +#else + return MMU_PAGE_4K; +#endif + } if (addr < SLICE_LOW_TOP) { u64 lpsizes; lpsizes = mm->context.low_slices_psize; @@ -605,6 +616,7 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize) slice_dbg("slice_set_user_psize(mm=%p, psize=%d)\n", mm, psize); + VM_BUG_ON(radix_enabled()); spin_lock_irqsave(&slice_convert_lock, flags); old_psize = mm->context.user_psize; @@ -649,6 +661,7 @@ void slice_set_range_psize(struct mm_struct *mm, unsigned long start, { struct slice_mask mask = slice_range_to_mask(start, len); + VM_BUG_ON(radix_enabled()); slice_convert(mm, mask, psize); } @@ -678,6 +691,9 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, struct slice_mask mask, available; unsigned int psize = mm->context.user_psize; + if (radix_enabled()) + return 0; + mask = slice_range_to_mask(addr, len); available = slice_mask_for_size(mm, psize); #ifdef CONFIG_PPC_64K_PAGES diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c new file mode 100644 index 000000000..ab2f60e81 --- /dev/null +++ b/arch/powerpc/mm/tlb-radix.c @@ -0,0 +1,293 @@ +/* + * TLB flush routines for radix kernels. + * + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/memblock.h> + +#include <asm/tlb.h> +#include <asm/tlbflush.h> + +static DEFINE_RAW_SPINLOCK(native_tlbie_lock); + +#define RIC_FLUSH_TLB 0 +#define RIC_FLUSH_PWC 1 +#define RIC_FLUSH_ALL 2 + +static inline void __tlbiel_pid(unsigned long pid, int set, + unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(53); /* IS = 1 */ + rb |= set << PPC_BITLSHIFT(51); + rs = ((unsigned long)pid) << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* raidx format */ + + asm volatile("ptesync": : :"memory"); + asm volatile(".long 0x7c000224 | (%0 << 11) | (%1 << 16) |" + "(%2 << 17) | (%3 << 18) | (%4 << 21)" + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + asm volatile("ptesync": : :"memory"); +} + +/* + * We use 128 set in radix mode and 256 set in hpt mode. + */ +static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) +{ + int set; + + for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) { + __tlbiel_pid(pid, set, ric); + } + return; +} + +static inline void _tlbie_pid(unsigned long pid, unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(53); /* IS = 1 */ + rs = pid << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* raidx format */ + + asm volatile("ptesync": : :"memory"); + asm volatile(".long 0x7c000264 | (%0 << 11) | (%1 << 16) |" + "(%2 << 17) | (%3 << 18) | (%4 << 21)" + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +static inline void _tlbiel_va(unsigned long va, unsigned long pid, + unsigned long ap, unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = va & ~(PPC_BITMASK(52, 63)); + rb |= ap << PPC_BITLSHIFT(58); + rs = pid << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* raidx format */ + + asm volatile("ptesync": : :"memory"); + asm volatile(".long 0x7c000224 | (%0 << 11) | (%1 << 16) |" + "(%2 << 17) | (%3 << 18) | (%4 << 21)" + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + asm volatile("ptesync": : :"memory"); +} + +static inline void _tlbie_va(unsigned long va, unsigned long pid, + unsigned long ap, unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = va & ~(PPC_BITMASK(52, 63)); + rb |= ap << PPC_BITLSHIFT(58); + rs = pid << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* raidx format */ + + asm volatile("ptesync": : :"memory"); + asm volatile(".long 0x7c000264 | (%0 << 11) | (%1 << 16) |" + "(%2 << 17) | (%3 << 18) | (%4 << 21)" + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + +/* + * Base TLB flushing operations: + * + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes kernel pages + * + * - local_* variants of page and mm only apply to the current + * processor + */ +void radix__local_flush_tlb_mm(struct mm_struct *mm) +{ + unsigned long pid; + + preempt_disable(); + pid = mm->context.id; + if (pid != MMU_NO_CONTEXT) + _tlbiel_pid(pid, RIC_FLUSH_ALL); + preempt_enable(); +} +EXPORT_SYMBOL(radix__local_flush_tlb_mm); + +void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) +{ + unsigned long pid; + struct mm_struct *mm = tlb->mm; + + preempt_disable(); + + pid = mm->context.id; + if (pid != MMU_NO_CONTEXT) + _tlbiel_pid(pid, RIC_FLUSH_PWC); + + preempt_enable(); +} +EXPORT_SYMBOL(radix__local_flush_tlb_pwc); + +void radix___local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + unsigned long ap, int nid) +{ + unsigned long pid; + + preempt_disable(); + pid = mm ? mm->context.id : 0; + if (pid != MMU_NO_CONTEXT) + _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB); + preempt_enable(); +} + +void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ +#ifdef CONFIG_HUGETLB_PAGE + /* need the return fix for nohash.c */ + if (vma && is_vm_hugetlb_page(vma)) + return __local_flush_hugetlb_page(vma, vmaddr); +#endif + radix___local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, + mmu_get_ap(mmu_virtual_psize), 0); +} +EXPORT_SYMBOL(radix__local_flush_tlb_page); + +#ifdef CONFIG_SMP +static int mm_is_core_local(struct mm_struct *mm) +{ + return cpumask_subset(mm_cpumask(mm), + topology_sibling_cpumask(smp_processor_id())); +} + +void radix__flush_tlb_mm(struct mm_struct *mm) +{ + unsigned long pid; + + preempt_disable(); + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + goto no_context; + + if (!mm_is_core_local(mm)) { + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + raw_spin_lock(&native_tlbie_lock); + _tlbie_pid(pid, RIC_FLUSH_ALL); + if (lock_tlbie) + raw_spin_unlock(&native_tlbie_lock); + } else + _tlbiel_pid(pid, RIC_FLUSH_ALL); +no_context: + preempt_enable(); +} +EXPORT_SYMBOL(radix__flush_tlb_mm); + +void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) +{ + unsigned long pid; + struct mm_struct *mm = tlb->mm; + + preempt_disable(); + + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + goto no_context; + + if (!mm_is_core_local(mm)) { + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + raw_spin_lock(&native_tlbie_lock); + _tlbie_pid(pid, RIC_FLUSH_PWC); + if (lock_tlbie) + raw_spin_unlock(&native_tlbie_lock); + } else + _tlbiel_pid(pid, RIC_FLUSH_PWC); +no_context: + preempt_enable(); +} +EXPORT_SYMBOL(radix__flush_tlb_pwc); + +void radix___flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + unsigned long ap, int nid) +{ + unsigned long pid; + + preempt_disable(); + pid = mm ? mm->context.id : 0; + if (unlikely(pid == MMU_NO_CONTEXT)) + goto bail; + if (!mm_is_core_local(mm)) { + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + raw_spin_lock(&native_tlbie_lock); + _tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB); + if (lock_tlbie) + raw_spin_unlock(&native_tlbie_lock); + } else + _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB); +bail: + preempt_enable(); +} + +void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ +#ifdef CONFIG_HUGETLB_PAGE + if (vma && is_vm_hugetlb_page(vma)) + return flush_hugetlb_page(vma, vmaddr); +#endif + radix___flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, + mmu_get_ap(mmu_virtual_psize), 0); +} +EXPORT_SYMBOL(radix__flush_tlb_page); + +#endif /* CONFIG_SMP */ + +void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + raw_spin_lock(&native_tlbie_lock); + _tlbie_pid(0, RIC_FLUSH_ALL); + if (lock_tlbie) + raw_spin_unlock(&native_tlbie_lock); +} +EXPORT_SYMBOL(radix__flush_tlb_kernel_range); + +/* + * Currently, for range flushing, we just do a full mm flush. Because + * we use this in code path where we don' track the page size. + */ +void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) + +{ + struct mm_struct *mm = vma->vm_mm; + radix__flush_tlb_mm(mm); +} +EXPORT_SYMBOL(radix__flush_tlb_range); + + +void radix__tlb_flush(struct mmu_gather *tlb) +{ + struct mm_struct *mm = tlb->mm; + radix__flush_tlb_mm(mm); +} diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index f7b80391b..4517aa43a 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -155,7 +155,7 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch) batch->index = 0; } -void tlb_flush(struct mmu_gather *tlb) +void hash__tlb_flush(struct mmu_gather *tlb) { struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch); @@ -218,7 +218,7 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, pte = pte_val(*ptep); if (is_thp) trace_hugepage_invalidate(start, pte); - if (!(pte & _PAGE_HASHPTE)) + if (!(pte & H_PAGE_HASHPTE)) continue; if (unlikely(is_thp)) hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte); @@ -248,7 +248,7 @@ void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) start_pte = pte_offset_map(pmd, addr); for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) { unsigned long pteval = pte_val(*pte); - if (pteval & _PAGE_HASHPTE) + if (pteval & H_PAGE_HASHPTE) hpte_need_flush(mm, addr, pte, pteval, 0); addr += PAGE_SIZE; } diff --git a/arch/powerpc/perf/Makefile b/arch/powerpc/perf/Makefile index f9c083a56..77b6394a7 100644 --- a/arch/powerpc/perf/Makefile +++ b/arch/powerpc/perf/Makefile @@ -1,6 +1,6 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror -obj-$(CONFIG_PERF_EVENTS) += callchain.o +obj-$(CONFIG_PERF_EVENTS) += callchain.o perf_regs.o obj-$(CONFIG_PPC_PERF_CTRS) += core-book3s.o bhrb.o obj64-$(CONFIG_PPC_PERF_CTRS) += power4-pmu.o ppc970-pmu.o power5-pmu.o \ diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index e04a6752b..0fc267147 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -47,7 +47,7 @@ static int valid_next_sp(unsigned long sp, unsigned long prev_sp) } void -perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { unsigned long sp, next_sp; unsigned long next_ip; @@ -76,7 +76,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) next_ip = regs->nip; lr = regs->link; level = 0; - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); + perf_callchain_store_context(entry, PERF_CONTEXT_KERNEL); } else { if (level == 0) @@ -137,7 +137,7 @@ static int read_user_stack_slow(void __user *ptr, void *buf, int nb) offset = addr & ((1UL << shift) - 1); pte = READ_ONCE(*ptep); - if (!pte_present(pte) || !(pte_val(pte) & _PAGE_USER)) + if (!pte_present(pte) || !pte_user(pte)) goto err_out; pfn = pte_pfn(pte); if (!page_is_ram(pfn)) @@ -232,7 +232,7 @@ static int sane_signal_64_frame(unsigned long sp) puc == (unsigned long) &sf->uc; } -static void perf_callchain_user_64(struct perf_callchain_entry *entry, +static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { unsigned long sp, next_sp; @@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry, sp = regs->gpr[1]; perf_callchain_store(entry, next_ip); - while (entry->nr < PERF_MAX_STACK_DEPTH) { + while (entry->nr < entry->max_stack) { fp = (unsigned long __user *) sp; if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp)) return; @@ -274,7 +274,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry, read_user_stack_64(&uregs[PT_R1], &sp)) return; level = 0; - perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_store_context(entry, PERF_CONTEXT_USER); perf_callchain_store(entry, next_ip); continue; } @@ -319,7 +319,7 @@ static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) return rc; } -static inline void perf_callchain_user_64(struct perf_callchain_entry *entry, +static inline void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } @@ -439,7 +439,7 @@ static unsigned int __user *signal_frame_32_regs(unsigned int sp, return mctx->mc_gregs; } -static void perf_callchain_user_32(struct perf_callchain_entry *entry, +static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { unsigned int sp, next_sp; @@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry, sp = regs->gpr[1]; perf_callchain_store(entry, next_ip); - while (entry->nr < PERF_MAX_STACK_DEPTH) { + while (entry->nr < entry->max_stack) { fp = (unsigned int __user *) (unsigned long) sp; if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp)) return; @@ -473,7 +473,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry, read_user_stack_32(&uregs[PT_R1], &sp)) return; level = 0; - perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_store_context(entry, PERF_CONTEXT_USER); perf_callchain_store(entry, next_ip); continue; } @@ -487,7 +487,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry, } void -perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) +perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { if (current_is_64bit()) perf_callchain_user_64(entry, regs); diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c new file mode 100644 index 000000000..d24a8a366 --- /dev/null +++ b/arch/powerpc/perf/perf_regs.c @@ -0,0 +1,104 @@ +/* + * Copyright 2016 Anju T, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/perf_event.h> +#include <linux/bug.h> +#include <linux/stddef.h> +#include <asm/ptrace.h> +#include <asm/perf_regs.h> + +#define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r) + +#define REG_RESERVED (~((1ULL << PERF_REG_POWERPC_MAX) - 1)) + +static unsigned int pt_regs_offset[PERF_REG_POWERPC_MAX] = { + PT_REGS_OFFSET(PERF_REG_POWERPC_R0, gpr[0]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R1, gpr[1]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R2, gpr[2]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R3, gpr[3]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R4, gpr[4]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R5, gpr[5]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R6, gpr[6]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R7, gpr[7]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R8, gpr[8]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R9, gpr[9]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R10, gpr[10]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R11, gpr[11]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R12, gpr[12]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R13, gpr[13]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R14, gpr[14]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R15, gpr[15]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R16, gpr[16]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R17, gpr[17]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R18, gpr[18]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R19, gpr[19]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R20, gpr[20]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R21, gpr[21]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R22, gpr[22]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R23, gpr[23]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R24, gpr[24]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R25, gpr[25]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R26, gpr[26]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R27, gpr[27]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R28, gpr[28]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R29, gpr[29]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R30, gpr[30]), + PT_REGS_OFFSET(PERF_REG_POWERPC_R31, gpr[31]), + PT_REGS_OFFSET(PERF_REG_POWERPC_NIP, nip), + PT_REGS_OFFSET(PERF_REG_POWERPC_MSR, msr), + PT_REGS_OFFSET(PERF_REG_POWERPC_ORIG_R3, orig_gpr3), + PT_REGS_OFFSET(PERF_REG_POWERPC_CTR, ctr), + PT_REGS_OFFSET(PERF_REG_POWERPC_LINK, link), + PT_REGS_OFFSET(PERF_REG_POWERPC_XER, xer), + PT_REGS_OFFSET(PERF_REG_POWERPC_CCR, ccr), +#ifdef CONFIG_PPC64 + PT_REGS_OFFSET(PERF_REG_POWERPC_SOFTE, softe), +#else + PT_REGS_OFFSET(PERF_REG_POWERPC_SOFTE, mq), +#endif + PT_REGS_OFFSET(PERF_REG_POWERPC_TRAP, trap), + PT_REGS_OFFSET(PERF_REG_POWERPC_DAR, dar), + PT_REGS_OFFSET(PERF_REG_POWERPC_DSISR, dsisr), +}; + +u64 perf_reg_value(struct pt_regs *regs, int idx) +{ + if (WARN_ON_ONCE(idx >= PERF_REG_POWERPC_MAX)) + return 0; + + return regs_get_register(regs, pt_regs_offset[idx]); +} + +int perf_reg_validate(u64 mask) +{ + if (!mask || mask & REG_RESERVED) + return -EINVAL; + return 0; +} + +u64 perf_reg_abi(struct task_struct *task) +{ +#ifdef CONFIG_PPC64 + if (!test_tsk_thread_flag(task, TIF_32BIT)) + return PERF_SAMPLE_REGS_ABI_64; + else +#endif + return PERF_SAMPLE_REGS_ABI_32; +} + +void perf_get_regs_user(struct perf_regs *regs_user, + struct pt_regs *regs, + struct pt_regs *regs_user_copy) +{ + regs_user->regs = task_pt_regs(current); + regs_user->abi = perf_reg_abi(current); +} diff --git a/arch/powerpc/perf/power8-events-list.h b/arch/powerpc/perf/power8-events-list.h index 741b77edd..3a2e6e8eb 100644 --- a/arch/powerpc/perf/power8-events-list.h +++ b/arch/powerpc/perf/power8-events-list.h @@ -49,3 +49,43 @@ EVENT(PM_L3_PREF_ALL, 0x4e052) EVENT(PM_DTLB_MISS, 0x300fc) /* ITLB Reloaded */ EVENT(PM_ITLB_MISS, 0x400fc) +/* Run_Instructions */ +EVENT(PM_RUN_INST_CMPL, 0x500fa) +/* Alternate event code for PM_RUN_INST_CMPL */ +EVENT(PM_RUN_INST_CMPL_ALT, 0x400fa) +/* Run_cycles */ +EVENT(PM_RUN_CYC, 0x600f4) +/* Alternate event code for Run_cycles */ +EVENT(PM_RUN_CYC_ALT, 0x200f4) +/* Marked store completed */ +EVENT(PM_MRK_ST_CMPL, 0x10134) +/* Alternate event code for Marked store completed */ +EVENT(PM_MRK_ST_CMPL_ALT, 0x301e2) +/* Marked two path branch */ +EVENT(PM_BR_MRK_2PATH, 0x10138) +/* Alternate event code for PM_BR_MRK_2PATH */ +EVENT(PM_BR_MRK_2PATH_ALT, 0x40138) +/* L3 castouts in Mepf state */ +EVENT(PM_L3_CO_MEPF, 0x18082) +/* Alternate event code for PM_L3_CO_MEPF */ +EVENT(PM_L3_CO_MEPF_ALT, 0x3e05e) +/* Data cache was reloaded from a location other than L2 due to a marked load */ +EVENT(PM_MRK_DATA_FROM_L2MISS, 0x1d14e) +/* Alternate event code for PM_MRK_DATA_FROM_L2MISS */ +EVENT(PM_MRK_DATA_FROM_L2MISS_ALT, 0x401e8) +/* Alternate event code for PM_CMPLU_STALL */ +EVENT(PM_CMPLU_STALL_ALT, 0x1e054) +/* Two path branch */ +EVENT(PM_BR_2PATH, 0x20036) +/* Alternate event code for PM_BR_2PATH */ +EVENT(PM_BR_2PATH_ALT, 0x40036) +/* # PPC Dispatched */ +EVENT(PM_INST_DISP, 0x200f2) +/* Alternate event code for PM_INST_DISP */ +EVENT(PM_INST_DISP_ALT, 0x300f2) +/* Marked filter Match */ +EVENT(PM_MRK_FILT_MATCH, 0x2013c) +/* Alternate event code for PM_MRK_FILT_MATCH */ +EVENT(PM_MRK_FILT_MATCH_ALT, 0x3012e) +/* Alternate event code for PM_LD_MISS_L1 */ +EVENT(PM_LD_MISS_L1_ALT, 0x400f0) diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c index 690d9186a..7cf3b4378 100644 --- a/arch/powerpc/perf/power8-pmu.c +++ b/arch/powerpc/perf/power8-pmu.c @@ -274,7 +274,8 @@ static int power8_get_constraint(u64 event, unsigned long *maskp, unsigned long /* Ignore Linux defined bits when checking event below */ base_event = event & ~EVENT_LINUX_MASK; - if (pmc >= 5 && base_event != 0x500fa && base_event != 0x600f4) + if (pmc >= 5 && base_event != PM_RUN_INST_CMPL && + base_event != PM_RUN_CYC) return -1; mask |= CNST_PMC_MASK(pmc); @@ -488,17 +489,17 @@ static int power8_compute_mmcr(u64 event[], int n_ev, /* Table of alternatives, sorted by column 0 */ static const unsigned int event_alternatives[][MAX_ALT] = { - { 0x10134, 0x301e2 }, /* PM_MRK_ST_CMPL */ - { 0x10138, 0x40138 }, /* PM_BR_MRK_2PATH */ - { 0x18082, 0x3e05e }, /* PM_L3_CO_MEPF */ - { 0x1d14e, 0x401e8 }, /* PM_MRK_DATA_FROM_L2MISS */ - { 0x1e054, 0x4000a }, /* PM_CMPLU_STALL */ - { 0x20036, 0x40036 }, /* PM_BR_2PATH */ - { 0x200f2, 0x300f2 }, /* PM_INST_DISP */ - { 0x200f4, 0x600f4 }, /* PM_RUN_CYC */ - { 0x2013c, 0x3012e }, /* PM_MRK_FILT_MATCH */ - { 0x3e054, 0x400f0 }, /* PM_LD_MISS_L1 */ - { 0x400fa, 0x500fa }, /* PM_RUN_INST_CMPL */ + { PM_MRK_ST_CMPL, PM_MRK_ST_CMPL_ALT }, + { PM_BR_MRK_2PATH, PM_BR_MRK_2PATH_ALT }, + { PM_L3_CO_MEPF, PM_L3_CO_MEPF_ALT }, + { PM_MRK_DATA_FROM_L2MISS, PM_MRK_DATA_FROM_L2MISS_ALT }, + { PM_CMPLU_STALL_ALT, PM_CMPLU_STALL }, + { PM_BR_2PATH, PM_BR_2PATH_ALT }, + { PM_INST_DISP, PM_INST_DISP_ALT }, + { PM_RUN_CYC_ALT, PM_RUN_CYC }, + { PM_MRK_FILT_MATCH, PM_MRK_FILT_MATCH_ALT }, + { PM_LD_MISS_L1, PM_LD_MISS_L1_ALT }, + { PM_RUN_INST_CMPL_ALT, PM_RUN_INST_CMPL }, }; /* @@ -546,17 +547,17 @@ static int power8_get_alternatives(u64 event, unsigned int flags, u64 alt[]) j = num_alt; for (i = 0; i < num_alt; ++i) { switch (alt[i]) { - case 0x1e: /* PM_CYC */ - alt[j++] = 0x600f4; /* PM_RUN_CYC */ + case PM_CYC: + alt[j++] = PM_RUN_CYC; break; - case 0x600f4: /* PM_RUN_CYC */ - alt[j++] = 0x1e; + case PM_RUN_CYC: + alt[j++] = PM_CYC; break; - case 0x2: /* PM_PPC_CMPL */ - alt[j++] = 0x500fa; /* PM_RUN_INST_CMPL */ + case PM_INST_CMPL: + alt[j++] = PM_RUN_INST_CMPL; break; - case 0x500fa: /* PM_RUN_INST_CMPL */ - alt[j++] = 0x2; /* PM_PPC_CMPL */ + case PM_RUN_INST_CMPL: + alt[j++] = PM_INST_CMPL; break; } } diff --git a/arch/powerpc/platforms/512x/clock-commonclk.c b/arch/powerpc/platforms/512x/clock-commonclk.c index c50ea76ba..6081fbd75 100644 --- a/arch/powerpc/platforms/512x/clock-commonclk.c +++ b/arch/powerpc/platforms/512x/clock-commonclk.c @@ -221,7 +221,7 @@ static bool soc_has_mclk_mux0_canin(void) /* convenience wrappers around the common clk API */ static inline struct clk *mpc512x_clk_fixed(const char *name, int rate) { - return clk_register_fixed_rate(NULL, name, NULL, CLK_IS_ROOT, rate); + return clk_register_fixed_rate(NULL, name, NULL, 0, rate); } static inline struct clk *mpc512x_clk_factor( diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c index 3048e34db..22645a7c6 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c @@ -278,14 +278,9 @@ mpc52xx_gpt_irq_setup(struct mpc52xx_gpt_priv *gpt, struct device_node *node) * GPIOLIB hooks */ #if defined(CONFIG_GPIOLIB) -static inline struct mpc52xx_gpt_priv *gc_to_mpc52xx_gpt(struct gpio_chip *gc) -{ - return container_of(gc, struct mpc52xx_gpt_priv, gc); -} - static int mpc52xx_gpt_gpio_get(struct gpio_chip *gc, unsigned int gpio) { - struct mpc52xx_gpt_priv *gpt = gc_to_mpc52xx_gpt(gc); + struct mpc52xx_gpt_priv *gpt = gpiochip_get_data(gc); return (in_be32(&gpt->regs->status) >> 8) & 1; } @@ -293,7 +288,7 @@ static int mpc52xx_gpt_gpio_get(struct gpio_chip *gc, unsigned int gpio) static void mpc52xx_gpt_gpio_set(struct gpio_chip *gc, unsigned int gpio, int v) { - struct mpc52xx_gpt_priv *gpt = gc_to_mpc52xx_gpt(gc); + struct mpc52xx_gpt_priv *gpt = gpiochip_get_data(gc); unsigned long flags; u32 r; @@ -307,7 +302,7 @@ mpc52xx_gpt_gpio_set(struct gpio_chip *gc, unsigned int gpio, int v) static int mpc52xx_gpt_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio) { - struct mpc52xx_gpt_priv *gpt = gc_to_mpc52xx_gpt(gc); + struct mpc52xx_gpt_priv *gpt = gpiochip_get_data(gc); unsigned long flags; dev_dbg(gpt->dev, "%s: gpio:%d\n", __func__, gpio); @@ -354,9 +349,9 @@ mpc52xx_gpt_gpio_setup(struct mpc52xx_gpt_priv *gpt, struct device_node *node) clrsetbits_be32(&gpt->regs->mode, MPC52xx_GPT_MODE_MS_MASK, MPC52xx_GPT_MODE_MS_GPIO); - rc = gpiochip_add(&gpt->gc); + rc = gpiochip_add_data(&gpt->gc, gpt); if (rc) - dev_err(gpt->dev, "gpiochip_add() failed; rc=%i\n", rc); + dev_err(gpt->dev, "gpiochip_add_data() failed; rc=%i\n", rc); dev_dbg(gpt->dev, "%s() complete.\n", __func__); } diff --git a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c index 15e8021dd..dbcd0303a 100644 --- a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c +++ b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c @@ -16,7 +16,7 @@ #include <linux/device.h> #include <linux/mutex.h> #include <linux/i2c.h> -#include <linux/gpio.h> +#include <linux/gpio/driver.h> #include <linux/of.h> #include <linux/of_gpio.h> #include <linux/slab.h> @@ -99,7 +99,7 @@ static void mcu_power_off(void) static void mcu_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val) { - struct mcu *mcu = container_of(gc, struct mcu, gc); + struct mcu *mcu = gpiochip_get_data(gc); u8 bit = 1 << (4 + gpio); mutex_lock(&mcu->lock); @@ -136,7 +136,7 @@ static int mcu_gpiochip_add(struct mcu *mcu) gc->direction_output = mcu_gpio_dir_out; gc->of_node = np; - return gpiochip_add(gc); + return gpiochip_add_data(gc, mcu); } static int mcu_gpiochip_remove(struct mcu *mcu) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 142dff5e9..77e9b8d59 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -72,7 +72,7 @@ config PPC_BOOK3S_64 select PPC_FPU select PPC_HAVE_PMU_SUPPORT select SYS_SUPPORTS_HUGETLBFS - select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES + select HAVE_ARCH_TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_NUMA_BALANCING select IRQ_WORK @@ -331,6 +331,15 @@ config PPC_STD_MMU_64 def_bool y depends on PPC_STD_MMU && PPC64 +config PPC_RADIX_MMU + bool "Radix MMU Support" + depends on PPC_BOOK3S_64 + default y + help + Enable support for the Power ISA 3.0 Radix style MMU. Currently this + is only implemented by IBM Power9 CPUs, if you don't have one of them + you can probably disable this. + config PPC_MMU_NOHASH def_bool y depends on !PPC_STD_MMU diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c index f7af74f83..3cbe38fad 100644 --- a/arch/powerpc/platforms/cell/spu_base.c +++ b/arch/powerpc/platforms/cell/spu_base.c @@ -24,7 +24,7 @@ #include <linux/interrupt.h> #include <linux/list.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/ptrace.h> #include <linux/slab.h> #include <linux/wait.h> @@ -197,7 +197,7 @@ static int __spu_trap_data_map(struct spu *spu, unsigned long ea, u64 dsisr) (REGION_ID(ea) != USER_REGION_ID)) { spin_unlock(&spu->register_lock); - ret = hash_page(ea, _PAGE_PRESENT, 0x300, dsisr); + ret = hash_page(ea, _PAGE_PRESENT | _PAGE_READ, 0x300, dsisr); spin_lock(&spu->register_lock); if (!ret) { @@ -805,7 +805,4 @@ static int __init init_spu_base(void) out: return ret; } -module_init(init_spu_base); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arnd Bergmann <arndb@de.ibm.com>"); +device_initcall(init_spu_base); diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c index be6212ddb..85c85eb3e 100644 --- a/arch/powerpc/platforms/cell/spufs/coredump.c +++ b/arch/powerpc/platforms/cell/spufs/coredump.c @@ -137,6 +137,7 @@ static int spufs_arch_write_note(struct spu_context *ctx, int i, char *name; char fullname[80], *buf; struct elf_note en; + size_t skip; buf = (void *)get_zeroed_page(GFP_KERNEL); if (!buf) @@ -171,8 +172,8 @@ static int spufs_arch_write_note(struct spu_context *ctx, int i, if (rc < 0) goto out; - if (!dump_skip(cprm, - roundup(cprm->written - total + sz, 4) - cprm->written)) + skip = roundup(cprm->pos - total + sz, 4) - cprm->pos; + if (!dump_skip(cprm, skip)) goto Eio; out: free_page((unsigned long)buf); diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c index d98f845ac..e29e4d5af 100644 --- a/arch/powerpc/platforms/cell/spufs/fault.c +++ b/arch/powerpc/platforms/cell/spufs/fault.c @@ -141,8 +141,8 @@ int spufs_handle_class1(struct spu_context *ctx) /* we must not hold the lock when entering copro_handle_mm_fault */ spu_release(ctx); - access = (_PAGE_PRESENT | _PAGE_USER); - access |= (dsisr & MFC_DSISR_ACCESS_PUT) ? _PAGE_RW : 0UL; + access = (_PAGE_PRESENT | _PAGE_READ); + access |= (dsisr & MFC_DSISR_ACCESS_PUT) ? _PAGE_WRITE : 0UL; local_irq_save(flags); ret = hash_page(ea, access, 0x300, dsisr); local_irq_restore(flags); diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 6ca5f0525..5be15cff7 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -238,7 +238,7 @@ const struct file_operations spufs_context_fops = { .release = spufs_dir_close, .llseek = dcache_dir_lseek, .read = generic_read_dir, - .iterate = dcache_readdir, + .iterate_shared = dcache_readdir, .fsync = noop_fsync, }; EXPORT_SYMBOL_GPL(spufs_context_fops); diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 950b3e539..9226df11b 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -75,7 +75,7 @@ static int pnv_eeh_init(void) * and P7IOC separately. So we should regard * PE#0 as valid for PHB3 and P7IOC. */ - if (phb->ioda.reserved_pe != 0) + if (phb->ioda.reserved_pe_idx != 0) eeh_add_flag(EEH_VALID_PE_ZERO); break; @@ -1009,8 +1009,9 @@ static int pnv_eeh_reset_vf_pe(struct eeh_pe *pe, int option) static int pnv_eeh_reset(struct eeh_pe *pe, int option) { struct pci_controller *hose = pe->phb; + struct pnv_phb *phb; struct pci_bus *bus; - int ret; + int64_t rc; /* * For PHB reset, we always have complete reset. For those PEs whose @@ -1026,45 +1027,39 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option) * reset. The side effect is that EEH core has to clear the frozen * state explicitly after BAR restore. */ - if (pe->type & EEH_PE_PHB) { - ret = pnv_eeh_phb_reset(hose, option); - } else { - struct pnv_phb *phb; - s64 rc; + if (pe->type & EEH_PE_PHB) + return pnv_eeh_phb_reset(hose, option); - /* - * The frozen PE might be caused by PAPR error injection - * registers, which are expected to be cleared after hitting - * frozen PE as stated in the hardware spec. Unfortunately, - * that's not true on P7IOC. So we have to clear it manually - * to avoid recursive EEH errors during recovery. - */ - phb = hose->private_data; - if (phb->model == PNV_PHB_MODEL_P7IOC && - (option == EEH_RESET_HOT || - option == EEH_RESET_FUNDAMENTAL)) { - rc = opal_pci_reset(phb->opal_id, - OPAL_RESET_PHB_ERROR, - OPAL_ASSERT_RESET); - if (rc != OPAL_SUCCESS) { - pr_warn("%s: Failure %lld clearing " - "error injection registers\n", - __func__, rc); - return -EIO; - } + /* + * The frozen PE might be caused by PAPR error injection + * registers, which are expected to be cleared after hitting + * frozen PE as stated in the hardware spec. Unfortunately, + * that's not true on P7IOC. So we have to clear it manually + * to avoid recursive EEH errors during recovery. + */ + phb = hose->private_data; + if (phb->model == PNV_PHB_MODEL_P7IOC && + (option == EEH_RESET_HOT || + option == EEH_RESET_FUNDAMENTAL)) { + rc = opal_pci_reset(phb->opal_id, + OPAL_RESET_PHB_ERROR, + OPAL_ASSERT_RESET); + if (rc != OPAL_SUCCESS) { + pr_warn("%s: Failure %lld clearing error injection registers\n", + __func__, rc); + return -EIO; } - - bus = eeh_pe_bus_get(pe); - if (pe->type & EEH_PE_VF) - ret = pnv_eeh_reset_vf_pe(pe, option); - else if (pci_is_root_bus(bus) || - pci_is_root_bus(bus->parent)) - ret = pnv_eeh_root_reset(hose, option); - else - ret = pnv_eeh_bridge_reset(bus->self, option); } - return ret; + bus = eeh_pe_bus_get(pe); + if (pe->type & EEH_PE_VF) + return pnv_eeh_reset_vf_pe(pe, option); + + if (pci_is_root_bus(bus) || + pci_is_root_bus(bus->parent)) + return pnv_eeh_root_reset(hose, option); + + return pnv_eeh_bridge_reset(bus->self, option); } /** diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 7229acd9b..0459e100b 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -12,6 +12,7 @@ #include <linux/export.h> #include <linux/pci.h> #include <linux/memblock.h> +#include <linux/iommu.h> #include <asm/iommu.h> #include <asm/pnv-pci.h> @@ -25,8 +26,6 @@ * Other types of TCE cache invalidation are not functional in the * hardware. */ -#define TCE_KILL_INVAL_ALL PPC_BIT(0) - static struct pci_dev *get_pci_dev(struct device_node *dn) { return PCI_DN(dn)->pcidev; @@ -138,22 +137,17 @@ static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe, struct pnv_ioda_pe *pe; struct pci_dn *pdn; - if (npe->flags & PNV_IODA_PE_PEER) { - pe = npe->peers[0]; - pdev = pe->pdev; - } else { - pdev = pnv_pci_get_gpu_dev(npe->pdev); - if (!pdev) - return NULL; + pdev = pnv_pci_get_gpu_dev(npe->pdev); + if (!pdev) + return NULL; - pdn = pci_get_pdn(pdev); - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) - return NULL; + pdn = pci_get_pdn(pdev); + if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) + return NULL; - hose = pci_bus_to_host(pdev->bus); - phb = hose->private_data; - pe = &phb->ioda.pe_array[pdn->pe_number]; - } + hose = pci_bus_to_host(pdev->bus); + phb = hose->private_data; + pe = &phb->ioda.pe_array[pdn->pe_number]; if (gpdev) *gpdev = pdev; @@ -161,92 +155,70 @@ static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe, return pe; } -void pnv_npu_tce_invalidate_entire(struct pnv_ioda_pe *npe) +long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, + struct iommu_table *tbl) { struct pnv_phb *phb = npe->phb; + int64_t rc; + const unsigned long size = tbl->it_indirect_levels ? + tbl->it_level_size : tbl->it_size; + const __u64 start_addr = tbl->it_offset << tbl->it_page_shift; + const __u64 win_size = tbl->it_size << tbl->it_page_shift; + + pe_info(npe, "Setting up window %llx..%llx pg=%lx\n", + start_addr, start_addr + win_size - 1, + IOMMU_PAGE_SIZE(tbl)); + + rc = opal_pci_map_pe_dma_window(phb->opal_id, + npe->pe_number, + npe->pe_number, + tbl->it_indirect_levels + 1, + __pa(tbl->it_base), + size << 3, + IOMMU_PAGE_SIZE(tbl)); + if (rc) { + pe_err(npe, "Failed to configure TCE table, err %lld\n", rc); + return rc; + } + pnv_pci_ioda2_tce_invalidate_entire(phb, false); - if (WARN_ON(phb->type != PNV_PHB_NPU || - !phb->ioda.tce_inval_reg || - !(npe->flags & PNV_IODA_PE_DEV))) - return; + /* Add the table to the list so its TCE cache will get invalidated */ + pnv_pci_link_table_and_group(phb->hose->node, num, + tbl, &npe->table_group); - mb(); /* Ensure previous TCE table stores are visible */ - __raw_writeq(cpu_to_be64(TCE_KILL_INVAL_ALL), - phb->ioda.tce_inval_reg); + return 0; } -void pnv_npu_tce_invalidate(struct pnv_ioda_pe *npe, - struct iommu_table *tbl, - unsigned long index, - unsigned long npages, - bool rm) +long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num) { struct pnv_phb *phb = npe->phb; + int64_t rc; - /* We can only invalidate the whole cache on NPU */ - unsigned long val = TCE_KILL_INVAL_ALL; - - if (WARN_ON(phb->type != PNV_PHB_NPU || - !phb->ioda.tce_inval_reg || - !(npe->flags & PNV_IODA_PE_DEV))) - return; - - mb(); /* Ensure previous TCE table stores are visible */ - if (rm) - __raw_rm_writeq(cpu_to_be64(val), - (__be64 __iomem *) phb->ioda.tce_inval_reg_phys); - else - __raw_writeq(cpu_to_be64(val), - phb->ioda.tce_inval_reg); -} - -void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe) -{ - struct pnv_ioda_pe *gpe; - struct pci_dev *gpdev; - int i, avail = -1; - - if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV)) - return; - - gpe = get_gpu_pci_dev_and_pe(npe, &gpdev); - if (!gpe) - return; - - for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) { - /* Nothing to do if the PE is already connected. */ - if (gpe->peers[i] == npe) - return; + pe_info(npe, "Removing DMA window\n"); - if (!gpe->peers[i]) - avail = i; + rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number, + npe->pe_number, + 0/* levels */, 0/* table address */, + 0/* table size */, 0/* page size */); + if (rc) { + pe_err(npe, "Unmapping failed, ret = %lld\n", rc); + return rc; } + pnv_pci_ioda2_tce_invalidate_entire(phb, false); - if (WARN_ON(avail < 0)) - return; - - gpe->peers[avail] = npe; - gpe->flags |= PNV_IODA_PE_PEER; + pnv_pci_unlink_table_and_group(npe->table_group.tables[num], + &npe->table_group); - /* - * We assume that the NPU devices only have a single peer PE - * (the GPU PCIe device PE). - */ - npe->peers[0] = gpe; - npe->flags |= PNV_IODA_PE_PEER; + return 0; } /* - * For the NPU we want to point the TCE table at the same table as the - * real PCI device. + * Enables 32 bit DMA on NPU. */ -static void pnv_npu_disable_bypass(struct pnv_ioda_pe *npe) +static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe) { - struct pnv_phb *phb = npe->phb; struct pci_dev *gpdev; struct pnv_ioda_pe *gpe; - void *addr; - unsigned int size; int64_t rc; /* @@ -260,14 +232,7 @@ static void pnv_npu_disable_bypass(struct pnv_ioda_pe *npe) if (!gpe) return; - addr = (void *)gpe->table_group.tables[0]->it_base; - size = gpe->table_group.tables[0]->it_size << 3; - rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number, - npe->pe_number, 1, __pa(addr), - size, 0x1000); - if (rc != OPAL_SUCCESS) - pr_warn("%s: Error %lld setting DMA window on PHB#%d-PE#%d\n", - __func__, rc, phb->hose->global_number, npe->pe_number); + rc = pnv_npu_set_window(npe, 0, gpe->table_group.tables[0]); /* * We don't initialise npu_pe->tce32_table as we always use @@ -277,72 +242,120 @@ static void pnv_npu_disable_bypass(struct pnv_ioda_pe *npe) } /* - * Enable/disable bypass mode on the NPU. The NPU only supports one + * Enables bypass mode on the NPU. The NPU only supports one * window per link, so bypass needs to be explicitly enabled or * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be * active at the same time. */ -int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe, bool enable) +static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe) { struct pnv_phb *phb = npe->phb; int64_t rc = 0; + phys_addr_t top = memblock_end_of_DRAM(); if (phb->type != PNV_PHB_NPU || !npe->pdev) return -EINVAL; - if (enable) { - /* Enable the bypass window */ - phys_addr_t top = memblock_end_of_DRAM(); - - npe->tce_bypass_base = 0; - top = roundup_pow_of_two(top); - dev_info(&npe->pdev->dev, "Enabling bypass for PE %d\n", - npe->pe_number); - rc = opal_pci_map_pe_dma_window_real(phb->opal_id, - npe->pe_number, npe->pe_number, - npe->tce_bypass_base, top); - } else { - /* - * Disable the bypass window by replacing it with the - * TCE32 window. - */ - pnv_npu_disable_bypass(npe); - } + rc = pnv_npu_unset_window(npe, 0); + if (rc != OPAL_SUCCESS) + return rc; + + /* Enable the bypass window */ + + top = roundup_pow_of_two(top); + dev_info(&npe->pdev->dev, "Enabling bypass for PE %d\n", + npe->pe_number); + rc = opal_pci_map_pe_dma_window_real(phb->opal_id, + npe->pe_number, npe->pe_number, + 0 /* bypass base */, top); + + if (rc == OPAL_SUCCESS) + pnv_pci_ioda2_tce_invalidate_entire(phb, false); return rc; } -int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask) +void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass) { - struct pci_controller *hose = pci_bus_to_host(npdev->bus); - struct pnv_phb *phb = hose->private_data; - struct pci_dn *pdn = pci_get_pdn(npdev); - struct pnv_ioda_pe *npe, *gpe; - struct pci_dev *gpdev; - uint64_t top; - bool bypass = false; + int i; + struct pnv_phb *phb; + struct pci_dn *pdn; + struct pnv_ioda_pe *npe; + struct pci_dev *npdev; - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) - return -ENXIO; + for (i = 0; ; ++i) { + npdev = pnv_pci_get_npu_dev(gpdev, i); - /* We only do bypass if it's enabled on the linked device */ - npe = &phb->ioda.pe_array[pdn->pe_number]; - gpe = get_gpu_pci_dev_and_pe(npe, &gpdev); - if (!gpe) - return -ENODEV; + if (!npdev) + break; + + pdn = pci_get_pdn(npdev); + if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) + return; + + phb = pci_bus_to_host(npdev->bus)->private_data; + + /* We only do bypass if it's enabled on the linked device */ + npe = &phb->ioda.pe_array[pdn->pe_number]; + + if (bypass) { + dev_info(&npdev->dev, + "Using 64-bit DMA iommu bypass\n"); + pnv_npu_dma_set_bypass(npe); + } else { + dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n"); + pnv_npu_dma_set_32(npe); + } + } +} - if (gpe->tce_bypass_enabled) { - top = gpe->tce_bypass_base + memblock_end_of_DRAM() - 1; - bypass = (dma_mask >= top); +/* Switch ownership from platform code to external user (e.g. VFIO) */ +void pnv_npu_take_ownership(struct pnv_ioda_pe *npe) +{ + struct pnv_phb *phb = npe->phb; + int64_t rc; + + /* + * Note: NPU has just a single TVE in the hardware which means that + * while used by the kernel, it can have either 32bit window or + * DMA bypass but never both. So we deconfigure 32bit window only + * if it was enabled at the moment of ownership change. + */ + if (npe->table_group.tables[0]) { + pnv_npu_unset_window(npe, 0); + return; } - if (bypass) - dev_info(&npdev->dev, "Using 64-bit DMA iommu bypass\n"); - else - dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n"); + /* Disable bypass */ + rc = opal_pci_map_pe_dma_window_real(phb->opal_id, + npe->pe_number, npe->pe_number, + 0 /* bypass base */, 0); + if (rc) { + pe_err(npe, "Failed to disable bypass, err %lld\n", rc); + return; + } + pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false); +} - pnv_npu_dma_set_bypass(npe, bypass); - *npdev->dev.dma_mask = dma_mask; +struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe) +{ + struct pnv_phb *phb = npe->phb; + struct pci_bus *pbus = phb->hose->bus; + struct pci_dev *npdev, *gpdev = NULL, *gptmp; + struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev); - return 0; + if (!gpe || !gpdev) + return NULL; + + list_for_each_entry(npdev, &pbus->devices, bus_list) { + gptmp = pnv_pci_get_gpu_dev(npdev); + + if (gptmp != gpdev) + continue; + + pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev)); + iommu_group_add_device(gpe->table_group.group, &npdev->dev); + } + + return gpe; } diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c index d000f4e21..c0a8201cb 100644 --- a/arch/powerpc/platforms/powernv/opal-hmi.c +++ b/arch/powerpc/platforms/powernv/opal-hmi.c @@ -150,15 +150,17 @@ static void print_nx_checkstop_reason(const char *level, static void print_checkstop_reason(const char *level, struct OpalHMIEvent *hmi_evt) { - switch (hmi_evt->u.xstop_error.xstop_type) { + uint8_t type = hmi_evt->u.xstop_error.xstop_type; + switch (type) { case CHECKSTOP_TYPE_CORE: print_core_checkstop_reason(level, hmi_evt); break; case CHECKSTOP_TYPE_NX: print_nx_checkstop_reason(level, hmi_evt); break; - case CHECKSTOP_TYPE_UNKNOWN: - printk("%s Unknown Malfunction Alert.\n", level); + default: + printk("%s Unknown Malfunction Alert of type %d\n", + level, type); break; } } diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index c5baaf3cc..3a5ea8236 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -48,15 +48,16 @@ #include "powernv.h" #include "pci.h" -/* 256M DMA window, 4K TCE pages, 8 bytes TCE */ -#define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) +#define PNV_IODA1_M64_NUM 16 /* Number of M64 BARs */ +#define PNV_IODA1_M64_SEGS 8 /* Segments per M64 BAR */ +#define PNV_IODA1_DMA32_SEGSIZE 0x10000000 #define POWERNV_IOMMU_DEFAULT_LEVELS 1 #define POWERNV_IOMMU_MAX_LEVELS 5 static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl); -static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, +void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...) { struct va_format vaf; @@ -87,13 +88,6 @@ static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, va_end(args); } -#define pe_err(pe, fmt, ...) \ - pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__) -#define pe_warn(pe, fmt, ...) \ - pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__) -#define pe_info(pe, fmt, ...) \ - pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__) - static bool pnv_iommu_bypass_disabled __read_mostly; static int __init iommu_setup(char *str) @@ -122,9 +116,17 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned long flags) (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)); } +static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no) +{ + phb->ioda.pe_array[pe_no].phb = phb; + phb->ioda.pe_array[pe_no].pe_number = pe_no; + + return &phb->ioda.pe_array[pe_no]; +} + static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no) { - if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) { + if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe_num)) { pr_warn("%s: Invalid PE %d on PHB#%x\n", __func__, pe_no, phb->hose->global_number); return; @@ -134,32 +136,31 @@ static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no) pr_debug("%s: PE %d was reserved on PHB#%x\n", __func__, pe_no, phb->hose->global_number); - phb->ioda.pe_array[pe_no].phb = phb; - phb->ioda.pe_array[pe_no].pe_number = pe_no; + pnv_ioda_init_pe(phb, pe_no); } -static int pnv_ioda_alloc_pe(struct pnv_phb *phb) +static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb) { unsigned long pe; do { pe = find_next_zero_bit(phb->ioda.pe_alloc, - phb->ioda.total_pe, 0); - if (pe >= phb->ioda.total_pe) - return IODA_INVALID_PE; + phb->ioda.total_pe_num, 0); + if (pe >= phb->ioda.total_pe_num) + return NULL; } while(test_and_set_bit(pe, phb->ioda.pe_alloc)); - phb->ioda.pe_array[pe].phb = phb; - phb->ioda.pe_array[pe].pe_number = pe; - return pe; + return pnv_ioda_init_pe(phb, pe); } -static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe) +static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe) { - WARN_ON(phb->ioda.pe_array[pe].pdev); + struct pnv_phb *phb = pe->phb; + + WARN_ON(pe->pdev); - memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe)); - clear_bit(pe, phb->ioda.pe_alloc); + memset(pe, 0, sizeof(struct pnv_ioda_pe)); + clear_bit(pe->pe_number, phb->ioda.pe_alloc); } /* The default M64 BAR is shared by all PEs */ @@ -199,13 +200,13 @@ static int pnv_ioda2_init_m64(struct pnv_phb *phb) * expected to be 0 or last one of PE capabicity. */ r = &phb->hose->mem_resources[1]; - if (phb->ioda.reserved_pe == 0) + if (phb->ioda.reserved_pe_idx == 0) r->start += phb->ioda.m64_segsize; - else if (phb->ioda.reserved_pe == (phb->ioda.total_pe - 1)) + else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1)) r->end -= phb->ioda.m64_segsize; else pr_warn(" Cannot strip M64 segment for reserved PE#%d\n", - phb->ioda.reserved_pe); + phb->ioda.reserved_pe_idx); return 0; @@ -219,7 +220,7 @@ fail: return -EIO; } -static void pnv_ioda2_reserve_dev_m64_pe(struct pci_dev *pdev, +static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev, unsigned long *pe_bitmap) { struct pci_controller *hose = pci_bus_to_host(pdev->bus); @@ -246,22 +247,80 @@ static void pnv_ioda2_reserve_dev_m64_pe(struct pci_dev *pdev, } } -static void pnv_ioda2_reserve_m64_pe(struct pci_bus *bus, - unsigned long *pe_bitmap, - bool all) +static int pnv_ioda1_init_m64(struct pnv_phb *phb) +{ + struct resource *r; + int index; + + /* + * There are 16 M64 BARs, each of which has 8 segments. So + * there are as many M64 segments as the maximum number of + * PEs, which is 128. + */ + for (index = 0; index < PNV_IODA1_M64_NUM; index++) { + unsigned long base, segsz = phb->ioda.m64_segsize; + int64_t rc; + + base = phb->ioda.m64_base + + index * PNV_IODA1_M64_SEGS * segsz; + rc = opal_pci_set_phb_mem_window(phb->opal_id, + OPAL_M64_WINDOW_TYPE, index, base, 0, + PNV_IODA1_M64_SEGS * segsz); + if (rc != OPAL_SUCCESS) { + pr_warn(" Error %lld setting M64 PHB#%d-BAR#%d\n", + rc, phb->hose->global_number, index); + goto fail; + } + + rc = opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, index, + OPAL_ENABLE_M64_SPLIT); + if (rc != OPAL_SUCCESS) { + pr_warn(" Error %lld enabling M64 PHB#%d-BAR#%d\n", + rc, phb->hose->global_number, index); + goto fail; + } + } + + /* + * Exclude the segment used by the reserved PE, which + * is expected to be 0 or last supported PE#. + */ + r = &phb->hose->mem_resources[1]; + if (phb->ioda.reserved_pe_idx == 0) + r->start += phb->ioda.m64_segsize; + else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1)) + r->end -= phb->ioda.m64_segsize; + else + WARN(1, "Wrong reserved PE#%d on PHB#%d\n", + phb->ioda.reserved_pe_idx, phb->hose->global_number); + + return 0; + +fail: + for ( ; index >= 0; index--) + opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, index, OPAL_DISABLE_M64); + + return -EIO; +} + +static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus, + unsigned long *pe_bitmap, + bool all) { struct pci_dev *pdev; list_for_each_entry(pdev, &bus->devices, bus_list) { - pnv_ioda2_reserve_dev_m64_pe(pdev, pe_bitmap); + pnv_ioda_reserve_dev_m64_pe(pdev, pe_bitmap); if (all && pdev->subordinate) - pnv_ioda2_reserve_m64_pe(pdev->subordinate, - pe_bitmap, all); + pnv_ioda_reserve_m64_pe(pdev->subordinate, + pe_bitmap, all); } } -static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all) +static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all) { struct pci_controller *hose = pci_bus_to_host(bus); struct pnv_phb *phb = hose->private_data; @@ -271,28 +330,28 @@ static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all) /* Root bus shouldn't use M64 */ if (pci_is_root_bus(bus)) - return IODA_INVALID_PE; + return NULL; /* Allocate bitmap */ - size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); + size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long)); pe_alloc = kzalloc(size, GFP_KERNEL); if (!pe_alloc) { pr_warn("%s: Out of memory !\n", __func__); - return IODA_INVALID_PE; + return NULL; } /* Figure out reserved PE numbers by the PE */ - pnv_ioda2_reserve_m64_pe(bus, pe_alloc, all); + pnv_ioda_reserve_m64_pe(bus, pe_alloc, all); /* * the current bus might not own M64 window and that's all * contributed by its child buses. For the case, we needn't * pick M64 dependent PE#. */ - if (bitmap_empty(pe_alloc, phb->ioda.total_pe)) { + if (bitmap_empty(pe_alloc, phb->ioda.total_pe_num)) { kfree(pe_alloc); - return IODA_INVALID_PE; + return NULL; } /* @@ -301,10 +360,11 @@ static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all) */ master_pe = NULL; i = -1; - while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe, i + 1)) < - phb->ioda.total_pe) { + while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe_num, i + 1)) < + phb->ioda.total_pe_num) { pe = &phb->ioda.pe_array[i]; + phb->ioda.m64_segmap[pe->pe_number] = pe->pe_number; if (!master_pe) { pe->flags |= PNV_IODA_PE_MASTER; INIT_LIST_HEAD(&pe->slaves); @@ -314,10 +374,30 @@ static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all) pe->master = master_pe; list_add_tail(&pe->list, &master_pe->slaves); } + + /* + * P7IOC supports M64DT, which helps mapping M64 segment + * to one particular PE#. However, PHB3 has fixed mapping + * between M64 segment and PE#. In order to have same logic + * for P7IOC and PHB3, we enforce fixed mapping between M64 + * segment and PE# on P7IOC. + */ + if (phb->type == PNV_PHB_IODA1) { + int64_t rc; + + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + pe->pe_number, OPAL_M64_WINDOW_TYPE, + pe->pe_number / PNV_IODA1_M64_SEGS, + pe->pe_number % PNV_IODA1_M64_SEGS); + if (rc != OPAL_SUCCESS) + pr_warn("%s: Error %lld mapping M64 for PHB#%d-PE#%d\n", + __func__, rc, phb->hose->global_number, + pe->pe_number); + } } kfree(pe_alloc); - return master_pe->pe_number; + return master_pe; } static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) @@ -328,8 +408,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) const u32 *r; u64 pci_addr; - /* FIXME: Support M64 for P7IOC */ - if (phb->type != PNV_PHB_IODA2) { + if (phb->type != PNV_PHB_IODA1 && phb->type != PNV_PHB_IODA2) { pr_info(" Not support M64 window\n"); return; } @@ -355,7 +434,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) hose->mem_offset[1] = res->start - pci_addr; phb->ioda.m64_size = resource_size(res); - phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe; + phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe_num; phb->ioda.m64_base = pci_addr; pr_info(" MEM64 0x%016llx..0x%016llx -> 0x%016llx\n", @@ -363,9 +442,12 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) /* Use last M64 BAR to cover M64 window */ phb->ioda.m64_bar_idx = 15; - phb->init_m64 = pnv_ioda2_init_m64; - phb->reserve_m64_pe = pnv_ioda2_reserve_m64_pe; - phb->pick_m64_pe = pnv_ioda2_pick_m64_pe; + if (phb->type == PNV_PHB_IODA1) + phb->init_m64 = pnv_ioda1_init_m64; + else + phb->init_m64 = pnv_ioda2_init_m64; + phb->reserve_m64_pe = pnv_ioda_reserve_m64_pe; + phb->pick_m64_pe = pnv_ioda_pick_m64_pe; } static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no) @@ -456,7 +538,7 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no) s64 rc; /* Sanity check on PE number */ - if (pe_no < 0 || pe_no >= phb->ioda.total_pe) + if (pe_no < 0 || pe_no >= phb->ioda.total_pe_num) return OPAL_EEH_STOPPED_PERM_UNAVAIL; /* @@ -808,44 +890,6 @@ out: return 0; } -static void pnv_ioda_link_pe_by_weight(struct pnv_phb *phb, - struct pnv_ioda_pe *pe) -{ - struct pnv_ioda_pe *lpe; - - list_for_each_entry(lpe, &phb->ioda.pe_dma_list, dma_link) { - if (lpe->dma_weight < pe->dma_weight) { - list_add_tail(&pe->dma_link, &lpe->dma_link); - return; - } - } - list_add_tail(&pe->dma_link, &phb->ioda.pe_dma_list); -} - -static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev) -{ - /* This is quite simplistic. The "base" weight of a device - * is 10. 0 means no DMA is to be accounted for it. - */ - - /* If it's a bridge, no DMA */ - if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) - return 0; - - /* Reduce the weight of slow USB controllers */ - if (dev->class == PCI_CLASS_SERIAL_USB_UHCI || - dev->class == PCI_CLASS_SERIAL_USB_OHCI || - dev->class == PCI_CLASS_SERIAL_USB_EHCI) - return 3; - - /* Increase the weight of RAID (includes Obsidian) */ - if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID) - return 15; - - /* Default */ - return 10; -} - #ifdef CONFIG_PCI_IOV static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) { @@ -919,7 +963,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) struct pnv_phb *phb = hose->private_data; struct pci_dn *pdn = pci_get_pdn(dev); struct pnv_ioda_pe *pe; - int pe_num; if (!pdn) { pr_err("%s: Device tree node not associated properly\n", @@ -929,8 +972,8 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) if (pdn->pe_number != IODA_INVALID_PE) return NULL; - pe_num = pnv_ioda_alloc_pe(phb); - if (pe_num == IODA_INVALID_PE) { + pe = pnv_ioda_alloc_pe(phb); + if (!pe) { pr_warning("%s: Not enough PE# available, disabling device\n", pci_name(dev)); return NULL; @@ -943,14 +986,12 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) * * At some point we want to remove the PDN completely anyways */ - pe = &phb->ioda.pe_array[pe_num]; pci_dev_get(dev); pdn->pcidev = dev; - pdn->pe_number = pe_num; + pdn->pe_number = pe->pe_number; pe->flags = PNV_IODA_PE_DEV; pe->pdev = dev; pe->pbus = NULL; - pe->tce32_seg = -1; pe->mve_number = -1; pe->rid = dev->bus->number << 8 | pdn->devfn; @@ -958,23 +999,15 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) if (pnv_ioda_configure_pe(phb, pe)) { /* XXX What do we do here ? */ - if (pe_num) - pnv_ioda_free_pe(phb, pe_num); + pnv_ioda_free_pe(pe); pdn->pe_number = IODA_INVALID_PE; pe->pdev = NULL; pci_dev_put(dev); return NULL; } - /* Assign a DMA weight to the device */ - pe->dma_weight = pnv_ioda_dma_weight(dev); - if (pe->dma_weight != 0) { - phb->ioda.dma_weight += pe->dma_weight; - phb->ioda.dma_pe_count++; - } - - /* Link the PE */ - pnv_ioda_link_pe_by_weight(phb, pe); + /* Put PE to the list */ + list_add_tail(&pe->list, &phb->ioda.pe_list); return pe; } @@ -993,7 +1026,6 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) } pdn->pcidev = dev; pdn->pe_number = pe->pe_number; - pe->dma_weight += pnv_ioda_dma_weight(dev); if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) pnv_ioda_setup_same_PE(dev->subordinate, pe); } @@ -1005,49 +1037,44 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) * subordinate PCI devices and buses. The second type of PE is normally * orgiriated by PCIe-to-PCI bridge or PLX switch downstream ports. */ -static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) +static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) { struct pci_controller *hose = pci_bus_to_host(bus); struct pnv_phb *phb = hose->private_data; - struct pnv_ioda_pe *pe; - int pe_num = IODA_INVALID_PE; + struct pnv_ioda_pe *pe = NULL; /* Check if PE is determined by M64 */ if (phb->pick_m64_pe) - pe_num = phb->pick_m64_pe(bus, all); + pe = phb->pick_m64_pe(bus, all); /* The PE number isn't pinned by M64 */ - if (pe_num == IODA_INVALID_PE) - pe_num = pnv_ioda_alloc_pe(phb); + if (!pe) + pe = pnv_ioda_alloc_pe(phb); - if (pe_num == IODA_INVALID_PE) { + if (!pe) { pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n", __func__, pci_domain_nr(bus), bus->number); - return; + return NULL; } - pe = &phb->ioda.pe_array[pe_num]; pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS); pe->pbus = bus; pe->pdev = NULL; - pe->tce32_seg = -1; pe->mve_number = -1; pe->rid = bus->busn_res.start << 8; - pe->dma_weight = 0; if (all) pe_info(pe, "Secondary bus %d..%d associated with PE#%d\n", - bus->busn_res.start, bus->busn_res.end, pe_num); + bus->busn_res.start, bus->busn_res.end, pe->pe_number); else pe_info(pe, "Secondary bus %d associated with PE#%d\n", - bus->busn_res.start, pe_num); + bus->busn_res.start, pe->pe_number); if (pnv_ioda_configure_pe(phb, pe)) { /* XXX What do we do here ? */ - if (pe_num) - pnv_ioda_free_pe(phb, pe_num); + pnv_ioda_free_pe(pe); pe->pbus = NULL; - return; + return NULL; } /* Associate it with all child devices */ @@ -1056,16 +1083,7 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) /* Put PE to the list */ list_add_tail(&pe->list, &phb->ioda.pe_list); - /* Account for one DMA PE if at least one DMA capable device exist - * below the bridge - */ - if (pe->dma_weight != 0) { - phb->ioda.dma_weight += pe->dma_weight; - phb->ioda.dma_pe_count++; - } - - /* Link the PE */ - pnv_ioda_link_pe_by_weight(phb, pe); + return pe; } static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev) @@ -1088,7 +1106,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev) * same GPU get assigned the same PE. */ gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev); - for (pe_num = 0; pe_num < phb->ioda.total_pe; pe_num++) { + for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) { pe = &phb->ioda.pe_array[pe_num]; if (!pe->pdev) continue; @@ -1106,7 +1124,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev) rid = npu_pdev->bus->number << 8 | npu_pdn->devfn; npu_pdn->pcidev = npu_pdev; npu_pdn->pe_number = pe_num; - pe->dma_weight += pnv_ioda_dma_weight(npu_pdev); phb->ioda.pe_rmap[rid] = pe->pe_number; /* Map the PE to this link */ @@ -1378,7 +1395,7 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev) pnv_ioda_deconfigure_pe(phb, pe); - pnv_ioda_free_pe(phb, pe->pe_number); + pnv_ioda_free_pe(pe); } } @@ -1387,6 +1404,7 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) struct pci_bus *bus; struct pci_controller *hose; struct pnv_phb *phb; + struct pnv_ioda_pe *pe; struct pci_dn *pdn; struct pci_sriov *iov; u16 num_vfs, i; @@ -1411,8 +1429,11 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev) /* Release PE numbers */ if (pdn->m64_single_mode) { for (i = 0; i < num_vfs; i++) { - if (pdn->pe_num_map[i] != IODA_INVALID_PE) - pnv_ioda_free_pe(phb, pdn->pe_num_map[i]); + if (pdn->pe_num_map[i] == IODA_INVALID_PE) + continue; + + pe = &phb->ioda.pe_array[pdn->pe_num_map[i]]; + pnv_ioda_free_pe(pe); } } else bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs); @@ -1454,7 +1475,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) pe->flags = PNV_IODA_PE_VF; pe->pbus = NULL; pe->parent_dev = pdev; - pe->tce32_seg = -1; pe->mve_number = -1; pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) | pci_iov_virtfn_devfn(pdev, vf_index); @@ -1466,8 +1486,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) if (pnv_ioda_configure_pe(phb, pe)) { /* XXX What do we do here ? */ - if (pe_num) - pnv_ioda_free_pe(phb, pe_num); + pnv_ioda_free_pe(pe); pe->pdev = NULL; continue; } @@ -1486,6 +1505,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) struct pci_bus *bus; struct pci_controller *hose; struct pnv_phb *phb; + struct pnv_ioda_pe *pe; struct pci_dn *pdn; int ret; u16 i; @@ -1528,18 +1548,20 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) /* Calculate available PE for required VFs */ if (pdn->m64_single_mode) { for (i = 0; i < num_vfs; i++) { - pdn->pe_num_map[i] = pnv_ioda_alloc_pe(phb); - if (pdn->pe_num_map[i] == IODA_INVALID_PE) { + pe = pnv_ioda_alloc_pe(phb); + if (!pe) { ret = -EBUSY; goto m64_failed; } + + pdn->pe_num_map[i] = pe->pe_number; } } else { mutex_lock(&phb->ioda.pe_alloc_mutex); *pdn->pe_num_map = bitmap_find_next_zero_area( - phb->ioda.pe_alloc, phb->ioda.total_pe, + phb->ioda.pe_alloc, phb->ioda.total_pe_num, 0, num_vfs, 0); - if (*pdn->pe_num_map >= phb->ioda.total_pe) { + if (*pdn->pe_num_map >= phb->ioda.total_pe_num) { mutex_unlock(&phb->ioda.pe_alloc_mutex); dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs); kfree(pdn->pe_num_map); @@ -1577,8 +1599,11 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) m64_failed: if (pdn->m64_single_mode) { for (i = 0; i < num_vfs; i++) { - if (pdn->pe_num_map[i] != IODA_INVALID_PE) - pnv_ioda_free_pe(phb, pdn->pe_num_map[i]); + if (pdn->pe_num_map[i] == IODA_INVALID_PE) + continue; + + pe = &phb->ioda.pe_array[pdn->pe_num_map[i]]; + pnv_ioda_free_pe(pe); } } else bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs); @@ -1640,8 +1665,6 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) struct pnv_ioda_pe *pe; uint64_t top; bool bypass = false; - struct pci_dev *linked_npu_dev; - int i; if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) return -ENODEV;; @@ -1662,15 +1685,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) *pdev->dev.dma_mask = dma_mask; /* Update peer npu devices */ - if (pe->flags & PNV_IODA_PE_PEER) - for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) { - if (!pe->peers[i]) - continue; - - linked_npu_dev = pe->peers[i]->pdev; - if (dma_get_mask(&linked_npu_dev->dev) != dma_mask) - dma_set_mask(&linked_npu_dev->dev, dma_mask); - } + pnv_npu_try_dma_set_bypass(pdev, bypass); return 0; } @@ -1811,28 +1826,34 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = { .get = pnv_tce_get, }; -static inline void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe) +#define TCE_KILL_INVAL_ALL PPC_BIT(0) +#define TCE_KILL_INVAL_PE PPC_BIT(1) +#define TCE_KILL_INVAL_TCE PPC_BIT(2) + +void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm) +{ + const unsigned long val = TCE_KILL_INVAL_ALL; + + mb(); /* Ensure previous TCE table stores are visible */ + if (rm) + __raw_rm_writeq(cpu_to_be64(val), + (__be64 __iomem *) + phb->ioda.tce_inval_reg_phys); + else + __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg); +} + +static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe) { /* 01xb - invalidate TCEs that match the specified PE# */ - unsigned long val = (0x4ull << 60) | (pe->pe_number & 0xFF); + unsigned long val = TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF); struct pnv_phb *phb = pe->phb; - struct pnv_ioda_pe *npe; - int i; if (!phb->ioda.tce_inval_reg) return; mb(); /* Ensure above stores are visible */ __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg); - - if (pe->flags & PNV_IODA_PE_PEER) - for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) { - npe = pe->peers[i]; - if (!npe || npe->phb->type != PNV_PHB_NPU) - continue; - - pnv_npu_tce_invalidate_entire(npe); - } } static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm, @@ -1842,7 +1863,7 @@ static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm, unsigned long start, end, inc; /* We'll invalidate DMA address in PE scope */ - start = 0x2ull << 60; + start = TCE_KILL_INVAL_TCE; start |= (pe_number & 0xFF); end = start; @@ -1867,28 +1888,24 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, struct iommu_table_group_link *tgl; list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { - struct pnv_ioda_pe *npe; struct pnv_ioda_pe *pe = container_of(tgl->table_group, struct pnv_ioda_pe, table_group); __be64 __iomem *invalidate = rm ? (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys : pe->phb->ioda.tce_inval_reg; - int i; + if (pe->phb->type == PNV_PHB_NPU) { + /* + * The NVLink hardware does not support TCE kill + * per TCE entry so we have to invalidate + * the entire cache for it. + */ + pnv_pci_ioda2_tce_invalidate_entire(pe->phb, rm); + continue; + } pnv_pci_ioda2_do_tce_invalidate(pe->pe_number, rm, invalidate, tbl->it_page_shift, index, npages); - - if (pe->flags & PNV_IODA_PE_PEER) - /* Invalidate PEs using the same TCE table */ - for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) { - npe = pe->peers[i]; - if (!npe || npe->phb->type != PNV_PHB_NPU) - continue; - - pnv_npu_tce_invalidate(npe, tbl, index, - npages, rm); - } } } @@ -1945,56 +1962,140 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = { .free = pnv_ioda2_table_free, }; -static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, - struct pnv_ioda_pe *pe, unsigned int base, - unsigned int segs) +static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data) +{ + unsigned int *weight = (unsigned int *)data; + + /* This is quite simplistic. The "base" weight of a device + * is 10. 0 means no DMA is to be accounted for it. + */ + if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) + return 0; + + if (dev->class == PCI_CLASS_SERIAL_USB_UHCI || + dev->class == PCI_CLASS_SERIAL_USB_OHCI || + dev->class == PCI_CLASS_SERIAL_USB_EHCI) + *weight += 3; + else if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID) + *weight += 15; + else + *weight += 10; + + return 0; +} + +static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe) +{ + unsigned int weight = 0; + + /* SRIOV VF has same DMA32 weight as its PF */ +#ifdef CONFIG_PCI_IOV + if ((pe->flags & PNV_IODA_PE_VF) && pe->parent_dev) { + pnv_pci_ioda_dev_dma_weight(pe->parent_dev, &weight); + return weight; + } +#endif + + if ((pe->flags & PNV_IODA_PE_DEV) && pe->pdev) { + pnv_pci_ioda_dev_dma_weight(pe->pdev, &weight); + } else if ((pe->flags & PNV_IODA_PE_BUS) && pe->pbus) { + struct pci_dev *pdev; + + list_for_each_entry(pdev, &pe->pbus->devices, bus_list) + pnv_pci_ioda_dev_dma_weight(pdev, &weight); + } else if ((pe->flags & PNV_IODA_PE_BUS_ALL) && pe->pbus) { + pci_walk_bus(pe->pbus, pnv_pci_ioda_dev_dma_weight, &weight); + } + + return weight; +} + +static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe) { struct page *tce_mem = NULL; struct iommu_table *tbl; - unsigned int i; + unsigned int weight, total_weight = 0; + unsigned int tce32_segsz, base, segs, avail, i; int64_t rc; void *addr; /* XXX FIXME: Handle 64-bit only DMA devices */ /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */ /* XXX FIXME: Allocate multi-level tables on PHB3 */ + weight = pnv_pci_ioda_pe_dma_weight(pe); + if (!weight) + return; - /* We shouldn't already have a 32-bit DMA associated */ - if (WARN_ON(pe->tce32_seg >= 0)) + pci_walk_bus(phb->hose->bus, pnv_pci_ioda_dev_dma_weight, + &total_weight); + segs = (weight * phb->ioda.dma32_count) / total_weight; + if (!segs) + segs = 1; + + /* + * Allocate contiguous DMA32 segments. We begin with the expected + * number of segments. With one more attempt, the number of DMA32 + * segments to be allocated is decreased by one until one segment + * is allocated successfully. + */ + do { + for (base = 0; base <= phb->ioda.dma32_count - segs; base++) { + for (avail = 0, i = base; i < base + segs; i++) { + if (phb->ioda.dma32_segmap[i] == + IODA_INVALID_PE) + avail++; + } + + if (avail == segs) + goto found; + } + } while (--segs); + + if (!segs) { + pe_warn(pe, "No available DMA32 segments\n"); return; + } +found: tbl = pnv_pci_table_alloc(phb->hose->node); iommu_register_group(&pe->table_group, phb->hose->global_number, pe->pe_number); pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group); /* Grab a 32-bit TCE table */ - pe->tce32_seg = base; + pe_info(pe, "DMA weight %d (%d), assigned (%d) %d DMA32 segments\n", + weight, total_weight, base, segs); pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n", - (base << 28), ((base + segs) << 28) - 1); + base * PNV_IODA1_DMA32_SEGSIZE, + (base + segs) * PNV_IODA1_DMA32_SEGSIZE - 1); /* XXX Currently, we allocate one big contiguous table for the * TCEs. We only really need one chunk per 256M of TCE space * (ie per segment) but that's an optimization for later, it * requires some added smarts with our get/put_tce implementation + * + * Each TCE page is 4KB in size and each TCE entry occupies 8 + * bytes */ + tce32_segsz = PNV_IODA1_DMA32_SEGSIZE >> (IOMMU_PAGE_SHIFT_4K - 3); tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, - get_order(TCE32_TABLE_SIZE * segs)); + get_order(tce32_segsz * segs)); if (!tce_mem) { pe_err(pe, " Failed to allocate a 32-bit TCE memory\n"); goto fail; } addr = page_address(tce_mem); - memset(addr, 0, TCE32_TABLE_SIZE * segs); + memset(addr, 0, tce32_segsz * segs); /* Configure HW */ for (i = 0; i < segs; i++) { rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, base + i, 1, - __pa(addr) + TCE32_TABLE_SIZE * i, - TCE32_TABLE_SIZE, 0x1000); + __pa(addr) + tce32_segsz * i, + tce32_segsz, IOMMU_PAGE_SIZE_4K); if (rc) { pe_err(pe, " Failed to configure 32-bit TCE table," " err %ld\n", rc); @@ -2002,9 +2103,14 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, } } + /* Setup DMA32 segment mapping */ + for (i = base; i < base + segs; i++) + phb->ioda.dma32_segmap[i] = pe->pe_number; + /* Setup linux iommu table */ - pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs, - base << 28, IOMMU_PAGE_SHIFT_4K); + pnv_pci_setup_iommu_table(tbl, addr, tce32_segsz * segs, + base * PNV_IODA1_DMA32_SEGSIZE, + IOMMU_PAGE_SHIFT_4K); /* OPAL variant of P7IOC SW invalidated TCEs */ if (phb->ioda.tce_inval_reg) @@ -2031,10 +2137,8 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, return; fail: /* XXX Failure: Try to fallback to 64-bit only ? */ - if (pe->tce32_seg >= 0) - pe->tce32_seg = -1; if (tce_mem) - __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); + __free_pages(tce_mem, get_order(tce32_segsz * segs)); if (tbl) { pnv_pci_unlink_table_and_group(tbl, &pe->table_group); iommu_free_table(tbl, "pnv"); @@ -2075,7 +2179,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, pnv_pci_link_table_and_group(phb->hose->node, num, tbl, &pe->table_group); - pnv_pci_ioda2_tce_invalidate_entire(pe); + pnv_pci_ioda2_tce_invalidate_pe(pe); return 0; } @@ -2219,7 +2323,7 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, if (ret) pe_warn(pe, "Unmapping failed, ret = %ld\n", ret); else - pnv_pci_ioda2_tce_invalidate_entire(pe); + pnv_pci_ioda2_tce_invalidate_pe(pe); pnv_pci_unlink_table_and_group(table_group->tables[num], table_group); @@ -2288,6 +2392,116 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = { .take_ownership = pnv_ioda2_take_ownership, .release_ownership = pnv_ioda2_release_ownership, }; + +static int gpe_table_group_to_npe_cb(struct device *dev, void *opaque) +{ + struct pci_controller *hose; + struct pnv_phb *phb; + struct pnv_ioda_pe **ptmppe = opaque; + struct pci_dev *pdev = container_of(dev, struct pci_dev, dev); + struct pci_dn *pdn = pci_get_pdn(pdev); + + if (!pdn || pdn->pe_number == IODA_INVALID_PE) + return 0; + + hose = pci_bus_to_host(pdev->bus); + phb = hose->private_data; + if (phb->type != PNV_PHB_NPU) + return 0; + + *ptmppe = &phb->ioda.pe_array[pdn->pe_number]; + + return 1; +} + +/* + * This returns PE of associated NPU. + * This assumes that NPU is in the same IOMMU group with GPU and there is + * no other PEs. + */ +static struct pnv_ioda_pe *gpe_table_group_to_npe( + struct iommu_table_group *table_group) +{ + struct pnv_ioda_pe *npe = NULL; + int ret = iommu_group_for_each_dev(table_group->group, &npe, + gpe_table_group_to_npe_cb); + + BUG_ON(!ret || !npe); + + return npe; +} + +static long pnv_pci_ioda2_npu_set_window(struct iommu_table_group *table_group, + int num, struct iommu_table *tbl) +{ + long ret = pnv_pci_ioda2_set_window(table_group, num, tbl); + + if (ret) + return ret; + + ret = pnv_npu_set_window(gpe_table_group_to_npe(table_group), num, tbl); + if (ret) + pnv_pci_ioda2_unset_window(table_group, num); + + return ret; +} + +static long pnv_pci_ioda2_npu_unset_window( + struct iommu_table_group *table_group, + int num) +{ + long ret = pnv_pci_ioda2_unset_window(table_group, num); + + if (ret) + return ret; + + return pnv_npu_unset_window(gpe_table_group_to_npe(table_group), num); +} + +static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group) +{ + /* + * Detach NPU first as pnv_ioda2_take_ownership() will destroy + * the iommu_table if 32bit DMA is enabled. + */ + pnv_npu_take_ownership(gpe_table_group_to_npe(table_group)); + pnv_ioda2_take_ownership(table_group); +} + +static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = { + .get_table_size = pnv_pci_ioda2_get_table_size, + .create_table = pnv_pci_ioda2_create_table, + .set_window = pnv_pci_ioda2_npu_set_window, + .unset_window = pnv_pci_ioda2_npu_unset_window, + .take_ownership = pnv_ioda2_npu_take_ownership, + .release_ownership = pnv_ioda2_release_ownership, +}; + +static void pnv_pci_ioda_setup_iommu_api(void) +{ + struct pci_controller *hose, *tmp; + struct pnv_phb *phb; + struct pnv_ioda_pe *pe, *gpe; + + /* + * Now we have all PHBs discovered, time to add NPU devices to + * the corresponding IOMMU groups. + */ + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + phb = hose->private_data; + + if (phb->type != PNV_PHB_NPU) + continue; + + list_for_each_entry(pe, &phb->ioda.pe_list, list) { + gpe = pnv_pci_npu_setup_iommu(pe); + if (gpe) + gpe->table_group.ops = &pnv_pci_ioda2_npu_ops; + } + } +} +#else /* !CONFIG_IOMMU_API */ +static void pnv_pci_ioda_setup_iommu_api(void) { }; #endif static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb) @@ -2443,10 +2657,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, { int64_t rc; - /* We shouldn't already have a 32-bit DMA associated */ - if (WARN_ON(pe->tce32_seg >= 0)) - return; - /* TVE #1 is selected by PCI address bit 59 */ pe->tce_bypass_base = 1ull << 59; @@ -2454,7 +2664,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, pe->pe_number); /* The PE will reserve all possible 32-bits space */ - pe->tce32_seg = 0; pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n", phb->ioda.m32_pci_base); @@ -2470,11 +2679,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, #endif rc = pnv_pci_ioda2_setup_default_config(pe); - if (rc) { - if (pe->tce32_seg >= 0) - pe->tce32_seg = -1; + if (rc) return; - } if (pe->flags & PNV_IODA_PE_DEV) iommu_add_device(&pe->pdev->dev); @@ -2485,47 +2691,24 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, static void pnv_ioda_setup_dma(struct pnv_phb *phb) { struct pci_controller *hose = phb->hose; - unsigned int residual, remaining, segs, tw, base; struct pnv_ioda_pe *pe; + unsigned int weight; /* If we have more PE# than segments available, hand out one * per PE until we run out and let the rest fail. If not, * then we assign at least one segment per PE, plus more based * on the amount of devices under that PE */ - if (phb->ioda.dma_pe_count > phb->ioda.tce32_count) - residual = 0; - else - residual = phb->ioda.tce32_count - - phb->ioda.dma_pe_count; - - pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n", - hose->global_number, phb->ioda.tce32_count); - pr_info("PCI: %d PE# for a total weight of %d\n", - phb->ioda.dma_pe_count, phb->ioda.dma_weight); + pr_info("PCI: Domain %04x has %d available 32-bit DMA segments\n", + hose->global_number, phb->ioda.dma32_count); pnv_pci_ioda_setup_opal_tce_kill(phb); - /* Walk our PE list and configure their DMA segments, hand them - * out one base segment plus any residual segments based on - * weight - */ - remaining = phb->ioda.tce32_count; - tw = phb->ioda.dma_weight; - base = 0; - list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) { - if (!pe->dma_weight) - continue; - if (!remaining) { - pe_warn(pe, "No DMA32 resources available\n"); + /* Walk our PE list and configure their DMA segments */ + list_for_each_entry(pe, &phb->ioda.pe_list, list) { + weight = pnv_pci_ioda_pe_dma_weight(pe); + if (!weight) continue; - } - segs = 1; - if (residual) { - segs += ((pe->dma_weight * residual) + (tw / 2)) / tw; - if (segs > remaining) - segs = remaining; - } /* * For IODA2 compliant PHB3, we needn't care about the weight. @@ -2533,12 +2716,9 @@ static void pnv_ioda_setup_dma(struct pnv_phb *phb) * the specific PE. */ if (phb->type == PNV_PHB_IODA1) { - pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n", - pe->dma_weight, segs); - pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs); + pnv_pci_ioda1_setup_dma_pe(phb, pe); } else if (phb->type == PNV_PHB_IODA2) { pe_info(pe, "Assign DMA32 space\n"); - segs = 0; pnv_pci_ioda2_setup_dma_pe(phb, pe); } else if (phb->type == PNV_PHB_NPU) { /* @@ -2548,9 +2728,6 @@ static void pnv_ioda_setup_dma(struct pnv_phb *phb) * as the PHB3 TVT. */ } - - remaining -= segs; - base += segs; } } @@ -2858,7 +3035,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) pdn->m64_single_mode = false; total_vfs = pci_sriov_get_totalvfs(pdev); - mul = phb->ioda.total_pe; + mul = phb->ioda.total_pe_num; total_vf_bar_sz = 0; for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { @@ -2929,19 +3106,72 @@ truncate_iov: } #endif /* CONFIG_PCI_IOV */ +static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, + struct resource *res) +{ + struct pnv_phb *phb = pe->phb; + struct pci_bus_region region; + int index; + int64_t rc; + + if (!res || !res->flags || res->start > res->end) + return; + + if (res->flags & IORESOURCE_IO) { + region.start = res->start - phb->ioda.io_pci_base; + region.end = res->end - phb->ioda.io_pci_base; + index = region.start / phb->ioda.io_segsize; + + while (index < phb->ioda.total_pe_num && + region.start <= region.end) { + phb->ioda.io_segmap[index] = pe->pe_number; + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index); + if (rc != OPAL_SUCCESS) { + pr_err("%s: Error %lld mapping IO segment#%d to PE#%d\n", + __func__, rc, index, pe->pe_number); + break; + } + + region.start += phb->ioda.io_segsize; + index++; + } + } else if ((res->flags & IORESOURCE_MEM) && + !pnv_pci_is_mem_pref_64(res->flags)) { + region.start = res->start - + phb->hose->mem_offset[0] - + phb->ioda.m32_pci_base; + region.end = res->end - + phb->hose->mem_offset[0] - + phb->ioda.m32_pci_base; + index = region.start / phb->ioda.m32_segsize; + + while (index < phb->ioda.total_pe_num && + region.start <= region.end) { + phb->ioda.m32_segmap[index] = pe->pe_number; + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index); + if (rc != OPAL_SUCCESS) { + pr_err("%s: Error %lld mapping M32 segment#%d to PE#%d", + __func__, rc, index, pe->pe_number); + break; + } + + region.start += phb->ioda.m32_segsize; + index++; + } + } +} + /* * This function is supposed to be called on basis of PE from top * to bottom style. So the the I/O or MMIO segment assigned to * parent PE could be overrided by its child PEs if necessary. */ -static void pnv_ioda_setup_pe_seg(struct pci_controller *hose, - struct pnv_ioda_pe *pe) +static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe) { - struct pnv_phb *phb = hose->private_data; - struct pci_bus_region region; - struct resource *res; - int i, index; - int rc; + struct pci_dev *pdev; + int i; /* * NOTE: We only care PCI bus based PE for now. For PCI @@ -2950,57 +3180,20 @@ static void pnv_ioda_setup_pe_seg(struct pci_controller *hose, */ BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))); - pci_bus_for_each_resource(pe->pbus, res, i) { - if (!res || !res->flags || - res->start > res->end) - continue; + list_for_each_entry(pdev, &pe->pbus->devices, bus_list) { + for (i = 0; i <= PCI_ROM_RESOURCE; i++) + pnv_ioda_setup_pe_res(pe, &pdev->resource[i]); - if (res->flags & IORESOURCE_IO) { - region.start = res->start - phb->ioda.io_pci_base; - region.end = res->end - phb->ioda.io_pci_base; - index = region.start / phb->ioda.io_segsize; - - while (index < phb->ioda.total_pe && - region.start <= region.end) { - phb->ioda.io_segmap[index] = pe->pe_number; - rc = opal_pci_map_pe_mmio_window(phb->opal_id, - pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index); - if (rc != OPAL_SUCCESS) { - pr_err("%s: OPAL error %d when mapping IO " - "segment #%d to PE#%d\n", - __func__, rc, index, pe->pe_number); - break; - } - - region.start += phb->ioda.io_segsize; - index++; - } - } else if ((res->flags & IORESOURCE_MEM) && - !pnv_pci_is_mem_pref_64(res->flags)) { - region.start = res->start - - hose->mem_offset[0] - - phb->ioda.m32_pci_base; - region.end = res->end - - hose->mem_offset[0] - - phb->ioda.m32_pci_base; - index = region.start / phb->ioda.m32_segsize; - - while (index < phb->ioda.total_pe && - region.start <= region.end) { - phb->ioda.m32_segmap[index] = pe->pe_number; - rc = opal_pci_map_pe_mmio_window(phb->opal_id, - pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index); - if (rc != OPAL_SUCCESS) { - pr_err("%s: OPAL error %d when mapping M32 " - "segment#%d to PE#%d", - __func__, rc, index, pe->pe_number); - break; - } - - region.start += phb->ioda.m32_segsize; - index++; - } - } + /* + * If the PE contains all subordinate PCI buses, the + * windows of the child bridges should be mapped to + * the PE as well. + */ + if (!(pe->flags & PNV_IODA_PE_BUS_ALL) || !pci_is_bridge(pdev)) + continue; + for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) + pnv_ioda_setup_pe_res(pe, + &pdev->resource[PCI_BRIDGE_RESOURCES + i]); } } @@ -3018,7 +3211,7 @@ static void pnv_pci_ioda_setup_seg(void) continue; list_for_each_entry(pe, &phb->ioda.pe_list, list) { - pnv_ioda_setup_pe_seg(hose, pe); + pnv_ioda_setup_pe_seg(pe); } } } @@ -3035,6 +3228,8 @@ static void pnv_pci_ioda_setup_DMA(void) phb = hose->private_data; phb->initialized = 1; } + + pnv_pci_ioda_setup_iommu_api(); } static void pnv_pci_ioda_create_dbgfs(void) @@ -3056,27 +3251,6 @@ static void pnv_pci_ioda_create_dbgfs(void) #endif /* CONFIG_DEBUG_FS */ } -static void pnv_npu_ioda_fixup(void) -{ - bool enable_bypass; - struct pci_controller *hose, *tmp; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe; - - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { - phb = hose->private_data; - if (phb->type != PNV_PHB_NPU) - continue; - - list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) { - enable_bypass = dma_get_mask(&pe->pdev->dev) == - DMA_BIT_MASK(64); - pnv_npu_init_dma_pe(pe); - pnv_npu_dma_set_bypass(pe, enable_bypass); - } - } -} - static void pnv_pci_ioda_fixup(void) { pnv_pci_ioda_setup_PEs(); @@ -3089,9 +3263,6 @@ static void pnv_pci_ioda_fixup(void) eeh_init(); eeh_addr_cache_build(); #endif - - /* Link NPU IODA tables to their PCI devices. */ - pnv_npu_ioda_fixup(); } /* @@ -3195,12 +3366,6 @@ static bool pnv_pci_enable_device_hook(struct pci_dev *dev) return true; } -static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus, - u32 devfn) -{ - return phb->ioda.pe_rmap[(bus->number << 8) | devfn]; -} - static void pnv_pci_ioda_shutdown(struct pci_controller *hose) { struct pnv_phb *phb = hose->private_data; @@ -3210,31 +3375,39 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose) } static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { - .dma_dev_setup = pnv_pci_dma_dev_setup, - .dma_bus_setup = pnv_pci_dma_bus_setup, + .dma_dev_setup = pnv_pci_dma_dev_setup, + .dma_bus_setup = pnv_pci_dma_bus_setup, #ifdef CONFIG_PCI_MSI - .setup_msi_irqs = pnv_setup_msi_irqs, - .teardown_msi_irqs = pnv_teardown_msi_irqs, + .setup_msi_irqs = pnv_setup_msi_irqs, + .teardown_msi_irqs = pnv_teardown_msi_irqs, #endif - .enable_device_hook = pnv_pci_enable_device_hook, - .window_alignment = pnv_pci_window_alignment, - .reset_secondary_bus = pnv_pci_reset_secondary_bus, - .dma_set_mask = pnv_pci_ioda_dma_set_mask, - .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask, - .shutdown = pnv_pci_ioda_shutdown, + .enable_device_hook = pnv_pci_enable_device_hook, + .window_alignment = pnv_pci_window_alignment, + .reset_secondary_bus = pnv_pci_reset_secondary_bus, + .dma_set_mask = pnv_pci_ioda_dma_set_mask, + .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask, + .shutdown = pnv_pci_ioda_shutdown, }; +static int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask) +{ + dev_err_once(&npdev->dev, + "%s operation unsupported for NVLink devices\n", + __func__); + return -EPERM; +} + static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { - .dma_dev_setup = pnv_pci_dma_dev_setup, + .dma_dev_setup = pnv_pci_dma_dev_setup, #ifdef CONFIG_PCI_MSI - .setup_msi_irqs = pnv_setup_msi_irqs, - .teardown_msi_irqs = pnv_teardown_msi_irqs, + .setup_msi_irqs = pnv_setup_msi_irqs, + .teardown_msi_irqs = pnv_teardown_msi_irqs, #endif - .enable_device_hook = pnv_pci_enable_device_hook, - .window_alignment = pnv_pci_window_alignment, - .reset_secondary_bus = pnv_pci_reset_secondary_bus, - .dma_set_mask = pnv_npu_dma_set_mask, - .shutdown = pnv_pci_ioda_shutdown, + .enable_device_hook = pnv_pci_enable_device_hook, + .window_alignment = pnv_pci_window_alignment, + .reset_secondary_bus = pnv_pci_reset_secondary_bus, + .dma_set_mask = pnv_npu_dma_set_mask, + .shutdown = pnv_pci_ioda_shutdown, }; static void __init pnv_pci_init_ioda_phb(struct device_node *np, @@ -3242,10 +3415,12 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, { struct pci_controller *hose; struct pnv_phb *phb; - unsigned long size, m32map_off, pemap_off, iomap_off = 0; + unsigned long size, m64map_off, m32map_off, pemap_off; + unsigned long iomap_off = 0, dma32map_off = 0; const __be64 *prop64; const __be32 *prop32; int len; + unsigned int segno; u64 phb_id; void *aux; long rc; @@ -3306,13 +3481,13 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, pr_err(" Failed to map registers !\n"); /* Initialize more IODA stuff */ - phb->ioda.total_pe = 1; + phb->ioda.total_pe_num = 1; prop32 = of_get_property(np, "ibm,opal-num-pes", NULL); if (prop32) - phb->ioda.total_pe = be32_to_cpup(prop32); + phb->ioda.total_pe_num = be32_to_cpup(prop32); prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL); if (prop32) - phb->ioda.reserved_pe = be32_to_cpup(prop32); + phb->ioda.reserved_pe_idx = be32_to_cpup(prop32); /* Parse 64-bit MMIO range */ pnv_ioda_parse_m64_window(phb); @@ -3321,36 +3496,58 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, /* FW Has already off top 64k of M32 space (MSI space) */ phb->ioda.m32_size += 0x10000; - phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe; + phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe_num; phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0]; phb->ioda.io_size = hose->pci_io_size; - phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe; + phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe_num; phb->ioda.io_pci_base = 0; /* XXX calculate this ? */ + /* Calculate how many 32-bit TCE segments we have */ + phb->ioda.dma32_count = phb->ioda.m32_pci_base / + PNV_IODA1_DMA32_SEGSIZE; + /* Allocate aux data & arrays. We don't have IO ports on PHB3 */ - size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); + size = _ALIGN_UP(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8, + sizeof(unsigned long)); + m64map_off = size; + size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]); m32map_off = size; - size += phb->ioda.total_pe * sizeof(phb->ioda.m32_segmap[0]); + size += phb->ioda.total_pe_num * sizeof(phb->ioda.m32_segmap[0]); if (phb->type == PNV_PHB_IODA1) { iomap_off = size; - size += phb->ioda.total_pe * sizeof(phb->ioda.io_segmap[0]); + size += phb->ioda.total_pe_num * sizeof(phb->ioda.io_segmap[0]); + dma32map_off = size; + size += phb->ioda.dma32_count * + sizeof(phb->ioda.dma32_segmap[0]); } pemap_off = size; - size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe); + size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe); aux = memblock_virt_alloc(size, 0); phb->ioda.pe_alloc = aux; + phb->ioda.m64_segmap = aux + m64map_off; phb->ioda.m32_segmap = aux + m32map_off; - if (phb->type == PNV_PHB_IODA1) + for (segno = 0; segno < phb->ioda.total_pe_num; segno++) { + phb->ioda.m64_segmap[segno] = IODA_INVALID_PE; + phb->ioda.m32_segmap[segno] = IODA_INVALID_PE; + } + if (phb->type == PNV_PHB_IODA1) { phb->ioda.io_segmap = aux + iomap_off; + for (segno = 0; segno < phb->ioda.total_pe_num; segno++) + phb->ioda.io_segmap[segno] = IODA_INVALID_PE; + + phb->ioda.dma32_segmap = aux + dma32map_off; + for (segno = 0; segno < phb->ioda.dma32_count; segno++) + phb->ioda.dma32_segmap[segno] = IODA_INVALID_PE; + } phb->ioda.pe_array = aux + pemap_off; - set_bit(phb->ioda.reserved_pe, phb->ioda.pe_alloc); + set_bit(phb->ioda.reserved_pe_idx, phb->ioda.pe_alloc); - INIT_LIST_HEAD(&phb->ioda.pe_dma_list); INIT_LIST_HEAD(&phb->ioda.pe_list); mutex_init(&phb->ioda.pe_list_mutex); /* Calculate how many 32-bit TCE segments we have */ - phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28; + phb->ioda.dma32_count = phb->ioda.m32_pci_base / + PNV_IODA1_DMA32_SEGSIZE; #if 0 /* We should really do that ... */ rc = opal_pci_set_phb_mem_window(opal->phb_id, @@ -3362,7 +3559,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, #endif pr_info(" %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n", - phb->ioda.total_pe, phb->ioda.reserved_pe, + phb->ioda.total_pe_num, phb->ioda.reserved_pe_idx, phb->ioda.m32_size, phb->ioda.m32_segsize); if (phb->ioda.m64_size) pr_info(" M64: 0x%lx [segment=0x%lx]\n", @@ -3377,12 +3574,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->freeze_pe = pnv_ioda_freeze_pe; phb->unfreeze_pe = pnv_ioda_unfreeze_pe; - /* Setup RID -> PE mapping function */ - phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe; - - /* Setup TCEs */ - phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; - /* Setup MSI support */ pnv_pci_init_ioda_msis(phb); @@ -3395,10 +3586,12 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, */ ppc_md.pcibios_fixup = pnv_pci_ioda_fixup; - if (phb->type == PNV_PHB_NPU) + if (phb->type == PNV_PHB_NPU) { hose->controller_ops = pnv_npu_ioda_controller_ops; - else + } else { + phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; hose->controller_ops = pnv_pci_ioda_controller_ops; + } #ifdef CONFIG_PCI_IOV ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources; diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 73c8dc2a3..1d92bd93b 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -39,9 +39,6 @@ /* Delay in usec */ #define PCI_RESET_DELAY_US 3000000 -#define cfg_dbg(fmt...) do { } while(0) -//#define cfg_dbg(fmt...) printk(fmt) - #ifdef CONFIG_PCI_MSI int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) { @@ -370,7 +367,7 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn) struct pnv_phb *phb = pdn->phb->private_data; u8 fstate; __be16 pcierr; - int pe_no; + unsigned int pe_no; s64 rc; /* @@ -380,7 +377,7 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn) */ pe_no = pdn->pe_number; if (pe_no == IODA_INVALID_PE) { - pe_no = phb->ioda.reserved_pe; + pe_no = phb->ioda.reserved_pe_idx; } /* @@ -402,8 +399,8 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn) } } - cfg_dbg(" -> EEH check, bdfn=%04x PE#%d fstate=%x\n", - (pdn->busno << 8) | (pdn->devfn), pe_no, fstate); + pr_devel(" -> EEH check, bdfn=%04x PE#%d fstate=%x\n", + (pdn->busno << 8) | (pdn->devfn), pe_no, fstate); /* Clear the frozen state if applicable */ if (fstate == OPAL_EEH_STOPPED_MMIO_FREEZE || @@ -451,8 +448,8 @@ int pnv_pci_cfg_read(struct pci_dn *pdn, return PCIBIOS_FUNC_NOT_SUPPORTED; } - cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n", - __func__, pdn->busno, pdn->devfn, where, size, *val); + pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n", + __func__, pdn->busno, pdn->devfn, where, size, *val); return PCIBIOS_SUCCESSFUL; } @@ -462,8 +459,8 @@ int pnv_pci_cfg_write(struct pci_dn *pdn, struct pnv_phb *phb = pdn->phb->private_data; u32 bdfn = (pdn->busno << 8) | pdn->devfn; - cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n", - pdn->busno, pdn->devfn, where, size, val); + pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n", + __func__, pdn->busno, pdn->devfn, where, size, val); switch (size) { case 1: opal_pci_config_write_byte(phb->opal_id, bdfn, where, val); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 3f814f382..7dee25e30 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -24,7 +24,6 @@ enum pnv_phb_model { #define PNV_IODA_PE_MASTER (1 << 3) /* Master PE in compound case */ #define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */ #define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */ -#define PNV_IODA_PE_PEER (1 << 6) /* PE has peers */ /* Data associated with a PE, including IOMMU tracking etc.. */ struct pnv_phb; @@ -32,9 +31,6 @@ struct pnv_ioda_pe { unsigned long flags; struct pnv_phb *phb; -#define PNV_IODA_MAX_PEER_PES 8 - struct pnv_ioda_pe *peers[PNV_IODA_MAX_PEER_PES]; - /* A PE can be associated with a single device or an * entire bus (& children). In the former case, pdev * is populated, in the later case, pbus is. @@ -53,14 +49,7 @@ struct pnv_ioda_pe { /* PE number */ unsigned int pe_number; - /* "Weight" assigned to the PE for the sake of DMA resource - * allocations - */ - unsigned int dma_weight; - /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */ - int tce32_seg; - int tce32_segcount; struct iommu_table_group table_group; /* 64-bit TCE bypass region */ @@ -78,7 +67,6 @@ struct pnv_ioda_pe { struct list_head slaves; /* Link in list of PE#s */ - struct list_head dma_link; struct list_head list; }; @@ -110,19 +98,18 @@ struct pnv_phb { unsigned int is_64, struct msi_msg *msg); void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev); void (*fixup_phb)(struct pci_controller *hose); - u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn); int (*init_m64)(struct pnv_phb *phb); void (*reserve_m64_pe)(struct pci_bus *bus, unsigned long *pe_bitmap, bool all); - int (*pick_m64_pe)(struct pci_bus *bus, bool all); + struct pnv_ioda_pe *(*pick_m64_pe)(struct pci_bus *bus, bool all); int (*get_pe_state)(struct pnv_phb *phb, int pe_no); void (*freeze_pe)(struct pnv_phb *phb, int pe_no); int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt); struct { /* Global bridge info */ - unsigned int total_pe; - unsigned int reserved_pe; + unsigned int total_pe_num; + unsigned int reserved_pe_idx; /* 32-bit MMIO window */ unsigned int m32_size; @@ -141,15 +128,19 @@ struct pnv_phb { unsigned int io_segsize; unsigned int io_pci_base; - /* PE allocation bitmap */ - unsigned long *pe_alloc; - /* PE allocation mutex */ + /* PE allocation */ struct mutex pe_alloc_mutex; + unsigned long *pe_alloc; + struct pnv_ioda_pe *pe_array; /* M32 & IO segment maps */ + unsigned int *m64_segmap; unsigned int *m32_segmap; unsigned int *io_segmap; - struct pnv_ioda_pe *pe_array; + + /* DMA32 segment maps - IODA1 only */ + unsigned int dma32_count; + unsigned int *dma32_segmap; /* IRQ chip */ int irq_chip_init; @@ -167,20 +158,6 @@ struct pnv_phb { */ unsigned char pe_rmap[0x10000]; - /* 32-bit TCE tables allocation */ - unsigned long tce32_count; - - /* Total "weight" for the sake of DMA resources - * allocation - */ - unsigned int dma_weight; - unsigned int dma_pe_count; - - /* Sorted list of used PE's, sorted at - * boot for resource allocation purposes - */ - struct list_head pe_dma_list; - /* TCE cache invalidate registers (physical and * remapped) */ @@ -236,16 +213,23 @@ extern void pnv_pci_dma_bus_setup(struct pci_bus *bus); extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type); extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); +extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, + const char *fmt, ...); +#define pe_err(pe, fmt, ...) \ + pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__) +#define pe_warn(pe, fmt, ...) \ + pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__) +#define pe_info(pe, fmt, ...) \ + pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__) + /* Nvlink functions */ -extern void pnv_npu_tce_invalidate_entire(struct pnv_ioda_pe *npe); -extern void pnv_npu_tce_invalidate(struct pnv_ioda_pe *npe, - struct iommu_table *tbl, - unsigned long index, - unsigned long npages, - bool rm); -extern void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe); -extern void pnv_npu_setup_dma_pe(struct pnv_ioda_pe *npe); -extern int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe, bool enabled); -extern int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask); +extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass); +extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm); +extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe); +extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, + struct iommu_table *tbl); +extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num); +extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe); +extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe); #endif /* __POWERNV_PCI_H */ diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 1acb0c72d..ee6430bed 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -273,7 +273,10 @@ static int __init pnv_probe(void) if (!of_flat_dt_is_compatible(root, "ibm,powernv")) return 0; - hpte_init_native(); + if (IS_ENABLED(CONFIG_PPC_RADIX_MMU) && radix_enabled()) + radix_init_native(); + else if (IS_ENABLED(CONFIG_PPC_STD_MMU_64)) + hpte_init_native(); if (firmware_has_feature(FW_FEATURE_OPAL)) pnv_setup_machdep_opal(); diff --git a/arch/powerpc/platforms/ps3/htab.c b/arch/powerpc/platforms/ps3/htab.c index 2f95d33cf..c9a3e6771 100644 --- a/arch/powerpc/platforms/ps3/htab.c +++ b/arch/powerpc/platforms/ps3/htab.c @@ -63,7 +63,7 @@ static long ps3_hpte_insert(unsigned long hpte_group, unsigned long vpn, vflags &= ~HPTE_V_SECONDARY; hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; - hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize, apsize) | rflags; + hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize, apsize, ssize) | rflags; spin_lock_irqsave(&ps3_htab_lock, flags); diff --git a/arch/powerpc/platforms/ps3/spu.c b/arch/powerpc/platforms/ps3/spu.c index a0bca05e2..492b2575e 100644 --- a/arch/powerpc/platforms/ps3/spu.c +++ b/arch/powerpc/platforms/ps3/spu.c @@ -205,7 +205,7 @@ static void spu_unmap(struct spu *spu) static int __init setup_areas(struct spu *spu) { struct table {char* name; unsigned long addr; unsigned long size;}; - static const unsigned long shadow_flags = _PAGE_NO_CACHE | 3; + unsigned long shadow_flags = pgprot_val(pgprot_noncached_wc(PAGE_KERNEL_RO)); spu_pdata(spu)->shadow = __ioremap(spu_pdata(spu)->shadow_addr, sizeof(struct spe_shadow), @@ -216,7 +216,7 @@ static int __init setup_areas(struct spu *spu) } spu->local_store = (__force void *)ioremap_prot(spu->local_store_phys, - LS_SIZE, _PAGE_NO_CACHE); + LS_SIZE, pgprot_val(pgprot_noncached_wc(__pgprot(0)))); if (!spu->local_store) { pr_debug("%s:%d: ioremap local_store failed\n", diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index 405baaf96..3998e0f9a 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -53,7 +53,6 @@ static int ibm_read_slot_reset_state2; static int ibm_slot_error_detail; static int ibm_get_config_addr_info; static int ibm_get_config_addr_info2; -static int ibm_configure_bridge; static int ibm_configure_pe; /* @@ -81,7 +80,14 @@ static int pseries_eeh_init(void) ibm_get_config_addr_info2 = rtas_token("ibm,get-config-addr-info2"); ibm_get_config_addr_info = rtas_token("ibm,get-config-addr-info"); ibm_configure_pe = rtas_token("ibm,configure-pe"); - ibm_configure_bridge = rtas_token("ibm,configure-bridge"); + + /* + * ibm,configure-pe and ibm,configure-bridge have the same semantics, + * however ibm,configure-pe can be faster. If we can't find + * ibm,configure-pe then fall back to using ibm,configure-bridge. + */ + if (ibm_configure_pe == RTAS_UNKNOWN_SERVICE) + ibm_configure_pe = rtas_token("ibm,configure-bridge"); /* * Necessary sanity check. We needn't check "get-config-addr-info" @@ -93,8 +99,7 @@ static int pseries_eeh_init(void) (ibm_read_slot_reset_state2 == RTAS_UNKNOWN_SERVICE && ibm_read_slot_reset_state == RTAS_UNKNOWN_SERVICE) || ibm_slot_error_detail == RTAS_UNKNOWN_SERVICE || - (ibm_configure_pe == RTAS_UNKNOWN_SERVICE && - ibm_configure_bridge == RTAS_UNKNOWN_SERVICE)) { + ibm_configure_pe == RTAS_UNKNOWN_SERVICE) { pr_info("EEH functionality not supported\n"); return -EINVAL; } @@ -624,18 +629,9 @@ static int pseries_eeh_configure_bridge(struct eeh_pe *pe) config_addr = pe->addr; while (max_wait > 0) { - /* Use new configure-pe function, if supported */ - if (ibm_configure_pe != RTAS_UNKNOWN_SERVICE) { - ret = rtas_call(ibm_configure_pe, 3, 1, NULL, - config_addr, BUID_HI(pe->phb->buid), - BUID_LO(pe->phb->buid)); - } else if (ibm_configure_bridge != RTAS_UNKNOWN_SERVICE) { - ret = rtas_call(ibm_configure_bridge, 3, 1, NULL, - config_addr, BUID_HI(pe->phb->buid), - BUID_LO(pe->phb->buid)); - } else { - return -EFAULT; - } + ret = rtas_call(ibm_configure_pe, 3, 1, NULL, + config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid)); if (!ret) return ret; diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index e9ff44cd5..2ce138542 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -116,6 +116,155 @@ static struct property *dlpar_clone_drconf_property(struct device_node *dn) return new_prop; } +static void dlpar_update_drconf_property(struct device_node *dn, + struct property *prop) +{ + struct of_drconf_cell *lmbs; + u32 num_lmbs, *p; + int i; + + /* Convert the property back to BE */ + p = prop->value; + num_lmbs = *p; + *p = cpu_to_be32(*p); + p++; + + lmbs = (struct of_drconf_cell *)p; + for (i = 0; i < num_lmbs; i++) { + lmbs[i].base_addr = cpu_to_be64(lmbs[i].base_addr); + lmbs[i].drc_index = cpu_to_be32(lmbs[i].drc_index); + lmbs[i].flags = cpu_to_be32(lmbs[i].flags); + } + + rtas_hp_event = true; + of_update_property(dn, prop); + rtas_hp_event = false; +} + +static int dlpar_update_device_tree_lmb(struct of_drconf_cell *lmb) +{ + struct device_node *dn; + struct property *prop; + struct of_drconf_cell *lmbs; + u32 *p, num_lmbs; + int i; + + dn = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (!dn) + return -ENODEV; + + prop = dlpar_clone_drconf_property(dn); + if (!prop) { + of_node_put(dn); + return -ENODEV; + } + + p = prop->value; + num_lmbs = *p++; + lmbs = (struct of_drconf_cell *)p; + + for (i = 0; i < num_lmbs; i++) { + if (lmbs[i].drc_index == lmb->drc_index) { + lmbs[i].flags = lmb->flags; + lmbs[i].aa_index = lmb->aa_index; + + dlpar_update_drconf_property(dn, prop); + break; + } + } + + of_node_put(dn); + return 0; +} + +static u32 lookup_lmb_associativity_index(struct of_drconf_cell *lmb) +{ + struct device_node *parent, *lmb_node, *dr_node; + const u32 *lmb_assoc; + const u32 *assoc_arrays; + u32 aa_index; + int aa_arrays, aa_array_entries, aa_array_sz; + int i; + + parent = of_find_node_by_path("/"); + if (!parent) + return -ENODEV; + + lmb_node = dlpar_configure_connector(cpu_to_be32(lmb->drc_index), + parent); + of_node_put(parent); + if (!lmb_node) + return -EINVAL; + + lmb_assoc = of_get_property(lmb_node, "ibm,associativity", NULL); + if (!lmb_assoc) { + dlpar_free_cc_nodes(lmb_node); + return -ENODEV; + } + + dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (!dr_node) { + dlpar_free_cc_nodes(lmb_node); + return -ENODEV; + } + + assoc_arrays = of_get_property(dr_node, + "ibm,associativity-lookup-arrays", + NULL); + of_node_put(dr_node); + if (!assoc_arrays) { + dlpar_free_cc_nodes(lmb_node); + return -ENODEV; + } + + /* The ibm,associativity-lookup-arrays property is defined to be + * a 32-bit value specifying the number of associativity arrays + * followed by a 32-bitvalue specifying the number of entries per + * array, followed by the associativity arrays. + */ + aa_arrays = be32_to_cpu(assoc_arrays[0]); + aa_array_entries = be32_to_cpu(assoc_arrays[1]); + aa_array_sz = aa_array_entries * sizeof(u32); + + aa_index = -1; + for (i = 0; i < aa_arrays; i++) { + int indx = (i * aa_array_entries) + 2; + + if (memcmp(&assoc_arrays[indx], &lmb_assoc[1], aa_array_sz)) + continue; + + aa_index = i; + break; + } + + dlpar_free_cc_nodes(lmb_node); + return aa_index; +} + +static int dlpar_add_device_tree_lmb(struct of_drconf_cell *lmb) +{ + int aa_index; + + lmb->flags |= DRCONF_MEM_ASSIGNED; + + aa_index = lookup_lmb_associativity_index(lmb); + if (aa_index < 0) { + pr_err("Couldn't find associativity index for drc index %x\n", + lmb->drc_index); + return aa_index; + } + + lmb->aa_index = aa_index; + return dlpar_update_device_tree_lmb(lmb); +} + +static int dlpar_remove_device_tree_lmb(struct of_drconf_cell *lmb) +{ + lmb->flags &= ~DRCONF_MEM_ASSIGNED; + lmb->aa_index = 0xffffffff; + return dlpar_update_device_tree_lmb(lmb); +} + static struct memory_block *lmb_to_memblock(struct of_drconf_cell *lmb) { unsigned long section_nr; @@ -243,8 +392,8 @@ static int dlpar_remove_lmb(struct of_drconf_cell *lmb) memblock_remove(lmb->base_addr, block_sz); dlpar_release_drc(lmb->drc_index); + dlpar_remove_device_tree_lmb(lmb); - lmb->flags &= ~DRCONF_MEM_ASSIGNED; return 0; } @@ -384,43 +533,32 @@ static int dlpar_memory_remove_by_index(u32 drc_index, struct property *prop) #endif /* CONFIG_MEMORY_HOTREMOVE */ -static int dlpar_add_lmb(struct of_drconf_cell *lmb) +static int dlpar_add_lmb_memory(struct of_drconf_cell *lmb) { struct memory_block *mem_block; unsigned long block_sz; int nid, rc; - if (lmb->flags & DRCONF_MEM_ASSIGNED) - return -EINVAL; - block_sz = memory_block_size_bytes(); - rc = dlpar_acquire_drc(lmb->drc_index); - if (rc) - return rc; - /* Find the node id for this address */ nid = memory_add_physaddr_to_nid(lmb->base_addr); /* Add the memory */ rc = add_memory(nid, lmb->base_addr, block_sz); - if (rc) { - dlpar_release_drc(lmb->drc_index); + if (rc) return rc; - } /* Register this block of memory */ rc = memblock_add(lmb->base_addr, block_sz); if (rc) { remove_memory(nid, lmb->base_addr, block_sz); - dlpar_release_drc(lmb->drc_index); return rc; } mem_block = lmb_to_memblock(lmb); if (!mem_block) { remove_memory(nid, lmb->base_addr, block_sz); - dlpar_release_drc(lmb->drc_index); return -EINVAL; } @@ -428,7 +566,6 @@ static int dlpar_add_lmb(struct of_drconf_cell *lmb) put_device(&mem_block->dev); if (rc) { remove_memory(nid, lmb->base_addr, block_sz); - dlpar_release_drc(lmb->drc_index); return rc; } @@ -436,6 +573,34 @@ static int dlpar_add_lmb(struct of_drconf_cell *lmb) return 0; } +static int dlpar_add_lmb(struct of_drconf_cell *lmb) +{ + int rc; + + if (lmb->flags & DRCONF_MEM_ASSIGNED) + return -EINVAL; + + rc = dlpar_acquire_drc(lmb->drc_index); + if (rc) + return rc; + + rc = dlpar_add_device_tree_lmb(lmb); + if (rc) { + pr_err("Couldn't update device tree for drc index %x\n", + lmb->drc_index); + dlpar_release_drc(lmb->drc_index); + return rc; + } + + rc = dlpar_add_lmb_memory(lmb); + if (rc) { + dlpar_remove_device_tree_lmb(lmb); + dlpar_release_drc(lmb->drc_index); + } + + return rc; +} + static int dlpar_memory_add_by_count(u32 lmbs_to_add, struct property *prop) { struct of_drconf_cell *lmbs; @@ -536,31 +701,6 @@ static int dlpar_memory_add_by_index(u32 drc_index, struct property *prop) return rc; } -static void dlpar_update_drconf_property(struct device_node *dn, - struct property *prop) -{ - struct of_drconf_cell *lmbs; - u32 num_lmbs, *p; - int i; - - /* Convert the property back to BE */ - p = prop->value; - num_lmbs = *p; - *p = cpu_to_be32(*p); - p++; - - lmbs = (struct of_drconf_cell *)p; - for (i = 0; i < num_lmbs; i++) { - lmbs[i].base_addr = cpu_to_be64(lmbs[i].base_addr); - lmbs[i].drc_index = cpu_to_be32(lmbs[i].drc_index); - lmbs[i].flags = cpu_to_be32(lmbs[i].flags); - } - - rtas_hp_event = true; - of_update_property(dn, prop); - rtas_hp_event = false; -} - int dlpar_memory(struct pseries_hp_errorlog *hp_elog) { struct device_node *dn; @@ -608,10 +748,7 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog) break; } - if (rc) - dlpar_free_drconf_property(prop); - else - dlpar_update_drconf_property(dn, prop); + dlpar_free_drconf_property(prop); dlpar_memory_out: of_node_put(dn); diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 2415a0d31..7f6100d91 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -89,18 +89,21 @@ void vpa_init(int cpu) "%lx failed with %ld\n", cpu, hwcpu, addr, ret); return; } + +#ifdef CONFIG_PPC_STD_MMU_64 /* * PAPR says this feature is SLB-Buffer but firmware never * reports that. All SPLPAR support SLB shadow buffer. */ - addr = __pa(paca[cpu].slb_shadow_ptr); - if (firmware_has_feature(FW_FEATURE_SPLPAR)) { + if (!radix_enabled() && firmware_has_feature(FW_FEATURE_SPLPAR)) { + addr = __pa(paca[cpu].slb_shadow_ptr); ret = register_slb_shadow(hwcpu, addr); if (ret) pr_err("WARNING: SLB shadow buffer registration for " "cpu %d (hw %d) of area %lx failed with %ld\n", cpu, hwcpu, addr, ret); } +#endif /* CONFIG_PPC_STD_MMU_64 */ /* * Register dispatch trace log, if one has been allocated. @@ -123,6 +126,8 @@ void vpa_init(int cpu) } } +#ifdef CONFIG_PPC_STD_MMU_64 + static long pSeries_lpar_hpte_insert(unsigned long hpte_group, unsigned long vpn, unsigned long pa, unsigned long rflags, unsigned long vflags, @@ -139,7 +144,7 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group, hpte_group, vpn, pa, rflags, vflags, psize); hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; - hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; + hpte_r = hpte_encode_r(pa, psize, apsize, ssize) | rflags; if (!(vflags & HPTE_V_BOLTED)) pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r); @@ -152,10 +157,6 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group, /* Exact = 0 */ flags = 0; - /* Make pHyp happy */ - if ((rflags & _PAGE_NO_CACHE) && !(rflags & _PAGE_WRITETHRU)) - hpte_r &= ~HPTE_R_M; - if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N)) flags |= H_COALESCE_CAND; @@ -659,6 +660,8 @@ static void pSeries_set_page_state(struct page *page, int order, void arch_free_page(struct page *page, int order) { + if (radix_enabled()) + return; if (!cmo_free_hint_flag || !firmware_has_feature(FW_FEATURE_CMO)) return; @@ -666,7 +669,8 @@ void arch_free_page(struct page *page, int order) } EXPORT_SYMBOL(arch_free_page); -#endif +#endif /* CONFIG_PPC_SMLPAR */ +#endif /* CONFIG_PPC_STD_MMU_64 */ #ifdef CONFIG_TRACEPOINTS #ifdef HAVE_JUMP_LABEL diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index c9fecf09b..afa05a2cb 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -484,8 +484,9 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v) seq_printf(m, "shared_processor_mode=%d\n", lppaca_shared_proc(get_lppaca())); +#ifdef CONFIG_PPC_STD_MMU_64 seq_printf(m, "slb_size=%d\n", mmu_slb_size); - +#endif parse_em_data(m); return 0; diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index ceb18d349..a560a98bc 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -191,8 +191,8 @@ static int update_dt_node(__be32 phandle, s32 scope) break; case 0x80000000: - prop = of_find_property(dn, prop_name, NULL); - of_remove_property(dn, prop); + of_remove_property(dn, of_find_property(dn, + prop_name, NULL)); prop = NULL; break; diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 272e9ec1a..543a6386f 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -305,7 +305,7 @@ static int msi_quota_for_device(struct pci_dev *dev, int request) memset(&counts, 0, sizeof(struct msi_counts)); /* Work out how many devices we have below this PE */ - traverse_pci_devices(pe_dn, count_non_bridge_devices, &counts); + pci_traverse_device_nodes(pe_dn, count_non_bridge_devices, &counts); if (counts.num_devices == 0) { pr_err("rtas_msi: found 0 devices under PE for %s\n", @@ -320,7 +320,7 @@ static int msi_quota_for_device(struct pci_dev *dev, int request) /* else, we have some more calculating to do */ counts.requestor = pci_device_to_OF_node(dev); counts.request = request; - traverse_pci_devices(pe_dn, count_spare_msis, &counts); + pci_traverse_device_nodes(pe_dn, count_spare_msis, &counts); /* If the quota isn't an integer multiple of the total, we can * use the remainder as spare MSIs for anyone that wants them. */ diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index 5d4a3df59..906dbaa97 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -34,38 +34,6 @@ #include "pseries.h" -static struct pci_bus * -find_bus_among_children(struct pci_bus *bus, - struct device_node *dn) -{ - struct pci_bus *child = NULL; - struct pci_bus *tmp; - struct device_node *busdn; - - busdn = pci_bus_to_OF_node(bus); - if (busdn == dn) - return bus; - - list_for_each_entry(tmp, &bus->children, node) { - child = find_bus_among_children(tmp, dn); - if (child) - break; - }; - return child; -} - -struct pci_bus * -pcibios_find_pci_bus(struct device_node *dn) -{ - struct pci_dn *pdn = dn->data; - - if (!pdn || !pdn->phb || !pdn->phb->bus) - return NULL; - - return find_bus_among_children(pdn->phb->bus, dn); -} -EXPORT_SYMBOL_GPL(pcibios_find_pci_bus); - struct pci_controller *init_phb_dynamic(struct device_node *dn) { struct pci_controller *phb; diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c index 7c7fcc042..cc66c49f0 100644 --- a/arch/powerpc/platforms/pseries/reconfig.c +++ b/arch/powerpc/platforms/pseries/reconfig.c @@ -303,7 +303,6 @@ static int do_remove_property(char *buf, size_t bufsize) { struct device_node *np; char *tmp; - struct property *prop; buf = parse_node(buf, bufsize, &np); if (!np) @@ -316,9 +315,7 @@ static int do_remove_property(char *buf, size_t bufsize) if (strlen(buf) == 0) return -EINVAL; - prop = of_find_property(np, buf, NULL); - - return of_remove_property(np, prop); + return of_remove_property(np, of_find_property(np, buf, NULL)); } static int do_update_property(char *buf, size_t bufsize) diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 6e944fc6e..9883bc7ea 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -235,6 +235,8 @@ static void __init pseries_discover_pic(void) for_each_node_by_name(np, "interrupt-controller") { typep = of_get_property(np, "compatible", NULL); + if (!typep) + continue; if (strstr(typep, "open-pic")) { pSeries_mpic_node = of_node_get(np); ppc_md.init_IRQ = pseries_mpic_init_IRQ; @@ -265,7 +267,7 @@ static int pci_dn_reconfig_notifier(struct notifier_block *nb, unsigned long act pdn = parent ? PCI_DN(parent) : NULL; if (pdn) { /* Create pdn and EEH device */ - update_dn_pci_info(np, pdn->phb); + pci_add_device_node_info(pdn->phb, np); eeh_dev_init(PCI_DN(np), pdn->phb); } diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index 0d112b94d..ff75d70f7 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -143,7 +143,7 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio) */ static long axon_ram_direct_access(struct block_device *device, sector_t sector, - void __pmem **kaddr, pfn_t *pfn) + void __pmem **kaddr, pfn_t *pfn, long size) { struct axon_ram_bank *bank = device->bd_disk->private_data; loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT; diff --git a/arch/powerpc/sysdev/cpm1.c b/arch/powerpc/sysdev/cpm1.c index 8ed65365b..6c110994d 100644 --- a/arch/powerpc/sysdev/cpm1.c +++ b/arch/powerpc/sysdev/cpm1.c @@ -532,15 +532,9 @@ struct cpm1_gpio16_chip { u16 cpdata; }; -static inline struct cpm1_gpio16_chip * -to_cpm1_gpio16_chip(struct of_mm_gpio_chip *mm_gc) -{ - return container_of(mm_gc, struct cpm1_gpio16_chip, mm_gc); -} - static void cpm1_gpio16_save_regs(struct of_mm_gpio_chip *mm_gc) { - struct cpm1_gpio16_chip *cpm1_gc = to_cpm1_gpio16_chip(mm_gc); + struct cpm1_gpio16_chip *cpm1_gc = gpiochip_get_data(&mm_gc->gc); struct cpm_ioport16 __iomem *iop = mm_gc->regs; cpm1_gc->cpdata = in_be16(&iop->dat); @@ -560,7 +554,7 @@ static int cpm1_gpio16_get(struct gpio_chip *gc, unsigned int gpio) static void __cpm1_gpio16_set(struct of_mm_gpio_chip *mm_gc, u16 pin_mask, int value) { - struct cpm1_gpio16_chip *cpm1_gc = to_cpm1_gpio16_chip(mm_gc); + struct cpm1_gpio16_chip *cpm1_gc = gpiochip_get_data(&mm_gc->gc); struct cpm_ioport16 __iomem *iop = mm_gc->regs; if (value) @@ -574,7 +568,7 @@ static void __cpm1_gpio16_set(struct of_mm_gpio_chip *mm_gc, u16 pin_mask, static void cpm1_gpio16_set(struct gpio_chip *gc, unsigned int gpio, int value) { struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct cpm1_gpio16_chip *cpm1_gc = to_cpm1_gpio16_chip(mm_gc); + struct cpm1_gpio16_chip *cpm1_gc = gpiochip_get_data(&mm_gc->gc); unsigned long flags; u16 pin_mask = 1 << (15 - gpio); @@ -588,7 +582,7 @@ static void cpm1_gpio16_set(struct gpio_chip *gc, unsigned int gpio, int value) static int cpm1_gpio16_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) { struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct cpm1_gpio16_chip *cpm1_gc = to_cpm1_gpio16_chip(mm_gc); + struct cpm1_gpio16_chip *cpm1_gc = gpiochip_get_data(&mm_gc->gc); struct cpm_ioport16 __iomem *iop = mm_gc->regs; unsigned long flags; u16 pin_mask = 1 << (15 - gpio); @@ -606,7 +600,7 @@ static int cpm1_gpio16_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) static int cpm1_gpio16_dir_in(struct gpio_chip *gc, unsigned int gpio) { struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct cpm1_gpio16_chip *cpm1_gc = to_cpm1_gpio16_chip(mm_gc); + struct cpm1_gpio16_chip *cpm1_gc = gpiochip_get_data(&mm_gc->gc); struct cpm_ioport16 __iomem *iop = mm_gc->regs; unsigned long flags; u16 pin_mask = 1 << (15 - gpio); @@ -642,7 +636,7 @@ int cpm1_gpiochip_add16(struct device_node *np) gc->get = cpm1_gpio16_get; gc->set = cpm1_gpio16_set; - return of_mm_gpiochip_add(np, mm_gc); + return of_mm_gpiochip_add_data(np, mm_gc, cpm1_gc); } struct cpm1_gpio32_chip { @@ -653,15 +647,9 @@ struct cpm1_gpio32_chip { u32 cpdata; }; -static inline struct cpm1_gpio32_chip * -to_cpm1_gpio32_chip(struct of_mm_gpio_chip *mm_gc) -{ - return container_of(mm_gc, struct cpm1_gpio32_chip, mm_gc); -} - static void cpm1_gpio32_save_regs(struct of_mm_gpio_chip *mm_gc) { - struct cpm1_gpio32_chip *cpm1_gc = to_cpm1_gpio32_chip(mm_gc); + struct cpm1_gpio32_chip *cpm1_gc = gpiochip_get_data(&mm_gc->gc); struct cpm_ioport32b __iomem *iop = mm_gc->regs; cpm1_gc->cpdata = in_be32(&iop->dat); @@ -681,7 +669,7 @@ static int cpm1_gpio32_get(struct gpio_chip *gc, unsigned int gpio) static void __cpm1_gpio32_set(struct of_mm_gpio_chip *mm_gc, u32 pin_mask, int value) { - struct cpm1_gpio32_chip *cpm1_gc = to_cpm1_gpio32_chip(mm_gc); + struct cpm1_gpio32_chip *cpm1_gc = gpiochip_get_data(&mm_gc->gc); struct cpm_ioport32b __iomem *iop = mm_gc->regs; if (value) @@ -695,7 +683,7 @@ static void __cpm1_gpio32_set(struct of_mm_gpio_chip *mm_gc, u32 pin_mask, static void cpm1_gpio32_set(struct gpio_chip *gc, unsigned int gpio, int value) { struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct cpm1_gpio32_chip *cpm1_gc = to_cpm1_gpio32_chip(mm_gc); + struct cpm1_gpio32_chip *cpm1_gc = gpiochip_get_data(&mm_gc->gc); unsigned long flags; u32 pin_mask = 1 << (31 - gpio); @@ -709,7 +697,7 @@ static void cpm1_gpio32_set(struct gpio_chip *gc, unsigned int gpio, int value) static int cpm1_gpio32_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) { struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct cpm1_gpio32_chip *cpm1_gc = to_cpm1_gpio32_chip(mm_gc); + struct cpm1_gpio32_chip *cpm1_gc = gpiochip_get_data(&mm_gc->gc); struct cpm_ioport32b __iomem *iop = mm_gc->regs; unsigned long flags; u32 pin_mask = 1 << (31 - gpio); @@ -727,7 +715,7 @@ static int cpm1_gpio32_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) static int cpm1_gpio32_dir_in(struct gpio_chip *gc, unsigned int gpio) { struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct cpm1_gpio32_chip *cpm1_gc = to_cpm1_gpio32_chip(mm_gc); + struct cpm1_gpio32_chip *cpm1_gc = gpiochip_get_data(&mm_gc->gc); struct cpm_ioport32b __iomem *iop = mm_gc->regs; unsigned long flags; u32 pin_mask = 1 << (31 - gpio); @@ -763,7 +751,7 @@ int cpm1_gpiochip_add32(struct device_node *np) gc->get = cpm1_gpio32_get; gc->set = cpm1_gpio32_set; - return of_mm_gpiochip_add(np, mm_gc); + return of_mm_gpiochip_add_data(np, mm_gc, cpm1_gc); } static int cpm_init_par_io(void) diff --git a/arch/powerpc/sysdev/cpm_common.c b/arch/powerpc/sysdev/cpm_common.c index 9d32465ed..0ac12e5fd 100644 --- a/arch/powerpc/sysdev/cpm_common.c +++ b/arch/powerpc/sysdev/cpm_common.c @@ -80,15 +80,9 @@ struct cpm2_gpio32_chip { u32 cpdata; }; -static inline struct cpm2_gpio32_chip * -to_cpm2_gpio32_chip(struct of_mm_gpio_chip *mm_gc) -{ - return container_of(mm_gc, struct cpm2_gpio32_chip, mm_gc); -} - static void cpm2_gpio32_save_regs(struct of_mm_gpio_chip *mm_gc) { - struct cpm2_gpio32_chip *cpm2_gc = to_cpm2_gpio32_chip(mm_gc); + struct cpm2_gpio32_chip *cpm2_gc = gpiochip_get_data(&mm_gc->gc); struct cpm2_ioports __iomem *iop = mm_gc->regs; cpm2_gc->cpdata = in_be32(&iop->dat); @@ -108,7 +102,7 @@ static int cpm2_gpio32_get(struct gpio_chip *gc, unsigned int gpio) static void __cpm2_gpio32_set(struct of_mm_gpio_chip *mm_gc, u32 pin_mask, int value) { - struct cpm2_gpio32_chip *cpm2_gc = to_cpm2_gpio32_chip(mm_gc); + struct cpm2_gpio32_chip *cpm2_gc = gpiochip_get_data(&mm_gc->gc); struct cpm2_ioports __iomem *iop = mm_gc->regs; if (value) @@ -122,7 +116,7 @@ static void __cpm2_gpio32_set(struct of_mm_gpio_chip *mm_gc, u32 pin_mask, static void cpm2_gpio32_set(struct gpio_chip *gc, unsigned int gpio, int value) { struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct cpm2_gpio32_chip *cpm2_gc = to_cpm2_gpio32_chip(mm_gc); + struct cpm2_gpio32_chip *cpm2_gc = gpiochip_get_data(gc); unsigned long flags; u32 pin_mask = 1 << (31 - gpio); @@ -136,7 +130,7 @@ static void cpm2_gpio32_set(struct gpio_chip *gc, unsigned int gpio, int value) static int cpm2_gpio32_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) { struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct cpm2_gpio32_chip *cpm2_gc = to_cpm2_gpio32_chip(mm_gc); + struct cpm2_gpio32_chip *cpm2_gc = gpiochip_get_data(gc); struct cpm2_ioports __iomem *iop = mm_gc->regs; unsigned long flags; u32 pin_mask = 1 << (31 - gpio); @@ -154,7 +148,7 @@ static int cpm2_gpio32_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) static int cpm2_gpio32_dir_in(struct gpio_chip *gc, unsigned int gpio) { struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct cpm2_gpio32_chip *cpm2_gc = to_cpm2_gpio32_chip(mm_gc); + struct cpm2_gpio32_chip *cpm2_gc = gpiochip_get_data(gc); struct cpm2_ioports __iomem *iop = mm_gc->regs; unsigned long flags; u32 pin_mask = 1 << (31 - gpio); @@ -190,6 +184,6 @@ int cpm2_gpiochip_add32(struct device_node *np) gc->get = cpm2_gpio32_get; gc->set = cpm2_gpio32_set; - return of_mm_gpiochip_add(np, mm_gc); + return of_mm_gpiochip_add_data(np, mm_gc, cpm2_gc); } #endif /* CONFIG_CPM2 || CONFIG_8xx_GPIO */ diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c index 85729f497..0ef9df49f 100644 --- a/arch/powerpc/sysdev/fsl_pci.c +++ b/arch/powerpc/sysdev/fsl_pci.c @@ -37,6 +37,7 @@ #include <asm/pci-bridge.h> #include <asm/ppc-pci.h> #include <asm/machdep.h> +#include <asm/mpc85xx.h> #include <asm/disassemble.h> #include <asm/ppc-opcode.h> #include <sysdev/fsl_soc.h> @@ -527,6 +528,8 @@ int fsl_add_bridge(struct platform_device *pdev, int is_primary) u8 hdr_type, progif; struct device_node *dev; struct ccsr_pci __iomem *pci; + u16 temp; + u32 svr = mfspr(SPRN_SVR); dev = pdev->dev.of_node; @@ -596,6 +599,27 @@ int fsl_add_bridge(struct platform_device *pdev, int is_primary) PPC_INDIRECT_TYPE_SURPRESS_PRIMARY_BUS; if (fsl_pcie_check_link(hose)) hose->indirect_type |= PPC_INDIRECT_TYPE_NO_PCIE_LINK; + } else { + /* + * Set PBFR(PCI Bus Function Register)[10] = 1 to + * disable the combining of crossing cacheline + * boundary requests into one burst transaction. + * PCI-X operation is not affected. + * Fix erratum PCI 5 on MPC8548 + */ +#define PCI_BUS_FUNCTION 0x44 +#define PCI_BUS_FUNCTION_MDS 0x400 /* Master disable streaming */ + if (((SVR_SOC_VER(svr) == SVR_8543) || + (SVR_SOC_VER(svr) == SVR_8545) || + (SVR_SOC_VER(svr) == SVR_8547) || + (SVR_SOC_VER(svr) == SVR_8548)) && + !early_find_capability(hose, 0, 0, PCI_CAP_ID_PCIX)) { + early_read_config_word(hose, 0, 0, + PCI_BUS_FUNCTION, &temp); + temp |= PCI_BUS_FUNCTION_MDS; + early_write_config_word(hose, 0, 0, + PCI_BUS_FUNCTION, temp); + } } printk(KERN_INFO "Found FSL PCI host bridge at 0x%016llx. " diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index afe3c7cd3..7de45b2df 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -2004,8 +2004,15 @@ static struct syscore_ops mpic_syscore_ops = { static int mpic_init_sys(void) { + int rc; + register_syscore_ops(&mpic_syscore_ops); - subsys_system_register(&mpic_subsys, NULL); + rc = subsys_system_register(&mpic_subsys, NULL); + if (rc) { + unregister_syscore_ops(&mpic_syscore_ops); + pr_err("mpic: Failed to register subsystem!\n"); + return rc; + } return 0; } diff --git a/arch/powerpc/sysdev/ppc4xx_gpio.c b/arch/powerpc/sysdev/ppc4xx_gpio.c index d7a7ef135..5382d04dd 100644 --- a/arch/powerpc/sysdev/ppc4xx_gpio.c +++ b/arch/powerpc/sysdev/ppc4xx_gpio.c @@ -27,7 +27,7 @@ #include <linux/io.h> #include <linux/of.h> #include <linux/of_gpio.h> -#include <linux/gpio.h> +#include <linux/gpio/driver.h> #include <linux/types.h> #include <linux/slab.h> @@ -67,12 +67,6 @@ struct ppc4xx_gpio_chip { * There are a maximum of 32 gpios in each gpio controller. */ -static inline struct ppc4xx_gpio_chip * -to_ppc4xx_gpiochip(struct of_mm_gpio_chip *mm_gc) -{ - return container_of(mm_gc, struct ppc4xx_gpio_chip, mm_gc); -} - static int ppc4xx_gpio_get(struct gpio_chip *gc, unsigned int gpio) { struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); @@ -96,8 +90,7 @@ __ppc4xx_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val) static void ppc4xx_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val) { - struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct ppc4xx_gpio_chip *chip = to_ppc4xx_gpiochip(mm_gc); + struct ppc4xx_gpio_chip *chip = gpiochip_get_data(gc); unsigned long flags; spin_lock_irqsave(&chip->lock, flags); @@ -112,7 +105,7 @@ ppc4xx_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val) static int ppc4xx_gpio_dir_in(struct gpio_chip *gc, unsigned int gpio) { struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct ppc4xx_gpio_chip *chip = to_ppc4xx_gpiochip(mm_gc); + struct ppc4xx_gpio_chip *chip = gpiochip_get_data(gc); struct ppc4xx_gpio __iomem *regs = mm_gc->regs; unsigned long flags; @@ -142,7 +135,7 @@ static int ppc4xx_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) { struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct ppc4xx_gpio_chip *chip = to_ppc4xx_gpiochip(mm_gc); + struct ppc4xx_gpio_chip *chip = gpiochip_get_data(gc); struct ppc4xx_gpio __iomem *regs = mm_gc->regs; unsigned long flags; @@ -200,7 +193,7 @@ static int __init ppc4xx_add_gpiochips(void) gc->get = ppc4xx_gpio_get; gc->set = ppc4xx_gpio_set; - ret = of_mm_gpiochip_add(np, mm_gc); + ret = of_mm_gpiochip_add_data(np, mm_gc, ppc4xx_gc); if (ret) goto err; continue; diff --git a/arch/powerpc/sysdev/simple_gpio.c b/arch/powerpc/sysdev/simple_gpio.c index 56ce8ca32..ef470b470 100644 --- a/arch/powerpc/sysdev/simple_gpio.c +++ b/arch/powerpc/sysdev/simple_gpio.c @@ -19,7 +19,7 @@ #include <linux/io.h> #include <linux/of.h> #include <linux/of_gpio.h> -#include <linux/gpio.h> +#include <linux/gpio/driver.h> #include <linux/slab.h> #include <asm/prom.h> #include "simple_gpio.h" @@ -32,11 +32,6 @@ struct u8_gpio_chip { u8 data; }; -static struct u8_gpio_chip *to_u8_gpio_chip(struct of_mm_gpio_chip *mm_gc) -{ - return container_of(mm_gc, struct u8_gpio_chip, mm_gc); -} - static u8 u8_pin2mask(unsigned int pin) { return 1 << (8 - 1 - pin); @@ -52,7 +47,7 @@ static int u8_gpio_get(struct gpio_chip *gc, unsigned int gpio) static void u8_gpio_set(struct gpio_chip *gc, unsigned int gpio, int val) { struct of_mm_gpio_chip *mm_gc = to_of_mm_gpio_chip(gc); - struct u8_gpio_chip *u8_gc = to_u8_gpio_chip(mm_gc); + struct u8_gpio_chip *u8_gc = gpiochip_get_data(gc); unsigned long flags; spin_lock_irqsave(&u8_gc->lock, flags); @@ -80,7 +75,7 @@ static int u8_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) static void u8_gpio_save_regs(struct of_mm_gpio_chip *mm_gc) { - struct u8_gpio_chip *u8_gc = to_u8_gpio_chip(mm_gc); + struct u8_gpio_chip *u8_gc = gpiochip_get_data(&mm_gc->gc); u8_gc->data = in_8(mm_gc->regs); } @@ -108,7 +103,7 @@ static int __init u8_simple_gpiochip_add(struct device_node *np) gc->get = u8_gpio_get; gc->set = u8_gpio_set; - ret = of_mm_gpiochip_add(np, mm_gc); + ret = of_mm_gpiochip_add_data(np, mm_gc, u8_gc); if (ret) goto err; return 0; diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile index 436062dbb..0b2f77159 100644 --- a/arch/powerpc/xmon/Makefile +++ b/arch/powerpc/xmon/Makefile @@ -7,7 +7,7 @@ UBSAN_SANITIZE := n ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) -obj-y += xmon.o nonstdio.o +obj-y += xmon.o nonstdio.o spr_access.o ifdef CONFIG_XMON_DISASSEMBLY obj-y += ppc-dis.o ppc-opc.o diff --git a/arch/powerpc/xmon/spr_access.S b/arch/powerpc/xmon/spr_access.S new file mode 100644 index 000000000..84ad74213 --- /dev/null +++ b/arch/powerpc/xmon/spr_access.S @@ -0,0 +1,45 @@ +#include <asm/ppc_asm.h> + +/* unsigned long xmon_mfspr(sprn, default_value) */ +_GLOBAL(xmon_mfspr) + ld r5, .Lmfspr_table@got(r2) + b xmon_mxspr + +/* void xmon_mtspr(sprn, new_value) */ +_GLOBAL(xmon_mtspr) + ld r5, .Lmtspr_table@got(r2) + b xmon_mxspr + +/* + * r3 = sprn + * r4 = default or new value + * r5 = table base + */ +xmon_mxspr: + /* + * To index into the table of mxsprs we need: + * i = (sprn & 0x3ff) * 8 + * or using rwlinm: + * i = (sprn << 3) & (0x3ff << 3) + */ + rlwinm r3, r3, 3, 0x3ff << 3 + add r5, r5, r3 + mtctr r5 + mr r3, r4 /* put default_value in r3 for mfspr */ + bctr + +.Lmfspr_table: + spr = 0 + .rept 1024 + mfspr r3, spr + blr + spr = spr + 1 + .endr + +.Lmtspr_table: + spr = 0 + .rept 1024 + mtspr spr, r4 + blr + spr = spr + 1 + .endr diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 942796fa4..c5e155108 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -86,6 +86,7 @@ static char tmpstr[128]; static long bus_error_jmp[JMP_BUF_LEN]; static int catch_memory_errors; +static int catch_spr_faults; static long *xmon_fault_jmp[NR_CPUS]; /* Breakpoint stuff */ @@ -147,7 +148,7 @@ void getstring(char *, int); static void flush_input(void); static int inchar(void); static void take_input(char *); -static unsigned long read_spr(int); +static int read_spr(int, unsigned long *); static void write_spr(int, unsigned long); static void super_regs(void); static void remove_bpts(void); @@ -250,6 +251,9 @@ Commands:\n\ sdi # disassemble spu local store for spu # (in hex)\n" #endif " S print special registers\n\ + Sa print all SPRs\n\ + Sr # read SPR #\n\ + Sw #v write v to SPR #\n\ t print backtrace\n\ x exit monitor and recover\n\ X exit monitor and don't recover\n" @@ -442,6 +446,12 @@ static int xmon_core(struct pt_regs *regs, int fromipi) #ifdef CONFIG_SMP cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, &cpus_in_xmon)) { + /* + * We catch SPR read/write faults here because the 0x700, 0xf60 + * etc. handlers don't call debugger_fault_handler(). + */ + if (catch_spr_faults) + longjmp(bus_error_jmp, 1); get_output_lock(); excprint(regs); printf("cpu 0x%x: Exception %lx %s in xmon, " @@ -1635,89 +1645,87 @@ static void cacheflush(void) catch_memory_errors = 0; } -static unsigned long -read_spr(int n) +extern unsigned long xmon_mfspr(int spr, unsigned long default_value); +extern void xmon_mtspr(int spr, unsigned long value); + +static int +read_spr(int n, unsigned long *vp) { - unsigned int instrs[2]; - unsigned long (*code)(void); unsigned long ret = -1UL; -#ifdef CONFIG_PPC64 - unsigned long opd[3]; - - opd[0] = (unsigned long)instrs; - opd[1] = 0; - opd[2] = 0; - code = (unsigned long (*)(void)) opd; -#else - code = (unsigned long (*)(void)) instrs; -#endif - - /* mfspr r3,n; blr */ - instrs[0] = 0x7c6002a6 + ((n & 0x1F) << 16) + ((n & 0x3e0) << 6); - instrs[1] = 0x4e800020; - store_inst(instrs); - store_inst(instrs+1); + int ok = 0; if (setjmp(bus_error_jmp) == 0) { - catch_memory_errors = 1; + catch_spr_faults = 1; sync(); - ret = code(); + ret = xmon_mfspr(n, *vp); sync(); - /* wait a little while to see if we get a machine check */ - __delay(200); - n = size; + *vp = ret; + ok = 1; } + catch_spr_faults = 0; - return ret; + return ok; } static void write_spr(int n, unsigned long val) { - unsigned int instrs[2]; - unsigned long (*code)(unsigned long); -#ifdef CONFIG_PPC64 - unsigned long opd[3]; - - opd[0] = (unsigned long)instrs; - opd[1] = 0; - opd[2] = 0; - code = (unsigned long (*)(unsigned long)) opd; -#else - code = (unsigned long (*)(unsigned long)) instrs; -#endif - - instrs[0] = 0x7c6003a6 + ((n & 0x1F) << 16) + ((n & 0x3e0) << 6); - instrs[1] = 0x4e800020; - store_inst(instrs); - store_inst(instrs+1); - if (setjmp(bus_error_jmp) == 0) { - catch_memory_errors = 1; + catch_spr_faults = 1; sync(); - code(val); + xmon_mtspr(n, val); sync(); - /* wait a little while to see if we get a machine check */ - __delay(200); - n = size; + } else { + printf("SPR 0x%03x (%4d) Faulted during write\n", n, n); } + catch_spr_faults = 0; } static unsigned long regno; extern char exc_prolog; extern char dec_exc; +static void dump_one_spr(int spr, bool show_unimplemented) +{ + unsigned long val; + + val = 0xdeadbeef; + if (!read_spr(spr, &val)) { + printf("SPR 0x%03x (%4d) Faulted during read\n", spr, spr); + return; + } + + if (val == 0xdeadbeef) { + /* Looks like read was a nop, confirm */ + val = 0x0badcafe; + if (!read_spr(spr, &val)) { + printf("SPR 0x%03x (%4d) Faulted during read\n", spr, spr); + return; + } + + if (val == 0x0badcafe) { + if (show_unimplemented) + printf("SPR 0x%03x (%4d) Unimplemented\n", spr, spr); + return; + } + } + + printf("SPR 0x%03x (%4d) = 0x%lx\n", spr, spr, val); +} + static void super_regs(void) { int cmd; - unsigned long val; + int spr; cmd = skipbl(); - if (cmd == '\n') { + + switch (cmd) { + case '\n': { unsigned long sp, toc; asm("mr %0,1" : "=r" (sp) :); asm("mr %0,2" : "=r" (toc) :); @@ -1730,21 +1738,29 @@ static void super_regs(void) mfspr(SPRN_DEC), mfspr(SPRN_SPRG2)); printf("sp = "REG" sprg3= "REG"\n", sp, mfspr(SPRN_SPRG3)); printf("toc = "REG" dar = "REG"\n", toc, mfspr(SPRN_DAR)); - return; } - - scanhex(®no); - switch (cmd) { - case 'w': - val = read_spr(regno); + case 'w': { + unsigned long val; + scanhex(®no); + val = 0; + read_spr(regno, &val); scanhex(&val); write_spr(regno, val); - /* fall through */ + dump_one_spr(regno, true); + break; + } case 'r': - printf("spr %lx = %lx\n", regno, read_spr(regno)); + scanhex(®no); + dump_one_spr(regno, true); + break; + case 'a': + /* dump ALL SPRs */ + for (spr = 1; spr < 1024; ++spr) + dump_one_spr(spr, false); break; } + scannl(); } @@ -2913,7 +2929,7 @@ static void xmon_print_symbol(unsigned long address, const char *mid, printf("%s", after); } -#ifdef CONFIG_PPC_BOOK3S_64 +#ifdef CONFIG_PPC_STD_MMU_64 void dump_segments(void) { int i; |