diff options
Diffstat (limited to 'arch/powerpc/mm')
43 files changed, 18354 insertions, 0 deletions
diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c new file mode 100644 index 000000000..581096751 --- /dev/null +++ b/arch/powerpc/mm/40x_mmu.c @@ -0,0 +1,159 @@ +/* + * This file contains the routines for initializing the MMU + * on the 4xx series of chips. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/stddef.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/highmem.h> +#include <linux/memblock.h> + +#include <asm/pgalloc.h> +#include <asm/prom.h> +#include <asm/io.h> +#include <asm/mmu_context.h> +#include <asm/pgtable.h> +#include <asm/mmu.h> +#include <asm/uaccess.h> +#include <asm/smp.h> +#include <asm/bootx.h> +#include <asm/machdep.h> +#include <asm/setup.h> + +#include "mmu_decl.h" + +extern int __map_without_ltlbs; +/* + * MMU_init_hw does the chip-specific initialization of the MMU hardware. + */ +void __init MMU_init_hw(void) +{ + /* + * The Zone Protection Register (ZPR) defines how protection will + * be applied to every page which is a member of a given zone. At + * present, we utilize only two of the 4xx's zones. + * The zone index bits (of ZSEL) in the PTE are used for software + * indicators, except the LSB. For user access, zone 1 is used, + * for kernel access, zone 0 is used. We set all but zone 1 + * to zero, allowing only kernel access as indicated in the PTE. + * For zone 1, we set a 01 binary (a value of 10 will not work) + * to allow user access as indicated in the PTE. This also allows + * kernel access as indicated in the PTE. + */ + + mtspr(SPRN_ZPR, 0x10000000); + + flush_instruction_cache(); + + /* + * Set up the real-mode cache parameters for the exception vector + * handlers (which are run in real-mode). + */ + + mtspr(SPRN_DCWR, 0x00000000); /* All caching is write-back */ + + /* + * Cache instruction and data space where the exception + * vectors and the kernel live in real-mode. + */ + + mtspr(SPRN_DCCR, 0xFFFF0000); /* 2GByte of data space at 0x0. */ + mtspr(SPRN_ICCR, 0xFFFF0000); /* 2GByte of instr. space at 0x0. */ +} + +#define LARGE_PAGE_SIZE_16M (1<<24) +#define LARGE_PAGE_SIZE_4M (1<<22) + +unsigned long __init mmu_mapin_ram(unsigned long top) +{ + unsigned long v, s, mapped; + phys_addr_t p; + + v = KERNELBASE; + p = 0; + s = total_lowmem; + + if (__map_without_ltlbs) + return 0; + + while (s >= LARGE_PAGE_SIZE_16M) { + pmd_t *pmdp; + unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE; + + pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); + pmd_val(*pmdp++) = val; + pmd_val(*pmdp++) = val; + pmd_val(*pmdp++) = val; + pmd_val(*pmdp++) = val; + + v += LARGE_PAGE_SIZE_16M; + p += LARGE_PAGE_SIZE_16M; + s -= LARGE_PAGE_SIZE_16M; + } + + while (s >= LARGE_PAGE_SIZE_4M) { + pmd_t *pmdp; + unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE; + + pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); + pmd_val(*pmdp) = val; + + v += LARGE_PAGE_SIZE_4M; + p += LARGE_PAGE_SIZE_4M; + s -= LARGE_PAGE_SIZE_4M; + } + + mapped = total_lowmem - s; + + /* If the size of RAM is not an exact power of two, we may not + * have covered RAM in its entirety with 16 and 4 MiB + * pages. Consequently, restrict the top end of RAM currently + * allocable so that calls to the MEMBLOCK to allocate PTEs for "tail" + * coverage with normal-sized pages (or other reasons) do not + * attempt to allocate outside the allowed range. + */ + memblock_set_current_limit(mapped); + + return mapped; +} + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + + /* 40x can only access 16MB at the moment (see head_40x.S) */ + memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000)); +} diff --git a/arch/powerpc/mm/44x_mmu.c b/arch/powerpc/mm/44x_mmu.c new file mode 100644 index 000000000..82b1ff759 --- /dev/null +++ b/arch/powerpc/mm/44x_mmu.c @@ -0,0 +1,254 @@ +/* + * Modifications by Matt Porter (mporter@mvista.com) to support + * PPC44x Book E processors. + * + * This file contains the routines for initializing the MMU + * on the 4xx series of chips. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/init.h> +#include <linux/memblock.h> + +#include <asm/mmu.h> +#include <asm/page.h> +#include <asm/cacheflush.h> + +#include "mmu_decl.h" + +/* Used by the 44x TLB replacement exception handler. + * Just needed it declared someplace. + */ +unsigned int tlb_44x_index; /* = 0 */ +unsigned int tlb_44x_hwater = PPC44x_TLB_SIZE - 1 - PPC44x_EARLY_TLBS; +int icache_44x_need_flush; + +unsigned long tlb_47x_boltmap[1024/8]; + +static void ppc44x_update_tlb_hwater(void) +{ + extern unsigned int tlb_44x_patch_hwater_D[]; + extern unsigned int tlb_44x_patch_hwater_I[]; + + /* The TLB miss handlers hard codes the watermark in a cmpli + * instruction to improve performances rather than loading it + * from the global variable. Thus, we patch the instructions + * in the 2 TLB miss handlers when updating the value + */ + tlb_44x_patch_hwater_D[0] = (tlb_44x_patch_hwater_D[0] & 0xffff0000) | + tlb_44x_hwater; + flush_icache_range((unsigned long)&tlb_44x_patch_hwater_D[0], + (unsigned long)&tlb_44x_patch_hwater_D[1]); + tlb_44x_patch_hwater_I[0] = (tlb_44x_patch_hwater_I[0] & 0xffff0000) | + tlb_44x_hwater; + flush_icache_range((unsigned long)&tlb_44x_patch_hwater_I[0], + (unsigned long)&tlb_44x_patch_hwater_I[1]); +} + +/* + * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 44x type MMU + */ +static void __init ppc44x_pin_tlb(unsigned int virt, unsigned int phys) +{ + unsigned int entry = tlb_44x_hwater--; + + ppc44x_update_tlb_hwater(); + + mtspr(SPRN_MMUCR, 0); + + __asm__ __volatile__( + "tlbwe %2,%3,%4\n" + "tlbwe %1,%3,%5\n" + "tlbwe %0,%3,%6\n" + : + : "r" (PPC44x_TLB_SW | PPC44x_TLB_SR | PPC44x_TLB_SX | PPC44x_TLB_G), + "r" (phys), + "r" (virt | PPC44x_TLB_VALID | PPC44x_TLB_256M), + "r" (entry), + "i" (PPC44x_TLB_PAGEID), + "i" (PPC44x_TLB_XLAT), + "i" (PPC44x_TLB_ATTRIB)); +} + +static int __init ppc47x_find_free_bolted(void) +{ + unsigned int mmube0 = mfspr(SPRN_MMUBE0); + unsigned int mmube1 = mfspr(SPRN_MMUBE1); + + if (!(mmube0 & MMUBE0_VBE0)) + return 0; + if (!(mmube0 & MMUBE0_VBE1)) + return 1; + if (!(mmube0 & MMUBE0_VBE2)) + return 2; + if (!(mmube1 & MMUBE1_VBE3)) + return 3; + if (!(mmube1 & MMUBE1_VBE4)) + return 4; + if (!(mmube1 & MMUBE1_VBE5)) + return 5; + return -1; +} + +static void __init ppc47x_update_boltmap(void) +{ + unsigned int mmube0 = mfspr(SPRN_MMUBE0); + unsigned int mmube1 = mfspr(SPRN_MMUBE1); + + if (mmube0 & MMUBE0_VBE0) + __set_bit((mmube0 >> MMUBE0_IBE0_SHIFT) & 0xff, + tlb_47x_boltmap); + if (mmube0 & MMUBE0_VBE1) + __set_bit((mmube0 >> MMUBE0_IBE1_SHIFT) & 0xff, + tlb_47x_boltmap); + if (mmube0 & MMUBE0_VBE2) + __set_bit((mmube0 >> MMUBE0_IBE2_SHIFT) & 0xff, + tlb_47x_boltmap); + if (mmube1 & MMUBE1_VBE3) + __set_bit((mmube1 >> MMUBE1_IBE3_SHIFT) & 0xff, + tlb_47x_boltmap); + if (mmube1 & MMUBE1_VBE4) + __set_bit((mmube1 >> MMUBE1_IBE4_SHIFT) & 0xff, + tlb_47x_boltmap); + if (mmube1 & MMUBE1_VBE5) + __set_bit((mmube1 >> MMUBE1_IBE5_SHIFT) & 0xff, + tlb_47x_boltmap); +} + +/* + * "Pins" a 256MB TLB entry in AS0 for kernel lowmem for 47x type MMU + */ +static void ppc47x_pin_tlb(unsigned int virt, unsigned int phys) +{ + unsigned int rA; + int bolted; + + /* Base rA is HW way select, way 0, bolted bit set */ + rA = 0x88000000; + + /* Look for a bolted entry slot */ + bolted = ppc47x_find_free_bolted(); + BUG_ON(bolted < 0); + + /* Insert bolted slot number */ + rA |= bolted << 24; + + pr_debug("256M TLB entry for 0x%08x->0x%08x in bolt slot %d\n", + virt, phys, bolted); + + mtspr(SPRN_MMUCR, 0); + + __asm__ __volatile__( + "tlbwe %2,%3,0\n" + "tlbwe %1,%3,1\n" + "tlbwe %0,%3,2\n" + : + : "r" (PPC47x_TLB2_SW | PPC47x_TLB2_SR | + PPC47x_TLB2_SX +#ifdef CONFIG_SMP + | PPC47x_TLB2_M +#endif + ), + "r" (phys), + "r" (virt | PPC47x_TLB0_VALID | PPC47x_TLB0_256M), + "r" (rA)); +} + +void __init MMU_init_hw(void) +{ + /* This is not useful on 47x but won't hurt either */ + ppc44x_update_tlb_hwater(); + + flush_instruction_cache(); +} + +unsigned long __init mmu_mapin_ram(unsigned long top) +{ + unsigned long addr; + unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1); + + /* Pin in enough TLBs to cover any lowmem not covered by the + * initial 256M mapping established in head_44x.S */ + for (addr = memstart + PPC_PIN_SIZE; addr < lowmem_end_addr; + addr += PPC_PIN_SIZE) { + if (mmu_has_feature(MMU_FTR_TYPE_47x)) + ppc47x_pin_tlb(addr + PAGE_OFFSET, addr); + else + ppc44x_pin_tlb(addr + PAGE_OFFSET, addr); + } + if (mmu_has_feature(MMU_FTR_TYPE_47x)) { + ppc47x_update_boltmap(); + +#ifdef DEBUG + { + int i; + + printk(KERN_DEBUG "bolted entries: "); + for (i = 0; i < 255; i++) { + if (test_bit(i, tlb_47x_boltmap)) + printk("%d ", i); + } + printk("\n"); + } +#endif /* DEBUG */ + } + return total_lowmem; +} + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + u64 size; + +#ifndef CONFIG_NONSTATIC_KERNEL + /* We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); +#endif + + /* 44x has a 256M TLB entry pinned at boot */ + size = (min_t(u64, first_memblock_size, PPC_PIN_SIZE)); + memblock_set_current_limit(first_memblock_base + size); +} + +#ifdef CONFIG_SMP +void mmu_init_secondary(int cpu) +{ + unsigned long addr; + unsigned long memstart = memstart_addr & ~(PPC_PIN_SIZE - 1); + + /* Pin in enough TLBs to cover any lowmem not covered by the + * initial 256M mapping established in head_44x.S + * + * WARNING: This is called with only the first 256M of the + * linear mapping in the TLB and we can't take faults yet + * so beware of what this code uses. It runs off a temporary + * stack. current (r2) isn't initialized, smp_processor_id() + * will not work, current thread info isn't accessible, ... + */ + for (addr = memstart + PPC_PIN_SIZE; addr < lowmem_end_addr; + addr += PPC_PIN_SIZE) { + if (mmu_has_feature(MMU_FTR_TYPE_47x)) + ppc47x_pin_tlb(addr + PAGE_OFFSET, addr); + else + ppc44x_pin_tlb(addr + PAGE_OFFSET, addr); + } +} +#endif /* CONFIG_SMP */ diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile new file mode 100644 index 000000000..9c8770b5f --- /dev/null +++ b/arch/powerpc/mm/Makefile @@ -0,0 +1,38 @@ +# +# Makefile for the linux ppc-specific parts of the memory manager. +# + +subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror + +ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) + +obj-y := fault.o mem.o pgtable.o mmap.o \ + init_$(CONFIG_WORD_SIZE).o \ + pgtable_$(CONFIG_WORD_SIZE).o +obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ + tlb_nohash_low.o +obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o +hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o +obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o slb_low.o slb.o $(hash64-y) +obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o +obj-$(CONFIG_PPC_STD_MMU) += hash_low_$(CONFIG_WORD_SIZE).o \ + tlb_hash$(CONFIG_WORD_SIZE).o \ + mmu_context_hash$(CONFIG_WORD_SIZE).o +obj-$(CONFIG_PPC_ICSWX) += icswx.o +obj-$(CONFIG_PPC_ICSWX_PID) += icswx_pid.o +obj-$(CONFIG_40x) += 40x_mmu.o +obj-$(CONFIG_44x) += 44x_mmu.o +obj-$(CONFIG_PPC_FSL_BOOK3E) += fsl_booke_mmu.o +obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o +obj-$(CONFIG_PPC_SPLPAR) += vphn.o +obj-$(CONFIG_PPC_MM_SLICES) += slice.o +obj-y += hugetlbpage.o +ifeq ($(CONFIG_HUGETLB_PAGE),y) +obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o +obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o +endif +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o +obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o +obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o +obj-$(CONFIG_HIGHMEM) += highmem.o +obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c new file mode 100644 index 000000000..f031a47d7 --- /dev/null +++ b/arch/powerpc/mm/copro_fault.c @@ -0,0 +1,152 @@ +/* + * CoProcessor (SPU/AFU) mm fault handler + * + * (C) Copyright IBM Deutschland Entwicklung GmbH 2007 + * + * Author: Arnd Bergmann <arndb@de.ibm.com> + * Author: Jeremy Kerr <jk@ozlabs.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/export.h> +#include <asm/reg.h> +#include <asm/copro.h> +#include <asm/spu.h> +#include <misc/cxl.h> + +/* + * This ought to be kept in sync with the powerpc specific do_page_fault + * function. Currently, there are a few corner cases that we haven't had + * to handle fortunately. + */ +int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, + unsigned long dsisr, unsigned *flt) +{ + struct vm_area_struct *vma; + unsigned long is_write; + int ret; + + if (mm == NULL) + return -EFAULT; + + if (mm->pgd == NULL) + return -EFAULT; + + down_read(&mm->mmap_sem); + ret = -EFAULT; + vma = find_vma(mm, ea); + if (!vma) + goto out_unlock; + + if (ea < vma->vm_start) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out_unlock; + if (expand_stack(vma, ea)) + goto out_unlock; + } + + is_write = dsisr & DSISR_ISSTORE; + if (is_write) { + if (!(vma->vm_flags & VM_WRITE)) + goto out_unlock; + } else { + if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + goto out_unlock; + /* + * protfault should only happen due to us + * mapping a region readonly temporarily. PROT_NONE + * is also covered by the VMA check above. + */ + WARN_ON_ONCE(dsisr & DSISR_PROTFAULT); + } + + ret = 0; + *flt = handle_mm_fault(mm, vma, ea, is_write ? FAULT_FLAG_WRITE : 0); + if (unlikely(*flt & VM_FAULT_ERROR)) { + if (*flt & VM_FAULT_OOM) { + ret = -ENOMEM; + goto out_unlock; + } else if (*flt & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) { + ret = -EFAULT; + goto out_unlock; + } + BUG(); + } + + if (*flt & VM_FAULT_MAJOR) + current->maj_flt++; + else + current->min_flt++; + +out_unlock: + up_read(&mm->mmap_sem); + return ret; +} +EXPORT_SYMBOL_GPL(copro_handle_mm_fault); + +int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb) +{ + u64 vsid; + int psize, ssize; + + switch (REGION_ID(ea)) { + case USER_REGION_ID: + pr_devel("%s: 0x%llx -- USER_REGION_ID\n", __func__, ea); + psize = get_slice_psize(mm, ea); + ssize = user_segment_size(ea); + vsid = get_vsid(mm->context.id, ea, ssize); + break; + case VMALLOC_REGION_ID: + pr_devel("%s: 0x%llx -- VMALLOC_REGION_ID\n", __func__, ea); + if (ea < VMALLOC_END) + psize = mmu_vmalloc_psize; + else + psize = mmu_io_psize; + ssize = mmu_kernel_ssize; + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + break; + case KERNEL_REGION_ID: + pr_devel("%s: 0x%llx -- KERNEL_REGION_ID\n", __func__, ea); + psize = mmu_linear_psize; + ssize = mmu_kernel_ssize; + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + break; + default: + pr_debug("%s: invalid region access at %016llx\n", __func__, ea); + return 1; + } + + vsid = (vsid << slb_vsid_shift(ssize)) | SLB_VSID_USER; + + vsid |= mmu_psize_defs[psize].sllp | + ((ssize == MMU_SEGSIZE_1T) ? SLB_VSID_B_1T : 0); + + slb->esid = (ea & (ssize == MMU_SEGSIZE_1T ? ESID_MASK_1T : ESID_MASK)) | SLB_ESID_V; + slb->vsid = vsid; + + return 0; +} +EXPORT_SYMBOL_GPL(copro_calculate_slb); + +void copro_flush_all_slbs(struct mm_struct *mm) +{ +#ifdef CONFIG_SPU_BASE + spu_flush_all_slbs(mm); +#endif + cxl_slbia(mm); +} +EXPORT_SYMBOL_GPL(copro_flush_all_slbs); diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c new file mode 100644 index 000000000..169aba446 --- /dev/null +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -0,0 +1,420 @@ +/* + * PowerPC version derived from arch/arm/mm/consistent.c + * Copyright (C) 2001 Dan Malek (dmalek@jlc.net) + * + * Copyright (C) 2000 Russell King + * + * Consistent memory allocators. Used for DMA devices that want to + * share uncached memory with the processor core. The function return + * is the virtual address and 'dma_handle' is the physical address. + * Mostly stolen from the ARM port, with some changes for PowerPC. + * -- Dan + * + * Reorganized to get rid of the arch-specific consistent_* functions + * and provide non-coherent implementations for the DMA API. -Matt + * + * Added in_interrupt() safe dma_alloc_coherent()/dma_free_coherent() + * implementation. This is pulled straight from ARM and barely + * modified. -Matt + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/highmem.h> +#include <linux/dma-mapping.h> +#include <linux/export.h> + +#include <asm/tlbflush.h> +#include <asm/dma.h> + +#include "mmu_decl.h" + +/* + * This address range defaults to a value that is safe for all + * platforms which currently set CONFIG_NOT_COHERENT_CACHE. It + * can be further configured for specific applications under + * the "Advanced Setup" menu. -Matt + */ +#define CONSISTENT_BASE (IOREMAP_TOP) +#define CONSISTENT_END (CONSISTENT_BASE + CONFIG_CONSISTENT_SIZE) +#define CONSISTENT_OFFSET(x) (((unsigned long)(x) - CONSISTENT_BASE) >> PAGE_SHIFT) + +/* + * This is the page table (2MB) covering uncached, DMA consistent allocations + */ +static DEFINE_SPINLOCK(consistent_lock); + +/* + * VM region handling support. + * + * This should become something generic, handling VM region allocations for + * vmalloc and similar (ioremap, module space, etc). + * + * I envisage vmalloc()'s supporting vm_struct becoming: + * + * struct vm_struct { + * struct vm_region region; + * unsigned long flags; + * struct page **pages; + * unsigned int nr_pages; + * unsigned long phys_addr; + * }; + * + * get_vm_area() would then call vm_region_alloc with an appropriate + * struct vm_region head (eg): + * + * struct vm_region vmalloc_head = { + * .vm_list = LIST_HEAD_INIT(vmalloc_head.vm_list), + * .vm_start = VMALLOC_START, + * .vm_end = VMALLOC_END, + * }; + * + * However, vmalloc_head.vm_start is variable (typically, it is dependent on + * the amount of RAM found at boot time.) I would imagine that get_vm_area() + * would have to initialise this each time prior to calling vm_region_alloc(). + */ +struct ppc_vm_region { + struct list_head vm_list; + unsigned long vm_start; + unsigned long vm_end; +}; + +static struct ppc_vm_region consistent_head = { + .vm_list = LIST_HEAD_INIT(consistent_head.vm_list), + .vm_start = CONSISTENT_BASE, + .vm_end = CONSISTENT_END, +}; + +static struct ppc_vm_region * +ppc_vm_region_alloc(struct ppc_vm_region *head, size_t size, gfp_t gfp) +{ + unsigned long addr = head->vm_start, end = head->vm_end - size; + unsigned long flags; + struct ppc_vm_region *c, *new; + + new = kmalloc(sizeof(struct ppc_vm_region), gfp); + if (!new) + goto out; + + spin_lock_irqsave(&consistent_lock, flags); + + list_for_each_entry(c, &head->vm_list, vm_list) { + if ((addr + size) < addr) + goto nospc; + if ((addr + size) <= c->vm_start) + goto found; + addr = c->vm_end; + if (addr > end) + goto nospc; + } + + found: + /* + * Insert this entry _before_ the one we found. + */ + list_add_tail(&new->vm_list, &c->vm_list); + new->vm_start = addr; + new->vm_end = addr + size; + + spin_unlock_irqrestore(&consistent_lock, flags); + return new; + + nospc: + spin_unlock_irqrestore(&consistent_lock, flags); + kfree(new); + out: + return NULL; +} + +static struct ppc_vm_region *ppc_vm_region_find(struct ppc_vm_region *head, unsigned long addr) +{ + struct ppc_vm_region *c; + + list_for_each_entry(c, &head->vm_list, vm_list) { + if (c->vm_start == addr) + goto out; + } + c = NULL; + out: + return c; +} + +/* + * Allocate DMA-coherent memory space and return both the kernel remapped + * virtual and bus address for that space. + */ +void * +__dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp) +{ + struct page *page; + struct ppc_vm_region *c; + unsigned long order; + u64 mask = ISA_DMA_THRESHOLD, limit; + + if (dev) { + mask = dev->coherent_dma_mask; + + /* + * Sanity check the DMA mask - it must be non-zero, and + * must be able to be satisfied by a DMA allocation. + */ + if (mask == 0) { + dev_warn(dev, "coherent DMA mask is unset\n"); + goto no_page; + } + + if ((~mask) & ISA_DMA_THRESHOLD) { + dev_warn(dev, "coherent DMA mask %#llx is smaller " + "than system GFP_DMA mask %#llx\n", + mask, (unsigned long long)ISA_DMA_THRESHOLD); + goto no_page; + } + } + + + size = PAGE_ALIGN(size); + limit = (mask + 1) & ~mask; + if ((limit && size >= limit) || + size >= (CONSISTENT_END - CONSISTENT_BASE)) { + printk(KERN_WARNING "coherent allocation too big (requested %#x mask %#Lx)\n", + size, mask); + return NULL; + } + + order = get_order(size); + + /* Might be useful if we ever have a real legacy DMA zone... */ + if (mask != 0xffffffff) + gfp |= GFP_DMA; + + page = alloc_pages(gfp, order); + if (!page) + goto no_page; + + /* + * Invalidate any data that might be lurking in the + * kernel direct-mapped region for device DMA. + */ + { + unsigned long kaddr = (unsigned long)page_address(page); + memset(page_address(page), 0, size); + flush_dcache_range(kaddr, kaddr + size); + } + + /* + * Allocate a virtual address in the consistent mapping region. + */ + c = ppc_vm_region_alloc(&consistent_head, size, + gfp & ~(__GFP_DMA | __GFP_HIGHMEM)); + if (c) { + unsigned long vaddr = c->vm_start; + struct page *end = page + (1 << order); + + split_page(page, order); + + /* + * Set the "dma handle" + */ + *handle = page_to_phys(page); + + do { + SetPageReserved(page); + map_page(vaddr, page_to_phys(page), + pgprot_val(pgprot_noncached(PAGE_KERNEL))); + page++; + vaddr += PAGE_SIZE; + } while (size -= PAGE_SIZE); + + /* + * Free the otherwise unused pages. + */ + while (page < end) { + __free_page(page); + page++; + } + + return (void *)c->vm_start; + } + + if (page) + __free_pages(page, order); + no_page: + return NULL; +} +EXPORT_SYMBOL(__dma_alloc_coherent); + +/* + * free a page as defined by the above mapping. + */ +void __dma_free_coherent(size_t size, void *vaddr) +{ + struct ppc_vm_region *c; + unsigned long flags, addr; + + size = PAGE_ALIGN(size); + + spin_lock_irqsave(&consistent_lock, flags); + + c = ppc_vm_region_find(&consistent_head, (unsigned long)vaddr); + if (!c) + goto no_area; + + if ((c->vm_end - c->vm_start) != size) { + printk(KERN_ERR "%s: freeing wrong coherent size (%ld != %d)\n", + __func__, c->vm_end - c->vm_start, size); + dump_stack(); + size = c->vm_end - c->vm_start; + } + + addr = c->vm_start; + do { + pte_t *ptep; + unsigned long pfn; + + ptep = pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(addr), + addr), + addr), + addr); + if (!pte_none(*ptep) && pte_present(*ptep)) { + pfn = pte_pfn(*ptep); + pte_clear(&init_mm, addr, ptep); + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + __free_reserved_page(page); + } + } + addr += PAGE_SIZE; + } while (size -= PAGE_SIZE); + + flush_tlb_kernel_range(c->vm_start, c->vm_end); + + list_del(&c->vm_list); + + spin_unlock_irqrestore(&consistent_lock, flags); + + kfree(c); + return; + + no_area: + spin_unlock_irqrestore(&consistent_lock, flags); + printk(KERN_ERR "%s: trying to free invalid coherent area: %p\n", + __func__, vaddr); + dump_stack(); +} +EXPORT_SYMBOL(__dma_free_coherent); + +/* + * make an area consistent. + */ +void __dma_sync(void *vaddr, size_t size, int direction) +{ + unsigned long start = (unsigned long)vaddr; + unsigned long end = start + size; + + switch (direction) { + case DMA_NONE: + BUG(); + case DMA_FROM_DEVICE: + /* + * invalidate only when cache-line aligned otherwise there is + * the potential for discarding uncommitted data from the cache + */ + if ((start & (L1_CACHE_BYTES - 1)) || (size & (L1_CACHE_BYTES - 1))) + flush_dcache_range(start, end); + else + invalidate_dcache_range(start, end); + break; + case DMA_TO_DEVICE: /* writeback only */ + clean_dcache_range(start, end); + break; + case DMA_BIDIRECTIONAL: /* writeback and invalidate */ + flush_dcache_range(start, end); + break; + } +} +EXPORT_SYMBOL(__dma_sync); + +#ifdef CONFIG_HIGHMEM +/* + * __dma_sync_page() implementation for systems using highmem. + * In this case, each page of a buffer must be kmapped/kunmapped + * in order to have a virtual address for __dma_sync(). This must + * not sleep so kmap_atomic()/kunmap_atomic() are used. + * + * Note: yes, it is possible and correct to have a buffer extend + * beyond the first page. + */ +static inline void __dma_sync_page_highmem(struct page *page, + unsigned long offset, size_t size, int direction) +{ + size_t seg_size = min((size_t)(PAGE_SIZE - offset), size); + size_t cur_size = seg_size; + unsigned long flags, start, seg_offset = offset; + int nr_segs = 1 + ((size - seg_size) + PAGE_SIZE - 1)/PAGE_SIZE; + int seg_nr = 0; + + local_irq_save(flags); + + do { + start = (unsigned long)kmap_atomic(page + seg_nr) + seg_offset; + + /* Sync this buffer segment */ + __dma_sync((void *)start, seg_size, direction); + kunmap_atomic((void *)start); + seg_nr++; + + /* Calculate next buffer segment size */ + seg_size = min((size_t)PAGE_SIZE, size - cur_size); + + /* Add the segment size to our running total */ + cur_size += seg_size; + seg_offset = 0; + } while (seg_nr < nr_segs); + + local_irq_restore(flags); +} +#endif /* CONFIG_HIGHMEM */ + +/* + * __dma_sync_page makes memory consistent. identical to __dma_sync, but + * takes a struct page instead of a virtual address + */ +void __dma_sync_page(struct page *page, unsigned long offset, + size_t size, int direction) +{ +#ifdef CONFIG_HIGHMEM + __dma_sync_page_highmem(page, offset, size, direction); +#else + unsigned long start = (unsigned long)page_address(page) + offset; + __dma_sync((void *)start, size, direction); +#endif +} +EXPORT_SYMBOL(__dma_sync_page); + +/* + * Return the PFN for a given cpu virtual address returned by + * __dma_alloc_coherent. This is used by dma_mmap_coherent() + */ +unsigned long __dma_get_coherent_pfn(unsigned long cpu_addr) +{ + /* This should always be populated, so we don't test every + * level. If that fails, we'll have a nice crash which + * will be as good as a BUG_ON() + */ + pgd_t *pgd = pgd_offset_k(cpu_addr); + pud_t *pud = pud_offset(pgd, cpu_addr); + pmd_t *pmd = pmd_offset(pud, cpu_addr); + pte_t *ptep = pte_offset_kernel(pmd, cpu_addr); + + if (pte_none(*ptep) || !pte_present(*ptep)) + return 0; + return pte_pfn(*ptep); +} diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c new file mode 100644 index 000000000..b396868d2 --- /dev/null +++ b/arch/powerpc/mm/fault.c @@ -0,0 +1,543 @@ +/* + * PowerPC version + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Derived from "arch/i386/mm/fault.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Modified by Cort Dougan and Paul Mackerras. + * + * Modified for PPC64 by Dave Engebretsen (engebret@ibm.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/kprobes.h> +#include <linux/kdebug.h> +#include <linux/perf_event.h> +#include <linux/ratelimit.h> +#include <linux/context_tracking.h> +#include <linux/hugetlb.h> + +#include <asm/firmware.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/mmu.h> +#include <asm/mmu_context.h> +#include <asm/uaccess.h> +#include <asm/tlbflush.h> +#include <asm/siginfo.h> +#include <asm/debug.h> + +#include "icswx.h" + +#ifdef CONFIG_KPROBES +static inline int notify_page_fault(struct pt_regs *regs) +{ + int ret = 0; + + /* kprobe_running() needs smp_processor_id() */ + if (!user_mode(regs)) { + preempt_disable(); + if (kprobe_running() && kprobe_fault_handler(regs, 11)) + ret = 1; + preempt_enable(); + } + + return ret; +} +#else +static inline int notify_page_fault(struct pt_regs *regs) +{ + return 0; +} +#endif + +/* + * Check whether the instruction at regs->nip is a store using + * an update addressing form which will update r1. + */ +static int store_updates_sp(struct pt_regs *regs) +{ + unsigned int inst; + + if (get_user(inst, (unsigned int __user *)regs->nip)) + return 0; + /* check for 1 in the rA field */ + if (((inst >> 16) & 0x1f) != 1) + return 0; + /* check major opcode */ + switch (inst >> 26) { + case 37: /* stwu */ + case 39: /* stbu */ + case 45: /* sthu */ + case 53: /* stfsu */ + case 55: /* stfdu */ + return 1; + case 62: /* std or stdu */ + return (inst & 3) == 1; + case 31: + /* check minor opcode */ + switch ((inst >> 1) & 0x3ff) { + case 181: /* stdux */ + case 183: /* stwux */ + case 247: /* stbux */ + case 439: /* sthux */ + case 695: /* stfsux */ + case 759: /* stfdux */ + return 1; + } + } + return 0; +} +/* + * do_page_fault error handling helpers + */ + +#define MM_FAULT_RETURN 0 +#define MM_FAULT_CONTINUE -1 +#define MM_FAULT_ERR(sig) (sig) + +static int do_sigbus(struct pt_regs *regs, unsigned long address, + unsigned int fault) +{ + siginfo_t info; + unsigned int lsb = 0; + + up_read(¤t->mm->mmap_sem); + + if (!user_mode(regs)) + return MM_FAULT_ERR(SIGBUS); + + current->thread.trap_nr = BUS_ADRERR; + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_ADRERR; + info.si_addr = (void __user *)address; +#ifdef CONFIG_MEMORY_FAILURE + if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { + pr_err("MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", + current->comm, current->pid, address); + info.si_code = BUS_MCEERR_AR; + } + + if (fault & VM_FAULT_HWPOISON_LARGE) + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); + if (fault & VM_FAULT_HWPOISON) + lsb = PAGE_SHIFT; +#endif + info.si_addr_lsb = lsb; + force_sig_info(SIGBUS, &info, current); + return MM_FAULT_RETURN; +} + +static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) +{ + /* + * Pagefault was interrupted by SIGKILL. We have no reason to + * continue the pagefault. + */ + if (fatal_signal_pending(current)) { + /* + * If we have retry set, the mmap semaphore will have + * alrady been released in __lock_page_or_retry(). Else + * we release it now. + */ + if (!(fault & VM_FAULT_RETRY)) + up_read(¤t->mm->mmap_sem); + /* Coming from kernel, we need to deal with uaccess fixups */ + if (user_mode(regs)) + return MM_FAULT_RETURN; + return MM_FAULT_ERR(SIGKILL); + } + + /* No fault: be happy */ + if (!(fault & VM_FAULT_ERROR)) + return MM_FAULT_CONTINUE; + + /* Out of memory */ + if (fault & VM_FAULT_OOM) { + up_read(¤t->mm->mmap_sem); + + /* + * We ran out of memory, or some other thing happened to us that + * made us unable to handle the page fault gracefully. + */ + if (!user_mode(regs)) + return MM_FAULT_ERR(SIGKILL); + pagefault_out_of_memory(); + return MM_FAULT_RETURN; + } + + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) + return do_sigbus(regs, addr, fault); + + /* We don't understand the fault code, this is fatal */ + BUG(); + return MM_FAULT_CONTINUE; +} + +/* + * For 600- and 800-family processors, the error_code parameter is DSISR + * for a data fault, SRR1 for an instruction fault. For 400-family processors + * the error_code parameter is ESR for a data fault, 0 for an instruction + * fault. + * For 64-bit processors, the error_code parameter is + * - DSISR for a non-SLB data access fault, + * - SRR1 & 0x08000000 for a non-SLB instruction access fault + * - 0 any SLB fault. + * + * The return value is 0 if the fault was handled, or the signal + * number if this is a kernel fault that can't be handled here. + */ +int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, + unsigned long error_code) +{ + enum ctx_state prev_state = exception_enter(); + struct vm_area_struct * vma; + struct mm_struct *mm = current->mm; + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + int code = SEGV_MAPERR; + int is_write = 0; + int trap = TRAP(regs); + int is_exec = trap == 0x400; + int fault; + int rc = 0, store_update_sp = 0; + +#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) + /* + * Fortunately the bit assignments in SRR1 for an instruction + * fault and DSISR for a data fault are mostly the same for the + * bits we are interested in. But there are some bits which + * indicate errors in DSISR but can validly be set in SRR1. + */ + if (trap == 0x400) + error_code &= 0x48200000; + else + is_write = error_code & DSISR_ISSTORE; +#else + is_write = error_code & ESR_DST; +#endif /* CONFIG_4xx || CONFIG_BOOKE */ + +#ifdef CONFIG_PPC_ICSWX + /* + * we need to do this early because this "data storage + * interrupt" does not update the DAR/DEAR so we don't want to + * look at it + */ + if (error_code & ICSWX_DSI_UCT) { + rc = acop_handle_fault(regs, address, error_code); + if (rc) + goto bail; + } +#endif /* CONFIG_PPC_ICSWX */ + + if (notify_page_fault(regs)) + goto bail; + + if (unlikely(debugger_fault_handler(regs))) + goto bail; + + /* On a kernel SLB miss we can only check for a valid exception entry */ + if (!user_mode(regs) && (address >= TASK_SIZE)) { + rc = SIGSEGV; + goto bail; + } + +#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \ + defined(CONFIG_PPC_BOOK3S_64)) + if (error_code & DSISR_DABRMATCH) { + /* breakpoint match */ + do_break(regs, address, error_code); + goto bail; + } +#endif + + /* We restore the interrupt state now */ + if (!arch_irq_disabled_regs(regs)) + local_irq_enable(); + + if (in_atomic() || mm == NULL) { + if (!user_mode(regs)) { + rc = SIGSEGV; + goto bail; + } + /* in_atomic() in user mode is really bad, + as is current->mm == NULL. */ + printk(KERN_EMERG "Page fault in user mode with " + "in_atomic() = %d mm = %p\n", in_atomic(), mm); + printk(KERN_EMERG "NIP = %lx MSR = %lx\n", + regs->nip, regs->msr); + die("Weird page fault", regs, SIGSEGV); + } + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + + /* + * We want to do this outside mmap_sem, because reading code around nip + * can result in fault, which will cause a deadlock when called with + * mmap_sem held + */ + if (user_mode(regs)) + store_update_sp = store_updates_sp(regs); + + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + + /* When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in the + * kernel and should generate an OOPS. Unfortunately, in the case of an + * erroneous fault occurring in a code path which already holds mmap_sem + * we will deadlock attempting to validate the fault against the + * address space. Luckily the kernel only validly references user + * space from well defined areas of code, which are listed in the + * exceptions table. + * + * As the vast majority of faults will be valid we will only perform + * the source reference check when there is a possibility of a deadlock. + * Attempt to lock the address space, if we cannot we then validate the + * source. If this is invalid we can skip the address space check, + * thus avoiding the deadlock. + */ + if (!down_read_trylock(&mm->mmap_sem)) { + if (!user_mode(regs) && !search_exception_tables(regs->nip)) + goto bad_area_nosemaphore; + +retry: + down_read(&mm->mmap_sem); + } else { + /* + * The above down_read_trylock() might have succeeded in + * which case we'll have missed the might_sleep() from + * down_read(): + */ + might_sleep(); + } + + vma = find_vma(mm, address); + if (!vma) + goto bad_area; + if (vma->vm_start <= address) + goto good_area; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto bad_area; + + /* + * N.B. The POWER/Open ABI allows programs to access up to + * 288 bytes below the stack pointer. + * The kernel signal delivery code writes up to about 1.5kB + * below the stack pointer (r1) before decrementing it. + * The exec code can write slightly over 640kB to the stack + * before setting the user r1. Thus we allow the stack to + * expand to 1MB without further checks. + */ + if (address + 0x100000 < vma->vm_end) { + /* get user regs even if this fault is in kernel mode */ + struct pt_regs *uregs = current->thread.regs; + if (uregs == NULL) + goto bad_area; + + /* + * A user-mode access to an address a long way below + * the stack pointer is only valid if the instruction + * is one which would update the stack pointer to the + * address accessed if the instruction completed, + * i.e. either stwu rs,n(r1) or stwux rs,r1,rb + * (or the byte, halfword, float or double forms). + * + * If we don't check this then any write to the area + * between the last mapped region and the stack will + * expand the stack rather than segfaulting. + */ + if (address + 2048 < uregs->gpr[1] && !store_update_sp) + goto bad_area; + } + if (expand_stack(vma, address)) + goto bad_area; + +good_area: + code = SEGV_ACCERR; +#if defined(CONFIG_6xx) + if (error_code & 0x95700000) + /* an error such as lwarx to I/O controller space, + address matching DABR, eciwx, etc. */ + goto bad_area; +#endif /* CONFIG_6xx */ +#if defined(CONFIG_8xx) + /* The MPC8xx seems to always set 0x80000000, which is + * "undefined". Of those that can be set, this is the only + * one which seems bad. + */ + if (error_code & 0x10000000) + /* Guarded storage error. */ + goto bad_area; +#endif /* CONFIG_8xx */ + + if (is_exec) { + /* + * Allow execution from readable areas if the MMU does not + * provide separate controls over reading and executing. + * + * Note: That code used to not be enabled for 4xx/BookE. + * It is now as I/D cache coherency for these is done at + * set_pte_at() time and I see no reason why the test + * below wouldn't be valid on those processors. This -may- + * break programs compiled with a really old ABI though. + */ + if (!(vma->vm_flags & VM_EXEC) && + (cpu_has_feature(CPU_FTR_NOEXECUTE) || + !(vma->vm_flags & (VM_READ | VM_WRITE)))) + goto bad_area; +#ifdef CONFIG_PPC_STD_MMU + /* + * protfault should only happen due to us + * mapping a region readonly temporarily. PROT_NONE + * is also covered by the VMA check above. + */ + WARN_ON_ONCE(error_code & DSISR_PROTFAULT); +#endif /* CONFIG_PPC_STD_MMU */ + /* a write */ + } else if (is_write) { + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; + flags |= FAULT_FLAG_WRITE; + /* a read */ + } else { + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) + goto bad_area; + WARN_ON_ONCE(error_code & DSISR_PROTFAULT); + } + + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo + * the fault. + */ + fault = handle_mm_fault(mm, vma, address, flags); + if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { + if (fault & VM_FAULT_SIGSEGV) + goto bad_area; + rc = mm_fault_error(regs, address, fault); + if (rc >= MM_FAULT_RETURN) + goto bail; + else + rc = 0; + } + + /* + * Major/minor page fault accounting is only done on the + * initial attempt. If we go through a retry, it is extremely + * likely that the page will be found in page cache at that point. + */ + if (flags & FAULT_FLAG_ALLOW_RETRY) { + if (fault & VM_FAULT_MAJOR) { + current->maj_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, + regs, address); +#ifdef CONFIG_PPC_SMLPAR + if (firmware_has_feature(FW_FEATURE_CMO)) { + u32 page_ins; + + preempt_disable(); + page_ins = be32_to_cpu(get_lppaca()->page_ins); + page_ins += 1 << PAGE_FACTOR; + get_lppaca()->page_ins = cpu_to_be32(page_ins); + preempt_enable(); + } +#endif /* CONFIG_PPC_SMLPAR */ + } else { + current->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, + regs, address); + } + if (fault & VM_FAULT_RETRY) { + /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk + * of starvation. */ + flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; + goto retry; + } + } + + up_read(&mm->mmap_sem); + goto bail; + +bad_area: + up_read(&mm->mmap_sem); + +bad_area_nosemaphore: + /* User mode accesses cause a SIGSEGV */ + if (user_mode(regs)) { + _exception(SIGSEGV, regs, code, address); + goto bail; + } + + if (is_exec && (error_code & DSISR_PROTFAULT)) + printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected" + " page (%lx) - exploit attempt? (uid: %d)\n", + address, from_kuid(&init_user_ns, current_uid())); + + rc = SIGSEGV; + +bail: + exception_exit(prev_state); + return rc; + +} + +/* + * bad_page_fault is called when we have a bad access from the kernel. + * It is called from the DSI and ISI handlers in head.S and from some + * of the procedures in traps.c. + */ +void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) +{ + const struct exception_table_entry *entry; + + /* Are we prepared to handle this fault? */ + if ((entry = search_exception_tables(regs->nip)) != NULL) { + regs->nip = entry->fixup; + return; + } + + /* kernel has accessed a bad area */ + + switch (regs->trap) { + case 0x300: + case 0x380: + printk(KERN_ALERT "Unable to handle kernel paging request for " + "data at address 0x%08lx\n", regs->dar); + break; + case 0x400: + case 0x480: + printk(KERN_ALERT "Unable to handle kernel paging request for " + "instruction fetch\n"); + break; + default: + printk(KERN_ALERT "Unable to handle kernel paging request for " + "unknown fault\n"); + break; + } + printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n", + regs->nip); + + if (task_stack_end_corrupted(current)) + printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); + + die("Kernel access of bad area", regs, sig); +} diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c new file mode 100644 index 000000000..9c90e66cf --- /dev/null +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -0,0 +1,316 @@ +/* + * Modifications by Kumar Gala (galak@kernel.crashing.org) to support + * E500 Book E processors. + * + * Copyright 2004,2010 Freescale Semiconductor, Inc. + * + * This file contains the routines for initializing the MMU + * on the 4xx series of chips. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/stddef.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/highmem.h> +#include <linux/memblock.h> + +#include <asm/pgalloc.h> +#include <asm/prom.h> +#include <asm/io.h> +#include <asm/mmu_context.h> +#include <asm/pgtable.h> +#include <asm/mmu.h> +#include <asm/uaccess.h> +#include <asm/smp.h> +#include <asm/machdep.h> +#include <asm/setup.h> +#include <asm/paca.h> + +#include "mmu_decl.h" + +unsigned int tlbcam_index; + +#define NUM_TLBCAMS (64) +struct tlbcam TLBCAM[NUM_TLBCAMS]; + +struct tlbcamrange { + unsigned long start; + unsigned long limit; + phys_addr_t phys; +} tlbcam_addrs[NUM_TLBCAMS]; + +unsigned long tlbcam_sz(int idx) +{ + return tlbcam_addrs[idx].limit - tlbcam_addrs[idx].start + 1; +} + +/* + * Return PA for this VA if it is mapped by a CAM, or 0 + */ +phys_addr_t v_mapped_by_tlbcam(unsigned long va) +{ + int b; + for (b = 0; b < tlbcam_index; ++b) + if (va >= tlbcam_addrs[b].start && va < tlbcam_addrs[b].limit) + return tlbcam_addrs[b].phys + (va - tlbcam_addrs[b].start); + return 0; +} + +/* + * Return VA for a given PA or 0 if not mapped + */ +unsigned long p_mapped_by_tlbcam(phys_addr_t pa) +{ + int b; + for (b = 0; b < tlbcam_index; ++b) + if (pa >= tlbcam_addrs[b].phys + && pa < (tlbcam_addrs[b].limit-tlbcam_addrs[b].start) + +tlbcam_addrs[b].phys) + return tlbcam_addrs[b].start+(pa-tlbcam_addrs[b].phys); + return 0; +} + +/* + * Set up a variable-size TLB entry (tlbcam). The parameters are not checked; + * in particular size must be a power of 4 between 4k and the max supported by + * an implementation; max may further be limited by what can be represented in + * an unsigned long (for example, 32-bit implementations cannot support a 4GB + * size). + */ +static void settlbcam(int index, unsigned long virt, phys_addr_t phys, + unsigned long size, unsigned long flags, unsigned int pid) +{ + unsigned int tsize; + + tsize = __ilog2(size) - 10; + +#ifdef CONFIG_SMP + if ((flags & _PAGE_NO_CACHE) == 0) + flags |= _PAGE_COHERENT; +#endif + + TLBCAM[index].MAS0 = MAS0_TLBSEL(1) | MAS0_ESEL(index) | MAS0_NV(index+1); + TLBCAM[index].MAS1 = MAS1_VALID | MAS1_IPROT | MAS1_TSIZE(tsize) | MAS1_TID(pid); + TLBCAM[index].MAS2 = virt & PAGE_MASK; + + TLBCAM[index].MAS2 |= (flags & _PAGE_WRITETHRU) ? MAS2_W : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_NO_CACHE) ? MAS2_I : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_COHERENT) ? MAS2_M : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0; + TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0; + + TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SX | MAS3_SR; + TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0); + if (mmu_has_feature(MMU_FTR_BIG_PHYS)) + TLBCAM[index].MAS7 = (u64)phys >> 32; + + /* Below is unlikely -- only for large user pages or similar */ + if (pte_user(flags)) { + TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR; + TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0); + } + + tlbcam_addrs[index].start = virt; + tlbcam_addrs[index].limit = virt + size - 1; + tlbcam_addrs[index].phys = phys; + + loadcam_entry(index); +} + +unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, + phys_addr_t phys) +{ + unsigned int camsize = __ilog2(ram); + unsigned int align = __ffs(virt | phys); + unsigned long max_cam; + + if ((mfspr(SPRN_MMUCFG) & MMUCFG_MAVN) == MMUCFG_MAVN_V1) { + /* Convert (4^max) kB to (2^max) bytes */ + max_cam = ((mfspr(SPRN_TLB1CFG) >> 16) & 0xf) * 2 + 10; + camsize &= ~1U; + align &= ~1U; + } else { + /* Convert (2^max) kB to (2^max) bytes */ + max_cam = __ilog2(mfspr(SPRN_TLB1PS)) + 10; + } + + if (camsize > align) + camsize = align; + if (camsize > max_cam) + camsize = max_cam; + + return 1UL << camsize; +} + +static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt, + unsigned long ram, int max_cam_idx) +{ + int i; + unsigned long amount_mapped = 0; + + /* Calculate CAM values */ + for (i = 0; ram && i < max_cam_idx; i++) { + unsigned long cam_sz; + + cam_sz = calc_cam_sz(ram, virt, phys); + settlbcam(i, virt, phys, cam_sz, pgprot_val(PAGE_KERNEL_X), 0); + + ram -= cam_sz; + amount_mapped += cam_sz; + virt += cam_sz; + phys += cam_sz; + } + tlbcam_index = i; + +#ifdef CONFIG_PPC64 + get_paca()->tcd.esel_next = i; + get_paca()->tcd.esel_max = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + get_paca()->tcd.esel_first = i; +#endif + + return amount_mapped; +} + +unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx) +{ + unsigned long virt = PAGE_OFFSET; + phys_addr_t phys = memstart_addr; + + return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx); +} + +#ifdef CONFIG_PPC32 + +#if defined(CONFIG_LOWMEM_CAM_NUM_BOOL) && (CONFIG_LOWMEM_CAM_NUM >= NUM_TLBCAMS) +#error "LOWMEM_CAM_NUM must be less than NUM_TLBCAMS" +#endif + +unsigned long __init mmu_mapin_ram(unsigned long top) +{ + return tlbcam_addrs[tlbcam_index - 1].limit - PAGE_OFFSET + 1; +} + +/* + * MMU_init_hw does the chip-specific initialization of the MMU hardware. + */ +void __init MMU_init_hw(void) +{ + flush_instruction_cache(); +} + +void __init adjust_total_lowmem(void) +{ + unsigned long ram; + int i; + + /* adjust lowmem size to __max_low_memory */ + ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem); + + i = switch_to_as1(); + __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM); + restore_to_as0(i, 0, 0, 1); + + pr_info("Memory CAM mapping: "); + for (i = 0; i < tlbcam_index - 1; i++) + pr_cont("%lu/", tlbcam_sz(i) >> 20); + pr_cont("%lu Mb, residual: %dMb\n", tlbcam_sz(tlbcam_index - 1) >> 20, + (unsigned int)((total_lowmem - __max_low_memory) >> 20)); + + memblock_set_current_limit(memstart_addr + __max_low_memory); +} + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + phys_addr_t limit = first_memblock_base + first_memblock_size; + + /* 64M mapped initially according to head_fsl_booke.S */ + memblock_set_current_limit(min_t(u64, limit, 0x04000000)); +} + +#ifdef CONFIG_RELOCATABLE +int __initdata is_second_reloc; +notrace void __init relocate_init(u64 dt_ptr, phys_addr_t start) +{ + unsigned long base = KERNELBASE; + + kernstart_addr = start; + if (is_second_reloc) { + virt_phys_offset = PAGE_OFFSET - memstart_addr; + return; + } + + /* + * Relocatable kernel support based on processing of dynamic + * relocation entries. Before we get the real memstart_addr, + * We will compute the virt_phys_offset like this: + * virt_phys_offset = stext.run - kernstart_addr + * + * stext.run = (KERNELBASE & ~0x3ffffff) + + * (kernstart_addr & 0x3ffffff) + * When we relocate, we have : + * + * (kernstart_addr & 0x3ffffff) = (stext.run & 0x3ffffff) + * + * hence: + * virt_phys_offset = (KERNELBASE & ~0x3ffffff) - + * (kernstart_addr & ~0x3ffffff) + * + */ + start &= ~0x3ffffff; + base &= ~0x3ffffff; + virt_phys_offset = base - start; + early_get_first_memblock_info(__va(dt_ptr), NULL); + /* + * We now get the memstart_addr, then we should check if this + * address is the same as what the PAGE_OFFSET map to now. If + * not we have to change the map of PAGE_OFFSET to memstart_addr + * and do a second relocation. + */ + if (start != memstart_addr) { + int n; + long offset = start - memstart_addr; + + is_second_reloc = 1; + n = switch_to_as1(); + /* map a 64M area for the second relocation */ + if (memstart_addr > start) + map_mem_in_cams(0x4000000, CONFIG_LOWMEM_CAM_NUM); + else + map_mem_in_cams_addr(start, PAGE_OFFSET + offset, + 0x4000000, CONFIG_LOWMEM_CAM_NUM); + restore_to_as0(n, offset, __va(dt_ptr), 1); + /* We should never reach here */ + panic("Relocation error"); + } +} +#endif +#endif diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S new file mode 100644 index 000000000..115347f74 --- /dev/null +++ b/arch/powerpc/mm/hash_low_32.S @@ -0,0 +1,712 @@ +/* + * PowerPC version + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * Rewritten by Cort Dougan (cort@cs.nmt.edu) for PReP + * Copyright (C) 1996 Cort Dougan <cort@cs.nmt.edu> + * Adapted for Power Macintosh by Paul Mackerras. + * Low-level exception handlers and MMU support + * rewritten by Paul Mackerras. + * Copyright (C) 1996 Paul Mackerras. + * + * This file contains low-level assembler routines for managing + * the PowerPC MMU hash table. (PPC 8xx processors don't use a + * hash table, so this file is not used on them.) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <asm/reg.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/cputable.h> +#include <asm/ppc_asm.h> +#include <asm/thread_info.h> +#include <asm/asm-offsets.h> + +#ifdef CONFIG_SMP + .section .bss + .align 2 + .globl mmu_hash_lock +mmu_hash_lock: + .space 4 +#endif /* CONFIG_SMP */ + +/* + * Load a PTE into the hash table, if possible. + * The address is in r4, and r3 contains an access flag: + * _PAGE_RW (0x400) if a write. + * r9 contains the SRR1 value, from which we use the MSR_PR bit. + * SPRG_THREAD contains the physical address of the current task's thread. + * + * Returns to the caller if the access is illegal or there is no + * mapping for the address. Otherwise it places an appropriate PTE + * in the hash table and returns from the exception. + * Uses r0, r3 - r8, r10, ctr, lr. + */ + .text +_GLOBAL(hash_page) + tophys(r7,0) /* gets -KERNELBASE into r7 */ +#ifdef CONFIG_SMP + addis r8,r7,mmu_hash_lock@h + ori r8,r8,mmu_hash_lock@l + lis r0,0x0fff + b 10f +11: lwz r6,0(r8) + cmpwi 0,r6,0 + bne 11b +10: lwarx r6,0,r8 + cmpwi 0,r6,0 + bne- 11b + stwcx. r0,0,r8 + bne- 10b + isync +#endif + /* Get PTE (linux-style) and check access */ + lis r0,KERNELBASE@h /* check if kernel address */ + cmplw 0,r4,r0 + mfspr r8,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ + ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ + lwz r5,PGDIR(r8) /* virt page-table root */ + blt+ 112f /* assume user more likely */ + lis r5,swapper_pg_dir@ha /* if kernel address, use */ + addi r5,r5,swapper_pg_dir@l /* kernel page table */ + rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */ +112: add r5,r5,r7 /* convert to phys addr */ +#ifndef CONFIG_PTE_64BIT + rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */ + lwz r8,0(r5) /* get pmd entry */ + rlwinm. r8,r8,0,0,19 /* extract address of pte page */ +#else + rlwinm r8,r4,13,19,29 /* Compute pgdir/pmd offset */ + lwzx r8,r8,r5 /* Get L1 entry */ + rlwinm. r8,r8,0,0,20 /* extract pt base address */ +#endif +#ifdef CONFIG_SMP + beq- hash_page_out /* return if no mapping */ +#else + /* XXX it seems like the 601 will give a machine fault on the + rfi if its alignment is wrong (bottom 4 bits of address are + 8 or 0xc) and we have had a not-taken conditional branch + to the address following the rfi. */ + beqlr- +#endif +#ifndef CONFIG_PTE_64BIT + rlwimi r8,r4,22,20,29 /* insert next 10 bits of address */ +#else + rlwimi r8,r4,23,20,28 /* compute pte address */ +#endif + rlwinm r0,r3,32-3,24,24 /* _PAGE_RW access -> _PAGE_DIRTY */ + ori r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE + + /* + * Update the linux PTE atomically. We do the lwarx up-front + * because almost always, there won't be a permission violation + * and there won't already be an HPTE, and thus we will have + * to update the PTE to set _PAGE_HASHPTE. -- paulus. + * + * If PTE_64BIT is set, the low word is the flags word; use that + * word for locking since it contains all the interesting bits. + */ +#if (PTE_FLAGS_OFFSET != 0) + addi r8,r8,PTE_FLAGS_OFFSET +#endif +retry: + lwarx r6,0,r8 /* get linux-style pte, flag word */ + andc. r5,r3,r6 /* check access & ~permission */ +#ifdef CONFIG_SMP + bne- hash_page_out /* return if access not permitted */ +#else + bnelr- +#endif + or r5,r0,r6 /* set accessed/dirty bits */ +#ifdef CONFIG_PTE_64BIT +#ifdef CONFIG_SMP + subf r10,r6,r8 /* create false data dependency */ + subi r10,r10,PTE_FLAGS_OFFSET + lwzx r10,r6,r10 /* Get upper PTE word */ +#else + lwz r10,-PTE_FLAGS_OFFSET(r8) +#endif /* CONFIG_SMP */ +#endif /* CONFIG_PTE_64BIT */ + stwcx. r5,0,r8 /* attempt to update PTE */ + bne- retry /* retry if someone got there first */ + + mfsrin r3,r4 /* get segment reg for segment */ + mfctr r0 + stw r0,_CTR(r11) + bl create_hpte /* add the hash table entry */ + +#ifdef CONFIG_SMP + eieio + addis r8,r7,mmu_hash_lock@ha + li r0,0 + stw r0,mmu_hash_lock@l(r8) +#endif + + /* Return from the exception */ + lwz r5,_CTR(r11) + mtctr r5 + lwz r0,GPR0(r11) + lwz r7,GPR7(r11) + lwz r8,GPR8(r11) + b fast_exception_return + +#ifdef CONFIG_SMP +hash_page_out: + eieio + addis r8,r7,mmu_hash_lock@ha + li r0,0 + stw r0,mmu_hash_lock@l(r8) + blr +#endif /* CONFIG_SMP */ + +/* + * Add an entry for a particular page to the hash table. + * + * add_hash_page(unsigned context, unsigned long va, unsigned long pmdval) + * + * We assume any necessary modifications to the pte (e.g. setting + * the accessed bit) have already been done and that there is actually + * a hash table in use (i.e. we're not on a 603). + */ +_GLOBAL(add_hash_page) + mflr r0 + stw r0,4(r1) + + /* Convert context and va to VSID */ + mulli r3,r3,897*16 /* multiply context by context skew */ + rlwinm r0,r4,4,28,31 /* get ESID (top 4 bits of va) */ + mulli r0,r0,0x111 /* multiply by ESID skew */ + add r3,r3,r0 /* note create_hpte trims to 24 bits */ + +#ifdef CONFIG_SMP + CURRENT_THREAD_INFO(r8, r1) /* use cpu number to make tag */ + lwz r8,TI_CPU(r8) /* to go in mmu_hash_lock */ + oris r8,r8,12 +#endif /* CONFIG_SMP */ + + /* + * We disable interrupts here, even on UP, because we don't + * want to race with hash_page, and because we want the + * _PAGE_HASHPTE bit to be a reliable indication of whether + * the HPTE exists (or at least whether one did once). + * We also turn off the MMU for data accesses so that we + * we can't take a hash table miss (assuming the code is + * covered by a BAT). -- paulus + */ + mfmsr r9 + SYNC + rlwinm r0,r9,0,17,15 /* clear bit 16 (MSR_EE) */ + rlwinm r0,r0,0,28,26 /* clear MSR_DR */ + mtmsr r0 + SYNC_601 + isync + + tophys(r7,0) + +#ifdef CONFIG_SMP + addis r6,r7,mmu_hash_lock@ha + addi r6,r6,mmu_hash_lock@l +10: lwarx r0,0,r6 /* take the mmu_hash_lock */ + cmpi 0,r0,0 + bne- 11f + stwcx. r8,0,r6 + beq+ 12f +11: lwz r0,0(r6) + cmpi 0,r0,0 + beq 10b + b 11b +12: isync +#endif + + /* + * Fetch the linux pte and test and set _PAGE_HASHPTE atomically. + * If _PAGE_HASHPTE was already set, we don't replace the existing + * HPTE, so we just unlock and return. + */ + mr r8,r5 +#ifndef CONFIG_PTE_64BIT + rlwimi r8,r4,22,20,29 +#else + rlwimi r8,r4,23,20,28 + addi r8,r8,PTE_FLAGS_OFFSET +#endif +1: lwarx r6,0,r8 + andi. r0,r6,_PAGE_HASHPTE + bne 9f /* if HASHPTE already set, done */ +#ifdef CONFIG_PTE_64BIT +#ifdef CONFIG_SMP + subf r10,r6,r8 /* create false data dependency */ + subi r10,r10,PTE_FLAGS_OFFSET + lwzx r10,r6,r10 /* Get upper PTE word */ +#else + lwz r10,-PTE_FLAGS_OFFSET(r8) +#endif /* CONFIG_SMP */ +#endif /* CONFIG_PTE_64BIT */ + ori r5,r6,_PAGE_HASHPTE + stwcx. r5,0,r8 + bne- 1b + + bl create_hpte + +9: +#ifdef CONFIG_SMP + addis r6,r7,mmu_hash_lock@ha + addi r6,r6,mmu_hash_lock@l + eieio + li r0,0 + stw r0,0(r6) /* clear mmu_hash_lock */ +#endif + + /* reenable interrupts and DR */ + mtmsr r9 + SYNC_601 + isync + + lwz r0,4(r1) + mtlr r0 + blr + +/* + * This routine adds a hardware PTE to the hash table. + * It is designed to be called with the MMU either on or off. + * r3 contains the VSID, r4 contains the virtual address, + * r5 contains the linux PTE, r6 contains the old value of the + * linux PTE (before setting _PAGE_HASHPTE) and r7 contains the + * offset to be added to addresses (0 if the MMU is on, + * -KERNELBASE if it is off). r10 contains the upper half of + * the PTE if CONFIG_PTE_64BIT. + * On SMP, the caller should have the mmu_hash_lock held. + * We assume that the caller has (or will) set the _PAGE_HASHPTE + * bit in the linux PTE in memory. The value passed in r6 should + * be the old linux PTE value; if it doesn't have _PAGE_HASHPTE set + * this routine will skip the search for an existing HPTE. + * This procedure modifies r0, r3 - r6, r8, cr0. + * -- paulus. + * + * For speed, 4 of the instructions get patched once the size and + * physical address of the hash table are known. These definitions + * of Hash_base and Hash_bits below are just an example. + */ +Hash_base = 0xc0180000 +Hash_bits = 12 /* e.g. 256kB hash table */ +Hash_msk = (((1 << Hash_bits) - 1) * 64) + +/* defines for the PTE format for 32-bit PPCs */ +#define HPTE_SIZE 8 +#define PTEG_SIZE 64 +#define LG_PTEG_SIZE 6 +#define LDPTEu lwzu +#define LDPTE lwz +#define STPTE stw +#define CMPPTE cmpw +#define PTE_H 0x40 +#define PTE_V 0x80000000 +#define TST_V(r) rlwinm. r,r,0,0,0 +#define SET_V(r) oris r,r,PTE_V@h +#define CLR_V(r,t) rlwinm r,r,0,1,31 + +#define HASH_LEFT 31-(LG_PTEG_SIZE+Hash_bits-1) +#define HASH_RIGHT 31-LG_PTEG_SIZE + +_GLOBAL(create_hpte) + /* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */ + rlwinm r8,r5,32-10,31,31 /* _PAGE_RW -> PP lsb */ + rlwinm r0,r5,32-7,31,31 /* _PAGE_DIRTY -> PP lsb */ + and r8,r8,r0 /* writable if _RW & _DIRTY */ + rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */ + rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */ + ori r8,r8,0xe04 /* clear out reserved bits */ + andc r8,r5,r8 /* PP = user? (rw&dirty? 2: 3): 0 */ +BEGIN_FTR_SECTION + rlwinm r8,r8,0,~_PAGE_COHERENT /* clear M (coherence not required) */ +END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) +#ifdef CONFIG_PTE_64BIT + /* Put the XPN bits into the PTE */ + rlwimi r8,r10,8,20,22 + rlwimi r8,r10,2,29,29 +#endif + + /* Construct the high word of the PPC-style PTE (r5) */ + rlwinm r5,r3,7,1,24 /* put VSID in 0x7fffff80 bits */ + rlwimi r5,r4,10,26,31 /* put in API (abbrev page index) */ + SET_V(r5) /* set V (valid) bit */ + + /* Get the address of the primary PTE group in the hash table (r3) */ +_GLOBAL(hash_page_patch_A) + addis r0,r7,Hash_base@h /* base address of hash table */ + rlwimi r0,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ + rlwinm r3,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ + xor r3,r3,r0 /* make primary hash */ + li r0,8 /* PTEs/group */ + + /* + * Test the _PAGE_HASHPTE bit in the old linux PTE, and skip the search + * if it is clear, meaning that the HPTE isn't there already... + */ + andi. r6,r6,_PAGE_HASHPTE + beq+ 10f /* no PTE: go look for an empty slot */ + tlbie r4 + + addis r4,r7,htab_hash_searches@ha + lwz r6,htab_hash_searches@l(r4) + addi r6,r6,1 /* count how many searches we do */ + stw r6,htab_hash_searches@l(r4) + + /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ + mtctr r0 + addi r4,r3,-HPTE_SIZE +1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */ + CMPPTE 0,r6,r5 + bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ + beq+ found_slot + + /* Search the secondary PTEG for a matching PTE */ + ori r5,r5,PTE_H /* set H (secondary hash) bit */ +_GLOBAL(hash_page_patch_B) + xoris r4,r3,Hash_msk>>16 /* compute secondary hash */ + xori r4,r4,(-PTEG_SIZE & 0xffff) + addi r4,r4,-HPTE_SIZE + mtctr r0 +2: LDPTEu r6,HPTE_SIZE(r4) + CMPPTE 0,r6,r5 + bdnzf 2,2b + beq+ found_slot + xori r5,r5,PTE_H /* clear H bit again */ + + /* Search the primary PTEG for an empty slot */ +10: mtctr r0 + addi r4,r3,-HPTE_SIZE /* search primary PTEG */ +1: LDPTEu r6,HPTE_SIZE(r4) /* get next PTE */ + TST_V(r6) /* test valid bit */ + bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ + beq+ found_empty + + /* update counter of times that the primary PTEG is full */ + addis r4,r7,primary_pteg_full@ha + lwz r6,primary_pteg_full@l(r4) + addi r6,r6,1 + stw r6,primary_pteg_full@l(r4) + + /* Search the secondary PTEG for an empty slot */ + ori r5,r5,PTE_H /* set H (secondary hash) bit */ +_GLOBAL(hash_page_patch_C) + xoris r4,r3,Hash_msk>>16 /* compute secondary hash */ + xori r4,r4,(-PTEG_SIZE & 0xffff) + addi r4,r4,-HPTE_SIZE + mtctr r0 +2: LDPTEu r6,HPTE_SIZE(r4) + TST_V(r6) + bdnzf 2,2b + beq+ found_empty + xori r5,r5,PTE_H /* clear H bit again */ + + /* + * Choose an arbitrary slot in the primary PTEG to overwrite. + * Since both the primary and secondary PTEGs are full, and we + * have no information that the PTEs in the primary PTEG are + * more important or useful than those in the secondary PTEG, + * and we know there is a definite (although small) speed + * advantage to putting the PTE in the primary PTEG, we always + * put the PTE in the primary PTEG. + * + * In addition, we skip any slot that is mapping kernel text in + * order to avoid a deadlock when not using BAT mappings if + * trying to hash in the kernel hash code itself after it has + * already taken the hash table lock. This works in conjunction + * with pre-faulting of the kernel text. + * + * If the hash table bucket is full of kernel text entries, we'll + * lockup here but that shouldn't happen + */ + +1: addis r4,r7,next_slot@ha /* get next evict slot */ + lwz r6,next_slot@l(r4) + addi r6,r6,HPTE_SIZE /* search for candidate */ + andi. r6,r6,7*HPTE_SIZE + stw r6,next_slot@l(r4) + add r4,r3,r6 + LDPTE r0,HPTE_SIZE/2(r4) /* get PTE second word */ + clrrwi r0,r0,12 + lis r6,etext@h + ori r6,r6,etext@l /* get etext */ + tophys(r6,r6) + cmpl cr0,r0,r6 /* compare and try again */ + blt 1b + +#ifndef CONFIG_SMP + /* Store PTE in PTEG */ +found_empty: + STPTE r5,0(r4) +found_slot: + STPTE r8,HPTE_SIZE/2(r4) + +#else /* CONFIG_SMP */ +/* + * Between the tlbie above and updating the hash table entry below, + * another CPU could read the hash table entry and put it in its TLB. + * There are 3 cases: + * 1. using an empty slot + * 2. updating an earlier entry to change permissions (i.e. enable write) + * 3. taking over the PTE for an unrelated address + * + * In each case it doesn't really matter if the other CPUs have the old + * PTE in their TLB. So we don't need to bother with another tlbie here, + * which is convenient as we've overwritten the register that had the + * address. :-) The tlbie above is mainly to make sure that this CPU comes + * and gets the new PTE from the hash table. + * + * We do however have to make sure that the PTE is never in an invalid + * state with the V bit set. + */ +found_empty: +found_slot: + CLR_V(r5,r0) /* clear V (valid) bit in PTE */ + STPTE r5,0(r4) + sync + TLBSYNC + STPTE r8,HPTE_SIZE/2(r4) /* put in correct RPN, WIMG, PP bits */ + sync + SET_V(r5) + STPTE r5,0(r4) /* finally set V bit in PTE */ +#endif /* CONFIG_SMP */ + + sync /* make sure pte updates get to memory */ + blr + + .section .bss + .align 2 +next_slot: + .space 4 +primary_pteg_full: + .space 4 +htab_hash_searches: + .space 4 + .previous + +/* + * Flush the entry for a particular page from the hash table. + * + * flush_hash_pages(unsigned context, unsigned long va, unsigned long pmdval, + * int count) + * + * We assume that there is a hash table in use (Hash != 0). + */ +_GLOBAL(flush_hash_pages) + tophys(r7,0) + + /* + * We disable interrupts here, even on UP, because we want + * the _PAGE_HASHPTE bit to be a reliable indication of + * whether the HPTE exists (or at least whether one did once). + * We also turn off the MMU for data accesses so that we + * we can't take a hash table miss (assuming the code is + * covered by a BAT). -- paulus + */ + mfmsr r10 + SYNC + rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ + rlwinm r0,r0,0,28,26 /* clear MSR_DR */ + mtmsr r0 + SYNC_601 + isync + + /* First find a PTE in the range that has _PAGE_HASHPTE set */ +#ifndef CONFIG_PTE_64BIT + rlwimi r5,r4,22,20,29 +#else + rlwimi r5,r4,23,20,28 +#endif +1: lwz r0,PTE_FLAGS_OFFSET(r5) + cmpwi cr1,r6,1 + andi. r0,r0,_PAGE_HASHPTE + bne 2f + ble cr1,19f + addi r4,r4,0x1000 + addi r5,r5,PTE_SIZE + addi r6,r6,-1 + b 1b + + /* Convert context and va to VSID */ +2: mulli r3,r3,897*16 /* multiply context by context skew */ + rlwinm r0,r4,4,28,31 /* get ESID (top 4 bits of va) */ + mulli r0,r0,0x111 /* multiply by ESID skew */ + add r3,r3,r0 /* note code below trims to 24 bits */ + + /* Construct the high word of the PPC-style PTE (r11) */ + rlwinm r11,r3,7,1,24 /* put VSID in 0x7fffff80 bits */ + rlwimi r11,r4,10,26,31 /* put in API (abbrev page index) */ + SET_V(r11) /* set V (valid) bit */ + +#ifdef CONFIG_SMP + addis r9,r7,mmu_hash_lock@ha + addi r9,r9,mmu_hash_lock@l + CURRENT_THREAD_INFO(r8, r1) + add r8,r8,r7 + lwz r8,TI_CPU(r8) + oris r8,r8,9 +10: lwarx r0,0,r9 + cmpi 0,r0,0 + bne- 11f + stwcx. r8,0,r9 + beq+ 12f +11: lwz r0,0(r9) + cmpi 0,r0,0 + beq 10b + b 11b +12: isync +#endif + + /* + * Check the _PAGE_HASHPTE bit in the linux PTE. If it is + * already clear, we're done (for this pte). If not, + * clear it (atomically) and proceed. -- paulus. + */ +#if (PTE_FLAGS_OFFSET != 0) + addi r5,r5,PTE_FLAGS_OFFSET +#endif +33: lwarx r8,0,r5 /* fetch the pte flags word */ + andi. r0,r8,_PAGE_HASHPTE + beq 8f /* done if HASHPTE is already clear */ + rlwinm r8,r8,0,31,29 /* clear HASHPTE bit */ + stwcx. r8,0,r5 /* update the pte */ + bne- 33b + + /* Get the address of the primary PTE group in the hash table (r3) */ +_GLOBAL(flush_hash_patch_A) + addis r8,r7,Hash_base@h /* base address of hash table */ + rlwimi r8,r3,LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* VSID -> hash */ + rlwinm r0,r4,20+LG_PTEG_SIZE,HASH_LEFT,HASH_RIGHT /* PI -> hash */ + xor r8,r0,r8 /* make primary hash */ + + /* Search the primary PTEG for a PTE whose 1st (d)word matches r5 */ + li r0,8 /* PTEs/group */ + mtctr r0 + addi r12,r8,-HPTE_SIZE +1: LDPTEu r0,HPTE_SIZE(r12) /* get next PTE */ + CMPPTE 0,r0,r11 + bdnzf 2,1b /* loop while ctr != 0 && !cr0.eq */ + beq+ 3f + + /* Search the secondary PTEG for a matching PTE */ + ori r11,r11,PTE_H /* set H (secondary hash) bit */ + li r0,8 /* PTEs/group */ +_GLOBAL(flush_hash_patch_B) + xoris r12,r8,Hash_msk>>16 /* compute secondary hash */ + xori r12,r12,(-PTEG_SIZE & 0xffff) + addi r12,r12,-HPTE_SIZE + mtctr r0 +2: LDPTEu r0,HPTE_SIZE(r12) + CMPPTE 0,r0,r11 + bdnzf 2,2b + xori r11,r11,PTE_H /* clear H again */ + bne- 4f /* should rarely fail to find it */ + +3: li r0,0 + STPTE r0,0(r12) /* invalidate entry */ +4: sync + tlbie r4 /* in hw tlb too */ + sync + +8: ble cr1,9f /* if all ptes checked */ +81: addi r6,r6,-1 + addi r5,r5,PTE_SIZE + addi r4,r4,0x1000 + lwz r0,0(r5) /* check next pte */ + cmpwi cr1,r6,1 + andi. r0,r0,_PAGE_HASHPTE + bne 33b + bgt cr1,81b + +9: +#ifdef CONFIG_SMP + TLBSYNC + li r0,0 + stw r0,0(r9) /* clear mmu_hash_lock */ +#endif + +19: mtmsr r10 + SYNC_601 + isync + blr + +/* + * Flush an entry from the TLB + */ +_GLOBAL(_tlbie) +#ifdef CONFIG_SMP + CURRENT_THREAD_INFO(r8, r1) + lwz r8,TI_CPU(r8) + oris r8,r8,11 + mfmsr r10 + SYNC + rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ + rlwinm r0,r0,0,28,26 /* clear DR */ + mtmsr r0 + SYNC_601 + isync + lis r9,mmu_hash_lock@h + ori r9,r9,mmu_hash_lock@l + tophys(r9,r9) +10: lwarx r7,0,r9 + cmpwi 0,r7,0 + bne- 10b + stwcx. r8,0,r9 + bne- 10b + eieio + tlbie r3 + sync + TLBSYNC + li r0,0 + stw r0,0(r9) /* clear mmu_hash_lock */ + mtmsr r10 + SYNC_601 + isync +#else /* CONFIG_SMP */ + tlbie r3 + sync +#endif /* CONFIG_SMP */ + blr + +/* + * Flush the entire TLB. 603/603e only + */ +_GLOBAL(_tlbia) +#if defined(CONFIG_SMP) + CURRENT_THREAD_INFO(r8, r1) + lwz r8,TI_CPU(r8) + oris r8,r8,10 + mfmsr r10 + SYNC + rlwinm r0,r10,0,17,15 /* clear bit 16 (MSR_EE) */ + rlwinm r0,r0,0,28,26 /* clear DR */ + mtmsr r0 + SYNC_601 + isync + lis r9,mmu_hash_lock@h + ori r9,r9,mmu_hash_lock@l + tophys(r9,r9) +10: lwarx r7,0,r9 + cmpwi 0,r7,0 + bne- 10b + stwcx. r8,0,r9 + bne- 10b + sync + tlbia + sync + TLBSYNC + li r0,0 + stw r0,0(r9) /* clear mmu_hash_lock */ + mtmsr r10 + SYNC_601 + isync +#else /* CONFIG_SMP */ + sync + tlbia + sync +#endif /* CONFIG_SMP */ + blr diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S new file mode 100644 index 000000000..463174a4a --- /dev/null +++ b/arch/powerpc/mm/hash_low_64.S @@ -0,0 +1,1003 @@ +/* + * ppc64 MMU hashtable management routines + * + * (c) Copyright IBM Corp. 2003, 2005 + * + * Maintained by: Benjamin Herrenschmidt + * <benh@kernel.crashing.org> + * + * This file is covered by the GNU Public Licence v2 as + * described in the kernel's COPYING file. + */ + +#include <asm/reg.h> +#include <asm/pgtable.h> +#include <asm/mmu.h> +#include <asm/page.h> +#include <asm/types.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/cputable.h> + + .text + +/* + * Stackframe: + * + * +-> Back chain (SP + 256) + * | General register save area (SP + 112) + * | Parameter save area (SP + 48) + * | TOC save area (SP + 40) + * | link editor doubleword (SP + 32) + * | compiler doubleword (SP + 24) + * | LR save area (SP + 16) + * | CR save area (SP + 8) + * SP ---> +-- Back chain (SP + 0) + */ + +#ifndef CONFIG_PPC_64K_PAGES + +/***************************************************************************** + * * + * 4K SW & 4K HW pages implementation * + * * + *****************************************************************************/ + + +/* + * _hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, + * pte_t *ptep, unsigned long trap, unsigned long flags, + * int ssize) + * + * Adds a 4K page to the hash table in a segment of 4K pages only + */ + +_GLOBAL(__hash_page_4K) + mflr r0 + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + /* Save all params that we need after a function call */ + std r6,STK_PARAM(R6)(r1) + std r8,STK_PARAM(R8)(r1) + std r9,STK_PARAM(R9)(r1) + + /* Save non-volatile registers. + * r31 will hold "old PTE" + * r30 is "new PTE" + * r29 is vpn + * r28 is a hash value + * r27 is hashtab mask (maybe dynamic patched instead ?) + */ + std r27,STK_REG(R27)(r1) + std r28,STK_REG(R28)(r1) + std r29,STK_REG(R29)(r1) + std r30,STK_REG(R30)(r1) + std r31,STK_REG(R31)(r1) + + /* Step 1: + * + * Check permissions, atomically mark the linux PTE busy + * and hashed. + */ +1: + ldarx r31,0,r6 + /* Check access rights (access & ~(pte_val(*ptep))) */ + andc. r0,r4,r31 + bne- htab_wrong_access + /* Check if PTE is busy */ + andi. r0,r31,_PAGE_BUSY + /* If so, just bail out and refault if needed. Someone else + * is changing this PTE anyway and might hash it. + */ + bne- htab_bail_ok + + /* Prepare new PTE value (turn access RW into DIRTY, then + * add BUSY,HASHPTE and ACCESSED) + */ + rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */ + or r30,r30,r31 + ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE + /* Write the linux PTE atomically (setting busy) */ + stdcx. r30,0,r6 + bne- 1b + isync + + /* Step 2: + * + * Insert/Update the HPTE in the hash table. At this point, + * r4 (access) is re-useable, we use it for the new HPTE flags + */ + +BEGIN_FTR_SECTION + cmpdi r9,0 /* check segment size */ + bne 3f +END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) + /* Calc vpn and put it in r29 */ + sldi r29,r5,SID_SHIFT - VPN_SHIFT + rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT) + or r29,r28,r29 + /* + * Calculate hash value for primary slot and store it in r28 + * r3 = va, r5 = vsid + * r0 = (va >> 12) & ((1ul << (28 - 12)) -1) + */ + rldicl r0,r3,64-12,48 + xor r28,r5,r0 /* hash */ + b 4f + +3: /* Calc vpn and put it in r29 */ + sldi r29,r5,SID_SHIFT_1T - VPN_SHIFT + rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT) + or r29,r28,r29 + + /* + * calculate hash value for primary slot and + * store it in r28 for 1T segment + * r3 = va, r5 = vsid + */ + sldi r28,r5,25 /* vsid << 25 */ + /* r0 = (va >> 12) & ((1ul << (40 - 12)) -1) */ + rldicl r0,r3,64-12,36 + xor r28,r28,r5 /* vsid ^ ( vsid << 25) */ + xor r28,r28,r0 /* hash */ + + /* Convert linux PTE bits into HW equivalents */ +4: andi. r3,r30,0x1fe /* Get basic set of flags */ + xori r3,r3,HPTE_R_N /* _PAGE_EXEC -> NOEXEC */ + rlwinm r0,r30,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */ + rlwinm r4,r30,32-7+1,30,30 /* _PAGE_DIRTY -> _PAGE_USER (r4) */ + and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/ + andc r0,r30,r0 /* r0 = pte & ~r0 */ + rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */ + /* + * Always add "C" bit for perf. Memory coherence is always enabled + */ + ori r3,r3,HPTE_R_C | HPTE_R_M + + /* We eventually do the icache sync here (maybe inline that + * code rather than call a C function...) + */ +BEGIN_FTR_SECTION + mr r4,r30 + mr r5,r7 + bl hash_page_do_lazy_icache +END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE) + + /* At this point, r3 contains new PP bits, save them in + * place of "access" in the param area (sic) + */ + std r3,STK_PARAM(R4)(r1) + + /* Get htab_hash_mask */ + ld r4,htab_hash_mask@got(2) + ld r27,0(r4) /* htab_hash_mask -> r27 */ + + /* Check if we may already be in the hashtable, in this case, we + * go to out-of-line code to try to modify the HPTE + */ + andi. r0,r31,_PAGE_HASHPTE + bne htab_modify_pte + +htab_insert_pte: + /* Clear hpte bits in new pte (we also clear BUSY btw) and + * add _PAGE_HASHPTE + */ + lis r0,_PAGE_HPTEFLAGS@h + ori r0,r0,_PAGE_HPTEFLAGS@l + andc r30,r30,r0 + ori r30,r30,_PAGE_HASHPTE + + /* physical address r5 */ + rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT + sldi r5,r5,PAGE_SHIFT + + /* Calculate primary group hash */ + and r0,r28,r27 + rldicr r3,r0,3,63-3 /* r3 = (hash & mask) << 3 */ + + /* Call ppc_md.hpte_insert */ + ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ + mr r4,r29 /* Retrieve vpn */ + li r7,0 /* !bolted, !secondary */ + li r8,MMU_PAGE_4K /* page size */ + li r9,MMU_PAGE_4K /* actual page size */ + ld r10,STK_PARAM(R9)(r1) /* segment size */ +.globl htab_call_hpte_insert1 +htab_call_hpte_insert1: + bl . /* Patched by htab_finish_init() */ + cmpdi 0,r3,0 + bge htab_pte_insert_ok /* Insertion successful */ + cmpdi 0,r3,-2 /* Critical failure */ + beq- htab_pte_insert_failure + + /* Now try secondary slot */ + + /* physical address r5 */ + rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT + sldi r5,r5,PAGE_SHIFT + + /* Calculate secondary group hash */ + andc r0,r27,r28 + rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */ + + /* Call ppc_md.hpte_insert */ + ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ + mr r4,r29 /* Retrieve vpn */ + li r7,HPTE_V_SECONDARY /* !bolted, secondary */ + li r8,MMU_PAGE_4K /* page size */ + li r9,MMU_PAGE_4K /* actual page size */ + ld r10,STK_PARAM(R9)(r1) /* segment size */ +.globl htab_call_hpte_insert2 +htab_call_hpte_insert2: + bl . /* Patched by htab_finish_init() */ + cmpdi 0,r3,0 + bge+ htab_pte_insert_ok /* Insertion successful */ + cmpdi 0,r3,-2 /* Critical failure */ + beq- htab_pte_insert_failure + + /* Both are full, we need to evict something */ + mftb r0 + /* Pick a random group based on TB */ + andi. r0,r0,1 + mr r5,r28 + bne 2f + not r5,r5 +2: and r0,r5,r27 + rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + /* Call ppc_md.hpte_remove */ +.globl htab_call_hpte_remove +htab_call_hpte_remove: + bl . /* Patched by htab_finish_init() */ + + /* Try all again */ + b htab_insert_pte + +htab_bail_ok: + li r3,0 + b htab_bail + +htab_pte_insert_ok: + /* Insert slot number & secondary bit in PTE */ + rldimi r30,r3,12,63-15 + + /* Write out the PTE with a normal write + * (maybe add eieio may be good still ?) + */ +htab_write_out_pte: + ld r6,STK_PARAM(R6)(r1) + std r30,0(r6) + li r3, 0 +htab_bail: + ld r27,STK_REG(R27)(r1) + ld r28,STK_REG(R28)(r1) + ld r29,STK_REG(R29)(r1) + ld r30,STK_REG(R30)(r1) + ld r31,STK_REG(R31)(r1) + addi r1,r1,STACKFRAMESIZE + ld r0,16(r1) + mtlr r0 + blr + +htab_modify_pte: + /* Keep PP bits in r4 and slot idx from the PTE around in r3 */ + mr r4,r3 + rlwinm r3,r31,32-12,29,31 + + /* Secondary group ? if yes, get a inverted hash value */ + mr r5,r28 + andi. r0,r31,_PAGE_SECONDARY + beq 1f + not r5,r5 +1: + /* Calculate proper slot value for ppc_md.hpte_updatepp */ + and r0,r5,r27 + rldicr r0,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + add r3,r0,r3 /* add slot idx */ + + /* Call ppc_md.hpte_updatepp */ + mr r5,r29 /* vpn */ + li r6,MMU_PAGE_4K /* base page size */ + li r7,MMU_PAGE_4K /* actual page size */ + ld r8,STK_PARAM(R9)(r1) /* segment size */ + ld r9,STK_PARAM(R8)(r1) /* get "flags" param */ +.globl htab_call_hpte_updatepp +htab_call_hpte_updatepp: + bl . /* Patched by htab_finish_init() */ + + /* if we failed because typically the HPTE wasn't really here + * we try an insertion. + */ + cmpdi 0,r3,-1 + beq- htab_insert_pte + + /* Clear the BUSY bit and Write out the PTE */ + li r0,_PAGE_BUSY + andc r30,r30,r0 + b htab_write_out_pte + +htab_wrong_access: + /* Bail out clearing reservation */ + stdcx. r31,0,r6 + li r3,1 + b htab_bail + +htab_pte_insert_failure: + /* Bail out restoring old PTE */ + ld r6,STK_PARAM(R6)(r1) + std r31,0(r6) + li r3,-1 + b htab_bail + + +#else /* CONFIG_PPC_64K_PAGES */ + + +/***************************************************************************** + * * + * 64K SW & 4K or 64K HW in a 4K segment pages implementation * + * * + *****************************************************************************/ + +/* _hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, + * pte_t *ptep, unsigned long trap, unsigned local flags, + * int ssize, int subpg_prot) + */ + +/* + * For now, we do NOT implement Admixed pages + */ +_GLOBAL(__hash_page_4K) + mflr r0 + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + /* Save all params that we need after a function call */ + std r6,STK_PARAM(R6)(r1) + std r8,STK_PARAM(R8)(r1) + std r9,STK_PARAM(R9)(r1) + + /* Save non-volatile registers. + * r31 will hold "old PTE" + * r30 is "new PTE" + * r29 is vpn + * r28 is a hash value + * r27 is hashtab mask (maybe dynamic patched instead ?) + * r26 is the hidx mask + * r25 is the index in combo page + */ + std r25,STK_REG(R25)(r1) + std r26,STK_REG(R26)(r1) + std r27,STK_REG(R27)(r1) + std r28,STK_REG(R28)(r1) + std r29,STK_REG(R29)(r1) + std r30,STK_REG(R30)(r1) + std r31,STK_REG(R31)(r1) + + /* Step 1: + * + * Check permissions, atomically mark the linux PTE busy + * and hashed. + */ +1: + ldarx r31,0,r6 + /* Check access rights (access & ~(pte_val(*ptep))) */ + andc. r0,r4,r31 + bne- htab_wrong_access + /* Check if PTE is busy */ + andi. r0,r31,_PAGE_BUSY + /* If so, just bail out and refault if needed. Someone else + * is changing this PTE anyway and might hash it. + */ + bne- htab_bail_ok + /* Prepare new PTE value (turn access RW into DIRTY, then + * add BUSY and ACCESSED) + */ + rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */ + or r30,r30,r31 + ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED + oris r30,r30,_PAGE_COMBO@h + /* Write the linux PTE atomically (setting busy) */ + stdcx. r30,0,r6 + bne- 1b + isync + + /* Step 2: + * + * Insert/Update the HPTE in the hash table. At this point, + * r4 (access) is re-useable, we use it for the new HPTE flags + */ + + /* Load the hidx index */ + rldicl r25,r3,64-12,60 + +BEGIN_FTR_SECTION + cmpdi r9,0 /* check segment size */ + bne 3f +END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) + /* Calc vpn and put it in r29 */ + sldi r29,r5,SID_SHIFT - VPN_SHIFT + /* + * clrldi r3,r3,64 - SID_SHIFT --> ea & 0xfffffff + * srdi r28,r3,VPN_SHIFT + */ + rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT) + or r29,r28,r29 + /* + * Calculate hash value for primary slot and store it in r28 + * r3 = va, r5 = vsid + * r0 = (va >> 12) & ((1ul << (28 - 12)) -1) + */ + rldicl r0,r3,64-12,48 + xor r28,r5,r0 /* hash */ + b 4f + +3: /* Calc vpn and put it in r29 */ + sldi r29,r5,SID_SHIFT_1T - VPN_SHIFT + /* + * clrldi r3,r3,64 - SID_SHIFT_1T --> ea & 0xffffffffff + * srdi r28,r3,VPN_SHIFT + */ + rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT) + or r29,r28,r29 + + /* + * Calculate hash value for primary slot and + * store it in r28 for 1T segment + * r3 = va, r5 = vsid + */ + sldi r28,r5,25 /* vsid << 25 */ + /* r0 = (va >> 12) & ((1ul << (40 - 12)) -1) */ + rldicl r0,r3,64-12,36 + xor r28,r28,r5 /* vsid ^ ( vsid << 25) */ + xor r28,r28,r0 /* hash */ + + /* Convert linux PTE bits into HW equivalents */ +4: +#ifdef CONFIG_PPC_SUBPAGE_PROT + andc r10,r30,r10 + andi. r3,r10,0x1fe /* Get basic set of flags */ + rlwinm r0,r10,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */ +#else + andi. r3,r30,0x1fe /* Get basic set of flags */ + rlwinm r0,r30,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */ +#endif + xori r3,r3,HPTE_R_N /* _PAGE_EXEC -> NOEXEC */ + rlwinm r4,r30,32-7+1,30,30 /* _PAGE_DIRTY -> _PAGE_USER (r4) */ + and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/ + andc r0,r3,r0 /* r0 = pte & ~r0 */ + rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */ + /* + * Always add "C" bit for perf. Memory coherence is always enabled + */ + ori r3,r3,HPTE_R_C | HPTE_R_M + + /* We eventually do the icache sync here (maybe inline that + * code rather than call a C function...) + */ +BEGIN_FTR_SECTION + mr r4,r30 + mr r5,r7 + bl hash_page_do_lazy_icache +END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE) + + /* At this point, r3 contains new PP bits, save them in + * place of "access" in the param area (sic) + */ + std r3,STK_PARAM(R4)(r1) + + /* Get htab_hash_mask */ + ld r4,htab_hash_mask@got(2) + ld r27,0(r4) /* htab_hash_mask -> r27 */ + + /* Check if we may already be in the hashtable, in this case, we + * go to out-of-line code to try to modify the HPTE. We look for + * the bit at (1 >> (index + 32)) + */ + rldicl. r0,r31,64-12,48 + li r26,0 /* Default hidx */ + beq htab_insert_pte + + /* + * Check if the pte was already inserted into the hash table + * as a 64k HW page, and invalidate the 64k HPTE if so. + */ + andis. r0,r31,_PAGE_COMBO@h + beq htab_inval_old_hpte + + ld r6,STK_PARAM(R6)(r1) + ori r26,r6,PTE_PAGE_HIDX_OFFSET /* Load the hidx mask. */ + ld r26,0(r26) + addi r5,r25,36 /* Check actual HPTE_SUB bit, this */ + rldcr. r0,r31,r5,0 /* must match pgtable.h definition */ + bne htab_modify_pte + +htab_insert_pte: + /* real page number in r5, PTE RPN value + index */ + andis. r0,r31,_PAGE_4K_PFN@h + srdi r5,r31,PTE_RPN_SHIFT + bne- htab_special_pfn + sldi r5,r5,PAGE_FACTOR + add r5,r5,r25 +htab_special_pfn: + sldi r5,r5,HW_PAGE_SHIFT + + /* Calculate primary group hash */ + and r0,r28,r27 + rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + + /* Call ppc_md.hpte_insert */ + ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ + mr r4,r29 /* Retrieve vpn */ + li r7,0 /* !bolted, !secondary */ + li r8,MMU_PAGE_4K /* page size */ + li r9,MMU_PAGE_4K /* actual page size */ + ld r10,STK_PARAM(R9)(r1) /* segment size */ +.globl htab_call_hpte_insert1 +htab_call_hpte_insert1: + bl . /* patched by htab_finish_init() */ + cmpdi 0,r3,0 + bge htab_pte_insert_ok /* Insertion successful */ + cmpdi 0,r3,-2 /* Critical failure */ + beq- htab_pte_insert_failure + + /* Now try secondary slot */ + + /* real page number in r5, PTE RPN value + index */ + andis. r0,r31,_PAGE_4K_PFN@h + srdi r5,r31,PTE_RPN_SHIFT + bne- 3f + sldi r5,r5,PAGE_FACTOR + add r5,r5,r25 +3: sldi r5,r5,HW_PAGE_SHIFT + + /* Calculate secondary group hash */ + andc r0,r27,r28 + rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */ + + /* Call ppc_md.hpte_insert */ + ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ + mr r4,r29 /* Retrieve vpn */ + li r7,HPTE_V_SECONDARY /* !bolted, secondary */ + li r8,MMU_PAGE_4K /* page size */ + li r9,MMU_PAGE_4K /* actual page size */ + ld r10,STK_PARAM(R9)(r1) /* segment size */ +.globl htab_call_hpte_insert2 +htab_call_hpte_insert2: + bl . /* patched by htab_finish_init() */ + cmpdi 0,r3,0 + bge+ htab_pte_insert_ok /* Insertion successful */ + cmpdi 0,r3,-2 /* Critical failure */ + beq- htab_pte_insert_failure + + /* Both are full, we need to evict something */ + mftb r0 + /* Pick a random group based on TB */ + andi. r0,r0,1 + mr r5,r28 + bne 2f + not r5,r5 +2: and r0,r5,r27 + rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + /* Call ppc_md.hpte_remove */ +.globl htab_call_hpte_remove +htab_call_hpte_remove: + bl . /* patched by htab_finish_init() */ + + /* Try all again */ + b htab_insert_pte + + /* + * Call out to C code to invalidate an 64k HW HPTE that is + * useless now that the segment has been switched to 4k pages. + */ +htab_inval_old_hpte: + mr r3,r29 /* vpn */ + mr r4,r31 /* PTE.pte */ + li r5,0 /* PTE.hidx */ + li r6,MMU_PAGE_64K /* psize */ + ld r7,STK_PARAM(R9)(r1) /* ssize */ + ld r8,STK_PARAM(R8)(r1) /* flags */ + bl flush_hash_page + /* Clear out _PAGE_HPTE_SUB bits in the new linux PTE */ + lis r0,_PAGE_HPTE_SUB@h + ori r0,r0,_PAGE_HPTE_SUB@l + andc r30,r30,r0 + b htab_insert_pte + +htab_bail_ok: + li r3,0 + b htab_bail + +htab_pte_insert_ok: + /* Insert slot number & secondary bit in PTE second half, + * clear _PAGE_BUSY and set approriate HPTE slot bit + */ + ld r6,STK_PARAM(R6)(r1) + li r0,_PAGE_BUSY + andc r30,r30,r0 + /* HPTE SUB bit */ + li r0,1 + subfic r5,r25,27 /* Must match bit position in */ + sld r0,r0,r5 /* pgtable.h */ + or r30,r30,r0 + /* hindx */ + sldi r5,r25,2 + sld r3,r3,r5 + li r4,0xf + sld r4,r4,r5 + andc r26,r26,r4 + or r26,r26,r3 + ori r5,r6,PTE_PAGE_HIDX_OFFSET + std r26,0(r5) + lwsync + std r30,0(r6) + li r3, 0 +htab_bail: + ld r25,STK_REG(R25)(r1) + ld r26,STK_REG(R26)(r1) + ld r27,STK_REG(R27)(r1) + ld r28,STK_REG(R28)(r1) + ld r29,STK_REG(R29)(r1) + ld r30,STK_REG(R30)(r1) + ld r31,STK_REG(R31)(r1) + addi r1,r1,STACKFRAMESIZE + ld r0,16(r1) + mtlr r0 + blr + +htab_modify_pte: + /* Keep PP bits in r4 and slot idx from the PTE around in r3 */ + mr r4,r3 + sldi r5,r25,2 + srd r3,r26,r5 + + /* Secondary group ? if yes, get a inverted hash value */ + mr r5,r28 + andi. r0,r3,0x8 /* page secondary ? */ + beq 1f + not r5,r5 +1: andi. r3,r3,0x7 /* extract idx alone */ + + /* Calculate proper slot value for ppc_md.hpte_updatepp */ + and r0,r5,r27 + rldicr r0,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + add r3,r0,r3 /* add slot idx */ + + /* Call ppc_md.hpte_updatepp */ + mr r5,r29 /* vpn */ + li r6,MMU_PAGE_4K /* base page size */ + li r7,MMU_PAGE_4K /* actual page size */ + ld r8,STK_PARAM(R9)(r1) /* segment size */ + ld r9,STK_PARAM(R8)(r1) /* get "flags" param */ +.globl htab_call_hpte_updatepp +htab_call_hpte_updatepp: + bl . /* patched by htab_finish_init() */ + + /* if we failed because typically the HPTE wasn't really here + * we try an insertion. + */ + cmpdi 0,r3,-1 + beq- htab_insert_pte + + /* Clear the BUSY bit and Write out the PTE */ + li r0,_PAGE_BUSY + andc r30,r30,r0 + ld r6,STK_PARAM(R6)(r1) + std r30,0(r6) + li r3,0 + b htab_bail + +htab_wrong_access: + /* Bail out clearing reservation */ + stdcx. r31,0,r6 + li r3,1 + b htab_bail + +htab_pte_insert_failure: + /* Bail out restoring old PTE */ + ld r6,STK_PARAM(R6)(r1) + std r31,0(r6) + li r3,-1 + b htab_bail + +#endif /* CONFIG_PPC_64K_PAGES */ + +#ifdef CONFIG_PPC_HAS_HASH_64K + +/***************************************************************************** + * * + * 64K SW & 64K HW in a 64K segment pages implementation * + * * + *****************************************************************************/ + +_GLOBAL(__hash_page_64K) + mflr r0 + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + /* Save all params that we need after a function call */ + std r6,STK_PARAM(R6)(r1) + std r8,STK_PARAM(R8)(r1) + std r9,STK_PARAM(R9)(r1) + + /* Save non-volatile registers. + * r31 will hold "old PTE" + * r30 is "new PTE" + * r29 is vpn + * r28 is a hash value + * r27 is hashtab mask (maybe dynamic patched instead ?) + */ + std r27,STK_REG(R27)(r1) + std r28,STK_REG(R28)(r1) + std r29,STK_REG(R29)(r1) + std r30,STK_REG(R30)(r1) + std r31,STK_REG(R31)(r1) + + /* Step 1: + * + * Check permissions, atomically mark the linux PTE busy + * and hashed. + */ +1: + ldarx r31,0,r6 + /* Check access rights (access & ~(pte_val(*ptep))) */ + andc. r0,r4,r31 + bne- ht64_wrong_access + /* Check if PTE is busy */ + andi. r0,r31,_PAGE_BUSY + /* If so, just bail out and refault if needed. Someone else + * is changing this PTE anyway and might hash it. + */ + bne- ht64_bail_ok +BEGIN_FTR_SECTION + /* Check if PTE has the cache-inhibit bit set */ + andi. r0,r31,_PAGE_NO_CACHE + /* If so, bail out and refault as a 4k page */ + bne- ht64_bail_ok +END_MMU_FTR_SECTION_IFCLR(MMU_FTR_CI_LARGE_PAGE) + /* Prepare new PTE value (turn access RW into DIRTY, then + * add BUSY and ACCESSED) + */ + rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */ + or r30,r30,r31 + ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED + /* Write the linux PTE atomically (setting busy) */ + stdcx. r30,0,r6 + bne- 1b + isync + + /* Step 2: + * + * Insert/Update the HPTE in the hash table. At this point, + * r4 (access) is re-useable, we use it for the new HPTE flags + */ + +BEGIN_FTR_SECTION + cmpdi r9,0 /* check segment size */ + bne 3f +END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) + /* Calc vpn and put it in r29 */ + sldi r29,r5,SID_SHIFT - VPN_SHIFT + rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT - VPN_SHIFT) + or r29,r28,r29 + + /* Calculate hash value for primary slot and store it in r28 + * r3 = va, r5 = vsid + * r0 = (va >> 16) & ((1ul << (28 - 16)) -1) + */ + rldicl r0,r3,64-16,52 + xor r28,r5,r0 /* hash */ + b 4f + +3: /* Calc vpn and put it in r29 */ + sldi r29,r5,SID_SHIFT_1T - VPN_SHIFT + rldicl r28,r3,64 - VPN_SHIFT,64 - (SID_SHIFT_1T - VPN_SHIFT) + or r29,r28,r29 + /* + * calculate hash value for primary slot and + * store it in r28 for 1T segment + * r3 = va, r5 = vsid + */ + sldi r28,r5,25 /* vsid << 25 */ + /* r0 = (va >> 16) & ((1ul << (40 - 16)) -1) */ + rldicl r0,r3,64-16,40 + xor r28,r28,r5 /* vsid ^ ( vsid << 25) */ + xor r28,r28,r0 /* hash */ + + /* Convert linux PTE bits into HW equivalents */ +4: andi. r3,r30,0x1fe /* Get basic set of flags */ + xori r3,r3,HPTE_R_N /* _PAGE_EXEC -> NOEXEC */ + rlwinm r0,r30,32-9+1,30,30 /* _PAGE_RW -> _PAGE_USER (r0) */ + rlwinm r4,r30,32-7+1,30,30 /* _PAGE_DIRTY -> _PAGE_USER (r4) */ + and r0,r0,r4 /* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/ + andc r0,r30,r0 /* r0 = pte & ~r0 */ + rlwimi r3,r0,32-1,31,31 /* Insert result into PP lsb */ + /* + * Always add "C" bit for perf. Memory coherence is always enabled + */ + ori r3,r3,HPTE_R_C | HPTE_R_M + + /* We eventually do the icache sync here (maybe inline that + * code rather than call a C function...) + */ +BEGIN_FTR_SECTION + mr r4,r30 + mr r5,r7 + bl hash_page_do_lazy_icache +END_FTR_SECTION(CPU_FTR_NOEXECUTE|CPU_FTR_COHERENT_ICACHE, CPU_FTR_NOEXECUTE) + + /* At this point, r3 contains new PP bits, save them in + * place of "access" in the param area (sic) + */ + std r3,STK_PARAM(R4)(r1) + + /* Get htab_hash_mask */ + ld r4,htab_hash_mask@got(2) + ld r27,0(r4) /* htab_hash_mask -> r27 */ + + /* Check if we may already be in the hashtable, in this case, we + * go to out-of-line code to try to modify the HPTE + */ + rldicl. r0,r31,64-12,48 + bne ht64_modify_pte + +ht64_insert_pte: + /* Clear hpte bits in new pte (we also clear BUSY btw) and + * add _PAGE_HPTE_SUB0 + */ + lis r0,_PAGE_HPTEFLAGS@h + ori r0,r0,_PAGE_HPTEFLAGS@l + andc r30,r30,r0 +#ifdef CONFIG_PPC_64K_PAGES + oris r30,r30,_PAGE_HPTE_SUB0@h +#else + ori r30,r30,_PAGE_HASHPTE +#endif + /* Phyical address in r5 */ + rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT + sldi r5,r5,PAGE_SHIFT + + /* Calculate primary group hash */ + and r0,r28,r27 + rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + + /* Call ppc_md.hpte_insert */ + ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ + mr r4,r29 /* Retrieve vpn */ + li r7,0 /* !bolted, !secondary */ + li r8,MMU_PAGE_64K + li r9,MMU_PAGE_64K /* actual page size */ + ld r10,STK_PARAM(R9)(r1) /* segment size */ +.globl ht64_call_hpte_insert1 +ht64_call_hpte_insert1: + bl . /* patched by htab_finish_init() */ + cmpdi 0,r3,0 + bge ht64_pte_insert_ok /* Insertion successful */ + cmpdi 0,r3,-2 /* Critical failure */ + beq- ht64_pte_insert_failure + + /* Now try secondary slot */ + + /* Phyical address in r5 */ + rldicl r5,r31,64-PTE_RPN_SHIFT,PTE_RPN_SHIFT + sldi r5,r5,PAGE_SHIFT + + /* Calculate secondary group hash */ + andc r0,r27,r28 + rldicr r3,r0,3,63-3 /* r0 = (~hash & mask) << 3 */ + + /* Call ppc_md.hpte_insert */ + ld r6,STK_PARAM(R4)(r1) /* Retrieve new pp bits */ + mr r4,r29 /* Retrieve vpn */ + li r7,HPTE_V_SECONDARY /* !bolted, secondary */ + li r8,MMU_PAGE_64K + li r9,MMU_PAGE_64K /* actual page size */ + ld r10,STK_PARAM(R9)(r1) /* segment size */ +.globl ht64_call_hpte_insert2 +ht64_call_hpte_insert2: + bl . /* patched by htab_finish_init() */ + cmpdi 0,r3,0 + bge+ ht64_pte_insert_ok /* Insertion successful */ + cmpdi 0,r3,-2 /* Critical failure */ + beq- ht64_pte_insert_failure + + /* Both are full, we need to evict something */ + mftb r0 + /* Pick a random group based on TB */ + andi. r0,r0,1 + mr r5,r28 + bne 2f + not r5,r5 +2: and r0,r5,r27 + rldicr r3,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + /* Call ppc_md.hpte_remove */ +.globl ht64_call_hpte_remove +ht64_call_hpte_remove: + bl . /* patched by htab_finish_init() */ + + /* Try all again */ + b ht64_insert_pte + +ht64_bail_ok: + li r3,0 + b ht64_bail + +ht64_pte_insert_ok: + /* Insert slot number & secondary bit in PTE */ + rldimi r30,r3,12,63-15 + + /* Write out the PTE with a normal write + * (maybe add eieio may be good still ?) + */ +ht64_write_out_pte: + ld r6,STK_PARAM(R6)(r1) + std r30,0(r6) + li r3, 0 +ht64_bail: + ld r27,STK_REG(R27)(r1) + ld r28,STK_REG(R28)(r1) + ld r29,STK_REG(R29)(r1) + ld r30,STK_REG(R30)(r1) + ld r31,STK_REG(R31)(r1) + addi r1,r1,STACKFRAMESIZE + ld r0,16(r1) + mtlr r0 + blr + +ht64_modify_pte: + /* Keep PP bits in r4 and slot idx from the PTE around in r3 */ + mr r4,r3 + rlwinm r3,r31,32-12,29,31 + + /* Secondary group ? if yes, get a inverted hash value */ + mr r5,r28 + andi. r0,r31,_PAGE_F_SECOND + beq 1f + not r5,r5 +1: + /* Calculate proper slot value for ppc_md.hpte_updatepp */ + and r0,r5,r27 + rldicr r0,r0,3,63-3 /* r0 = (hash & mask) << 3 */ + add r3,r0,r3 /* add slot idx */ + + /* Call ppc_md.hpte_updatepp */ + mr r5,r29 /* vpn */ + li r6,MMU_PAGE_64K /* base page size */ + li r7,MMU_PAGE_64K /* actual page size */ + ld r8,STK_PARAM(R9)(r1) /* segment size */ + ld r9,STK_PARAM(R8)(r1) /* get "flags" param */ +.globl ht64_call_hpte_updatepp +ht64_call_hpte_updatepp: + bl . /* patched by htab_finish_init() */ + + /* if we failed because typically the HPTE wasn't really here + * we try an insertion. + */ + cmpdi 0,r3,-1 + beq- ht64_insert_pte + + /* Clear the BUSY bit and Write out the PTE */ + li r0,_PAGE_BUSY + andc r30,r30,r0 + b ht64_write_out_pte + +ht64_wrong_access: + /* Bail out clearing reservation */ + stdcx. r31,0,r6 + li r3,1 + b ht64_bail + +ht64_pte_insert_failure: + /* Bail out restoring old PTE */ + ld r6,STK_PARAM(R6)(r1) + std r31,0(r6) + li r3,-1 + b ht64_bail + + +#endif /* CONFIG_PPC_HAS_HASH_64K */ + + +/***************************************************************************** + * * + * Huge pages implementation is in hugetlbpage.c * + * * + *****************************************************************************/ diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c new file mode 100644 index 000000000..9c4880dde --- /dev/null +++ b/arch/powerpc/mm/hash_native_64.c @@ -0,0 +1,723 @@ +/* + * native hashtable management. + * + * SMP scalability work: + * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG_LOW + +#include <linux/spinlock.h> +#include <linux/bitops.h> +#include <linux/of.h> +#include <linux/threads.h> +#include <linux/smp.h> + +#include <asm/machdep.h> +#include <asm/mmu.h> +#include <asm/mmu_context.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/tlb.h> +#include <asm/cputable.h> +#include <asm/udbg.h> +#include <asm/kexec.h> +#include <asm/ppc-opcode.h> + +#include <misc/cxl.h> + +#ifdef DEBUG_LOW +#define DBG_LOW(fmt...) udbg_printf(fmt) +#else +#define DBG_LOW(fmt...) +#endif + +#ifdef __BIG_ENDIAN__ +#define HPTE_LOCK_BIT 3 +#else +#define HPTE_LOCK_BIT (56+3) +#endif + +DEFINE_RAW_SPINLOCK(native_tlbie_lock); + +static inline void __tlbie(unsigned long vpn, int psize, int apsize, int ssize) +{ + unsigned long va; + unsigned int penc; + unsigned long sllp; + + /* + * We need 14 to 65 bits of va for a tlibe of 4K page + * With vpn we ignore the lower VPN_SHIFT bits already. + * And top two bits are already ignored because we can + * only accomadate 76 bits in a 64 bit vpn with a VPN_SHIFT + * of 12. + */ + va = vpn << VPN_SHIFT; + /* + * clear top 16 bits of 64bit va, non SLS segment + * Older versions of the architecture (2.02 and earler) require the + * masking of the top 16 bits. + */ + va &= ~(0xffffULL << 48); + + switch (psize) { + case MMU_PAGE_4K: + /* clear out bits after (52) [0....52.....63] */ + va &= ~((1ul << (64 - 52)) - 1); + va |= ssize << 8; + sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) | + ((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4); + va |= sllp << 5; + asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2) + : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) + : "memory"); + break; + default: + /* We need 14 to 14 + i bits of va */ + penc = mmu_psize_defs[psize].penc[apsize]; + va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1); + va |= penc << 12; + va |= ssize << 8; + /* + * AVAL bits: + * We don't need all the bits, but rest of the bits + * must be ignored by the processor. + * vpn cover upto 65 bits of va. (0...65) and we need + * 58..64 bits of va. + */ + va |= (vpn & 0xfe); /* AVAL */ + va |= 1; /* L */ + asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2) + : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) + : "memory"); + break; + } +} + +static inline void __tlbiel(unsigned long vpn, int psize, int apsize, int ssize) +{ + unsigned long va; + unsigned int penc; + unsigned long sllp; + + /* VPN_SHIFT can be atmost 12 */ + va = vpn << VPN_SHIFT; + /* + * clear top 16 bits of 64 bit va, non SLS segment + * Older versions of the architecture (2.02 and earler) require the + * masking of the top 16 bits. + */ + va &= ~(0xffffULL << 48); + + switch (psize) { + case MMU_PAGE_4K: + /* clear out bits after(52) [0....52.....63] */ + va &= ~((1ul << (64 - 52)) - 1); + va |= ssize << 8; + sllp = ((mmu_psize_defs[apsize].sllp & SLB_VSID_L) >> 6) | + ((mmu_psize_defs[apsize].sllp & SLB_VSID_LP) >> 4); + va |= sllp << 5; + asm volatile(".long 0x7c000224 | (%0 << 11) | (0 << 21)" + : : "r"(va) : "memory"); + break; + default: + /* We need 14 to 14 + i bits of va */ + penc = mmu_psize_defs[psize].penc[apsize]; + va &= ~((1ul << mmu_psize_defs[apsize].shift) - 1); + va |= penc << 12; + va |= ssize << 8; + /* + * AVAL bits: + * We don't need all the bits, but rest of the bits + * must be ignored by the processor. + * vpn cover upto 65 bits of va. (0...65) and we need + * 58..64 bits of va. + */ + va |= (vpn & 0xfe); + va |= 1; /* L */ + asm volatile(".long 0x7c000224 | (%0 << 11) | (1 << 21)" + : : "r"(va) : "memory"); + break; + } + +} + +static inline void tlbie(unsigned long vpn, int psize, int apsize, + int ssize, int local) +{ + unsigned int use_local; + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + use_local = local && mmu_has_feature(MMU_FTR_TLBIEL) && !cxl_ctx_in_use(); + + if (use_local) + use_local = mmu_psize_defs[psize].tlbiel; + if (lock_tlbie && !use_local) + raw_spin_lock(&native_tlbie_lock); + asm volatile("ptesync": : :"memory"); + if (use_local) { + __tlbiel(vpn, psize, apsize, ssize); + asm volatile("ptesync": : :"memory"); + } else { + __tlbie(vpn, psize, apsize, ssize); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); + } + if (lock_tlbie && !use_local) + raw_spin_unlock(&native_tlbie_lock); +} + +static inline void native_lock_hpte(struct hash_pte *hptep) +{ + unsigned long *word = (unsigned long *)&hptep->v; + + while (1) { + if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word)) + break; + while(test_bit(HPTE_LOCK_BIT, word)) + cpu_relax(); + } +} + +static inline void native_unlock_hpte(struct hash_pte *hptep) +{ + unsigned long *word = (unsigned long *)&hptep->v; + + clear_bit_unlock(HPTE_LOCK_BIT, word); +} + +static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, + unsigned long pa, unsigned long rflags, + unsigned long vflags, int psize, int apsize, int ssize) +{ + struct hash_pte *hptep = htab_address + hpte_group; + unsigned long hpte_v, hpte_r; + int i; + + if (!(vflags & HPTE_V_BOLTED)) { + DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx," + " rflags=%lx, vflags=%lx, psize=%d)\n", + hpte_group, vpn, pa, rflags, vflags, psize); + } + + for (i = 0; i < HPTES_PER_GROUP; i++) { + if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) { + /* retry with lock held */ + native_lock_hpte(hptep); + if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) + break; + native_unlock_hpte(hptep); + } + + hptep++; + } + + if (i == HPTES_PER_GROUP) + return -1; + + hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; + hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; + + if (!(vflags & HPTE_V_BOLTED)) { + DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n", + i, hpte_v, hpte_r); + } + + hptep->r = cpu_to_be64(hpte_r); + /* Guarantee the second dword is visible before the valid bit */ + eieio(); + /* + * Now set the first dword including the valid bit + * NOTE: this also unlocks the hpte + */ + hptep->v = cpu_to_be64(hpte_v); + + __asm__ __volatile__ ("ptesync" : : : "memory"); + + return i | (!!(vflags & HPTE_V_SECONDARY) << 3); +} + +static long native_hpte_remove(unsigned long hpte_group) +{ + struct hash_pte *hptep; + int i; + int slot_offset; + unsigned long hpte_v; + + DBG_LOW(" remove(group=%lx)\n", hpte_group); + + /* pick a random entry to start at */ + slot_offset = mftb() & 0x7; + + for (i = 0; i < HPTES_PER_GROUP; i++) { + hptep = htab_address + hpte_group + slot_offset; + hpte_v = be64_to_cpu(hptep->v); + + if ((hpte_v & HPTE_V_VALID) && !(hpte_v & HPTE_V_BOLTED)) { + /* retry with lock held */ + native_lock_hpte(hptep); + hpte_v = be64_to_cpu(hptep->v); + if ((hpte_v & HPTE_V_VALID) + && !(hpte_v & HPTE_V_BOLTED)) + break; + native_unlock_hpte(hptep); + } + + slot_offset++; + slot_offset &= 0x7; + } + + if (i == HPTES_PER_GROUP) + return -1; + + /* Invalidate the hpte. NOTE: this also unlocks it */ + hptep->v = 0; + + return i; +} + +static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, + unsigned long vpn, int bpsize, + int apsize, int ssize, unsigned long flags) +{ + struct hash_pte *hptep = htab_address + slot; + unsigned long hpte_v, want_v; + int ret = 0, local = 0; + + want_v = hpte_encode_avpn(vpn, bpsize, ssize); + + DBG_LOW(" update(vpn=%016lx, avpnv=%016lx, group=%lx, newpp=%lx)", + vpn, want_v & HPTE_V_AVPN, slot, newpp); + + hpte_v = be64_to_cpu(hptep->v); + /* + * We need to invalidate the TLB always because hpte_remove doesn't do + * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less + * random entry from it. When we do that we don't invalidate the TLB + * (hpte_remove) because we assume the old translation is still + * technically "valid". + */ + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) { + DBG_LOW(" -> miss\n"); + ret = -1; + } else { + native_lock_hpte(hptep); + /* recheck with locks held */ + hpte_v = be64_to_cpu(hptep->v); + if (unlikely(!HPTE_V_COMPARE(hpte_v, want_v) || + !(hpte_v & HPTE_V_VALID))) { + ret = -1; + } else { + DBG_LOW(" -> hit\n"); + /* Update the HPTE */ + hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & + ~(HPTE_R_PP | HPTE_R_N)) | + (newpp & (HPTE_R_PP | HPTE_R_N | + HPTE_R_C))); + } + native_unlock_hpte(hptep); + } + + if (flags & HPTE_LOCAL_UPDATE) + local = 1; + /* + * Ensure it is out of the tlb too if it is not a nohpte fault + */ + if (!(flags & HPTE_NOHPTE_UPDATE)) + tlbie(vpn, bpsize, apsize, ssize, local); + + return ret; +} + +static long native_hpte_find(unsigned long vpn, int psize, int ssize) +{ + struct hash_pte *hptep; + unsigned long hash; + unsigned long i; + long slot; + unsigned long want_v, hpte_v; + + hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); + + /* Bolted mappings are only ever in the primary group */ + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + for (i = 0; i < HPTES_PER_GROUP; i++) { + hptep = htab_address + slot; + hpte_v = be64_to_cpu(hptep->v); + + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) + /* HPTE matches */ + return slot; + ++slot; + } + + return -1; +} + +/* + * Update the page protection bits. Intended to be used to create + * guard pages for kernel data structures on pages which are bolted + * in the HPT. Assumes pages being operated on will not be stolen. + * + * No need to lock here because we should be the only user. + */ +static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, + int psize, int ssize) +{ + unsigned long vpn; + unsigned long vsid; + long slot; + struct hash_pte *hptep; + + vsid = get_kernel_vsid(ea, ssize); + vpn = hpt_vpn(ea, vsid, ssize); + + slot = native_hpte_find(vpn, psize, ssize); + if (slot == -1) + panic("could not find page to bolt\n"); + hptep = htab_address + slot; + + /* Update the HPTE */ + hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & + ~(HPTE_R_PP | HPTE_R_N)) | + (newpp & (HPTE_R_PP | HPTE_R_N))); + /* + * Ensure it is out of the tlb too. Bolted entries base and + * actual page size will be same. + */ + tlbie(vpn, psize, psize, ssize, 0); +} + +static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, + int bpsize, int apsize, int ssize, int local) +{ + struct hash_pte *hptep = htab_address + slot; + unsigned long hpte_v; + unsigned long want_v; + unsigned long flags; + + local_irq_save(flags); + + DBG_LOW(" invalidate(vpn=%016lx, hash: %lx)\n", vpn, slot); + + want_v = hpte_encode_avpn(vpn, bpsize, ssize); + native_lock_hpte(hptep); + hpte_v = be64_to_cpu(hptep->v); + + /* + * We need to invalidate the TLB always because hpte_remove doesn't do + * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less + * random entry from it. When we do that we don't invalidate the TLB + * (hpte_remove) because we assume the old translation is still + * technically "valid". + */ + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) + native_unlock_hpte(hptep); + else + /* Invalidate the hpte. NOTE: this also unlocks it */ + hptep->v = 0; + + /* Invalidate the TLB */ + tlbie(vpn, bpsize, apsize, ssize, local); + + local_irq_restore(flags); +} + +static void native_hugepage_invalidate(unsigned long vsid, + unsigned long addr, + unsigned char *hpte_slot_array, + int psize, int ssize, int local) +{ + int i; + struct hash_pte *hptep; + int actual_psize = MMU_PAGE_16M; + unsigned int max_hpte_count, valid; + unsigned long flags, s_addr = addr; + unsigned long hpte_v, want_v, shift; + unsigned long hidx, vpn = 0, hash, slot; + + shift = mmu_psize_defs[psize].shift; + max_hpte_count = 1U << (PMD_SHIFT - shift); + + local_irq_save(flags); + for (i = 0; i < max_hpte_count; i++) { + valid = hpte_valid(hpte_slot_array, i); + if (!valid) + continue; + hidx = hpte_hash_index(hpte_slot_array, i); + + /* get the vpn */ + addr = s_addr + (i * (1ul << shift)); + vpn = hpt_vpn(addr, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + + hptep = htab_address + slot; + want_v = hpte_encode_avpn(vpn, psize, ssize); + native_lock_hpte(hptep); + hpte_v = be64_to_cpu(hptep->v); + + /* Even if we miss, we need to invalidate the TLB */ + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) + native_unlock_hpte(hptep); + else + /* Invalidate the hpte. NOTE: this also unlocks it */ + hptep->v = 0; + /* + * We need to do tlb invalidate for all the address, tlbie + * instruction compares entry_VA in tlb with the VA specified + * here + */ + tlbie(vpn, psize, actual_psize, ssize, local); + } + local_irq_restore(flags); +} + +static inline int __hpte_actual_psize(unsigned int lp, int psize) +{ + int i, shift; + unsigned int mask; + + /* start from 1 ignoring MMU_PAGE_4K */ + for (i = 1; i < MMU_PAGE_COUNT; i++) { + + /* invalid penc */ + if (mmu_psize_defs[psize].penc[i] == -1) + continue; + /* + * encoding bits per actual page size + * PTE LP actual page size + * rrrr rrrz >=8KB + * rrrr rrzz >=16KB + * rrrr rzzz >=32KB + * rrrr zzzz >=64KB + * ....... + */ + shift = mmu_psize_defs[i].shift - LP_SHIFT; + if (shift > LP_BITS) + shift = LP_BITS; + mask = (1 << shift) - 1; + if ((lp & mask) == mmu_psize_defs[psize].penc[i]) + return i; + } + return -1; +} + +static void hpte_decode(struct hash_pte *hpte, unsigned long slot, + int *psize, int *apsize, int *ssize, unsigned long *vpn) +{ + unsigned long avpn, pteg, vpi; + unsigned long hpte_v = be64_to_cpu(hpte->v); + unsigned long hpte_r = be64_to_cpu(hpte->r); + unsigned long vsid, seg_off; + int size, a_size, shift; + /* Look at the 8 bit LP value */ + unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1); + + if (!(hpte_v & HPTE_V_LARGE)) { + size = MMU_PAGE_4K; + a_size = MMU_PAGE_4K; + } else { + for (size = 0; size < MMU_PAGE_COUNT; size++) { + + /* valid entries have a shift value */ + if (!mmu_psize_defs[size].shift) + continue; + + a_size = __hpte_actual_psize(lp, size); + if (a_size != -1) + break; + } + } + /* This works for all page sizes, and for 256M and 1T segments */ + *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT; + shift = mmu_psize_defs[size].shift; + + avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm); + pteg = slot / HPTES_PER_GROUP; + if (hpte_v & HPTE_V_SECONDARY) + pteg = ~pteg; + + switch (*ssize) { + case MMU_SEGSIZE_256M: + /* We only have 28 - 23 bits of seg_off in avpn */ + seg_off = (avpn & 0x1f) << 23; + vsid = avpn >> 5; + /* We can find more bits from the pteg value */ + if (shift < 23) { + vpi = (vsid ^ pteg) & htab_hash_mask; + seg_off |= vpi << shift; + } + *vpn = vsid << (SID_SHIFT - VPN_SHIFT) | seg_off >> VPN_SHIFT; + break; + case MMU_SEGSIZE_1T: + /* We only have 40 - 23 bits of seg_off in avpn */ + seg_off = (avpn & 0x1ffff) << 23; + vsid = avpn >> 17; + if (shift < 23) { + vpi = (vsid ^ (vsid << 25) ^ pteg) & htab_hash_mask; + seg_off |= vpi << shift; + } + *vpn = vsid << (SID_SHIFT_1T - VPN_SHIFT) | seg_off >> VPN_SHIFT; + break; + default: + *vpn = size = 0; + } + *psize = size; + *apsize = a_size; +} + +/* + * clear all mappings on kexec. All cpus are in real mode (or they will + * be when they isi), and we are the only one left. We rely on our kernel + * mapping being 0xC0's and the hardware ignoring those two real bits. + * + * TODO: add batching support when enabled. remember, no dynamic memory here, + * athough there is the control page available... + */ +static void native_hpte_clear(void) +{ + unsigned long vpn = 0; + unsigned long slot, slots, flags; + struct hash_pte *hptep = htab_address; + unsigned long hpte_v; + unsigned long pteg_count; + int psize, apsize, ssize; + + pteg_count = htab_hash_mask + 1; + + local_irq_save(flags); + + /* we take the tlbie lock and hold it. Some hardware will + * deadlock if we try to tlbie from two processors at once. + */ + raw_spin_lock(&native_tlbie_lock); + + slots = pteg_count * HPTES_PER_GROUP; + + for (slot = 0; slot < slots; slot++, hptep++) { + /* + * we could lock the pte here, but we are the only cpu + * running, right? and for crash dump, we probably + * don't want to wait for a maybe bad cpu. + */ + hpte_v = be64_to_cpu(hptep->v); + + /* + * Call __tlbie() here rather than tlbie() since we + * already hold the native_tlbie_lock. + */ + if (hpte_v & HPTE_V_VALID) { + hpte_decode(hptep, slot, &psize, &apsize, &ssize, &vpn); + hptep->v = 0; + __tlbie(vpn, psize, apsize, ssize); + } + } + + asm volatile("eieio; tlbsync; ptesync":::"memory"); + raw_spin_unlock(&native_tlbie_lock); + local_irq_restore(flags); +} + +/* + * Batched hash table flush, we batch the tlbie's to avoid taking/releasing + * the lock all the time + */ +static void native_flush_hash_range(unsigned long number, int local) +{ + unsigned long vpn; + unsigned long hash, index, hidx, shift, slot; + struct hash_pte *hptep; + unsigned long hpte_v; + unsigned long want_v; + unsigned long flags; + real_pte_t pte; + struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch); + unsigned long psize = batch->psize; + int ssize = batch->ssize; + int i; + + local_irq_save(flags); + + for (i = 0; i < number; i++) { + vpn = batch->vpn[i]; + pte = batch->pte[i]; + + pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { + hash = hpt_hash(vpn, shift, ssize); + hidx = __rpte_to_hidx(pte, index); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + hptep = htab_address + slot; + want_v = hpte_encode_avpn(vpn, psize, ssize); + native_lock_hpte(hptep); + hpte_v = be64_to_cpu(hptep->v); + if (!HPTE_V_COMPARE(hpte_v, want_v) || + !(hpte_v & HPTE_V_VALID)) + native_unlock_hpte(hptep); + else + hptep->v = 0; + } pte_iterate_hashed_end(); + } + + if (mmu_has_feature(MMU_FTR_TLBIEL) && + mmu_psize_defs[psize].tlbiel && local) { + asm volatile("ptesync":::"memory"); + for (i = 0; i < number; i++) { + vpn = batch->vpn[i]; + pte = batch->pte[i]; + + pte_iterate_hashed_subpages(pte, psize, + vpn, index, shift) { + __tlbiel(vpn, psize, psize, ssize); + } pte_iterate_hashed_end(); + } + asm volatile("ptesync":::"memory"); + } else { + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + + if (lock_tlbie) + raw_spin_lock(&native_tlbie_lock); + + asm volatile("ptesync":::"memory"); + for (i = 0; i < number; i++) { + vpn = batch->vpn[i]; + pte = batch->pte[i]; + + pte_iterate_hashed_subpages(pte, psize, + vpn, index, shift) { + __tlbie(vpn, psize, psize, ssize); + } pte_iterate_hashed_end(); + } + asm volatile("eieio; tlbsync; ptesync":::"memory"); + + if (lock_tlbie) + raw_spin_unlock(&native_tlbie_lock); + } + + local_irq_restore(flags); +} + +void __init hpte_init_native(void) +{ + ppc_md.hpte_invalidate = native_hpte_invalidate; + ppc_md.hpte_updatepp = native_hpte_updatepp; + ppc_md.hpte_updateboltedpp = native_hpte_updateboltedpp; + ppc_md.hpte_insert = native_hpte_insert; + ppc_md.hpte_remove = native_hpte_remove; + ppc_md.hpte_clear_all = native_hpte_clear; + ppc_md.flush_hash_range = native_flush_hash_range; + ppc_md.hugepage_invalidate = native_hugepage_invalidate; +} diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c new file mode 100644 index 000000000..fda236f90 --- /dev/null +++ b/arch/powerpc/mm/hash_utils_64.c @@ -0,0 +1,1556 @@ +/* + * PowerPC64 port by Mike Corrigan and Dave Engebretsen + * {mikejc|engebret}@us.ibm.com + * + * Copyright (c) 2000 Mike Corrigan <mikejc@us.ibm.com> + * + * SMP scalability work: + * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM + * + * Module name: htab.c + * + * Description: + * PowerPC Hashed Page Table functions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#undef DEBUG +#undef DEBUG_LOW + +#include <linux/spinlock.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/sysctl.h> +#include <linux/export.h> +#include <linux/ctype.h> +#include <linux/cache.h> +#include <linux/init.h> +#include <linux/signal.h> +#include <linux/memblock.h> +#include <linux/context_tracking.h> + +#include <asm/processor.h> +#include <asm/pgtable.h> +#include <asm/mmu.h> +#include <asm/mmu_context.h> +#include <asm/page.h> +#include <asm/types.h> +#include <asm/uaccess.h> +#include <asm/machdep.h> +#include <asm/prom.h> +#include <asm/tlbflush.h> +#include <asm/io.h> +#include <asm/eeh.h> +#include <asm/tlb.h> +#include <asm/cacheflush.h> +#include <asm/cputable.h> +#include <asm/sections.h> +#include <asm/copro.h> +#include <asm/udbg.h> +#include <asm/code-patching.h> +#include <asm/fadump.h> +#include <asm/firmware.h> +#include <asm/tm.h> + +#ifdef DEBUG +#define DBG(fmt...) udbg_printf(fmt) +#else +#define DBG(fmt...) +#endif + +#ifdef DEBUG_LOW +#define DBG_LOW(fmt...) udbg_printf(fmt) +#else +#define DBG_LOW(fmt...) +#endif + +#define KB (1024) +#define MB (1024*KB) +#define GB (1024L*MB) + +/* + * Note: pte --> Linux PTE + * HPTE --> PowerPC Hashed Page Table Entry + * + * Execution context: + * htab_initialize is called with the MMU off (of course), but + * the kernel has been copied down to zero so it can directly + * reference global data. At this point it is very difficult + * to print debug info. + * + */ + +#ifdef CONFIG_U3_DART +extern unsigned long dart_tablebase; +#endif /* CONFIG_U3_DART */ + +static unsigned long _SDR1; +struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; +EXPORT_SYMBOL_GPL(mmu_psize_defs); + +struct hash_pte *htab_address; +unsigned long htab_size_bytes; +unsigned long htab_hash_mask; +EXPORT_SYMBOL_GPL(htab_hash_mask); +int mmu_linear_psize = MMU_PAGE_4K; +EXPORT_SYMBOL_GPL(mmu_linear_psize); +int mmu_virtual_psize = MMU_PAGE_4K; +int mmu_vmalloc_psize = MMU_PAGE_4K; +#ifdef CONFIG_SPARSEMEM_VMEMMAP +int mmu_vmemmap_psize = MMU_PAGE_4K; +#endif +int mmu_io_psize = MMU_PAGE_4K; +int mmu_kernel_ssize = MMU_SEGSIZE_256M; +EXPORT_SYMBOL_GPL(mmu_kernel_ssize); +int mmu_highuser_ssize = MMU_SEGSIZE_256M; +u16 mmu_slb_size = 64; +EXPORT_SYMBOL_GPL(mmu_slb_size); +#ifdef CONFIG_PPC_64K_PAGES +int mmu_ci_restrictions; +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC +static u8 *linear_map_hash_slots; +static unsigned long linear_map_hash_count; +static DEFINE_SPINLOCK(linear_map_hash_lock); +#endif /* CONFIG_DEBUG_PAGEALLOC */ + +/* There are definitions of page sizes arrays to be used when none + * is provided by the firmware. + */ + +/* Pre-POWER4 CPUs (4k pages only) + */ +static struct mmu_psize_def mmu_psize_defaults_old[] = { + [MMU_PAGE_4K] = { + .shift = 12, + .sllp = 0, + .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, + .avpnm = 0, + .tlbiel = 0, + }, +}; + +/* POWER4, GPUL, POWER5 + * + * Support for 16Mb large pages + */ +static struct mmu_psize_def mmu_psize_defaults_gp[] = { + [MMU_PAGE_4K] = { + .shift = 12, + .sllp = 0, + .penc = {[MMU_PAGE_4K] = 0, [1 ... MMU_PAGE_COUNT - 1] = -1}, + .avpnm = 0, + .tlbiel = 1, + }, + [MMU_PAGE_16M] = { + .shift = 24, + .sllp = SLB_VSID_L, + .penc = {[0 ... MMU_PAGE_16M - 1] = -1, [MMU_PAGE_16M] = 0, + [MMU_PAGE_16M + 1 ... MMU_PAGE_COUNT - 1] = -1 }, + .avpnm = 0x1UL, + .tlbiel = 0, + }, +}; + +static unsigned long htab_convert_pte_flags(unsigned long pteflags) +{ + unsigned long rflags = pteflags & 0x1fa; + + /* _PAGE_EXEC -> NOEXEC */ + if ((pteflags & _PAGE_EXEC) == 0) + rflags |= HPTE_R_N; + + /* PP bits. PAGE_USER is already PP bit 0x2, so we only + * need to add in 0x1 if it's a read-only user page + */ + if ((pteflags & _PAGE_USER) && !((pteflags & _PAGE_RW) && + (pteflags & _PAGE_DIRTY))) + rflags |= 1; + /* + * Always add "C" bit for perf. Memory coherence is always enabled + */ + return rflags | HPTE_R_C | HPTE_R_M; +} + +int htab_bolt_mapping(unsigned long vstart, unsigned long vend, + unsigned long pstart, unsigned long prot, + int psize, int ssize) +{ + unsigned long vaddr, paddr; + unsigned int step, shift; + int ret = 0; + + shift = mmu_psize_defs[psize].shift; + step = 1 << shift; + + prot = htab_convert_pte_flags(prot); + + DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n", + vstart, vend, pstart, prot, psize, ssize); + + for (vaddr = vstart, paddr = pstart; vaddr < vend; + vaddr += step, paddr += step) { + unsigned long hash, hpteg; + unsigned long vsid = get_kernel_vsid(vaddr, ssize); + unsigned long vpn = hpt_vpn(vaddr, vsid, ssize); + unsigned long tprot = prot; + + /* + * If we hit a bad address return error. + */ + if (!vsid) + return -1; + /* Make kernel text executable */ + if (overlaps_kernel_text(vaddr, vaddr + step)) + tprot &= ~HPTE_R_N; + + /* Make kvm guest trampolines executable */ + if (overlaps_kvm_tmp(vaddr, vaddr + step)) + tprot &= ~HPTE_R_N; + + /* + * If relocatable, check if it overlaps interrupt vectors that + * are copied down to real 0. For relocatable kernel + * (e.g. kdump case) we copy interrupt vectors down to real + * address 0. Mark that region as executable. This is + * because on p8 system with relocation on exception feature + * enabled, exceptions are raised with MMU (IR=DR=1) ON. Hence + * in order to execute the interrupt handlers in virtual + * mode the vector region need to be marked as executable. + */ + if ((PHYSICAL_START > MEMORY_START) && + overlaps_interrupt_vector_text(vaddr, vaddr + step)) + tprot &= ~HPTE_R_N; + + hash = hpt_hash(vpn, shift, ssize); + hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); + + BUG_ON(!ppc_md.hpte_insert); + ret = ppc_md.hpte_insert(hpteg, vpn, paddr, tprot, + HPTE_V_BOLTED, psize, psize, ssize); + + if (ret < 0) + break; +#ifdef CONFIG_DEBUG_PAGEALLOC + if ((paddr >> PAGE_SHIFT) < linear_map_hash_count) + linear_map_hash_slots[paddr >> PAGE_SHIFT] = ret | 0x80; +#endif /* CONFIG_DEBUG_PAGEALLOC */ + } + return ret < 0 ? ret : 0; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +int htab_remove_mapping(unsigned long vstart, unsigned long vend, + int psize, int ssize) +{ + unsigned long vaddr; + unsigned int step, shift; + + shift = mmu_psize_defs[psize].shift; + step = 1 << shift; + + if (!ppc_md.hpte_removebolted) { + printk(KERN_WARNING "Platform doesn't implement " + "hpte_removebolted\n"); + return -EINVAL; + } + + for (vaddr = vstart; vaddr < vend; vaddr += step) + ppc_md.hpte_removebolted(vaddr, psize, ssize); + + return 0; +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + +static int __init htab_dt_scan_seg_sizes(unsigned long node, + const char *uname, int depth, + void *data) +{ + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *prop; + int size = 0; + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "ibm,processor-segment-sizes", &size); + if (prop == NULL) + return 0; + for (; size >= 4; size -= 4, ++prop) { + if (be32_to_cpu(prop[0]) == 40) { + DBG("1T segment support detected\n"); + cur_cpu_spec->mmu_features |= MMU_FTR_1T_SEGMENT; + return 1; + } + } + cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B; + return 0; +} + +static void __init htab_init_seg_sizes(void) +{ + of_scan_flat_dt(htab_dt_scan_seg_sizes, NULL); +} + +static int __init get_idx_from_shift(unsigned int shift) +{ + int idx = -1; + + switch (shift) { + case 0xc: + idx = MMU_PAGE_4K; + break; + case 0x10: + idx = MMU_PAGE_64K; + break; + case 0x14: + idx = MMU_PAGE_1M; + break; + case 0x18: + idx = MMU_PAGE_16M; + break; + case 0x22: + idx = MMU_PAGE_16G; + break; + } + return idx; +} + +static int __init htab_dt_scan_page_sizes(unsigned long node, + const char *uname, int depth, + void *data) +{ + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *prop; + int size = 0; + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "ibm,segment-page-sizes", &size); + if (!prop) + return 0; + + pr_info("Page sizes from device-tree:\n"); + size /= 4; + cur_cpu_spec->mmu_features &= ~(MMU_FTR_16M_PAGE); + while(size > 0) { + unsigned int base_shift = be32_to_cpu(prop[0]); + unsigned int slbenc = be32_to_cpu(prop[1]); + unsigned int lpnum = be32_to_cpu(prop[2]); + struct mmu_psize_def *def; + int idx, base_idx; + + size -= 3; prop += 3; + base_idx = get_idx_from_shift(base_shift); + if (base_idx < 0) { + /* skip the pte encoding also */ + prop += lpnum * 2; size -= lpnum * 2; + continue; + } + def = &mmu_psize_defs[base_idx]; + if (base_idx == MMU_PAGE_16M) + cur_cpu_spec->mmu_features |= MMU_FTR_16M_PAGE; + + def->shift = base_shift; + if (base_shift <= 23) + def->avpnm = 0; + else + def->avpnm = (1 << (base_shift - 23)) - 1; + def->sllp = slbenc; + /* + * We don't know for sure what's up with tlbiel, so + * for now we only set it for 4K and 64K pages + */ + if (base_idx == MMU_PAGE_4K || base_idx == MMU_PAGE_64K) + def->tlbiel = 1; + else + def->tlbiel = 0; + + while (size > 0 && lpnum) { + unsigned int shift = be32_to_cpu(prop[0]); + int penc = be32_to_cpu(prop[1]); + + prop += 2; size -= 2; + lpnum--; + + idx = get_idx_from_shift(shift); + if (idx < 0) + continue; + + if (penc == -1) + pr_err("Invalid penc for base_shift=%d " + "shift=%d\n", base_shift, shift); + + def->penc[idx] = penc; + pr_info("base_shift=%d: shift=%d, sllp=0x%04lx," + " avpnm=0x%08lx, tlbiel=%d, penc=%d\n", + base_shift, shift, def->sllp, + def->avpnm, def->tlbiel, def->penc[idx]); + } + } + + return 1; +} + +#ifdef CONFIG_HUGETLB_PAGE +/* Scan for 16G memory blocks that have been set aside for huge pages + * and reserve those blocks for 16G huge pages. + */ +static int __init htab_dt_scan_hugepage_blocks(unsigned long node, + const char *uname, int depth, + void *data) { + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be64 *addr_prop; + const __be32 *page_count_prop; + unsigned int expected_pages; + long unsigned int phys_addr; + long unsigned int block_size; + + /* We are scanning "memory" nodes only */ + if (type == NULL || strcmp(type, "memory") != 0) + return 0; + + /* This property is the log base 2 of the number of virtual pages that + * will represent this memory block. */ + page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL); + if (page_count_prop == NULL) + return 0; + expected_pages = (1 << be32_to_cpu(page_count_prop[0])); + addr_prop = of_get_flat_dt_prop(node, "reg", NULL); + if (addr_prop == NULL) + return 0; + phys_addr = be64_to_cpu(addr_prop[0]); + block_size = be64_to_cpu(addr_prop[1]); + if (block_size != (16 * GB)) + return 0; + printk(KERN_INFO "Huge page(16GB) memory: " + "addr = 0x%lX size = 0x%lX pages = %d\n", + phys_addr, block_size, expected_pages); + if (phys_addr + (16 * GB) <= memblock_end_of_DRAM()) { + memblock_reserve(phys_addr, block_size * expected_pages); + add_gpage(phys_addr, block_size, expected_pages); + } + return 0; +} +#endif /* CONFIG_HUGETLB_PAGE */ + +static void mmu_psize_set_default_penc(void) +{ + int bpsize, apsize; + for (bpsize = 0; bpsize < MMU_PAGE_COUNT; bpsize++) + for (apsize = 0; apsize < MMU_PAGE_COUNT; apsize++) + mmu_psize_defs[bpsize].penc[apsize] = -1; +} + +#ifdef CONFIG_PPC_64K_PAGES + +static bool might_have_hea(void) +{ + /* + * The HEA ethernet adapter requires awareness of the + * GX bus. Without that awareness we can easily assume + * we will never see an HEA ethernet device. + */ +#ifdef CONFIG_IBMEBUS + return !cpu_has_feature(CPU_FTR_ARCH_207S); +#else + return false; +#endif +} + +#endif /* #ifdef CONFIG_PPC_64K_PAGES */ + +static void __init htab_init_page_sizes(void) +{ + int rc; + + /* se the invalid penc to -1 */ + mmu_psize_set_default_penc(); + + /* Default to 4K pages only */ + memcpy(mmu_psize_defs, mmu_psize_defaults_old, + sizeof(mmu_psize_defaults_old)); + + /* + * Try to find the available page sizes in the device-tree + */ + rc = of_scan_flat_dt(htab_dt_scan_page_sizes, NULL); + if (rc != 0) /* Found */ + goto found; + + /* + * Not in the device-tree, let's fallback on known size + * list for 16M capable GP & GR + */ + if (mmu_has_feature(MMU_FTR_16M_PAGE)) + memcpy(mmu_psize_defs, mmu_psize_defaults_gp, + sizeof(mmu_psize_defaults_gp)); + found: +#ifndef CONFIG_DEBUG_PAGEALLOC + /* + * Pick a size for the linear mapping. Currently, we only support + * 16M, 1M and 4K which is the default + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift) + mmu_linear_psize = MMU_PAGE_16M; + else if (mmu_psize_defs[MMU_PAGE_1M].shift) + mmu_linear_psize = MMU_PAGE_1M; +#endif /* CONFIG_DEBUG_PAGEALLOC */ + +#ifdef CONFIG_PPC_64K_PAGES + /* + * Pick a size for the ordinary pages. Default is 4K, we support + * 64K for user mappings and vmalloc if supported by the processor. + * We only use 64k for ioremap if the processor + * (and firmware) support cache-inhibited large pages. + * If not, we use 4k and set mmu_ci_restrictions so that + * hash_page knows to switch processes that use cache-inhibited + * mappings to 4k pages. + */ + if (mmu_psize_defs[MMU_PAGE_64K].shift) { + mmu_virtual_psize = MMU_PAGE_64K; + mmu_vmalloc_psize = MMU_PAGE_64K; + if (mmu_linear_psize == MMU_PAGE_4K) + mmu_linear_psize = MMU_PAGE_64K; + if (mmu_has_feature(MMU_FTR_CI_LARGE_PAGE)) { + /* + * When running on pSeries using 64k pages for ioremap + * would stop us accessing the HEA ethernet. So if we + * have the chance of ever seeing one, stay at 4k. + */ + if (!might_have_hea() || !machine_is(pseries)) + mmu_io_psize = MMU_PAGE_64K; + } else + mmu_ci_restrictions = 1; + } +#endif /* CONFIG_PPC_64K_PAGES */ + +#ifdef CONFIG_SPARSEMEM_VMEMMAP + /* We try to use 16M pages for vmemmap if that is supported + * and we have at least 1G of RAM at boot + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift && + memblock_phys_mem_size() >= 0x40000000) + mmu_vmemmap_psize = MMU_PAGE_16M; + else if (mmu_psize_defs[MMU_PAGE_64K].shift) + mmu_vmemmap_psize = MMU_PAGE_64K; + else + mmu_vmemmap_psize = MMU_PAGE_4K; +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + + printk(KERN_DEBUG "Page orders: linear mapping = %d, " + "virtual = %d, io = %d" +#ifdef CONFIG_SPARSEMEM_VMEMMAP + ", vmemmap = %d" +#endif + "\n", + mmu_psize_defs[mmu_linear_psize].shift, + mmu_psize_defs[mmu_virtual_psize].shift, + mmu_psize_defs[mmu_io_psize].shift +#ifdef CONFIG_SPARSEMEM_VMEMMAP + ,mmu_psize_defs[mmu_vmemmap_psize].shift +#endif + ); + +#ifdef CONFIG_HUGETLB_PAGE + /* Reserve 16G huge page memory sections for huge pages */ + of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); +#endif /* CONFIG_HUGETLB_PAGE */ +} + +static int __init htab_dt_scan_pftsize(unsigned long node, + const char *uname, int depth, + void *data) +{ + const char *type = of_get_flat_dt_prop(node, "device_type", NULL); + const __be32 *prop; + + /* We are scanning "cpu" nodes only */ + if (type == NULL || strcmp(type, "cpu") != 0) + return 0; + + prop = of_get_flat_dt_prop(node, "ibm,pft-size", NULL); + if (prop != NULL) { + /* pft_size[0] is the NUMA CEC cookie */ + ppc64_pft_size = be32_to_cpu(prop[1]); + return 1; + } + return 0; +} + +static unsigned long __init htab_get_table_size(void) +{ + unsigned long mem_size, rnd_mem_size, pteg_count, psize; + + /* If hash size isn't already provided by the platform, we try to + * retrieve it from the device-tree. If it's not there neither, we + * calculate it now based on the total RAM size + */ + if (ppc64_pft_size == 0) + of_scan_flat_dt(htab_dt_scan_pftsize, NULL); + if (ppc64_pft_size) + return 1UL << ppc64_pft_size; + + /* round mem_size up to next power of 2 */ + mem_size = memblock_phys_mem_size(); + rnd_mem_size = 1UL << __ilog2(mem_size); + if (rnd_mem_size < mem_size) + rnd_mem_size <<= 1; + + /* # pages / 2 */ + psize = mmu_psize_defs[mmu_virtual_psize].shift; + pteg_count = max(rnd_mem_size >> (psize + 1), 1UL << 11); + + return pteg_count << 7; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +int create_section_mapping(unsigned long start, unsigned long end) +{ + return htab_bolt_mapping(start, end, __pa(start), + pgprot_val(PAGE_KERNEL), mmu_linear_psize, + mmu_kernel_ssize); +} + +int remove_section_mapping(unsigned long start, unsigned long end) +{ + return htab_remove_mapping(start, end, mmu_linear_psize, + mmu_kernel_ssize); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + +extern u32 htab_call_hpte_insert1[]; +extern u32 htab_call_hpte_insert2[]; +extern u32 htab_call_hpte_remove[]; +extern u32 htab_call_hpte_updatepp[]; +extern u32 ht64_call_hpte_insert1[]; +extern u32 ht64_call_hpte_insert2[]; +extern u32 ht64_call_hpte_remove[]; +extern u32 ht64_call_hpte_updatepp[]; + +static void __init htab_finish_init(void) +{ +#ifdef CONFIG_PPC_HAS_HASH_64K + patch_branch(ht64_call_hpte_insert1, + ppc_function_entry(ppc_md.hpte_insert), + BRANCH_SET_LINK); + patch_branch(ht64_call_hpte_insert2, + ppc_function_entry(ppc_md.hpte_insert), + BRANCH_SET_LINK); + patch_branch(ht64_call_hpte_remove, + ppc_function_entry(ppc_md.hpte_remove), + BRANCH_SET_LINK); + patch_branch(ht64_call_hpte_updatepp, + ppc_function_entry(ppc_md.hpte_updatepp), + BRANCH_SET_LINK); +#endif /* CONFIG_PPC_HAS_HASH_64K */ + + patch_branch(htab_call_hpte_insert1, + ppc_function_entry(ppc_md.hpte_insert), + BRANCH_SET_LINK); + patch_branch(htab_call_hpte_insert2, + ppc_function_entry(ppc_md.hpte_insert), + BRANCH_SET_LINK); + patch_branch(htab_call_hpte_remove, + ppc_function_entry(ppc_md.hpte_remove), + BRANCH_SET_LINK); + patch_branch(htab_call_hpte_updatepp, + ppc_function_entry(ppc_md.hpte_updatepp), + BRANCH_SET_LINK); +} + +static void __init htab_initialize(void) +{ + unsigned long table; + unsigned long pteg_count; + unsigned long prot; + unsigned long base = 0, size = 0, limit; + struct memblock_region *reg; + + DBG(" -> htab_initialize()\n"); + + /* Initialize segment sizes */ + htab_init_seg_sizes(); + + /* Initialize page sizes */ + htab_init_page_sizes(); + + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) { + mmu_kernel_ssize = MMU_SEGSIZE_1T; + mmu_highuser_ssize = MMU_SEGSIZE_1T; + printk(KERN_INFO "Using 1TB segments\n"); + } + + /* + * Calculate the required size of the htab. We want the number of + * PTEGs to equal one half the number of real pages. + */ + htab_size_bytes = htab_get_table_size(); + pteg_count = htab_size_bytes >> 7; + + htab_hash_mask = pteg_count - 1; + + if (firmware_has_feature(FW_FEATURE_LPAR)) { + /* Using a hypervisor which owns the htab */ + htab_address = NULL; + _SDR1 = 0; +#ifdef CONFIG_FA_DUMP + /* + * If firmware assisted dump is active firmware preserves + * the contents of htab along with entire partition memory. + * Clear the htab if firmware assisted dump is active so + * that we dont end up using old mappings. + */ + if (is_fadump_active() && ppc_md.hpte_clear_all) + ppc_md.hpte_clear_all(); +#endif + } else { + /* Find storage for the HPT. Must be contiguous in + * the absolute address space. On cell we want it to be + * in the first 2 Gig so we can use it for IOMMU hacks. + */ + if (machine_is(cell)) + limit = 0x80000000; + else + limit = MEMBLOCK_ALLOC_ANYWHERE; + + table = memblock_alloc_base(htab_size_bytes, htab_size_bytes, limit); + + DBG("Hash table allocated at %lx, size: %lx\n", table, + htab_size_bytes); + + htab_address = __va(table); + + /* htab absolute addr + encoded htabsize */ + _SDR1 = table + __ilog2(pteg_count) - 11; + + /* Initialize the HPT with no entries */ + memset((void *)table, 0, htab_size_bytes); + + /* Set SDR1 */ + mtspr(SPRN_SDR1, _SDR1); + } + + prot = pgprot_val(PAGE_KERNEL); + +#ifdef CONFIG_DEBUG_PAGEALLOC + linear_map_hash_count = memblock_end_of_DRAM() >> PAGE_SHIFT; + linear_map_hash_slots = __va(memblock_alloc_base(linear_map_hash_count, + 1, ppc64_rma_size)); + memset(linear_map_hash_slots, 0, linear_map_hash_count); +#endif /* CONFIG_DEBUG_PAGEALLOC */ + + /* On U3 based machines, we need to reserve the DART area and + * _NOT_ map it to avoid cache paradoxes as it's remapped non + * cacheable later on + */ + + /* create bolted the linear mapping in the hash table */ + for_each_memblock(memory, reg) { + base = (unsigned long)__va(reg->base); + size = reg->size; + + DBG("creating mapping for region: %lx..%lx (prot: %lx)\n", + base, size, prot); + +#ifdef CONFIG_U3_DART + /* Do not map the DART space. Fortunately, it will be aligned + * in such a way that it will not cross two memblock regions and + * will fit within a single 16Mb page. + * The DART space is assumed to be a full 16Mb region even if + * we only use 2Mb of that space. We will use more of it later + * for AGP GART. We have to use a full 16Mb large page. + */ + DBG("DART base: %lx\n", dart_tablebase); + + if (dart_tablebase != 0 && dart_tablebase >= base + && dart_tablebase < (base + size)) { + unsigned long dart_table_end = dart_tablebase + 16 * MB; + if (base != dart_tablebase) + BUG_ON(htab_bolt_mapping(base, dart_tablebase, + __pa(base), prot, + mmu_linear_psize, + mmu_kernel_ssize)); + if ((base + size) > dart_table_end) + BUG_ON(htab_bolt_mapping(dart_tablebase+16*MB, + base + size, + __pa(dart_table_end), + prot, + mmu_linear_psize, + mmu_kernel_ssize)); + continue; + } +#endif /* CONFIG_U3_DART */ + BUG_ON(htab_bolt_mapping(base, base + size, __pa(base), + prot, mmu_linear_psize, mmu_kernel_ssize)); + } + memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); + + /* + * If we have a memory_limit and we've allocated TCEs then we need to + * explicitly map the TCE area at the top of RAM. We also cope with the + * case that the TCEs start below memory_limit. + * tce_alloc_start/end are 16MB aligned so the mapping should work + * for either 4K or 16MB pages. + */ + if (tce_alloc_start) { + tce_alloc_start = (unsigned long)__va(tce_alloc_start); + tce_alloc_end = (unsigned long)__va(tce_alloc_end); + + if (base + size >= tce_alloc_start) + tce_alloc_start = base + size + 1; + + BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end, + __pa(tce_alloc_start), prot, + mmu_linear_psize, mmu_kernel_ssize)); + } + + htab_finish_init(); + + DBG(" <- htab_initialize()\n"); +} +#undef KB +#undef MB + +void __init early_init_mmu(void) +{ + /* Initialize the MMU Hash table and create the linear mapping + * of memory. Has to be done before SLB initialization as this is + * currently where the page size encoding is obtained. + */ + htab_initialize(); + + /* Initialize SLB management */ + slb_initialize(); +} + +#ifdef CONFIG_SMP +void early_init_mmu_secondary(void) +{ + /* Initialize hash table for that CPU */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) + mtspr(SPRN_SDR1, _SDR1); + + /* Initialize SLB */ + slb_initialize(); +} +#endif /* CONFIG_SMP */ + +/* + * Called by asm hashtable.S for doing lazy icache flush + */ +unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) +{ + struct page *page; + + if (!pfn_valid(pte_pfn(pte))) + return pp; + + page = pte_page(pte); + + /* page is dirty */ + if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { + if (trap == 0x400) { + flush_dcache_icache_page(page); + set_bit(PG_arch_1, &page->flags); + } else + pp |= HPTE_R_N; + } + return pp; +} + +#ifdef CONFIG_PPC_MM_SLICES +static unsigned int get_paca_psize(unsigned long addr) +{ + u64 lpsizes; + unsigned char *hpsizes; + unsigned long index, mask_index; + + if (addr < SLICE_LOW_TOP) { + lpsizes = get_paca()->context.low_slices_psize; + index = GET_LOW_SLICE_INDEX(addr); + return (lpsizes >> (index * 4)) & 0xF; + } + hpsizes = get_paca()->context.high_slices_psize; + index = GET_HIGH_SLICE_INDEX(addr); + mask_index = index & 0x1; + return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xF; +} + +#else +unsigned int get_paca_psize(unsigned long addr) +{ + return get_paca()->context.user_psize; +} +#endif + +/* + * Demote a segment to using 4k pages. + * For now this makes the whole process use 4k pages. + */ +#ifdef CONFIG_PPC_64K_PAGES +void demote_segment_4k(struct mm_struct *mm, unsigned long addr) +{ + if (get_slice_psize(mm, addr) == MMU_PAGE_4K) + return; + slice_set_range_psize(mm, addr, 1, MMU_PAGE_4K); + copro_flush_all_slbs(mm); + if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) { + get_paca()->context = mm->context; + slb_flush_and_rebolt(); + } +} +#endif /* CONFIG_PPC_64K_PAGES */ + +#ifdef CONFIG_PPC_SUBPAGE_PROT +/* + * This looks up a 2-bit protection code for a 4k subpage of a 64k page. + * Userspace sets the subpage permissions using the subpage_prot system call. + * + * Result is 0: full permissions, _PAGE_RW: read-only, + * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access. + */ +static int subpage_protection(struct mm_struct *mm, unsigned long ea) +{ + struct subpage_prot_table *spt = &mm->context.spt; + u32 spp = 0; + u32 **sbpm, *sbpp; + + if (ea >= spt->maxaddr) + return 0; + if (ea < 0x100000000UL) { + /* addresses below 4GB use spt->low_prot */ + sbpm = spt->low_prot; + } else { + sbpm = spt->protptrs[ea >> SBP_L3_SHIFT]; + if (!sbpm) + return 0; + } + sbpp = sbpm[(ea >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)]; + if (!sbpp) + return 0; + spp = sbpp[(ea >> PAGE_SHIFT) & (SBP_L1_COUNT - 1)]; + + /* extract 2-bit bitfield for this 4k subpage */ + spp >>= 30 - 2 * ((ea >> 12) & 0xf); + + /* turn 0,1,2,3 into combination of _PAGE_USER and _PAGE_RW */ + spp = ((spp & 2) ? _PAGE_USER : 0) | ((spp & 1) ? _PAGE_RW : 0); + return spp; +} + +#else /* CONFIG_PPC_SUBPAGE_PROT */ +static inline int subpage_protection(struct mm_struct *mm, unsigned long ea) +{ + return 0; +} +#endif + +void hash_failure_debug(unsigned long ea, unsigned long access, + unsigned long vsid, unsigned long trap, + int ssize, int psize, int lpsize, unsigned long pte) +{ + if (!printk_ratelimit()) + return; + pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n", + ea, access, current->comm); + pr_info(" trap=0x%lx vsid=0x%lx ssize=%d base psize=%d psize %d pte=0x%lx\n", + trap, vsid, ssize, psize, lpsize, pte); +} + +static void check_paca_psize(unsigned long ea, struct mm_struct *mm, + int psize, bool user_region) +{ + if (user_region) { + if (psize != get_paca_psize(ea)) { + get_paca()->context = mm->context; + slb_flush_and_rebolt(); + } + } else if (get_paca()->vmalloc_sllp != + mmu_psize_defs[mmu_vmalloc_psize].sllp) { + get_paca()->vmalloc_sllp = + mmu_psize_defs[mmu_vmalloc_psize].sllp; + slb_vmalloc_update(); + } +} + +/* Result code is: + * 0 - handled + * 1 - normal page fault + * -1 - critical hash insertion error + * -2 - access not permitted by subpage protection mechanism + */ +int hash_page_mm(struct mm_struct *mm, unsigned long ea, + unsigned long access, unsigned long trap, + unsigned long flags) +{ + enum ctx_state prev_state = exception_enter(); + pgd_t *pgdir; + unsigned long vsid; + pte_t *ptep; + unsigned hugeshift; + const struct cpumask *tmp; + int rc, user_region = 0; + int psize, ssize; + + DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n", + ea, access, trap); + + /* Get region & vsid */ + switch (REGION_ID(ea)) { + case USER_REGION_ID: + user_region = 1; + if (! mm) { + DBG_LOW(" user region with no mm !\n"); + rc = 1; + goto bail; + } + psize = get_slice_psize(mm, ea); + ssize = user_segment_size(ea); + vsid = get_vsid(mm->context.id, ea, ssize); + break; + case VMALLOC_REGION_ID: + vsid = get_kernel_vsid(ea, mmu_kernel_ssize); + if (ea < VMALLOC_END) + psize = mmu_vmalloc_psize; + else + psize = mmu_io_psize; + ssize = mmu_kernel_ssize; + break; + default: + /* Not a valid range + * Send the problem up to do_page_fault + */ + rc = 1; + goto bail; + } + DBG_LOW(" mm=%p, mm->pgdir=%p, vsid=%016lx\n", mm, mm->pgd, vsid); + + /* Bad address. */ + if (!vsid) { + DBG_LOW("Bad address!\n"); + rc = 1; + goto bail; + } + /* Get pgdir */ + pgdir = mm->pgd; + if (pgdir == NULL) { + rc = 1; + goto bail; + } + + /* Check CPU locality */ + tmp = cpumask_of(smp_processor_id()); + if (user_region && cpumask_equal(mm_cpumask(mm), tmp)) + flags |= HPTE_LOCAL_UPDATE; + +#ifndef CONFIG_PPC_64K_PAGES + /* If we use 4K pages and our psize is not 4K, then we might + * be hitting a special driver mapping, and need to align the + * address before we fetch the PTE. + * + * It could also be a hugepage mapping, in which case this is + * not necessary, but it's not harmful, either. + */ + if (psize != MMU_PAGE_4K) + ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Get PTE and page size from page tables */ + ptep = __find_linux_pte_or_hugepte(pgdir, ea, &hugeshift); + if (ptep == NULL || !pte_present(*ptep)) { + DBG_LOW(" no PTE !\n"); + rc = 1; + goto bail; + } + + /* Add _PAGE_PRESENT to the required access perm */ + access |= _PAGE_PRESENT; + + /* Pre-check access permissions (will be re-checked atomically + * in __hash_page_XX but this pre-check is a fast path + */ + if (access & ~pte_val(*ptep)) { + DBG_LOW(" no access !\n"); + rc = 1; + goto bail; + } + + if (hugeshift) { + if (pmd_trans_huge(*(pmd_t *)ptep)) + rc = __hash_page_thp(ea, access, vsid, (pmd_t *)ptep, + trap, flags, ssize, psize); +#ifdef CONFIG_HUGETLB_PAGE + else + rc = __hash_page_huge(ea, access, vsid, ptep, trap, + flags, ssize, hugeshift, psize); +#else + else { + /* + * if we have hugeshift, and is not transhuge with + * hugetlb disabled, something is really wrong. + */ + rc = 1; + WARN_ON(1); + } +#endif + if (current->mm == mm) + check_paca_psize(ea, mm, psize, user_region); + + goto bail; + } + +#ifndef CONFIG_PPC_64K_PAGES + DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); +#else + DBG_LOW(" i-pte: %016lx %016lx\n", pte_val(*ptep), + pte_val(*(ptep + PTRS_PER_PTE))); +#endif + /* Do actual hashing */ +#ifdef CONFIG_PPC_64K_PAGES + /* If _PAGE_4K_PFN is set, make sure this is a 4k segment */ + if ((pte_val(*ptep) & _PAGE_4K_PFN) && psize == MMU_PAGE_64K) { + demote_segment_4k(mm, ea); + psize = MMU_PAGE_4K; + } + + /* If this PTE is non-cacheable and we have restrictions on + * using non cacheable large pages, then we switch to 4k + */ + if (mmu_ci_restrictions && psize == MMU_PAGE_64K && + (pte_val(*ptep) & _PAGE_NO_CACHE)) { + if (user_region) { + demote_segment_4k(mm, ea); + psize = MMU_PAGE_4K; + } else if (ea < VMALLOC_END) { + /* + * some driver did a non-cacheable mapping + * in vmalloc space, so switch vmalloc + * to 4k pages + */ + printk(KERN_ALERT "Reducing vmalloc segment " + "to 4kB pages because of " + "non-cacheable mapping\n"); + psize = mmu_vmalloc_psize = MMU_PAGE_4K; + copro_flush_all_slbs(mm); + } + } + + if (current->mm == mm) + check_paca_psize(ea, mm, psize, user_region); +#endif /* CONFIG_PPC_64K_PAGES */ + +#ifdef CONFIG_PPC_HAS_HASH_64K + if (psize == MMU_PAGE_64K) + rc = __hash_page_64K(ea, access, vsid, ptep, trap, + flags, ssize); + else +#endif /* CONFIG_PPC_HAS_HASH_64K */ + { + int spp = subpage_protection(mm, ea); + if (access & spp) + rc = -2; + else + rc = __hash_page_4K(ea, access, vsid, ptep, trap, + flags, ssize, spp); + } + + /* Dump some info in case of hash insertion failure, they should + * never happen so it is really useful to know if/when they do + */ + if (rc == -1) + hash_failure_debug(ea, access, vsid, trap, ssize, psize, + psize, pte_val(*ptep)); +#ifndef CONFIG_PPC_64K_PAGES + DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep)); +#else + DBG_LOW(" o-pte: %016lx %016lx\n", pte_val(*ptep), + pte_val(*(ptep + PTRS_PER_PTE))); +#endif + DBG_LOW(" -> rc=%d\n", rc); + +bail: + exception_exit(prev_state); + return rc; +} +EXPORT_SYMBOL_GPL(hash_page_mm); + +int hash_page(unsigned long ea, unsigned long access, unsigned long trap, + unsigned long dsisr) +{ + unsigned long flags = 0; + struct mm_struct *mm = current->mm; + + if (REGION_ID(ea) == VMALLOC_REGION_ID) + mm = &init_mm; + + if (dsisr & DSISR_NOHPTE) + flags |= HPTE_NOHPTE_UPDATE; + + return hash_page_mm(mm, ea, access, trap, flags); +} +EXPORT_SYMBOL_GPL(hash_page); + +void hash_preload(struct mm_struct *mm, unsigned long ea, + unsigned long access, unsigned long trap) +{ + int hugepage_shift; + unsigned long vsid; + pgd_t *pgdir; + pte_t *ptep; + unsigned long flags; + int rc, ssize, update_flags = 0; + + BUG_ON(REGION_ID(ea) != USER_REGION_ID); + +#ifdef CONFIG_PPC_MM_SLICES + /* We only prefault standard pages for now */ + if (unlikely(get_slice_psize(mm, ea) != mm->context.user_psize)) + return; +#endif + + DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx," + " trap=%lx\n", mm, mm->pgd, ea, access, trap); + + /* Get Linux PTE if available */ + pgdir = mm->pgd; + if (pgdir == NULL) + return; + + /* Get VSID */ + ssize = user_segment_size(ea); + vsid = get_vsid(mm->context.id, ea, ssize); + if (!vsid) + return; + /* + * Hash doesn't like irqs. Walking linux page table with irq disabled + * saves us from holding multiple locks. + */ + local_irq_save(flags); + + /* + * THP pages use update_mmu_cache_pmd. We don't do + * hash preload there. Hence can ignore THP here + */ + ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugepage_shift); + if (!ptep) + goto out_exit; + + WARN_ON(hugepage_shift); +#ifdef CONFIG_PPC_64K_PAGES + /* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on + * a 64K kernel), then we don't preload, hash_page() will take + * care of it once we actually try to access the page. + * That way we don't have to duplicate all of the logic for segment + * page size demotion here + */ + if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE)) + goto out_exit; +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Is that local to this CPU ? */ + if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + update_flags |= HPTE_LOCAL_UPDATE; + + /* Hash it in */ +#ifdef CONFIG_PPC_HAS_HASH_64K + if (mm->context.user_psize == MMU_PAGE_64K) + rc = __hash_page_64K(ea, access, vsid, ptep, trap, + update_flags, ssize); + else +#endif /* CONFIG_PPC_HAS_HASH_64K */ + rc = __hash_page_4K(ea, access, vsid, ptep, trap, update_flags, + ssize, subpage_protection(mm, ea)); + + /* Dump some info in case of hash insertion failure, they should + * never happen so it is really useful to know if/when they do + */ + if (rc == -1) + hash_failure_debug(ea, access, vsid, trap, ssize, + mm->context.user_psize, + mm->context.user_psize, + pte_val(*ptep)); +out_exit: + local_irq_restore(flags); +} + +/* WARNING: This is called from hash_low_64.S, if you change this prototype, + * do not forget to update the assembly call site ! + */ +void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize, + unsigned long flags) +{ + unsigned long hash, index, shift, hidx, slot; + int local = flags & HPTE_LOCAL_UPDATE; + + DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn); + pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { + hash = hpt_hash(vpn, shift, ssize); + hidx = __rpte_to_hidx(pte, index); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx); + /* + * We use same base page size and actual psize, because we don't + * use these functions for hugepage + */ + ppc_md.hpte_invalidate(slot, vpn, psize, psize, ssize, local); + } pte_iterate_hashed_end(); + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + /* Transactions are not aborted by tlbiel, only tlbie. + * Without, syncing a page back to a block device w/ PIO could pick up + * transactional data (bad!) so we force an abort here. Before the + * sync the page will be made read-only, which will flush_hash_page. + * BIG ISSUE here: if the kernel uses a page from userspace without + * unmapping it first, it may see the speculated version. + */ + if (local && cpu_has_feature(CPU_FTR_TM) && + current->thread.regs && + MSR_TM_ACTIVE(current->thread.regs->msr)) { + tm_enable(); + tm_abort(TM_CAUSE_TLBI); + } +#endif +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void flush_hash_hugepage(unsigned long vsid, unsigned long addr, + pmd_t *pmdp, unsigned int psize, int ssize, + unsigned long flags) +{ + int i, max_hpte_count, valid; + unsigned long s_addr; + unsigned char *hpte_slot_array; + unsigned long hidx, shift, vpn, hash, slot; + int local = flags & HPTE_LOCAL_UPDATE; + + s_addr = addr & HPAGE_PMD_MASK; + hpte_slot_array = get_hpte_slot_array(pmdp); + /* + * IF we try to do a HUGE PTE update after a withdraw is done. + * we will find the below NULL. This happens when we do + * split_huge_page_pmd + */ + if (!hpte_slot_array) + return; + + if (ppc_md.hugepage_invalidate) { + ppc_md.hugepage_invalidate(vsid, s_addr, hpte_slot_array, + psize, ssize, local); + goto tm_abort; + } + /* + * No bluk hpte removal support, invalidate each entry + */ + shift = mmu_psize_defs[psize].shift; + max_hpte_count = HPAGE_PMD_SIZE >> shift; + for (i = 0; i < max_hpte_count; i++) { + /* + * 8 bits per each hpte entries + * 000| [ secondary group (one bit) | hidx (3 bits) | valid bit] + */ + valid = hpte_valid(hpte_slot_array, i); + if (!valid) + continue; + hidx = hpte_hash_index(hpte_slot_array, i); + + /* get the vpn */ + addr = s_addr + (i * (1ul << shift)); + vpn = hpt_vpn(addr, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + ppc_md.hpte_invalidate(slot, vpn, psize, + MMU_PAGE_16M, ssize, local); + } +tm_abort: +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + /* Transactions are not aborted by tlbiel, only tlbie. + * Without, syncing a page back to a block device w/ PIO could pick up + * transactional data (bad!) so we force an abort here. Before the + * sync the page will be made read-only, which will flush_hash_page. + * BIG ISSUE here: if the kernel uses a page from userspace without + * unmapping it first, it may see the speculated version. + */ + if (local && cpu_has_feature(CPU_FTR_TM) && + current->thread.regs && + MSR_TM_ACTIVE(current->thread.regs->msr)) { + tm_enable(); + tm_abort(TM_CAUSE_TLBI); + } +#endif + return; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +void flush_hash_range(unsigned long number, int local) +{ + if (ppc_md.flush_hash_range) + ppc_md.flush_hash_range(number, local); + else { + int i; + struct ppc64_tlb_batch *batch = + this_cpu_ptr(&ppc64_tlb_batch); + + for (i = 0; i < number; i++) + flush_hash_page(batch->vpn[i], batch->pte[i], + batch->psize, batch->ssize, local); + } +} + +/* + * low_hash_fault is called when we the low level hash code failed + * to instert a PTE due to an hypervisor error + */ +void low_hash_fault(struct pt_regs *regs, unsigned long address, int rc) +{ + enum ctx_state prev_state = exception_enter(); + + if (user_mode(regs)) { +#ifdef CONFIG_PPC_SUBPAGE_PROT + if (rc == -2) + _exception(SIGSEGV, regs, SEGV_ACCERR, address); + else +#endif + _exception(SIGBUS, regs, BUS_ADRERR, address); + } else + bad_page_fault(regs, address, SIGBUS); + + exception_exit(prev_state); +} + +long hpte_insert_repeating(unsigned long hash, unsigned long vpn, + unsigned long pa, unsigned long rflags, + unsigned long vflags, int psize, int ssize) +{ + unsigned long hpte_group; + long slot; + +repeat: + hpte_group = ((hash & htab_hash_mask) * + HPTES_PER_GROUP) & ~0x7UL; + + /* Insert into the hash table, primary slot */ + slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, vflags, + psize, psize, ssize); + + /* Primary is full, try the secondary */ + if (unlikely(slot == -1)) { + hpte_group = ((~hash & htab_hash_mask) * + HPTES_PER_GROUP) & ~0x7UL; + slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, + vflags | HPTE_V_SECONDARY, + psize, psize, ssize); + if (slot == -1) { + if (mftb() & 0x1) + hpte_group = ((hash & htab_hash_mask) * + HPTES_PER_GROUP)&~0x7UL; + + ppc_md.hpte_remove(hpte_group); + goto repeat; + } + } + + return slot; +} + +#ifdef CONFIG_DEBUG_PAGEALLOC +static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) +{ + unsigned long hash; + unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); + unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); + unsigned long mode = htab_convert_pte_flags(PAGE_KERNEL); + long ret; + + hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); + + /* Don't create HPTE entries for bad address */ + if (!vsid) + return; + + ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode, + HPTE_V_BOLTED, + mmu_linear_psize, mmu_kernel_ssize); + + BUG_ON (ret < 0); + spin_lock(&linear_map_hash_lock); + BUG_ON(linear_map_hash_slots[lmi] & 0x80); + linear_map_hash_slots[lmi] = ret | 0x80; + spin_unlock(&linear_map_hash_lock); +} + +static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) +{ + unsigned long hash, hidx, slot; + unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); + unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); + + hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); + spin_lock(&linear_map_hash_lock); + BUG_ON(!(linear_map_hash_slots[lmi] & 0x80)); + hidx = linear_map_hash_slots[lmi] & 0x7f; + linear_map_hash_slots[lmi] = 0; + spin_unlock(&linear_map_hash_lock); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + ppc_md.hpte_invalidate(slot, vpn, mmu_linear_psize, mmu_linear_psize, + mmu_kernel_ssize, 0); +} + +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + unsigned long flags, vaddr, lmi; + int i; + + local_irq_save(flags); + for (i = 0; i < numpages; i++, page++) { + vaddr = (unsigned long)page_address(page); + lmi = __pa(vaddr) >> PAGE_SHIFT; + if (lmi >= linear_map_hash_count) + continue; + if (enable) + kernel_map_linear_page(vaddr, lmi); + else + kernel_unmap_linear_page(vaddr, lmi); + } + local_irq_restore(flags); +} +#endif /* CONFIG_DEBUG_PAGEALLOC */ + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + + /* On LPAR systems, the first entry is our RMA region, + * non-LPAR 64-bit hash MMU systems don't have a limitation + * on real mode access, but using the first entry works well + * enough. We also clamp it to 1G to avoid some funky things + * such as RTAS bugs etc... + */ + ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000); + + /* Finally limit subsequent allocations */ + memblock_set_current_limit(ppc64_rma_size); +} diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c new file mode 100644 index 000000000..e7450bdbe --- /dev/null +++ b/arch/powerpc/mm/highmem.c @@ -0,0 +1,86 @@ +/* + * highmem.c: virtual kernel memory mappings for high memory + * + * PowerPC version, stolen from the i386 version. + * + * Used in CONFIG_HIGHMEM systems for memory pages which + * are not addressable by direct kernel virtual addresses. + * + * Copyright (C) 1999 Gerhard Wichert, Siemens AG + * Gerhard.Wichert@pdb.siemens.de + * + * + * Redesigned the x86 32-bit VM architecture to deal with + * up to 16 Terrabyte physical memory. With current x86 CPUs + * we now support up to 64 Gigabytes physical RAM. + * + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> + * + * Reworked for PowerPC by various contributors. Moved from + * highmem.h by Benjamin Herrenschmidt (c) 2009 IBM Corp. + */ + +#include <linux/highmem.h> +#include <linux/module.h> + +/* + * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap + * gives a more generic (and caching) interface. But kmap_atomic can + * be used in IRQ contexts, so in some (very limited) cases we need + * it. + */ +void *kmap_atomic_prot(struct page *page, pgprot_t prot) +{ + unsigned long vaddr; + int idx, type; + + /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + pagefault_disable(); + if (!PageHighMem(page)) + return page_address(page); + + type = kmap_atomic_idx_push(); + idx = type + KM_TYPE_NR*smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +#ifdef CONFIG_DEBUG_HIGHMEM + BUG_ON(!pte_none(*(kmap_pte-idx))); +#endif + __set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot), 1); + local_flush_tlb_page(NULL, vaddr); + + return (void*) vaddr; +} +EXPORT_SYMBOL(kmap_atomic_prot); + +void __kunmap_atomic(void *kvaddr) +{ + unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; + int type; + + if (vaddr < __fix_to_virt(FIX_KMAP_END)) { + pagefault_enable(); + return; + } + + type = kmap_atomic_idx(); + +#ifdef CONFIG_DEBUG_HIGHMEM + { + unsigned int idx; + + idx = type + KM_TYPE_NR * smp_processor_id(); + BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); + + /* + * force other mappings to Oops if they'll try to access + * this pte without first remap it + */ + pte_clear(&init_mm, vaddr, kmap_pte-idx); + local_flush_tlb_page(NULL, vaddr); + } +#endif + + kmap_atomic_idx_pop(); + pagefault_enable(); +} +EXPORT_SYMBOL(__kunmap_atomic); diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c new file mode 100644 index 000000000..43dafb9d6 --- /dev/null +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -0,0 +1,195 @@ +/* + * Copyright IBM Corporation, 2013 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +/* + * PPC64 THP Support for hash based MMUs + */ +#include <linux/mm.h> +#include <asm/machdep.h> + +int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, + pmd_t *pmdp, unsigned long trap, unsigned long flags, + int ssize, unsigned int psize) +{ + unsigned int index, valid; + unsigned char *hpte_slot_array; + unsigned long rflags, pa, hidx; + unsigned long old_pmd, new_pmd; + int ret, lpsize = MMU_PAGE_16M; + unsigned long vpn, hash, shift, slot; + + /* + * atomically mark the linux large page PMD busy and dirty + */ + do { + pmd_t pmd = READ_ONCE(*pmdp); + + old_pmd = pmd_val(pmd); + /* If PMD busy, retry the access */ + if (unlikely(old_pmd & _PAGE_BUSY)) + return 0; + /* If PMD is trans splitting retry the access */ + if (unlikely(old_pmd & _PAGE_SPLITTING)) + return 0; + /* If PMD permissions don't match, take page fault */ + if (unlikely(access & ~old_pmd)) + return 1; + /* + * Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access + */ + new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_RW) + new_pmd |= _PAGE_DIRTY; + } while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp, + old_pmd, new_pmd)); + /* + * PP bits. _PAGE_USER is already PP bit 0x2, so we only + * need to add in 0x1 if it's a read-only user page + */ + rflags = new_pmd & _PAGE_USER; + if ((new_pmd & _PAGE_USER) && !((new_pmd & _PAGE_RW) && + (new_pmd & _PAGE_DIRTY))) + rflags |= 0x1; + /* + * _PAGE_EXEC -> HW_NO_EXEC since it's inverted + */ + rflags |= ((new_pmd & _PAGE_EXEC) ? 0 : HPTE_R_N); + +#if 0 + if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { + + /* + * No CPU has hugepages but lacks no execute, so we + * don't need to worry about that case + */ + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + } +#endif + /* + * Find the slot index details for this ea, using base page size. + */ + shift = mmu_psize_defs[psize].shift; + index = (ea & ~HPAGE_PMD_MASK) >> shift; + BUG_ON(index >= 4096); + + vpn = hpt_vpn(ea, vsid, ssize); + hash = hpt_hash(vpn, shift, ssize); + hpte_slot_array = get_hpte_slot_array(pmdp); + if (psize == MMU_PAGE_4K) { + /* + * invalidate the old hpte entry if we have that mapped via 64K + * base page size. This is because demote_segment won't flush + * hash page table entries. + */ + if ((old_pmd & _PAGE_HASHPTE) && !(old_pmd & _PAGE_COMBO)) + flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K, + ssize, flags); + } + + valid = hpte_valid(hpte_slot_array, index); + if (valid) { + /* update the hpte bits */ + hidx = hpte_hash_index(hpte_slot_array, index); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + + ret = ppc_md.hpte_updatepp(slot, rflags, vpn, + psize, lpsize, ssize, flags); + /* + * We failed to update, try to insert a new entry. + */ + if (ret == -1) { + /* + * large pte is marked busy, so we can be sure + * nobody is looking at hpte_slot_array. hence we can + * safely update this here. + */ + valid = 0; + hpte_slot_array[index] = 0; + } + } + + if (!valid) { + unsigned long hpte_group; + + /* insert new entry */ + pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT; + new_pmd |= _PAGE_HASHPTE; + + /* Add in WIMG bits */ + rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE | + _PAGE_GUARDED)); + /* + * enable the memory coherence always + */ + rflags |= HPTE_R_M; +repeat: + hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; + + /* Insert into the hash table, primary slot */ + slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0, + psize, lpsize, ssize); + /* + * Primary is full, try the secondary + */ + if (unlikely(slot == -1)) { + hpte_group = ((~hash & htab_hash_mask) * + HPTES_PER_GROUP) & ~0x7UL; + slot = ppc_md.hpte_insert(hpte_group, vpn, pa, + rflags, HPTE_V_SECONDARY, + psize, lpsize, ssize); + if (slot == -1) { + if (mftb() & 0x1) + hpte_group = ((hash & htab_hash_mask) * + HPTES_PER_GROUP) & ~0x7UL; + + ppc_md.hpte_remove(hpte_group); + goto repeat; + } + } + /* + * Hypervisor failure. Restore old pmd and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *pmdp = __pmd(old_pmd); + hash_failure_debug(ea, access, vsid, trap, ssize, + psize, lpsize, old_pmd); + return -1; + } + /* + * large pte is marked busy, so we can be sure + * nobody is looking at hpte_slot_array. hence we can + * safely update this here. + */ + mark_hpte_slot_valid(hpte_slot_array, index, slot); + } + /* + * Mark the pte with _PAGE_COMBO, if we are trying to hash it with + * base page size 4k. + */ + if (psize == MMU_PAGE_4K) + new_pmd |= _PAGE_COMBO; + /* + * The hpte valid is stored in the pgtable whose address is in the + * second half of the PMD. Order this against clearing of the busy bit in + * huge pmd. + */ + smp_wmb(); + *pmdp = __pmd(new_pmd & ~_PAGE_BUSY); + return 0; +} diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c new file mode 100644 index 000000000..ba47aaf33 --- /dev/null +++ b/arch/powerpc/mm/hugetlbpage-book3e.c @@ -0,0 +1,153 @@ +/* + * PPC Huge TLB Page Support for Book3E MMU + * + * Copyright (C) 2009 David Gibson, IBM Corporation. + * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor + * + */ +#include <linux/mm.h> +#include <linux/hugetlb.h> + +#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC64 +static inline int tlb1_next(void) +{ + struct paca_struct *paca = get_paca(); + struct tlb_core_data *tcd; + int this, next; + + tcd = paca->tcd_ptr; + this = tcd->esel_next; + + next = this + 1; + if (next >= tcd->esel_max) + next = tcd->esel_first; + + tcd->esel_next = next; + return this; +} +#else +static inline int tlb1_next(void) +{ + int index, ncams; + + ncams = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + + index = this_cpu_read(next_tlbcam_idx); + + /* Just round-robin the entries and wrap when we hit the end */ + if (unlikely(index == ncams - 1)) + __this_cpu_write(next_tlbcam_idx, tlbcam_index); + else + __this_cpu_inc(next_tlbcam_idx); + + return index; +} +#endif /* !PPC64 */ +#endif /* FSL */ + +static inline int mmu_get_tsize(int psize) +{ + return mmu_psize_defs[psize].enc; +} + +static inline int book3e_tlb_exists(unsigned long ea, unsigned long pid) +{ + int found = 0; + + mtspr(SPRN_MAS6, pid << 16); + if (mmu_has_feature(MMU_FTR_USE_TLBRSRV)) { + asm volatile( + "li %0,0\n" + "tlbsx. 0,%1\n" + "bne 1f\n" + "li %0,1\n" + "1:\n" + : "=&r"(found) : "r"(ea)); + } else { + asm volatile( + "tlbsx 0,%1\n" + "mfspr %0,0x271\n" + "srwi %0,%0,31\n" + : "=&r"(found) : "r"(ea)); + } + + return found; +} + +void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, + pte_t pte) +{ + unsigned long mas1, mas2; + u64 mas7_3; + unsigned long psize, tsize, shift; + unsigned long flags; + struct mm_struct *mm; + +#ifdef CONFIG_PPC_FSL_BOOK3E + int index; +#endif + + if (unlikely(is_kernel_addr(ea))) + return; + + mm = vma->vm_mm; + +#ifdef CONFIG_PPC_MM_SLICES + psize = get_slice_psize(mm, ea); + tsize = mmu_get_tsize(psize); + shift = mmu_psize_defs[psize].shift; +#else + psize = vma_mmu_pagesize(vma); + shift = __ilog2(psize); + tsize = shift - 10; +#endif + + /* + * We can't be interrupted while we're setting up the MAS + * regusters or after we've confirmed that no tlb exists. + */ + local_irq_save(flags); + + if (unlikely(book3e_tlb_exists(ea, mm->context.id))) { + local_irq_restore(flags); + return; + } + +#ifdef CONFIG_PPC_FSL_BOOK3E + /* We have to use the CAM(TLB1) on FSL parts for hugepages */ + index = tlb1_next(); + mtspr(SPRN_MAS0, MAS0_ESEL(index) | MAS0_TLBSEL(1)); +#endif + + mas1 = MAS1_VALID | MAS1_TID(mm->context.id) | MAS1_TSIZE(tsize); + mas2 = ea & ~((1UL << shift) - 1); + mas2 |= (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; + mas7_3 = (u64)pte_pfn(pte) << PAGE_SHIFT; + mas7_3 |= (pte_val(pte) >> PTE_BAP_SHIFT) & MAS3_BAP_MASK; + if (!pte_dirty(pte)) + mas7_3 &= ~(MAS3_SW|MAS3_UW); + + mtspr(SPRN_MAS1, mas1); + mtspr(SPRN_MAS2, mas2); + + if (mmu_has_feature(MMU_FTR_USE_PAIRED_MAS)) { + mtspr(SPRN_MAS7_MAS3, mas7_3); + } else { + if (mmu_has_feature(MMU_FTR_BIG_PHYS)) + mtspr(SPRN_MAS7, upper_32_bits(mas7_3)); + mtspr(SPRN_MAS3, lower_32_bits(mas7_3)); + } + + asm volatile ("tlbwe"); + + local_irq_restore(flags); +} + +void flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + struct hstate *hstate = hstate_file(vma->vm_file); + unsigned long tsize = huge_page_shift(hstate) - 10; + + __flush_tlb_page(vma->vm_mm, vmaddr, tsize, 0); +} diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c new file mode 100644 index 000000000..d94b1af53 --- /dev/null +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -0,0 +1,129 @@ +/* + * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later) + * + * Copyright (C) 2003 David Gibson, IBM Corporation. + * + * Based on the IA-32 version: + * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> + */ + +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/cacheflush.h> +#include <asm/machdep.h> + +extern long hpte_insert_repeating(unsigned long hash, unsigned long vpn, + unsigned long pa, unsigned long rlags, + unsigned long vflags, int psize, int ssize); + +int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, + pte_t *ptep, unsigned long trap, unsigned long flags, + int ssize, unsigned int shift, unsigned int mmu_psize) +{ + unsigned long vpn; + unsigned long old_pte, new_pte; + unsigned long rflags, pa, sz; + long slot; + + BUG_ON(shift != mmu_psize_defs[mmu_psize].shift); + + /* Search the Linux page table for a match with va */ + vpn = hpt_vpn(ea, vsid, ssize); + + /* At this point, we have a pte (old_pte) which can be used to build + * or update an HPTE. There are 2 cases: + * + * 1. There is a valid (present) pte with no associated HPTE (this is + * the most common case) + * 2. There is a valid (present) pte with an associated HPTE. The + * current values of the pp bits in the HPTE prevent access + * because we are doing software DIRTY bit management and the + * page is currently not DIRTY. + */ + + + do { + old_pte = pte_val(*ptep); + /* If PTE busy, retry the access */ + if (unlikely(old_pte & _PAGE_BUSY)) + return 0; + /* If PTE permissions don't match, take page fault */ + if (unlikely(access & ~old_pte)) + return 1; + /* Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access */ + new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_RW) + new_pte |= _PAGE_DIRTY; + } while(old_pte != __cmpxchg_u64((unsigned long *)ptep, + old_pte, new_pte)); + + rflags = 0x2 | (!(new_pte & _PAGE_RW)); + /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */ + rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N); + sz = ((1UL) << shift); + if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) + /* No CPU has hugepages but lacks no execute, so we + * don't need to worry about that case */ + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + + /* Check if pte already has an hpte (case 2) */ + if (unlikely(old_pte & _PAGE_HASHPTE)) { + /* There MIGHT be an HPTE for this pte */ + unsigned long hash, slot; + + hash = hpt_hash(vpn, shift, ssize); + if (old_pte & _PAGE_F_SECOND) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += (old_pte & _PAGE_F_GIX) >> 12; + + if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize, + mmu_psize, ssize, flags) == -1) + old_pte &= ~_PAGE_HPTEFLAGS; + } + + if (likely(!(old_pte & _PAGE_HASHPTE))) { + unsigned long hash = hpt_hash(vpn, shift, ssize); + + pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; + + /* clear HPTE slot informations in new PTE */ +#ifdef CONFIG_PPC_64K_PAGES + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0; +#else + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; +#endif + /* Add in WIMG bits */ + rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE | + _PAGE_COHERENT | _PAGE_GUARDED)); + /* + * enable the memory coherence always + */ + rflags |= HPTE_R_M; + + slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0, + mmu_psize, ssize); + + /* + * Hypervisor failure. Restore old pte and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *ptep = __pte(old_pte); + hash_failure_debug(ea, access, vsid, trap, ssize, + mmu_psize, mmu_psize, old_pte); + return -1; + } + + new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX); + } + + /* + * No need to use ldarx/stdcx here + */ + *ptep = __pte(new_pte & ~_PAGE_BUSY); + return 0; +} diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c new file mode 100644 index 000000000..3385e3d05 --- /dev/null +++ b/arch/powerpc/mm/hugetlbpage.c @@ -0,0 +1,1113 @@ +/* + * PPC Huge TLB Page Support for Kernel. + * + * Copyright (C) 2003 David Gibson, IBM Corporation. + * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor + * + * Based on the IA-32 version: + * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> + */ + +#include <linux/mm.h> +#include <linux/io.h> +#include <linux/slab.h> +#include <linux/hugetlb.h> +#include <linux/export.h> +#include <linux/of_fdt.h> +#include <linux/memblock.h> +#include <linux/bootmem.h> +#include <linux/moduleparam.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/tlb.h> +#include <asm/setup.h> +#include <asm/hugetlb.h> + +#ifdef CONFIG_HUGETLB_PAGE + +#define PAGE_SHIFT_64K 16 +#define PAGE_SHIFT_16M 24 +#define PAGE_SHIFT_16G 34 + +unsigned int HPAGE_SHIFT; + +/* + * Tracks gpages after the device tree is scanned and before the + * huge_boot_pages list is ready. On non-Freescale implementations, this is + * just used to track 16G pages and so is a single array. FSL-based + * implementations may have more than one gpage size, so we need multiple + * arrays + */ +#ifdef CONFIG_PPC_FSL_BOOK3E +#define MAX_NUMBER_GPAGES 128 +struct psize_gpages { + u64 gpage_list[MAX_NUMBER_GPAGES]; + unsigned int nr_gpages; +}; +static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT]; +#else +#define MAX_NUMBER_GPAGES 1024 +static u64 gpage_freearray[MAX_NUMBER_GPAGES]; +static unsigned nr_gpages; +#endif + +#define hugepd_none(hpd) ((hpd).pd == 0) + +#ifdef CONFIG_PPC_BOOK3S_64 +/* + * At this point we do the placement change only for BOOK3S 64. This would + * possibly work on other subarchs. + */ + +/* + * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have + * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD; + * + * Defined in such a way that we can optimize away code block at build time + * if CONFIG_HUGETLB_PAGE=n. + */ +int pmd_huge(pmd_t pmd) +{ + /* + * leaf pte for huge page, bottom two bits != 00 + */ + return ((pmd_val(pmd) & 0x3) != 0x0); +} + +int pud_huge(pud_t pud) +{ + /* + * leaf pte for huge page, bottom two bits != 00 + */ + return ((pud_val(pud) & 0x3) != 0x0); +} + +int pgd_huge(pgd_t pgd) +{ + /* + * leaf pte for huge page, bottom two bits != 00 + */ + return ((pgd_val(pgd) & 0x3) != 0x0); +} +#else +int pmd_huge(pmd_t pmd) +{ + return 0; +} + +int pud_huge(pud_t pud) +{ + return 0; +} + +int pgd_huge(pgd_t pgd) +{ + return 0; +} +#endif + +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +{ + /* Only called for hugetlbfs pages, hence can ignore THP */ + return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL); +} + +static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, + unsigned long address, unsigned pdshift, unsigned pshift) +{ + struct kmem_cache *cachep; + pte_t *new; + +#ifdef CONFIG_PPC_FSL_BOOK3E + int i; + int num_hugepd = 1 << (pshift - pdshift); + cachep = hugepte_cache; +#else + cachep = PGT_CACHE(pdshift - pshift); +#endif + + new = kmem_cache_zalloc(cachep, GFP_KERNEL|__GFP_REPEAT); + + BUG_ON(pshift > HUGEPD_SHIFT_MASK); + BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); + + if (! new) + return -ENOMEM; + + spin_lock(&mm->page_table_lock); +#ifdef CONFIG_PPC_FSL_BOOK3E + /* + * We have multiple higher-level entries that point to the same + * actual pte location. Fill in each as we go and backtrack on error. + * We need all of these so the DTLB pgtable walk code can find the + * right higher-level entry without knowing if it's a hugepage or not. + */ + for (i = 0; i < num_hugepd; i++, hpdp++) { + if (unlikely(!hugepd_none(*hpdp))) + break; + else + /* We use the old format for PPC_FSL_BOOK3E */ + hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift; + } + /* If we bailed from the for loop early, an error occurred, clean up */ + if (i < num_hugepd) { + for (i = i - 1 ; i >= 0; i--, hpdp--) + hpdp->pd = 0; + kmem_cache_free(cachep, new); + } +#else + if (!hugepd_none(*hpdp)) + kmem_cache_free(cachep, new); + else { +#ifdef CONFIG_PPC_BOOK3S_64 + hpdp->pd = (unsigned long)new | + (shift_to_mmu_psize(pshift) << 2); +#else + hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift; +#endif + } +#endif + spin_unlock(&mm->page_table_lock); + return 0; +} + +/* + * These macros define how to determine which level of the page table holds + * the hpdp. + */ +#ifdef CONFIG_PPC_FSL_BOOK3E +#define HUGEPD_PGD_SHIFT PGDIR_SHIFT +#define HUGEPD_PUD_SHIFT PUD_SHIFT +#else +#define HUGEPD_PGD_SHIFT PUD_SHIFT +#define HUGEPD_PUD_SHIFT PMD_SHIFT +#endif + +#ifdef CONFIG_PPC_BOOK3S_64 +/* + * At this point we do the placement change only for BOOK3S 64. This would + * possibly work on other subarchs. + */ +pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) +{ + pgd_t *pg; + pud_t *pu; + pmd_t *pm; + hugepd_t *hpdp = NULL; + unsigned pshift = __ffs(sz); + unsigned pdshift = PGDIR_SHIFT; + + addr &= ~(sz-1); + pg = pgd_offset(mm, addr); + + if (pshift == PGDIR_SHIFT) + /* 16GB huge page */ + return (pte_t *) pg; + else if (pshift > PUD_SHIFT) + /* + * We need to use hugepd table + */ + hpdp = (hugepd_t *)pg; + else { + pdshift = PUD_SHIFT; + pu = pud_alloc(mm, pg, addr); + if (pshift == PUD_SHIFT) + return (pte_t *)pu; + else if (pshift > PMD_SHIFT) + hpdp = (hugepd_t *)pu; + else { + pdshift = PMD_SHIFT; + pm = pmd_alloc(mm, pu, addr); + if (pshift == PMD_SHIFT) + /* 16MB hugepage */ + return (pte_t *)pm; + else + hpdp = (hugepd_t *)pm; + } + } + if (!hpdp) + return NULL; + + BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp)); + + if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift)) + return NULL; + + return hugepte_offset(*hpdp, addr, pdshift); +} + +#else + +pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) +{ + pgd_t *pg; + pud_t *pu; + pmd_t *pm; + hugepd_t *hpdp = NULL; + unsigned pshift = __ffs(sz); + unsigned pdshift = PGDIR_SHIFT; + + addr &= ~(sz-1); + + pg = pgd_offset(mm, addr); + + if (pshift >= HUGEPD_PGD_SHIFT) { + hpdp = (hugepd_t *)pg; + } else { + pdshift = PUD_SHIFT; + pu = pud_alloc(mm, pg, addr); + if (pshift >= HUGEPD_PUD_SHIFT) { + hpdp = (hugepd_t *)pu; + } else { + pdshift = PMD_SHIFT; + pm = pmd_alloc(mm, pu, addr); + hpdp = (hugepd_t *)pm; + } + } + + if (!hpdp) + return NULL; + + BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp)); + + if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift)) + return NULL; + + return hugepte_offset(*hpdp, addr, pdshift); +} +#endif + +#ifdef CONFIG_PPC_FSL_BOOK3E +/* Build list of addresses of gigantic pages. This function is used in early + * boot before the buddy allocator is setup. + */ +void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) +{ + unsigned int idx = shift_to_mmu_psize(__ffs(page_size)); + int i; + + if (addr == 0) + return; + + gpage_freearray[idx].nr_gpages = number_of_pages; + + for (i = 0; i < number_of_pages; i++) { + gpage_freearray[idx].gpage_list[i] = addr; + addr += page_size; + } +} + +/* + * Moves the gigantic page addresses from the temporary list to the + * huge_boot_pages list. + */ +int alloc_bootmem_huge_page(struct hstate *hstate) +{ + struct huge_bootmem_page *m; + int idx = shift_to_mmu_psize(huge_page_shift(hstate)); + int nr_gpages = gpage_freearray[idx].nr_gpages; + + if (nr_gpages == 0) + return 0; + +#ifdef CONFIG_HIGHMEM + /* + * If gpages can be in highmem we can't use the trick of storing the + * data structure in the page; allocate space for this + */ + m = memblock_virt_alloc(sizeof(struct huge_bootmem_page), 0); + m->phys = gpage_freearray[idx].gpage_list[--nr_gpages]; +#else + m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]); +#endif + + list_add(&m->list, &huge_boot_pages); + gpage_freearray[idx].nr_gpages = nr_gpages; + gpage_freearray[idx].gpage_list[nr_gpages] = 0; + m->hstate = hstate; + + return 1; +} +/* + * Scan the command line hugepagesz= options for gigantic pages; store those in + * a list that we use to allocate the memory once all options are parsed. + */ + +unsigned long gpage_npages[MMU_PAGE_COUNT]; + +static int __init do_gpage_early_setup(char *param, char *val, + const char *unused) +{ + static phys_addr_t size; + unsigned long npages; + + /* + * The hugepagesz and hugepages cmdline options are interleaved. We + * use the size variable to keep track of whether or not this was done + * properly and skip over instances where it is incorrect. Other + * command-line parsing code will issue warnings, so we don't need to. + * + */ + if ((strcmp(param, "default_hugepagesz") == 0) || + (strcmp(param, "hugepagesz") == 0)) { + size = memparse(val, NULL); + } else if (strcmp(param, "hugepages") == 0) { + if (size != 0) { + if (sscanf(val, "%lu", &npages) <= 0) + npages = 0; + if (npages > MAX_NUMBER_GPAGES) { + pr_warn("MMU: %lu pages requested for page " + "size %llu KB, limiting to " + __stringify(MAX_NUMBER_GPAGES) "\n", + npages, size / 1024); + npages = MAX_NUMBER_GPAGES; + } + gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages; + size = 0; + } + } + return 0; +} + + +/* + * This function allocates physical space for pages that are larger than the + * buddy allocator can handle. We want to allocate these in highmem because + * the amount of lowmem is limited. This means that this function MUST be + * called before lowmem_end_addr is set up in MMU_init() in order for the lmb + * allocate to grab highmem. + */ +void __init reserve_hugetlb_gpages(void) +{ + static __initdata char cmdline[COMMAND_LINE_SIZE]; + phys_addr_t size, base; + int i; + + strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE); + parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0, + &do_gpage_early_setup); + + /* + * Walk gpage list in reverse, allocating larger page sizes first. + * Skip over unsupported sizes, or sizes that have 0 gpages allocated. + * When we reach the point in the list where pages are no longer + * considered gpages, we're done. + */ + for (i = MMU_PAGE_COUNT-1; i >= 0; i--) { + if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0) + continue; + else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT)) + break; + + size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i)); + base = memblock_alloc_base(size * gpage_npages[i], size, + MEMBLOCK_ALLOC_ANYWHERE); + add_gpage(base, size, gpage_npages[i]); + } +} + +#else /* !PPC_FSL_BOOK3E */ + +/* Build list of addresses of gigantic pages. This function is used in early + * boot before the buddy allocator is setup. + */ +void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) +{ + if (!addr) + return; + while (number_of_pages > 0) { + gpage_freearray[nr_gpages] = addr; + nr_gpages++; + number_of_pages--; + addr += page_size; + } +} + +/* Moves the gigantic page addresses from the temporary list to the + * huge_boot_pages list. + */ +int alloc_bootmem_huge_page(struct hstate *hstate) +{ + struct huge_bootmem_page *m; + if (nr_gpages == 0) + return 0; + m = phys_to_virt(gpage_freearray[--nr_gpages]); + gpage_freearray[nr_gpages] = 0; + list_add(&m->list, &huge_boot_pages); + m->hstate = hstate; + return 1; +} +#endif + +int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) +{ + return 0; +} + +#ifdef CONFIG_PPC_FSL_BOOK3E +#define HUGEPD_FREELIST_SIZE \ + ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) + +struct hugepd_freelist { + struct rcu_head rcu; + unsigned int index; + void *ptes[0]; +}; + +static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur); + +static void hugepd_free_rcu_callback(struct rcu_head *head) +{ + struct hugepd_freelist *batch = + container_of(head, struct hugepd_freelist, rcu); + unsigned int i; + + for (i = 0; i < batch->index; i++) + kmem_cache_free(hugepte_cache, batch->ptes[i]); + + free_page((unsigned long)batch); +} + +static void hugepd_free(struct mmu_gather *tlb, void *hugepte) +{ + struct hugepd_freelist **batchp; + + batchp = this_cpu_ptr(&hugepd_freelist_cur); + + if (atomic_read(&tlb->mm->mm_users) < 2 || + cpumask_equal(mm_cpumask(tlb->mm), + cpumask_of(smp_processor_id()))) { + kmem_cache_free(hugepte_cache, hugepte); + put_cpu_var(hugepd_freelist_cur); + return; + } + + if (*batchp == NULL) { + *batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC); + (*batchp)->index = 0; + } + + (*batchp)->ptes[(*batchp)->index++] = hugepte; + if ((*batchp)->index == HUGEPD_FREELIST_SIZE) { + call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback); + *batchp = NULL; + } + put_cpu_var(hugepd_freelist_cur); +} +#endif + +static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift, + unsigned long start, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pte_t *hugepte = hugepd_page(*hpdp); + int i; + + unsigned long pdmask = ~((1UL << pdshift) - 1); + unsigned int num_hugepd = 1; + +#ifdef CONFIG_PPC_FSL_BOOK3E + /* Note: On fsl the hpdp may be the first of several */ + num_hugepd = (1 << (hugepd_shift(*hpdp) - pdshift)); +#else + unsigned int shift = hugepd_shift(*hpdp); +#endif + + start &= pdmask; + if (start < floor) + return; + if (ceiling) { + ceiling &= pdmask; + if (! ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; + + for (i = 0; i < num_hugepd; i++, hpdp++) + hpdp->pd = 0; + +#ifdef CONFIG_PPC_FSL_BOOK3E + hugepd_free(tlb, hugepte); +#else + pgtable_free_tlb(tlb, hugepte, pdshift - shift); +#endif +} + +static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pmd_t *pmd; + unsigned long next; + unsigned long start; + + start = addr; + do { + pmd = pmd_offset(pud, addr); + next = pmd_addr_end(addr, end); + if (!is_hugepd(__hugepd(pmd_val(*pmd)))) { + /* + * if it is not hugepd pointer, we should already find + * it cleared. + */ + WARN_ON(!pmd_none_or_clear_bad(pmd)); + continue; + } +#ifdef CONFIG_PPC_FSL_BOOK3E + /* + * Increment next by the size of the huge mapping since + * there may be more than one entry at this level for a + * single hugepage, but all of them point to + * the same kmem cache that holds the hugepte. + */ + next = addr + (1 << hugepd_shift(*(hugepd_t *)pmd)); +#endif + free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT, + addr, next, floor, ceiling); + } while (addr = next, addr != end); + + start &= PUD_MASK; + if (start < floor) + return; + if (ceiling) { + ceiling &= PUD_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; + + pmd = pmd_offset(pud, start); + pud_clear(pud); + pmd_free_tlb(tlb, pmd, start); + mm_dec_nr_pmds(tlb->mm); +} + +static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pud_t *pud; + unsigned long next; + unsigned long start; + + start = addr; + do { + pud = pud_offset(pgd, addr); + next = pud_addr_end(addr, end); + if (!is_hugepd(__hugepd(pud_val(*pud)))) { + if (pud_none_or_clear_bad(pud)) + continue; + hugetlb_free_pmd_range(tlb, pud, addr, next, floor, + ceiling); + } else { +#ifdef CONFIG_PPC_FSL_BOOK3E + /* + * Increment next by the size of the huge mapping since + * there may be more than one entry at this level for a + * single hugepage, but all of them point to + * the same kmem cache that holds the hugepte. + */ + next = addr + (1 << hugepd_shift(*(hugepd_t *)pud)); +#endif + free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT, + addr, next, floor, ceiling); + } + } while (addr = next, addr != end); + + start &= PGDIR_MASK; + if (start < floor) + return; + if (ceiling) { + ceiling &= PGDIR_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; + + pud = pud_offset(pgd, start); + pgd_clear(pgd); + pud_free_tlb(tlb, pud, start); +} + +/* + * This function frees user-level page tables of a process. + */ +void hugetlb_free_pgd_range(struct mmu_gather *tlb, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pgd_t *pgd; + unsigned long next; + + /* + * Because there are a number of different possible pagetable + * layouts for hugepage ranges, we limit knowledge of how + * things should be laid out to the allocation path + * (huge_pte_alloc(), above). Everything else works out the + * structure as it goes from information in the hugepd + * pointers. That means that we can't here use the + * optimization used in the normal page free_pgd_range(), of + * checking whether we're actually covering a large enough + * range to have to do anything at the top level of the walk + * instead of at the bottom. + * + * To make sense of this, you should probably go read the big + * block comment at the top of the normal free_pgd_range(), + * too. + */ + + do { + next = pgd_addr_end(addr, end); + pgd = pgd_offset(tlb->mm, addr); + if (!is_hugepd(__hugepd(pgd_val(*pgd)))) { + if (pgd_none_or_clear_bad(pgd)) + continue; + hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); + } else { +#ifdef CONFIG_PPC_FSL_BOOK3E + /* + * Increment next by the size of the huge mapping since + * there may be more than one entry at the pgd level + * for a single hugepage, but all of them point to the + * same kmem cache that holds the hugepte. + */ + next = addr + (1 << hugepd_shift(*(hugepd_t *)pgd)); +#endif + free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT, + addr, next, floor, ceiling); + } + } while (addr = next, addr != end); +} + +/* + * We are holding mmap_sem, so a parallel huge page collapse cannot run. + * To prevent hugepage split, disable irq. + */ +struct page * +follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) +{ + pte_t *ptep, pte; + unsigned shift; + unsigned long mask, flags; + struct page *page = ERR_PTR(-EINVAL); + + local_irq_save(flags); + ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift); + if (!ptep) + goto no_page; + pte = READ_ONCE(*ptep); + /* + * Verify it is a huge page else bail. + * Transparent hugepages are handled by generic code. We can skip them + * here. + */ + if (!shift || pmd_trans_huge(__pmd(pte_val(pte)))) + goto no_page; + + if (!pte_present(pte)) { + page = NULL; + goto no_page; + } + mask = (1UL << shift) - 1; + page = pte_page(pte); + if (page) + page += (address & mask) / PAGE_SIZE; + +no_page: + local_irq_restore(flags); + return page; +} + +struct page * +follow_huge_pmd(struct mm_struct *mm, unsigned long address, + pmd_t *pmd, int write) +{ + BUG(); + return NULL; +} + +struct page * +follow_huge_pud(struct mm_struct *mm, unsigned long address, + pud_t *pud, int write) +{ + BUG(); + return NULL; +} + +static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, + unsigned long sz) +{ + unsigned long __boundary = (addr + sz) & ~(sz-1); + return (__boundary - 1 < end - 1) ? __boundary : end; +} + +int gup_huge_pd(hugepd_t hugepd, unsigned long addr, unsigned pdshift, + unsigned long end, int write, struct page **pages, int *nr) +{ + pte_t *ptep; + unsigned long sz = 1UL << hugepd_shift(hugepd); + unsigned long next; + + ptep = hugepte_offset(hugepd, addr, pdshift); + do { + next = hugepte_addr_end(addr, end, sz); + if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr)) + return 0; + } while (ptep++, addr = next, addr != end); + + return 1; +} + +#ifdef CONFIG_PPC_MM_SLICES +unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct hstate *hstate = hstate_file(file); + int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); + + return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1); +} +#endif + +unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) +{ +#ifdef CONFIG_PPC_MM_SLICES + unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); + + return 1UL << mmu_psize_to_shift(psize); +#else + if (!is_vm_hugetlb_page(vma)) + return PAGE_SIZE; + + return huge_page_size(hstate_vma(vma)); +#endif +} + +static inline bool is_power_of_4(unsigned long x) +{ + if (is_power_of_2(x)) + return (__ilog2(x) % 2) ? false : true; + return false; +} + +static int __init add_huge_page_size(unsigned long long size) +{ + int shift = __ffs(size); + int mmu_psize; + + /* Check that it is a page size supported by the hardware and + * that it fits within pagetable and slice limits. */ +#ifdef CONFIG_PPC_FSL_BOOK3E + if ((size < PAGE_SIZE) || !is_power_of_4(size)) + return -EINVAL; +#else + if (!is_power_of_2(size) + || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT)) + return -EINVAL; +#endif + + if ((mmu_psize = shift_to_mmu_psize(shift)) < 0) + return -EINVAL; + +#ifdef CONFIG_SPU_FS_64K_LS + /* Disable support for 64K huge pages when 64K SPU local store + * support is enabled as the current implementation conflicts. + */ + if (shift == PAGE_SHIFT_64K) + return -EINVAL; +#endif /* CONFIG_SPU_FS_64K_LS */ + + BUG_ON(mmu_psize_defs[mmu_psize].shift != shift); + + /* Return if huge page size has already been setup */ + if (size_to_hstate(size)) + return 0; + + hugetlb_add_hstate(shift - PAGE_SHIFT); + + return 0; +} + +static int __init hugepage_setup_sz(char *str) +{ + unsigned long long size; + + size = memparse(str, &str); + + if (add_huge_page_size(size) != 0) + printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size); + + return 1; +} +__setup("hugepagesz=", hugepage_setup_sz); + +#ifdef CONFIG_PPC_FSL_BOOK3E +struct kmem_cache *hugepte_cache; +static int __init hugetlbpage_init(void) +{ + int psize; + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + unsigned shift; + + if (!mmu_psize_defs[psize].shift) + continue; + + shift = mmu_psize_to_shift(psize); + + /* Don't treat normal page sizes as huge... */ + if (shift != PAGE_SHIFT) + if (add_huge_page_size(1ULL << shift) < 0) + continue; + } + + /* + * Create a kmem cache for hugeptes. The bottom bits in the pte have + * size information encoded in them, so align them to allow this + */ + hugepte_cache = kmem_cache_create("hugepte-cache", sizeof(pte_t), + HUGEPD_SHIFT_MASK + 1, 0, NULL); + if (hugepte_cache == NULL) + panic("%s: Unable to create kmem cache for hugeptes\n", + __func__); + + /* Default hpage size = 4M */ + if (mmu_psize_defs[MMU_PAGE_4M].shift) + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift; + else + panic("%s: Unable to set default huge page size\n", __func__); + + + return 0; +} +#else +static int __init hugetlbpage_init(void) +{ + int psize; + + if (!mmu_has_feature(MMU_FTR_16M_PAGE)) + return -ENODEV; + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + unsigned shift; + unsigned pdshift; + + if (!mmu_psize_defs[psize].shift) + continue; + + shift = mmu_psize_to_shift(psize); + + if (add_huge_page_size(1ULL << shift) < 0) + continue; + + if (shift < PMD_SHIFT) + pdshift = PMD_SHIFT; + else if (shift < PUD_SHIFT) + pdshift = PUD_SHIFT; + else + pdshift = PGDIR_SHIFT; + /* + * if we have pdshift and shift value same, we don't + * use pgt cache for hugepd. + */ + if (pdshift != shift) { + pgtable_cache_add(pdshift - shift, NULL); + if (!PGT_CACHE(pdshift - shift)) + panic("hugetlbpage_init(): could not create " + "pgtable cache for %d bit pagesize\n", shift); + } + } + + /* Set default large page size. Currently, we pick 16M or 1M + * depending on what is available + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift) + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift; + else if (mmu_psize_defs[MMU_PAGE_1M].shift) + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift; + + return 0; +} +#endif +module_init(hugetlbpage_init); + +void flush_dcache_icache_hugepage(struct page *page) +{ + int i; + void *start; + + BUG_ON(!PageCompound(page)); + + for (i = 0; i < (1UL << compound_order(page)); i++) { + if (!PageHighMem(page)) { + __flush_dcache_icache(page_address(page+i)); + } else { + start = kmap_atomic(page+i); + __flush_dcache_icache(start); + kunmap_atomic(start); + } + } +} + +#endif /* CONFIG_HUGETLB_PAGE */ + +/* + * We have 4 cases for pgds and pmds: + * (1) invalid (all zeroes) + * (2) pointer to next table, as normal; bottom 6 bits == 0 + * (3) leaf pte for huge page, bottom two bits != 00 + * (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table + * + * So long as we atomically load page table pointers we are safe against teardown, + * we can follow the address down to the the page and take a ref on it. + * This function need to be called with interrupts disabled. We use this variant + * when we have MSR[EE] = 0 but the paca->soft_enabled = 1 + */ + +pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, + unsigned *shift) +{ + pgd_t pgd, *pgdp; + pud_t pud, *pudp; + pmd_t pmd, *pmdp; + pte_t *ret_pte; + hugepd_t *hpdp = NULL; + unsigned pdshift = PGDIR_SHIFT; + + if (shift) + *shift = 0; + + pgdp = pgdir + pgd_index(ea); + pgd = READ_ONCE(*pgdp); + /* + * Always operate on the local stack value. This make sure the + * value don't get updated by a parallel THP split/collapse, + * page fault or a page unmap. The return pte_t * is still not + * stable. So should be checked there for above conditions. + */ + if (pgd_none(pgd)) + return NULL; + else if (pgd_huge(pgd)) { + ret_pte = (pte_t *) pgdp; + goto out; + } else if (is_hugepd(__hugepd(pgd_val(pgd)))) + hpdp = (hugepd_t *)&pgd; + else { + /* + * Even if we end up with an unmap, the pgtable will not + * be freed, because we do an rcu free and here we are + * irq disabled + */ + pdshift = PUD_SHIFT; + pudp = pud_offset(&pgd, ea); + pud = READ_ONCE(*pudp); + + if (pud_none(pud)) + return NULL; + else if (pud_huge(pud)) { + ret_pte = (pte_t *) pudp; + goto out; + } else if (is_hugepd(__hugepd(pud_val(pud)))) + hpdp = (hugepd_t *)&pud; + else { + pdshift = PMD_SHIFT; + pmdp = pmd_offset(&pud, ea); + pmd = READ_ONCE(*pmdp); + /* + * A hugepage collapse is captured by pmd_none, because + * it mark the pmd none and do a hpte invalidate. + * + * We don't worry about pmd_trans_splitting here, The + * caller if it needs to handle the splitting case + * should check for that. + */ + if (pmd_none(pmd)) + return NULL; + + if (pmd_huge(pmd) || pmd_large(pmd)) { + ret_pte = (pte_t *) pmdp; + goto out; + } else if (is_hugepd(__hugepd(pmd_val(pmd)))) + hpdp = (hugepd_t *)&pmd; + else + return pte_offset_kernel(&pmd, ea); + } + } + if (!hpdp) + return NULL; + + ret_pte = hugepte_offset(*hpdp, ea, pdshift); + pdshift = hugepd_shift(*hpdp); +out: + if (shift) + *shift = pdshift; + return ret_pte; +} +EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte); + +int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask; + unsigned long pte_end; + struct page *head, *page, *tail; + pte_t pte; + int refs; + + pte_end = (addr + sz) & ~(sz-1); + if (pte_end < end) + end = pte_end; + + pte = READ_ONCE(*ptep); + mask = _PAGE_PRESENT | _PAGE_USER; + if (write) + mask |= _PAGE_RW; + + if ((pte_val(pte) & mask) != mask) + return 0; + + /* hugepages are never "special" */ + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + + refs = 0; + head = pte_page(pte); + + page = head + ((addr & (sz-1)) >> PAGE_SHIFT); + tail = page; + do { + VM_BUG_ON(compound_head(page) != head); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + + if (!page_cache_add_speculative(head, refs)) { + *nr -= refs; + return 0; + } + + if (unlikely(pte_val(pte) != pte_val(*ptep))) { + /* Could be optimized better */ + *nr -= refs; + while (refs--) + put_page(head); + return 0; + } + + /* + * Any tail page need their mapcount reference taken before we + * return. + */ + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; + } + + return 1; +} diff --git a/arch/powerpc/mm/icswx.c b/arch/powerpc/mm/icswx.c new file mode 100644 index 000000000..915412e4d --- /dev/null +++ b/arch/powerpc/mm/icswx.c @@ -0,0 +1,292 @@ +/* + * ICSWX and ACOP Management + * + * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/uaccess.h> + +#include "icswx.h" + +/* + * The processor and its L2 cache cause the icswx instruction to + * generate a COP_REQ transaction on PowerBus. The transaction has no + * address, and the processor does not perform an MMU access to + * authenticate the transaction. The command portion of the PowerBus + * COP_REQ transaction includes the LPAR_ID (LPID) and the coprocessor + * Process ID (PID), which the coprocessor compares to the authorized + * LPID and PID held in the coprocessor, to determine if the process + * is authorized to generate the transaction. The data of the COP_REQ + * transaction is 128-byte or less in size and is placed in cacheable + * memory on a 128-byte cache line boundary. + * + * The task to use a coprocessor should use use_cop() to mark the use + * of the Coprocessor Type (CT) and context switching. On a server + * class processor, the PID register is used only for coprocessor + * management + * and so a coprocessor PID is allocated before + * executing icswx + * instruction. Drop_cop() is used to free the + * coprocessor PID. + * + * Example: + * Host Fabric Interface (HFI) is a PowerPC network coprocessor. + * Each HFI have multiple windows. Each HFI window serves as a + * network device sending to and receiving from HFI network. + * HFI immediate send function uses icswx instruction. The immediate + * send function allows small (single cache-line) packets be sent + * without using the regular HFI send FIFO and doorbell, which are + * much slower than immediate send. + * + * For each task intending to use HFI immediate send, the HFI driver + * calls use_cop() to obtain a coprocessor PID for the task. + * The HFI driver then allocate a free HFI window and save the + * coprocessor PID to the HFI window to allow the task to use the + * HFI window. + * + * The HFI driver repeatedly creates immediate send packets and + * issues icswx instruction to send data through the HFI window. + * The HFI compares the coprocessor PID in the CPU PID register + * to the PID held in the HFI window to determine if the transaction + * is allowed. + * + * When the task to release the HFI window, the HFI driver calls + * drop_cop() to release the coprocessor PID. + */ + +void switch_cop(struct mm_struct *next) +{ +#ifdef CONFIG_PPC_ICSWX_PID + mtspr(SPRN_PID, next->context.cop_pid); +#endif + mtspr(SPRN_ACOP, next->context.acop); +} + +/** + * Start using a coprocessor. + * @acop: mask of coprocessor to be used. + * @mm: The mm the coprocessor to associate with. Most likely current mm. + * + * Return a positive PID if successful. Negative errno otherwise. + * The returned PID will be fed to the coprocessor to determine if an + * icswx transaction is authenticated. + */ +int use_cop(unsigned long acop, struct mm_struct *mm) +{ + int ret; + + if (!cpu_has_feature(CPU_FTR_ICSWX)) + return -ENODEV; + + if (!mm || !acop) + return -EINVAL; + + /* The page_table_lock ensures mm_users won't change under us */ + spin_lock(&mm->page_table_lock); + spin_lock(mm->context.cop_lockp); + + ret = get_cop_pid(mm); + if (ret < 0) + goto out; + + /* update acop */ + mm->context.acop |= acop; + + sync_cop(mm); + + /* + * If this is a threaded process then there might be other threads + * running. We need to send an IPI to force them to pick up any + * change in PID and ACOP. + */ + if (atomic_read(&mm->mm_users) > 1) + smp_call_function(sync_cop, mm, 1); + +out: + spin_unlock(mm->context.cop_lockp); + spin_unlock(&mm->page_table_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(use_cop); + +/** + * Stop using a coprocessor. + * @acop: mask of coprocessor to be stopped. + * @mm: The mm the coprocessor associated with. + */ +void drop_cop(unsigned long acop, struct mm_struct *mm) +{ + int free_pid; + + if (!cpu_has_feature(CPU_FTR_ICSWX)) + return; + + if (WARN_ON_ONCE(!mm)) + return; + + /* The page_table_lock ensures mm_users won't change under us */ + spin_lock(&mm->page_table_lock); + spin_lock(mm->context.cop_lockp); + + mm->context.acop &= ~acop; + + free_pid = disable_cop_pid(mm); + sync_cop(mm); + + /* + * If this is a threaded process then there might be other threads + * running. We need to send an IPI to force them to pick up any + * change in PID and ACOP. + */ + if (atomic_read(&mm->mm_users) > 1) + smp_call_function(sync_cop, mm, 1); + + if (free_pid != COP_PID_NONE) + free_cop_pid(free_pid); + + spin_unlock(mm->context.cop_lockp); + spin_unlock(&mm->page_table_lock); +} +EXPORT_SYMBOL_GPL(drop_cop); + +static int acop_use_cop(int ct) +{ + /* There is no alternate policy, yet */ + return -1; +} + +/* + * Get the instruction word at the NIP + */ +static u32 acop_get_inst(struct pt_regs *regs) +{ + u32 inst; + u32 __user *p; + + p = (u32 __user *)regs->nip; + if (!access_ok(VERIFY_READ, p, sizeof(*p))) + return 0; + + if (__get_user(inst, p)) + return 0; + + return inst; +} + +/** + * @regs: regsiters at time of interrupt + * @address: storage address + * @error_code: Fault code, usually the DSISR or ESR depending on + * processor type + * + * Return 0 if we are able to resolve the data storage fault that + * results from a CT miss in the ACOP register. + */ +int acop_handle_fault(struct pt_regs *regs, unsigned long address, + unsigned long error_code) +{ + int ct; + u32 inst = 0; + + if (!cpu_has_feature(CPU_FTR_ICSWX)) { + pr_info("No coprocessors available"); + _exception(SIGILL, regs, ILL_ILLOPN, address); + } + + if (!user_mode(regs)) { + /* this could happen if the HV denies the + * kernel access, for now we just die */ + die("ICSWX from kernel failed", regs, SIGSEGV); + } + + /* Some implementations leave us a hint for the CT */ + ct = ICSWX_GET_CT_HINT(error_code); + if (ct < 0) { + /* we have to peek at the instruction word to figure out CT */ + u32 ccw; + u32 rs; + + inst = acop_get_inst(regs); + if (inst == 0) + return -1; + + rs = (inst >> (31 - 10)) & 0x1f; + ccw = regs->gpr[rs]; + ct = (ccw >> 16) & 0x3f; + } + + /* + * We could be here because another thread has enabled acop + * but the ACOP register has yet to be updated. + * + * This should have been taken care of by the IPI to sync all + * the threads (see smp_call_function(sync_cop, mm, 1)), but + * that could take forever if there are a significant amount + * of threads. + * + * Given the number of threads on some of these systems, + * perhaps this is the best way to sync ACOP rather than whack + * every thread with an IPI. + */ + if ((acop_copro_type_bit(ct) & current->active_mm->context.acop) != 0) { + sync_cop(current->active_mm); + return 0; + } + + /* check for alternate policy */ + if (!acop_use_cop(ct)) + return 0; + + /* at this point the CT is unknown to the system */ + pr_warn("%s[%d]: Coprocessor %d is unavailable\n", + current->comm, current->pid, ct); + + /* get inst if we don't already have it */ + if (inst == 0) { + inst = acop_get_inst(regs); + if (inst == 0) + return -1; + } + + /* Check if the instruction is the "record form" */ + if (inst & 1) { + /* + * the instruction is "record" form so we can reject + * using CR0 + */ + regs->ccr &= ~(0xful << 28); + regs->ccr |= ICSWX_RC_NOT_FOUND << 28; + + /* Move on to the next instruction */ + regs->nip += 4; + } else { + /* + * There is no architected mechanism to report a bad + * CT so we could either SIGILL or report nothing. + * Since the non-record version should only bu used + * for "hints" or "don't care" we should probably do + * nothing. However, I could see how some people + * might want an SIGILL so it here if you want it. + */ +#ifdef CONFIG_PPC_ICSWX_USE_SIGILL + _exception(SIGILL, regs, ILL_ILLOPN, address); +#else + regs->nip += 4; +#endif + } + + return 0; +} +EXPORT_SYMBOL_GPL(acop_handle_fault); diff --git a/arch/powerpc/mm/icswx.h b/arch/powerpc/mm/icswx.h new file mode 100644 index 000000000..6dedc08e6 --- /dev/null +++ b/arch/powerpc/mm/icswx.h @@ -0,0 +1,68 @@ +#ifndef _ARCH_POWERPC_MM_ICSWX_H_ +#define _ARCH_POWERPC_MM_ICSWX_H_ + +/* + * ICSWX and ACOP Management + * + * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <asm/mmu_context.h> + +/* also used to denote that PIDs are not used */ +#define COP_PID_NONE 0 + +static inline void sync_cop(void *arg) +{ + struct mm_struct *mm = arg; + + if (mm == current->active_mm) + switch_cop(current->active_mm); +} + +#ifdef CONFIG_PPC_ICSWX_PID +extern int get_cop_pid(struct mm_struct *mm); +extern int disable_cop_pid(struct mm_struct *mm); +extern void free_cop_pid(int free_pid); +#else +#define get_cop_pid(m) (COP_PID_NONE) +#define disable_cop_pid(m) (COP_PID_NONE) +#define free_cop_pid(p) +#endif + +/* + * These are implementation bits for architected registers. If this + * ever becomes architecture the should be moved to reg.h et. al. + */ +/* UCT is the same bit for Server and Embedded */ +#define ICSWX_DSI_UCT 0x00004000 /* Unavailable Coprocessor Type */ + +#ifdef CONFIG_PPC_BOOK3E +/* Embedded implementation gives us no hints as to what the CT is */ +#define ICSWX_GET_CT_HINT(x) (-1) +#else +/* Server implementation contains the CT value in the DSISR */ +#define ICSWX_DSISR_CTMASK 0x00003f00 +#define ICSWX_GET_CT_HINT(x) (((x) & ICSWX_DSISR_CTMASK) >> 8) +#endif + +#define ICSWX_RC_STARTED 0x8 /* The request has been started */ +#define ICSWX_RC_NOT_IDLE 0x4 /* No coprocessor found idle */ +#define ICSWX_RC_NOT_FOUND 0x2 /* No coprocessor found */ +#define ICSWX_RC_UNDEFINED 0x1 /* Reserved */ + +extern int acop_handle_fault(struct pt_regs *regs, unsigned long address, + unsigned long error_code); + +static inline u64 acop_copro_type_bit(unsigned int type) +{ + return 1ULL << (63 - type); +} + +#endif /* !_ARCH_POWERPC_MM_ICSWX_H_ */ diff --git a/arch/powerpc/mm/icswx_pid.c b/arch/powerpc/mm/icswx_pid.c new file mode 100644 index 000000000..91e30eb7d --- /dev/null +++ b/arch/powerpc/mm/icswx_pid.c @@ -0,0 +1,87 @@ +/* + * ICSWX and ACOP/PID Management + * + * Copyright (C) 2011 Anton Blanchard, IBM Corp. <anton@samba.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/idr.h> +#include <linux/module.h> +#include "icswx.h" + +#define COP_PID_MIN (COP_PID_NONE + 1) +#define COP_PID_MAX (0xFFFF) + +static DEFINE_SPINLOCK(mmu_context_acop_lock); +static DEFINE_IDA(cop_ida); + +static int new_cop_pid(struct ida *ida, int min_id, int max_id, + spinlock_t *lock) +{ + int index; + int err; + +again: + if (!ida_pre_get(ida, GFP_KERNEL)) + return -ENOMEM; + + spin_lock(lock); + err = ida_get_new_above(ida, min_id, &index); + spin_unlock(lock); + + if (err == -EAGAIN) + goto again; + else if (err) + return err; + + if (index > max_id) { + spin_lock(lock); + ida_remove(ida, index); + spin_unlock(lock); + return -ENOMEM; + } + + return index; +} + +int get_cop_pid(struct mm_struct *mm) +{ + int pid; + + if (mm->context.cop_pid == COP_PID_NONE) { + pid = new_cop_pid(&cop_ida, COP_PID_MIN, COP_PID_MAX, + &mmu_context_acop_lock); + if (pid >= 0) + mm->context.cop_pid = pid; + } + return mm->context.cop_pid; +} + +int disable_cop_pid(struct mm_struct *mm) +{ + int free_pid = COP_PID_NONE; + + if ((!mm->context.acop) && (mm->context.cop_pid != COP_PID_NONE)) { + free_pid = mm->context.cop_pid; + mm->context.cop_pid = COP_PID_NONE; + } + return free_pid; +} + +void free_cop_pid(int free_pid) +{ + spin_lock(&mmu_context_acop_lock); + ida_remove(&cop_ida, free_pid); + spin_unlock(&mmu_context_acop_lock); +} diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c new file mode 100644 index 000000000..a10be665b --- /dev/null +++ b/arch/powerpc/mm/init_32.c @@ -0,0 +1,214 @@ +/* + * PowerPC version + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * PPC44x/36-bit changes by Matt Porter (mporter@mvista.com) + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/stddef.h> +#include <linux/init.h> +#include <linux/highmem.h> +#include <linux/initrd.h> +#include <linux/pagemap.h> +#include <linux/memblock.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/hugetlb.h> + +#include <asm/pgalloc.h> +#include <asm/prom.h> +#include <asm/io.h> +#include <asm/pgtable.h> +#include <asm/mmu.h> +#include <asm/smp.h> +#include <asm/machdep.h> +#include <asm/btext.h> +#include <asm/tlb.h> +#include <asm/sections.h> +#include <asm/hugetlb.h> + +#include "mmu_decl.h" + +#if defined(CONFIG_KERNEL_START_BOOL) || defined(CONFIG_LOWMEM_SIZE_BOOL) +/* The amount of lowmem must be within 0xF0000000 - KERNELBASE. */ +#if (CONFIG_LOWMEM_SIZE > (0xF0000000 - PAGE_OFFSET)) +#error "You must adjust CONFIG_LOWMEM_SIZE or CONFIG_KERNEL_START" +#endif +#endif +#define MAX_LOW_MEM CONFIG_LOWMEM_SIZE + +phys_addr_t total_memory; +phys_addr_t total_lowmem; + +phys_addr_t memstart_addr = (phys_addr_t)~0ull; +EXPORT_SYMBOL(memstart_addr); +phys_addr_t kernstart_addr; +EXPORT_SYMBOL(kernstart_addr); + +#ifdef CONFIG_RELOCATABLE_PPC32 +/* Used in __va()/__pa() */ +long long virt_phys_offset; +EXPORT_SYMBOL(virt_phys_offset); +#endif + +phys_addr_t lowmem_end_addr; + +int boot_mapsize; +#ifdef CONFIG_PPC_PMAC +unsigned long agp_special_page; +EXPORT_SYMBOL(agp_special_page); +#endif + +void MMU_init(void); + +/* XXX should be in current.h -- paulus */ +extern struct task_struct *current_set[NR_CPUS]; + +/* + * this tells the system to map all of ram with the segregs + * (i.e. page tables) instead of the bats. + * -- Cort + */ +int __map_without_bats; +int __map_without_ltlbs; + +/* + * This tells the system to allow ioremapping memory marked as reserved. + */ +int __allow_ioremap_reserved; + +/* max amount of low RAM to map in */ +unsigned long __max_low_memory = MAX_LOW_MEM; + +/* + * Check for command-line options that affect what MMU_init will do. + */ +void __init MMU_setup(void) +{ + /* Check for nobats option (used in mapin_ram). */ + if (strstr(boot_command_line, "nobats")) { + __map_without_bats = 1; + } + + if (strstr(boot_command_line, "noltlbs")) { + __map_without_ltlbs = 1; + } +#ifdef CONFIG_DEBUG_PAGEALLOC + __map_without_bats = 1; + __map_without_ltlbs = 1; +#endif +} + +/* + * MMU_init sets up the basic memory mappings for the kernel, + * including both RAM and possibly some I/O regions, + * and sets up the page tables and the MMU hardware ready to go. + */ +void __init MMU_init(void) +{ + if (ppc_md.progress) + ppc_md.progress("MMU:enter", 0x111); + + /* parse args from command line */ + MMU_setup(); + + /* + * Reserve gigantic pages for hugetlb. This MUST occur before + * lowmem_end_addr is initialized below. + */ + reserve_hugetlb_gpages(); + + if (memblock.memory.cnt > 1) { +#ifndef CONFIG_WII + memblock_enforce_memory_limit(memblock.memory.regions[0].size); + printk(KERN_WARNING "Only using first contiguous memory region"); +#else + wii_memory_fixups(); +#endif + } + + total_lowmem = total_memory = memblock_end_of_DRAM() - memstart_addr; + lowmem_end_addr = memstart_addr + total_lowmem; + +#ifdef CONFIG_FSL_BOOKE + /* Freescale Book-E parts expect lowmem to be mapped by fixed TLB + * entries, so we need to adjust lowmem to match the amount we can map + * in the fixed entries */ + adjust_total_lowmem(); +#endif /* CONFIG_FSL_BOOKE */ + + if (total_lowmem > __max_low_memory) { + total_lowmem = __max_low_memory; + lowmem_end_addr = memstart_addr + total_lowmem; +#ifndef CONFIG_HIGHMEM + total_memory = total_lowmem; + memblock_enforce_memory_limit(total_lowmem); +#endif /* CONFIG_HIGHMEM */ + } + + /* Initialize the MMU hardware */ + if (ppc_md.progress) + ppc_md.progress("MMU:hw init", 0x300); + MMU_init_hw(); + + /* Map in all of RAM starting at KERNELBASE */ + if (ppc_md.progress) + ppc_md.progress("MMU:mapin", 0x301); + mapin_ram(); + + /* Initialize early top-down ioremap allocator */ + ioremap_bot = IOREMAP_TOP; + + /* Map in I/O resources */ + if (ppc_md.progress) + ppc_md.progress("MMU:setio", 0x302); + + if (ppc_md.progress) + ppc_md.progress("MMU:exit", 0x211); + + /* From now on, btext is no longer BAT mapped if it was at all */ +#ifdef CONFIG_BOOTX_TEXT + btext_unmap(); +#endif + + /* Shortly after that, the entire linear mapping will be available */ + memblock_set_current_limit(lowmem_end_addr); +} + +#ifdef CONFIG_8xx /* No 8xx specific .c file to put that in ... */ +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + +#ifdef CONFIG_PIN_TLB + /* 8xx can only access 24MB at the moment */ + memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01800000)); +#else + /* 8xx can only access 8MB at the moment */ + memblock_set_current_limit(min_t(u64, first_memblock_size, 0x00800000)); +#endif +} +#endif /* CONFIG_8xx */ diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c new file mode 100644 index 000000000..d747dd7bc --- /dev/null +++ b/arch/powerpc/mm/init_64.c @@ -0,0 +1,462 @@ +/* + * PowerPC version + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Dave Engebretsen <engebret@us.ibm.com> + * Rework for PPC64 port. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#undef DEBUG + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/stddef.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/highmem.h> +#include <linux/idr.h> +#include <linux/nodemask.h> +#include <linux/module.h> +#include <linux/poison.h> +#include <linux/memblock.h> +#include <linux/hugetlb.h> +#include <linux/slab.h> + +#include <asm/pgalloc.h> +#include <asm/page.h> +#include <asm/prom.h> +#include <asm/rtas.h> +#include <asm/io.h> +#include <asm/mmu_context.h> +#include <asm/pgtable.h> +#include <asm/mmu.h> +#include <asm/uaccess.h> +#include <asm/smp.h> +#include <asm/machdep.h> +#include <asm/tlb.h> +#include <asm/eeh.h> +#include <asm/processor.h> +#include <asm/mmzone.h> +#include <asm/cputable.h> +#include <asm/sections.h> +#include <asm/iommu.h> +#include <asm/vdso.h> + +#include "mmu_decl.h" + +#ifdef CONFIG_PPC_STD_MMU_64 +#if PGTABLE_RANGE > USER_VSID_RANGE +#warning Limited user VSID range means pagetable space is wasted +#endif + +#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) +#warning TASK_SIZE is smaller than it needs to be. +#endif +#endif /* CONFIG_PPC_STD_MMU_64 */ + +phys_addr_t memstart_addr = ~0; +EXPORT_SYMBOL_GPL(memstart_addr); +phys_addr_t kernstart_addr; +EXPORT_SYMBOL_GPL(kernstart_addr); + +static void pgd_ctor(void *addr) +{ + memset(addr, 0, PGD_TABLE_SIZE); +} + +static void pmd_ctor(void *addr) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + memset(addr, 0, PMD_TABLE_SIZE * 2); +#else + memset(addr, 0, PMD_TABLE_SIZE); +#endif +} + +struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE]; + +/* + * Create a kmem_cache() for pagetables. This is not used for PTE + * pages - they're linked to struct page, come from the normal free + * pages pool and have a different entry size (see real_pte_t) to + * everything else. Caches created by this function are used for all + * the higher level pagetables, and for hugepage pagetables. + */ +void pgtable_cache_add(unsigned shift, void (*ctor)(void *)) +{ + char *name; + unsigned long table_size = sizeof(void *) << shift; + unsigned long align = table_size; + + /* When batching pgtable pointers for RCU freeing, we store + * the index size in the low bits. Table alignment must be + * big enough to fit it. + * + * Likewise, hugeapge pagetable pointers contain a (different) + * shift value in the low bits. All tables must be aligned so + * as to leave enough 0 bits in the address to contain it. */ + unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1, + HUGEPD_SHIFT_MASK + 1); + struct kmem_cache *new; + + /* It would be nice if this was a BUILD_BUG_ON(), but at the + * moment, gcc doesn't seem to recognize is_power_of_2 as a + * constant expression, so so much for that. */ + BUG_ON(!is_power_of_2(minalign)); + BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE)); + + if (PGT_CACHE(shift)) + return; /* Already have a cache of this size */ + + align = max_t(unsigned long, align, minalign); + name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift); + new = kmem_cache_create(name, table_size, align, 0, ctor); + kfree(name); + pgtable_cache[shift - 1] = new; + pr_debug("Allocated pgtable cache for order %d\n", shift); +} + + +void pgtable_cache_init(void) +{ + pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor); + pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor); + if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_CACHE_INDEX)) + panic("Couldn't allocate pgtable caches"); + /* In all current configs, when the PUD index exists it's the + * same size as either the pgd or pmd index. Verify that the + * initialization above has also created a PUD cache. This + * will need re-examiniation if we add new possibilities for + * the pagetable layout. */ + BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE)); +} + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +/* + * Given an address within the vmemmap, determine the pfn of the page that + * represents the start of the section it is within. Note that we have to + * do this by hand as the proffered address may not be correctly aligned. + * Subtraction of non-aligned pointers produces undefined results. + */ +static unsigned long __meminit vmemmap_section_start(unsigned long page) +{ + unsigned long offset = page - ((unsigned long)(vmemmap)); + + /* Return the pfn of the start of the section. */ + return (offset / sizeof(struct page)) & PAGE_SECTION_MASK; +} + +/* + * Check if this vmemmap page is already initialised. If any section + * which overlaps this vmemmap page is initialised then this page is + * initialised already. + */ +static int __meminit vmemmap_populated(unsigned long start, int page_size) +{ + unsigned long end = start + page_size; + start = (unsigned long)(pfn_to_page(vmemmap_section_start(start))); + + for (; start < end; start += (PAGES_PER_SECTION * sizeof(struct page))) + if (pfn_valid(page_to_pfn((struct page *)start))) + return 1; + + return 0; +} + +/* On hash-based CPUs, the vmemmap is bolted in the hash table. + * + * On Book3E CPUs, the vmemmap is currently mapped in the top half of + * the vmalloc space using normal page tables, though the size of + * pages encoded in the PTEs can be different + */ + +#ifdef CONFIG_PPC_BOOK3E +static void __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + /* Create a PTE encoding without page size */ + unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED | + _PAGE_KERNEL_RW; + + /* PTEs only contain page size encodings up to 32M */ + BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf); + + /* Encode the size in the PTE */ + flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8; + + /* For each PTE for that area, map things. Note that we don't + * increment phys because all PTEs are of the large size and + * thus must have the low bits clear + */ + for (i = 0; i < page_size; i += PAGE_SIZE) + BUG_ON(map_kernel_page(start + i, phys, flags)); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static void vmemmap_remove_mapping(unsigned long start, + unsigned long page_size) +{ +} +#endif +#else /* CONFIG_PPC_BOOK3E */ +static void __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + int mapped = htab_bolt_mapping(start, start + page_size, phys, + pgprot_val(PAGE_KERNEL), + mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON(mapped < 0); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static void vmemmap_remove_mapping(unsigned long start, + unsigned long page_size) +{ + int mapped = htab_remove_mapping(start, start + page_size, + mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON(mapped < 0); +} +#endif + +#endif /* CONFIG_PPC_BOOK3E */ + +struct vmemmap_backing *vmemmap_list; +static struct vmemmap_backing *next; +static int num_left; +static int num_freed; + +static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node) +{ + struct vmemmap_backing *vmem_back; + /* get from freed entries first */ + if (num_freed) { + num_freed--; + vmem_back = next; + next = next->list; + + return vmem_back; + } + + /* allocate a page when required and hand out chunks */ + if (!num_left) { + next = vmemmap_alloc_block(PAGE_SIZE, node); + if (unlikely(!next)) { + WARN_ON(1); + return NULL; + } + num_left = PAGE_SIZE / sizeof(struct vmemmap_backing); + } + + num_left--; + + return next++; +} + +static __meminit void vmemmap_list_populate(unsigned long phys, + unsigned long start, + int node) +{ + struct vmemmap_backing *vmem_back; + + vmem_back = vmemmap_list_alloc(node); + if (unlikely(!vmem_back)) { + WARN_ON(1); + return; + } + + vmem_back->phys = phys; + vmem_back->virt_addr = start; + vmem_back->list = vmemmap_list; + + vmemmap_list = vmem_back; +} + +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) +{ + unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; + + /* Align to the page size of the linear mapping. */ + start = _ALIGN_DOWN(start, page_size); + + pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node); + + for (; start < end; start += page_size) { + void *p; + + if (vmemmap_populated(start, page_size)) + continue; + + p = vmemmap_alloc_block(page_size, node); + if (!p) + return -ENOMEM; + + vmemmap_list_populate(__pa(p), start, node); + + pr_debug(" * %016lx..%016lx allocated at %p\n", + start, start + page_size, p); + + vmemmap_create_mapping(start, page_size, __pa(p)); + } + + return 0; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static unsigned long vmemmap_list_free(unsigned long start) +{ + struct vmemmap_backing *vmem_back, *vmem_back_prev; + + vmem_back_prev = vmem_back = vmemmap_list; + + /* look for it with prev pointer recorded */ + for (; vmem_back; vmem_back = vmem_back->list) { + if (vmem_back->virt_addr == start) + break; + vmem_back_prev = vmem_back; + } + + if (unlikely(!vmem_back)) { + WARN_ON(1); + return 0; + } + + /* remove it from vmemmap_list */ + if (vmem_back == vmemmap_list) /* remove head */ + vmemmap_list = vmem_back->list; + else + vmem_back_prev->list = vmem_back->list; + + /* next point to this freed entry */ + vmem_back->list = next; + next = vmem_back; + num_freed++; + + return vmem_back->phys; +} + +void __ref vmemmap_free(unsigned long start, unsigned long end) +{ + unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; + + start = _ALIGN_DOWN(start, page_size); + + pr_debug("vmemmap_free %lx...%lx\n", start, end); + + for (; start < end; start += page_size) { + unsigned long addr; + + /* + * the section has already be marked as invalid, so + * vmemmap_populated() true means some other sections still + * in this page, so skip it. + */ + if (vmemmap_populated(start, page_size)) + continue; + + addr = vmemmap_list_free(start); + if (addr) { + struct page *page = pfn_to_page(addr >> PAGE_SHIFT); + + if (PageReserved(page)) { + /* allocated from bootmem */ + if (page_size < PAGE_SIZE) { + /* + * this shouldn't happen, but if it is + * the case, leave the memory there + */ + WARN_ON_ONCE(1); + } else { + unsigned int nr_pages = + 1 << get_order(page_size); + while (nr_pages--) + free_reserved_page(page++); + } + } else + free_pages((unsigned long)(__va(addr)), + get_order(page_size)); + + vmemmap_remove_mapping(start, page_size); + } + } +} +#endif +void register_page_bootmem_memmap(unsigned long section_nr, + struct page *start_page, unsigned long size) +{ +} + +/* + * We do not have access to the sparsemem vmemmap, so we fallback to + * walking the list of sparsemem blocks which we already maintain for + * the sake of crashdump. In the long run, we might want to maintain + * a tree if performance of that linear walk becomes a problem. + * + * realmode_pfn_to_page functions can fail due to: + * 1) As real sparsemem blocks do not lay in RAM continously (they + * are in virtual address space which is not available in the real mode), + * the requested page struct can be split between blocks so get_page/put_page + * may fail. + * 2) When huge pages are used, the get_page/put_page API will fail + * in real mode as the linked addresses in the page struct are virtual + * too. + */ +struct page *realmode_pfn_to_page(unsigned long pfn) +{ + struct vmemmap_backing *vmem_back; + struct page *page; + unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; + unsigned long pg_va = (unsigned long) pfn_to_page(pfn); + + for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) { + if (pg_va < vmem_back->virt_addr) + continue; + + /* After vmemmap_list entry free is possible, need check all */ + if ((pg_va + sizeof(struct page)) <= + (vmem_back->virt_addr + page_size)) { + page = (struct page *) (vmem_back->phys + pg_va - + vmem_back->virt_addr); + return page; + } + } + + /* Probably that page struct is split between real pages */ + return NULL; +} +EXPORT_SYMBOL_GPL(realmode_pfn_to_page); + +#elif defined(CONFIG_FLATMEM) + +struct page *realmode_pfn_to_page(unsigned long pfn) +{ + struct page *page = pfn_to_page(pfn); + return page; +} +EXPORT_SYMBOL_GPL(realmode_pfn_to_page); + +#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */ diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c new file mode 100644 index 000000000..45fda71fe --- /dev/null +++ b/arch/powerpc/mm/mem.c @@ -0,0 +1,571 @@ +/* + * PowerPC version + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * PPC44x/36-bit changes by Matt Porter (mporter@mvista.com) + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/export.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/gfp.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/stddef.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/highmem.h> +#include <linux/initrd.h> +#include <linux/pagemap.h> +#include <linux/suspend.h> +#include <linux/memblock.h> +#include <linux/hugetlb.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +#include <asm/pgalloc.h> +#include <asm/prom.h> +#include <asm/io.h> +#include <asm/mmu_context.h> +#include <asm/pgtable.h> +#include <asm/mmu.h> +#include <asm/smp.h> +#include <asm/machdep.h> +#include <asm/btext.h> +#include <asm/tlb.h> +#include <asm/sections.h> +#include <asm/sparsemem.h> +#include <asm/vdso.h> +#include <asm/fixmap.h> +#include <asm/swiotlb.h> +#include <asm/rtas.h> + +#include "mmu_decl.h" + +#ifndef CPU_FTR_COHERENT_ICACHE +#define CPU_FTR_COHERENT_ICACHE 0 /* XXX for now */ +#define CPU_FTR_NOEXECUTE 0 +#endif + +unsigned long long memory_limit; + +#ifdef CONFIG_HIGHMEM +pte_t *kmap_pte; +EXPORT_SYMBOL(kmap_pte); +pgprot_t kmap_prot; +EXPORT_SYMBOL(kmap_prot); + +static inline pte_t *virt_to_kpte(unsigned long vaddr) +{ + return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), + vaddr), vaddr), vaddr); +} +#endif + +int page_is_ram(unsigned long pfn) +{ +#ifndef CONFIG_PPC64 /* XXX for now */ + return pfn < max_pfn; +#else + unsigned long paddr = (pfn << PAGE_SHIFT); + struct memblock_region *reg; + + for_each_memblock(memory, reg) + if (paddr >= reg->base && paddr < (reg->base + reg->size)) + return 1; + return 0; +#endif +} + +pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot) +{ + if (ppc_md.phys_mem_access_prot) + return ppc_md.phys_mem_access_prot(file, pfn, size, vma_prot); + + if (!page_is_ram(pfn)) + vma_prot = pgprot_noncached(vma_prot); + + return vma_prot; +} +EXPORT_SYMBOL(phys_mem_access_prot); + +#ifdef CONFIG_MEMORY_HOTPLUG + +#ifdef CONFIG_NUMA +int memory_add_physaddr_to_nid(u64 start) +{ + return hot_add_scn_to_nid(start); +} +#endif + +int arch_add_memory(int nid, u64 start, u64 size) +{ + struct pglist_data *pgdata; + struct zone *zone; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + pgdata = NODE_DATA(nid); + + start = (unsigned long)__va(start); + if (create_section_mapping(start, start + size)) + return -EINVAL; + + /* this should work for most non-highmem platforms */ + zone = pgdata->node_zones + + zone_for_memory(nid, start, size, 0); + + return __add_pages(nid, zone, start_pfn, nr_pages); +} + +#ifdef CONFIG_MEMORY_HOTREMOVE +int arch_remove_memory(u64 start, u64 size) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + struct zone *zone; + int ret; + + zone = page_zone(pfn_to_page(start_pfn)); + ret = __remove_pages(zone, start_pfn, nr_pages); + if (ret) + return ret; + + /* Remove htab bolted mappings for this section of memory */ + start = (unsigned long)__va(start); + ret = remove_section_mapping(start, start + size); + + /* Ensure all vmalloc mappings are flushed in case they also + * hit that section of memory + */ + vm_unmap_aliases(); + + return ret; +} +#endif +#endif /* CONFIG_MEMORY_HOTPLUG */ + +/* + * walk_memory_resource() needs to make sure there is no holes in a given + * memory range. PPC64 does not maintain the memory layout in /proc/iomem. + * Instead it maintains it in memblock.memory structures. Walk through the + * memory regions, find holes and callback for contiguous regions. + */ +int +walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, + void *arg, int (*func)(unsigned long, unsigned long, void *)) +{ + struct memblock_region *reg; + unsigned long end_pfn = start_pfn + nr_pages; + unsigned long tstart, tend; + int ret = -1; + + for_each_memblock(memory, reg) { + tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); + tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); + if (tstart >= tend) + continue; + ret = (*func)(tstart, tend - tstart, arg); + if (ret) + break; + } + return ret; +} +EXPORT_SYMBOL_GPL(walk_system_ram_range); + +#ifndef CONFIG_NEED_MULTIPLE_NODES +void __init initmem_init(void) +{ + max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; + min_low_pfn = MEMORY_START >> PAGE_SHIFT; +#ifdef CONFIG_HIGHMEM + max_low_pfn = lowmem_end_addr >> PAGE_SHIFT; +#endif + + /* Place all memblock_regions in the same node and merge contiguous + * memblock_regions + */ + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0); + + /* XXX need to clip this if using highmem? */ + sparse_memory_present_with_active_regions(0); + sparse_init(); +} + +/* mark pages that don't exist as nosave */ +static int __init mark_nonram_nosave(void) +{ + struct memblock_region *reg, *prev = NULL; + + for_each_memblock(memory, reg) { + if (prev && + memblock_region_memory_end_pfn(prev) < memblock_region_memory_base_pfn(reg)) + register_nosave_region(memblock_region_memory_end_pfn(prev), + memblock_region_memory_base_pfn(reg)); + prev = reg; + } + return 0; +} +#else /* CONFIG_NEED_MULTIPLE_NODES */ +static int __init mark_nonram_nosave(void) +{ + return 0; +} +#endif + +static bool zone_limits_final; + +static unsigned long max_zone_pfns[MAX_NR_ZONES] = { + [0 ... MAX_NR_ZONES - 1] = ~0UL +}; + +/* + * Restrict the specified zone and all more restrictive zones + * to be below the specified pfn. May not be called after + * paging_init(). + */ +void __init limit_zone_pfn(enum zone_type zone, unsigned long pfn_limit) +{ + int i; + + if (WARN_ON(zone_limits_final)) + return; + + for (i = zone; i >= 0; i--) { + if (max_zone_pfns[i] > pfn_limit) + max_zone_pfns[i] = pfn_limit; + } +} + +/* + * Find the least restrictive zone that is entirely below the + * specified pfn limit. Returns < 0 if no suitable zone is found. + * + * pfn_limit must be u64 because it can exceed 32 bits even on 32-bit + * systems -- the DMA limit can be higher than any possible real pfn. + */ +int dma_pfn_limit_to_zone(u64 pfn_limit) +{ + enum zone_type top_zone = ZONE_NORMAL; + int i; + +#ifdef CONFIG_HIGHMEM + top_zone = ZONE_HIGHMEM; +#endif + + for (i = top_zone; i >= 0; i--) { + if (max_zone_pfns[i] <= pfn_limit) + return i; + } + + return -EPERM; +} + +/* + * paging_init() sets up the page tables - in fact we've already done this. + */ +void __init paging_init(void) +{ + unsigned long long total_ram = memblock_phys_mem_size(); + phys_addr_t top_of_ram = memblock_end_of_DRAM(); + enum zone_type top_zone; + +#ifdef CONFIG_PPC32 + unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1); + unsigned long end = __fix_to_virt(FIX_HOLE); + + for (; v < end; v += PAGE_SIZE) + map_page(v, 0, 0); /* XXX gross */ +#endif + +#ifdef CONFIG_HIGHMEM + map_page(PKMAP_BASE, 0, 0); /* XXX gross */ + pkmap_page_table = virt_to_kpte(PKMAP_BASE); + + kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); + kmap_prot = PAGE_KERNEL; +#endif /* CONFIG_HIGHMEM */ + + printk(KERN_DEBUG "Top of RAM: 0x%llx, Total RAM: 0x%llx\n", + (unsigned long long)top_of_ram, total_ram); + printk(KERN_DEBUG "Memory hole size: %ldMB\n", + (long int)((top_of_ram - total_ram) >> 20)); + +#ifdef CONFIG_HIGHMEM + top_zone = ZONE_HIGHMEM; + limit_zone_pfn(ZONE_NORMAL, lowmem_end_addr >> PAGE_SHIFT); +#else + top_zone = ZONE_NORMAL; +#endif + + limit_zone_pfn(top_zone, top_of_ram >> PAGE_SHIFT); + zone_limits_final = true; + free_area_init_nodes(max_zone_pfns); + + mark_nonram_nosave(); +} + +void __init mem_init(void) +{ + /* + * book3s is limited to 16 page sizes due to encoding this in + * a 4-bit field for slices. + */ + BUILD_BUG_ON(MMU_PAGE_COUNT > 16); + +#ifdef CONFIG_SWIOTLB + swiotlb_init(0); +#endif + + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); + set_max_mapnr(max_pfn); + free_all_bootmem(); + +#ifdef CONFIG_HIGHMEM + { + unsigned long pfn, highmem_mapnr; + + highmem_mapnr = lowmem_end_addr >> PAGE_SHIFT; + for (pfn = highmem_mapnr; pfn < max_mapnr; ++pfn) { + phys_addr_t paddr = (phys_addr_t)pfn << PAGE_SHIFT; + struct page *page = pfn_to_page(pfn); + if (!memblock_is_reserved(paddr)) + free_highmem_page(page); + } + } +#endif /* CONFIG_HIGHMEM */ + +#if defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_SMP) + /* + * If smp is enabled, next_tlbcam_idx is initialized in the cpu up + * functions.... do it here for the non-smp case. + */ + per_cpu(next_tlbcam_idx, smp_processor_id()) = + (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) - 1; +#endif + + mem_init_print_info(NULL); +#ifdef CONFIG_PPC32 + pr_info("Kernel virtual memory layout:\n"); + pr_info(" * 0x%08lx..0x%08lx : fixmap\n", FIXADDR_START, FIXADDR_TOP); +#ifdef CONFIG_HIGHMEM + pr_info(" * 0x%08lx..0x%08lx : highmem PTEs\n", + PKMAP_BASE, PKMAP_ADDR(LAST_PKMAP)); +#endif /* CONFIG_HIGHMEM */ +#ifdef CONFIG_NOT_COHERENT_CACHE + pr_info(" * 0x%08lx..0x%08lx : consistent mem\n", + IOREMAP_TOP, IOREMAP_TOP + CONFIG_CONSISTENT_SIZE); +#endif /* CONFIG_NOT_COHERENT_CACHE */ + pr_info(" * 0x%08lx..0x%08lx : early ioremap\n", + ioremap_bot, IOREMAP_TOP); + pr_info(" * 0x%08lx..0x%08lx : vmalloc & ioremap\n", + VMALLOC_START, VMALLOC_END); +#endif /* CONFIG_PPC32 */ +} + +void free_initmem(void) +{ + ppc_md.progress = ppc_printk_progress; + free_initmem_default(POISON_FREE_INITMEM); +} + +#ifdef CONFIG_BLK_DEV_INITRD +void __init free_initrd_mem(unsigned long start, unsigned long end) +{ + free_reserved_area((void *)start, (void *)end, -1, "initrd"); +} +#endif + +/* + * This is called when a page has been modified by the kernel. + * It just marks the page as not i-cache clean. We do the i-cache + * flush later when the page is given to a user process, if necessary. + */ +void flush_dcache_page(struct page *page) +{ + if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) + return; + /* avoid an atomic op if possible */ + if (test_bit(PG_arch_1, &page->flags)) + clear_bit(PG_arch_1, &page->flags); +} +EXPORT_SYMBOL(flush_dcache_page); + +void flush_dcache_icache_page(struct page *page) +{ +#ifdef CONFIG_HUGETLB_PAGE + if (PageCompound(page)) { + flush_dcache_icache_hugepage(page); + return; + } +#endif +#ifdef CONFIG_BOOKE + { + void *start = kmap_atomic(page); + __flush_dcache_icache(start); + kunmap_atomic(start); + } +#elif defined(CONFIG_8xx) || defined(CONFIG_PPC64) + /* On 8xx there is no need to kmap since highmem is not supported */ + __flush_dcache_icache(page_address(page)); +#else + __flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT); +#endif +} +EXPORT_SYMBOL(flush_dcache_icache_page); + +void clear_user_page(void *page, unsigned long vaddr, struct page *pg) +{ + clear_page(page); + + /* + * We shouldn't have to do this, but some versions of glibc + * require it (ld.so assumes zero filled pages are icache clean) + * - Anton + */ + flush_dcache_page(pg); +} +EXPORT_SYMBOL(clear_user_page); + +void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, + struct page *pg) +{ + copy_page(vto, vfrom); + + /* + * We should be able to use the following optimisation, however + * there are two problems. + * Firstly a bug in some versions of binutils meant PLT sections + * were not marked executable. + * Secondly the first word in the GOT section is blrl, used + * to establish the GOT address. Until recently the GOT was + * not marked executable. + * - Anton + */ +#if 0 + if (!vma->vm_file && ((vma->vm_flags & VM_EXEC) == 0)) + return; +#endif + + flush_dcache_page(pg); +} + +void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, + unsigned long addr, int len) +{ + unsigned long maddr; + + maddr = (unsigned long) kmap(page) + (addr & ~PAGE_MASK); + flush_icache_range(maddr, maddr + len); + kunmap(page); +} +EXPORT_SYMBOL(flush_icache_user_range); + +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a PTE in the linux page tables. + * We use it to preload an HPTE into the hash table corresponding to + * the updated linux PTE. + * + * This must always be called with the pte lock held. + */ +void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep) +{ +#ifdef CONFIG_PPC_STD_MMU + /* + * We don't need to worry about _PAGE_PRESENT here because we are + * called with either mm->page_table_lock held or ptl lock held + */ + unsigned long access = 0, trap; + + /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ + if (!pte_young(*ptep) || address >= TASK_SIZE) + return; + + /* We try to figure out if we are coming from an instruction + * access fault and pass that down to __hash_page so we avoid + * double-faulting on execution of fresh text. We have to test + * for regs NULL since init will get here first thing at boot + * + * We also avoid filling the hash if not coming from a fault + */ + if (current->thread.regs == NULL) + return; + trap = TRAP(current->thread.regs); + if (trap == 0x400) + access |= _PAGE_EXEC; + else if (trap != 0x300) + return; + hash_preload(vma->vm_mm, address, access, trap); +#endif /* CONFIG_PPC_STD_MMU */ +#if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \ + && defined(CONFIG_HUGETLB_PAGE) + if (is_vm_hugetlb_page(vma)) + book3e_hugetlb_preload(vma, address, *ptep); +#endif +} + +/* + * System memory should not be in /proc/iomem but various tools expect it + * (eg kdump). + */ +static int __init add_system_ram_resources(void) +{ + struct memblock_region *reg; + + for_each_memblock(memory, reg) { + struct resource *res; + unsigned long base = reg->base; + unsigned long size = reg->size; + + res = kzalloc(sizeof(struct resource), GFP_KERNEL); + WARN_ON(!res); + + if (res) { + res->name = "System RAM"; + res->start = base; + res->end = base + size - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + WARN_ON(request_resource(&iomem_resource, res) < 0); + } + } + + return 0; +} +subsys_initcall(add_system_ram_resources); + +#ifdef CONFIG_STRICT_DEVMEM +/* + * devmem_is_allowed(): check to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * Access has to be given to non-kernel-ram areas as well, these contain the + * PCI mmio resources as well as potential bios/acpi data regions. + */ +int devmem_is_allowed(unsigned long pfn) +{ + if (iomem_is_exclusive(pfn << PAGE_SHIFT)) + return 0; + if (!page_is_ram(pfn)) + return 1; + if (page_is_rtas_user_buf(pfn)) + return 1; + return 0; +} +#endif /* CONFIG_STRICT_DEVMEM */ diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c new file mode 100644 index 000000000..0f0502e12 --- /dev/null +++ b/arch/powerpc/mm/mmap.c @@ -0,0 +1,103 @@ +/* + * flexible mmap layout support + * + * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + * Started by Ingo Molnar <mingo@elte.hu> + */ + +#include <linux/personality.h> +#include <linux/mm.h> +#include <linux/random.h> +#include <linux/sched.h> + +/* + * Top of mmap area (just below the process stack). + * + * Leave at least a ~128 MB hole on 32bit applications. + * + * On 64bit applications we randomise the stack by 1GB so we need to + * space our mmap start address by a further 1GB, otherwise there is a + * chance the mmap area will end up closer to the stack than our ulimit + * requires. + */ +#define MIN_GAP32 (128*1024*1024) +#define MIN_GAP64 ((128 + 1024)*1024*1024UL) +#define MIN_GAP ((is_32bit_task()) ? MIN_GAP32 : MIN_GAP64) +#define MAX_GAP (TASK_SIZE/6*5) + +static inline int mmap_is_legacy(void) +{ + if (current->personality & ADDR_COMPAT_LAYOUT) + return 1; + + if (rlimit(RLIMIT_STACK) == RLIM_INFINITY) + return 1; + + return sysctl_legacy_va_layout; +} + +unsigned long arch_mmap_rnd(void) +{ + unsigned long rnd; + + /* 8MB for 32bit, 1GB for 64bit */ + if (is_32bit_task()) + rnd = (unsigned long)get_random_int() % (1<<(23-PAGE_SHIFT)); + else + rnd = (unsigned long)get_random_int() % (1<<(30-PAGE_SHIFT)); + + return rnd << PAGE_SHIFT; +} + +static inline unsigned long mmap_base(unsigned long rnd) +{ + unsigned long gap = rlimit(RLIMIT_STACK); + + if (gap < MIN_GAP) + gap = MIN_GAP; + else if (gap > MAX_GAP) + gap = MAX_GAP; + + return PAGE_ALIGN(TASK_SIZE - gap - rnd); +} + +/* + * This function, called very early during the creation of a new + * process VM image, sets up which VM layout function to use: + */ +void arch_pick_mmap_layout(struct mm_struct *mm) +{ + unsigned long random_factor = 0UL; + + if (current->flags & PF_RANDOMIZE) + random_factor = arch_mmap_rnd(); + + /* + * Fall back to the standard layout if the personality + * bit is set, or if the expected stack growth is unlimited: + */ + if (mmap_is_legacy()) { + mm->mmap_base = TASK_UNMAPPED_BASE; + mm->get_unmapped_area = arch_get_unmapped_area; + } else { + mm->mmap_base = mmap_base(random_factor); + mm->get_unmapped_area = arch_get_unmapped_area_topdown; + } +} diff --git a/arch/powerpc/mm/mmu_context_hash32.c b/arch/powerpc/mm/mmu_context_hash32.c new file mode 100644 index 000000000..aa5a7fd89 --- /dev/null +++ b/arch/powerpc/mm/mmu_context_hash32.c @@ -0,0 +1,119 @@ +/* + * This file contains the routines for handling the MMU on those + * PowerPC implementations where the MMU substantially follows the + * architecture specification. This includes the 6xx, 7xx, 7xxx, + * and 8260 implementations but excludes the 8xx and 4xx. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/export.h> + +#include <asm/mmu_context.h> +#include <asm/tlbflush.h> + +/* + * On 32-bit PowerPC 6xx/7xx/7xxx CPUs, we use a set of 16 VSIDs + * (virtual segment identifiers) for each context. Although the + * hardware supports 24-bit VSIDs, and thus >1 million contexts, + * we only use 32,768 of them. That is ample, since there can be + * at most around 30,000 tasks in the system anyway, and it means + * that we can use a bitmap to indicate which contexts are in use. + * Using a bitmap means that we entirely avoid all of the problems + * that we used to have when the context number overflowed, + * particularly on SMP systems. + * -- paulus. + */ +#define NO_CONTEXT ((unsigned long) -1) +#define LAST_CONTEXT 32767 +#define FIRST_CONTEXT 1 + +/* + * This function defines the mapping from contexts to VSIDs (virtual + * segment IDs). We use a skew on both the context and the high 4 bits + * of the 32-bit virtual address (the "effective segment ID") in order + * to spread out the entries in the MMU hash table. Note, if this + * function is changed then arch/ppc/mm/hashtable.S will have to be + * changed to correspond. + * + * + * CTX_TO_VSID(ctx, va) (((ctx) * (897 * 16) + ((va) >> 28) * 0x111) \ + * & 0xffffff) + */ + +static unsigned long next_mmu_context; +static unsigned long context_map[LAST_CONTEXT / BITS_PER_LONG + 1]; + +unsigned long __init_new_context(void) +{ + unsigned long ctx = next_mmu_context; + + while (test_and_set_bit(ctx, context_map)) { + ctx = find_next_zero_bit(context_map, LAST_CONTEXT+1, ctx); + if (ctx > LAST_CONTEXT) + ctx = 0; + } + next_mmu_context = (ctx + 1) & LAST_CONTEXT; + + return ctx; +} +EXPORT_SYMBOL_GPL(__init_new_context); + +/* + * Set up the context for a new address space. + */ +int init_new_context(struct task_struct *t, struct mm_struct *mm) +{ + mm->context.id = __init_new_context(); + + return 0; +} + +/* + * Free a context ID. Make sure to call this with preempt disabled! + */ +void __destroy_context(unsigned long ctx) +{ + clear_bit(ctx, context_map); +} +EXPORT_SYMBOL_GPL(__destroy_context); + +/* + * We're finished using the context for an address space. + */ +void destroy_context(struct mm_struct *mm) +{ + preempt_disable(); + if (mm->context.id != NO_CONTEXT) { + __destroy_context(mm->context.id); + mm->context.id = NO_CONTEXT; + } + preempt_enable(); +} + +/* + * Initialize the context management stuff. + */ +void __init mmu_context_init(void) +{ + /* Reserve context 0 for kernel use */ + context_map[0] = (1 << FIRST_CONTEXT) - 1; + next_mmu_context = FIRST_CONTEXT; +} diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c new file mode 100644 index 000000000..178876aef --- /dev/null +++ b/arch/powerpc/mm/mmu_context_hash64.c @@ -0,0 +1,146 @@ +/* + * MMU context allocation for 64-bit kernels. + * + * Copyright (C) 2004 Anton Blanchard, IBM Corp. <anton@samba.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/idr.h> +#include <linux/export.h> +#include <linux/gfp.h> +#include <linux/slab.h> + +#include <asm/mmu_context.h> +#include <asm/pgalloc.h> + +#include "icswx.h" + +static DEFINE_SPINLOCK(mmu_context_lock); +static DEFINE_IDA(mmu_context_ida); + +int __init_new_context(void) +{ + int index; + int err; + +again: + if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL)) + return -ENOMEM; + + spin_lock(&mmu_context_lock); + err = ida_get_new_above(&mmu_context_ida, 1, &index); + spin_unlock(&mmu_context_lock); + + if (err == -EAGAIN) + goto again; + else if (err) + return err; + + if (index > MAX_USER_CONTEXT) { + spin_lock(&mmu_context_lock); + ida_remove(&mmu_context_ida, index); + spin_unlock(&mmu_context_lock); + return -ENOMEM; + } + + return index; +} +EXPORT_SYMBOL_GPL(__init_new_context); + +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + int index; + + index = __init_new_context(); + if (index < 0) + return index; + + /* The old code would re-promote on fork, we don't do that + * when using slices as it could cause problem promoting slices + * that have been forced down to 4K + */ + if (slice_mm_new_context(mm)) + slice_set_user_psize(mm, mmu_virtual_psize); + subpage_prot_init_new_context(mm); + mm->context.id = index; +#ifdef CONFIG_PPC_ICSWX + mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL); + if (!mm->context.cop_lockp) { + __destroy_context(index); + subpage_prot_free(mm); + mm->context.id = MMU_NO_CONTEXT; + return -ENOMEM; + } + spin_lock_init(mm->context.cop_lockp); +#endif /* CONFIG_PPC_ICSWX */ + +#ifdef CONFIG_PPC_64K_PAGES + mm->context.pte_frag = NULL; +#endif + return 0; +} + +void __destroy_context(int context_id) +{ + spin_lock(&mmu_context_lock); + ida_remove(&mmu_context_ida, context_id); + spin_unlock(&mmu_context_lock); +} +EXPORT_SYMBOL_GPL(__destroy_context); + +#ifdef CONFIG_PPC_64K_PAGES +static void destroy_pagetable_page(struct mm_struct *mm) +{ + int count; + void *pte_frag; + struct page *page; + + pte_frag = mm->context.pte_frag; + if (!pte_frag) + return; + + page = virt_to_page(pte_frag); + /* drop all the pending references */ + count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT; + /* We allow PTE_FRAG_NR fragments from a PTE page */ + count = atomic_sub_return(PTE_FRAG_NR - count, &page->_count); + if (!count) { + pgtable_page_dtor(page); + free_hot_cold_page(page, 0); + } +} + +#else +static inline void destroy_pagetable_page(struct mm_struct *mm) +{ + return; +} +#endif + + +void destroy_context(struct mm_struct *mm) +{ + +#ifdef CONFIG_PPC_ICSWX + drop_cop(mm->context.acop, mm); + kfree(mm->context.cop_lockp); + mm->context.cop_lockp = NULL; +#endif /* CONFIG_PPC_ICSWX */ + + destroy_pagetable_page(mm); + __destroy_context(mm->context.id); + subpage_prot_free(mm); + mm->context.id = MMU_NO_CONTEXT; +} diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c new file mode 100644 index 000000000..986afbc22 --- /dev/null +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -0,0 +1,490 @@ +/* + * This file contains the routines for handling the MMU on those + * PowerPC implementations where the MMU is not using the hash + * table, such as 8xx, 4xx, BookE's etc... + * + * Copyright 2008 Ben Herrenschmidt <benh@kernel.crashing.org> + * IBM Corp. + * + * Derived from previous arch/powerpc/mm/mmu_context.c + * and arch/powerpc/include/asm/mmu_context.h + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * TODO: + * + * - The global context lock will not scale very well + * - The maps should be dynamically allocated to allow for processors + * that support more PID bits at runtime + * - Implement flush_tlb_mm() by making the context stale and picking + * a new one + * - More aggressively clear stale map bits and maybe find some way to + * also clear mm->cpu_vm_mask bits when processes are migrated + */ + +//#define DEBUG_MAP_CONSISTENCY +//#define DEBUG_CLAMP_LAST_CONTEXT 31 +//#define DEBUG_HARDER + +/* We don't use DEBUG because it tends to be compiled in always nowadays + * and this would generate way too much output + */ +#ifdef DEBUG_HARDER +#define pr_hard(args...) printk(KERN_DEBUG args) +#define pr_hardcont(args...) printk(KERN_CONT args) +#else +#define pr_hard(args...) do { } while(0) +#define pr_hardcont(args...) do { } while(0) +#endif + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/bootmem.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <linux/slab.h> + +#include <asm/mmu_context.h> +#include <asm/tlbflush.h> + +#include "mmu_decl.h" + +static unsigned int first_context, last_context; +static unsigned int next_context, nr_free_contexts; +static unsigned long *context_map; +static unsigned long *stale_map[NR_CPUS]; +static struct mm_struct **context_mm; +static DEFINE_RAW_SPINLOCK(context_lock); +static bool no_selective_tlbil; + +#define CTX_MAP_SIZE \ + (sizeof(unsigned long) * (last_context / BITS_PER_LONG + 1)) + + +/* Steal a context from a task that has one at the moment. + * + * This is used when we are running out of available PID numbers + * on the processors. + * + * This isn't an LRU system, it just frees up each context in + * turn (sort-of pseudo-random replacement :). This would be the + * place to implement an LRU scheme if anyone was motivated to do it. + * -- paulus + * + * For context stealing, we use a slightly different approach for + * SMP and UP. Basically, the UP one is simpler and doesn't use + * the stale map as we can just flush the local CPU + * -- benh + */ +#ifdef CONFIG_SMP +static unsigned int steal_context_smp(unsigned int id) +{ + struct mm_struct *mm; + unsigned int cpu, max, i; + + max = last_context - first_context; + + /* Attempt to free next_context first and then loop until we manage */ + while (max--) { + /* Pick up the victim mm */ + mm = context_mm[id]; + + /* We have a candidate victim, check if it's active, on SMP + * we cannot steal active contexts + */ + if (mm->context.active) { + id++; + if (id > last_context) + id = first_context; + continue; + } + pr_hardcont(" | steal %d from 0x%p", id, mm); + + /* Mark this mm has having no context anymore */ + mm->context.id = MMU_NO_CONTEXT; + + /* Mark it stale on all CPUs that used this mm. For threaded + * implementations, we set it on all threads on each core + * represented in the mask. A future implementation will use + * a core map instead but this will do for now. + */ + for_each_cpu(cpu, mm_cpumask(mm)) { + for (i = cpu_first_thread_sibling(cpu); + i <= cpu_last_thread_sibling(cpu); i++) { + if (stale_map[i]) + __set_bit(id, stale_map[i]); + } + cpu = i - 1; + } + return id; + } + + /* This will happen if you have more CPUs than available contexts, + * all we can do here is wait a bit and try again + */ + raw_spin_unlock(&context_lock); + cpu_relax(); + raw_spin_lock(&context_lock); + + /* This will cause the caller to try again */ + return MMU_NO_CONTEXT; +} +#endif /* CONFIG_SMP */ + +static unsigned int steal_all_contexts(void) +{ + struct mm_struct *mm; + int cpu = smp_processor_id(); + unsigned int id; + + for (id = first_context; id <= last_context; id++) { + /* Pick up the victim mm */ + mm = context_mm[id]; + + pr_hardcont(" | steal %d from 0x%p", id, mm); + + /* Mark this mm as having no context anymore */ + mm->context.id = MMU_NO_CONTEXT; + if (id != first_context) { + context_mm[id] = NULL; + __clear_bit(id, context_map); +#ifdef DEBUG_MAP_CONSISTENCY + mm->context.active = 0; +#endif + } + __clear_bit(id, stale_map[cpu]); + } + + /* Flush the TLB for all contexts (not to be used on SMP) */ + _tlbil_all(); + + nr_free_contexts = last_context - first_context; + + return first_context; +} + +/* Note that this will also be called on SMP if all other CPUs are + * offlined, which means that it may be called for cpu != 0. For + * this to work, we somewhat assume that CPUs that are onlined + * come up with a fully clean TLB (or are cleaned when offlined) + */ +static unsigned int steal_context_up(unsigned int id) +{ + struct mm_struct *mm; + int cpu = smp_processor_id(); + + /* Pick up the victim mm */ + mm = context_mm[id]; + + pr_hardcont(" | steal %d from 0x%p", id, mm); + + /* Flush the TLB for that context */ + local_flush_tlb_mm(mm); + + /* Mark this mm has having no context anymore */ + mm->context.id = MMU_NO_CONTEXT; + + /* XXX This clear should ultimately be part of local_flush_tlb_mm */ + __clear_bit(id, stale_map[cpu]); + + return id; +} + +#ifdef DEBUG_MAP_CONSISTENCY +static void context_check_map(void) +{ + unsigned int id, nrf, nact; + + nrf = nact = 0; + for (id = first_context; id <= last_context; id++) { + int used = test_bit(id, context_map); + if (!used) + nrf++; + if (used != (context_mm[id] != NULL)) + pr_err("MMU: Context %d is %s and MM is %p !\n", + id, used ? "used" : "free", context_mm[id]); + if (context_mm[id] != NULL) + nact += context_mm[id]->context.active; + } + if (nrf != nr_free_contexts) { + pr_err("MMU: Free context count out of sync ! (%d vs %d)\n", + nr_free_contexts, nrf); + nr_free_contexts = nrf; + } + if (nact > num_online_cpus()) + pr_err("MMU: More active contexts than CPUs ! (%d vs %d)\n", + nact, num_online_cpus()); + if (first_context > 0 && !test_bit(0, context_map)) + pr_err("MMU: Context 0 has been freed !!!\n"); +} +#else +static void context_check_map(void) { } +#endif + +void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) +{ + unsigned int i, id, cpu = smp_processor_id(); + unsigned long *map; + + /* No lockless fast path .. yet */ + raw_spin_lock(&context_lock); + + pr_hard("[%d] activating context for mm @%p, active=%d, id=%d", + cpu, next, next->context.active, next->context.id); + +#ifdef CONFIG_SMP + /* Mark us active and the previous one not anymore */ + next->context.active++; + if (prev) { + pr_hardcont(" (old=0x%p a=%d)", prev, prev->context.active); + WARN_ON(prev->context.active < 1); + prev->context.active--; + } + + again: +#endif /* CONFIG_SMP */ + + /* If we already have a valid assigned context, skip all that */ + id = next->context.id; + if (likely(id != MMU_NO_CONTEXT)) { +#ifdef DEBUG_MAP_CONSISTENCY + if (context_mm[id] != next) + pr_err("MMU: mm 0x%p has id %d but context_mm[%d] says 0x%p\n", + next, id, id, context_mm[id]); +#endif + goto ctxt_ok; + } + + /* We really don't have a context, let's try to acquire one */ + id = next_context; + if (id > last_context) + id = first_context; + map = context_map; + + /* No more free contexts, let's try to steal one */ + if (nr_free_contexts == 0) { +#ifdef CONFIG_SMP + if (num_online_cpus() > 1) { + id = steal_context_smp(id); + if (id == MMU_NO_CONTEXT) + goto again; + goto stolen; + } +#endif /* CONFIG_SMP */ + if (no_selective_tlbil) + id = steal_all_contexts(); + else + id = steal_context_up(id); + goto stolen; + } + nr_free_contexts--; + + /* We know there's at least one free context, try to find it */ + while (__test_and_set_bit(id, map)) { + id = find_next_zero_bit(map, last_context+1, id); + if (id > last_context) + id = first_context; + } + stolen: + next_context = id + 1; + context_mm[id] = next; + next->context.id = id; + pr_hardcont(" | new id=%d,nrf=%d", id, nr_free_contexts); + + context_check_map(); + ctxt_ok: + + /* If that context got marked stale on this CPU, then flush the + * local TLB for it and unmark it before we use it + */ + if (test_bit(id, stale_map[cpu])) { + pr_hardcont(" | stale flush %d [%d..%d]", + id, cpu_first_thread_sibling(cpu), + cpu_last_thread_sibling(cpu)); + + local_flush_tlb_mm(next); + + /* XXX This clear should ultimately be part of local_flush_tlb_mm */ + for (i = cpu_first_thread_sibling(cpu); + i <= cpu_last_thread_sibling(cpu); i++) { + if (stale_map[i]) + __clear_bit(id, stale_map[i]); + } + } + + /* Flick the MMU and release lock */ + pr_hardcont(" -> %d\n", id); + set_context(id, next->pgd); + raw_spin_unlock(&context_lock); +} + +/* + * Set up the context for a new address space. + */ +int init_new_context(struct task_struct *t, struct mm_struct *mm) +{ + pr_hard("initing context for mm @%p\n", mm); + + mm->context.id = MMU_NO_CONTEXT; + mm->context.active = 0; + +#ifdef CONFIG_PPC_MM_SLICES + if (slice_mm_new_context(mm)) + slice_set_user_psize(mm, mmu_virtual_psize); +#endif + + return 0; +} + +/* + * We're finished using the context for an address space. + */ +void destroy_context(struct mm_struct *mm) +{ + unsigned long flags; + unsigned int id; + + if (mm->context.id == MMU_NO_CONTEXT) + return; + + WARN_ON(mm->context.active != 0); + + raw_spin_lock_irqsave(&context_lock, flags); + id = mm->context.id; + if (id != MMU_NO_CONTEXT) { + __clear_bit(id, context_map); + mm->context.id = MMU_NO_CONTEXT; +#ifdef DEBUG_MAP_CONSISTENCY + mm->context.active = 0; +#endif + context_mm[id] = NULL; + nr_free_contexts++; + } + raw_spin_unlock_irqrestore(&context_lock, flags); +} + +#ifdef CONFIG_SMP + +static int mmu_context_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned int)(long)hcpu; + + /* We don't touch CPU 0 map, it's allocated at aboot and kept + * around forever + */ + if (cpu == boot_cpuid) + return NOTIFY_OK; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + pr_devel("MMU: Allocating stale context map for CPU %d\n", cpu); + stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu); + kfree(stale_map[cpu]); + stale_map[cpu] = NULL; + + /* We also clear the cpu_vm_mask bits of CPUs going away */ + clear_tasks_mm_cpumask(cpu); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block mmu_context_cpu_nb = { + .notifier_call = mmu_context_cpu_notify, +}; + +#endif /* CONFIG_SMP */ + +/* + * Initialize the context management stuff. + */ +void __init mmu_context_init(void) +{ + /* Mark init_mm as being active on all possible CPUs since + * we'll get called with prev == init_mm the first time + * we schedule on a given CPU + */ + init_mm.context.active = NR_CPUS; + + /* + * The MPC8xx has only 16 contexts. We rotate through them on each + * task switch. A better way would be to keep track of tasks that + * own contexts, and implement an LRU usage. That way very active + * tasks don't always have to pay the TLB reload overhead. The + * kernel pages are mapped shared, so the kernel can run on behalf + * of any task that makes a kernel entry. Shared does not mean they + * are not protected, just that the ASID comparison is not performed. + * -- Dan + * + * The IBM4xx has 256 contexts, so we can just rotate through these + * as a way of "switching" contexts. If the TID of the TLB is zero, + * the PID/TID comparison is disabled, so we can use a TID of zero + * to represent all kernel pages as shared among all contexts. + * -- Dan + * + * The IBM 47x core supports 16-bit PIDs, thus 65535 contexts. We + * should normally never have to steal though the facility is + * present if needed. + * -- BenH + */ + if (mmu_has_feature(MMU_FTR_TYPE_8xx)) { + first_context = 0; + last_context = 15; + no_selective_tlbil = true; + } else if (mmu_has_feature(MMU_FTR_TYPE_47x)) { + first_context = 1; + last_context = 65535; + no_selective_tlbil = false; + } else { + first_context = 1; + last_context = 255; + no_selective_tlbil = false; + } + +#ifdef DEBUG_CLAMP_LAST_CONTEXT + last_context = DEBUG_CLAMP_LAST_CONTEXT; +#endif + /* + * Allocate the maps used by context management + */ + context_map = memblock_virt_alloc(CTX_MAP_SIZE, 0); + context_mm = memblock_virt_alloc(sizeof(void *) * (last_context + 1), 0); +#ifndef CONFIG_SMP + stale_map[0] = memblock_virt_alloc(CTX_MAP_SIZE, 0); +#else + stale_map[boot_cpuid] = memblock_virt_alloc(CTX_MAP_SIZE, 0); + + register_cpu_notifier(&mmu_context_cpu_nb); +#endif + + printk(KERN_INFO + "MMU: Allocated %zu bytes of context maps for %d contexts\n", + 2 * CTX_MAP_SIZE + (sizeof(void *) * (last_context + 1)), + last_context - first_context + 1); + + /* + * Some processors have too few contexts to reserve one for + * init_mm, and require using context 0 for a normal task. + * Other processors reserve the use of context zero for the kernel. + * This code assumes first_context < 32. + */ + context_map[0] = (1 << first_context) - 1; + next_context = first_context; + nr_free_contexts = last_context - first_context + 1; +} + diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h new file mode 100644 index 000000000..085b66b10 --- /dev/null +++ b/arch/powerpc/mm/mmu_decl.h @@ -0,0 +1,167 @@ +/* + * Declarations of procedures and variables shared between files + * in arch/ppc/mm/. + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ +#include <linux/mm.h> +#include <asm/tlbflush.h> +#include <asm/mmu.h> + +#ifdef CONFIG_PPC_MMU_NOHASH + +/* + * On 40x and 8xx, we directly inline tlbia and tlbivax + */ +#if defined(CONFIG_40x) || defined(CONFIG_8xx) +static inline void _tlbil_all(void) +{ + asm volatile ("sync; tlbia; isync" : : : "memory"); +} +static inline void _tlbil_pid(unsigned int pid) +{ + asm volatile ("sync; tlbia; isync" : : : "memory"); +} +#define _tlbil_pid_noind(pid) _tlbil_pid(pid) + +#else /* CONFIG_40x || CONFIG_8xx */ +extern void _tlbil_all(void); +extern void _tlbil_pid(unsigned int pid); +#ifdef CONFIG_PPC_BOOK3E +extern void _tlbil_pid_noind(unsigned int pid); +#else +#define _tlbil_pid_noind(pid) _tlbil_pid(pid) +#endif +#endif /* !(CONFIG_40x || CONFIG_8xx) */ + +/* + * On 8xx, we directly inline tlbie, on others, it's extern + */ +#ifdef CONFIG_8xx +static inline void _tlbil_va(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind) +{ + asm volatile ("tlbie %0; sync" : : "r" (address) : "memory"); +} +#elif defined(CONFIG_PPC_BOOK3E) +extern void _tlbil_va(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind); +#else +extern void __tlbil_va(unsigned long address, unsigned int pid); +static inline void _tlbil_va(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind) +{ + __tlbil_va(address, pid); +} +#endif /* CONFIG_8xx */ + +#if defined(CONFIG_PPC_BOOK3E) || defined(CONFIG_PPC_47x) +extern void _tlbivax_bcast(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind); +#else +static inline void _tlbivax_bcast(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind) +{ + BUG(); +} +#endif + +#else /* CONFIG_PPC_MMU_NOHASH */ + +extern void hash_preload(struct mm_struct *mm, unsigned long ea, + unsigned long access, unsigned long trap); + + +extern void _tlbie(unsigned long address); +extern void _tlbia(void); + +#endif /* CONFIG_PPC_MMU_NOHASH */ + +#ifdef CONFIG_PPC32 + +extern void mapin_ram(void); +extern int map_page(unsigned long va, phys_addr_t pa, int flags); +extern void setbat(int index, unsigned long virt, phys_addr_t phys, + unsigned int size, pgprot_t prot); + +extern int __map_without_bats; +extern int __allow_ioremap_reserved; +extern unsigned long ioremap_base; +extern unsigned int rtas_data, rtas_size; + +struct hash_pte; +extern struct hash_pte *Hash, *Hash_end; +extern unsigned long Hash_size, Hash_mask; + +#endif /* CONFIG_PPC32 */ + +#ifdef CONFIG_PPC64 +extern int map_kernel_page(unsigned long ea, unsigned long pa, int flags); +#endif /* CONFIG_PPC64 */ + +extern unsigned long ioremap_bot; +extern unsigned long __max_low_memory; +extern phys_addr_t __initial_memory_limit_addr; +extern phys_addr_t total_memory; +extern phys_addr_t total_lowmem; +extern phys_addr_t memstart_addr; +extern phys_addr_t lowmem_end_addr; + +#ifdef CONFIG_WII +extern unsigned long wii_hole_start; +extern unsigned long wii_hole_size; + +extern unsigned long wii_mmu_mapin_mem2(unsigned long top); +extern void wii_memory_fixups(void); +#endif + +/* ...and now those things that may be slightly different between processor + * architectures. -- Dan + */ +#if defined(CONFIG_8xx) +#define MMU_init_hw() do { } while(0) +#define mmu_mapin_ram(top) (0UL) + +#elif defined(CONFIG_4xx) +extern void MMU_init_hw(void); +extern unsigned long mmu_mapin_ram(unsigned long top); + +#elif defined(CONFIG_PPC_FSL_BOOK3E) +extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx); +extern unsigned long calc_cam_sz(unsigned long ram, unsigned long virt, + phys_addr_t phys); +#ifdef CONFIG_PPC32 +extern void MMU_init_hw(void); +extern unsigned long mmu_mapin_ram(unsigned long top); +extern void adjust_total_lowmem(void); +extern int switch_to_as1(void); +extern void restore_to_as0(int esel, int offset, void *dt_ptr, int bootcpu); +#endif +extern void loadcam_entry(unsigned int index); + +struct tlbcam { + u32 MAS0; + u32 MAS1; + unsigned long MAS2; + u32 MAS3; + u32 MAS7; +}; +#elif defined(CONFIG_PPC32) +/* anything 32-bit except 4xx or 8xx */ +extern void MMU_init_hw(void); +extern unsigned long mmu_mapin_ram(unsigned long top); +#endif diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c new file mode 100644 index 000000000..5e80621d9 --- /dev/null +++ b/arch/powerpc/mm/numa.c @@ -0,0 +1,1652 @@ +/* + * pSeries NUMA support + * + * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#define pr_fmt(fmt) "numa: " fmt + +#include <linux/threads.h> +#include <linux/bootmem.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/export.h> +#include <linux/nodemask.h> +#include <linux/cpu.h> +#include <linux/notifier.h> +#include <linux/memblock.h> +#include <linux/of.h> +#include <linux/pfn.h> +#include <linux/cpuset.h> +#include <linux/node.h> +#include <linux/stop_machine.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/uaccess.h> +#include <linux/slab.h> +#include <asm/cputhreads.h> +#include <asm/sparsemem.h> +#include <asm/prom.h> +#include <asm/smp.h> +#include <asm/cputhreads.h> +#include <asm/topology.h> +#include <asm/firmware.h> +#include <asm/paca.h> +#include <asm/hvcall.h> +#include <asm/setup.h> +#include <asm/vdso.h> + +static int numa_enabled = 1; + +static char *cmdline __initdata; + +static int numa_debug; +#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); } + +int numa_cpu_lookup_table[NR_CPUS]; +cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; +struct pglist_data *node_data[MAX_NUMNODES]; + +EXPORT_SYMBOL(numa_cpu_lookup_table); +EXPORT_SYMBOL(node_to_cpumask_map); +EXPORT_SYMBOL(node_data); + +static int min_common_depth; +static int n_mem_addr_cells, n_mem_size_cells; +static int form1_affinity; + +#define MAX_DISTANCE_REF_POINTS 4 +static int distance_ref_points_depth; +static const __be32 *distance_ref_points; +static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS]; + +/* + * Allocate node_to_cpumask_map based on number of available nodes + * Requires node_possible_map to be valid. + * + * Note: cpumask_of_node() is not valid until after this is done. + */ +static void __init setup_node_to_cpumask_map(void) +{ + unsigned int node; + + /* setup nr_node_ids if not done yet */ + if (nr_node_ids == MAX_NUMNODES) + setup_nr_node_ids(); + + /* allocate the map */ + for (node = 0; node < nr_node_ids; node++) + alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); + + /* cpumask_of_node() will now work */ + dbg("Node to cpumask map for %d nodes\n", nr_node_ids); +} + +static int __init fake_numa_create_new_node(unsigned long end_pfn, + unsigned int *nid) +{ + unsigned long long mem; + char *p = cmdline; + static unsigned int fake_nid; + static unsigned long long curr_boundary; + + /* + * Modify node id, iff we started creating NUMA nodes + * We want to continue from where we left of the last time + */ + if (fake_nid) + *nid = fake_nid; + /* + * In case there are no more arguments to parse, the + * node_id should be the same as the last fake node id + * (we've handled this above). + */ + if (!p) + return 0; + + mem = memparse(p, &p); + if (!mem) + return 0; + + if (mem < curr_boundary) + return 0; + + curr_boundary = mem; + + if ((end_pfn << PAGE_SHIFT) > mem) { + /* + * Skip commas and spaces + */ + while (*p == ',' || *p == ' ' || *p == '\t') + p++; + + cmdline = p; + fake_nid++; + *nid = fake_nid; + dbg("created new fake_node with id %d\n", fake_nid); + return 1; + } + return 0; +} + +static void reset_numa_cpu_lookup_table(void) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) + numa_cpu_lookup_table[cpu] = -1; +} + +static void update_numa_cpu_lookup_table(unsigned int cpu, int node) +{ + numa_cpu_lookup_table[cpu] = node; +} + +static void map_cpu_to_node(int cpu, int node) +{ + update_numa_cpu_lookup_table(cpu, node); + + dbg("adding cpu %d to node %d\n", cpu, node); + + if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) + cpumask_set_cpu(cpu, node_to_cpumask_map[node]); +} + +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR) +static void unmap_cpu_from_node(unsigned long cpu) +{ + int node = numa_cpu_lookup_table[cpu]; + + dbg("removing cpu %lu from node %d\n", cpu, node); + + if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) { + cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); + } else { + printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n", + cpu, node); + } +} +#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ + +/* must hold reference to node during call */ +static const __be32 *of_get_associativity(struct device_node *dev) +{ + return of_get_property(dev, "ibm,associativity", NULL); +} + +/* + * Returns the property linux,drconf-usable-memory if + * it exists (the property exists only in kexec/kdump kernels, + * added by kexec-tools) + */ +static const __be32 *of_get_usable_memory(struct device_node *memory) +{ + const __be32 *prop; + u32 len; + prop = of_get_property(memory, "linux,drconf-usable-memory", &len); + if (!prop || len < sizeof(unsigned int)) + return NULL; + return prop; +} + +int __node_distance(int a, int b) +{ + int i; + int distance = LOCAL_DISTANCE; + + if (!form1_affinity) + return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE); + + for (i = 0; i < distance_ref_points_depth; i++) { + if (distance_lookup_table[a][i] == distance_lookup_table[b][i]) + break; + + /* Double the distance for each NUMA level */ + distance *= 2; + } + + return distance; +} +EXPORT_SYMBOL(__node_distance); + +static void initialize_distance_lookup_table(int nid, + const __be32 *associativity) +{ + int i; + + if (!form1_affinity) + return; + + for (i = 0; i < distance_ref_points_depth; i++) { + const __be32 *entry; + + entry = &associativity[be32_to_cpu(distance_ref_points[i])]; + distance_lookup_table[nid][i] = of_read_number(entry, 1); + } +} + +/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa + * info is found. + */ +static int associativity_to_nid(const __be32 *associativity) +{ + int nid = -1; + + if (min_common_depth == -1) + goto out; + + if (of_read_number(associativity, 1) >= min_common_depth) + nid = of_read_number(&associativity[min_common_depth], 1); + + /* POWER4 LPAR uses 0xffff as invalid node */ + if (nid == 0xffff || nid >= MAX_NUMNODES) + nid = -1; + + if (nid > 0 && + of_read_number(associativity, 1) >= distance_ref_points_depth) + initialize_distance_lookup_table(nid, associativity); + +out: + return nid; +} + +/* Returns the nid associated with the given device tree node, + * or -1 if not found. + */ +static int of_node_to_nid_single(struct device_node *device) +{ + int nid = -1; + const __be32 *tmp; + + tmp = of_get_associativity(device); + if (tmp) + nid = associativity_to_nid(tmp); + return nid; +} + +/* Walk the device tree upwards, looking for an associativity id */ +int of_node_to_nid(struct device_node *device) +{ + struct device_node *tmp; + int nid = -1; + + of_node_get(device); + while (device) { + nid = of_node_to_nid_single(device); + if (nid != -1) + break; + + tmp = device; + device = of_get_parent(tmp); + of_node_put(tmp); + } + of_node_put(device); + + return nid; +} +EXPORT_SYMBOL_GPL(of_node_to_nid); + +static int __init find_min_common_depth(void) +{ + int depth; + struct device_node *root; + + if (firmware_has_feature(FW_FEATURE_OPAL)) + root = of_find_node_by_path("/ibm,opal"); + else + root = of_find_node_by_path("/rtas"); + if (!root) + root = of_find_node_by_path("/"); + + /* + * This property is a set of 32-bit integers, each representing + * an index into the ibm,associativity nodes. + * + * With form 0 affinity the first integer is for an SMP configuration + * (should be all 0's) and the second is for a normal NUMA + * configuration. We have only one level of NUMA. + * + * With form 1 affinity the first integer is the most significant + * NUMA boundary and the following are progressively less significant + * boundaries. There can be more than one level of NUMA. + */ + distance_ref_points = of_get_property(root, + "ibm,associativity-reference-points", + &distance_ref_points_depth); + + if (!distance_ref_points) { + dbg("NUMA: ibm,associativity-reference-points not found.\n"); + goto err; + } + + distance_ref_points_depth /= sizeof(int); + + if (firmware_has_feature(FW_FEATURE_OPAL) || + firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) { + dbg("Using form 1 affinity\n"); + form1_affinity = 1; + } + + if (form1_affinity) { + depth = of_read_number(distance_ref_points, 1); + } else { + if (distance_ref_points_depth < 2) { + printk(KERN_WARNING "NUMA: " + "short ibm,associativity-reference-points\n"); + goto err; + } + + depth = of_read_number(&distance_ref_points[1], 1); + } + + /* + * Warn and cap if the hardware supports more than + * MAX_DISTANCE_REF_POINTS domains. + */ + if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) { + printk(KERN_WARNING "NUMA: distance array capped at " + "%d entries\n", MAX_DISTANCE_REF_POINTS); + distance_ref_points_depth = MAX_DISTANCE_REF_POINTS; + } + + of_node_put(root); + return depth; + +err: + of_node_put(root); + return -1; +} + +static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells) +{ + struct device_node *memory = NULL; + + memory = of_find_node_by_type(memory, "memory"); + if (!memory) + panic("numa.c: No memory nodes found!"); + + *n_addr_cells = of_n_addr_cells(memory); + *n_size_cells = of_n_size_cells(memory); + of_node_put(memory); +} + +static unsigned long read_n_cells(int n, const __be32 **buf) +{ + unsigned long result = 0; + + while (n--) { + result = (result << 32) | of_read_number(*buf, 1); + (*buf)++; + } + return result; +} + +/* + * Read the next memblock list entry from the ibm,dynamic-memory property + * and return the information in the provided of_drconf_cell structure. + */ +static void read_drconf_cell(struct of_drconf_cell *drmem, const __be32 **cellp) +{ + const __be32 *cp; + + drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp); + + cp = *cellp; + drmem->drc_index = of_read_number(cp, 1); + drmem->reserved = of_read_number(&cp[1], 1); + drmem->aa_index = of_read_number(&cp[2], 1); + drmem->flags = of_read_number(&cp[3], 1); + + *cellp = cp + 4; +} + +/* + * Retrieve and validate the ibm,dynamic-memory property of the device tree. + * + * The layout of the ibm,dynamic-memory property is a number N of memblock + * list entries followed by N memblock list entries. Each memblock list entry + * contains information as laid out in the of_drconf_cell struct above. + */ +static int of_get_drconf_memory(struct device_node *memory, const __be32 **dm) +{ + const __be32 *prop; + u32 len, entries; + + prop = of_get_property(memory, "ibm,dynamic-memory", &len); + if (!prop || len < sizeof(unsigned int)) + return 0; + + entries = of_read_number(prop++, 1); + + /* Now that we know the number of entries, revalidate the size + * of the property read in to ensure we have everything + */ + if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int)) + return 0; + + *dm = prop; + return entries; +} + +/* + * Retrieve and validate the ibm,lmb-size property for drconf memory + * from the device tree. + */ +static u64 of_get_lmb_size(struct device_node *memory) +{ + const __be32 *prop; + u32 len; + + prop = of_get_property(memory, "ibm,lmb-size", &len); + if (!prop || len < sizeof(unsigned int)) + return 0; + + return read_n_cells(n_mem_size_cells, &prop); +} + +struct assoc_arrays { + u32 n_arrays; + u32 array_sz; + const __be32 *arrays; +}; + +/* + * Retrieve and validate the list of associativity arrays for drconf + * memory from the ibm,associativity-lookup-arrays property of the + * device tree.. + * + * The layout of the ibm,associativity-lookup-arrays property is a number N + * indicating the number of associativity arrays, followed by a number M + * indicating the size of each associativity array, followed by a list + * of N associativity arrays. + */ +static int of_get_assoc_arrays(struct device_node *memory, + struct assoc_arrays *aa) +{ + const __be32 *prop; + u32 len; + + prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len); + if (!prop || len < 2 * sizeof(unsigned int)) + return -1; + + aa->n_arrays = of_read_number(prop++, 1); + aa->array_sz = of_read_number(prop++, 1); + + /* Now that we know the number of arrays and size of each array, + * revalidate the size of the property read in. + */ + if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int)) + return -1; + + aa->arrays = prop; + return 0; +} + +/* + * This is like of_node_to_nid_single() for memory represented in the + * ibm,dynamic-reconfiguration-memory node. + */ +static int of_drconf_to_nid_single(struct of_drconf_cell *drmem, + struct assoc_arrays *aa) +{ + int default_nid = 0; + int nid = default_nid; + int index; + + if (min_common_depth > 0 && min_common_depth <= aa->array_sz && + !(drmem->flags & DRCONF_MEM_AI_INVALID) && + drmem->aa_index < aa->n_arrays) { + index = drmem->aa_index * aa->array_sz + min_common_depth - 1; + nid = of_read_number(&aa->arrays[index], 1); + + if (nid == 0xffff || nid >= MAX_NUMNODES) + nid = default_nid; + } + + return nid; +} + +/* + * Figure out to which domain a cpu belongs and stick it there. + * Return the id of the domain used. + */ +static int numa_setup_cpu(unsigned long lcpu) +{ + int nid = -1; + struct device_node *cpu; + + /* + * If a valid cpu-to-node mapping is already available, use it + * directly instead of querying the firmware, since it represents + * the most recent mapping notified to us by the platform (eg: VPHN). + */ + if ((nid = numa_cpu_lookup_table[lcpu]) >= 0) { + map_cpu_to_node(lcpu, nid); + return nid; + } + + cpu = of_get_cpu_node(lcpu, NULL); + + if (!cpu) { + WARN_ON(1); + if (cpu_present(lcpu)) + goto out_present; + else + goto out; + } + + nid = of_node_to_nid_single(cpu); + +out_present: + if (nid < 0 || !node_online(nid)) + nid = first_online_node; + + map_cpu_to_node(lcpu, nid); + of_node_put(cpu); +out: + return nid; +} + +static void verify_cpu_node_mapping(int cpu, int node) +{ + int base, sibling, i; + + /* Verify that all the threads in the core belong to the same node */ + base = cpu_first_thread_sibling(cpu); + + for (i = 0; i < threads_per_core; i++) { + sibling = base + i; + + if (sibling == cpu || cpu_is_offline(sibling)) + continue; + + if (cpu_to_node(sibling) != node) { + WARN(1, "CPU thread siblings %d and %d don't belong" + " to the same node!\n", cpu, sibling); + break; + } + } +} + +static int cpu_numa_callback(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + unsigned long lcpu = (unsigned long)hcpu; + int ret = NOTIFY_DONE, nid; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + nid = numa_setup_cpu(lcpu); + verify_cpu_node_mapping((int)lcpu, nid); + ret = NOTIFY_OK; + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + case CPU_DEAD_FROZEN: + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + unmap_cpu_from_node(lcpu); + ret = NOTIFY_OK; + break; +#endif + } + return ret; +} + +/* + * Check and possibly modify a memory region to enforce the memory limit. + * + * Returns the size the region should have to enforce the memory limit. + * This will either be the original value of size, a truncated value, + * or zero. If the returned value of size is 0 the region should be + * discarded as it lies wholly above the memory limit. + */ +static unsigned long __init numa_enforce_memory_limit(unsigned long start, + unsigned long size) +{ + /* + * We use memblock_end_of_DRAM() in here instead of memory_limit because + * we've already adjusted it for the limit and it takes care of + * having memory holes below the limit. Also, in the case of + * iommu_is_off, memory_limit is not set but is implicitly enforced. + */ + + if (start + size <= memblock_end_of_DRAM()) + return size; + + if (start >= memblock_end_of_DRAM()) + return 0; + + return memblock_end_of_DRAM() - start; +} + +/* + * Reads the counter for a given entry in + * linux,drconf-usable-memory property + */ +static inline int __init read_usm_ranges(const __be32 **usm) +{ + /* + * For each lmb in ibm,dynamic-memory a corresponding + * entry in linux,drconf-usable-memory property contains + * a counter followed by that many (base, size) duple. + * read the counter from linux,drconf-usable-memory + */ + return read_n_cells(n_mem_size_cells, usm); +} + +/* + * Extract NUMA information from the ibm,dynamic-reconfiguration-memory + * node. This assumes n_mem_{addr,size}_cells have been set. + */ +static void __init parse_drconf_memory(struct device_node *memory) +{ + const __be32 *uninitialized_var(dm), *usm; + unsigned int n, rc, ranges, is_kexec_kdump = 0; + unsigned long lmb_size, base, size, sz; + int nid; + struct assoc_arrays aa = { .arrays = NULL }; + + n = of_get_drconf_memory(memory, &dm); + if (!n) + return; + + lmb_size = of_get_lmb_size(memory); + if (!lmb_size) + return; + + rc = of_get_assoc_arrays(memory, &aa); + if (rc) + return; + + /* check if this is a kexec/kdump kernel */ + usm = of_get_usable_memory(memory); + if (usm != NULL) + is_kexec_kdump = 1; + + for (; n != 0; --n) { + struct of_drconf_cell drmem; + + read_drconf_cell(&drmem, &dm); + + /* skip this block if the reserved bit is set in flags (0x80) + or if the block is not assigned to this partition (0x8) */ + if ((drmem.flags & DRCONF_MEM_RESERVED) + || !(drmem.flags & DRCONF_MEM_ASSIGNED)) + continue; + + base = drmem.base_addr; + size = lmb_size; + ranges = 1; + + if (is_kexec_kdump) { + ranges = read_usm_ranges(&usm); + if (!ranges) /* there are no (base, size) duple */ + continue; + } + do { + if (is_kexec_kdump) { + base = read_n_cells(n_mem_addr_cells, &usm); + size = read_n_cells(n_mem_size_cells, &usm); + } + nid = of_drconf_to_nid_single(&drmem, &aa); + fake_numa_create_new_node( + ((base + size) >> PAGE_SHIFT), + &nid); + node_set_online(nid); + sz = numa_enforce_memory_limit(base, size); + if (sz) + memblock_set_node(base, sz, + &memblock.memory, nid); + } while (--ranges); + } +} + +static int __init parse_numa_properties(void) +{ + struct device_node *memory; + int default_nid = 0; + unsigned long i; + + if (numa_enabled == 0) { + printk(KERN_WARNING "NUMA disabled by user\n"); + return -1; + } + + min_common_depth = find_min_common_depth(); + + if (min_common_depth < 0) + return min_common_depth; + + dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth); + + /* + * Even though we connect cpus to numa domains later in SMP + * init, we need to know the node ids now. This is because + * each node to be onlined must have NODE_DATA etc backing it. + */ + for_each_present_cpu(i) { + struct device_node *cpu; + int nid; + + cpu = of_get_cpu_node(i, NULL); + BUG_ON(!cpu); + nid = of_node_to_nid_single(cpu); + of_node_put(cpu); + + /* + * Don't fall back to default_nid yet -- we will plug + * cpus into nodes once the memory scan has discovered + * the topology. + */ + if (nid < 0) + continue; + node_set_online(nid); + } + + get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells); + + for_each_node_by_type(memory, "memory") { + unsigned long start; + unsigned long size; + int nid; + int ranges; + const __be32 *memcell_buf; + unsigned int len; + + memcell_buf = of_get_property(memory, + "linux,usable-memory", &len); + if (!memcell_buf || len <= 0) + memcell_buf = of_get_property(memory, "reg", &len); + if (!memcell_buf || len <= 0) + continue; + + /* ranges in cell */ + ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells); +new_range: + /* these are order-sensitive, and modify the buffer pointer */ + start = read_n_cells(n_mem_addr_cells, &memcell_buf); + size = read_n_cells(n_mem_size_cells, &memcell_buf); + + /* + * Assumption: either all memory nodes or none will + * have associativity properties. If none, then + * everything goes to default_nid. + */ + nid = of_node_to_nid_single(memory); + if (nid < 0) + nid = default_nid; + + fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid); + node_set_online(nid); + + if (!(size = numa_enforce_memory_limit(start, size))) { + if (--ranges) + goto new_range; + else + continue; + } + + memblock_set_node(start, size, &memblock.memory, nid); + + if (--ranges) + goto new_range; + } + + /* + * Now do the same thing for each MEMBLOCK listed in the + * ibm,dynamic-memory property in the + * ibm,dynamic-reconfiguration-memory node. + */ + memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (memory) + parse_drconf_memory(memory); + + return 0; +} + +static void __init setup_nonnuma(void) +{ + unsigned long top_of_ram = memblock_end_of_DRAM(); + unsigned long total_ram = memblock_phys_mem_size(); + unsigned long start_pfn, end_pfn; + unsigned int nid = 0; + struct memblock_region *reg; + + printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", + top_of_ram, total_ram); + printk(KERN_DEBUG "Memory hole size: %ldMB\n", + (top_of_ram - total_ram) >> 20); + + for_each_memblock(memory, reg) { + start_pfn = memblock_region_memory_base_pfn(reg); + end_pfn = memblock_region_memory_end_pfn(reg); + + fake_numa_create_new_node(end_pfn, &nid); + memblock_set_node(PFN_PHYS(start_pfn), + PFN_PHYS(end_pfn - start_pfn), + &memblock.memory, nid); + node_set_online(nid); + } +} + +void __init dump_numa_cpu_topology(void) +{ + unsigned int node; + unsigned int cpu, count; + + if (min_common_depth == -1 || !numa_enabled) + return; + + for_each_online_node(node) { + printk(KERN_DEBUG "Node %d CPUs:", node); + + count = 0; + /* + * If we used a CPU iterator here we would miss printing + * the holes in the cpumap. + */ + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { + if (cpumask_test_cpu(cpu, + node_to_cpumask_map[node])) { + if (count == 0) + printk(" %u", cpu); + ++count; + } else { + if (count > 1) + printk("-%u", cpu - 1); + count = 0; + } + } + + if (count > 1) + printk("-%u", nr_cpu_ids - 1); + printk("\n"); + } +} + +static void __init dump_numa_memory_topology(void) +{ + unsigned int node; + unsigned int count; + + if (min_common_depth == -1 || !numa_enabled) + return; + + for_each_online_node(node) { + unsigned long i; + + printk(KERN_DEBUG "Node %d Memory:", node); + + count = 0; + + for (i = 0; i < memblock_end_of_DRAM(); + i += (1 << SECTION_SIZE_BITS)) { + if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) { + if (count == 0) + printk(" 0x%lx", i); + ++count; + } else { + if (count > 0) + printk("-0x%lx", i); + count = 0; + } + } + + if (count > 0) + printk("-0x%lx", i); + printk("\n"); + } +} + +static struct notifier_block ppc64_numa_nb = { + .notifier_call = cpu_numa_callback, + .priority = 1 /* Must run before sched domains notifier. */ +}; + +/* Initialize NODE_DATA for a node on the local memory */ +static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) +{ + u64 spanned_pages = end_pfn - start_pfn; + const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES); + u64 nd_pa; + void *nd; + int tnid; + + if (spanned_pages) + pr_info("Initmem setup node %d [mem %#010Lx-%#010Lx]\n", + nid, start_pfn << PAGE_SHIFT, + (end_pfn << PAGE_SHIFT) - 1); + else + pr_info("Initmem setup node %d\n", nid); + + nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); + nd = __va(nd_pa); + + /* report and initialize */ + pr_info(" NODE_DATA [mem %#010Lx-%#010Lx]\n", + nd_pa, nd_pa + nd_size - 1); + tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); + if (tnid != nid) + pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid); + + node_data[nid] = nd; + memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); + NODE_DATA(nid)->node_id = nid; + NODE_DATA(nid)->node_start_pfn = start_pfn; + NODE_DATA(nid)->node_spanned_pages = spanned_pages; +} + +void __init initmem_init(void) +{ + int nid, cpu; + + max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; + max_pfn = max_low_pfn; + + if (parse_numa_properties()) + setup_nonnuma(); + else + dump_numa_memory_topology(); + + memblock_dump_all(); + + /* + * Reduce the possible NUMA nodes to the online NUMA nodes, + * since we do not support node hotplug. This ensures that we + * lower the maximum NUMA node ID to what is actually present. + */ + nodes_and(node_possible_map, node_possible_map, node_online_map); + + for_each_online_node(nid) { + unsigned long start_pfn, end_pfn; + + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + setup_node_data(nid, start_pfn, end_pfn); + sparse_memory_present_with_active_regions(nid); + } + + sparse_init(); + + setup_node_to_cpumask_map(); + + reset_numa_cpu_lookup_table(); + register_cpu_notifier(&ppc64_numa_nb); + /* + * We need the numa_cpu_lookup_table to be accurate for all CPUs, + * even before we online them, so that we can use cpu_to_{node,mem} + * early in boot, cf. smp_prepare_cpus(). + */ + for_each_present_cpu(cpu) { + numa_setup_cpu((unsigned long)cpu); + } +} + +static int __init early_numa(char *p) +{ + if (!p) + return 0; + + if (strstr(p, "off")) + numa_enabled = 0; + + if (strstr(p, "debug")) + numa_debug = 1; + + p = strstr(p, "fake="); + if (p) + cmdline = p + strlen("fake="); + + return 0; +} +early_param("numa", early_numa); + +static bool topology_updates_enabled = true; + +static int __init early_topology_updates(char *p) +{ + if (!p) + return 0; + + if (!strcmp(p, "off")) { + pr_info("Disabling topology updates\n"); + topology_updates_enabled = false; + } + + return 0; +} +early_param("topology_updates", early_topology_updates); + +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * Find the node associated with a hot added memory section for + * memory represented in the device tree by the property + * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory. + */ +static int hot_add_drconf_scn_to_nid(struct device_node *memory, + unsigned long scn_addr) +{ + const __be32 *dm; + unsigned int drconf_cell_cnt, rc; + unsigned long lmb_size; + struct assoc_arrays aa; + int nid = -1; + + drconf_cell_cnt = of_get_drconf_memory(memory, &dm); + if (!drconf_cell_cnt) + return -1; + + lmb_size = of_get_lmb_size(memory); + if (!lmb_size) + return -1; + + rc = of_get_assoc_arrays(memory, &aa); + if (rc) + return -1; + + for (; drconf_cell_cnt != 0; --drconf_cell_cnt) { + struct of_drconf_cell drmem; + + read_drconf_cell(&drmem, &dm); + + /* skip this block if it is reserved or not assigned to + * this partition */ + if ((drmem.flags & DRCONF_MEM_RESERVED) + || !(drmem.flags & DRCONF_MEM_ASSIGNED)) + continue; + + if ((scn_addr < drmem.base_addr) + || (scn_addr >= (drmem.base_addr + lmb_size))) + continue; + + nid = of_drconf_to_nid_single(&drmem, &aa); + break; + } + + return nid; +} + +/* + * Find the node associated with a hot added memory section for memory + * represented in the device tree as a node (i.e. memory@XXXX) for + * each memblock. + */ +static int hot_add_node_scn_to_nid(unsigned long scn_addr) +{ + struct device_node *memory; + int nid = -1; + + for_each_node_by_type(memory, "memory") { + unsigned long start, size; + int ranges; + const __be32 *memcell_buf; + unsigned int len; + + memcell_buf = of_get_property(memory, "reg", &len); + if (!memcell_buf || len <= 0) + continue; + + /* ranges in cell */ + ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells); + + while (ranges--) { + start = read_n_cells(n_mem_addr_cells, &memcell_buf); + size = read_n_cells(n_mem_size_cells, &memcell_buf); + + if ((scn_addr < start) || (scn_addr >= (start + size))) + continue; + + nid = of_node_to_nid_single(memory); + break; + } + + if (nid >= 0) + break; + } + + of_node_put(memory); + + return nid; +} + +/* + * Find the node associated with a hot added memory section. Section + * corresponds to a SPARSEMEM section, not an MEMBLOCK. It is assumed that + * sections are fully contained within a single MEMBLOCK. + */ +int hot_add_scn_to_nid(unsigned long scn_addr) +{ + struct device_node *memory = NULL; + int nid, found = 0; + + if (!numa_enabled || (min_common_depth < 0)) + return first_online_node; + + memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (memory) { + nid = hot_add_drconf_scn_to_nid(memory, scn_addr); + of_node_put(memory); + } else { + nid = hot_add_node_scn_to_nid(scn_addr); + } + + if (nid < 0 || !node_online(nid)) + nid = first_online_node; + + if (NODE_DATA(nid)->node_spanned_pages) + return nid; + + for_each_online_node(nid) { + if (NODE_DATA(nid)->node_spanned_pages) { + found = 1; + break; + } + } + + BUG_ON(!found); + return nid; +} + +static u64 hot_add_drconf_memory_max(void) +{ + struct device_node *memory = NULL; + unsigned int drconf_cell_cnt = 0; + u64 lmb_size = 0; + const __be32 *dm = NULL; + + memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (memory) { + drconf_cell_cnt = of_get_drconf_memory(memory, &dm); + lmb_size = of_get_lmb_size(memory); + of_node_put(memory); + } + return lmb_size * drconf_cell_cnt; +} + +/* + * memory_hotplug_max - return max address of memory that may be added + * + * This is currently only used on systems that support drconfig memory + * hotplug. + */ +u64 memory_hotplug_max(void) +{ + return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM()); +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + +/* Virtual Processor Home Node (VPHN) support */ +#ifdef CONFIG_PPC_SPLPAR + +#include "vphn.h" + +struct topology_update_data { + struct topology_update_data *next; + unsigned int cpu; + int old_nid; + int new_nid; +}; + +static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS]; +static cpumask_t cpu_associativity_changes_mask; +static int vphn_enabled; +static int prrn_enabled; +static void reset_topology_timer(void); + +/* + * Store the current values of the associativity change counters in the + * hypervisor. + */ +static void setup_cpu_associativity_change_counters(void) +{ + int cpu; + + /* The VPHN feature supports a maximum of 8 reference points */ + BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8); + + for_each_possible_cpu(cpu) { + int i; + u8 *counts = vphn_cpu_change_counts[cpu]; + volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts; + + for (i = 0; i < distance_ref_points_depth; i++) + counts[i] = hypervisor_counts[i]; + } +} + +/* + * The hypervisor maintains a set of 8 associativity change counters in + * the VPA of each cpu that correspond to the associativity levels in the + * ibm,associativity-reference-points property. When an associativity + * level changes, the corresponding counter is incremented. + * + * Set a bit in cpu_associativity_changes_mask for each cpu whose home + * node associativity levels have changed. + * + * Returns the number of cpus with unhandled associativity changes. + */ +static int update_cpu_associativity_changes_mask(void) +{ + int cpu; + cpumask_t *changes = &cpu_associativity_changes_mask; + + for_each_possible_cpu(cpu) { + int i, changed = 0; + u8 *counts = vphn_cpu_change_counts[cpu]; + volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts; + + for (i = 0; i < distance_ref_points_depth; i++) { + if (hypervisor_counts[i] != counts[i]) { + counts[i] = hypervisor_counts[i]; + changed = 1; + } + } + if (changed) { + cpumask_or(changes, changes, cpu_sibling_mask(cpu)); + cpu = cpu_last_thread_sibling(cpu); + } + } + + return cpumask_weight(changes); +} + +/* + * Retrieve the new associativity information for a virtual processor's + * home node. + */ +static long hcall_vphn(unsigned long cpu, __be32 *associativity) +{ + long rc; + long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; + u64 flags = 1; + int hwcpu = get_hard_smp_processor_id(cpu); + + rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu); + vphn_unpack_associativity(retbuf, associativity); + + return rc; +} + +static long vphn_get_associativity(unsigned long cpu, + __be32 *associativity) +{ + long rc; + + rc = hcall_vphn(cpu, associativity); + + switch (rc) { + case H_FUNCTION: + printk(KERN_INFO + "VPHN is not supported. Disabling polling...\n"); + stop_topology_update(); + break; + case H_HARDWARE: + printk(KERN_ERR + "hcall_vphn() experienced a hardware fault " + "preventing VPHN. Disabling polling...\n"); + stop_topology_update(); + } + + return rc; +} + +/* + * Update the CPU maps and sysfs entries for a single CPU when its NUMA + * characteristics change. This function doesn't perform any locking and is + * only safe to call from stop_machine(). + */ +static int update_cpu_topology(void *data) +{ + struct topology_update_data *update; + unsigned long cpu; + + if (!data) + return -EINVAL; + + cpu = smp_processor_id(); + + for (update = data; update; update = update->next) { + int new_nid = update->new_nid; + if (cpu != update->cpu) + continue; + + unmap_cpu_from_node(cpu); + map_cpu_to_node(cpu, new_nid); + set_cpu_numa_node(cpu, new_nid); + set_cpu_numa_mem(cpu, local_memory_node(new_nid)); + vdso_getcpu_init(); + } + + return 0; +} + +static int update_lookup_table(void *data) +{ + struct topology_update_data *update; + + if (!data) + return -EINVAL; + + /* + * Upon topology update, the numa-cpu lookup table needs to be updated + * for all threads in the core, including offline CPUs, to ensure that + * future hotplug operations respect the cpu-to-node associativity + * properly. + */ + for (update = data; update; update = update->next) { + int nid, base, j; + + nid = update->new_nid; + base = cpu_first_thread_sibling(update->cpu); + + for (j = 0; j < threads_per_core; j++) { + update_numa_cpu_lookup_table(base + j, nid); + } + } + + return 0; +} + +/* + * Update the node maps and sysfs entries for each cpu whose home node + * has changed. Returns 1 when the topology has changed, and 0 otherwise. + */ +int arch_update_cpu_topology(void) +{ + unsigned int cpu, sibling, changed = 0; + struct topology_update_data *updates, *ud; + __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; + cpumask_t updated_cpus; + struct device *dev; + int weight, new_nid, i = 0; + + if (!prrn_enabled && !vphn_enabled) + return 0; + + weight = cpumask_weight(&cpu_associativity_changes_mask); + if (!weight) + return 0; + + updates = kzalloc(weight * (sizeof(*updates)), GFP_KERNEL); + if (!updates) + return 0; + + cpumask_clear(&updated_cpus); + + for_each_cpu(cpu, &cpu_associativity_changes_mask) { + /* + * If siblings aren't flagged for changes, updates list + * will be too short. Skip on this update and set for next + * update. + */ + if (!cpumask_subset(cpu_sibling_mask(cpu), + &cpu_associativity_changes_mask)) { + pr_info("Sibling bits not set for associativity " + "change, cpu%d\n", cpu); + cpumask_or(&cpu_associativity_changes_mask, + &cpu_associativity_changes_mask, + cpu_sibling_mask(cpu)); + cpu = cpu_last_thread_sibling(cpu); + continue; + } + + /* Use associativity from first thread for all siblings */ + vphn_get_associativity(cpu, associativity); + new_nid = associativity_to_nid(associativity); + if (new_nid < 0 || !node_online(new_nid)) + new_nid = first_online_node; + + if (new_nid == numa_cpu_lookup_table[cpu]) { + cpumask_andnot(&cpu_associativity_changes_mask, + &cpu_associativity_changes_mask, + cpu_sibling_mask(cpu)); + cpu = cpu_last_thread_sibling(cpu); + continue; + } + + for_each_cpu(sibling, cpu_sibling_mask(cpu)) { + ud = &updates[i++]; + ud->cpu = sibling; + ud->new_nid = new_nid; + ud->old_nid = numa_cpu_lookup_table[sibling]; + cpumask_set_cpu(sibling, &updated_cpus); + if (i < weight) + ud->next = &updates[i]; + } + cpu = cpu_last_thread_sibling(cpu); + } + + pr_debug("Topology update for the following CPUs:\n"); + if (cpumask_weight(&updated_cpus)) { + for (ud = &updates[0]; ud; ud = ud->next) { + pr_debug("cpu %d moving from node %d " + "to %d\n", ud->cpu, + ud->old_nid, ud->new_nid); + } + } + + /* + * In cases where we have nothing to update (because the updates list + * is too short or because the new topology is same as the old one), + * skip invoking update_cpu_topology() via stop-machine(). This is + * necessary (and not just a fast-path optimization) since stop-machine + * can end up electing a random CPU to run update_cpu_topology(), and + * thus trick us into setting up incorrect cpu-node mappings (since + * 'updates' is kzalloc()'ed). + * + * And for the similar reason, we will skip all the following updating. + */ + if (!cpumask_weight(&updated_cpus)) + goto out; + + stop_machine(update_cpu_topology, &updates[0], &updated_cpus); + + /* + * Update the numa-cpu lookup table with the new mappings, even for + * offline CPUs. It is best to perform this update from the stop- + * machine context. + */ + stop_machine(update_lookup_table, &updates[0], + cpumask_of(raw_smp_processor_id())); + + for (ud = &updates[0]; ud; ud = ud->next) { + unregister_cpu_under_node(ud->cpu, ud->old_nid); + register_cpu_under_node(ud->cpu, ud->new_nid); + + dev = get_cpu_device(ud->cpu); + if (dev) + kobject_uevent(&dev->kobj, KOBJ_CHANGE); + cpumask_clear_cpu(ud->cpu, &cpu_associativity_changes_mask); + changed = 1; + } + +out: + kfree(updates); + return changed; +} + +static void topology_work_fn(struct work_struct *work) +{ + rebuild_sched_domains(); +} +static DECLARE_WORK(topology_work, topology_work_fn); + +static void topology_schedule_update(void) +{ + schedule_work(&topology_work); +} + +static void topology_timer_fn(unsigned long ignored) +{ + if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask)) + topology_schedule_update(); + else if (vphn_enabled) { + if (update_cpu_associativity_changes_mask() > 0) + topology_schedule_update(); + reset_topology_timer(); + } +} +static struct timer_list topology_timer = + TIMER_INITIALIZER(topology_timer_fn, 0, 0); + +static void reset_topology_timer(void) +{ + topology_timer.data = 0; + topology_timer.expires = jiffies + 60 * HZ; + mod_timer(&topology_timer, topology_timer.expires); +} + +#ifdef CONFIG_SMP + +static void stage_topology_update(int core_id) +{ + cpumask_or(&cpu_associativity_changes_mask, + &cpu_associativity_changes_mask, cpu_sibling_mask(core_id)); + reset_topology_timer(); +} + +static int dt_update_callback(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct of_reconfig_data *update = data; + int rc = NOTIFY_DONE; + + switch (action) { + case OF_RECONFIG_UPDATE_PROPERTY: + if (!of_prop_cmp(update->dn->type, "cpu") && + !of_prop_cmp(update->prop->name, "ibm,associativity")) { + u32 core_id; + of_property_read_u32(update->dn, "reg", &core_id); + stage_topology_update(core_id); + rc = NOTIFY_OK; + } + break; + } + + return rc; +} + +static struct notifier_block dt_update_nb = { + .notifier_call = dt_update_callback, +}; + +#endif + +/* + * Start polling for associativity changes. + */ +int start_topology_update(void) +{ + int rc = 0; + + if (firmware_has_feature(FW_FEATURE_PRRN)) { + if (!prrn_enabled) { + prrn_enabled = 1; + vphn_enabled = 0; +#ifdef CONFIG_SMP + rc = of_reconfig_notifier_register(&dt_update_nb); +#endif + } + } else if (firmware_has_feature(FW_FEATURE_VPHN) && + lppaca_shared_proc(get_lppaca())) { + if (!vphn_enabled) { + prrn_enabled = 0; + vphn_enabled = 1; + setup_cpu_associativity_change_counters(); + init_timer_deferrable(&topology_timer); + reset_topology_timer(); + } + } + + return rc; +} + +/* + * Disable polling for VPHN associativity changes. + */ +int stop_topology_update(void) +{ + int rc = 0; + + if (prrn_enabled) { + prrn_enabled = 0; +#ifdef CONFIG_SMP + rc = of_reconfig_notifier_unregister(&dt_update_nb); +#endif + } else if (vphn_enabled) { + vphn_enabled = 0; + rc = del_timer_sync(&topology_timer); + } + + return rc; +} + +int prrn_is_enabled(void) +{ + return prrn_enabled; +} + +static int topology_read(struct seq_file *file, void *v) +{ + if (vphn_enabled || prrn_enabled) + seq_puts(file, "on\n"); + else + seq_puts(file, "off\n"); + + return 0; +} + +static int topology_open(struct inode *inode, struct file *file) +{ + return single_open(file, topology_read, NULL); +} + +static ssize_t topology_write(struct file *file, const char __user *buf, + size_t count, loff_t *off) +{ + char kbuf[4]; /* "on" or "off" plus null. */ + int read_len; + + read_len = count < 3 ? count : 3; + if (copy_from_user(kbuf, buf, read_len)) + return -EINVAL; + + kbuf[read_len] = '\0'; + + if (!strncmp(kbuf, "on", 2)) + start_topology_update(); + else if (!strncmp(kbuf, "off", 3)) + stop_topology_update(); + else + return -EINVAL; + + return count; +} + +static const struct file_operations topology_ops = { + .read = seq_read, + .write = topology_write, + .open = topology_open, + .release = single_release +}; + +static int topology_update_init(void) +{ + /* Do not poll for changes if disabled at boot */ + if (topology_updates_enabled) + start_topology_update(); + + if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops)) + return -ENOMEM; + + return 0; +} +device_initcall(topology_update_init); +#endif /* CONFIG_PPC_SPLPAR */ diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c new file mode 100644 index 000000000..83dfcb55f --- /dev/null +++ b/arch/powerpc/mm/pgtable.c @@ -0,0 +1,241 @@ +/* + * This file contains common routines for dealing with free of page tables + * Along with common page table handling code + * + * Derived from arch/powerpc/mm/tlb_64.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Dave Engebretsen <engebret@us.ibm.com> + * Rework for PPC64 port. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/gfp.h> +#include <linux/mm.h> +#include <linux/percpu.h> +#include <linux/hardirq.h> +#include <linux/hugetlb.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/tlb.h> + +static inline int is_exec_fault(void) +{ + return current->thread.regs && TRAP(current->thread.regs) == 0x400; +} + +/* We only try to do i/d cache coherency on stuff that looks like + * reasonably "normal" PTEs. We currently require a PTE to be present + * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE. We also only do that + * on userspace PTEs + */ +static inline int pte_looks_normal(pte_t pte) +{ + return (pte_val(pte) & + (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) == + (_PAGE_PRESENT | _PAGE_USER); +} + +static struct page *maybe_pte_to_page(pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + struct page *page; + + if (unlikely(!pfn_valid(pfn))) + return NULL; + page = pfn_to_page(pfn); + if (PageReserved(page)) + return NULL; + return page; +} + +#if defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 + +/* Server-style MMU handles coherency when hashing if HW exec permission + * is supposed per page (currently 64-bit only). If not, then, we always + * flush the cache for valid PTEs in set_pte. Embedded CPU without HW exec + * support falls into the same category. + */ + +static pte_t set_pte_filter(pte_t pte) +{ + pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); + if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || + cpu_has_feature(CPU_FTR_NOEXECUTE))) { + struct page *pg = maybe_pte_to_page(pte); + if (!pg) + return pte; + if (!test_bit(PG_arch_1, &pg->flags)) { + flush_dcache_icache_page(pg); + set_bit(PG_arch_1, &pg->flags); + } + } + return pte; +} + +static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, + int dirty) +{ + return pte; +} + +#else /* defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 */ + +/* Embedded type MMU with HW exec support. This is a bit more complicated + * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so + * instead we "filter out" the exec permission for non clean pages. + */ +static pte_t set_pte_filter(pte_t pte) +{ + struct page *pg; + + /* No exec permission in the first place, move on */ + if (!(pte_val(pte) & _PAGE_EXEC) || !pte_looks_normal(pte)) + return pte; + + /* If you set _PAGE_EXEC on weird pages you're on your own */ + pg = maybe_pte_to_page(pte); + if (unlikely(!pg)) + return pte; + + /* If the page clean, we move on */ + if (test_bit(PG_arch_1, &pg->flags)) + return pte; + + /* If it's an exec fault, we flush the cache and make it clean */ + if (is_exec_fault()) { + flush_dcache_icache_page(pg); + set_bit(PG_arch_1, &pg->flags); + return pte; + } + + /* Else, we filter out _PAGE_EXEC */ + return __pte(pte_val(pte) & ~_PAGE_EXEC); +} + +static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, + int dirty) +{ + struct page *pg; + + /* So here, we only care about exec faults, as we use them + * to recover lost _PAGE_EXEC and perform I$/D$ coherency + * if necessary. Also if _PAGE_EXEC is already set, same deal, + * we just bail out + */ + if (dirty || (pte_val(pte) & _PAGE_EXEC) || !is_exec_fault()) + return pte; + +#ifdef CONFIG_DEBUG_VM + /* So this is an exec fault, _PAGE_EXEC is not set. If it was + * an error we would have bailed out earlier in do_page_fault() + * but let's make sure of it + */ + if (WARN_ON(!(vma->vm_flags & VM_EXEC))) + return pte; +#endif /* CONFIG_DEBUG_VM */ + + /* If you set _PAGE_EXEC on weird pages you're on your own */ + pg = maybe_pte_to_page(pte); + if (unlikely(!pg)) + goto bail; + + /* If the page is already clean, we move on */ + if (test_bit(PG_arch_1, &pg->flags)) + goto bail; + + /* Clean the page and set PG_arch_1 */ + flush_dcache_icache_page(pg); + set_bit(PG_arch_1, &pg->flags); + + bail: + return __pte(pte_val(pte) | _PAGE_EXEC); +} + +#endif /* !(defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0) */ + +/* + * set_pte stores a linux PTE into the linux page table. + */ +void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte) +{ + /* + * When handling numa faults, we already have the pte marked + * _PAGE_PRESENT, but we can be sure that it is not in hpte. + * Hence we can use set_pte_at for them. + */ + VM_WARN_ON((pte_val(*ptep) & (_PAGE_PRESENT | _PAGE_USER)) == + (_PAGE_PRESENT | _PAGE_USER)); + + /* Note: mm->context.id might not yet have been assigned as + * this context might not have been activated yet when this + * is called. + */ + pte = set_pte_filter(pte); + + /* Perform the setting of the PTE */ + __set_pte_at(mm, addr, ptep, pte, 0); +} + +/* + * This is called when relaxing access to a PTE. It's also called in the page + * fault path when we don't hit any of the major fault cases, ie, a minor + * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have + * handled those two for us, we additionally deal with missing execute + * permission here on some processors + */ +int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, pte_t entry, int dirty) +{ + int changed; + entry = set_access_flags_filter(entry, vma, dirty); + changed = !pte_same(*(ptep), entry); + if (changed) { + if (!is_vm_hugetlb_page(vma)) + assert_pte_locked(vma->vm_mm, address); + __ptep_set_access_flags(ptep, entry); + flush_tlb_page_nohash(vma, address); + } + return changed; +} + +#ifdef CONFIG_DEBUG_VM +void assert_pte_locked(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + if (mm == &init_mm) + return; + pgd = mm->pgd + pgd_index(addr); + BUG_ON(pgd_none(*pgd)); + pud = pud_offset(pgd, addr); + BUG_ON(pud_none(*pud)); + pmd = pmd_offset(pud, addr); + /* + * khugepaged to collapse normal pages to hugepage, first set + * pmd to none to force page fault/gup to take mmap_sem. After + * pmd is set to none, we do a pte_clear which does this assertion + * so if we find pmd none, return. + */ + if (pmd_none(*pmd)) + return; + BUG_ON(!pmd_present(*pmd)); + assert_spin_locked(pte_lockptr(mm, pmd)); +} +#endif /* CONFIG_DEBUG_VM */ + diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c new file mode 100644 index 000000000..7692d1bb1 --- /dev/null +++ b/arch/powerpc/mm/pgtable_32.c @@ -0,0 +1,466 @@ +/* + * This file contains the routines setting up the linux page tables. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/highmem.h> +#include <linux/memblock.h> +#include <linux/slab.h> + +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/fixmap.h> +#include <asm/io.h> +#include <asm/setup.h> + +#include "mmu_decl.h" + +unsigned long ioremap_base; +unsigned long ioremap_bot; +EXPORT_SYMBOL(ioremap_bot); /* aka VMALLOC_END */ + +#ifdef CONFIG_6xx +#define HAVE_BATS 1 +#endif + +#if defined(CONFIG_FSL_BOOKE) +#define HAVE_TLBCAM 1 +#endif + +extern char etext[], _stext[]; + +#ifdef HAVE_BATS +extern phys_addr_t v_mapped_by_bats(unsigned long va); +extern unsigned long p_mapped_by_bats(phys_addr_t pa); +#else /* !HAVE_BATS */ +#define v_mapped_by_bats(x) (0UL) +#define p_mapped_by_bats(x) (0UL) +#endif /* HAVE_BATS */ + +#ifdef HAVE_TLBCAM +extern phys_addr_t v_mapped_by_tlbcam(unsigned long va); +extern unsigned long p_mapped_by_tlbcam(phys_addr_t pa); +#else /* !HAVE_TLBCAM */ +#define v_mapped_by_tlbcam(x) (0UL) +#define p_mapped_by_tlbcam(x) (0UL) +#endif /* HAVE_TLBCAM */ + +#define PGDIR_ORDER (32 + PGD_T_LOG2 - PGDIR_SHIFT) + +#ifndef CONFIG_PPC_4K_PAGES +static struct kmem_cache *pgtable_cache; + +void pgtable_cache_init(void) +{ + pgtable_cache = kmem_cache_create("PGDIR cache", 1 << PGDIR_ORDER, + 1 << PGDIR_ORDER, 0, NULL); + if (pgtable_cache == NULL) + panic("Couldn't allocate pgtable caches"); +} +#endif + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + pgd_t *ret; + + /* pgdir take page or two with 4K pages and a page fraction otherwise */ +#ifndef CONFIG_PPC_4K_PAGES + ret = kmem_cache_alloc(pgtable_cache, GFP_KERNEL | __GFP_ZERO); +#else + ret = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, + PGDIR_ORDER - PAGE_SHIFT); +#endif + return ret; +} + +void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ +#ifndef CONFIG_PPC_4K_PAGES + kmem_cache_free(pgtable_cache, (void *)pgd); +#else + free_pages((unsigned long)pgd, PGDIR_ORDER - PAGE_SHIFT); +#endif +} + +__init_refok pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +{ + pte_t *pte; + + if (slab_is_available()) { + pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); + } else { + pte = __va(memblock_alloc(PAGE_SIZE, PAGE_SIZE)); + if (pte) + clear_page(pte); + } + return pte; +} + +pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +{ + struct page *ptepage; + + gfp_t flags = GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO; + + ptepage = alloc_pages(flags, 0); + if (!ptepage) + return NULL; + if (!pgtable_page_ctor(ptepage)) { + __free_page(ptepage); + return NULL; + } + return ptepage; +} + +void __iomem * +ioremap(phys_addr_t addr, unsigned long size) +{ + return __ioremap_caller(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap); + +void __iomem * +ioremap_wc(phys_addr_t addr, unsigned long size) +{ + return __ioremap_caller(addr, size, _PAGE_NO_CACHE, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_wc); + +void __iomem * +ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) +{ + /* writeable implies dirty for kernel addresses */ + if ((flags & (_PAGE_RW | _PAGE_RO)) != _PAGE_RO) + flags |= _PAGE_DIRTY | _PAGE_HWWRITE; + + /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ + flags &= ~(_PAGE_USER | _PAGE_EXEC); + +#ifdef _PAGE_BAP_SR + /* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format + * which means that we just cleared supervisor access... oops ;-) This + * restores it + */ + flags |= _PAGE_BAP_SR; +#endif + + return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_prot); + +void __iomem * +__ioremap(phys_addr_t addr, unsigned long size, unsigned long flags) +{ + return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); +} + +void __iomem * +__ioremap_caller(phys_addr_t addr, unsigned long size, unsigned long flags, + void *caller) +{ + unsigned long v, i; + phys_addr_t p; + int err; + + /* Make sure we have the base flags */ + if ((flags & _PAGE_PRESENT) == 0) + flags |= pgprot_val(PAGE_KERNEL); + + /* Non-cacheable page cannot be coherent */ + if (flags & _PAGE_NO_CACHE) + flags &= ~_PAGE_COHERENT; + + /* + * Choose an address to map it to. + * Once the vmalloc system is running, we use it. + * Before then, we use space going down from ioremap_base + * (ioremap_bot records where we're up to). + */ + p = addr & PAGE_MASK; + size = PAGE_ALIGN(addr + size) - p; + + /* + * If the address lies within the first 16 MB, assume it's in ISA + * memory space + */ + if (p < 16*1024*1024) + p += _ISA_MEM_BASE; + +#ifndef CONFIG_CRASH_DUMP + /* + * Don't allow anybody to remap normal RAM that we're using. + * mem_init() sets high_memory so only do the check after that. + */ + if (slab_is_available() && (p < virt_to_phys(high_memory)) && + !(__allow_ioremap_reserved && memblock_is_region_reserved(p, size))) { + printk("__ioremap(): phys addr 0x%llx is RAM lr %ps\n", + (unsigned long long)p, __builtin_return_address(0)); + return NULL; + } +#endif + + if (size == 0) + return NULL; + + /* + * Is it already mapped? Perhaps overlapped by a previous + * BAT mapping. If the whole area is mapped then we're done, + * otherwise remap it since we want to keep the virt addrs for + * each request contiguous. + * + * We make the assumption here that if the bottom and top + * of the range we want are mapped then it's mapped to the + * same virt address (and this is contiguous). + * -- Cort + */ + if ((v = p_mapped_by_bats(p)) /*&& p_mapped_by_bats(p+size-1)*/ ) + goto out; + + if ((v = p_mapped_by_tlbcam(p))) + goto out; + + if (slab_is_available()) { + struct vm_struct *area; + area = get_vm_area_caller(size, VM_IOREMAP, caller); + if (area == 0) + return NULL; + area->phys_addr = p; + v = (unsigned long) area->addr; + } else { + v = (ioremap_bot -= size); + } + + /* + * Should check if it is a candidate for a BAT mapping + */ + + err = 0; + for (i = 0; i < size && err == 0; i += PAGE_SIZE) + err = map_page(v+i, p+i, flags); + if (err) { + if (slab_is_available()) + vunmap((void *)v); + return NULL; + } + +out: + return (void __iomem *) (v + ((unsigned long)addr & ~PAGE_MASK)); +} +EXPORT_SYMBOL(__ioremap); + +void iounmap(volatile void __iomem *addr) +{ + /* + * If mapped by BATs then there is nothing to do. + * Calling vfree() generates a benign warning. + */ + if (v_mapped_by_bats((unsigned long)addr)) return; + + if (addr > high_memory && (unsigned long) addr < ioremap_bot) + vunmap((void *) (PAGE_MASK & (unsigned long)addr)); +} +EXPORT_SYMBOL(iounmap); + +int map_page(unsigned long va, phys_addr_t pa, int flags) +{ + pmd_t *pd; + pte_t *pg; + int err = -ENOMEM; + + /* Use upper 10 bits of VA to index the first level map */ + pd = pmd_offset(pud_offset(pgd_offset_k(va), va), va); + /* Use middle 10 bits of VA to index the second-level map */ + pg = pte_alloc_kernel(pd, va); + if (pg != 0) { + err = 0; + /* The PTE should never be already set nor present in the + * hash table + */ + BUG_ON((pte_val(*pg) & (_PAGE_PRESENT | _PAGE_HASHPTE)) && + flags); + set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, + __pgprot(flags))); + } + smp_wmb(); + return err; +} + +/* + * Map in a chunk of physical memory starting at start. + */ +void __init __mapin_ram_chunk(unsigned long offset, unsigned long top) +{ + unsigned long v, s, f; + phys_addr_t p; + int ktext; + + s = offset; + v = PAGE_OFFSET + s; + p = memstart_addr + s; + for (; s < top; s += PAGE_SIZE) { + ktext = ((char *) v >= _stext && (char *) v < etext); + f = ktext ? pgprot_val(PAGE_KERNEL_TEXT) : pgprot_val(PAGE_KERNEL); + map_page(v, p, f); +#ifdef CONFIG_PPC_STD_MMU_32 + if (ktext) + hash_preload(&init_mm, v, 0, 0x300); +#endif + v += PAGE_SIZE; + p += PAGE_SIZE; + } +} + +void __init mapin_ram(void) +{ + unsigned long s, top; + +#ifndef CONFIG_WII + top = total_lowmem; + s = mmu_mapin_ram(top); + __mapin_ram_chunk(s, top); +#else + if (!wii_hole_size) { + s = mmu_mapin_ram(total_lowmem); + __mapin_ram_chunk(s, total_lowmem); + } else { + top = wii_hole_start; + s = mmu_mapin_ram(top); + __mapin_ram_chunk(s, top); + + top = memblock_end_of_DRAM(); + s = wii_mmu_mapin_mem2(top); + __mapin_ram_chunk(s, top); + } +#endif +} + +/* Scan the real Linux page tables and return a PTE pointer for + * a virtual address in a context. + * Returns true (1) if PTE was found, zero otherwise. The pointer to + * the PTE pointer is unmodified if PTE is not found. + */ +int +get_pteptr(struct mm_struct *mm, unsigned long addr, pte_t **ptep, pmd_t **pmdp) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int retval = 0; + + pgd = pgd_offset(mm, addr & PAGE_MASK); + if (pgd) { + pud = pud_offset(pgd, addr & PAGE_MASK); + if (pud && pud_present(*pud)) { + pmd = pmd_offset(pud, addr & PAGE_MASK); + if (pmd_present(*pmd)) { + pte = pte_offset_map(pmd, addr & PAGE_MASK); + if (pte) { + retval = 1; + *ptep = pte; + if (pmdp) + *pmdp = pmd; + /* XXX caller needs to do pte_unmap, yuck */ + } + } + } + } + return(retval); +} + +#ifdef CONFIG_DEBUG_PAGEALLOC + +static int __change_page_attr(struct page *page, pgprot_t prot) +{ + pte_t *kpte; + pmd_t *kpmd; + unsigned long address; + + BUG_ON(PageHighMem(page)); + address = (unsigned long)page_address(page); + + if (v_mapped_by_bats(address) || v_mapped_by_tlbcam(address)) + return 0; + if (!get_pteptr(&init_mm, address, &kpte, &kpmd)) + return -EINVAL; + __set_pte_at(&init_mm, address, kpte, mk_pte(page, prot), 0); + wmb(); + flush_tlb_page(NULL, address); + pte_unmap(kpte); + + return 0; +} + +/* + * Change the page attributes of an page in the linear mapping. + * + * THIS CONFLICTS WITH BAT MAPPINGS, DEBUG USE ONLY + */ +static int change_page_attr(struct page *page, int numpages, pgprot_t prot) +{ + int i, err = 0; + unsigned long flags; + + local_irq_save(flags); + for (i = 0; i < numpages; i++, page++) { + err = __change_page_attr(page, prot); + if (err) + break; + } + local_irq_restore(flags); + return err; +} + + +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (PageHighMem(page)) + return; + + change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0)); +} +#endif /* CONFIG_DEBUG_PAGEALLOC */ + +static int fixmaps; + +void __set_fixmap (enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) +{ + unsigned long address = __fix_to_virt(idx); + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + + map_page(address, phys, pgprot_val(flags)); + fixmaps++; +} + +void __this_fixmap_does_not_exist(void) +{ + WARN_ON(1); +} diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c new file mode 100644 index 000000000..6bfadf1aa --- /dev/null +++ b/arch/powerpc/mm/pgtable_64.c @@ -0,0 +1,884 @@ +/* + * This file contains ioremap and related functions for 64-bit machines. + * + * Derived from arch/ppc64/mm/init.c + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@samba.org) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Dave Engebretsen <engebret@us.ibm.com> + * Rework for PPC64 port. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/export.h> +#include <linux/types.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/stddef.h> +#include <linux/vmalloc.h> +#include <linux/memblock.h> +#include <linux/slab.h> +#include <linux/hugetlb.h> + +#include <asm/pgalloc.h> +#include <asm/page.h> +#include <asm/prom.h> +#include <asm/io.h> +#include <asm/mmu_context.h> +#include <asm/pgtable.h> +#include <asm/mmu.h> +#include <asm/smp.h> +#include <asm/machdep.h> +#include <asm/tlb.h> +#include <asm/processor.h> +#include <asm/cputable.h> +#include <asm/sections.h> +#include <asm/firmware.h> +#include <asm/dma.h> + +#include "mmu_decl.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/thp.h> + +/* Some sanity checking */ +#if TASK_SIZE_USER64 > PGTABLE_RANGE +#error TASK_SIZE_USER64 exceeds pagetable range +#endif + +#ifdef CONFIG_PPC_STD_MMU_64 +#if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT)) +#error TASK_SIZE_USER64 exceeds user VSID range +#endif +#endif + +unsigned long ioremap_bot = IOREMAP_BASE; + +#ifdef CONFIG_PPC_MMU_NOHASH +static __ref void *early_alloc_pgtable(unsigned long size) +{ + void *pt; + + pt = __va(memblock_alloc_base(size, size, __pa(MAX_DMA_ADDRESS))); + memset(pt, 0, size); + + return pt; +} +#endif /* CONFIG_PPC_MMU_NOHASH */ + +/* + * map_kernel_page currently only called by __ioremap + * map_kernel_page adds an entry to the ioremap page table + * and adds an entry to the HPT, possibly bolting it + */ +int map_kernel_page(unsigned long ea, unsigned long pa, int flags) +{ + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + if (slab_is_available()) { + pgdp = pgd_offset_k(ea); + pudp = pud_alloc(&init_mm, pgdp, ea); + if (!pudp) + return -ENOMEM; + pmdp = pmd_alloc(&init_mm, pudp, ea); + if (!pmdp) + return -ENOMEM; + ptep = pte_alloc_kernel(pmdp, ea); + if (!ptep) + return -ENOMEM; + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, + __pgprot(flags))); + } else { +#ifdef CONFIG_PPC_MMU_NOHASH + pgdp = pgd_offset_k(ea); +#ifdef PUD_TABLE_SIZE + if (pgd_none(*pgdp)) { + pudp = early_alloc_pgtable(PUD_TABLE_SIZE); + BUG_ON(pudp == NULL); + pgd_populate(&init_mm, pgdp, pudp); + } +#endif /* PUD_TABLE_SIZE */ + pudp = pud_offset(pgdp, ea); + if (pud_none(*pudp)) { + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); + BUG_ON(pmdp == NULL); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (!pmd_present(*pmdp)) { + ptep = early_alloc_pgtable(PAGE_SIZE); + BUG_ON(ptep == NULL); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, + __pgprot(flags))); +#else /* CONFIG_PPC_MMU_NOHASH */ + /* + * If the mm subsystem is not fully up, we cannot create a + * linux page table entry for this mapping. Simply bolt an + * entry in the hardware page table. + * + */ + if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags, + mmu_io_psize, mmu_kernel_ssize)) { + printk(KERN_ERR "Failed to do bolted mapping IO " + "memory at %016lx !\n", pa); + return -ENOMEM; + } +#endif /* !CONFIG_PPC_MMU_NOHASH */ + } + +#ifdef CONFIG_PPC_BOOK3E_64 + /* + * With hardware tablewalk, a sync is needed to ensure that + * subsequent accesses see the PTE we just wrote. Unlike userspace + * mappings, we can't tolerate spurious faults, so make sure + * the new PTE will be seen the first time. + */ + mb(); +#else + smp_wmb(); +#endif + return 0; +} + + +/** + * __ioremap_at - Low level function to establish the page tables + * for an IO mapping + */ +void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size, + unsigned long flags) +{ + unsigned long i; + + /* Make sure we have the base flags */ + if ((flags & _PAGE_PRESENT) == 0) + flags |= pgprot_val(PAGE_KERNEL); + + /* Non-cacheable page cannot be coherent */ + if (flags & _PAGE_NO_CACHE) + flags &= ~_PAGE_COHERENT; + + /* We don't support the 4K PFN hack with ioremap */ + if (flags & _PAGE_4K_PFN) + return NULL; + + WARN_ON(pa & ~PAGE_MASK); + WARN_ON(((unsigned long)ea) & ~PAGE_MASK); + WARN_ON(size & ~PAGE_MASK); + + for (i = 0; i < size; i += PAGE_SIZE) + if (map_kernel_page((unsigned long)ea+i, pa+i, flags)) + return NULL; + + return (void __iomem *)ea; +} + +/** + * __iounmap_from - Low level function to tear down the page tables + * for an IO mapping. This is used for mappings that + * are manipulated manually, like partial unmapping of + * PCI IOs or ISA space. + */ +void __iounmap_at(void *ea, unsigned long size) +{ + WARN_ON(((unsigned long)ea) & ~PAGE_MASK); + WARN_ON(size & ~PAGE_MASK); + + unmap_kernel_range((unsigned long)ea, size); +} + +void __iomem * __ioremap_caller(phys_addr_t addr, unsigned long size, + unsigned long flags, void *caller) +{ + phys_addr_t paligned; + void __iomem *ret; + + /* + * Choose an address to map it to. + * Once the imalloc system is running, we use it. + * Before that, we map using addresses going + * up from ioremap_bot. imalloc will use + * the addresses from ioremap_bot through + * IMALLOC_END + * + */ + paligned = addr & PAGE_MASK; + size = PAGE_ALIGN(addr + size) - paligned; + + if ((size == 0) || (paligned == 0)) + return NULL; + + if (slab_is_available()) { + struct vm_struct *area; + + area = __get_vm_area_caller(size, VM_IOREMAP, + ioremap_bot, IOREMAP_END, + caller); + if (area == NULL) + return NULL; + + area->phys_addr = paligned; + ret = __ioremap_at(paligned, area->addr, size, flags); + if (!ret) + vunmap(area->addr); + } else { + ret = __ioremap_at(paligned, (void *)ioremap_bot, size, flags); + if (ret) + ioremap_bot += size; + } + + if (ret) + ret += addr & ~PAGE_MASK; + return ret; +} + +void __iomem * __ioremap(phys_addr_t addr, unsigned long size, + unsigned long flags) +{ + return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); +} + +void __iomem * ioremap(phys_addr_t addr, unsigned long size) +{ + unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED; + void *caller = __builtin_return_address(0); + + if (ppc_md.ioremap) + return ppc_md.ioremap(addr, size, flags, caller); + return __ioremap_caller(addr, size, flags, caller); +} + +void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size) +{ + unsigned long flags = _PAGE_NO_CACHE; + void *caller = __builtin_return_address(0); + + if (ppc_md.ioremap) + return ppc_md.ioremap(addr, size, flags, caller); + return __ioremap_caller(addr, size, flags, caller); +} + +void __iomem * ioremap_prot(phys_addr_t addr, unsigned long size, + unsigned long flags) +{ + void *caller = __builtin_return_address(0); + + /* writeable implies dirty for kernel addresses */ + if (flags & _PAGE_RW) + flags |= _PAGE_DIRTY; + + /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ + flags &= ~(_PAGE_USER | _PAGE_EXEC); + +#ifdef _PAGE_BAP_SR + /* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format + * which means that we just cleared supervisor access... oops ;-) This + * restores it + */ + flags |= _PAGE_BAP_SR; +#endif + + if (ppc_md.ioremap) + return ppc_md.ioremap(addr, size, flags, caller); + return __ioremap_caller(addr, size, flags, caller); +} + + +/* + * Unmap an IO region and remove it from imalloc'd list. + * Access to IO memory should be serialized by driver. + */ +void __iounmap(volatile void __iomem *token) +{ + void *addr; + + if (!slab_is_available()) + return; + + addr = (void *) ((unsigned long __force) + PCI_FIX_ADDR(token) & PAGE_MASK); + if ((unsigned long)addr < ioremap_bot) { + printk(KERN_WARNING "Attempt to iounmap early bolted mapping" + " at 0x%p\n", addr); + return; + } + vunmap(addr); +} + +void iounmap(volatile void __iomem *token) +{ + if (ppc_md.iounmap) + ppc_md.iounmap(token); + else + __iounmap(token); +} + +EXPORT_SYMBOL(ioremap); +EXPORT_SYMBOL(ioremap_wc); +EXPORT_SYMBOL(ioremap_prot); +EXPORT_SYMBOL(__ioremap); +EXPORT_SYMBOL(__ioremap_at); +EXPORT_SYMBOL(iounmap); +EXPORT_SYMBOL(__iounmap); +EXPORT_SYMBOL(__iounmap_at); + +#ifndef __PAGETABLE_PUD_FOLDED +/* 4 level page table */ +struct page *pgd_page(pgd_t pgd) +{ + if (pgd_huge(pgd)) + return pte_page(pgd_pte(pgd)); + return virt_to_page(pgd_page_vaddr(pgd)); +} +#endif + +struct page *pud_page(pud_t pud) +{ + if (pud_huge(pud)) + return pte_page(pud_pte(pud)); + return virt_to_page(pud_page_vaddr(pud)); +} + +/* + * For hugepage we have pfn in the pmd, we use PTE_RPN_SHIFT bits for flags + * For PTE page, we have a PTE_FRAG_SIZE (4K) aligned virtual address. + */ +struct page *pmd_page(pmd_t pmd) +{ + if (pmd_trans_huge(pmd) || pmd_huge(pmd)) + return pfn_to_page(pmd_pfn(pmd)); + return virt_to_page(pmd_page_vaddr(pmd)); +} + +#ifdef CONFIG_PPC_64K_PAGES +static pte_t *get_from_cache(struct mm_struct *mm) +{ + void *pte_frag, *ret; + + spin_lock(&mm->page_table_lock); + ret = mm->context.pte_frag; + if (ret) { + pte_frag = ret + PTE_FRAG_SIZE; + /* + * If we have taken up all the fragments mark PTE page NULL + */ + if (((unsigned long)pte_frag & ~PAGE_MASK) == 0) + pte_frag = NULL; + mm->context.pte_frag = pte_frag; + } + spin_unlock(&mm->page_table_lock); + return (pte_t *)ret; +} + +static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel) +{ + void *ret = NULL; + struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | + __GFP_REPEAT | __GFP_ZERO); + if (!page) + return NULL; + if (!kernel && !pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } + + ret = page_address(page); + spin_lock(&mm->page_table_lock); + /* + * If we find pgtable_page set, we return + * the allocated page with single fragement + * count. + */ + if (likely(!mm->context.pte_frag)) { + atomic_set(&page->_count, PTE_FRAG_NR); + mm->context.pte_frag = ret + PTE_FRAG_SIZE; + } + spin_unlock(&mm->page_table_lock); + + return (pte_t *)ret; +} + +pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel) +{ + pte_t *pte; + + pte = get_from_cache(mm); + if (pte) + return pte; + + return __alloc_for_cache(mm, kernel); +} + +void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel) +{ + struct page *page = virt_to_page(table); + if (put_page_testzero(page)) { + if (!kernel) + pgtable_page_dtor(page); + free_hot_cold_page(page, 0); + } +} + +#ifdef CONFIG_SMP +static void page_table_free_rcu(void *table) +{ + struct page *page = virt_to_page(table); + if (put_page_testzero(page)) { + pgtable_page_dtor(page); + free_hot_cold_page(page, 0); + } +} + +void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) +{ + unsigned long pgf = (unsigned long)table; + + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); + pgf |= shift; + tlb_remove_table(tlb, (void *)pgf); +} + +void __tlb_remove_table(void *_table) +{ + void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); + unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; + + if (!shift) + /* PTE page needs special handling */ + page_table_free_rcu(table); + else { + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); + kmem_cache_free(PGT_CACHE(shift), table); + } +} +#else +void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) +{ + if (!shift) { + /* PTE page needs special handling */ + struct page *page = virt_to_page(table); + if (put_page_testzero(page)) { + pgtable_page_dtor(page); + free_hot_cold_page(page, 0); + } + } else { + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); + kmem_cache_free(PGT_CACHE(shift), table); + } +} +#endif +#endif /* CONFIG_PPC_64K_PAGES */ + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +/* + * This is called when relaxing access to a hugepage. It's also called in the page + * fault path when we don't hit any of the major fault cases, ie, a minor + * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have + * handled those two for us, we additionally deal with missing execute + * permission here on some processors + */ +int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp, pmd_t entry, int dirty) +{ + int changed; +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pmd_trans_huge(*pmdp)); + assert_spin_locked(&vma->vm_mm->page_table_lock); +#endif + changed = !pmd_same(*(pmdp), entry); + if (changed) { + __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry)); + /* + * Since we are not supporting SW TLB systems, we don't + * have any thing similar to flush_tlb_page_nohash() + */ + } + return changed; +} + +unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long clr, + unsigned long set) +{ + + unsigned long old, tmp; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pmd_trans_huge(*pmdp)); + assert_spin_locked(&mm->page_table_lock); +#endif + +#ifdef PTE_ATOMIC_UPDATES + __asm__ __volatile__( + "1: ldarx %0,0,%3\n\ + andi. %1,%0,%6\n\ + bne- 1b \n\ + andc %1,%0,%4 \n\ + or %1,%1,%7\n\ + stdcx. %1,0,%3 \n\ + bne- 1b" + : "=&r" (old), "=&r" (tmp), "=m" (*pmdp) + : "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY), "r" (set) + : "cc" ); +#else + old = pmd_val(*pmdp); + *pmdp = __pmd((old & ~clr) | set); +#endif + trace_hugepage_update(addr, old, clr, set); + if (old & _PAGE_HASHPTE) + hpte_do_hugepage_flush(mm, addr, pmdp, old); + return old; +} + +pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + if (pmd_trans_huge(*pmdp)) { + pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); + } else { + /* + * khugepaged calls this for normal pmd + */ + pmd = *pmdp; + pmd_clear(pmdp); + /* + * Wait for all pending hash_page to finish. This is needed + * in case of subpage collapse. When we collapse normal pages + * to hugepage, we first clear the pmd, then invalidate all + * the PTE entries. The assumption here is that any low level + * page fault will see a none pmd and take the slow path that + * will wait on mmap_sem. But we could very well be in a + * hash_page with local ptep pointer value. Such a hash page + * can result in adding new HPTE entries for normal subpages. + * That means we could be modifying the page content as we + * copy them to a huge page. So wait for parallel hash_page + * to finish before invalidating HPTE entries. We can do this + * by sending an IPI to all the cpus and executing a dummy + * function there. + */ + kick_all_cpus_sync(); + /* + * Now invalidate the hpte entries in the range + * covered by pmd. This make sure we take a + * fault and will find the pmd as none, which will + * result in a major fault which takes mmap_sem and + * hence wait for collapse to complete. Without this + * the __collapse_huge_page_copy can result in copying + * the old content. + */ + flush_tlb_pmd_range(vma->vm_mm, &pmd, address); + } + return pmd; +} + +int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); +} + +/* + * We currently remove entries from the hashtable regardless of whether + * the entry was young or dirty. The generic routines only flush if the + * entry was young or dirty which is not good enough. + * + * We should be more intelligent about this but for the moment we override + * these functions and force a tlb flush unconditionally + */ +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); +} + +/* + * We mark the pmd splitting and invalidate all the hpte + * entries for this hugepage. + */ +void pmdp_splitting_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + unsigned long old, tmp; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pmd_trans_huge(*pmdp)); + assert_spin_locked(&vma->vm_mm->page_table_lock); +#endif + +#ifdef PTE_ATOMIC_UPDATES + + __asm__ __volatile__( + "1: ldarx %0,0,%3\n\ + andi. %1,%0,%6\n\ + bne- 1b \n\ + ori %1,%0,%4 \n\ + stdcx. %1,0,%3 \n\ + bne- 1b" + : "=&r" (old), "=&r" (tmp), "=m" (*pmdp) + : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY) + : "cc" ); +#else + old = pmd_val(*pmdp); + *pmdp = __pmd(old | _PAGE_SPLITTING); +#endif + /* + * If we didn't had the splitting flag set, go and flush the + * HPTE entries. + */ + trace_hugepage_splitting(address, old); + if (!(old & _PAGE_SPLITTING)) { + /* We need to flush the hpte */ + if (old & _PAGE_HASHPTE) + hpte_do_hugepage_flush(vma->vm_mm, address, pmdp, old); + } + /* + * This ensures that generic code that rely on IRQ disabling + * to prevent a parallel THP split work as expected. + */ + kick_all_cpus_sync(); +} + +/* + * We want to put the pgtable in pmd and use pgtable for tracking + * the base page size hptes + */ +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) +{ + pgtable_t *pgtable_slot; + assert_spin_locked(&mm->page_table_lock); + /* + * we store the pgtable in the second half of PMD + */ + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + *pgtable_slot = pgtable; + /* + * expose the deposited pgtable to other cpus. + * before we set the hugepage PTE at pmd level + * hash fault code looks at the deposted pgtable + * to store hash index values. + */ + smp_wmb(); +} + +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) +{ + pgtable_t pgtable; + pgtable_t *pgtable_slot; + + assert_spin_locked(&mm->page_table_lock); + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + pgtable = *pgtable_slot; + /* + * Once we withdraw, mark the entry NULL. + */ + *pgtable_slot = NULL; + /* + * We store HPTE information in the deposited PTE fragment. + * zero out the content on withdraw. + */ + memset(pgtable, 0, PTE_FRAG_SIZE); + return pgtable; +} + +/* + * set a new huge pmd. We should not be called for updating + * an existing pmd entry. That should go via pmd_hugepage_update. + */ +void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ +#ifdef CONFIG_DEBUG_VM + WARN_ON((pmd_val(*pmdp) & (_PAGE_PRESENT | _PAGE_USER)) == + (_PAGE_PRESENT | _PAGE_USER)); + assert_spin_locked(&mm->page_table_lock); + WARN_ON(!pmd_trans_huge(pmd)); +#endif + trace_hugepage_set_pmd(addr, pmd_val(pmd)); + return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); +} + +void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); +} + +/* + * A linux hugepage PMD was changed and the corresponding hash table entries + * neesd to be flushed. + */ +void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, unsigned long old_pmd) +{ + int ssize; + unsigned int psize; + unsigned long vsid; + unsigned long flags = 0; + const struct cpumask *tmp; + + /* get the base page size,vsid and segment size */ +#ifdef CONFIG_DEBUG_VM + psize = get_slice_psize(mm, addr); + BUG_ON(psize == MMU_PAGE_16M); +#endif + if (old_pmd & _PAGE_COMBO) + psize = MMU_PAGE_4K; + else + psize = MMU_PAGE_64K; + + if (!is_kernel_addr(addr)) { + ssize = user_segment_size(addr); + vsid = get_vsid(mm->context.id, addr, ssize); + WARN_ON(vsid == 0); + } else { + vsid = get_kernel_vsid(addr, mmu_kernel_ssize); + ssize = mmu_kernel_ssize; + } + + tmp = cpumask_of(smp_processor_id()); + if (cpumask_equal(mm_cpumask(mm), tmp)) + flags |= HPTE_LOCAL_UPDATE; + + return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags); +} + +static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) +{ + pmd_val(pmd) |= pgprot_val(pgprot); + return pmd; +} + +pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) +{ + pmd_t pmd; + /* + * For a valid pte, we would have _PAGE_PRESENT always + * set. We use this to check THP page at pmd level. + * leaf pte for huge page, bottom two bits != 00 + */ + pmd_val(pmd) = pfn << PTE_RPN_SHIFT; + pmd_val(pmd) |= _PAGE_THP_HUGE; + pmd = pmd_set_protbits(pmd, pgprot); + return pmd; +} + +pmd_t mk_pmd(struct page *page, pgprot_t pgprot) +{ + return pfn_pmd(page_to_pfn(page), pgprot); +} + +pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + + pmd_val(pmd) &= _HPAGE_CHG_MASK; + pmd = pmd_set_protbits(pmd, newprot); + return pmd; +} + +/* + * This is called at the end of handling a user page fault, when the + * fault has been handled by updating a HUGE PMD entry in the linux page tables. + * We use it to preload an HPTE into the hash table corresponding to + * the updated linux HUGE PMD entry. + */ +void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd) +{ + return; +} + +pmd_t pmdp_get_and_clear(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old_pmd; + pgtable_t pgtable; + unsigned long old; + pgtable_t *pgtable_slot; + + old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); + old_pmd = __pmd(old); + /* + * We have pmd == none and we are holding page_table_lock. + * So we can safely go and clear the pgtable hash + * index info. + */ + pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; + pgtable = *pgtable_slot; + /* + * Let's zero out old valid and hash index details + * hash fault look at them. + */ + memset(pgtable, 0, PTE_FRAG_SIZE); + /* + * Serialize against find_linux_pte_or_hugepte which does lock-less + * lookup in page tables with local interrupts disabled. For huge pages + * it casts pmd_t to pte_t. Since format of pte_t is different from + * pmd_t we want to prevent transit from pmd pointing to page table + * to pmd pointing to huge page (and back) while interrupts are disabled. + * We clear pmd to possibly replace it with page table pointer in + * different code paths. So make sure we wait for the parallel + * find_linux_pte_or_hugepage to finish. + */ + kick_all_cpus_sync(); + return old_pmd; +} + +int has_transparent_hugepage(void) +{ + if (!mmu_has_feature(MMU_FTR_16M_PAGE)) + return 0; + /* + * We support THP only if PMD_SIZE is 16MB. + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) + return 0; + /* + * We need to make sure that we support 16MB hugepage in a segement + * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE + * of 64K. + */ + /* + * If we have 64K HPTE, we will be using that by default + */ + if (mmu_psize_defs[MMU_PAGE_64K].shift && + (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1)) + return 0; + /* + * Ok we only have 4K HPTE + */ + if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1) + return 0; + + return 1; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c new file mode 100644 index 000000000..6b2f3e457 --- /dev/null +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -0,0 +1,289 @@ +/* + * This file contains the routines for handling the MMU on those + * PowerPC implementations where the MMU substantially follows the + * architecture specification. This includes the 6xx, 7xx, 7xxx, + * and 8260 implementations but excludes the 8xx and 4xx. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/highmem.h> +#include <linux/memblock.h> + +#include <asm/prom.h> +#include <asm/mmu.h> +#include <asm/machdep.h> + +#include "mmu_decl.h" + +struct hash_pte *Hash, *Hash_end; +unsigned long Hash_size, Hash_mask; +unsigned long _SDR1; + +struct ppc_bat BATS[8][2]; /* 8 pairs of IBAT, DBAT */ + +struct batrange { /* stores address ranges mapped by BATs */ + unsigned long start; + unsigned long limit; + phys_addr_t phys; +} bat_addrs[8]; + +/* + * Return PA for this VA if it is mapped by a BAT, or 0 + */ +phys_addr_t v_mapped_by_bats(unsigned long va) +{ + int b; + for (b = 0; b < 4; ++b) + if (va >= bat_addrs[b].start && va < bat_addrs[b].limit) + return bat_addrs[b].phys + (va - bat_addrs[b].start); + return 0; +} + +/* + * Return VA for a given PA or 0 if not mapped + */ +unsigned long p_mapped_by_bats(phys_addr_t pa) +{ + int b; + for (b = 0; b < 4; ++b) + if (pa >= bat_addrs[b].phys + && pa < (bat_addrs[b].limit-bat_addrs[b].start) + +bat_addrs[b].phys) + return bat_addrs[b].start+(pa-bat_addrs[b].phys); + return 0; +} + +unsigned long __init mmu_mapin_ram(unsigned long top) +{ + unsigned long tot, bl, done; + unsigned long max_size = (256<<20); + + if (__map_without_bats) { + printk(KERN_DEBUG "RAM mapped without BATs\n"); + return 0; + } + + /* Set up BAT2 and if necessary BAT3 to cover RAM. */ + + /* Make sure we don't map a block larger than the + smallest alignment of the physical address. */ + tot = top; + for (bl = 128<<10; bl < max_size; bl <<= 1) { + if (bl * 2 > tot) + break; + } + + setbat(2, PAGE_OFFSET, 0, bl, PAGE_KERNEL_X); + done = (unsigned long)bat_addrs[2].limit - PAGE_OFFSET + 1; + if ((done < tot) && !bat_addrs[3].limit) { + /* use BAT3 to cover a bit more */ + tot -= done; + for (bl = 128<<10; bl < max_size; bl <<= 1) + if (bl * 2 > tot) + break; + setbat(3, PAGE_OFFSET+done, done, bl, PAGE_KERNEL_X); + done = (unsigned long)bat_addrs[3].limit - PAGE_OFFSET + 1; + } + + return done; +} + +/* + * Set up one of the I/D BAT (block address translation) register pairs. + * The parameters are not checked; in particular size must be a power + * of 2 between 128k and 256M. + */ +void __init setbat(int index, unsigned long virt, phys_addr_t phys, + unsigned int size, pgprot_t prot) +{ + unsigned int bl; + int wimgxpp; + struct ppc_bat *bat = BATS[index]; + unsigned long flags = pgprot_val(prot); + + if ((flags & _PAGE_NO_CACHE) || + (cpu_has_feature(CPU_FTR_NEED_COHERENT) == 0)) + flags &= ~_PAGE_COHERENT; + + bl = (size >> 17) - 1; + if (PVR_VER(mfspr(SPRN_PVR)) != 1) { + /* 603, 604, etc. */ + /* Do DBAT first */ + wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE + | _PAGE_COHERENT | _PAGE_GUARDED); + wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX; + bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ + bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp; + if (flags & _PAGE_USER) + bat[1].batu |= 1; /* Vp = 1 */ + if (flags & _PAGE_GUARDED) { + /* G bit must be zero in IBATs */ + bat[0].batu = bat[0].batl = 0; + } else { + /* make IBAT same as DBAT */ + bat[0] = bat[1]; + } + } else { + /* 601 cpu */ + if (bl > BL_8M) + bl = BL_8M; + wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE + | _PAGE_COHERENT); + wimgxpp |= (flags & _PAGE_RW)? + ((flags & _PAGE_USER)? PP_RWRW: PP_RWXX): PP_RXRX; + bat->batu = virt | wimgxpp | 4; /* Ks=0, Ku=1 */ + bat->batl = phys | bl | 0x40; /* V=1 */ + } + + bat_addrs[index].start = virt; + bat_addrs[index].limit = virt + ((bl + 1) << 17) - 1; + bat_addrs[index].phys = phys; +} + +/* + * Preload a translation in the hash table + */ +void hash_preload(struct mm_struct *mm, unsigned long ea, + unsigned long access, unsigned long trap) +{ + pmd_t *pmd; + + if (Hash == 0) + return; + pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea); + if (!pmd_none(*pmd)) + add_hash_page(mm->context.id, ea, pmd_val(*pmd)); +} + +/* + * Initialize the hash table and patch the instructions in hashtable.S. + */ +void __init MMU_init_hw(void) +{ + unsigned int hmask, mb, mb2; + unsigned int n_hpteg, lg_n_hpteg; + + extern unsigned int hash_page_patch_A[]; + extern unsigned int hash_page_patch_B[], hash_page_patch_C[]; + extern unsigned int hash_page[]; + extern unsigned int flush_hash_patch_A[], flush_hash_patch_B[]; + + if (!mmu_has_feature(MMU_FTR_HPTE_TABLE)) { + /* + * Put a blr (procedure return) instruction at the + * start of hash_page, since we can still get DSI + * exceptions on a 603. + */ + hash_page[0] = 0x4e800020; + flush_icache_range((unsigned long) &hash_page[0], + (unsigned long) &hash_page[1]); + return; + } + + if ( ppc_md.progress ) ppc_md.progress("hash:enter", 0x105); + +#define LG_HPTEG_SIZE 6 /* 64 bytes per HPTEG */ +#define SDR1_LOW_BITS ((n_hpteg - 1) >> 10) +#define MIN_N_HPTEG 1024 /* min 64kB hash table */ + + /* + * Allow 1 HPTE (1/8 HPTEG) for each page of memory. + * This is less than the recommended amount, but then + * Linux ain't AIX. + */ + n_hpteg = total_memory / (PAGE_SIZE * 8); + if (n_hpteg < MIN_N_HPTEG) + n_hpteg = MIN_N_HPTEG; + lg_n_hpteg = __ilog2(n_hpteg); + if (n_hpteg & (n_hpteg - 1)) { + ++lg_n_hpteg; /* round up if not power of 2 */ + n_hpteg = 1 << lg_n_hpteg; + } + Hash_size = n_hpteg << LG_HPTEG_SIZE; + + /* + * Find some memory for the hash table. + */ + if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322); + Hash = __va(memblock_alloc(Hash_size, Hash_size)); + memset(Hash, 0, Hash_size); + _SDR1 = __pa(Hash) | SDR1_LOW_BITS; + + Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size); + + printk("Total memory = %lldMB; using %ldkB for hash table (at %p)\n", + (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash); + + + /* + * Patch up the instructions in hashtable.S:create_hpte + */ + if ( ppc_md.progress ) ppc_md.progress("hash:patch", 0x345); + Hash_mask = n_hpteg - 1; + hmask = Hash_mask >> (16 - LG_HPTEG_SIZE); + mb2 = mb = 32 - LG_HPTEG_SIZE - lg_n_hpteg; + if (lg_n_hpteg > 16) + mb2 = 16 - LG_HPTEG_SIZE; + + hash_page_patch_A[0] = (hash_page_patch_A[0] & ~0xffff) + | ((unsigned int)(Hash) >> 16); + hash_page_patch_A[1] = (hash_page_patch_A[1] & ~0x7c0) | (mb << 6); + hash_page_patch_A[2] = (hash_page_patch_A[2] & ~0x7c0) | (mb2 << 6); + hash_page_patch_B[0] = (hash_page_patch_B[0] & ~0xffff) | hmask; + hash_page_patch_C[0] = (hash_page_patch_C[0] & ~0xffff) | hmask; + + /* + * Ensure that the locations we've patched have been written + * out from the data cache and invalidated in the instruction + * cache, on those machines with split caches. + */ + flush_icache_range((unsigned long) &hash_page_patch_A[0], + (unsigned long) &hash_page_patch_C[1]); + + /* + * Patch up the instructions in hashtable.S:flush_hash_page + */ + flush_hash_patch_A[0] = (flush_hash_patch_A[0] & ~0xffff) + | ((unsigned int)(Hash) >> 16); + flush_hash_patch_A[1] = (flush_hash_patch_A[1] & ~0x7c0) | (mb << 6); + flush_hash_patch_A[2] = (flush_hash_patch_A[2] & ~0x7c0) | (mb2 << 6); + flush_hash_patch_B[0] = (flush_hash_patch_B[0] & ~0xffff) | hmask; + flush_icache_range((unsigned long) &flush_hash_patch_A[0], + (unsigned long) &flush_hash_patch_B[1]); + + if ( ppc_md.progress ) ppc_md.progress("hash:done", 0x205); +} + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* We don't currently support the first MEMBLOCK not mapping 0 + * physical on those processors + */ + BUG_ON(first_memblock_base != 0); + + /* 601 can only access 16MB at the moment */ + if (PVR_VER(mfspr(SPRN_PVR)) == 1) + memblock_set_current_limit(min_t(u64, first_memblock_size, 0x01000000)); + else /* Anything else has 256M mapped */ + memblock_set_current_limit(min_t(u64, first_memblock_size, 0x10000000)); +} diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c new file mode 100644 index 000000000..6e450ca66 --- /dev/null +++ b/arch/powerpc/mm/slb.c @@ -0,0 +1,332 @@ +/* + * PowerPC64 SLB support. + * + * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM + * Based on earlier code written by: + * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com + * Copyright (c) 2001 Dave Engebretsen + * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/pgtable.h> +#include <asm/mmu.h> +#include <asm/mmu_context.h> +#include <asm/paca.h> +#include <asm/cputable.h> +#include <asm/cacheflush.h> +#include <asm/smp.h> +#include <linux/compiler.h> +#include <asm/udbg.h> +#include <asm/code-patching.h> + + +extern void slb_allocate_realmode(unsigned long ea); +extern void slb_allocate_user(unsigned long ea); + +static void slb_allocate(unsigned long ea) +{ + /* Currently, we do real mode for all SLBs including user, but + * that will change if we bring back dynamic VSIDs + */ + slb_allocate_realmode(ea); +} + +#define slb_esid_mask(ssize) \ + (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T) + +static inline unsigned long mk_esid_data(unsigned long ea, int ssize, + unsigned long slot) +{ + return (ea & slb_esid_mask(ssize)) | SLB_ESID_V | slot; +} + +static inline unsigned long mk_vsid_data(unsigned long ea, int ssize, + unsigned long flags) +{ + return (get_kernel_vsid(ea, ssize) << slb_vsid_shift(ssize)) | flags | + ((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT); +} + +static inline void slb_shadow_update(unsigned long ea, int ssize, + unsigned long flags, + unsigned long entry) +{ + /* + * Clear the ESID first so the entry is not valid while we are + * updating it. No write barriers are needed here, provided + * we only update the current CPU's SLB shadow buffer. + */ + get_slb_shadow()->save_area[entry].esid = 0; + get_slb_shadow()->save_area[entry].vsid = + cpu_to_be64(mk_vsid_data(ea, ssize, flags)); + get_slb_shadow()->save_area[entry].esid = + cpu_to_be64(mk_esid_data(ea, ssize, entry)); +} + +static inline void slb_shadow_clear(unsigned long entry) +{ + get_slb_shadow()->save_area[entry].esid = 0; +} + +static inline void create_shadowed_slbe(unsigned long ea, int ssize, + unsigned long flags, + unsigned long entry) +{ + /* + * Updating the shadow buffer before writing the SLB ensures + * we don't get a stale entry here if we get preempted by PHYP + * between these two statements. + */ + slb_shadow_update(ea, ssize, flags, entry); + + asm volatile("slbmte %0,%1" : + : "r" (mk_vsid_data(ea, ssize, flags)), + "r" (mk_esid_data(ea, ssize, entry)) + : "memory" ); +} + +static void __slb_flush_and_rebolt(void) +{ + /* If you change this make sure you change SLB_NUM_BOLTED + * and PR KVM appropriately too. */ + unsigned long linear_llp, vmalloc_llp, lflags, vflags; + unsigned long ksp_esid_data, ksp_vsid_data; + + linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; + vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp; + lflags = SLB_VSID_KERNEL | linear_llp; + vflags = SLB_VSID_KERNEL | vmalloc_llp; + + ksp_esid_data = mk_esid_data(get_paca()->kstack, mmu_kernel_ssize, 2); + if ((ksp_esid_data & ~0xfffffffUL) <= PAGE_OFFSET) { + ksp_esid_data &= ~SLB_ESID_V; + ksp_vsid_data = 0; + slb_shadow_clear(2); + } else { + /* Update stack entry; others don't change */ + slb_shadow_update(get_paca()->kstack, mmu_kernel_ssize, lflags, 2); + ksp_vsid_data = + be64_to_cpu(get_slb_shadow()->save_area[2].vsid); + } + + /* We need to do this all in asm, so we're sure we don't touch + * the stack between the slbia and rebolting it. */ + asm volatile("isync\n" + "slbia\n" + /* Slot 1 - first VMALLOC segment */ + "slbmte %0,%1\n" + /* Slot 2 - kernel stack */ + "slbmte %2,%3\n" + "isync" + :: "r"(mk_vsid_data(VMALLOC_START, mmu_kernel_ssize, vflags)), + "r"(mk_esid_data(VMALLOC_START, mmu_kernel_ssize, 1)), + "r"(ksp_vsid_data), + "r"(ksp_esid_data) + : "memory"); +} + +void slb_flush_and_rebolt(void) +{ + + WARN_ON(!irqs_disabled()); + + /* + * We can't take a PMU exception in the following code, so hard + * disable interrupts. + */ + hard_irq_disable(); + + __slb_flush_and_rebolt(); + get_paca()->slb_cache_ptr = 0; +} + +void slb_vmalloc_update(void) +{ + unsigned long vflags; + + vflags = SLB_VSID_KERNEL | mmu_psize_defs[mmu_vmalloc_psize].sllp; + slb_shadow_update(VMALLOC_START, mmu_kernel_ssize, vflags, 1); + slb_flush_and_rebolt(); +} + +/* Helper function to compare esids. There are four cases to handle. + * 1. The system is not 1T segment size capable. Use the GET_ESID compare. + * 2. The system is 1T capable, both addresses are < 1T, use the GET_ESID compare. + * 3. The system is 1T capable, only one of the two addresses is > 1T. This is not a match. + * 4. The system is 1T capable, both addresses are > 1T, use the GET_ESID_1T macro to compare. + */ +static inline int esids_match(unsigned long addr1, unsigned long addr2) +{ + int esid_1t_count; + + /* System is not 1T segment size capable. */ + if (!mmu_has_feature(MMU_FTR_1T_SEGMENT)) + return (GET_ESID(addr1) == GET_ESID(addr2)); + + esid_1t_count = (((addr1 >> SID_SHIFT_1T) != 0) + + ((addr2 >> SID_SHIFT_1T) != 0)); + + /* both addresses are < 1T */ + if (esid_1t_count == 0) + return (GET_ESID(addr1) == GET_ESID(addr2)); + + /* One address < 1T, the other > 1T. Not a match */ + if (esid_1t_count == 1) + return 0; + + /* Both addresses are > 1T. */ + return (GET_ESID_1T(addr1) == GET_ESID_1T(addr2)); +} + +/* Flush all user entries from the segment table of the current processor. */ +void switch_slb(struct task_struct *tsk, struct mm_struct *mm) +{ + unsigned long offset; + unsigned long slbie_data = 0; + unsigned long pc = KSTK_EIP(tsk); + unsigned long stack = KSTK_ESP(tsk); + unsigned long exec_base; + + /* + * We need interrupts hard-disabled here, not just soft-disabled, + * so that a PMU interrupt can't occur, which might try to access + * user memory (to get a stack trace) and possible cause an SLB miss + * which would update the slb_cache/slb_cache_ptr fields in the PACA. + */ + hard_irq_disable(); + offset = get_paca()->slb_cache_ptr; + if (!mmu_has_feature(MMU_FTR_NO_SLBIE_B) && + offset <= SLB_CACHE_ENTRIES) { + int i; + asm volatile("isync" : : : "memory"); + for (i = 0; i < offset; i++) { + slbie_data = (unsigned long)get_paca()->slb_cache[i] + << SID_SHIFT; /* EA */ + slbie_data |= user_segment_size(slbie_data) + << SLBIE_SSIZE_SHIFT; + slbie_data |= SLBIE_C; /* C set for user addresses */ + asm volatile("slbie %0" : : "r" (slbie_data)); + } + asm volatile("isync" : : : "memory"); + } else { + __slb_flush_and_rebolt(); + } + + /* Workaround POWER5 < DD2.1 issue */ + if (offset == 1 || offset > SLB_CACHE_ENTRIES) + asm volatile("slbie %0" : : "r" (slbie_data)); + + get_paca()->slb_cache_ptr = 0; + get_paca()->context = mm->context; + + /* + * preload some userspace segments into the SLB. + * Almost all 32 and 64bit PowerPC executables are linked at + * 0x10000000 so it makes sense to preload this segment. + */ + exec_base = 0x10000000; + + if (is_kernel_addr(pc) || is_kernel_addr(stack) || + is_kernel_addr(exec_base)) + return; + + slb_allocate(pc); + + if (!esids_match(pc, stack)) + slb_allocate(stack); + + if (!esids_match(pc, exec_base) && + !esids_match(stack, exec_base)) + slb_allocate(exec_base); +} + +static inline void patch_slb_encoding(unsigned int *insn_addr, + unsigned int immed) +{ + int insn = (*insn_addr & 0xffff0000) | immed; + patch_instruction(insn_addr, insn); +} + +extern u32 slb_compare_rr_to_size[]; +extern u32 slb_miss_kernel_load_linear[]; +extern u32 slb_miss_kernel_load_io[]; +extern u32 slb_compare_rr_to_size[]; +extern u32 slb_miss_kernel_load_vmemmap[]; + +void slb_set_size(u16 size) +{ + if (mmu_slb_size == size) + return; + + mmu_slb_size = size; + patch_slb_encoding(slb_compare_rr_to_size, mmu_slb_size); +} + +void slb_initialize(void) +{ + unsigned long linear_llp, vmalloc_llp, io_llp; + unsigned long lflags, vflags; + static int slb_encoding_inited; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + unsigned long vmemmap_llp; +#endif + + /* Prepare our SLB miss handler based on our page size */ + linear_llp = mmu_psize_defs[mmu_linear_psize].sllp; + io_llp = mmu_psize_defs[mmu_io_psize].sllp; + vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp; + get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + vmemmap_llp = mmu_psize_defs[mmu_vmemmap_psize].sllp; +#endif + if (!slb_encoding_inited) { + slb_encoding_inited = 1; + patch_slb_encoding(slb_miss_kernel_load_linear, + SLB_VSID_KERNEL | linear_llp); + patch_slb_encoding(slb_miss_kernel_load_io, + SLB_VSID_KERNEL | io_llp); + patch_slb_encoding(slb_compare_rr_to_size, + mmu_slb_size); + + pr_devel("SLB: linear LLP = %04lx\n", linear_llp); + pr_devel("SLB: io LLP = %04lx\n", io_llp); + +#ifdef CONFIG_SPARSEMEM_VMEMMAP + patch_slb_encoding(slb_miss_kernel_load_vmemmap, + SLB_VSID_KERNEL | vmemmap_llp); + pr_devel("SLB: vmemmap LLP = %04lx\n", vmemmap_llp); +#endif + } + + get_paca()->stab_rr = SLB_NUM_BOLTED; + + lflags = SLB_VSID_KERNEL | linear_llp; + vflags = SLB_VSID_KERNEL | vmalloc_llp; + + /* Invalidate the entire SLB (even slot 0) & all the ERATS */ + asm volatile("isync":::"memory"); + asm volatile("slbmte %0,%0"::"r" (0) : "memory"); + asm volatile("isync; slbia; isync":::"memory"); + create_shadowed_slbe(PAGE_OFFSET, mmu_kernel_ssize, lflags, 0); + + create_shadowed_slbe(VMALLOC_START, mmu_kernel_ssize, vflags, 1); + + /* For the boot cpu, we're running on the stack in init_thread_union, + * which is in the first segment of the linear mapping, and also + * get_paca()->kstack hasn't been initialized yet. + * For secondary cpus, we need to bolt the kernel stack entry now. + */ + slb_shadow_clear(2); + if (raw_smp_processor_id() != boot_cpuid && + (get_paca()->kstack & slb_esid_mask(mmu_kernel_ssize)) > PAGE_OFFSET) + create_shadowed_slbe(get_paca()->kstack, + mmu_kernel_ssize, lflags, 2); + + asm volatile("isync":::"memory"); +} diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S new file mode 100644 index 000000000..736d18b3c --- /dev/null +++ b/arch/powerpc/mm/slb_low.S @@ -0,0 +1,321 @@ +/* + * Low-level SLB routines + * + * Copyright (C) 2004 David Gibson <dwg@au.ibm.com>, IBM + * + * Based on earlier C version: + * Dave Engebretsen and Mike Corrigan {engebret|mikejc}@us.ibm.com + * Copyright (c) 2001 Dave Engebretsen + * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/cputable.h> +#include <asm/page.h> +#include <asm/mmu.h> +#include <asm/pgtable.h> +#include <asm/firmware.h> + +/* void slb_allocate_realmode(unsigned long ea); + * + * Create an SLB entry for the given EA (user or kernel). + * r3 = faulting address, r13 = PACA + * r9, r10, r11 are clobbered by this function + * No other registers are examined or changed. + */ +_GLOBAL(slb_allocate_realmode) + /* + * check for bad kernel/user address + * (ea & ~REGION_MASK) >= PGTABLE_RANGE + */ + rldicr. r9,r3,4,(63 - PGTABLE_EADDR_SIZE - 4) + bne- 8f + + srdi r9,r3,60 /* get region */ + srdi r10,r3,SID_SHIFT /* get esid */ + cmpldi cr7,r9,0xc /* cmp PAGE_OFFSET for later use */ + + /* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */ + blt cr7,0f /* user or kernel? */ + + /* kernel address: proto-VSID = ESID */ + /* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but + * this code will generate the protoVSID 0xfffffffff for the + * top segment. That's ok, the scramble below will translate + * it to VSID 0, which is reserved as a bad VSID - one which + * will never have any pages in it. */ + + /* Check if hitting the linear mapping or some other kernel space + */ + bne cr7,1f + + /* Linear mapping encoding bits, the "li" instruction below will + * be patched by the kernel at boot + */ +.globl slb_miss_kernel_load_linear +slb_miss_kernel_load_linear: + li r11,0 + /* + * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1 + * r9 = region id. + */ + addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha + addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l + + +BEGIN_FTR_SECTION + b slb_finish_load +END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) + b slb_finish_load_1T + +1: +#ifdef CONFIG_SPARSEMEM_VMEMMAP + /* Check virtual memmap region. To be patches at kernel boot */ + cmpldi cr0,r9,0xf + bne 1f +.globl slb_miss_kernel_load_vmemmap +slb_miss_kernel_load_vmemmap: + li r11,0 + b 6f +1: +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ + + /* vmalloc mapping gets the encoding from the PACA as the mapping + * can be demoted from 64K -> 4K dynamically on some machines + */ + clrldi r11,r10,48 + cmpldi r11,(VMALLOC_SIZE >> 28) - 1 + bgt 5f + lhz r11,PACAVMALLOCSLLP(r13) + b 6f +5: + /* IO mapping */ +.globl slb_miss_kernel_load_io +slb_miss_kernel_load_io: + li r11,0 +6: + /* + * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1 + * r9 = region id. + */ + addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha + addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l + +BEGIN_FTR_SECTION + b slb_finish_load +END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) + b slb_finish_load_1T + +0: + /* when using slices, we extract the psize off the slice bitmaps + * and then we need to get the sllp encoding off the mmu_psize_defs + * array. + * + * XXX This is a bit inefficient especially for the normal case, + * so we should try to implement a fast path for the standard page + * size using the old sllp value so we avoid the array. We cannot + * really do dynamic patching unfortunately as processes might flip + * between 4k and 64k standard page size + */ +#ifdef CONFIG_PPC_MM_SLICES + /* r10 have esid */ + cmpldi r10,16 + /* below SLICE_LOW_TOP */ + blt 5f + /* + * Handle hpsizes, + * r9 is get_paca()->context.high_slices_psize[index], r11 is mask_index + */ + srdi r11,r10,(SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT + 1) /* index */ + addi r9,r11,PACAHIGHSLICEPSIZE + lbzx r9,r13,r9 /* r9 is hpsizes[r11] */ + /* r11 = (r10 >> (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)) & 0x1 */ + rldicl r11,r10,(64 - (SLICE_HIGH_SHIFT - SLICE_LOW_SHIFT)),63 + b 6f + +5: + /* + * Handle lpsizes + * r9 is get_paca()->context.low_slices_psize, r11 is index + */ + ld r9,PACALOWSLICESPSIZE(r13) + mr r11,r10 +6: + sldi r11,r11,2 /* index * 4 */ + /* Extract the psize and multiply to get an array offset */ + srd r9,r9,r11 + andi. r9,r9,0xf + mulli r9,r9,MMUPSIZEDEFSIZE + + /* Now get to the array and obtain the sllp + */ + ld r11,PACATOC(r13) + ld r11,mmu_psize_defs@got(r11) + add r11,r11,r9 + ld r11,MMUPSIZESLLP(r11) + ori r11,r11,SLB_VSID_USER +#else + /* paca context sllp already contains the SLB_VSID_USER bits */ + lhz r11,PACACONTEXTSLLP(r13) +#endif /* CONFIG_PPC_MM_SLICES */ + + ld r9,PACACONTEXTID(r13) +BEGIN_FTR_SECTION + cmpldi r10,0x1000 + bge slb_finish_load_1T +END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) + b slb_finish_load + +8: /* invalid EA */ + li r10,0 /* BAD_VSID */ + li r9,0 /* BAD_VSID */ + li r11,SLB_VSID_USER /* flags don't much matter */ + b slb_finish_load + +#ifdef __DISABLED__ + +/* void slb_allocate_user(unsigned long ea); + * + * Create an SLB entry for the given EA (user or kernel). + * r3 = faulting address, r13 = PACA + * r9, r10, r11 are clobbered by this function + * No other registers are examined or changed. + * + * It is called with translation enabled in order to be able to walk the + * page tables. This is not currently used. + */ +_GLOBAL(slb_allocate_user) + /* r3 = faulting address */ + srdi r10,r3,28 /* get esid */ + + crset 4*cr7+lt /* set "user" flag for later */ + + /* check if we fit in the range covered by the pagetables*/ + srdi. r9,r3,PGTABLE_EADDR_SIZE + crnot 4*cr0+eq,4*cr0+eq + beqlr + + /* now we need to get to the page tables in order to get the page + * size encoding from the PMD. In the future, we'll be able to deal + * with 1T segments too by getting the encoding from the PGD instead + */ + ld r9,PACAPGDIR(r13) + cmpldi cr0,r9,0 + beqlr + rlwinm r11,r10,8,25,28 + ldx r9,r9,r11 /* get pgd_t */ + cmpldi cr0,r9,0 + beqlr + rlwinm r11,r10,3,17,28 + ldx r9,r9,r11 /* get pmd_t */ + cmpldi cr0,r9,0 + beqlr + + /* build vsid flags */ + andi. r11,r9,SLB_VSID_LLP + ori r11,r11,SLB_VSID_USER + + /* get context to calculate proto-VSID */ + ld r9,PACACONTEXTID(r13) + /* fall through slb_finish_load */ + +#endif /* __DISABLED__ */ + + +/* + * Finish loading of an SLB entry and return + * + * r3 = EA, r9 = context, r10 = ESID, r11 = flags, clobbers r9, cr7 = <> PAGE_OFFSET + */ +slb_finish_load: + rldimi r10,r9,ESID_BITS,0 + ASM_VSID_SCRAMBLE(r10,r9,256M) + /* + * bits above VSID_BITS_256M need to be ignored from r10 + * also combine VSID and flags + */ + rldimi r11,r10,SLB_VSID_SHIFT,(64 - (SLB_VSID_SHIFT + VSID_BITS_256M)) + + /* r3 = EA, r11 = VSID data */ + /* + * Find a slot, round robin. Previously we tried to find a + * free slot first but that took too long. Unfortunately we + * dont have any LRU information to help us choose a slot. + */ + +7: ld r10,PACASTABRR(r13) + addi r10,r10,1 + /* This gets soft patched on boot. */ +.globl slb_compare_rr_to_size +slb_compare_rr_to_size: + cmpldi r10,0 + + blt+ 4f + li r10,SLB_NUM_BOLTED + +4: + std r10,PACASTABRR(r13) + +3: + rldimi r3,r10,0,36 /* r3= EA[0:35] | entry */ + oris r10,r3,SLB_ESID_V@h /* r3 |= SLB_ESID_V */ + + /* r3 = ESID data, r11 = VSID data */ + + /* + * No need for an isync before or after this slbmte. The exception + * we enter with and the rfid we exit with are context synchronizing. + */ + slbmte r11,r10 + + /* we're done for kernel addresses */ + crclr 4*cr0+eq /* set result to "success" */ + bgelr cr7 + + /* Update the slb cache */ + lhz r3,PACASLBCACHEPTR(r13) /* offset = paca->slb_cache_ptr */ + cmpldi r3,SLB_CACHE_ENTRIES + bge 1f + + /* still room in the slb cache */ + sldi r11,r3,2 /* r11 = offset * sizeof(u32) */ + srdi r10,r10,28 /* get the 36 bits of the ESID */ + add r11,r11,r13 /* r11 = (u32 *)paca + offset */ + stw r10,PACASLBCACHE(r11) /* paca->slb_cache[offset] = esid */ + addi r3,r3,1 /* offset++ */ + b 2f +1: /* offset >= SLB_CACHE_ENTRIES */ + li r3,SLB_CACHE_ENTRIES+1 +2: + sth r3,PACASLBCACHEPTR(r13) /* paca->slb_cache_ptr = offset */ + crclr 4*cr0+eq /* set result to "success" */ + blr + +/* + * Finish loading of a 1T SLB entry (for the kernel linear mapping) and return. + * + * r3 = EA, r9 = context, r10 = ESID(256MB), r11 = flags, clobbers r9 + */ +slb_finish_load_1T: + srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */ + rldimi r10,r9,ESID_BITS_1T,0 + ASM_VSID_SCRAMBLE(r10,r9,1T) + /* + * bits above VSID_BITS_1T need to be ignored from r10 + * also combine VSID and flags + */ + rldimi r11,r10,SLB_VSID_SHIFT_1T,(64 - (SLB_VSID_SHIFT_1T + VSID_BITS_1T)) + li r10,MMU_SEGSIZE_1T + rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */ + + /* r3 = EA, r11 = VSID data */ + clrrdi r3,r3,SID_SHIFT_1T /* clear out non-ESID bits */ + b 7b + diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c new file mode 100644 index 000000000..0f432a702 --- /dev/null +++ b/arch/powerpc/mm/slice.c @@ -0,0 +1,701 @@ +/* + * address space "slices" (meta-segments) support + * + * Copyright (C) 2007 Benjamin Herrenschmidt, IBM Corporation. + * + * Based on hugetlb implementation + * + * Copyright (C) 2003 David Gibson, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#undef DEBUG + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/err.h> +#include <linux/spinlock.h> +#include <linux/export.h> +#include <linux/hugetlb.h> +#include <asm/mman.h> +#include <asm/mmu.h> +#include <asm/copro.h> +#include <asm/hugetlb.h> + +/* some sanity checks */ +#if (PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE +#error PGTABLE_RANGE exceeds slice_mask high_slices size +#endif + +static DEFINE_SPINLOCK(slice_convert_lock); + + +#ifdef DEBUG +int _slice_debug = 1; + +static void slice_print_mask(const char *label, struct slice_mask mask) +{ + char *p, buf[16 + 3 + 64 + 1]; + int i; + + if (!_slice_debug) + return; + p = buf; + for (i = 0; i < SLICE_NUM_LOW; i++) + *(p++) = (mask.low_slices & (1 << i)) ? '1' : '0'; + *(p++) = ' '; + *(p++) = '-'; + *(p++) = ' '; + for (i = 0; i < SLICE_NUM_HIGH; i++) + *(p++) = (mask.high_slices & (1ul << i)) ? '1' : '0'; + *(p++) = 0; + + printk(KERN_DEBUG "%s:%s\n", label, buf); +} + +#define slice_dbg(fmt...) do { if (_slice_debug) pr_debug(fmt); } while(0) + +#else + +static void slice_print_mask(const char *label, struct slice_mask mask) {} +#define slice_dbg(fmt...) + +#endif + +static struct slice_mask slice_range_to_mask(unsigned long start, + unsigned long len) +{ + unsigned long end = start + len - 1; + struct slice_mask ret = { 0, 0 }; + + if (start < SLICE_LOW_TOP) { + unsigned long mend = min(end, SLICE_LOW_TOP); + unsigned long mstart = min(start, SLICE_LOW_TOP); + + ret.low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1)) + - (1u << GET_LOW_SLICE_INDEX(mstart)); + } + + if ((start + len) > SLICE_LOW_TOP) + ret.high_slices = (1ul << (GET_HIGH_SLICE_INDEX(end) + 1)) + - (1ul << GET_HIGH_SLICE_INDEX(start)); + + return ret; +} + +static int slice_area_is_free(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + struct vm_area_struct *vma; + + if ((mm->task_size - len) < addr) + return 0; + vma = find_vma(mm, addr); + return (!vma || (addr + len) <= vma->vm_start); +} + +static int slice_low_has_vma(struct mm_struct *mm, unsigned long slice) +{ + return !slice_area_is_free(mm, slice << SLICE_LOW_SHIFT, + 1ul << SLICE_LOW_SHIFT); +} + +static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice) +{ + unsigned long start = slice << SLICE_HIGH_SHIFT; + unsigned long end = start + (1ul << SLICE_HIGH_SHIFT); + + /* Hack, so that each addresses is controlled by exactly one + * of the high or low area bitmaps, the first high area starts + * at 4GB, not 0 */ + if (start == 0) + start = SLICE_LOW_TOP; + + return !slice_area_is_free(mm, start, end - start); +} + +static struct slice_mask slice_mask_for_free(struct mm_struct *mm) +{ + struct slice_mask ret = { 0, 0 }; + unsigned long i; + + for (i = 0; i < SLICE_NUM_LOW; i++) + if (!slice_low_has_vma(mm, i)) + ret.low_slices |= 1u << i; + + if (mm->task_size <= SLICE_LOW_TOP) + return ret; + + for (i = 0; i < SLICE_NUM_HIGH; i++) + if (!slice_high_has_vma(mm, i)) + ret.high_slices |= 1ul << i; + + return ret; +} + +static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize) +{ + unsigned char *hpsizes; + int index, mask_index; + struct slice_mask ret = { 0, 0 }; + unsigned long i; + u64 lpsizes; + + lpsizes = mm->context.low_slices_psize; + for (i = 0; i < SLICE_NUM_LOW; i++) + if (((lpsizes >> (i * 4)) & 0xf) == psize) + ret.low_slices |= 1u << i; + + hpsizes = mm->context.high_slices_psize; + for (i = 0; i < SLICE_NUM_HIGH; i++) { + mask_index = i & 0x1; + index = i >> 1; + if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize) + ret.high_slices |= 1ul << i; + } + + return ret; +} + +static int slice_check_fit(struct slice_mask mask, struct slice_mask available) +{ + return (mask.low_slices & available.low_slices) == mask.low_slices && + (mask.high_slices & available.high_slices) == mask.high_slices; +} + +static void slice_flush_segments(void *parm) +{ + struct mm_struct *mm = parm; + unsigned long flags; + + if (mm != current->active_mm) + return; + + /* update the paca copy of the context struct */ + get_paca()->context = current->active_mm->context; + + local_irq_save(flags); + slb_flush_and_rebolt(); + local_irq_restore(flags); +} + +static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psize) +{ + int index, mask_index; + /* Write the new slice psize bits */ + unsigned char *hpsizes; + u64 lpsizes; + unsigned long i, flags; + + slice_dbg("slice_convert(mm=%p, psize=%d)\n", mm, psize); + slice_print_mask(" mask", mask); + + /* We need to use a spinlock here to protect against + * concurrent 64k -> 4k demotion ... + */ + spin_lock_irqsave(&slice_convert_lock, flags); + + lpsizes = mm->context.low_slices_psize; + for (i = 0; i < SLICE_NUM_LOW; i++) + if (mask.low_slices & (1u << i)) + lpsizes = (lpsizes & ~(0xful << (i * 4))) | + (((unsigned long)psize) << (i * 4)); + + /* Assign the value back */ + mm->context.low_slices_psize = lpsizes; + + hpsizes = mm->context.high_slices_psize; + for (i = 0; i < SLICE_NUM_HIGH; i++) { + mask_index = i & 0x1; + index = i >> 1; + if (mask.high_slices & (1ul << i)) + hpsizes[index] = (hpsizes[index] & + ~(0xf << (mask_index * 4))) | + (((unsigned long)psize) << (mask_index * 4)); + } + + slice_dbg(" lsps=%lx, hsps=%lx\n", + mm->context.low_slices_psize, + mm->context.high_slices_psize); + + spin_unlock_irqrestore(&slice_convert_lock, flags); + + copro_flush_all_slbs(mm); +} + +/* + * Compute which slice addr is part of; + * set *boundary_addr to the start or end boundary of that slice + * (depending on 'end' parameter); + * return boolean indicating if the slice is marked as available in the + * 'available' slice_mark. + */ +static bool slice_scan_available(unsigned long addr, + struct slice_mask available, + int end, + unsigned long *boundary_addr) +{ + unsigned long slice; + if (addr < SLICE_LOW_TOP) { + slice = GET_LOW_SLICE_INDEX(addr); + *boundary_addr = (slice + end) << SLICE_LOW_SHIFT; + return !!(available.low_slices & (1u << slice)); + } else { + slice = GET_HIGH_SLICE_INDEX(addr); + *boundary_addr = (slice + end) ? + ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP; + return !!(available.high_slices & (1ul << slice)); + } +} + +static unsigned long slice_find_area_bottomup(struct mm_struct *mm, + unsigned long len, + struct slice_mask available, + int psize) +{ + int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); + unsigned long addr, found, next_end; + struct vm_unmapped_area_info info; + + info.flags = 0; + info.length = len; + info.align_mask = PAGE_MASK & ((1ul << pshift) - 1); + info.align_offset = 0; + + addr = TASK_UNMAPPED_BASE; + while (addr < TASK_SIZE) { + info.low_limit = addr; + if (!slice_scan_available(addr, available, 1, &addr)) + continue; + + next_slice: + /* + * At this point [info.low_limit; addr) covers + * available slices only and ends at a slice boundary. + * Check if we need to reduce the range, or if we can + * extend it to cover the next available slice. + */ + if (addr >= TASK_SIZE) + addr = TASK_SIZE; + else if (slice_scan_available(addr, available, 1, &next_end)) { + addr = next_end; + goto next_slice; + } + info.high_limit = addr; + + found = vm_unmapped_area(&info); + if (!(found & ~PAGE_MASK)) + return found; + } + + return -ENOMEM; +} + +static unsigned long slice_find_area_topdown(struct mm_struct *mm, + unsigned long len, + struct slice_mask available, + int psize) +{ + int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); + unsigned long addr, found, prev; + struct vm_unmapped_area_info info; + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.align_mask = PAGE_MASK & ((1ul << pshift) - 1); + info.align_offset = 0; + + addr = mm->mmap_base; + while (addr > PAGE_SIZE) { + info.high_limit = addr; + if (!slice_scan_available(addr - 1, available, 0, &addr)) + continue; + + prev_slice: + /* + * At this point [addr; info.high_limit) covers + * available slices only and starts at a slice boundary. + * Check if we need to reduce the range, or if we can + * extend it to cover the previous available slice. + */ + if (addr < PAGE_SIZE) + addr = PAGE_SIZE; + else if (slice_scan_available(addr - 1, available, 0, &prev)) { + addr = prev; + goto prev_slice; + } + info.low_limit = addr; + + found = vm_unmapped_area(&info); + if (!(found & ~PAGE_MASK)) + return found; + } + + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + return slice_find_area_bottomup(mm, len, available, psize); +} + + +static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len, + struct slice_mask mask, int psize, + int topdown) +{ + if (topdown) + return slice_find_area_topdown(mm, len, mask, psize); + else + return slice_find_area_bottomup(mm, len, mask, psize); +} + +#define or_mask(dst, src) do { \ + (dst).low_slices |= (src).low_slices; \ + (dst).high_slices |= (src).high_slices; \ +} while (0) + +#define andnot_mask(dst, src) do { \ + (dst).low_slices &= ~(src).low_slices; \ + (dst).high_slices &= ~(src).high_slices; \ +} while (0) + +#ifdef CONFIG_PPC_64K_PAGES +#define MMU_PAGE_BASE MMU_PAGE_64K +#else +#define MMU_PAGE_BASE MMU_PAGE_4K +#endif + +unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, + unsigned long flags, unsigned int psize, + int topdown) +{ + struct slice_mask mask = {0, 0}; + struct slice_mask good_mask; + struct slice_mask potential_mask = {0,0} /* silence stupid warning */; + struct slice_mask compat_mask = {0, 0}; + int fixed = (flags & MAP_FIXED); + int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); + struct mm_struct *mm = current->mm; + unsigned long newaddr; + + /* Sanity checks */ + BUG_ON(mm->task_size == 0); + + slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize); + slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n", + addr, len, flags, topdown); + + if (len > mm->task_size) + return -ENOMEM; + if (len & ((1ul << pshift) - 1)) + return -EINVAL; + if (fixed && (addr & ((1ul << pshift) - 1))) + return -EINVAL; + if (fixed && addr > (mm->task_size - len)) + return -ENOMEM; + + /* If hint, make sure it matches our alignment restrictions */ + if (!fixed && addr) { + addr = _ALIGN_UP(addr, 1ul << pshift); + slice_dbg(" aligned addr=%lx\n", addr); + /* Ignore hint if it's too large or overlaps a VMA */ + if (addr > mm->task_size - len || + !slice_area_is_free(mm, addr, len)) + addr = 0; + } + + /* First make up a "good" mask of slices that have the right size + * already + */ + good_mask = slice_mask_for_size(mm, psize); + slice_print_mask(" good_mask", good_mask); + + /* + * Here "good" means slices that are already the right page size, + * "compat" means slices that have a compatible page size (i.e. + * 4k in a 64k pagesize kernel), and "free" means slices without + * any VMAs. + * + * If MAP_FIXED: + * check if fits in good | compat => OK + * check if fits in good | compat | free => convert free + * else bad + * If have hint: + * check if hint fits in good => OK + * check if hint fits in good | free => convert free + * Otherwise: + * search in good, found => OK + * search in good | free, found => convert free + * search in good | compat | free, found => convert free. + */ + +#ifdef CONFIG_PPC_64K_PAGES + /* If we support combo pages, we can allow 64k pages in 4k slices */ + if (psize == MMU_PAGE_64K) { + compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K); + if (fixed) + or_mask(good_mask, compat_mask); + } +#endif + + /* First check hint if it's valid or if we have MAP_FIXED */ + if (addr != 0 || fixed) { + /* Build a mask for the requested range */ + mask = slice_range_to_mask(addr, len); + slice_print_mask(" mask", mask); + + /* Check if we fit in the good mask. If we do, we just return, + * nothing else to do + */ + if (slice_check_fit(mask, good_mask)) { + slice_dbg(" fits good !\n"); + return addr; + } + } else { + /* Now let's see if we can find something in the existing + * slices for that size + */ + newaddr = slice_find_area(mm, len, good_mask, psize, topdown); + if (newaddr != -ENOMEM) { + /* Found within the good mask, we don't have to setup, + * we thus return directly + */ + slice_dbg(" found area at 0x%lx\n", newaddr); + return newaddr; + } + } + + /* We don't fit in the good mask, check what other slices are + * empty and thus can be converted + */ + potential_mask = slice_mask_for_free(mm); + or_mask(potential_mask, good_mask); + slice_print_mask(" potential", potential_mask); + + if ((addr != 0 || fixed) && slice_check_fit(mask, potential_mask)) { + slice_dbg(" fits potential !\n"); + goto convert; + } + + /* If we have MAP_FIXED and failed the above steps, then error out */ + if (fixed) + return -EBUSY; + + slice_dbg(" search...\n"); + + /* If we had a hint that didn't work out, see if we can fit + * anywhere in the good area. + */ + if (addr) { + addr = slice_find_area(mm, len, good_mask, psize, topdown); + if (addr != -ENOMEM) { + slice_dbg(" found area at 0x%lx\n", addr); + return addr; + } + } + + /* Now let's see if we can find something in the existing slices + * for that size plus free slices + */ + addr = slice_find_area(mm, len, potential_mask, psize, topdown); + +#ifdef CONFIG_PPC_64K_PAGES + if (addr == -ENOMEM && psize == MMU_PAGE_64K) { + /* retry the search with 4k-page slices included */ + or_mask(potential_mask, compat_mask); + addr = slice_find_area(mm, len, potential_mask, psize, + topdown); + } +#endif + + if (addr == -ENOMEM) + return -ENOMEM; + + mask = slice_range_to_mask(addr, len); + slice_dbg(" found potential area at 0x%lx\n", addr); + slice_print_mask(" mask", mask); + + convert: + andnot_mask(mask, good_mask); + andnot_mask(mask, compat_mask); + if (mask.low_slices || mask.high_slices) { + slice_convert(mm, mask, psize); + if (psize > MMU_PAGE_BASE) + on_each_cpu(slice_flush_segments, mm, 1); + } + return addr; + +} +EXPORT_SYMBOL_GPL(slice_get_unmapped_area); + +unsigned long arch_get_unmapped_area(struct file *filp, + unsigned long addr, + unsigned long len, + unsigned long pgoff, + unsigned long flags) +{ + return slice_get_unmapped_area(addr, len, flags, + current->mm->context.user_psize, 0); +} + +unsigned long arch_get_unmapped_area_topdown(struct file *filp, + const unsigned long addr0, + const unsigned long len, + const unsigned long pgoff, + const unsigned long flags) +{ + return slice_get_unmapped_area(addr0, len, flags, + current->mm->context.user_psize, 1); +} + +unsigned int get_slice_psize(struct mm_struct *mm, unsigned long addr) +{ + unsigned char *hpsizes; + int index, mask_index; + + if (addr < SLICE_LOW_TOP) { + u64 lpsizes; + lpsizes = mm->context.low_slices_psize; + index = GET_LOW_SLICE_INDEX(addr); + return (lpsizes >> (index * 4)) & 0xf; + } + hpsizes = mm->context.high_slices_psize; + index = GET_HIGH_SLICE_INDEX(addr); + mask_index = index & 0x1; + return (hpsizes[index >> 1] >> (mask_index * 4)) & 0xf; +} +EXPORT_SYMBOL_GPL(get_slice_psize); + +/* + * This is called by hash_page when it needs to do a lazy conversion of + * an address space from real 64K pages to combo 4K pages (typically + * when hitting a non cacheable mapping on a processor or hypervisor + * that won't allow them for 64K pages). + * + * This is also called in init_new_context() to change back the user + * psize from whatever the parent context had it set to + * N.B. This may be called before mm->context.id has been set. + * + * This function will only change the content of the {low,high)_slice_psize + * masks, it will not flush SLBs as this shall be handled lazily by the + * caller. + */ +void slice_set_user_psize(struct mm_struct *mm, unsigned int psize) +{ + int index, mask_index; + unsigned char *hpsizes; + unsigned long flags, lpsizes; + unsigned int old_psize; + int i; + + slice_dbg("slice_set_user_psize(mm=%p, psize=%d)\n", mm, psize); + + spin_lock_irqsave(&slice_convert_lock, flags); + + old_psize = mm->context.user_psize; + slice_dbg(" old_psize=%d\n", old_psize); + if (old_psize == psize) + goto bail; + + mm->context.user_psize = psize; + wmb(); + + lpsizes = mm->context.low_slices_psize; + for (i = 0; i < SLICE_NUM_LOW; i++) + if (((lpsizes >> (i * 4)) & 0xf) == old_psize) + lpsizes = (lpsizes & ~(0xful << (i * 4))) | + (((unsigned long)psize) << (i * 4)); + /* Assign the value back */ + mm->context.low_slices_psize = lpsizes; + + hpsizes = mm->context.high_slices_psize; + for (i = 0; i < SLICE_NUM_HIGH; i++) { + mask_index = i & 0x1; + index = i >> 1; + if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == old_psize) + hpsizes[index] = (hpsizes[index] & + ~(0xf << (mask_index * 4))) | + (((unsigned long)psize) << (mask_index * 4)); + } + + + + + slice_dbg(" lsps=%lx, hsps=%lx\n", + mm->context.low_slices_psize, + mm->context.high_slices_psize); + + bail: + spin_unlock_irqrestore(&slice_convert_lock, flags); +} + +void slice_set_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long len, unsigned int psize) +{ + struct slice_mask mask = slice_range_to_mask(start, len); + + slice_convert(mm, mask, psize); +} + +#ifdef CONFIG_HUGETLB_PAGE +/* + * is_hugepage_only_range() is used by generic code to verify whether + * a normal mmap mapping (non hugetlbfs) is valid on a given area. + * + * until the generic code provides a more generic hook and/or starts + * calling arch get_unmapped_area for MAP_FIXED (which our implementation + * here knows how to deal with), we hijack it to keep standard mappings + * away from us. + * + * because of that generic code limitation, MAP_FIXED mapping cannot + * "convert" back a slice with no VMAs to the standard page size, only + * get_unmapped_area() can. It would be possible to fix it here but I + * prefer working on fixing the generic code instead. + * + * WARNING: This will not work if hugetlbfs isn't enabled since the + * generic code will redefine that function as 0 in that. This is ok + * for now as we only use slices with hugetlbfs enabled. This should + * be fixed as the generic code gets fixed. + */ +int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + struct slice_mask mask, available; + unsigned int psize = mm->context.user_psize; + + mask = slice_range_to_mask(addr, len); + available = slice_mask_for_size(mm, psize); +#ifdef CONFIG_PPC_64K_PAGES + /* We need to account for 4k slices too */ + if (psize == MMU_PAGE_64K) { + struct slice_mask compat_mask; + compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K); + or_mask(available, compat_mask); + } +#endif + +#if 0 /* too verbose */ + slice_dbg("is_hugepage_only_range(mm=%p, addr=%lx, len=%lx)\n", + mm, addr, len); + slice_print_mask(" mask", mask); + slice_print_mask(" available", available); +#endif + return !slice_check_fit(mask, available); +} +#endif diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c new file mode 100644 index 000000000..fa9fb5b4c --- /dev/null +++ b/arch/powerpc/mm/subpage-prot.c @@ -0,0 +1,267 @@ +/* + * Copyright 2007-2008 Paul Mackerras, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/gfp.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> + +#include <asm/pgtable.h> +#include <asm/uaccess.h> +#include <asm/tlbflush.h> + +/* + * Free all pages allocated for subpage protection maps and pointers. + * Also makes sure that the subpage_prot_table structure is + * reinitialized for the next user. + */ +void subpage_prot_free(struct mm_struct *mm) +{ + struct subpage_prot_table *spt = &mm->context.spt; + unsigned long i, j, addr; + u32 **p; + + for (i = 0; i < 4; ++i) { + if (spt->low_prot[i]) { + free_page((unsigned long)spt->low_prot[i]); + spt->low_prot[i] = NULL; + } + } + addr = 0; + for (i = 0; i < 2; ++i) { + p = spt->protptrs[i]; + if (!p) + continue; + spt->protptrs[i] = NULL; + for (j = 0; j < SBP_L2_COUNT && addr < spt->maxaddr; + ++j, addr += PAGE_SIZE) + if (p[j]) + free_page((unsigned long)p[j]); + free_page((unsigned long)p); + } + spt->maxaddr = 0; +} + +void subpage_prot_init_new_context(struct mm_struct *mm) +{ + struct subpage_prot_table *spt = &mm->context.spt; + + memset(spt, 0, sizeof(*spt)); +} + +static void hpte_flush_range(struct mm_struct *mm, unsigned long addr, + int npages) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + spinlock_t *ptl; + + pgd = pgd_offset(mm, addr); + if (pgd_none(*pgd)) + return; + pud = pud_offset(pgd, addr); + if (pud_none(*pud)) + return; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return; + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); + for (; npages > 0; --npages) { + pte_update(mm, addr, pte, 0, 0, 0); + addr += PAGE_SIZE; + ++pte; + } + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(pte - 1, ptl); +} + +/* + * Clear the subpage protection map for an address range, allowing + * all accesses that are allowed by the pte permissions. + */ +static void subpage_prot_clear(unsigned long addr, unsigned long len) +{ + struct mm_struct *mm = current->mm; + struct subpage_prot_table *spt = &mm->context.spt; + u32 **spm, *spp; + unsigned long i; + size_t nw; + unsigned long next, limit; + + down_write(&mm->mmap_sem); + limit = addr + len; + if (limit > spt->maxaddr) + limit = spt->maxaddr; + for (; addr < limit; addr = next) { + next = pmd_addr_end(addr, limit); + if (addr < 0x100000000UL) { + spm = spt->low_prot; + } else { + spm = spt->protptrs[addr >> SBP_L3_SHIFT]; + if (!spm) + continue; + } + spp = spm[(addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1)]; + if (!spp) + continue; + spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1); + + i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + nw = PTRS_PER_PTE - i; + if (addr + (nw << PAGE_SHIFT) > next) + nw = (next - addr) >> PAGE_SHIFT; + + memset(spp, 0, nw * sizeof(u32)); + + /* now flush any existing HPTEs for the range */ + hpte_flush_range(mm, addr, nw); + } + up_write(&mm->mmap_sem); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + split_huge_page_pmd(vma, addr, pmd); + return 0; +} + +static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + struct vm_area_struct *vma; + struct mm_walk subpage_proto_walk = { + .mm = mm, + .pmd_entry = subpage_walk_pmd_entry, + }; + + /* + * We don't try too hard, we just mark all the vma in that range + * VM_NOHUGEPAGE and split them. + */ + vma = find_vma(mm, addr); + /* + * If the range is in unmapped range, just return + */ + if (vma && ((addr + len) <= vma->vm_start)) + return; + + while (vma) { + if (vma->vm_start >= (addr + len)) + break; + vma->vm_flags |= VM_NOHUGEPAGE; + walk_page_vma(vma, &subpage_proto_walk); + vma = vma->vm_next; + } +} +#else +static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, + unsigned long len) +{ + return; +} +#endif + +/* + * Copy in a subpage protection map for an address range. + * The map has 2 bits per 4k subpage, so 32 bits per 64k page. + * Each 2-bit field is 0 to allow any access, 1 to prevent writes, + * 2 or 3 to prevent all accesses. + * Note that the normal page protections also apply; the subpage + * protection mechanism is an additional constraint, so putting 0 + * in a 2-bit field won't allow writes to a page that is otherwise + * write-protected. + */ +long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map) +{ + struct mm_struct *mm = current->mm; + struct subpage_prot_table *spt = &mm->context.spt; + u32 **spm, *spp; + unsigned long i; + size_t nw; + unsigned long next, limit; + int err; + + /* Check parameters */ + if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) || + addr >= TASK_SIZE || len >= TASK_SIZE || addr + len > TASK_SIZE) + return -EINVAL; + + if (is_hugepage_only_range(mm, addr, len)) + return -EINVAL; + + if (!map) { + /* Clear out the protection map for the address range */ + subpage_prot_clear(addr, len); + return 0; + } + + if (!access_ok(VERIFY_READ, map, (len >> PAGE_SHIFT) * sizeof(u32))) + return -EFAULT; + + down_write(&mm->mmap_sem); + subpage_mark_vma_nohuge(mm, addr, len); + for (limit = addr + len; addr < limit; addr = next) { + next = pmd_addr_end(addr, limit); + err = -ENOMEM; + if (addr < 0x100000000UL) { + spm = spt->low_prot; + } else { + spm = spt->protptrs[addr >> SBP_L3_SHIFT]; + if (!spm) { + spm = (u32 **)get_zeroed_page(GFP_KERNEL); + if (!spm) + goto out; + spt->protptrs[addr >> SBP_L3_SHIFT] = spm; + } + } + spm += (addr >> SBP_L2_SHIFT) & (SBP_L2_COUNT - 1); + spp = *spm; + if (!spp) { + spp = (u32 *)get_zeroed_page(GFP_KERNEL); + if (!spp) + goto out; + *spm = spp; + } + spp += (addr >> PAGE_SHIFT) & (SBP_L1_COUNT - 1); + + local_irq_disable(); + demote_segment_4k(mm, addr); + local_irq_enable(); + + i = (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + nw = PTRS_PER_PTE - i; + if (addr + (nw << PAGE_SHIFT) > next) + nw = (next - addr) >> PAGE_SHIFT; + + up_write(&mm->mmap_sem); + err = -EFAULT; + if (__copy_from_user(spp, map, nw * sizeof(u32))) + goto out2; + map += nw; + down_write(&mm->mmap_sem); + + /* now flush any existing HPTEs for the range */ + hpte_flush_range(mm, addr, nw); + } + if (limit > spt->maxaddr) + spt->maxaddr = limit; + err = 0; + out: + up_write(&mm->mmap_sem); + out2: + return err; +} diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c new file mode 100644 index 000000000..558e30cce --- /dev/null +++ b/arch/powerpc/mm/tlb_hash32.c @@ -0,0 +1,184 @@ +/* + * This file contains the routines for TLB flushing. + * On machines where the MMU uses a hash table to store virtual to + * physical translations, these routines flush entries from the + * hash table also. + * -- paulus + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/export.h> + +#include <asm/tlbflush.h> +#include <asm/tlb.h> + +#include "mmu_decl.h" + +/* + * Called when unmapping pages to flush entries from the TLB/hash table. + */ +void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr) +{ + unsigned long ptephys; + + if (Hash != 0) { + ptephys = __pa(ptep) & PAGE_MASK; + flush_hash_pages(mm->context.id, addr, ptephys, 1); + } +} +EXPORT_SYMBOL(flush_hash_entry); + +/* + * Called by ptep_set_access_flags, must flush on CPUs for which the + * DSI handler can't just "fixup" the TLB on a write fault + */ +void flush_tlb_page_nohash(struct vm_area_struct *vma, unsigned long addr) +{ + if (Hash != 0) + return; + _tlbie(addr); +} + +/* + * Called at the end of a mmu_gather operation to make sure the + * TLB flush is completely done. + */ +void tlb_flush(struct mmu_gather *tlb) +{ + if (Hash == 0) { + /* + * 603 needs to flush the whole TLB here since + * it doesn't use a hash table. + */ + _tlbia(); + } +} + +/* + * TLB flushing: + * + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes kernel pages + * + * since the hardware hash table functions as an extension of the + * tlb as far as the linux tables are concerned, flush it too. + * -- Cort + */ + +static void flush_range(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + pmd_t *pmd; + unsigned long pmd_end; + int count; + unsigned int ctx = mm->context.id; + + if (Hash == 0) { + _tlbia(); + return; + } + start &= PAGE_MASK; + if (start >= end) + return; + end = (end - 1) | ~PAGE_MASK; + pmd = pmd_offset(pud_offset(pgd_offset(mm, start), start), start); + for (;;) { + pmd_end = ((start + PGDIR_SIZE) & PGDIR_MASK) - 1; + if (pmd_end > end) + pmd_end = end; + if (!pmd_none(*pmd)) { + count = ((pmd_end - start) >> PAGE_SHIFT) + 1; + flush_hash_pages(ctx, start, pmd_val(*pmd), count); + } + if (pmd_end == end) + break; + start = pmd_end + 1; + ++pmd; + } +} + +/* + * Flush kernel TLB entries in the given range + */ +void flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + flush_range(&init_mm, start, end); +} +EXPORT_SYMBOL(flush_tlb_kernel_range); + +/* + * Flush all the (user) entries for the address space described by mm. + */ +void flush_tlb_mm(struct mm_struct *mm) +{ + struct vm_area_struct *mp; + + if (Hash == 0) { + _tlbia(); + return; + } + + /* + * It is safe to go down the mm's list of vmas when called + * from dup_mmap, holding mmap_sem. It would also be safe from + * unmap_region or exit_mmap, but not from vmtruncate on SMP - + * but it seems dup_mmap is the only SMP case which gets here. + */ + for (mp = mm->mmap; mp != NULL; mp = mp->vm_next) + flush_range(mp->vm_mm, mp->vm_start, mp->vm_end); +} +EXPORT_SYMBOL(flush_tlb_mm); + +void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + struct mm_struct *mm; + pmd_t *pmd; + + if (Hash == 0) { + _tlbie(vmaddr); + return; + } + mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm; + pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr); + if (!pmd_none(*pmd)) + flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1); +} +EXPORT_SYMBOL(flush_tlb_page); + +/* + * For each address in the range, find the pte for the address + * and check _PAGE_HASHPTE bit; if it is set, find and destroy + * the corresponding HPTE. + */ +void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) +{ + flush_range(vma->vm_mm, start, end); +} +EXPORT_SYMBOL(flush_tlb_range); + +void __init early_init_mmu(void) +{ +} diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c new file mode 100644 index 000000000..c522969f0 --- /dev/null +++ b/arch/powerpc/mm/tlb_hash64.c @@ -0,0 +1,256 @@ +/* + * This file contains the routines for flushing entries from the + * TLB and MMU hash table. + * + * Derived from arch/ppc64/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * Dave Engebretsen <engebret@us.ibm.com> + * Rework for PPC64 port. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/percpu.h> +#include <linux/hardirq.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/tlb.h> +#include <asm/bug.h> + +#include <trace/events/thp.h> + +DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); + +/* + * A linux PTE was changed and the corresponding hash table entry + * neesd to be flushed. This function will either perform the flush + * immediately or will batch it up if the current CPU has an active + * batch on it. + */ +void hpte_need_flush(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long pte, int huge) +{ + unsigned long vpn; + struct ppc64_tlb_batch *batch = &get_cpu_var(ppc64_tlb_batch); + unsigned long vsid; + unsigned int psize; + int ssize; + real_pte_t rpte; + int i; + + i = batch->index; + + /* Get page size (maybe move back to caller). + * + * NOTE: when using special 64K mappings in 4K environment like + * for SPEs, we obtain the page size from the slice, which thus + * must still exist (and thus the VMA not reused) at the time + * of this call + */ + if (huge) { +#ifdef CONFIG_HUGETLB_PAGE + psize = get_slice_psize(mm, addr); + /* Mask the address for the correct page size */ + addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1); +#else + BUG(); + psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */ +#endif + } else { + psize = pte_pagesize_index(mm, addr, pte); + /* Mask the address for the standard page size. If we + * have a 64k page kernel, but the hardware does not + * support 64k pages, this might be different from the + * hardware page size encoded in the slice table. */ + addr &= PAGE_MASK; + } + + + /* Build full vaddr */ + if (!is_kernel_addr(addr)) { + ssize = user_segment_size(addr); + vsid = get_vsid(mm->context.id, addr, ssize); + } else { + vsid = get_kernel_vsid(addr, mmu_kernel_ssize); + ssize = mmu_kernel_ssize; + } + WARN_ON(vsid == 0); + vpn = hpt_vpn(addr, vsid, ssize); + rpte = __real_pte(__pte(pte), ptep); + + /* + * Check if we have an active batch on this CPU. If not, just + * flush now and return. For now, we don global invalidates + * in that case, might be worth testing the mm cpu mask though + * and decide to use local invalidates instead... + */ + if (!batch->active) { + flush_hash_page(vpn, rpte, psize, ssize, 0); + put_cpu_var(ppc64_tlb_batch); + return; + } + + /* + * This can happen when we are in the middle of a TLB batch and + * we encounter memory pressure (eg copy_page_range when it tries + * to allocate a new pte). If we have to reclaim memory and end + * up scanning and resetting referenced bits then our batch context + * will change mid stream. + * + * We also need to ensure only one page size is present in a given + * batch + */ + if (i != 0 && (mm != batch->mm || batch->psize != psize || + batch->ssize != ssize)) { + __flush_tlb_pending(batch); + i = 0; + } + if (i == 0) { + batch->mm = mm; + batch->psize = psize; + batch->ssize = ssize; + } + batch->pte[i] = rpte; + batch->vpn[i] = vpn; + batch->index = ++i; + if (i >= PPC64_TLB_BATCH_NR) + __flush_tlb_pending(batch); + put_cpu_var(ppc64_tlb_batch); +} + +/* + * This function is called when terminating an mmu batch or when a batch + * is full. It will perform the flush of all the entries currently stored + * in a batch. + * + * Must be called from within some kind of spinlock/non-preempt region... + */ +void __flush_tlb_pending(struct ppc64_tlb_batch *batch) +{ + const struct cpumask *tmp; + int i, local = 0; + + i = batch->index; + tmp = cpumask_of(smp_processor_id()); + if (cpumask_equal(mm_cpumask(batch->mm), tmp)) + local = 1; + if (i == 1) + flush_hash_page(batch->vpn[0], batch->pte[0], + batch->psize, batch->ssize, local); + else + flush_hash_range(i, local); + batch->index = 0; +} + +void tlb_flush(struct mmu_gather *tlb) +{ + struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch); + + /* If there's a TLB batch pending, then we must flush it because the + * pages are going to be freed and we really don't want to have a CPU + * access a freed page because it has a stale TLB + */ + if (tlbbatch->index) + __flush_tlb_pending(tlbbatch); + + put_cpu_var(ppc64_tlb_batch); +} + +/** + * __flush_hash_table_range - Flush all HPTEs for a given address range + * from the hash table (and the TLB). But keeps + * the linux PTEs intact. + * + * @mm : mm_struct of the target address space (generally init_mm) + * @start : starting address + * @end : ending address (not included in the flush) + * + * This function is mostly to be used by some IO hotplug code in order + * to remove all hash entries from a given address range used to map IO + * space on a removed PCI-PCI bidge without tearing down the full mapping + * since 64K pages may overlap with other bridges when using 64K pages + * with 4K HW pages on IO space. + * + * Because of that usage pattern, it is implemented for small size rather + * than speed. + */ +void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + int hugepage_shift; + unsigned long flags; + + start = _ALIGN_DOWN(start, PAGE_SIZE); + end = _ALIGN_UP(end, PAGE_SIZE); + + BUG_ON(!mm->pgd); + + /* Note: Normally, we should only ever use a batch within a + * PTE locked section. This violates the rule, but will work + * since we don't actually modify the PTEs, we just flush the + * hash while leaving the PTEs intact (including their reference + * to being hashed). This is not the most performance oriented + * way to do things but is fine for our needs here. + */ + local_irq_save(flags); + arch_enter_lazy_mmu_mode(); + for (; start < end; start += PAGE_SIZE) { + pte_t *ptep = find_linux_pte_or_hugepte(mm->pgd, start, + &hugepage_shift); + unsigned long pte; + + if (ptep == NULL) + continue; + pte = pte_val(*ptep); + if (hugepage_shift) + trace_hugepage_invalidate(start, pte); + if (!(pte & _PAGE_HASHPTE)) + continue; + if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte))) + hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte); + else + hpte_need_flush(mm, start, ptep, pte, 0); + } + arch_leave_lazy_mmu_mode(); + local_irq_restore(flags); +} + +void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) +{ + pte_t *pte; + pte_t *start_pte; + unsigned long flags; + + addr = _ALIGN_DOWN(addr, PMD_SIZE); + /* Note: Normally, we should only ever use a batch within a + * PTE locked section. This violates the rule, but will work + * since we don't actually modify the PTEs, we just flush the + * hash while leaving the PTEs intact (including their reference + * to being hashed). This is not the most performance oriented + * way to do things but is fine for our needs here. + */ + local_irq_save(flags); + arch_enter_lazy_mmu_mode(); + start_pte = pte_offset_map(pmd, addr); + for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) { + unsigned long pteval = pte_val(*pte); + if (pteval & _PAGE_HASHPTE) + hpte_need_flush(mm, addr, pte, pteval, 0); + addr += PAGE_SIZE; + } + arch_leave_lazy_mmu_mode(); + local_irq_restore(flags); +} diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S new file mode 100644 index 000000000..89bf95bd6 --- /dev/null +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -0,0 +1,1218 @@ +/* + * Low level TLB miss handlers for Book3E + * + * Copyright (C) 2008-2009 + * Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/processor.h> +#include <asm/reg.h> +#include <asm/page.h> +#include <asm/mmu.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/cputable.h> +#include <asm/pgtable.h> +#include <asm/exception-64e.h> +#include <asm/ppc-opcode.h> +#include <asm/kvm_asm.h> +#include <asm/kvm_booke_hv_asm.h> + +#ifdef CONFIG_PPC_64K_PAGES +#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE+1) +#else +#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE) +#endif +#define VPTE_PUD_SHIFT (VPTE_PMD_SHIFT + PMD_INDEX_SIZE) +#define VPTE_PGD_SHIFT (VPTE_PUD_SHIFT + PUD_INDEX_SIZE) +#define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE) + +/********************************************************************** + * * + * TLB miss handling for Book3E with a bolted linear mapping * + * No virtual page table, no nested TLB misses * + * * + **********************************************************************/ + +/* + * Note that, unlike non-bolted handlers, TLB_EXFRAME is not + * modified by the TLB miss handlers themselves, since the TLB miss + * handler code will not itself cause a recursive TLB miss. + * + * TLB_EXFRAME will be modified when crit/mc/debug exceptions are + * entered/exited. + */ +.macro tlb_prolog_bolted intnum addr + mtspr SPRN_SPRG_GEN_SCRATCH,r12 + mfspr r12,SPRN_SPRG_TLB_EXFRAME + std r13,EX_TLB_R13(r12) + std r10,EX_TLB_R10(r12) + mfspr r13,SPRN_SPRG_PACA + + mfcr r10 + std r11,EX_TLB_R11(r12) +#ifdef CONFIG_KVM_BOOKE_HV +BEGIN_FTR_SECTION + mfspr r11, SPRN_SRR1 +END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) +#endif + DO_KVM \intnum, SPRN_SRR1 + std r16,EX_TLB_R16(r12) + mfspr r16,\addr /* get faulting address */ + std r14,EX_TLB_R14(r12) + ld r14,PACAPGD(r13) + std r15,EX_TLB_R15(r12) + std r10,EX_TLB_CR(r12) + TLB_MISS_PROLOG_STATS +.endm + +.macro tlb_epilog_bolted + ld r14,EX_TLB_CR(r12) + ld r10,EX_TLB_R10(r12) + ld r11,EX_TLB_R11(r12) + ld r13,EX_TLB_R13(r12) + mtcr r14 + ld r14,EX_TLB_R14(r12) + ld r15,EX_TLB_R15(r12) + TLB_MISS_RESTORE_STATS + ld r16,EX_TLB_R16(r12) + mfspr r12,SPRN_SPRG_GEN_SCRATCH +.endm + +/* Data TLB miss */ + START_EXCEPTION(data_tlb_miss_bolted) + tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR + + /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + /* We pre-test some combination of permissions to avoid double + * faults: + * + * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE + * ESR_ST is 0x00800000 + * _PAGE_BAP_SW is 0x00000010 + * So the shift is >> 19. This tests for supervisor writeability. + * If the page happens to be supervisor writeable and not user + * writeable, we will take a new fault later, but that should be + * a rare enough case. + * + * We also move ESR_ST in _PAGE_DIRTY position + * _PAGE_DIRTY is 0x00001000 so the shift is >> 11 + * + * MAS1 is preset for all we need except for TID that needs to + * be cleared for kernel translations + */ + + mfspr r11,SPRN_ESR + + srdi r15,r16,60 /* get region */ + rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 + bne- dtlb_miss_fault_bolted /* Bail if fault addr is invalid */ + + rlwinm r10,r11,32-19,27,27 + rlwimi r10,r11,32-16,19,19 + cmpwi r15,0 /* user vs kernel check */ + ori r10,r10,_PAGE_PRESENT + oris r11,r10,_PAGE_ACCESSED@h + + TLB_MISS_STATS_SAVE_INFO_BOLTED + bne tlb_miss_kernel_bolted + +tlb_miss_common_bolted: +/* + * This is the guts of the TLB miss handler for bolted-linear. + * We are entered with: + * + * r16 = faulting address + * r15 = crap (free to use) + * r14 = page table base + * r13 = PACA + * r11 = PTE permission mask + * r10 = crap (free to use) + */ + rldicl r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3 + cmpldi cr0,r14,0 + clrrdi r15,r15,3 + beq tlb_miss_fault_bolted /* No PGDIR, bail */ + +BEGIN_MMU_FTR_SECTION + /* Set the TLB reservation and search for existing entry. Then load + * the entry. + */ + PPC_TLBSRX_DOT(0,R16) + ldx r14,r14,r15 /* grab pgd entry */ + beq tlb_miss_done_bolted /* tlb exists already, bail */ +MMU_FTR_SECTION_ELSE + ldx r14,r14,r15 /* grab pgd entry */ +ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) + +#ifndef CONFIG_PPC_64K_PAGES + rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_fault_bolted /* Bad pgd entry or hugepage; bail */ + ldx r14,r14,r15 /* grab pud entry */ +#endif /* CONFIG_PPC_64K_PAGES */ + + rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_fault_bolted + ldx r14,r14,r15 /* Grab pmd entry */ + + rldicl r15,r16,64-PAGE_SHIFT+3,64-PTE_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_fault_bolted + ldx r14,r14,r15 /* Grab PTE, normal (!huge) page */ + + /* Check if required permissions are met */ + andc. r15,r11,r14 + rldicr r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT + bne- tlb_miss_fault_bolted + + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE need change if !base page size, not + * yet implemented for now + * MAS 2 : Defaults not useful, need to be redone + * MAS 3+7 : Needs to be done + */ + clrrdi r11,r16,12 /* Clear low crap in EA */ + clrldi r15,r15,12 /* Clear crap at the top */ + rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */ + rlwimi r15,r14,32-8,22,25 /* Move in U bits */ + mtspr SPRN_MAS2,r11 + andi. r11,r14,_PAGE_DIRTY + rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ + + /* Mask out SW and UW if !DIRTY (XXX optimize this !) */ + bne 1f + li r11,MAS3_SW|MAS3_UW + andc r15,r15,r11 +1: + mtspr SPRN_MAS7_MAS3,r15 + tlbwe + +tlb_miss_done_bolted: + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) + tlb_epilog_bolted + rfi + +itlb_miss_kernel_bolted: + li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */ + oris r11,r11,_PAGE_ACCESSED@h +tlb_miss_kernel_bolted: + mfspr r10,SPRN_MAS1 + ld r14,PACA_KERNELPGD(r13) + cmpldi cr0,r15,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + beq+ tlb_miss_common_bolted + +tlb_miss_fault_bolted: + /* We need to check if it was an instruction miss */ + andi. r10,r11,_PAGE_EXEC|_PAGE_BAP_SX + bne itlb_miss_fault_bolted +dtlb_miss_fault_bolted: + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + tlb_epilog_bolted + b exc_data_storage_book3e +itlb_miss_fault_bolted: + TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + tlb_epilog_bolted + b exc_instruction_storage_book3e + +/* Instruction TLB miss */ + START_EXCEPTION(instruction_tlb_miss_bolted) + tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0 + + rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 + srdi r15,r16,60 /* get region */ + TLB_MISS_STATS_SAVE_INFO_BOLTED + bne- itlb_miss_fault_bolted + + li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + + cmpldi cr0,r15,0 /* Check for user region */ + oris r11,r11,_PAGE_ACCESSED@h + beq tlb_miss_common_bolted + b itlb_miss_kernel_bolted + +#ifdef CONFIG_PPC_FSL_BOOK3E +/* + * TLB miss handling for e6500 and derivatives, using hardware tablewalk. + * + * Linear mapping is bolted: no virtual page table or nested TLB misses + * Indirect entries in TLB1, hardware loads resulting direct entries + * into TLB0 + * No HES or NV hint on TLB1, so we need to do software round-robin + * No tlbsrx. so we need a spinlock, and we have to deal + * with MAS-damage caused by tlbsx + * 4K pages only + */ + + START_EXCEPTION(instruction_tlb_miss_e6500) + tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0 + + ld r11,PACA_TCD_PTR(r13) + srdi. r15,r16,60 /* get region */ + ori r16,r16,1 + + TLB_MISS_STATS_SAVE_INFO_BOLTED + bne tlb_miss_kernel_e6500 /* user/kernel test */ + + b tlb_miss_common_e6500 + + START_EXCEPTION(data_tlb_miss_e6500) + tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR + + ld r11,PACA_TCD_PTR(r13) + srdi. r15,r16,60 /* get region */ + rldicr r16,r16,0,62 + + TLB_MISS_STATS_SAVE_INFO_BOLTED + bne tlb_miss_kernel_e6500 /* user vs kernel check */ + +/* + * This is the guts of the TLB miss handler for e6500 and derivatives. + * We are entered with: + * + * r16 = page of faulting address (low bit 0 if data, 1 if instruction) + * r15 = crap (free to use) + * r14 = page table base + * r13 = PACA + * r11 = tlb_per_core ptr + * r10 = crap (free to use) + */ +tlb_miss_common_e6500: + crmove cr2*4+2,cr0*4+2 /* cr2.eq != 0 if kernel address */ + +BEGIN_FTR_SECTION /* CPU_FTR_SMT */ + /* + * Search if we already have an indirect entry for that virtual + * address, and if we do, bail out. + * + * MAS6:IND should be already set based on MAS4 + */ +1: lbarx r15,0,r11 + lhz r10,PACAPACAINDEX(r13) + cmpdi r15,0 + cmpdi cr1,r15,1 /* set cr1.eq = 0 for non-recursive */ + addi r10,r10,1 + bne 2f + stbcx. r10,0,r11 + bne 1b +3: + .subsection 1 +2: cmpd cr1,r15,r10 /* recursive lock due to mcheck/crit/etc? */ + beq cr1,3b /* unlock will happen if cr1.eq = 0 */ + lbz r15,0(r11) + cmpdi r15,0 + bne 2b + b 1b + .previous + + /* + * Erratum A-008139 says that we can't use tlbwe to change + * an indirect entry in any way (including replacing or + * invalidating) if the other thread could be in the process + * of a lookup. The workaround is to invalidate the entry + * with tlbilx before overwriting. + */ + + lbz r15,TCD_ESEL_NEXT(r11) + rlwinm r10,r15,16,0xff0000 + oris r10,r10,MAS0_TLBSEL(1)@h + mtspr SPRN_MAS0,r10 + isync + tlbre + mfspr r15,SPRN_MAS1 + andis. r15,r15,MAS1_VALID@h + beq 5f + +BEGIN_FTR_SECTION_NESTED(532) + mfspr r10,SPRN_MAS8 + rlwinm r10,r10,0,0x80000fff /* tgs,tlpid -> sgs,slpid */ + mtspr SPRN_MAS5,r10 +END_FTR_SECTION_NESTED(CPU_FTR_EMB_HV,CPU_FTR_EMB_HV,532) + + mfspr r10,SPRN_MAS1 + rlwinm r15,r10,0,0x3fff0000 /* tid -> spid */ + rlwimi r15,r10,20,0x00000003 /* ind,ts -> sind,sas */ + mfspr r10,SPRN_MAS6 + mtspr SPRN_MAS6,r15 + + mfspr r15,SPRN_MAS2 + isync + tlbilxva 0,r15 + isync + + mtspr SPRN_MAS6,r10 + +5: +BEGIN_FTR_SECTION_NESTED(532) + li r10,0 + mtspr SPRN_MAS8,r10 + mtspr SPRN_MAS5,r10 +END_FTR_SECTION_NESTED(CPU_FTR_EMB_HV,CPU_FTR_EMB_HV,532) + + tlbsx 0,r16 + mfspr r10,SPRN_MAS1 + andis. r15,r10,MAS1_VALID@h + bne tlb_miss_done_e6500 +FTR_SECTION_ELSE + mfspr r10,SPRN_MAS1 +ALT_FTR_SECTION_END_IFSET(CPU_FTR_SMT) + + oris r10,r10,MAS1_VALID@h + beq cr2,4f + rlwinm r10,r10,0,16,1 /* Clear TID */ +4: mtspr SPRN_MAS1,r10 + + /* Now, we need to walk the page tables. First check if we are in + * range. + */ + rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 + bne- tlb_miss_fault_e6500 + + rldicl r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3 + cmpldi cr0,r14,0 + clrrdi r15,r15,3 + beq- tlb_miss_fault_e6500 /* No PGDIR, bail */ + ldx r14,r14,r15 /* grab pgd entry */ + + rldicl r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_fault_e6500 /* Bad pgd entry or hugepage; bail */ + ldx r14,r14,r15 /* grab pud entry */ + + rldicl r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3 + clrrdi r15,r15,3 + cmpdi cr0,r14,0 + bge tlb_miss_fault_e6500 + ldx r14,r14,r15 /* Grab pmd entry */ + + mfspr r10,SPRN_MAS0 + cmpdi cr0,r14,0 + bge tlb_miss_fault_e6500 + + /* Now we build the MAS for a 2M indirect page: + * + * MAS 0 : ESEL needs to be filled by software round-robin + * MAS 1 : Fully set up + * - PID already updated by caller if necessary + * - TSIZE for now is base ind page size always + * - TID already cleared if necessary + * MAS 2 : Default not 2M-aligned, need to be redone + * MAS 3+7 : Needs to be done + */ + + ori r14,r14,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) + mtspr SPRN_MAS7_MAS3,r14 + + clrrdi r15,r16,21 /* make EA 2M-aligned */ + mtspr SPRN_MAS2,r15 + + lbz r15,TCD_ESEL_NEXT(r11) + lbz r16,TCD_ESEL_MAX(r11) + lbz r14,TCD_ESEL_FIRST(r11) + rlwimi r10,r15,16,0x00ff0000 /* insert esel_next into MAS0 */ + addi r15,r15,1 /* increment esel_next */ + mtspr SPRN_MAS0,r10 + cmpw r15,r16 + iseleq r15,r14,r15 /* if next == last use first */ + stb r15,TCD_ESEL_NEXT(r11) + + tlbwe + +tlb_miss_done_e6500: + .macro tlb_unlock_e6500 +BEGIN_FTR_SECTION + beq cr1,1f /* no unlock if lock was recursively grabbed */ + li r15,0 + isync + stb r15,0(r11) +1: +END_FTR_SECTION_IFSET(CPU_FTR_SMT) + .endm + + tlb_unlock_e6500 + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) + tlb_epilog_bolted + rfi + +tlb_miss_kernel_e6500: + ld r14,PACA_KERNELPGD(r13) + cmpldi cr1,r15,8 /* Check for vmalloc region */ + beq+ cr1,tlb_miss_common_e6500 + +tlb_miss_fault_e6500: + tlb_unlock_e6500 + /* We need to check if it was an instruction miss */ + andi. r16,r16,1 + bne itlb_miss_fault_e6500 +dtlb_miss_fault_e6500: + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + tlb_epilog_bolted + b exc_data_storage_book3e +itlb_miss_fault_e6500: + TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + tlb_epilog_bolted + b exc_instruction_storage_book3e +#endif /* CONFIG_PPC_FSL_BOOK3E */ + +/********************************************************************** + * * + * TLB miss handling for Book3E with TLB reservation and HES support * + * * + **********************************************************************/ + + +/* Data TLB miss */ + START_EXCEPTION(data_tlb_miss) + TLB_MISS_PROLOG + + /* Now we handle the fault proper. We only save DEAR in normal + * fault case since that's the only interesting values here. + * We could probably also optimize by not saving SRR0/1 in the + * linear mapping case but I'll leave that for later + */ + mfspr r14,SPRN_ESR + mfspr r16,SPRN_DEAR /* get faulting address */ + srdi r15,r16,60 /* get region */ + cmpldi cr0,r15,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* The page tables are mapped virtually linear. At this point, though, + * we don't know whether we are trying to fault in a first level + * virtual address or a virtual page table address. We can get that + * from bit 0x1 of the region ID which we have set for a page table + */ + andi. r10,r15,0x1 + bne- virt_page_table_tlb_miss + + std r14,EX_TLB_ESR(r12); /* save ESR */ + std r16,EX_TLB_DEAR(r12); /* save DEAR */ + + /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */ + li r11,_PAGE_PRESENT + oris r11,r11,_PAGE_ACCESSED@h + + /* We do the user/kernel test for the PID here along with the RW test + */ + cmpldi cr0,r15,0 /* Check for user region */ + + /* We pre-test some combination of permissions to avoid double + * faults: + * + * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE + * ESR_ST is 0x00800000 + * _PAGE_BAP_SW is 0x00000010 + * So the shift is >> 19. This tests for supervisor writeability. + * If the page happens to be supervisor writeable and not user + * writeable, we will take a new fault later, but that should be + * a rare enough case. + * + * We also move ESR_ST in _PAGE_DIRTY position + * _PAGE_DIRTY is 0x00001000 so the shift is >> 11 + * + * MAS1 is preset for all we need except for TID that needs to + * be cleared for kernel translations + */ + rlwimi r11,r14,32-19,27,27 + rlwimi r11,r14,32-16,19,19 + beq normal_tlb_miss + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r15,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + beq+ normal_tlb_miss + + /* We got a crappy address, just fault with whatever DEAR and ESR + * are here + */ + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e + +/* Instruction TLB miss */ + START_EXCEPTION(instruction_tlb_miss) + TLB_MISS_PROLOG + + /* If we take a recursive fault, the second level handler may need + * to know whether we are handling a data or instruction fault in + * order to get to the right store fault handler. We provide that + * info by writing a crazy value in ESR in our exception frame + */ + li r14,-1 /* store to exception frame is done later */ + + /* Now we handle the fault proper. We only save DEAR in the non + * linear mapping case since we know the linear mapping case will + * not re-enter. We could indeed optimize and also not save SRR0/1 + * in the linear mapping case but I'll leave that for later + * + * Faulting address is SRR0 which is already in r16 + */ + srdi r15,r16,60 /* get region */ + cmpldi cr0,r15,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */ + oris r11,r11,_PAGE_ACCESSED@h + + cmpldi cr0,r15,0 /* Check for user region */ + std r14,EX_TLB_ESR(r12) /* write crazy -1 to frame */ + beq normal_tlb_miss + + li r11,_PAGE_PRESENT|_PAGE_BAP_SX /* Base perm */ + oris r11,r11,_PAGE_ACCESSED@h + /* XXX replace the RMW cycles with immediate loads + writes */ + mfspr r10,SPRN_MAS1 + cmpldi cr0,r15,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + beq+ normal_tlb_miss + + /* We got a crappy address, just fault */ + TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + +/* + * This is the guts of the first-level TLB miss handler for direct + * misses. We are entered with: + * + * r16 = faulting address + * r15 = region ID + * r14 = crap (free to use) + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = PTE permission mask + * r10 = crap (free to use) + */ +normal_tlb_miss: + /* So we first construct the page table address. We do that by + * shifting the bottom of the address (not the region ID) by + * PAGE_SHIFT-3, clearing the bottom 3 bits (get a PTE ptr) and + * or'ing the fourth high bit. + * + * NOTE: For 64K pages, we do things slightly differently in + * order to handle the weird page table format used by linux + */ + ori r10,r15,0x1 +#ifdef CONFIG_PPC_64K_PAGES + /* For the top bits, 16 bytes per PTE */ + rldicl r14,r16,64-(PAGE_SHIFT-4),PAGE_SHIFT-4+4 + /* Now create the bottom bits as 0 in position 0x8000 and + * the rest calculated for 8 bytes per PTE + */ + rldicl r15,r16,64-(PAGE_SHIFT-3),64-15 + /* Insert the bottom bits in */ + rlwimi r14,r15,0,16,31 +#else + rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4 +#endif + sldi r15,r10,60 + clrrdi r14,r14,3 + or r10,r15,r14 + +BEGIN_MMU_FTR_SECTION + /* Set the TLB reservation and search for existing entry. Then load + * the entry. + */ + PPC_TLBSRX_DOT(0,R16) + ld r14,0(r10) + beq normal_tlb_miss_done +MMU_FTR_SECTION_ELSE + ld r14,0(r10) +ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) + +finish_normal_tlb_miss: + /* Check if required permissions are met */ + andc. r15,r11,r14 + bne- normal_tlb_miss_access_fault + + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE need change if !base page size, not + * yet implemented for now + * MAS 2 : Defaults not useful, need to be redone + * MAS 3+7 : Needs to be done + * + * TODO: mix up code below for better scheduling + */ + clrrdi r11,r16,12 /* Clear low crap in EA */ + rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */ + mtspr SPRN_MAS2,r11 + + /* Check page size, if not standard, update MAS1 */ + rldicl r11,r14,64-8,64-8 +#ifdef CONFIG_PPC_64K_PAGES + cmpldi cr0,r11,BOOK3E_PAGESZ_64K +#else + cmpldi cr0,r11,BOOK3E_PAGESZ_4K +#endif + beq- 1f + mfspr r11,SPRN_MAS1 + rlwimi r11,r14,31,21,24 + rlwinm r11,r11,0,21,19 + mtspr SPRN_MAS1,r11 +1: + /* Move RPN in position */ + rldicr r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT + clrldi r15,r11,12 /* Clear crap at the top */ + rlwimi r15,r14,32-8,22,25 /* Move in U bits */ + rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ + + /* Mask out SW and UW if !DIRTY (XXX optimize this !) */ + andi. r11,r14,_PAGE_DIRTY + bne 1f + li r11,MAS3_SW|MAS3_UW + andc r15,r15,r11 +1: +BEGIN_MMU_FTR_SECTION + srdi r16,r15,32 + mtspr SPRN_MAS3,r15 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE + mtspr SPRN_MAS7_MAS3,r15 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) + + tlbwe + +normal_tlb_miss_done: + /* We don't bother with restoring DEAR or ESR since we know we are + * level 0 and just going back to userland. They are only needed + * if you are going to take an access fault + */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) + TLB_MISS_EPILOG_SUCCESS + rfi + +normal_tlb_miss_access_fault: + /* We need to check if it was an instruction miss */ + andi. r10,r11,_PAGE_EXEC + bne 1f + ld r14,EX_TLB_DEAR(r12) + ld r15,EX_TLB_ESR(r12) + mtspr SPRN_DEAR,r14 + mtspr SPRN_ESR,r15 + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e +1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + + +/* + * This is the guts of the second-level TLB miss handler for direct + * misses. We are entered with: + * + * r16 = virtual page table faulting address + * r15 = region (top 4 bits of address) + * r14 = crap (free to use) + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = crap (free to use) + * r10 = crap (free to use) + * + * Note that this should only ever be called as a second level handler + * with the current scheme when using SW load. + * That means we can always get the original fault DEAR at + * EX_TLB_DEAR-EX_TLB_SIZE(r12) + * + * It can be re-entered by the linear mapping miss handler. However, to + * avoid too much complication, it will restart the whole fault at level + * 0 so we don't care too much about clobbers + * + * XXX That code was written back when we couldn't clobber r14. We can now, + * so we could probably optimize things a bit + */ +virt_page_table_tlb_miss: + /* Are we hitting a kernel page table ? */ + andi. r10,r15,0x8 + + /* The cool thing now is that r10 contains 0 for user and 8 for kernel, + * and we happen to have the swapper_pg_dir at offset 8 from the user + * pgdir in the PACA :-). + */ + add r11,r10,r13 + + /* If kernel, we need to clear MAS1 TID */ + beq 1f + /* XXX replace the RMW cycles with immediate loads + writes */ + mfspr r10,SPRN_MAS1 + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 +1: +BEGIN_MMU_FTR_SECTION + /* Search if we already have a TLB entry for that virtual address, and + * if we do, bail out. + */ + PPC_TLBSRX_DOT(0,R16) + beq virt_page_table_tlb_miss_done +END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) + + /* Now, we need to walk the page tables. First check if we are in + * range. + */ + rldicl. r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4 + bne- virt_page_table_tlb_miss_fault + + /* Get the PGD pointer */ + ld r15,PACAPGD(r11) + cmpldi cr0,r15,0 + beq- virt_page_table_tlb_miss_fault + + /* Get to PGD entry */ + rldicl r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault + +#ifndef CONFIG_PPC_64K_PAGES + /* Get to PUD entry */ + rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Get to PMD entry */ + rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge virt_page_table_tlb_miss_fault + + /* Ok, we're all right, we can now create a kernel translation for + * a 4K or 64K page from r16 -> r15. + */ + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE for now is base page size always + * MAS 2 : Use defaults + * MAS 3+7 : Needs to be done + * + * So we only do MAS 2 and 3 for now... + */ + clrldi r11,r15,4 /* remove region ID from RPN */ + ori r10,r11,1 /* Or-in SR */ + +BEGIN_MMU_FTR_SECTION + srdi r16,r10,32 + mtspr SPRN_MAS3,r10 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE + mtspr SPRN_MAS7_MAS3,r10 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) + + tlbwe + +BEGIN_MMU_FTR_SECTION +virt_page_table_tlb_miss_done: + + /* We have overriden MAS2:EPN but currently our primary TLB miss + * handler will always restore it so that should not be an issue, + * if we ever optimize the primary handler to not write MAS2 on + * some cases, we'll have to restore MAS2:EPN here based on the + * original fault's DEAR. If we do that we have to modify the + * ITLB miss handler to also store SRR0 in the exception frame + * as DEAR. + * + * However, one nasty thing we did is we cleared the reservation + * (well, potentially we did). We do a trick here thus if we + * are not a level 0 exception (we interrupted the TLB miss) we + * offset the return address by -4 in order to replay the tlbsrx + * instruction there + */ + subf r10,r13,r12 + cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE + bne- 1f + ld r11,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) + addi r10,r11,-4 + std r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) +1: +END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) + /* Return to caller, normal case */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK); + TLB_MISS_EPILOG_SUCCESS + rfi + +virt_page_table_tlb_miss_fault: + /* If we fault here, things are a little bit tricky. We need to call + * either data or instruction store fault, and we need to retrieve + * the original fault address and ESR (for data). + * + * The thing is, we know that in normal circumstances, this is + * always called as a second level tlb miss for SW load or as a first + * level TLB miss for HW load, so we should be able to peek at the + * relevant information in the first exception frame in the PACA. + * + * However, we do need to double check that, because we may just hit + * a stray kernel pointer or a userland attack trying to hit those + * areas. If that is the case, we do a data fault. (We can't get here + * from an instruction tlb miss anyway). + * + * Note also that when going to a fault, we must unwind the previous + * level as well. Since we are doing that, we don't need to clear or + * restore the TLB reservation neither. + */ + subf r10,r13,r12 + cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE + bne- virt_page_table_tlb_miss_whacko_fault + + /* We dig the original DEAR and ESR from slot 0 */ + ld r15,EX_TLB_DEAR+PACA_EXTLB(r13) + ld r16,EX_TLB_ESR+PACA_EXTLB(r13) + + /* We check for the "special" ESR value for instruction faults */ + cmpdi cr0,r16,-1 + beq 1f + mtspr SPRN_DEAR,r15 + mtspr SPRN_ESR,r16 + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT); + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e +1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT); + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + +virt_page_table_tlb_miss_whacko_fault: + /* The linear fault will restart everything so ESR and DEAR will + * not have been clobbered, let's just fault with what we have + */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_FAULT); + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e + + +/************************************************************** + * * + * TLB miss handling for Book3E with hw page table support * + * * + **************************************************************/ + + +/* Data TLB miss */ + START_EXCEPTION(data_tlb_miss_htw) + TLB_MISS_PROLOG + + /* Now we handle the fault proper. We only save DEAR in normal + * fault case since that's the only interesting values here. + * We could probably also optimize by not saving SRR0/1 in the + * linear mapping case but I'll leave that for later + */ + mfspr r14,SPRN_ESR + mfspr r16,SPRN_DEAR /* get faulting address */ + srdi r11,r16,60 /* get region */ + cmpldi cr0,r11,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + cmpldi cr0,r11,0 /* Check for user region */ + ld r15,PACAPGD(r13) /* Load user pgdir */ + beq htw_tlb_miss + + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r11,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ + beq+ htw_tlb_miss + + /* We got a crappy address, just fault with whatever DEAR and ESR + * are here + */ + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e + +/* Instruction TLB miss */ + START_EXCEPTION(instruction_tlb_miss_htw) + TLB_MISS_PROLOG + + /* If we take a recursive fault, the second level handler may need + * to know whether we are handling a data or instruction fault in + * order to get to the right store fault handler. We provide that + * info by keeping a crazy value for ESR in r14 + */ + li r14,-1 /* store to exception frame is done later */ + + /* Now we handle the fault proper. We only save DEAR in the non + * linear mapping case since we know the linear mapping case will + * not re-enter. We could indeed optimize and also not save SRR0/1 + * in the linear mapping case but I'll leave that for later + * + * Faulting address is SRR0 which is already in r16 + */ + srdi r11,r16,60 /* get region */ + cmpldi cr0,r11,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + cmpldi cr0,r11,0 /* Check for user region */ + ld r15,PACAPGD(r13) /* Load user pgdir */ + beq htw_tlb_miss + + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r11,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ + beq+ htw_tlb_miss + + /* We got a crappy address, just fault */ + TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + + +/* + * This is the guts of the second-level TLB miss handler for direct + * misses. We are entered with: + * + * r16 = virtual page table faulting address + * r15 = PGD pointer + * r14 = ESR + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = crap (free to use) + * r10 = crap (free to use) + * + * It can be re-entered by the linear mapping miss handler. However, to + * avoid too much complication, it will save/restore things for us + */ +htw_tlb_miss: + /* Search if we already have a TLB entry for that virtual address, and + * if we do, bail out. + * + * MAS1:IND should be already set based on MAS4 + */ + PPC_TLBSRX_DOT(0,R16) + beq htw_tlb_miss_done + + /* Now, we need to walk the page tables. First check if we are in + * range. + */ + rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 + bne- htw_tlb_miss_fault + + /* Get the PGD pointer */ + cmpldi cr0,r15,0 + beq- htw_tlb_miss_fault + + /* Get to PGD entry */ + rldicl r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge htw_tlb_miss_fault + +#ifndef CONFIG_PPC_64K_PAGES + /* Get to PUD entry */ + rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge htw_tlb_miss_fault +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Get to PMD entry */ + rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpdi cr0,r15,0 + bge htw_tlb_miss_fault + + /* Ok, we're all right, we can now create an indirect entry for + * a 1M or 256M page. + * + * The last trick is now that because we use "half" pages for + * the HTW (1M IND is 2K and 256M IND is 32K) we need to account + * for an added LSB bit to the RPN. For 64K pages, there is no + * problem as we already use 32K arrays (half PTE pages), but for + * 4K page we need to extract a bit from the virtual address and + * insert it into the "PA52" bit of the RPN. + */ +#ifndef CONFIG_PPC_64K_PAGES + rlwimi r15,r16,32-9,20,20 +#endif + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE for now is base ind page size always + * MAS 2 : Use defaults + * MAS 3+7 : Needs to be done + */ +#ifdef CONFIG_PPC_64K_PAGES + ori r10,r15,(BOOK3E_PAGESZ_64K << MAS3_SPSIZE_SHIFT) +#else + ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) +#endif + +BEGIN_MMU_FTR_SECTION + srdi r16,r10,32 + mtspr SPRN_MAS3,r10 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE + mtspr SPRN_MAS7_MAS3,r10 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) + + tlbwe + +htw_tlb_miss_done: + /* We don't bother with restoring DEAR or ESR since we know we are + * level 0 and just going back to userland. They are only needed + * if you are going to take an access fault + */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK) + TLB_MISS_EPILOG_SUCCESS + rfi + +htw_tlb_miss_fault: + /* We need to check if it was an instruction miss. We know this + * though because r14 would contain -1 + */ + cmpdi cr0,r14,-1 + beq 1f + mtspr SPRN_DEAR,r16 + mtspr SPRN_ESR,r14 + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e +1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + +/* + * This is the guts of "any" level TLB miss handler for kernel linear + * mapping misses. We are entered with: + * + * + * r16 = faulting address + * r15 = crap (free to use) + * r14 = ESR (data) or -1 (instruction) + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = crap (free to use) + * r10 = crap (free to use) + * + * In addition we know that we will not re-enter, so in theory, we could + * use a simpler epilog not restoring SRR0/1 etc.. but we'll do that later. + * + * We also need to be careful about MAS registers here & TLB reservation, + * as we know we'll have clobbered them if we interrupt the main TLB miss + * handlers in which case we probably want to do a full restart at level + * 0 rather than saving / restoring the MAS. + * + * Note: If we care about performance of that core, we can easily shuffle + * a few things around + */ +tlb_load_linear: + /* For now, we assume the linear mapping is contiguous and stops at + * linear_map_top. We also assume the size is a multiple of 1G, thus + * we only use 1G pages for now. That might have to be changed in a + * final implementation, especially when dealing with hypervisors + */ + ld r11,PACATOC(r13) + ld r11,linear_map_top@got(r11) + ld r10,0(r11) + tovirt(10,10) + cmpld cr0,r16,r10 + bge tlb_load_linear_fault + + /* MAS1 need whole new setup. */ + li r15,(BOOK3E_PAGESZ_1GB<<MAS1_TSIZE_SHIFT) + oris r15,r15,MAS1_VALID@h /* MAS1 needs V and TSIZE */ + mtspr SPRN_MAS1,r15 + + /* Already somebody there ? */ + PPC_TLBSRX_DOT(0,R16) + beq tlb_load_linear_done + + /* Now we build the remaining MAS. MAS0 and 2 should be fine + * with their defaults, which leaves us with MAS 3 and 7. The + * mapping is linear, so we just take the address, clear the + * region bits, and or in the permission bits which are currently + * hard wired + */ + clrrdi r10,r16,30 /* 1G page index */ + clrldi r10,r10,4 /* clear region bits */ + ori r10,r10,MAS3_SR|MAS3_SW|MAS3_SX + +BEGIN_MMU_FTR_SECTION + srdi r16,r10,32 + mtspr SPRN_MAS3,r10 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE + mtspr SPRN_MAS7_MAS3,r10 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) + + tlbwe + +tlb_load_linear_done: + /* We use the "error" epilog for success as we do want to + * restore to the initial faulting context, whatever it was. + * We do that because we can't resume a fault within a TLB + * miss handler, due to MAS and TLB reservation being clobbered. + */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_LINEAR) + TLB_MISS_EPILOG_ERROR + rfi + +tlb_load_linear_fault: + /* We keep the DEAR and ESR around, this shouldn't have happened */ + cmpdi cr0,r14,-1 + beq 1f + TLB_MISS_EPILOG_ERROR_SPECIAL + b exc_data_storage_book3e +1: TLB_MISS_EPILOG_ERROR_SPECIAL + b exc_instruction_storage_book3e + + +#ifdef CONFIG_BOOK3E_MMU_TLB_STATS +.tlb_stat_inc: +1: ldarx r8,0,r9 + addi r8,r8,1 + stdcx. r8,0,r9 + bne- 1b + blr +#endif diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c new file mode 100644 index 000000000..cbd3d0698 --- /dev/null +++ b/arch/powerpc/mm/tlb_nohash.c @@ -0,0 +1,760 @@ +/* + * This file contains the routines for TLB flushing. + * On machines where the MMU does not use a hash table to store virtual to + * physical translations (ie, SW loaded TLBs or Book3E compilant processors, + * this does -not- include 603 however which shares the implementation with + * hash based processors) + * + * -- BenH + * + * Copyright 2008,2009 Ben Herrenschmidt <benh@kernel.crashing.org> + * IBM Corp. + * + * Derived from arch/ppc/mm/init.c: + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au) + * and Cort Dougan (PReP) (cort@cs.nmt.edu) + * Copyright (C) 1996 Paul Mackerras + * + * Derived from "arch/i386/mm/init.c" + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/preempt.h> +#include <linux/spinlock.h> +#include <linux/memblock.h> +#include <linux/of_fdt.h> +#include <linux/hugetlb.h> + +#include <asm/tlbflush.h> +#include <asm/tlb.h> +#include <asm/code-patching.h> +#include <asm/hugetlb.h> +#include <asm/paca.h> + +#include "mmu_decl.h" + +/* + * This struct lists the sw-supported page sizes. The hardawre MMU may support + * other sizes not listed here. The .ind field is only used on MMUs that have + * indirect page table entries. + */ +#ifdef CONFIG_PPC_BOOK3E_MMU +#ifdef CONFIG_PPC_FSL_BOOK3E +struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { + [MMU_PAGE_4K] = { + .shift = 12, + .enc = BOOK3E_PAGESZ_4K, + }, + [MMU_PAGE_2M] = { + .shift = 21, + .enc = BOOK3E_PAGESZ_2M, + }, + [MMU_PAGE_4M] = { + .shift = 22, + .enc = BOOK3E_PAGESZ_4M, + }, + [MMU_PAGE_16M] = { + .shift = 24, + .enc = BOOK3E_PAGESZ_16M, + }, + [MMU_PAGE_64M] = { + .shift = 26, + .enc = BOOK3E_PAGESZ_64M, + }, + [MMU_PAGE_256M] = { + .shift = 28, + .enc = BOOK3E_PAGESZ_256M, + }, + [MMU_PAGE_1G] = { + .shift = 30, + .enc = BOOK3E_PAGESZ_1GB, + }, +}; +#else +struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { + [MMU_PAGE_4K] = { + .shift = 12, + .ind = 20, + .enc = BOOK3E_PAGESZ_4K, + }, + [MMU_PAGE_16K] = { + .shift = 14, + .enc = BOOK3E_PAGESZ_16K, + }, + [MMU_PAGE_64K] = { + .shift = 16, + .ind = 28, + .enc = BOOK3E_PAGESZ_64K, + }, + [MMU_PAGE_1M] = { + .shift = 20, + .enc = BOOK3E_PAGESZ_1M, + }, + [MMU_PAGE_16M] = { + .shift = 24, + .ind = 36, + .enc = BOOK3E_PAGESZ_16M, + }, + [MMU_PAGE_256M] = { + .shift = 28, + .enc = BOOK3E_PAGESZ_256M, + }, + [MMU_PAGE_1G] = { + .shift = 30, + .enc = BOOK3E_PAGESZ_1GB, + }, +}; +#endif /* CONFIG_FSL_BOOKE */ + +static inline int mmu_get_tsize(int psize) +{ + return mmu_psize_defs[psize].enc; +} +#else +static inline int mmu_get_tsize(int psize) +{ + /* This isn't used on !Book3E for now */ + return 0; +} +#endif /* CONFIG_PPC_BOOK3E_MMU */ + +/* The variables below are currently only used on 64-bit Book3E + * though this will probably be made common with other nohash + * implementations at some point + */ +#ifdef CONFIG_PPC64 + +int mmu_linear_psize; /* Page size used for the linear mapping */ +int mmu_pte_psize; /* Page size used for PTE pages */ +int mmu_vmemmap_psize; /* Page size used for the virtual mem map */ +int book3e_htw_mode; /* HW tablewalk? Value is PPC_HTW_* */ +unsigned long linear_map_top; /* Top of linear mapping */ + + +/* + * Number of bytes to add to SPRN_SPRG_TLB_EXFRAME on crit/mcheck/debug + * exceptions. This is used for bolted and e6500 TLB miss handlers which + * do not modify this SPRG in the TLB miss code; for other TLB miss handlers, + * this is set to zero. + */ +int extlb_level_exc; + +#endif /* CONFIG_PPC64 */ + +#ifdef CONFIG_PPC_FSL_BOOK3E +/* next_tlbcam_idx is used to round-robin tlbcam entry assignment */ +DEFINE_PER_CPU(int, next_tlbcam_idx); +EXPORT_PER_CPU_SYMBOL(next_tlbcam_idx); +#endif + +/* + * Base TLB flushing operations: + * + * - flush_tlb_mm(mm) flushes the specified mm context TLB's + * - flush_tlb_page(vma, vmaddr) flushes one page + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes kernel pages + * + * - local_* variants of page and mm only apply to the current + * processor + */ + +/* + * These are the base non-SMP variants of page and mm flushing + */ +void local_flush_tlb_mm(struct mm_struct *mm) +{ + unsigned int pid; + + preempt_disable(); + pid = mm->context.id; + if (pid != MMU_NO_CONTEXT) + _tlbil_pid(pid); + preempt_enable(); +} +EXPORT_SYMBOL(local_flush_tlb_mm); + +void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + int tsize, int ind) +{ + unsigned int pid; + + preempt_disable(); + pid = mm ? mm->context.id : 0; + if (pid != MMU_NO_CONTEXT) + _tlbil_va(vmaddr, pid, tsize, ind); + preempt_enable(); +} + +void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + __local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, + mmu_get_tsize(mmu_virtual_psize), 0); +} +EXPORT_SYMBOL(local_flush_tlb_page); + +/* + * And here are the SMP non-local implementations + */ +#ifdef CONFIG_SMP + +static DEFINE_RAW_SPINLOCK(tlbivax_lock); + +static int mm_is_core_local(struct mm_struct *mm) +{ + return cpumask_subset(mm_cpumask(mm), + topology_thread_cpumask(smp_processor_id())); +} + +struct tlb_flush_param { + unsigned long addr; + unsigned int pid; + unsigned int tsize; + unsigned int ind; +}; + +static void do_flush_tlb_mm_ipi(void *param) +{ + struct tlb_flush_param *p = param; + + _tlbil_pid(p ? p->pid : 0); +} + +static void do_flush_tlb_page_ipi(void *param) +{ + struct tlb_flush_param *p = param; + + _tlbil_va(p->addr, p->pid, p->tsize, p->ind); +} + + +/* Note on invalidations and PID: + * + * We snapshot the PID with preempt disabled. At this point, it can still + * change either because: + * - our context is being stolen (PID -> NO_CONTEXT) on another CPU + * - we are invaliating some target that isn't currently running here + * and is concurrently acquiring a new PID on another CPU + * - some other CPU is re-acquiring a lost PID for this mm + * etc... + * + * However, this shouldn't be a problem as we only guarantee + * invalidation of TLB entries present prior to this call, so we + * don't care about the PID changing, and invalidating a stale PID + * is generally harmless. + */ + +void flush_tlb_mm(struct mm_struct *mm) +{ + unsigned int pid; + + preempt_disable(); + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + goto no_context; + if (!mm_is_core_local(mm)) { + struct tlb_flush_param p = { .pid = pid }; + /* Ignores smp_processor_id() even if set. */ + smp_call_function_many(mm_cpumask(mm), + do_flush_tlb_mm_ipi, &p, 1); + } + _tlbil_pid(pid); + no_context: + preempt_enable(); +} +EXPORT_SYMBOL(flush_tlb_mm); + +void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + int tsize, int ind) +{ + struct cpumask *cpu_mask; + unsigned int pid; + + /* + * This function as well as __local_flush_tlb_page() must only be called + * for user contexts. + */ + if (unlikely(WARN_ON(!mm))) + return; + + preempt_disable(); + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + goto bail; + cpu_mask = mm_cpumask(mm); + if (!mm_is_core_local(mm)) { + /* If broadcast tlbivax is supported, use it */ + if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) { + int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL); + if (lock) + raw_spin_lock(&tlbivax_lock); + _tlbivax_bcast(vmaddr, pid, tsize, ind); + if (lock) + raw_spin_unlock(&tlbivax_lock); + goto bail; + } else { + struct tlb_flush_param p = { + .pid = pid, + .addr = vmaddr, + .tsize = tsize, + .ind = ind, + }; + /* Ignores smp_processor_id() even if set in cpu_mask */ + smp_call_function_many(cpu_mask, + do_flush_tlb_page_ipi, &p, 1); + } + } + _tlbil_va(vmaddr, pid, tsize, ind); + bail: + preempt_enable(); +} + +void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ +#ifdef CONFIG_HUGETLB_PAGE + if (vma && is_vm_hugetlb_page(vma)) + flush_hugetlb_page(vma, vmaddr); +#endif + + __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, + mmu_get_tsize(mmu_virtual_psize), 0); +} +EXPORT_SYMBOL(flush_tlb_page); + +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_PPC_47x +void __init early_init_mmu_47x(void) +{ +#ifdef CONFIG_SMP + unsigned long root = of_get_flat_dt_root(); + if (of_get_flat_dt_prop(root, "cooperative-partition", NULL)) + mmu_clear_feature(MMU_FTR_USE_TLBIVAX_BCAST); +#endif /* CONFIG_SMP */ +} +#endif /* CONFIG_PPC_47x */ + +/* + * Flush kernel TLB entries in the given range + */ +void flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ +#ifdef CONFIG_SMP + preempt_disable(); + smp_call_function(do_flush_tlb_mm_ipi, NULL, 1); + _tlbil_pid(0); + preempt_enable(); +#else + _tlbil_pid(0); +#endif +} +EXPORT_SYMBOL(flush_tlb_kernel_range); + +/* + * Currently, for range flushing, we just do a full mm flush. This should + * be optimized based on a threshold on the size of the range, since + * some implementation can stack multiple tlbivax before a tlbsync but + * for now, we keep it that way + */ +void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) + +{ + flush_tlb_mm(vma->vm_mm); +} +EXPORT_SYMBOL(flush_tlb_range); + +void tlb_flush(struct mmu_gather *tlb) +{ + flush_tlb_mm(tlb->mm); +} + +/* + * Below are functions specific to the 64-bit variant of Book3E though that + * may change in the future + */ + +#ifdef CONFIG_PPC64 + +/* + * Handling of virtual linear page tables or indirect TLB entries + * flushing when PTE pages are freed + */ +void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address) +{ + int tsize = mmu_psize_defs[mmu_pte_psize].enc; + + if (book3e_htw_mode != PPC_HTW_NONE) { + unsigned long start = address & PMD_MASK; + unsigned long end = address + PMD_SIZE; + unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift; + + /* This isn't the most optimal, ideally we would factor out the + * while preempt & CPU mask mucking around, or even the IPI but + * it will do for now + */ + while (start < end) { + __flush_tlb_page(tlb->mm, start, tsize, 1); + start += size; + } + } else { + unsigned long rmask = 0xf000000000000000ul; + unsigned long rid = (address & rmask) | 0x1000000000000000ul; + unsigned long vpte = address & ~rmask; + +#ifdef CONFIG_PPC_64K_PAGES + vpte = (vpte >> (PAGE_SHIFT - 4)) & ~0xfffful; +#else + vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful; +#endif + vpte |= rid; + __flush_tlb_page(tlb->mm, vpte, tsize, 0); + } +} + +static void setup_page_sizes(void) +{ + unsigned int tlb0cfg; + unsigned int tlb0ps; + unsigned int eptcfg; + int i, psize; + +#ifdef CONFIG_PPC_FSL_BOOK3E + unsigned int mmucfg = mfspr(SPRN_MMUCFG); + int fsl_mmu = mmu_has_feature(MMU_FTR_TYPE_FSL_E); + + if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V1) { + unsigned int tlb1cfg = mfspr(SPRN_TLB1CFG); + unsigned int min_pg, max_pg; + + min_pg = (tlb1cfg & TLBnCFG_MINSIZE) >> TLBnCFG_MINSIZE_SHIFT; + max_pg = (tlb1cfg & TLBnCFG_MAXSIZE) >> TLBnCFG_MAXSIZE_SHIFT; + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + struct mmu_psize_def *def; + unsigned int shift; + + def = &mmu_psize_defs[psize]; + shift = def->shift; + + if (shift == 0 || shift & 1) + continue; + + /* adjust to be in terms of 4^shift Kb */ + shift = (shift - 10) >> 1; + + if ((shift >= min_pg) && (shift <= max_pg)) + def->flags |= MMU_PAGE_SIZE_DIRECT; + } + + goto out; + } + + if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) { + u32 tlb1cfg, tlb1ps; + + tlb0cfg = mfspr(SPRN_TLB0CFG); + tlb1cfg = mfspr(SPRN_TLB1CFG); + tlb1ps = mfspr(SPRN_TLB1PS); + eptcfg = mfspr(SPRN_EPTCFG); + + if ((tlb1cfg & TLBnCFG_IND) && (tlb0cfg & TLBnCFG_PT)) + book3e_htw_mode = PPC_HTW_E6500; + + /* + * We expect 4K subpage size and unrestricted indirect size. + * The lack of a restriction on indirect size is a Freescale + * extension, indicated by PSn = 0 but SPSn != 0. + */ + if (eptcfg != 2) + book3e_htw_mode = PPC_HTW_NONE; + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + struct mmu_psize_def *def = &mmu_psize_defs[psize]; + + if (tlb1ps & (1U << (def->shift - 10))) { + def->flags |= MMU_PAGE_SIZE_DIRECT; + + if (book3e_htw_mode && psize == MMU_PAGE_2M) + def->flags |= MMU_PAGE_SIZE_INDIRECT; + } + } + + goto out; + } +#endif + + tlb0cfg = mfspr(SPRN_TLB0CFG); + tlb0ps = mfspr(SPRN_TLB0PS); + eptcfg = mfspr(SPRN_EPTCFG); + + /* Look for supported direct sizes */ + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + struct mmu_psize_def *def = &mmu_psize_defs[psize]; + + if (tlb0ps & (1U << (def->shift - 10))) + def->flags |= MMU_PAGE_SIZE_DIRECT; + } + + /* Indirect page sizes supported ? */ + if ((tlb0cfg & TLBnCFG_IND) == 0 || + (tlb0cfg & TLBnCFG_PT) == 0) + goto out; + + book3e_htw_mode = PPC_HTW_IBM; + + /* Now, we only deal with one IND page size for each + * direct size. Hopefully all implementations today are + * unambiguous, but we might want to be careful in the + * future. + */ + for (i = 0; i < 3; i++) { + unsigned int ps, sps; + + sps = eptcfg & 0x1f; + eptcfg >>= 5; + ps = eptcfg & 0x1f; + eptcfg >>= 5; + if (!ps || !sps) + continue; + for (psize = 0; psize < MMU_PAGE_COUNT; psize++) { + struct mmu_psize_def *def = &mmu_psize_defs[psize]; + + if (ps == (def->shift - 10)) + def->flags |= MMU_PAGE_SIZE_INDIRECT; + if (sps == (def->shift - 10)) + def->ind = ps + 10; + } + } + +out: + /* Cleanup array and print summary */ + pr_info("MMU: Supported page sizes\n"); + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + struct mmu_psize_def *def = &mmu_psize_defs[psize]; + const char *__page_type_names[] = { + "unsupported", + "direct", + "indirect", + "direct & indirect" + }; + if (def->flags == 0) { + def->shift = 0; + continue; + } + pr_info(" %8ld KB as %s\n", 1ul << (def->shift - 10), + __page_type_names[def->flags & 0x3]); + } +} + +static void setup_mmu_htw(void) +{ + /* + * If we want to use HW tablewalk, enable it by patching the TLB miss + * handlers to branch to the one dedicated to it. + */ + + switch (book3e_htw_mode) { + case PPC_HTW_IBM: + patch_exception(0x1c0, exc_data_tlb_miss_htw_book3e); + patch_exception(0x1e0, exc_instruction_tlb_miss_htw_book3e); + break; +#ifdef CONFIG_PPC_FSL_BOOK3E + case PPC_HTW_E6500: + extlb_level_exc = EX_TLB_SIZE; + patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e); + patch_exception(0x1e0, exc_instruction_tlb_miss_e6500_book3e); + break; +#endif + } + pr_info("MMU: Book3E HW tablewalk %s\n", + book3e_htw_mode != PPC_HTW_NONE ? "enabled" : "not supported"); +} + +/* + * Early initialization of the MMU TLB code + */ +static void early_init_this_mmu(void) +{ + unsigned int mas4; + + /* Set MAS4 based on page table setting */ + + mas4 = 0x4 << MAS4_WIMGED_SHIFT; + switch (book3e_htw_mode) { + case PPC_HTW_E6500: + mas4 |= MAS4_INDD; + mas4 |= BOOK3E_PAGESZ_2M << MAS4_TSIZED_SHIFT; + mas4 |= MAS4_TLBSELD(1); + mmu_pte_psize = MMU_PAGE_2M; + break; + + case PPC_HTW_IBM: + mas4 |= MAS4_INDD; +#ifdef CONFIG_PPC_64K_PAGES + mas4 |= BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT; + mmu_pte_psize = MMU_PAGE_256M; +#else + mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT; + mmu_pte_psize = MMU_PAGE_1M; +#endif + break; + + case PPC_HTW_NONE: +#ifdef CONFIG_PPC_64K_PAGES + mas4 |= BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT; +#else + mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT; +#endif + mmu_pte_psize = mmu_virtual_psize; + break; + } + mtspr(SPRN_MAS4, mas4); + +#ifdef CONFIG_PPC_FSL_BOOK3E + if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { + unsigned int num_cams; + + /* use a quarter of the TLBCAM for bolted linear map */ + num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4; + linear_map_top = map_mem_in_cams(linear_map_top, num_cams); + } +#endif + + /* A sync won't hurt us after mucking around with + * the MMU configuration + */ + mb(); +} + +static void __init early_init_mmu_global(void) +{ + /* XXX This will have to be decided at runtime, but right + * now our boot and TLB miss code hard wires it. Ideally + * we should find out a suitable page size and patch the + * TLB miss code (either that or use the PACA to store + * the value we want) + */ + mmu_linear_psize = MMU_PAGE_1G; + + /* XXX This should be decided at runtime based on supported + * page sizes in the TLB, but for now let's assume 16M is + * always there and a good fit (which it probably is) + * + * Freescale booke only supports 4K pages in TLB0, so use that. + */ + if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) + mmu_vmemmap_psize = MMU_PAGE_4K; + else + mmu_vmemmap_psize = MMU_PAGE_16M; + + /* XXX This code only checks for TLB 0 capabilities and doesn't + * check what page size combos are supported by the HW. It + * also doesn't handle the case where a separate array holds + * the IND entries from the array loaded by the PT. + */ + /* Look for supported page sizes */ + setup_page_sizes(); + + /* Look for HW tablewalk support */ + setup_mmu_htw(); + +#ifdef CONFIG_PPC_FSL_BOOK3E + if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { + if (book3e_htw_mode == PPC_HTW_NONE) { + extlb_level_exc = EX_TLB_SIZE; + patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e); + patch_exception(0x1e0, + exc_instruction_tlb_miss_bolted_book3e); + } + } +#endif + + /* Set the global containing the top of the linear mapping + * for use by the TLB miss code + */ + linear_map_top = memblock_end_of_DRAM(); +} + +static void __init early_mmu_set_memory_limit(void) +{ +#ifdef CONFIG_PPC_FSL_BOOK3E + if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { + /* + * Limit memory so we dont have linear faults. + * Unlike memblock_set_current_limit, which limits + * memory available during early boot, this permanently + * reduces the memory available to Linux. We need to + * do this because highmem is not supported on 64-bit. + */ + memblock_enforce_memory_limit(linear_map_top); + } +#endif + + memblock_set_current_limit(linear_map_top); +} + +/* boot cpu only */ +void __init early_init_mmu(void) +{ + early_init_mmu_global(); + early_init_this_mmu(); + early_mmu_set_memory_limit(); +} + +void early_init_mmu_secondary(void) +{ + early_init_this_mmu(); +} + +void setup_initial_memory_limit(phys_addr_t first_memblock_base, + phys_addr_t first_memblock_size) +{ + /* On non-FSL Embedded 64-bit, we adjust the RMA size to match + * the bolted TLB entry. We know for now that only 1G + * entries are supported though that may eventually + * change. + * + * on FSL Embedded 64-bit, we adjust the RMA size to match the + * first bolted TLB entry size. We still limit max to 1G even if + * the TLB could cover more. This is due to what the early init + * code is setup to do. + * + * We crop it to the size of the first MEMBLOCK to + * avoid going over total available memory just in case... + */ +#ifdef CONFIG_PPC_FSL_BOOK3E + if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { + unsigned long linear_sz; + linear_sz = calc_cam_sz(first_memblock_size, PAGE_OFFSET, + first_memblock_base); + ppc64_rma_size = min_t(u64, linear_sz, 0x40000000); + } else +#endif + ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000); + + /* Finally limit subsequent allocations */ + memblock_set_current_limit(first_memblock_base + ppc64_rma_size); +} +#else /* ! CONFIG_PPC64 */ +void __init early_init_mmu(void) +{ +#ifdef CONFIG_PPC_47x + early_init_mmu_47x(); +#endif +} +#endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S new file mode 100644 index 000000000..43ff3c797 --- /dev/null +++ b/arch/powerpc/mm/tlb_nohash_low.S @@ -0,0 +1,426 @@ +/* + * This file contains low-level functions for performing various + * types of TLB invalidations on various processors with no hash + * table. + * + * This file implements the following functions for all no-hash + * processors. Some aren't implemented for some variants. Some + * are inline in tlbflush.h + * + * - tlbil_va + * - tlbil_pid + * - tlbil_all + * - tlbivax_bcast + * + * Code mostly moved over from misc_32.S + * + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * Partially rewritten by Cort Dougan (cort@cs.nmt.edu) + * Paul Mackerras, Kumar Gala and Benjamin Herrenschmidt. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <asm/reg.h> +#include <asm/page.h> +#include <asm/cputable.h> +#include <asm/mmu.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/processor.h> +#include <asm/bug.h> + +#if defined(CONFIG_40x) + +/* + * 40x implementation needs only tlbil_va + */ +_GLOBAL(__tlbil_va) + /* We run the search with interrupts disabled because we have to change + * the PID and I don't want to preempt when that happens. + */ + mfmsr r5 + mfspr r6,SPRN_PID + wrteei 0 + mtspr SPRN_PID,r4 + tlbsx. r3, 0, r3 + mtspr SPRN_PID,r6 + wrtee r5 + bne 1f + sync + /* There are only 64 TLB entries, so r3 < 64, which means bit 25 is + * clear. Since 25 is the V bit in the TLB_TAG, loading this value + * will invalidate the TLB entry. */ + tlbwe r3, r3, TLB_TAG + isync +1: blr + +#elif defined(CONFIG_8xx) + +/* + * Nothing to do for 8xx, everything is inline + */ + +#elif defined(CONFIG_44x) /* Includes 47x */ + +/* + * 440 implementation uses tlbsx/we for tlbil_va and a full sweep + * of the TLB for everything else. + */ +_GLOBAL(__tlbil_va) + mfspr r5,SPRN_MMUCR + mfmsr r10 + + /* + * We write 16 bits of STID since 47x supports that much, we + * will never be passed out of bounds values on 440 (hopefully) + */ + rlwimi r5,r4,0,16,31 + + /* We have to run the search with interrupts disabled, otherwise + * an interrupt which causes a TLB miss can clobber the MMUCR + * between the mtspr and the tlbsx. + * + * Critical and Machine Check interrupts take care of saving + * and restoring MMUCR, so only normal interrupts have to be + * taken care of. + */ + wrteei 0 + mtspr SPRN_MMUCR,r5 + tlbsx. r6,0,r3 + bne 10f + sync +BEGIN_MMU_FTR_SECTION + b 2f +END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x) + /* On 440 There are only 64 TLB entries, so r3 < 64, which means bit + * 22, is clear. Since 22 is the V bit in the TLB_PAGEID, loading this + * value will invalidate the TLB entry. + */ + tlbwe r6,r6,PPC44x_TLB_PAGEID + isync +10: wrtee r10 + blr +2: +#ifdef CONFIG_PPC_47x + oris r7,r6,0x8000 /* specify way explicitely */ + clrrwi r4,r3,12 /* get an EPN for the hashing with V = 0 */ + ori r4,r4,PPC47x_TLBE_SIZE + tlbwe r4,r7,0 /* write it */ + isync + wrtee r10 + blr +#else /* CONFIG_PPC_47x */ +1: trap + EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0; +#endif /* !CONFIG_PPC_47x */ + +_GLOBAL(_tlbil_all) +_GLOBAL(_tlbil_pid) +BEGIN_MMU_FTR_SECTION + b 2f +END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x) + li r3,0 + sync + + /* Load high watermark */ + lis r4,tlb_44x_hwater@ha + lwz r5,tlb_44x_hwater@l(r4) + +1: tlbwe r3,r3,PPC44x_TLB_PAGEID + addi r3,r3,1 + cmpw 0,r3,r5 + ble 1b + + isync + blr +2: +#ifdef CONFIG_PPC_47x + /* 476 variant. There's not simple way to do this, hopefully we'll + * try to limit the amount of such full invalidates + */ + mfmsr r11 /* Interrupts off */ + wrteei 0 + li r3,-1 /* Current set */ + lis r10,tlb_47x_boltmap@h + ori r10,r10,tlb_47x_boltmap@l + lis r7,0x8000 /* Specify way explicitely */ + + b 9f /* For each set */ + +1: li r9,4 /* Number of ways */ + li r4,0 /* Current way */ + li r6,0 /* Default entry value 0 */ + andi. r0,r8,1 /* Check if way 0 is bolted */ + mtctr r9 /* Load way counter */ + bne- 3f /* Bolted, skip loading it */ + +2: /* For each way */ + or r5,r3,r4 /* Make way|index for tlbre */ + rlwimi r5,r5,16,8,15 /* Copy index into position */ + tlbre r6,r5,0 /* Read entry */ +3: addis r4,r4,0x2000 /* Next way */ + andi. r0,r6,PPC47x_TLB0_VALID /* Valid entry ? */ + beq 4f /* Nope, skip it */ + rlwimi r7,r5,0,1,2 /* Insert way number */ + rlwinm r6,r6,0,21,19 /* Clear V */ + tlbwe r6,r7,0 /* Write it */ +4: bdnz 2b /* Loop for each way */ + srwi r8,r8,1 /* Next boltmap bit */ +9: cmpwi cr1,r3,255 /* Last set done ? */ + addi r3,r3,1 /* Next set */ + beq cr1,1f /* End of loop */ + andi. r0,r3,0x1f /* Need to load a new boltmap word ? */ + bne 1b /* No, loop */ + lwz r8,0(r10) /* Load boltmap entry */ + addi r10,r10,4 /* Next word */ + b 1b /* Then loop */ +1: isync /* Sync shadows */ + wrtee r11 +#else /* CONFIG_PPC_47x */ +1: trap + EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0; +#endif /* !CONFIG_PPC_47x */ + blr + +#ifdef CONFIG_PPC_47x + +/* + * _tlbivax_bcast is only on 47x. We don't bother doing a runtime + * check though, it will blow up soon enough if we mistakenly try + * to use it on a 440. + */ +_GLOBAL(_tlbivax_bcast) + mfspr r5,SPRN_MMUCR + mfmsr r10 + rlwimi r5,r4,0,16,31 + wrteei 0 + mtspr SPRN_MMUCR,r5 + isync + PPC_TLBIVAX(0, R3) + isync + eieio + tlbsync +BEGIN_FTR_SECTION + b 1f +END_FTR_SECTION_IFSET(CPU_FTR_476_DD2) + sync + wrtee r10 + blr +/* + * DD2 HW could hang if in instruction fetch happens before msync completes. + * Touch enough instruction cache lines to ensure cache hits + */ +1: mflr r9 + bl 2f +2: mflr r6 + li r7,32 + PPC_ICBT(0,R6,R7) /* touch next cache line */ + add r6,r6,r7 + PPC_ICBT(0,R6,R7) /* touch next cache line */ + add r6,r6,r7 + PPC_ICBT(0,R6,R7) /* touch next cache line */ + sync + nop + nop + nop + nop + nop + nop + nop + nop + mtlr r9 + wrtee r10 + blr +#endif /* CONFIG_PPC_47x */ + +#elif defined(CONFIG_FSL_BOOKE) +/* + * FSL BookE implementations. + * + * Since feature sections are using _SECTION_ELSE we need + * to have the larger code path before the _SECTION_ELSE + */ + +/* + * Flush MMU TLB on the local processor + */ +_GLOBAL(_tlbil_all) +BEGIN_MMU_FTR_SECTION + li r3,(MMUCSR0_TLBFI)@l + mtspr SPRN_MMUCSR0, r3 +1: + mfspr r3,SPRN_MMUCSR0 + andi. r3,r3,MMUCSR0_TLBFI@l + bne 1b +MMU_FTR_SECTION_ELSE + PPC_TLBILX_ALL(0,R0) +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX) + msync + isync + blr + +_GLOBAL(_tlbil_pid) +BEGIN_MMU_FTR_SECTION + slwi r3,r3,16 + mfmsr r10 + wrteei 0 + mfspr r4,SPRN_MAS6 /* save MAS6 */ + mtspr SPRN_MAS6,r3 + PPC_TLBILX_PID(0,R0) + mtspr SPRN_MAS6,r4 /* restore MAS6 */ + wrtee r10 +MMU_FTR_SECTION_ELSE + li r3,(MMUCSR0_TLBFI)@l + mtspr SPRN_MMUCSR0, r3 +1: + mfspr r3,SPRN_MMUCSR0 + andi. r3,r3,MMUCSR0_TLBFI@l + bne 1b +ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBILX) + msync + isync + blr + +/* + * Flush MMU TLB for a particular address, but only on the local processor + * (no broadcast) + */ +_GLOBAL(__tlbil_va) + mfmsr r10 + wrteei 0 + slwi r4,r4,16 + ori r4,r4,(MAS6_ISIZE(BOOK3E_PAGESZ_4K))@l + mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ +BEGIN_MMU_FTR_SECTION + tlbsx 0,r3 + mfspr r4,SPRN_MAS1 /* check valid */ + andis. r3,r4,MAS1_VALID@h + beq 1f + rlwinm r4,r4,0,1,31 + mtspr SPRN_MAS1,r4 + tlbwe +MMU_FTR_SECTION_ELSE + PPC_TLBILX_VA(0,R3) +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX) + msync + isync +1: wrtee r10 + blr +#elif defined(CONFIG_PPC_BOOK3E) +/* + * New Book3E (>= 2.06) implementation + * + * Note: We may be able to get away without the interrupt masking stuff + * if we save/restore MAS6 on exceptions that might modify it + */ +_GLOBAL(_tlbil_pid) + slwi r4,r3,MAS6_SPID_SHIFT + mfmsr r10 + wrteei 0 + mtspr SPRN_MAS6,r4 + PPC_TLBILX_PID(0,R0) + wrtee r10 + msync + isync + blr + +_GLOBAL(_tlbil_pid_noind) + slwi r4,r3,MAS6_SPID_SHIFT + mfmsr r10 + ori r4,r4,MAS6_SIND + wrteei 0 + mtspr SPRN_MAS6,r4 + PPC_TLBILX_PID(0,R0) + wrtee r10 + msync + isync + blr + +_GLOBAL(_tlbil_all) + PPC_TLBILX_ALL(0,R0) + msync + isync + blr + +_GLOBAL(_tlbil_va) + mfmsr r10 + wrteei 0 + cmpwi cr0,r6,0 + slwi r4,r4,MAS6_SPID_SHIFT + rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK + beq 1f + rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND +1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ + PPC_TLBILX_VA(0,R3) + msync + isync + wrtee r10 + blr + +_GLOBAL(_tlbivax_bcast) + mfmsr r10 + wrteei 0 + cmpwi cr0,r6,0 + slwi r4,r4,MAS6_SPID_SHIFT + rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK + beq 1f + rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND +1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ + PPC_TLBIVAX(0,R3) + eieio + tlbsync + sync + wrtee r10 + blr + +_GLOBAL(set_context) +#ifdef CONFIG_BDI_SWITCH + /* Context switch the PTE pointer for the Abatron BDI2000. + * The PGDIR is the second parameter. + */ + lis r5, abatron_pteptrs@h + ori r5, r5, abatron_pteptrs@l + stw r4, 0x4(r5) +#endif + mtspr SPRN_PID,r3 + isync /* Force context change */ + blr +#else +#error Unsupported processor type ! +#endif + +#if defined(CONFIG_PPC_FSL_BOOK3E) +/* + * extern void loadcam_entry(unsigned int index) + * + * Load TLBCAM[index] entry in to the L2 CAM MMU + */ +_GLOBAL(loadcam_entry) + mflr r5 + LOAD_REG_ADDR_PIC(r4, TLBCAM) + mtlr r5 + mulli r5,r3,TLBCAM_SIZE + add r3,r5,r4 + lwz r4,TLBCAM_MAS0(r3) + mtspr SPRN_MAS0,r4 + lwz r4,TLBCAM_MAS1(r3) + mtspr SPRN_MAS1,r4 + PPC_LL r4,TLBCAM_MAS2(r3) + mtspr SPRN_MAS2,r4 + lwz r4,TLBCAM_MAS3(r3) + mtspr SPRN_MAS3,r4 +BEGIN_MMU_FTR_SECTION + lwz r4,TLBCAM_MAS7(r3) + mtspr SPRN_MAS7,r4 +END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS) + isync + tlbwe + isync + blr +#endif diff --git a/arch/powerpc/mm/vphn.c b/arch/powerpc/mm/vphn.c new file mode 100644 index 000000000..5f8ef50e5 --- /dev/null +++ b/arch/powerpc/mm/vphn.c @@ -0,0 +1,70 @@ +#include <asm/byteorder.h> +#include "vphn.h" + +/* + * The associativity domain numbers are returned from the hypervisor as a + * stream of mixed 16-bit and 32-bit fields. The stream is terminated by the + * special value of "all ones" (aka. 0xffff) and its size may not exceed 48 + * bytes. + * + * --- 16-bit fields --> + * _________________________ + * | 0 | 1 | 2 | 3 | be_packed[0] + * ------+-----+-----+------ + * _________________________ + * | 4 | 5 | 6 | 7 | be_packed[1] + * ------------------------- + * ... + * _________________________ + * | 20 | 21 | 22 | 23 | be_packed[5] + * ------------------------- + * + * Convert to the sequence they would appear in the ibm,associativity property. + */ +int vphn_unpack_associativity(const long *packed, __be32 *unpacked) +{ + __be64 be_packed[VPHN_REGISTER_COUNT]; + int i, nr_assoc_doms = 0; + const __be16 *field = (const __be16 *) be_packed; + u16 last = 0; + bool is_32bit = false; + +#define VPHN_FIELD_UNUSED (0xffff) +#define VPHN_FIELD_MSB (0x8000) +#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB) + + /* Let's fix the values returned by plpar_hcall9() */ + for (i = 0; i < VPHN_REGISTER_COUNT; i++) + be_packed[i] = cpu_to_be64(packed[i]); + + for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) { + u16 new = be16_to_cpup(field++); + + if (is_32bit) { + /* Let's concatenate the 16 bits of this field to the + * 15 lower bits of the previous field + */ + unpacked[++nr_assoc_doms] = + cpu_to_be32(last << 16 | new); + is_32bit = false; + } else if (new == VPHN_FIELD_UNUSED) + /* This is the list terminator */ + break; + else if (new & VPHN_FIELD_MSB) { + /* Data is in the lower 15 bits of this field */ + unpacked[++nr_assoc_doms] = + cpu_to_be32(new & VPHN_FIELD_MASK); + } else { + /* Data is in the lower 15 bits of this field + * concatenated with the next 16 bit field + */ + last = new; + is_32bit = true; + } + } + + /* The first cell contains the length of the property */ + unpacked[0] = cpu_to_be32(nr_assoc_doms); + + return nr_assoc_doms; +} diff --git a/arch/powerpc/mm/vphn.h b/arch/powerpc/mm/vphn.h new file mode 100644 index 000000000..fe8b7805b --- /dev/null +++ b/arch/powerpc/mm/vphn.h @@ -0,0 +1,16 @@ +#ifndef _ARCH_POWERPC_MM_VPHN_H_ +#define _ARCH_POWERPC_MM_VPHN_H_ + +/* The H_HOME_NODE_ASSOCIATIVITY h_call returns 6 64-bit registers. + */ +#define VPHN_REGISTER_COUNT 6 + +/* + * 6 64-bit registers unpacked into up to 24 be32 associativity values. To + * form the complete property we have to add the length in the first cell. + */ +#define VPHN_ASSOC_BUFSIZE (VPHN_REGISTER_COUNT*sizeof(u64)/sizeof(u16) + 1) + +extern int vphn_unpack_associativity(const long *packed, __be32 *unpacked); + +#endif |