Diffstat (limited to 'arch/s390/mm/pgtable.c')
-rw-r--r-- | arch/s390/mm/pgtable.c | 1549 |
1 file changed, 380 insertions(+), 1169 deletions(-)
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 510982788..4324b87f9 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -24,591 +24,397 @@
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
 
-unsigned long *crst_table_alloc(struct mm_struct *mm)
-{
-	struct page *page = alloc_pages(GFP_KERNEL, 2);
-
-	if (!page)
-		return NULL;
-	return (unsigned long *) page_to_phys(page);
+static inline pte_t ptep_flush_direct(struct mm_struct *mm,
+				      unsigned long addr, pte_t *ptep)
+{
+	int active, count;
+	pte_t old;
+
+	old = *ptep;
+	if (unlikely(pte_val(old) & _PAGE_INVALID))
+		return old;
+	active = (mm == current->active_mm) ? 1 : 0;
+	count = atomic_add_return(0x10000, &mm->context.attach_count);
+	if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active &&
+	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
+		__ptep_ipte_local(addr, ptep);
+	else
+		__ptep_ipte(addr, ptep);
+	atomic_sub(0x10000, &mm->context.attach_count);
+	return old;
 }
 
-void crst_table_free(struct mm_struct *mm, unsigned long *table)
+static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
+				    unsigned long addr, pte_t *ptep)
 {
-	free_pages((unsigned long) table, 2);
+	int active, count;
+	pte_t old;
+
+	old = *ptep;
+	if (unlikely(pte_val(old) & _PAGE_INVALID))
+		return old;
+	active = (mm == current->active_mm) ? 1 : 0;
+	count = atomic_add_return(0x10000, &mm->context.attach_count);
+	if ((count & 0xffff) <= active) {
+		pte_val(*ptep) |= _PAGE_INVALID;
+		mm->context.flush_mm = 1;
+	} else
+		__ptep_ipte(addr, ptep);
+	atomic_sub(0x10000, &mm->context.attach_count);
+	return old;
 }
 
-static void __crst_table_upgrade(void *arg)
+static inline pgste_t pgste_get_lock(pte_t *ptep)
 {
-	struct mm_struct *mm = arg;
+	unsigned long new = 0;
+#ifdef CONFIG_PGSTE
+	unsigned long old;
 
-	if (current->active_mm == mm) {
-		clear_user_asce();
-		set_user_asce(mm);
-	}
-	__tlb_flush_local();
+	preempt_disable();
+	asm(
+		"	lg	%0,%2\n"
+		"0:	lgr	%1,%0\n"
+		"	nihh	%0,0xff7f\n"	/* clear PCL bit in old */
+		"	oihh	%1,0x0080\n"	/* set PCL bit in new */
+		"	csg	%0,%1,%2\n"
+		"	jl	0b\n"
+		: "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE])
+		: "Q" (ptep[PTRS_PER_PTE]) : "cc", "memory");
+#endif
+	return __pgste(new);
 }
 
-int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
+static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste)
 {
-	unsigned long *table, *pgd;
-	unsigned long entry;
-	int flush;
-
-	BUG_ON(limit > TASK_MAX_SIZE);
-	flush = 0;
-repeat:
-	table = crst_table_alloc(mm);
-	if (!table)
-		return -ENOMEM;
-	spin_lock_bh(&mm->page_table_lock);
-	if (mm->context.asce_limit < limit) {
-		pgd = (unsigned long *) mm->pgd;
-		if (mm->context.asce_limit <= (1UL << 31)) {
-			entry = _REGION3_ENTRY_EMPTY;
-			mm->context.asce_limit = 1UL << 42;
-			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
-						_ASCE_USER_BITS |
-						_ASCE_TYPE_REGION3;
-		} else {
-			entry = _REGION2_ENTRY_EMPTY;
-			mm->context.asce_limit = 1UL << 53;
-			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
-						_ASCE_USER_BITS |
-						_ASCE_TYPE_REGION2;
-		}
-		crst_table_init(table, entry);
-		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
-		mm->pgd = (pgd_t *) table;
-		mm->task_size = mm->context.asce_limit;
-		table = NULL;
-		flush = 1;
-	}
-	spin_unlock_bh(&mm->page_table_lock);
-	if (table)
-		crst_table_free(mm, table);
-	if (mm->context.asce_limit < limit)
-		goto repeat;
-	if (flush)
-		on_each_cpu(__crst_table_upgrade, mm, 0);
-	return 0;
+#ifdef CONFIG_PGSTE
+	asm(
+		"	nihh	%1,0xff7f\n"	/* clear PCL bit */
+		"	stg	%1,%0\n"
+		: "=Q" (ptep[PTRS_PER_PTE])
+		: "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE])
+		: "cc", "memory");
+	preempt_enable();
+#endif
 }
 
-void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
+static inline pgste_t pgste_get(pte_t *ptep)
 {
-	pgd_t *pgd;
-
-	if (current->active_mm == mm) {
-		clear_user_asce();
-		__tlb_flush_mm(mm);
-	}
-	while (mm->context.asce_limit > limit) {
-		pgd = mm->pgd;
-		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
-		case _REGION_ENTRY_TYPE_R2:
-			mm->context.asce_limit = 1UL << 42;
-			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
-						_ASCE_USER_BITS |
-						_ASCE_TYPE_REGION3;
-			break;
-		case _REGION_ENTRY_TYPE_R3:
-			mm->context.asce_limit = 1UL << 31;
-			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
-						_ASCE_USER_BITS |
-						_ASCE_TYPE_SEGMENT;
-			break;
-		default:
-			BUG();
-		}
-		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
-		mm->task_size = mm->context.asce_limit;
-		crst_table_free(mm, (unsigned long *) pgd);
-	}
-	if (current->active_mm == mm)
-		set_user_asce(mm);
+	unsigned long pgste = 0;
+#ifdef CONFIG_PGSTE
+	pgste = *(unsigned long *)(ptep + PTRS_PER_PTE);
+#endif
+	return __pgste(pgste);
 }
 
+static inline void pgste_set(pte_t *ptep, pgste_t pgste)
+{
 #ifdef CONFIG_PGSTE
+	*(pgste_t *)(ptep + PTRS_PER_PTE) = pgste;
+#endif
+}
 
-/**
- * gmap_alloc - allocate a guest address space
- * @mm: pointer to the parent mm_struct
- * @limit: maximum address of the gmap address space
- *
- * Returns a guest address space structure.
- */
-struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
+static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste,
+				       struct mm_struct *mm)
 {
-	struct gmap *gmap;
-	struct page *page;
-	unsigned long *table;
-	unsigned long etype, atype;
-
-	if (limit < (1UL << 31)) {
-		limit = (1UL << 31) - 1;
-		atype = _ASCE_TYPE_SEGMENT;
-		etype = _SEGMENT_ENTRY_EMPTY;
-	} else if (limit < (1UL << 42)) {
-		limit = (1UL << 42) - 1;
-		atype = _ASCE_TYPE_REGION3;
-		etype = _REGION3_ENTRY_EMPTY;
-	} else if (limit < (1UL << 53)) {
-		limit = (1UL << 53) - 1;
-		atype = _ASCE_TYPE_REGION2;
-		etype = _REGION2_ENTRY_EMPTY;
-	} else {
-		limit = -1UL;
-		atype = _ASCE_TYPE_REGION1;
-		etype = _REGION1_ENTRY_EMPTY;
-	}
-	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
-	if (!gmap)
-		goto out;
-	INIT_LIST_HEAD(&gmap->crst_list);
-	INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
-	INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
-	spin_lock_init(&gmap->guest_table_lock);
-	gmap->mm = mm;
-	page = alloc_pages(GFP_KERNEL, 2);
-	if (!page)
-		goto out_free;
-	page->index = 0;
-	list_add(&page->lru, &gmap->crst_list);
-	table = (unsigned long *) page_to_phys(page);
-	crst_table_init(table, etype);
-	gmap->table = table;
-	gmap->asce = atype | _ASCE_TABLE_LENGTH |
-		_ASCE_USER_BITS | __pa(table);
-	gmap->asce_end = limit;
-	down_write(&mm->mmap_sem);
-	list_add(&gmap->list, &mm->context.gmap_list);
-	up_write(&mm->mmap_sem);
-	return gmap;
-
-out_free:
-	kfree(gmap);
-out:
-	return NULL;
+#ifdef CONFIG_PGSTE
+	unsigned long address, bits, skey;
+
+	if (!mm_use_skey(mm) || pte_val(pte) & _PAGE_INVALID)
+		return pgste;
+	address = pte_val(pte) & PAGE_MASK;
+	skey = (unsigned long) page_get_storage_key(address);
+	bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
+	/* Transfer page changed & referenced bit to guest bits in pgste */
+	pgste_val(pgste) |= bits << 48;		/* GR bit & GC bit */
+	/* Copy page access key and fetch protection bit to pgste */
+	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT);
+	pgste_val(pgste) |= (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
+#endif
+	return pgste;
+
 }
-EXPORT_SYMBOL_GPL(gmap_alloc);
 
-static void gmap_flush_tlb(struct gmap *gmap)
+static inline void pgste_set_key(pte_t *ptep, pgste_t pgste, pte_t entry,
+				 struct mm_struct *mm)
 {
-	if (MACHINE_HAS_IDTE)
-		__tlb_flush_asce(gmap->mm, gmap->asce);
-	else
-		__tlb_flush_global();
+#ifdef CONFIG_PGSTE
+	unsigned long address;
+	unsigned long nkey;
+
+	if (!mm_use_skey(mm) || pte_val(entry) & _PAGE_INVALID)
+		return;
+	VM_BUG_ON(!(pte_val(*ptep) & _PAGE_INVALID));
+	address = pte_val(entry) & PAGE_MASK;
+	/*
+	 * Set page access key and fetch protection bit from pgste.
+	 * The guest C/R information is still in the PGSTE, set real
+	 * key C/R to 0.
+	 */
+	nkey = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
+	nkey |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
+	page_set_storage_key(address, nkey, 0);
+#endif
 }
 
-static void gmap_radix_tree_free(struct radix_tree_root *root)
+static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
 {
-	struct radix_tree_iter iter;
-	unsigned long indices[16];
-	unsigned long index;
-	void **slot;
-	int i, nr;
-
-	/* A radix tree is freed by deleting all of its entries */
-	index = 0;
-	do {
-		nr = 0;
-		radix_tree_for_each_slot(slot, root, &iter, index) {
-			indices[nr] = iter.index;
-			if (++nr == 16)
-				break;
-		}
-		for (i = 0; i < nr; i++) {
-			index = indices[i];
-			radix_tree_delete(root, index);
+#ifdef CONFIG_PGSTE
+	if ((pte_val(entry) & _PAGE_PRESENT) &&
+	    (pte_val(entry) & _PAGE_WRITE) &&
+	    !(pte_val(entry) & _PAGE_INVALID)) {
+		if (!MACHINE_HAS_ESOP) {
+			/*
+			 * Without enhanced suppression-on-protection force
+			 * the dirty bit on for all writable ptes.
+			 */
+			pte_val(entry) |= _PAGE_DIRTY;
+			pte_val(entry) &= ~_PAGE_PROTECT;
 		}
-	} while (nr > 0);
+		if (!(pte_val(entry) & _PAGE_PROTECT))
+			/* This pte allows write access, set user-dirty */
+			pgste_val(pgste) |= PGSTE_UC_BIT;
+	}
+#endif
+	*ptep = entry;
+	return pgste;
 }
 
-/**
- * gmap_free - free a guest address space
- * @gmap: pointer to the guest address space structure
- */
-void gmap_free(struct gmap *gmap)
+static inline pgste_t pgste_ipte_notify(struct mm_struct *mm,
+					unsigned long addr,
+					pte_t *ptep, pgste_t pgste)
 {
-	struct page *page, *next;
-
-	/* Flush tlb. */
-	if (MACHINE_HAS_IDTE)
-		__tlb_flush_asce(gmap->mm, gmap->asce);
-	else
-		__tlb_flush_global();
-
-	/* Free all segment & region tables. */
-	list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
-		__free_pages(page, 2);
-	gmap_radix_tree_free(&gmap->guest_to_host);
-	gmap_radix_tree_free(&gmap->host_to_guest);
-	down_write(&gmap->mm->mmap_sem);
-	list_del(&gmap->list);
-	up_write(&gmap->mm->mmap_sem);
-	kfree(gmap);
+#ifdef CONFIG_PGSTE
+	if (pgste_val(pgste) & PGSTE_IN_BIT) {
+		pgste_val(pgste) &= ~PGSTE_IN_BIT;
+		ptep_notify(mm, addr, ptep);
+	}
+#endif
+	return pgste;
 }
-EXPORT_SYMBOL_GPL(gmap_free);
 
-/**
- * gmap_enable - switch primary space to the guest address space
- * @gmap: pointer to the guest address space structure
- */
-void gmap_enable(struct gmap *gmap)
+static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
+				      unsigned long addr, pte_t *ptep)
 {
-	S390_lowcore.gmap = (unsigned long) gmap;
+	pgste_t pgste = __pgste(0);
+
+	if (mm_has_pgste(mm)) {
+		pgste = pgste_get_lock(ptep);
+		pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+	}
+	return pgste;
 }
-EXPORT_SYMBOL_GPL(gmap_enable);
 
-/**
- * gmap_disable - switch back to the standard primary address space
- * @gmap: pointer to the guest address space structure
- */
-void gmap_disable(struct gmap *gmap)
+static inline void ptep_xchg_commit(struct mm_struct *mm,
+				    unsigned long addr, pte_t *ptep,
+				    pgste_t pgste, pte_t old, pte_t new)
 {
-	S390_lowcore.gmap = 0UL;
+	if (mm_has_pgste(mm)) {
+		if (pte_val(old) & _PAGE_INVALID)
+			pgste_set_key(ptep, pgste, new, mm);
+		if (pte_val(new) & _PAGE_INVALID) {
+			pgste = pgste_update_all(old, pgste, mm);
+			if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) ==
+			    _PGSTE_GPS_USAGE_UNUSED)
+				pte_val(old) |= _PAGE_UNUSED;
+		}
+		pgste = pgste_set_pte(ptep, pgste, new);
+		pgste_set_unlock(ptep, pgste);
+	} else {
+		*ptep = new;
+	}
 }
-EXPORT_SYMBOL_GPL(gmap_disable);
 
-/*
- * gmap_alloc_table is assumed to be called with mmap_sem held
- */
-static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
-			    unsigned long init, unsigned long gaddr)
+pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr,
+		       pte_t *ptep, pte_t new)
 {
-	struct page *page;
-	unsigned long *new;
-
-	/* since we dont free the gmap table until gmap_free we can unlock */
-	page = alloc_pages(GFP_KERNEL, 2);
-	if (!page)
-		return -ENOMEM;
-	new = (unsigned long *) page_to_phys(page);
-	crst_table_init(new, init);
-	spin_lock(&gmap->mm->page_table_lock);
-	if (*table & _REGION_ENTRY_INVALID) {
-		list_add(&page->lru, &gmap->crst_list);
-		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
-			(*table & _REGION_ENTRY_TYPE_MASK);
-		page->index = gaddr;
-		page = NULL;
-	}
-	spin_unlock(&gmap->mm->page_table_lock);
-	if (page)
-		__free_pages(page, 2);
-	return 0;
+	pgste_t pgste;
+	pte_t old;
+
+	pgste = ptep_xchg_start(mm, addr, ptep);
+	old = ptep_flush_direct(mm, addr, ptep);
+	ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
+	return old;
 }
+EXPORT_SYMBOL(ptep_xchg_direct);
 
-/**
- * __gmap_segment_gaddr - find virtual address from segment pointer
- * @entry: pointer to a segment table entry in the guest address space
- *
- * Returns the virtual address in the guest address space for the segment
- */
-static unsigned long __gmap_segment_gaddr(unsigned long *entry)
+pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, pte_t new)
 {
-	struct page *page;
-	unsigned long offset, mask;
-
-	offset = (unsigned long) entry / sizeof(unsigned long);
-	offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
-	mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
-	page = virt_to_page((void *)((unsigned long) entry & mask));
-	return page->index + offset;
+	pgste_t pgste;
+	pte_t old;
+
+	pgste = ptep_xchg_start(mm, addr, ptep);
+	old = ptep_flush_lazy(mm, addr, ptep);
+	ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
+	return old;
 }
+EXPORT_SYMBOL(ptep_xchg_lazy);
 
-/**
- * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
- * @gmap: pointer to the guest address space structure
- * @vmaddr: address in the host process address space
- *
- * Returns 1 if a TLB flush is required
- */
-static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
+pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
+			     pte_t *ptep)
 {
-	unsigned long *entry;
-	int flush = 0;
-
-	spin_lock(&gmap->guest_table_lock);
-	entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
-	if (entry) {
-		flush = (*entry != _SEGMENT_ENTRY_INVALID);
-		*entry = _SEGMENT_ENTRY_INVALID;
+	pgste_t pgste;
+	pte_t old;
+
+	pgste = ptep_xchg_start(mm, addr, ptep);
+	old = ptep_flush_lazy(mm, addr, ptep);
+	if (mm_has_pgste(mm)) {
+		pgste = pgste_update_all(old, pgste, mm);
+		pgste_set(ptep, pgste);
	}
-	spin_unlock(&gmap->guest_table_lock);
-	return flush;
+	return old;
 }
+EXPORT_SYMBOL(ptep_modify_prot_start);
 
-/**
- * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
- * @gmap: pointer to the guest address space structure
- * @gaddr: address in the guest address space
- *
- * Returns 1 if a TLB flush is required
- */
-static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
+void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+			     pte_t *ptep, pte_t pte)
 {
-	unsigned long vmaddr;
+	pgste_t pgste;
 
-	vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
-						   gaddr >> PMD_SHIFT);
-	return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
+	if (mm_has_pgste(mm)) {
+		pgste = pgste_get(ptep);
+		pgste_set_key(ptep, pgste, pte, mm);
+		pgste = pgste_set_pte(ptep, pgste, pte);
+		pgste_set_unlock(ptep, pgste);
+	} else {
+		*ptep = pte;
+	}
 }
+EXPORT_SYMBOL(ptep_modify_prot_commit);
 
-/**
- * gmap_unmap_segment - unmap segment from the guest address space
- * @gmap: pointer to the guest address space structure
- * @to: address in the guest address space
- * @len: length of the memory area to unmap
- *
- * Returns 0 if the unmap succeeded, -EINVAL if not.
- */
-int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
+static inline pmd_t pmdp_flush_direct(struct mm_struct *mm,
+				      unsigned long addr, pmd_t *pmdp)
 {
-	unsigned long off;
-	int flush;
-
-	if ((to | len) & (PMD_SIZE - 1))
-		return -EINVAL;
-	if (len == 0 || to + len < to)
-		return -EINVAL;
-
-	flush = 0;
-	down_write(&gmap->mm->mmap_sem);
-	for (off = 0; off < len; off += PMD_SIZE)
-		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
-	up_write(&gmap->mm->mmap_sem);
-	if (flush)
-		gmap_flush_tlb(gmap);
-	return 0;
+	int active, count;
+	pmd_t old;
+
+	old = *pmdp;
+	if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
+		return old;
+	if (!MACHINE_HAS_IDTE) {
+		__pmdp_csp(pmdp);
+		return old;
+	}
+	active = (mm == current->active_mm) ? 1 : 0;
+	count = atomic_add_return(0x10000, &mm->context.attach_count);
+	if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active &&
+	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
+		__pmdp_idte_local(addr, pmdp);
+	else
+		__pmdp_idte(addr, pmdp);
+	atomic_sub(0x10000, &mm->context.attach_count);
+	return old;
+}
+
+static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
+				    unsigned long addr, pmd_t *pmdp)
+{
+	int active, count;
+	pmd_t old;
+
+	old = *pmdp;
+	if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
+		return old;
+	active = (mm == current->active_mm) ? 1 : 0;
+	count = atomic_add_return(0x10000, &mm->context.attach_count);
+	if ((count & 0xffff) <= active) {
+		pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID;
+		mm->context.flush_mm = 1;
+	} else if (MACHINE_HAS_IDTE)
+		__pmdp_idte(addr, pmdp);
+	else
+		__pmdp_csp(pmdp);
+	atomic_sub(0x10000, &mm->context.attach_count);
+	return old;
 }
-EXPORT_SYMBOL_GPL(gmap_unmap_segment);
-
-/**
- * gmap_mmap_segment - map a segment to the guest address space
- * @gmap: pointer to the guest address space structure
- * @from: source address in the parent address space
- * @to: target address in the guest address space
- * @len: length of the memory area to map
- *
- * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
- */
-int gmap_map_segment(struct gmap *gmap, unsigned long from,
-		     unsigned long to, unsigned long len)
+
+pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
+		       pmd_t *pmdp, pmd_t new)
 {
-	unsigned long off;
-	int flush;
-
-	if ((from | to | len) & (PMD_SIZE - 1))
-		return -EINVAL;
-	if (len == 0 || from + len < from || to + len < to ||
-	    from + len - 1 > TASK_MAX_SIZE || to + len - 1 > gmap->asce_end)
-		return -EINVAL;
-
-	flush = 0;
-	down_write(&gmap->mm->mmap_sem);
-	for (off = 0; off < len; off += PMD_SIZE) {
-		/* Remove old translation */
-		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
-		/* Store new translation */
-		if (radix_tree_insert(&gmap->guest_to_host,
-				      (to + off) >> PMD_SHIFT,
-				      (void *) from + off))
-			break;
-	}
-	up_write(&gmap->mm->mmap_sem);
-	if (flush)
-		gmap_flush_tlb(gmap);
-	if (off >= len)
-		return 0;
-	gmap_unmap_segment(gmap, to, len);
-	return -ENOMEM;
+	pmd_t old;
+
+	old = pmdp_flush_direct(mm, addr, pmdp);
+	*pmdp = new;
+	return old;
 }
-EXPORT_SYMBOL_GPL(gmap_map_segment);
-
-/**
- * __gmap_translate - translate a guest address to a user space address
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: guest address
- *
- * Returns user space address which corresponds to the guest address or
- * -EFAULT if no such mapping exists.
- * This function does not establish potentially missing page table entries.
- * The mmap_sem of the mm that belongs to the address space must be held
- * when this function gets called.
- */
-unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
+EXPORT_SYMBOL(pmdp_xchg_direct);
+
+pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr,
+		     pmd_t *pmdp, pmd_t new)
 {
-	unsigned long vmaddr;
+	pmd_t old;
 
-	vmaddr = (unsigned long)
-		radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
-	return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
+	old = pmdp_flush_lazy(mm, addr, pmdp);
+	*pmdp = new;
+	return old;
 }
-EXPORT_SYMBOL_GPL(__gmap_translate);
-
-/**
- * gmap_translate - translate a guest address to a user space address
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: guest address
- *
- * Returns user space address which corresponds to the guest address or
- * -EFAULT if no such mapping exists.
- * This function does not establish potentially missing page table entries.
- */
-unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
+EXPORT_SYMBOL(pmdp_xchg_lazy);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
+				pgtable_t pgtable)
 {
-	unsigned long rc;
+	struct list_head *lh = (struct list_head *) pgtable;
 
-	down_read(&gmap->mm->mmap_sem);
-	rc = __gmap_translate(gmap, gaddr);
-	up_read(&gmap->mm->mmap_sem);
-	return rc;
+	assert_spin_locked(pmd_lockptr(mm, pmdp));
+
+	/* FIFO */
+	if (!pmd_huge_pte(mm, pmdp))
+		INIT_LIST_HEAD(lh);
+	else
+		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
+	pmd_huge_pte(mm, pmdp) = pgtable;
 }
-EXPORT_SYMBOL_GPL(gmap_translate);
 
-/**
- * gmap_unlink - disconnect a page table from the gmap shadow tables
- * @gmap: pointer to guest mapping meta data structure
- * @table: pointer to the host page table
- * @vmaddr: vm address associated with the host page table
- */
-static void gmap_unlink(struct mm_struct *mm, unsigned long *table,
-			unsigned long vmaddr)
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 {
-	struct gmap *gmap;
-	int flush;
+	struct list_head *lh;
+	pgtable_t pgtable;
+	pte_t *ptep;
+
+	assert_spin_locked(pmd_lockptr(mm, pmdp));
 
-	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
-		flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
-		if (flush)
-			gmap_flush_tlb(gmap);
+	/* FIFO */
+	pgtable = pmd_huge_pte(mm, pmdp);
+	lh = (struct list_head *) pgtable;
+	if (list_empty(lh))
+		pmd_huge_pte(mm, pmdp) = NULL;
+	else {
+		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
+		list_del(lh);
 	}
+	ptep = (pte_t *) pgtable;
+	pte_val(*ptep) = _PAGE_INVALID;
+	ptep++;
+	pte_val(*ptep) = _PAGE_INVALID;
+	return pgtable;
 }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
-/**
- * gmap_link - set up shadow page tables to connect a host to a guest address
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: guest address
- * @vmaddr: vm address
- *
- * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
- * if the vm address is already mapped to a different guest segment.
- * The mmap_sem of the mm that belongs to the address space must be held
- * when this function gets called.
- */
-int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
+#ifdef CONFIG_PGSTE
+void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, pte_t entry)
 {
-	struct mm_struct *mm;
-	unsigned long *table;
-	spinlock_t *ptl;
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	int rc;
-
-	/* Create higher level tables in the gmap page table */
-	table = gmap->table;
-	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
-		table += (gaddr >> 53) & 0x7ff;
-		if ((*table & _REGION_ENTRY_INVALID) &&
-		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
-				     gaddr & 0xffe0000000000000UL))
-			return -ENOMEM;
-		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
-	}
-	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
-		table += (gaddr >> 42) & 0x7ff;
-		if ((*table & _REGION_ENTRY_INVALID) &&
-		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
-				     gaddr & 0xfffffc0000000000UL))
-			return -ENOMEM;
-		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
-	}
-	if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
-		table += (gaddr >> 31) & 0x7ff;
-		if ((*table & _REGION_ENTRY_INVALID) &&
-		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
-				     gaddr & 0xffffffff80000000UL))
-			return -ENOMEM;
-		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
-	}
-	table += (gaddr >> 20) & 0x7ff;
-	/* Walk the parent mm page table */
-	mm = gmap->mm;
-	pgd = pgd_offset(mm, vmaddr);
-	VM_BUG_ON(pgd_none(*pgd));
-	pud = pud_offset(pgd, vmaddr);
-	VM_BUG_ON(pud_none(*pud));
-	pmd = pmd_offset(pud, vmaddr);
-	VM_BUG_ON(pmd_none(*pmd));
-	/* large pmds cannot yet be handled */
-	if (pmd_large(*pmd))
-		return -EFAULT;
-	/* Link gmap segment table entry location to page table. */
-	rc = radix_tree_preload(GFP_KERNEL);
-	if (rc)
-		return rc;
-	ptl = pmd_lock(mm, pmd);
-	spin_lock(&gmap->guest_table_lock);
-	if (*table == _SEGMENT_ENTRY_INVALID) {
-		rc = radix_tree_insert(&gmap->host_to_guest,
-				       vmaddr >> PMD_SHIFT, table);
-		if (!rc)
-			*table = pmd_val(*pmd);
-	} else
-		rc = 0;
-	spin_unlock(&gmap->guest_table_lock);
-	spin_unlock(ptl);
-	radix_tree_preload_end();
-	return rc;
+	pgste_t pgste;
+
+	/* the mm_has_pgste() check is done in set_pte_at() */
+	pgste = pgste_get_lock(ptep);
+	pgste_val(pgste) &= ~_PGSTE_GPS_ZERO;
+	pgste_set_key(ptep, pgste, entry, mm);
+	pgste = pgste_set_pte(ptep, pgste, entry);
+	pgste_set_unlock(ptep, pgste);
 }
 
-/**
- * gmap_fault - resolve a fault on a guest address
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: guest address
- * @fault_flags: flags to pass down to handle_mm_fault()
- *
- * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
- * if the vm address is already mapped to a different guest segment.
- */
-int gmap_fault(struct gmap *gmap, unsigned long gaddr,
-	       unsigned int fault_flags)
+void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
-	unsigned long vmaddr;
-	int rc;
-	bool unlocked;
-
-	down_read(&gmap->mm->mmap_sem);
-
-retry:
-	unlocked = false;
-	vmaddr = __gmap_translate(gmap, gaddr);
-	if (IS_ERR_VALUE(vmaddr)) {
-		rc = vmaddr;
-		goto out_up;
-	}
-	if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags,
-			     &unlocked)) {
-		rc = -EFAULT;
-		goto out_up;
-	}
-	/*
-	 * In the case that fixup_user_fault unlocked the mmap_sem during
-	 * faultin redo __gmap_translate to not race with a map/unmap_segment.
-	 */
-	if (unlocked)
-		goto retry;
+	pgste_t pgste;
 
-	rc = __gmap_link(gmap, gaddr, vmaddr);
-out_up:
-	up_read(&gmap->mm->mmap_sem);
-	return rc;
+	pgste = pgste_get_lock(ptep);
+	pgste_val(pgste) |= PGSTE_IN_BIT;
+	pgste_set_unlock(ptep, pgste);
 }
-EXPORT_SYMBOL_GPL(gmap_fault);
 
-static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
+static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
 {
 	if (!non_swap_entry(entry))
 		dec_mm_counter(mm, MM_SWAPENTS);
@@ -620,225 +426,99 @@ static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
 	free_swap_and_cache(entry);
 }
 
-/*
- * this function is assumed to be called with mmap_sem held
- */
-void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
+void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, int reset)
 {
-	unsigned long vmaddr, ptev, pgstev;
-	pte_t *ptep, pte;
-	spinlock_t *ptl;
+	unsigned long pgstev;
 	pgste_t pgste;
+	pte_t pte;
 
-	/* Find the vm address for the guest address */
-	vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
-						   gaddr >> PMD_SHIFT);
-	if (!vmaddr)
-		return;
-	vmaddr |= gaddr & ~PMD_MASK;
-	/* Get pointer to the page table entry */
-	ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
-	if (unlikely(!ptep))
-		return;
-	pte = *ptep;
-	if (!pte_swap(pte))
-		goto out_pte;
 	/* Zap unused and logically-zero pages */
 	pgste = pgste_get_lock(ptep);
 	pgstev = pgste_val(pgste);
-	ptev = pte_val(pte);
-	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
-	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) {
-		gmap_zap_swap_entry(pte_to_swp_entry(pte), gmap->mm);
-		pte_clear(gmap->mm, vmaddr, ptep);
-	}
+	pte = *ptep;
+	if (pte_swap(pte) &&
+	    ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED ||
+	     (pgstev & _PGSTE_GPS_ZERO))) {
+		ptep_zap_swap_entry(mm, pte_to_swp_entry(pte));
+		pte_clear(mm, addr, ptep);
+	}
+	if (reset)
+		pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
 	pgste_set_unlock(ptep, pgste);
-out_pte:
-	pte_unmap_unlock(ptep, ptl);
 }
-EXPORT_SYMBOL_GPL(__gmap_zap);
 
-void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
+void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
-	unsigned long gaddr, vmaddr, size;
-	struct vm_area_struct *vma;
-
-	down_read(&gmap->mm->mmap_sem);
-	for (gaddr = from; gaddr < to;
-	     gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
-		/* Find the vm address for the guest address */
-		vmaddr = (unsigned long)
-			radix_tree_lookup(&gmap->guest_to_host,
-					  gaddr >> PMD_SHIFT);
-		if (!vmaddr)
-			continue;
-		vmaddr |= gaddr & ~PMD_MASK;
-		/* Find vma in the parent mm */
-		vma = find_vma(gmap->mm, vmaddr);
-		size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
-		zap_page_range(vma, vmaddr, size, NULL);
-	}
-	up_read(&gmap->mm->mmap_sem);
-}
-EXPORT_SYMBOL_GPL(gmap_discard);
-
-static LIST_HEAD(gmap_notifier_list);
-static DEFINE_SPINLOCK(gmap_notifier_lock);
+	unsigned long ptev;
+	pgste_t pgste;
 
-/**
- * gmap_register_ipte_notifier - register a pte invalidation callback
- * @nb: pointer to the gmap notifier block
- */
-void gmap_register_ipte_notifier(struct gmap_notifier *nb)
-{
-	spin_lock(&gmap_notifier_lock);
-	list_add(&nb->list, &gmap_notifier_list);
-	spin_unlock(&gmap_notifier_lock);
+	/* Clear storage key */
+	pgste = pgste_get_lock(ptep);
+	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
+			      PGSTE_GR_BIT | PGSTE_GC_BIT);
+	ptev = pte_val(*ptep);
+	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
+		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
+	pgste_set_unlock(ptep, pgste);
 }
-EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);
 
-/**
- * gmap_unregister_ipte_notifier - remove a pte invalidation callback
- * @nb: pointer to the gmap notifier block
- */
-void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
-{
-	spin_lock(&gmap_notifier_lock);
-	list_del_init(&nb->list);
-	spin_unlock(&gmap_notifier_lock);
-}
-EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
-
-/**
- * gmap_ipte_notify - mark a range of ptes for invalidation notification
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: virtual address in the guest address space
- * @len: size of area
- *
- * Returns 0 if for each page in the given range a gmap mapping exists and
- * the invalidation notification could be set. If the gmap mapping is missing
- * for one or more pages -EFAULT is returned. If no memory could be allocated
- * -ENOMEM is returned. This function establishes missing page table entries.
+/*
+ * Test and reset if a guest page is dirty
  */
-int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
+bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr)
 {
-	unsigned long addr;
 	spinlock_t *ptl;
-	pte_t *ptep, entry;
 	pgste_t pgste;
-	bool unlocked;
-	int rc = 0;
-
-	if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
-		return -EINVAL;
-	down_read(&gmap->mm->mmap_sem);
-	while (len) {
-		unlocked = false;
-		/* Convert gmap address and connect the page tables */
-		addr = __gmap_translate(gmap, gaddr);
-		if (IS_ERR_VALUE(addr)) {
-			rc = addr;
-			break;
-		}
-		/* Get the page mapped */
-		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
-				     &unlocked)) {
-			rc = -EFAULT;
-			break;
-		}
-		/* While trying to map mmap_sem got unlocked. Let us retry */
-		if (unlocked)
-			continue;
-		rc = __gmap_link(gmap, gaddr, addr);
-		if (rc)
-			break;
-		/* Walk the process page table, lock and get pte pointer */
-		ptep = get_locked_pte(gmap->mm, addr, &ptl);
-		VM_BUG_ON(!ptep);
-		/* Set notification bit in the pgste of the pte */
-		entry = *ptep;
-		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
-			pgste = pgste_get_lock(ptep);
-			pgste_val(pgste) |= PGSTE_IN_BIT;
-			pgste_set_unlock(ptep, pgste);
-			gaddr += PAGE_SIZE;
-			len -= PAGE_SIZE;
-		}
-		pte_unmap_unlock(ptep, ptl);
-	}
-	up_read(&gmap->mm->mmap_sem);
-	return rc;
-}
-EXPORT_SYMBOL_GPL(gmap_ipte_notify);
-
-/**
- * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
- * @mm: pointer to the process mm_struct
- * @addr: virtual address in the process address space
- * @pte: pointer to the page table entry
- *
- * This function is assumed to be called with the page table lock held
- * for the pte to notify.
- */
-void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
-{
-	unsigned long offset, gaddr;
-	unsigned long *table;
-	struct gmap_notifier *nb;
-	struct gmap *gmap;
-
-	offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
-	offset = offset * (4096 / sizeof(pte_t));
-	spin_lock(&gmap_notifier_lock);
-	list_for_each_entry(gmap, &mm->context.gmap_list, list) {
-		table = radix_tree_lookup(&gmap->host_to_guest,
-					  vmaddr >> PMD_SHIFT);
-		if (!table)
-			continue;
-		gaddr = __gmap_segment_gaddr(table) + offset;
-		list_for_each_entry(nb, &gmap_notifier_list, list)
-			nb->notifier_call(gmap, gaddr);
+	pte_t *ptep;
+	pte_t pte;
+	bool dirty;
+
+	ptep = get_locked_pte(mm, addr, &ptl);
+	if (unlikely(!ptep))
+		return false;
+
+	pgste = pgste_get_lock(ptep);
+	dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
+	pgste_val(pgste) &= ~PGSTE_UC_BIT;
+	pte = *ptep;
+	if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
+		pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+		__ptep_ipte(addr, ptep);
+		if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
+			pte_val(pte) |= _PAGE_PROTECT;
+		else
+			pte_val(pte) |= _PAGE_INVALID;
+		*ptep = pte;
 	}
-	spin_unlock(&gmap_notifier_lock);
+	pgste_set_unlock(ptep, pgste);
+
+	spin_unlock(ptl);
+	return dirty;
 }
-EXPORT_SYMBOL_GPL(gmap_do_ipte_notify);
+EXPORT_SYMBOL_GPL(test_and_clear_guest_dirty);
 
 int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
-			  unsigned long key, bool nq)
+			  unsigned char key, bool nq)
 {
+	unsigned long keyul;
 	spinlock_t *ptl;
 	pgste_t old, new;
 	pte_t *ptep;
-	bool unlocked;
 
 	down_read(&mm->mmap_sem);
-retry:
-	unlocked = false;
 	ptep = get_locked_pte(mm, addr, &ptl);
 	if (unlikely(!ptep)) {
 		up_read(&mm->mmap_sem);
 		return -EFAULT;
 	}
-	if (!(pte_val(*ptep) & _PAGE_INVALID) &&
-	    (pte_val(*ptep) & _PAGE_PROTECT)) {
-		pte_unmap_unlock(ptep, ptl);
-		/*
-		 * We do not really care about unlocked. We will retry either
-		 * way. But this allows fixup_user_fault to enable userfaultfd.
-		 */
-		if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE,
-				     &unlocked)) {
-			up_read(&mm->mmap_sem);
-			return -EFAULT;
-		}
-		goto retry;
-	}
 
 	new = old = pgste_get_lock(ptep);
 	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
 			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
-	pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
-	pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
+	keyul = (unsigned long) key;
+	pgste_val(new) |= (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
+	pgste_val(new) |= (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
 	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
 		unsigned long address, bits, skey;
 
@@ -863,13 +543,12 @@ retry:
 }
 EXPORT_SYMBOL(set_guest_storage_key);
 
-unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
+unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
 {
+	unsigned char key;
 	spinlock_t *ptl;
 	pgste_t pgste;
 	pte_t *ptep;
-	uint64_t physaddr;
-	unsigned long key = 0;
 
 	down_read(&mm->mmap_sem);
 	ptep = get_locked_pte(mm, addr, &ptl);
@@ -880,13 +559,12 @@ unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
 
 	pgste = pgste_get_lock(ptep);
 	if (pte_val(*ptep) & _PAGE_INVALID) {
-		key |= (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
+		key = (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
 		key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56;
 		key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48;
 		key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48;
 	} else {
-		physaddr = pte_val(*ptep) & PAGE_MASK;
-		key = page_get_storage_key(physaddr);
+		key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
 
 		/* Reflect guest's logical view, not physical */
 		if (pgste_val(pgste) & PGSTE_GR_BIT)
@@ -901,471 +579,4 @@ unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
 	return key;
 }
 EXPORT_SYMBOL(get_guest_storage_key);
-
-static int page_table_allocate_pgste_min = 0;
-static int page_table_allocate_pgste_max = 1;
-int page_table_allocate_pgste = 0;
-EXPORT_SYMBOL(page_table_allocate_pgste);
-
-static struct ctl_table page_table_sysctl[] = {
-	{
-		.procname	= "allocate_pgste",
-		.data		= &page_table_allocate_pgste,
-		.maxlen		= sizeof(int),
-		.mode		= S_IRUGO | S_IWUSR,
-		.proc_handler	= proc_dointvec,
-		.extra1		= &page_table_allocate_pgste_min,
-		.extra2		= &page_table_allocate_pgste_max,
-	},
-	{ }
-};
-
-static struct ctl_table page_table_sysctl_dir[] = {
-	{
-		.procname	= "vm",
-		.maxlen		= 0,
-		.mode		= 0555,
-		.child		= page_table_sysctl,
-	},
-	{ }
-};
-
-static int __init page_table_register_sysctl(void)
-{
-	return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
-}
-__initcall(page_table_register_sysctl);
-
-#else /* CONFIG_PGSTE */
-
-static inline void gmap_unlink(struct mm_struct *mm, unsigned long *table,
-			       unsigned long vmaddr)
-{
-}
-
-#endif /* CONFIG_PGSTE */
-
-static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
-{
-	unsigned int old, new;
-
-	do {
-		old = atomic_read(v);
-		new = old ^ bits;
-	} while (atomic_cmpxchg(v, old, new) != old);
-	return new;
-}
-
-/*
- * page table entry allocation/free routines.
- */
-unsigned long *page_table_alloc(struct mm_struct *mm)
-{
-	unsigned long *table;
-	struct page *page;
-	unsigned int mask, bit;
-
-	/* Try to get a fragment of a 4K page as a 2K page table */
-	if (!mm_alloc_pgste(mm)) {
-		table = NULL;
-		spin_lock_bh(&mm->context.list_lock);
-		if (!list_empty(&mm->context.pgtable_list)) {
-			page = list_first_entry(&mm->context.pgtable_list,
-						struct page, lru);
-			mask = atomic_read(&page->_mapcount);
-			mask = (mask | (mask >> 4)) & 3;
-			if (mask != 3) {
-				table = (unsigned long *) page_to_phys(page);
-				bit = mask & 1;		/* =1 -> second 2K */
-				if (bit)
-					table += PTRS_PER_PTE;
-				atomic_xor_bits(&page->_mapcount, 1U << bit);
-				list_del(&page->lru);
-			}
-		}
-		spin_unlock_bh(&mm->context.list_lock);
-		if (table)
-			return table;
-	}
-	/* Allocate a fresh page */
-	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
-	if (!page)
-		return NULL;
-	if (!pgtable_page_ctor(page)) {
-		__free_page(page);
-		return NULL;
-	}
-	/* Initialize page table */
-	table = (unsigned long *) page_to_phys(page);
-	if (mm_alloc_pgste(mm)) {
-		/* Return 4K page table with PGSTEs */
-		atomic_set(&page->_mapcount, 3);
-		clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
-		clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
-	} else {
-		/* Return the first 2K fragment of the page */
-		atomic_set(&page->_mapcount, 1);
-		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
-		spin_lock_bh(&mm->context.list_lock);
-		list_add(&page->lru, &mm->context.pgtable_list);
-		spin_unlock_bh(&mm->context.list_lock);
-	}
-	return table;
-}
-
-void page_table_free(struct mm_struct *mm, unsigned long *table)
-{
-	struct page *page;
-	unsigned int bit, mask;
-
-	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
-	if (!mm_alloc_pgste(mm)) {
-		/* Free 2K page table fragment of a 4K page */
-		bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
-		spin_lock_bh(&mm->context.list_lock);
-		mask = atomic_xor_bits(&page->_mapcount, 1U << bit);
-		if (mask & 3)
-			list_add(&page->lru, &mm->context.pgtable_list);
-		else
-			list_del(&page->lru);
-		spin_unlock_bh(&mm->context.list_lock);
-		if (mask != 0)
-			return;
-	}
-
-	pgtable_page_dtor(page);
-	atomic_set(&page->_mapcount, -1);
-	__free_page(page);
-}
-
-void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
-			 unsigned long vmaddr)
-{
-	struct mm_struct *mm;
-	struct page *page;
-	unsigned int bit, mask;
-
-	mm = tlb->mm;
-	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
-	if (mm_alloc_pgste(mm)) {
-		gmap_unlink(mm, table, vmaddr);
-		table = (unsigned long *) (__pa(table) | 3);
-		tlb_remove_table(tlb, table);
-		return;
-	}
-	bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
-	spin_lock_bh(&mm->context.list_lock);
-	mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit);
-	if (mask & 3)
-		list_add_tail(&page->lru, &mm->context.pgtable_list);
-	else
-		list_del(&page->lru);
-	spin_unlock_bh(&mm->context.list_lock);
-	table = (unsigned long *) (__pa(table) | (1U << bit));
-	tlb_remove_table(tlb, table);
-}
-
-static void __tlb_remove_table(void *_table)
-{
-	unsigned int mask = (unsigned long) _table & 3;
-	void *table = (void *)((unsigned long) _table ^ mask);
-	struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
-
-	switch (mask) {
-	case 0:		/* pmd or pud */
-		free_pages((unsigned long) table, 2);
-		break;
-	case 1:		/* lower 2K of a 4K page table */
-	case 2:		/* higher 2K of a 4K page table */
-		if (atomic_xor_bits(&page->_mapcount, mask << 4) != 0)
-			break;
-		/* fallthrough */
-	case 3:		/* 4K page table with pgstes */
-		pgtable_page_dtor(page);
-		atomic_set(&page->_mapcount, -1);
-		__free_page(page);
-		break;
-	}
-}
-
-static void tlb_remove_table_smp_sync(void *arg)
-{
-	/* Simply deliver the interrupt */
-}
-
-static void tlb_remove_table_one(void *table)
-{
-	/*
-	 * This isn't an RCU grace period and hence the page-tables cannot be
-	 * assumed to be actually RCU-freed.
-	 *
-	 * It is however sufficient for software page-table walkers that rely
-	 * on IRQ disabling. See the comment near struct mmu_table_batch.
-	 */
-	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
-	__tlb_remove_table(table);
-}
-
-static void tlb_remove_table_rcu(struct rcu_head *head)
-{
-	struct mmu_table_batch *batch;
-	int i;
-
-	batch = container_of(head, struct mmu_table_batch, rcu);
-
-	for (i = 0; i < batch->nr; i++)
-		__tlb_remove_table(batch->tables[i]);
-
-	free_page((unsigned long)batch);
-}
-
-void tlb_table_flush(struct mmu_gather *tlb)
-{
-	struct mmu_table_batch **batch = &tlb->batch;
-
-	if (*batch) {
-		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
-		*batch = NULL;
-	}
-}
-
-void tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
-	struct mmu_table_batch **batch = &tlb->batch;
-
-	tlb->mm->context.flush_mm = 1;
-	if (*batch == NULL) {
-		*batch = (struct mmu_table_batch *)
-			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
-		if (*batch == NULL) {
-			__tlb_flush_mm_lazy(tlb->mm);
-			tlb_remove_table_one(table);
-			return;
-		}
-		(*batch)->nr = 0;
-	}
-	(*batch)->tables[(*batch)->nr++] = table;
-	if ((*batch)->nr == MAX_TABLE_BATCH)
-		tlb_flush_mmu(tlb);
-}
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline void thp_split_vma(struct vm_area_struct *vma)
-{
-	unsigned long addr;
-
-	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
-		follow_page(vma, addr, FOLL_SPLIT);
-}
-
-static inline void thp_split_mm(struct mm_struct *mm)
-{
-	struct vm_area_struct *vma;
-
-	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
-		thp_split_vma(vma);
-		vma->vm_flags &= ~VM_HUGEPAGE;
-		vma->vm_flags |= VM_NOHUGEPAGE;
-	}
-	mm->def_flags |= VM_NOHUGEPAGE;
-}
-#else
-static inline void thp_split_mm(struct mm_struct *mm)
-{
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-/*
- * switch on pgstes for its userspace process (for kvm)
- */
-int s390_enable_sie(void)
-{
-	struct mm_struct *mm = current->mm;
-
-	/* Do we have pgstes? if yes, we are done */
-	if (mm_has_pgste(mm))
-		return 0;
-	/* Fail if the page tables are 2K */
-	if (!mm_alloc_pgste(mm))
-		return -EINVAL;
-	down_write(&mm->mmap_sem);
-	mm->context.has_pgste = 1;
-	/* split thp mappings and disable thp for future mappings */
-	thp_split_mm(mm);
-	up_write(&mm->mmap_sem);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(s390_enable_sie);
-
-/*
- * Enable storage key handling from now on and initialize the storage
- * keys with the default key.
- */
-static int __s390_enable_skey(pte_t *pte, unsigned long addr,
-			      unsigned long next, struct mm_walk *walk)
-{
-	unsigned long ptev;
-	pgste_t pgste;
-
-	pgste = pgste_get_lock(pte);
-	/*
-	 * Remove all zero page mappings,
-	 * after establishing a policy to forbid zero page mappings
-	 * following faults for that page will get fresh anonymous pages
-	 */
-	if (is_zero_pfn(pte_pfn(*pte))) {
-		ptep_flush_direct(walk->mm, addr, pte);
-		pte_val(*pte) = _PAGE_INVALID;
-	}
-	/* Clear storage key */
-	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
-			      PGSTE_GR_BIT | PGSTE_GC_BIT);
-	ptev = pte_val(*pte);
-	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
-		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
-	pgste_set_unlock(pte, pgste);
-	return 0;
-}
-
-int s390_enable_skey(void)
-{
-	struct mm_walk walk = { .pte_entry = __s390_enable_skey };
-	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma;
-	int rc = 0;
-
-	down_write(&mm->mmap_sem);
-	if (mm_use_skey(mm))
-		goto out_up;
-
-	mm->context.use_skey = 1;
-	for (vma = mm->mmap; vma; vma = vma->vm_next) {
-		if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
-				MADV_UNMERGEABLE, &vma->vm_flags)) {
-			mm->context.use_skey = 0;
-			rc = -ENOMEM;
-			goto out_up;
-		}
-	}
-	mm->def_flags &= ~VM_MERGEABLE;
-
-	walk.mm = mm;
-	walk_page_range(0, TASK_SIZE, &walk);
-
-out_up:
-	up_write(&mm->mmap_sem);
-	return rc;
-}
-EXPORT_SYMBOL_GPL(s390_enable_skey);
-
-/*
- * Reset CMMA state, make all pages stable again.
- */
-static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
-			     unsigned long next, struct mm_walk *walk)
-{
-	pgste_t pgste;
-
-	pgste = pgste_get_lock(pte);
-	pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
-	pgste_set_unlock(pte, pgste);
-	return 0;
-}
-
-void s390_reset_cmma(struct mm_struct *mm)
-{
-	struct mm_walk walk = { .pte_entry = __s390_reset_cmma };
-
-	down_write(&mm->mmap_sem);
-	walk.mm = mm;
-	walk_page_range(0, TASK_SIZE, &walk);
-	up_write(&mm->mmap_sem);
-}
-EXPORT_SYMBOL_GPL(s390_reset_cmma);
-
-/*
- * Test and reset if a guest page is dirty
- */
-bool gmap_test_and_clear_dirty(unsigned long address, struct gmap *gmap)
-{
-	pte_t *pte;
-	spinlock_t *ptl;
-	bool dirty = false;
-
-	pte = get_locked_pte(gmap->mm, address, &ptl);
-	if (unlikely(!pte))
-		return false;
-
-	if (ptep_test_and_clear_user_dirty(gmap->mm, address, pte))
-		dirty = true;
-
-	spin_unlock(ptl);
-	return dirty;
-}
-EXPORT_SYMBOL_GPL(gmap_test_and_clear_dirty);
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
-			   pmd_t *pmdp)
-{
-	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-	/* No need to flush TLB
-	 * On s390 reference bits are in storage key and never in TLB */
-	return pmdp_test_and_clear_young(vma, address, pmdp);
-}
-
-int pmdp_set_access_flags(struct vm_area_struct *vma,
-			  unsigned long address, pmd_t *pmdp,
-			  pmd_t entry, int dirty)
-{
-	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-
-	entry = pmd_mkyoung(entry);
-	if (dirty)
-		entry = pmd_mkdirty(entry);
-	if (pmd_same(*pmdp, entry))
-		return 0;
-	pmdp_invalidate(vma, address, pmdp);
-	set_pmd_at(vma->vm_mm, address, pmdp, entry);
-	return 1;
-}
-
-void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
-				pgtable_t pgtable)
-{
-	struct list_head *lh = (struct list_head *) pgtable;
-
-	assert_spin_locked(pmd_lockptr(mm, pmdp));
-
-	/* FIFO */
-	if (!pmd_huge_pte(mm, pmdp))
-		INIT_LIST_HEAD(lh);
-	else
-		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
-	pmd_huge_pte(mm, pmdp) = pgtable;
-}
-
-pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
-{
-	struct list_head *lh;
-	pgtable_t pgtable;
-	pte_t *ptep;
-
-	assert_spin_locked(pmd_lockptr(mm, pmdp));
-
-	/* FIFO */
-	pgtable = pmd_huge_pte(mm, pmdp);
-	lh = (struct list_head *) pgtable;
-	if (list_empty(lh))
-		pmd_huge_pte(mm, pmdp) = NULL;
-	else {
-		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
-		list_del(lh);
-	}
-	ptep = (pte_t *) pgtable;
-	pte_val(*ptep) = _PAGE_INVALID;
-	ptep++;
-	pte_val(*ptep) = _PAGE_INVALID;
-	return pgtable;
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
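
For context, a minimal sketch of how the new exchange primitives are meant to be consumed. The wrapper bodies below are modeled on the ptep_get_and_clear/ptep_clear_flush definitions that the same patch series puts into arch/s390/include/asm/pgtable.h; they are an illustration under that assumption, not part of this diff:

	/*
	 * Sketch (assumed): callers pick the lazy or the direct flush
	 * variant depending on whether the TLB must already be clean
	 * when the call returns.
	 */
	static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
					       unsigned long addr, pte_t *ptep)
	{
		/* batched invalidation is fine; munmap & friends flush later */
		return ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID));
	}

	static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
					     unsigned long addr, pte_t *ptep)
	{
		/* the pte must be gone from all TLBs once this returns */
		return ptep_xchg_direct(vma->vm_mm, addr, ptep,
					__pte(_PAGE_INVALID));
	}

Both variants funnel through ptep_xchg_start()/ptep_xchg_commit(), so the PGSTE lock, the ipte notifier, and the storage-key bookkeeping are handled in one place regardless of how the flush is performed.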