summaryrefslogtreecommitdiff
path: root/arch/s390/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/s390/mm')
-rw-r--r--arch/s390/mm/dump_pagetables.c2
-rw-r--r--arch/s390/mm/fault.c6
-rw-r--r--arch/s390/mm/gmap.c1577
-rw-r--r--arch/s390/mm/gup.c45
-rw-r--r--arch/s390/mm/hugetlbpage.c153
-rw-r--r--arch/s390/mm/init.c13
-rw-r--r--arch/s390/mm/page-states.c13
-rw-r--r--arch/s390/mm/pageattr.c269
-rw-r--r--arch/s390/mm/pgalloc.c39
-rw-r--r--arch/s390/mm/pgtable.c302
-rw-r--r--arch/s390/mm/vmem.c73
11 files changed, 2225 insertions, 267 deletions
diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c
index 8556d6be9..861880df1 100644
--- a/arch/s390/mm/dump_pagetables.c
+++ b/arch/s390/mm/dump_pagetables.c
@@ -157,7 +157,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st,
pud = pud_offset(pgd, addr);
if (!pud_none(*pud))
if (pud_large(*pud)) {
- prot = pud_val(*pud) & _REGION3_ENTRY_RO;
+ prot = pud_val(*pud) & _REGION_ENTRY_PROTECT;
note_page(m, st, prot, 2);
} else
walk_pmd_level(m, st, pud, addr);
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 19288c1b3..a58bca62a 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -418,6 +418,8 @@ static inline int do_exception(struct pt_regs *regs, int access)
(struct gmap *) S390_lowcore.gmap : NULL;
if (gmap) {
current->thread.gmap_addr = address;
+ current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
+ current->thread.gmap_int_code = regs->int_code & 0xffff;
address = __gmap_translate(gmap, address);
if (address == -EFAULT) {
fault = VM_FAULT_BADMAP;
@@ -456,7 +458,7 @@ retry:
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(mm, vma, address, flags);
+ fault = handle_mm_fault(vma, address, flags);
/* No reason to continue if interrupted by SIGKILL. */
if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
fault = VM_FAULT_SIGNAL;
@@ -624,7 +626,7 @@ void pfault_fini(void)
diag_stat_inc(DIAG_STAT_X258);
asm volatile(
" diag %0,0,0x258\n"
- "0:\n"
+ "0: nopr %%r7\n"
EX_TABLE(0b,0b)
: : "a" (&refbk), "m" (refbk) : "cc");
}
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 313c3b8cf..2ce6bb3ba 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -20,14 +20,16 @@
#include <asm/gmap.h>
#include <asm/tlb.h>
+#define GMAP_SHADOW_FAKE_TABLE 1ULL
+
/**
- * gmap_alloc - allocate a guest address space
+ * gmap_alloc - allocate and initialize a guest address space
* @mm: pointer to the parent mm_struct
* @limit: maximum address of the gmap address space
*
* Returns a guest address space structure.
*/
-struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
+static struct gmap *gmap_alloc(unsigned long limit)
{
struct gmap *gmap;
struct page *page;
@@ -55,10 +57,14 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
if (!gmap)
goto out;
INIT_LIST_HEAD(&gmap->crst_list);
+ INIT_LIST_HEAD(&gmap->children);
+ INIT_LIST_HEAD(&gmap->pt_list);
INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
+ INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
spin_lock_init(&gmap->guest_table_lock);
- gmap->mm = mm;
+ spin_lock_init(&gmap->shadow_lock);
+ atomic_set(&gmap->ref_count, 1);
page = alloc_pages(GFP_KERNEL, 2);
if (!page)
goto out_free;
@@ -70,9 +76,6 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
gmap->asce = atype | _ASCE_TABLE_LENGTH |
_ASCE_USER_BITS | __pa(table);
gmap->asce_end = limit;
- down_write(&mm->mmap_sem);
- list_add(&gmap->list, &mm->context.gmap_list);
- up_write(&mm->mmap_sem);
return gmap;
out_free:
@@ -80,7 +83,28 @@ out_free:
out:
return NULL;
}
-EXPORT_SYMBOL_GPL(gmap_alloc);
+
+/**
+ * gmap_create - create a guest address space
+ * @mm: pointer to the parent mm_struct
+ * @limit: maximum size of the gmap address space
+ *
+ * Returns a guest address space structure.
+ */
+struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
+{
+ struct gmap *gmap;
+
+ gmap = gmap_alloc(limit);
+ if (!gmap)
+ return NULL;
+ gmap->mm = mm;
+ spin_lock(&mm->context.gmap_lock);
+ list_add_rcu(&gmap->list, &mm->context.gmap_list);
+ spin_unlock(&mm->context.gmap_lock);
+ return gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_create);
static void gmap_flush_tlb(struct gmap *gmap)
{
@@ -114,31 +138,117 @@ static void gmap_radix_tree_free(struct radix_tree_root *root)
} while (nr > 0);
}
+static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
+{
+ struct gmap_rmap *rmap, *rnext, *head;
+ struct radix_tree_iter iter;
+ unsigned long indices[16];
+ unsigned long index;
+ void **slot;
+ int i, nr;
+
+ /* A radix tree is freed by deleting all of its entries */
+ index = 0;
+ do {
+ nr = 0;
+ radix_tree_for_each_slot(slot, root, &iter, index) {
+ indices[nr] = iter.index;
+ if (++nr == 16)
+ break;
+ }
+ for (i = 0; i < nr; i++) {
+ index = indices[i];
+ head = radix_tree_delete(root, index);
+ gmap_for_each_rmap_safe(rmap, rnext, head)
+ kfree(rmap);
+ }
+ } while (nr > 0);
+}
+
/**
* gmap_free - free a guest address space
* @gmap: pointer to the guest address space structure
+ *
+ * No locks required. There are no references to this gmap anymore.
*/
-void gmap_free(struct gmap *gmap)
+static void gmap_free(struct gmap *gmap)
{
struct page *page, *next;
- /* Flush tlb. */
- if (MACHINE_HAS_IDTE)
- __tlb_flush_idte(gmap->asce);
- else
- __tlb_flush_global();
-
+ /* Flush tlb of all gmaps (if not already done for shadows) */
+ if (!(gmap_is_shadow(gmap) && gmap->removed))
+ gmap_flush_tlb(gmap);
/* Free all segment & region tables. */
list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
__free_pages(page, 2);
gmap_radix_tree_free(&gmap->guest_to_host);
gmap_radix_tree_free(&gmap->host_to_guest);
- down_write(&gmap->mm->mmap_sem);
- list_del(&gmap->list);
- up_write(&gmap->mm->mmap_sem);
+
+ /* Free additional data for a shadow gmap */
+ if (gmap_is_shadow(gmap)) {
+ /* Free all page tables. */
+ list_for_each_entry_safe(page, next, &gmap->pt_list, lru)
+ page_table_free_pgste(page);
+ gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
+ /* Release reference to the parent */
+ gmap_put(gmap->parent);
+ }
+
kfree(gmap);
}
-EXPORT_SYMBOL_GPL(gmap_free);
+
+/**
+ * gmap_get - increase reference counter for guest address space
+ * @gmap: pointer to the guest address space structure
+ *
+ * Returns the gmap pointer
+ */
+struct gmap *gmap_get(struct gmap *gmap)
+{
+ atomic_inc(&gmap->ref_count);
+ return gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_get);
+
+/**
+ * gmap_put - decrease reference counter for guest address space
+ * @gmap: pointer to the guest address space structure
+ *
+ * If the reference counter reaches zero the guest address space is freed.
+ */
+void gmap_put(struct gmap *gmap)
+{
+ if (atomic_dec_return(&gmap->ref_count) == 0)
+ gmap_free(gmap);
+}
+EXPORT_SYMBOL_GPL(gmap_put);
+
+/**
+ * gmap_remove - remove a guest address space but do not free it yet
+ * @gmap: pointer to the guest address space structure
+ */
+void gmap_remove(struct gmap *gmap)
+{
+ struct gmap *sg, *next;
+
+ /* Remove all shadow gmaps linked to this gmap */
+ if (!list_empty(&gmap->children)) {
+ spin_lock(&gmap->shadow_lock);
+ list_for_each_entry_safe(sg, next, &gmap->children, list) {
+ list_del(&sg->list);
+ gmap_put(sg);
+ }
+ spin_unlock(&gmap->shadow_lock);
+ }
+ /* Remove gmap from the pre-mm list */
+ spin_lock(&gmap->mm->context.gmap_lock);
+ list_del_rcu(&gmap->list);
+ spin_unlock(&gmap->mm->context.gmap_lock);
+ synchronize_rcu();
+ /* Put reference */
+ gmap_put(gmap);
+}
+EXPORT_SYMBOL_GPL(gmap_remove);
/**
* gmap_enable - switch primary space to the guest address space
@@ -160,6 +270,17 @@ void gmap_disable(struct gmap *gmap)
}
EXPORT_SYMBOL_GPL(gmap_disable);
+/**
+ * gmap_get_enabled - get a pointer to the currently enabled gmap
+ *
+ * Returns a pointer to the currently enabled gmap. 0 if none is enabled.
+ */
+struct gmap *gmap_get_enabled(void)
+{
+ return (struct gmap *) S390_lowcore.gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_get_enabled);
+
/*
* gmap_alloc_table is assumed to be called with mmap_sem held
*/
@@ -175,7 +296,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
return -ENOMEM;
new = (unsigned long *) page_to_phys(page);
crst_table_init(new, init);
- spin_lock(&gmap->mm->page_table_lock);
+ spin_lock(&gmap->guest_table_lock);
if (*table & _REGION_ENTRY_INVALID) {
list_add(&page->lru, &gmap->crst_list);
*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
@@ -183,7 +304,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
page->index = gaddr;
page = NULL;
}
- spin_unlock(&gmap->mm->page_table_lock);
+ spin_unlock(&gmap->guest_table_lock);
if (page)
__free_pages(page, 2);
return 0;
@@ -219,6 +340,7 @@ static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
unsigned long *entry;
int flush = 0;
+ BUG_ON(gmap_is_shadow(gmap));
spin_lock(&gmap->guest_table_lock);
entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
if (entry) {
@@ -258,6 +380,7 @@ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
unsigned long off;
int flush;
+ BUG_ON(gmap_is_shadow(gmap));
if ((to | len) & (PMD_SIZE - 1))
return -EINVAL;
if (len == 0 || to + len < to)
@@ -289,6 +412,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from,
unsigned long off;
int flush;
+ BUG_ON(gmap_is_shadow(gmap));
if ((from | to | len) & (PMD_SIZE - 1))
return -EINVAL;
if (len == 0 || from + len < from || to + len < to ||
@@ -326,6 +450,8 @@ EXPORT_SYMBOL_GPL(gmap_map_segment);
* This function does not establish potentially missing page table entries.
* The mmap_sem of the mm that belongs to the address space must be held
* when this function gets called.
+ *
+ * Note: Can also be called for shadow gmaps.
*/
unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
@@ -333,6 +459,7 @@ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
vmaddr = (unsigned long)
radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
+ /* Note: guest_to_host is empty for a shadow gmap */
return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);
@@ -369,11 +496,13 @@ void gmap_unlink(struct mm_struct *mm, unsigned long *table,
struct gmap *gmap;
int flush;
- list_for_each_entry(gmap, &mm->context.gmap_list, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
if (flush)
gmap_flush_tlb(gmap);
}
+ rcu_read_unlock();
}
/**
@@ -397,6 +526,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
pmd_t *pmd;
int rc;
+ BUG_ON(gmap_is_shadow(gmap));
/* Create higher level tables in the gmap page table */
table = gmap->table;
if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
@@ -430,6 +560,9 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
VM_BUG_ON(pgd_none(*pgd));
pud = pud_offset(pgd, vmaddr);
VM_BUG_ON(pud_none(*pud));
+ /* large puds cannot yet be handled */
+ if (pud_large(*pud))
+ return -EFAULT;
pmd = pmd_offset(pud, vmaddr);
VM_BUG_ON(pmd_none(*pmd));
/* large pmds cannot yet be handled */
@@ -549,116 +682,1412 @@ static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);
/**
- * gmap_register_ipte_notifier - register a pte invalidation callback
+ * gmap_register_pte_notifier - register a pte invalidation callback
* @nb: pointer to the gmap notifier block
*/
-void gmap_register_ipte_notifier(struct gmap_notifier *nb)
+void gmap_register_pte_notifier(struct gmap_notifier *nb)
{
spin_lock(&gmap_notifier_lock);
- list_add(&nb->list, &gmap_notifier_list);
+ list_add_rcu(&nb->list, &gmap_notifier_list);
spin_unlock(&gmap_notifier_lock);
}
-EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);
+EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);
/**
- * gmap_unregister_ipte_notifier - remove a pte invalidation callback
+ * gmap_unregister_pte_notifier - remove a pte invalidation callback
* @nb: pointer to the gmap notifier block
*/
-void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
+void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
{
spin_lock(&gmap_notifier_lock);
- list_del_init(&nb->list);
+ list_del_rcu(&nb->list);
spin_unlock(&gmap_notifier_lock);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);
+
+/**
+ * gmap_call_notifier - call all registered invalidation callbacks
+ * @gmap: pointer to guest mapping meta data structure
+ * @start: start virtual address in the guest address space
+ * @end: end virtual address in the guest address space
+ */
+static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
+ unsigned long end)
+{
+ struct gmap_notifier *nb;
+
+ list_for_each_entry(nb, &gmap_notifier_list, list)
+ nb->notifier_call(gmap, start, end);
+}
+
+/**
+ * gmap_table_walk - walk the gmap page tables
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @level: page table level to stop at
+ *
+ * Returns a table entry pointer for the given guest address and @level
+ * @level=0 : returns a pointer to a page table table entry (or NULL)
+ * @level=1 : returns a pointer to a segment table entry (or NULL)
+ * @level=2 : returns a pointer to a region-3 table entry (or NULL)
+ * @level=3 : returns a pointer to a region-2 table entry (or NULL)
+ * @level=4 : returns a pointer to a region-1 table entry (or NULL)
+ *
+ * Returns NULL if the gmap page tables could not be walked to the
+ * requested level.
+ *
+ * Note: Can also be called for shadow gmaps.
+ */
+static inline unsigned long *gmap_table_walk(struct gmap *gmap,
+ unsigned long gaddr, int level)
+{
+ unsigned long *table;
+
+ if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4))
+ return NULL;
+ if (gmap_is_shadow(gmap) && gmap->removed)
+ return NULL;
+ if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11)))
+ return NULL;
+ table = gmap->table;
+ switch (gmap->asce & _ASCE_TYPE_MASK) {
+ case _ASCE_TYPE_REGION1:
+ table += (gaddr >> 53) & 0x7ff;
+ if (level == 4)
+ break;
+ if (*table & _REGION_ENTRY_INVALID)
+ return NULL;
+ table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ /* Fallthrough */
+ case _ASCE_TYPE_REGION2:
+ table += (gaddr >> 42) & 0x7ff;
+ if (level == 3)
+ break;
+ if (*table & _REGION_ENTRY_INVALID)
+ return NULL;
+ table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ /* Fallthrough */
+ case _ASCE_TYPE_REGION3:
+ table += (gaddr >> 31) & 0x7ff;
+ if (level == 2)
+ break;
+ if (*table & _REGION_ENTRY_INVALID)
+ return NULL;
+ table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ /* Fallthrough */
+ case _ASCE_TYPE_SEGMENT:
+ table += (gaddr >> 20) & 0x7ff;
+ if (level == 1)
+ break;
+ if (*table & _REGION_ENTRY_INVALID)
+ return NULL;
+ table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
+ table += (gaddr >> 12) & 0xff;
+ }
+ return table;
+}
+
+/**
+ * gmap_pte_op_walk - walk the gmap page table, get the page table lock
+ * and return the pte pointer
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @ptl: pointer to the spinlock pointer
+ *
+ * Returns a pointer to the locked pte for a guest address, or NULL
+ *
+ * Note: Can also be called for shadow gmaps.
+ */
+static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
+ spinlock_t **ptl)
+{
+ unsigned long *table;
+
+ if (gmap_is_shadow(gmap))
+ spin_lock(&gmap->guest_table_lock);
+ /* Walk the gmap page table, lock and get pte pointer */
+ table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
+ if (!table || *table & _SEGMENT_ENTRY_INVALID) {
+ if (gmap_is_shadow(gmap))
+ spin_unlock(&gmap->guest_table_lock);
+ return NULL;
+ }
+ if (gmap_is_shadow(gmap)) {
+ *ptl = &gmap->guest_table_lock;
+ return pte_offset_map((pmd_t *) table, gaddr);
+ }
+ return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
+}
+
+/**
+ * gmap_pte_op_fixup - force a page in and connect the gmap page table
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @vmaddr: address in the host process address space
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ *
+ * Returns 0 if the caller can retry __gmap_translate (might fail again),
+ * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
+ * up or connecting the gmap page table.
+ */
+static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
+ unsigned long vmaddr, int prot)
+{
+ struct mm_struct *mm = gmap->mm;
+ unsigned int fault_flags;
+ bool unlocked = false;
+
+ BUG_ON(gmap_is_shadow(gmap));
+ fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
+ if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked))
+ return -EFAULT;
+ if (unlocked)
+ /* lost mmap_sem, caller has to retry __gmap_translate */
+ return 0;
+ /* Connect the page tables */
+ return __gmap_link(gmap, gaddr, vmaddr);
}
-EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
/**
- * gmap_ipte_notify - mark a range of ptes for invalidation notification
+ * gmap_pte_op_end - release the page table lock
+ * @ptl: pointer to the spinlock pointer
+ */
+static void gmap_pte_op_end(spinlock_t *ptl)
+{
+ spin_unlock(ptl);
+}
+
+/*
+ * gmap_protect_range - remove access rights to memory and set pgste bits
* @gmap: pointer to guest mapping meta data structure
* @gaddr: virtual address in the guest address space
* @len: size of area
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bits: pgste notification bits to set
+ *
+ * Returns 0 if successfully protected, -ENOMEM if out of memory and
+ * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
*
- * Returns 0 if for each page in the given range a gmap mapping exists and
- * the invalidation notification could be set. If the gmap mapping is missing
- * for one or more pages -EFAULT is returned. If no memory could be allocated
- * -ENOMEM is returned. This function establishes missing page table entries.
+ * Called with sg->mm->mmap_sem in read.
+ *
+ * Note: Can also be called for shadow gmaps.
*/
-int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
+static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
+ unsigned long len, int prot, unsigned long bits)
{
- unsigned long addr;
+ unsigned long vmaddr;
spinlock_t *ptl;
pte_t *ptep;
- bool unlocked;
- int rc = 0;
+ int rc;
+
+ while (len) {
+ rc = -EAGAIN;
+ ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
+ if (ptep) {
+ rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits);
+ gmap_pte_op_end(ptl);
+ }
+ if (rc) {
+ vmaddr = __gmap_translate(gmap, gaddr);
+ if (IS_ERR_VALUE(vmaddr))
+ return vmaddr;
+ rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
+ if (rc)
+ return rc;
+ continue;
+ }
+ gaddr += PAGE_SIZE;
+ len -= PAGE_SIZE;
+ }
+ return 0;
+}
- if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
+/**
+ * gmap_mprotect_notify - change access rights for a range of ptes and
+ * call the notifier if any pte changes again
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @len: size of area
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ *
+ * Returns 0 if for each page in the given range a gmap mapping exists,
+ * the new access rights could be set and the notifier could be armed.
+ * If the gmap mapping is missing for one or more pages -EFAULT is
+ * returned. If no memory could be allocated -ENOMEM is returned.
+ * This function establishes missing page table entries.
+ */
+int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
+ unsigned long len, int prot)
+{
+ int rc;
+
+ if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap))
+ return -EINVAL;
+ if (!MACHINE_HAS_ESOP && prot == PROT_READ)
return -EINVAL;
down_read(&gmap->mm->mmap_sem);
- while (len) {
- unlocked = false;
- /* Convert gmap address and connect the page tables */
- addr = __gmap_translate(gmap, gaddr);
- if (IS_ERR_VALUE(addr)) {
- rc = addr;
+ rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT);
+ up_read(&gmap->mm->mmap_sem);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
+
+/**
+ * gmap_read_table - get an unsigned long value from a guest page table using
+ * absolute addressing, without marking the page referenced.
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @val: pointer to the unsigned long value to return
+ *
+ * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
+ * if reading using the virtual address failed.
+ *
+ * Called with gmap->mm->mmap_sem in read.
+ */
+int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
+{
+ unsigned long address, vmaddr;
+ spinlock_t *ptl;
+ pte_t *ptep, pte;
+ int rc;
+
+ while (1) {
+ rc = -EAGAIN;
+ ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
+ if (ptep) {
+ pte = *ptep;
+ if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
+ address = pte_val(pte) & PAGE_MASK;
+ address += gaddr & ~PAGE_MASK;
+ *val = *(unsigned long *) address;
+ pte_val(*ptep) |= _PAGE_YOUNG;
+ /* Do *NOT* clear the _PAGE_INVALID bit! */
+ rc = 0;
+ }
+ gmap_pte_op_end(ptl);
+ }
+ if (!rc)
+ break;
+ vmaddr = __gmap_translate(gmap, gaddr);
+ if (IS_ERR_VALUE(vmaddr)) {
+ rc = vmaddr;
break;
}
- /* Get the page mapped */
- if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
- &unlocked)) {
- rc = -EFAULT;
+ rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
+ if (rc)
break;
+ }
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_read_table);
+
+/**
+ * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
+ * @sg: pointer to the shadow guest address space structure
+ * @vmaddr: vm address associated with the rmap
+ * @rmap: pointer to the rmap structure
+ *
+ * Called with the sg->guest_table_lock
+ */
+static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
+ struct gmap_rmap *rmap)
+{
+ void **slot;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
+ if (slot) {
+ rmap->next = radix_tree_deref_slot_protected(slot,
+ &sg->guest_table_lock);
+ radix_tree_replace_slot(slot, rmap);
+ } else {
+ rmap->next = NULL;
+ radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
+ rmap);
+ }
+}
+
+/**
+ * gmap_protect_rmap - modify access rights to memory and create an rmap
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow gmap
+ * @paddr: address in the parent guest address space
+ * @len: length of the memory area to protect
+ * @prot: indicates access rights: none, read-only or read-write
+ *
+ * Returns 0 if successfully protected and the rmap was created, -ENOMEM
+ * if out of memory and -EFAULT if paddr is invalid.
+ */
+static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
+ unsigned long paddr, unsigned long len, int prot)
+{
+ struct gmap *parent;
+ struct gmap_rmap *rmap;
+ unsigned long vmaddr;
+ spinlock_t *ptl;
+ pte_t *ptep;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ parent = sg->parent;
+ while (len) {
+ vmaddr = __gmap_translate(parent, paddr);
+ if (IS_ERR_VALUE(vmaddr))
+ return vmaddr;
+ rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+ if (!rmap)
+ return -ENOMEM;
+ rmap->raddr = raddr;
+ rc = radix_tree_preload(GFP_KERNEL);
+ if (rc) {
+ kfree(rmap);
+ return rc;
}
- /* While trying to map mmap_sem got unlocked. Let us retry */
- if (unlocked)
+ rc = -EAGAIN;
+ ptep = gmap_pte_op_walk(parent, paddr, &ptl);
+ if (ptep) {
+ spin_lock(&sg->guest_table_lock);
+ rc = ptep_force_prot(parent->mm, paddr, ptep, prot,
+ PGSTE_VSIE_BIT);
+ if (!rc)
+ gmap_insert_rmap(sg, vmaddr, rmap);
+ spin_unlock(&sg->guest_table_lock);
+ gmap_pte_op_end(ptl);
+ }
+ radix_tree_preload_end();
+ if (rc) {
+ kfree(rmap);
+ rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
+ if (rc)
+ return rc;
+ continue;
+ }
+ paddr += PAGE_SIZE;
+ len -= PAGE_SIZE;
+ }
+ return 0;
+}
+
+#define _SHADOW_RMAP_MASK 0x7
+#define _SHADOW_RMAP_REGION1 0x5
+#define _SHADOW_RMAP_REGION2 0x4
+#define _SHADOW_RMAP_REGION3 0x3
+#define _SHADOW_RMAP_SEGMENT 0x2
+#define _SHADOW_RMAP_PGTABLE 0x1
+
+/**
+ * gmap_idte_one - invalidate a single region or segment table entry
+ * @asce: region or segment table *origin* + table-type bits
+ * @vaddr: virtual address to identify the table entry to flush
+ *
+ * The invalid bit of a single region or segment table entry is set
+ * and the associated TLB entries depending on the entry are flushed.
+ * The table-type of the @asce identifies the portion of the @vaddr
+ * that is used as the invalidation index.
+ */
+static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
+{
+ asm volatile(
+ " .insn rrf,0xb98e0000,%0,%1,0,0"
+ : : "a" (asce), "a" (vaddr) : "cc", "memory");
+}
+
+/**
+ * gmap_unshadow_page - remove a page from a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long *table;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
+ if (!table || *table & _PAGE_INVALID)
+ return;
+ gmap_call_notifier(sg, raddr, raddr + (1UL << 12) - 1);
+ ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
+}
+
+/**
+ * __gmap_unshadow_pgt - remove all entries from a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @pgt: pointer to the start of a shadow page table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
+ unsigned long *pgt)
+{
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ for (i = 0; i < 256; i++, raddr += 1UL << 12)
+ pgt[i] = _PAGE_INVALID;
+}
+
+/**
+ * gmap_unshadow_pgt - remove a shadow page table from a segment entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long sto, *ste, *pgt;
+ struct page *page;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
+ if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
+ return;
+ gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1);
+ sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff));
+ gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
+ pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN);
+ *ste = _SEGMENT_ENTRY_EMPTY;
+ __gmap_unshadow_pgt(sg, raddr, pgt);
+ /* Free page table */
+ page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ page_table_free_pgste(page);
+}
+
+/**
+ * __gmap_unshadow_sgt - remove all entries from a shadow segment table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @sgt: pointer to the start of a shadow segment table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
+ unsigned long *sgt)
+{
+ unsigned long asce, *pgt;
+ struct page *page;
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT;
+ for (i = 0; i < 2048; i++, raddr += 1UL << 20) {
+ if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
+ continue;
+ pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN);
+ sgt[i] = _SEGMENT_ENTRY_EMPTY;
+ __gmap_unshadow_pgt(sg, raddr, pgt);
+ /* Free page table */
+ page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ page_table_free_pgste(page);
+ }
+}
+
+/**
+ * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the shadow->guest_table_lock
+ */
+static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long r3o, *r3e, *sgt;
+ struct page *page;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
+ if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
+ return;
+ gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1);
+ r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff));
+ gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr);
+ sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN);
+ *r3e = _REGION3_ENTRY_EMPTY;
+ __gmap_unshadow_sgt(sg, raddr, sgt);
+ /* Free segment table */
+ page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+}
+
+/**
+ * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: address in the shadow guest address space
+ * @r3t: pointer to the start of a shadow region-3 table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
+ unsigned long *r3t)
+{
+ unsigned long asce, *sgt;
+ struct page *page;
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ asce = (unsigned long) r3t | _ASCE_TYPE_REGION3;
+ for (i = 0; i < 2048; i++, raddr += 1UL << 31) {
+ if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
+ continue;
+ sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN);
+ r3t[i] = _REGION3_ENTRY_EMPTY;
+ __gmap_unshadow_sgt(sg, raddr, sgt);
+ /* Free segment table */
+ page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+ }
+}
+
+/**
+ * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long r2o, *r2e, *r3t;
+ struct page *page;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
+ if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
+ return;
+ gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 1);
+ r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff));
+ gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr);
+ r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN);
+ *r2e = _REGION2_ENTRY_EMPTY;
+ __gmap_unshadow_r3t(sg, raddr, r3t);
+ /* Free region 3 table */
+ page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+}
+
+/**
+ * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @r2t: pointer to the start of a shadow region-2 table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
+ unsigned long *r2t)
+{
+ unsigned long asce, *r3t;
+ struct page *page;
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ asce = (unsigned long) r2t | _ASCE_TYPE_REGION2;
+ for (i = 0; i < 2048; i++, raddr += 1UL << 42) {
+ if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
continue;
- rc = __gmap_link(gmap, gaddr, addr);
+ r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN);
+ r2t[i] = _REGION2_ENTRY_EMPTY;
+ __gmap_unshadow_r3t(sg, raddr, r3t);
+ /* Free region 3 table */
+ page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+ }
+}
+
+/**
+ * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long r1o, *r1e, *r2t;
+ struct page *page;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
+ if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
+ return;
+ gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1);
+ r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff));
+ gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr);
+ r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN);
+ *r1e = _REGION1_ENTRY_EMPTY;
+ __gmap_unshadow_r2t(sg, raddr, r2t);
+ /* Free region 2 table */
+ page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+}
+
+/**
+ * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @r1t: pointer to the start of a shadow region-1 table
+ *
+ * Called with the shadow->guest_table_lock
+ */
+static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
+ unsigned long *r1t)
+{
+ unsigned long asce, *r2t;
+ struct page *page;
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ asce = (unsigned long) r1t | _ASCE_TYPE_REGION1;
+ for (i = 0; i < 2048; i++, raddr += 1UL << 53) {
+ if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
+ continue;
+ r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN);
+ __gmap_unshadow_r2t(sg, raddr, r2t);
+ /* Clear entry and flush translation r1t -> r2t */
+ gmap_idte_one(asce, raddr);
+ r1t[i] = _REGION1_ENTRY_EMPTY;
+ /* Free region 2 table */
+ page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+ }
+}
+
+/**
+ * gmap_unshadow - remove a shadow page table completely
+ * @sg: pointer to the shadow guest address space structure
+ *
+ * Called with sg->guest_table_lock
+ */
+static void gmap_unshadow(struct gmap *sg)
+{
+ unsigned long *table;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ if (sg->removed)
+ return;
+ sg->removed = 1;
+ gmap_call_notifier(sg, 0, -1UL);
+ gmap_flush_tlb(sg);
+ table = (unsigned long *)(sg->asce & _ASCE_ORIGIN);
+ switch (sg->asce & _ASCE_TYPE_MASK) {
+ case _ASCE_TYPE_REGION1:
+ __gmap_unshadow_r1t(sg, 0, table);
+ break;
+ case _ASCE_TYPE_REGION2:
+ __gmap_unshadow_r2t(sg, 0, table);
+ break;
+ case _ASCE_TYPE_REGION3:
+ __gmap_unshadow_r3t(sg, 0, table);
+ break;
+ case _ASCE_TYPE_SEGMENT:
+ __gmap_unshadow_sgt(sg, 0, table);
+ break;
+ }
+}
+
+/**
+ * gmap_find_shadow - find a specific asce in the list of shadow tables
+ * @parent: pointer to the parent gmap
+ * @asce: ASCE for which the shadow table is created
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * Returns the pointer to a gmap if a shadow table with the given asce is
+ * already available, ERR_PTR(-EAGAIN) if another one is just being created,
+ * otherwise NULL
+ */
+static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
+ int edat_level)
+{
+ struct gmap *sg;
+
+ list_for_each_entry(sg, &parent->children, list) {
+ if (sg->orig_asce != asce || sg->edat_level != edat_level ||
+ sg->removed)
+ continue;
+ if (!sg->initialized)
+ return ERR_PTR(-EAGAIN);
+ atomic_inc(&sg->ref_count);
+ return sg;
+ }
+ return NULL;
+}
+
+/**
+ * gmap_shadow_valid - check if a shadow guest address space matches the
+ * given properties and is still valid
+ * @sg: pointer to the shadow guest address space structure
+ * @asce: ASCE for which the shadow table is requested
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * Returns 1 if the gmap shadow is still valid and matches the given
+ * properties, the caller can continue using it. Returns 0 otherwise, the
+ * caller has to request a new shadow gmap in this case.
+ *
+ */
+int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
+{
+ if (sg->removed)
+ return 0;
+ return sg->orig_asce == asce && sg->edat_level == edat_level;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_valid);
+
+/**
+ * gmap_shadow - create/find a shadow guest address space
+ * @parent: pointer to the parent gmap
+ * @asce: ASCE for which the shadow table is created
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * The pages of the top level page table referred by the asce parameter
+ * will be set to read-only and marked in the PGSTEs of the kvm process.
+ * The shadow table will be removed automatically on any change to the
+ * PTE mapping for the source table.
+ *
+ * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
+ * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
+ * parent gmap table could not be protected.
+ */
+struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
+ int edat_level)
+{
+ struct gmap *sg, *new;
+ unsigned long limit;
+ int rc;
+
+ BUG_ON(gmap_is_shadow(parent));
+ spin_lock(&parent->shadow_lock);
+ sg = gmap_find_shadow(parent, asce, edat_level);
+ spin_unlock(&parent->shadow_lock);
+ if (sg)
+ return sg;
+ /* Create a new shadow gmap */
+ limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
+ if (asce & _ASCE_REAL_SPACE)
+ limit = -1UL;
+ new = gmap_alloc(limit);
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+ new->mm = parent->mm;
+ new->parent = gmap_get(parent);
+ new->orig_asce = asce;
+ new->edat_level = edat_level;
+ new->initialized = false;
+ spin_lock(&parent->shadow_lock);
+ /* Recheck if another CPU created the same shadow */
+ sg = gmap_find_shadow(parent, asce, edat_level);
+ if (sg) {
+ spin_unlock(&parent->shadow_lock);
+ gmap_free(new);
+ return sg;
+ }
+ if (asce & _ASCE_REAL_SPACE) {
+ /* only allow one real-space gmap shadow */
+ list_for_each_entry(sg, &parent->children, list) {
+ if (sg->orig_asce & _ASCE_REAL_SPACE) {
+ spin_lock(&sg->guest_table_lock);
+ gmap_unshadow(sg);
+ spin_unlock(&sg->guest_table_lock);
+ list_del(&sg->list);
+ gmap_put(sg);
+ break;
+ }
+ }
+ }
+ atomic_set(&new->ref_count, 2);
+ list_add(&new->list, &parent->children);
+ if (asce & _ASCE_REAL_SPACE) {
+ /* nothing to protect, return right away */
+ new->initialized = true;
+ spin_unlock(&parent->shadow_lock);
+ return new;
+ }
+ spin_unlock(&parent->shadow_lock);
+ /* protect after insertion, so it will get properly invalidated */
+ down_read(&parent->mm->mmap_sem);
+ rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
+ ((asce & _ASCE_TABLE_LENGTH) + 1) * 4096,
+ PROT_READ, PGSTE_VSIE_BIT);
+ up_read(&parent->mm->mmap_sem);
+ spin_lock(&parent->shadow_lock);
+ new->initialized = true;
+ if (rc) {
+ list_del(&new->list);
+ gmap_free(new);
+ new = ERR_PTR(rc);
+ }
+ spin_unlock(&parent->shadow_lock);
+ return new;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow);
+
+/**
+ * gmap_shadow_r2t - create an empty shadow region 2 table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @r2t: parent gmap address of the region 2 table to get shadowed
+ * @fake: r2t references contiguous guest memory block, not a r2t
+ *
+ * The r2t parameter specifies the address of the source table. The
+ * four pages of the source table are made read-only in the parent gmap
+ * address space. A write to the source table area @r2t will automatically
+ * remove the shadow r2 table and all of its decendents.
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
+ int fake)
+{
+ unsigned long raddr, origin, offset, len;
+ unsigned long *s_r2t, *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ /* Allocate a shadow region second table */
+ page = alloc_pages(GFP_KERNEL, 2);
+ if (!page)
+ return -ENOMEM;
+ page->index = r2t & _REGION_ENTRY_ORIGIN;
+ if (fake)
+ page->index |= GMAP_SHADOW_FAKE_TABLE;
+ s_r2t = (unsigned long *) page_to_phys(page);
+ /* Install shadow region second table */
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
+ if (!table) {
+ rc = -EAGAIN; /* Race with unshadow */
+ goto out_free;
+ }
+ if (!(*table & _REGION_ENTRY_INVALID)) {
+ rc = 0; /* Already established */
+ goto out_free;
+ } else if (*table & _REGION_ENTRY_ORIGIN) {
+ rc = -EAGAIN; /* Race with shadow */
+ goto out_free;
+ }
+ crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY);
+ /* mark as invalid as long as the parent table is not protected */
+ *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH |
+ _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
+ if (sg->edat_level >= 1)
+ *table |= (r2t & _REGION_ENTRY_PROTECT);
+ list_add(&page->lru, &sg->crst_list);
+ if (fake) {
+ /* nothing to protect for fake tables */
+ *table &= ~_REGION_ENTRY_INVALID;
+ spin_unlock(&sg->guest_table_lock);
+ return 0;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ /* Make r2t read-only in parent gmap page table */
+ raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1;
+ origin = r2t & _REGION_ENTRY_ORIGIN;
+ offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+ len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+ rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+ spin_lock(&sg->guest_table_lock);
+ if (!rc) {
+ table = gmap_table_walk(sg, saddr, 4);
+ if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+ (unsigned long) s_r2t)
+ rc = -EAGAIN; /* Race with unshadow */
+ else
+ *table &= ~_REGION_ENTRY_INVALID;
+ } else {
+ gmap_unshadow_r2t(sg, raddr);
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+out_free:
+ spin_unlock(&sg->guest_table_lock);
+ __free_pages(page, 2);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
+
+/**
+ * gmap_shadow_r3t - create a shadow region 3 table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @r3t: parent gmap address of the region 3 table to get shadowed
+ * @fake: r3t references contiguous guest memory block, not a r3t
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
+ int fake)
+{
+ unsigned long raddr, origin, offset, len;
+ unsigned long *s_r3t, *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ /* Allocate a shadow region second table */
+ page = alloc_pages(GFP_KERNEL, 2);
+ if (!page)
+ return -ENOMEM;
+ page->index = r3t & _REGION_ENTRY_ORIGIN;
+ if (fake)
+ page->index |= GMAP_SHADOW_FAKE_TABLE;
+ s_r3t = (unsigned long *) page_to_phys(page);
+ /* Install shadow region second table */
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
+ if (!table) {
+ rc = -EAGAIN; /* Race with unshadow */
+ goto out_free;
+ }
+ if (!(*table & _REGION_ENTRY_INVALID)) {
+ rc = 0; /* Already established */
+ goto out_free;
+ } else if (*table & _REGION_ENTRY_ORIGIN) {
+ rc = -EAGAIN; /* Race with shadow */
+ }
+ crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
+ /* mark as invalid as long as the parent table is not protected */
+ *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH |
+ _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
+ if (sg->edat_level >= 1)
+ *table |= (r3t & _REGION_ENTRY_PROTECT);
+ list_add(&page->lru, &sg->crst_list);
+ if (fake) {
+ /* nothing to protect for fake tables */
+ *table &= ~_REGION_ENTRY_INVALID;
+ spin_unlock(&sg->guest_table_lock);
+ return 0;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ /* Make r3t read-only in parent gmap page table */
+ raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2;
+ origin = r3t & _REGION_ENTRY_ORIGIN;
+ offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+ len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+ rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+ spin_lock(&sg->guest_table_lock);
+ if (!rc) {
+ table = gmap_table_walk(sg, saddr, 3);
+ if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+ (unsigned long) s_r3t)
+ rc = -EAGAIN; /* Race with unshadow */
+ else
+ *table &= ~_REGION_ENTRY_INVALID;
+ } else {
+ gmap_unshadow_r3t(sg, raddr);
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+out_free:
+ spin_unlock(&sg->guest_table_lock);
+ __free_pages(page, 2);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
+
+/**
+ * gmap_shadow_sgt - create a shadow segment table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @sgt: parent gmap address of the segment table to get shadowed
+ * @fake: sgt references contiguous guest memory block, not a sgt
+ *
+ * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
+ int fake)
+{
+ unsigned long raddr, origin, offset, len;
+ unsigned long *s_sgt, *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
+ /* Allocate a shadow segment table */
+ page = alloc_pages(GFP_KERNEL, 2);
+ if (!page)
+ return -ENOMEM;
+ page->index = sgt & _REGION_ENTRY_ORIGIN;
+ if (fake)
+ page->index |= GMAP_SHADOW_FAKE_TABLE;
+ s_sgt = (unsigned long *) page_to_phys(page);
+ /* Install shadow region second table */
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
+ if (!table) {
+ rc = -EAGAIN; /* Race with unshadow */
+ goto out_free;
+ }
+ if (!(*table & _REGION_ENTRY_INVALID)) {
+ rc = 0; /* Already established */
+ goto out_free;
+ } else if (*table & _REGION_ENTRY_ORIGIN) {
+ rc = -EAGAIN; /* Race with shadow */
+ goto out_free;
+ }
+ crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY);
+ /* mark as invalid as long as the parent table is not protected */
+ *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH |
+ _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
+ if (sg->edat_level >= 1)
+ *table |= sgt & _REGION_ENTRY_PROTECT;
+ list_add(&page->lru, &sg->crst_list);
+ if (fake) {
+ /* nothing to protect for fake tables */
+ *table &= ~_REGION_ENTRY_INVALID;
+ spin_unlock(&sg->guest_table_lock);
+ return 0;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ /* Make sgt read-only in parent gmap page table */
+ raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3;
+ origin = sgt & _REGION_ENTRY_ORIGIN;
+ offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+ len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+ rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+ spin_lock(&sg->guest_table_lock);
+ if (!rc) {
+ table = gmap_table_walk(sg, saddr, 2);
+ if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+ (unsigned long) s_sgt)
+ rc = -EAGAIN; /* Race with unshadow */
+ else
+ *table &= ~_REGION_ENTRY_INVALID;
+ } else {
+ gmap_unshadow_sgt(sg, raddr);
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+out_free:
+ spin_unlock(&sg->guest_table_lock);
+ __free_pages(page, 2);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
+
+/**
+ * gmap_shadow_lookup_pgtable - find a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: the address in the shadow aguest address space
+ * @pgt: parent gmap address of the page table to get shadowed
+ * @dat_protection: if the pgtable is marked as protected by dat
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ *
+ * Returns 0 if the shadow page table was found and -EAGAIN if the page
+ * table was not found.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
+ unsigned long *pgt, int *dat_protection,
+ int *fake)
+{
+ unsigned long *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
+ if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
+ /* Shadow page tables are full pages (pte+pgste) */
+ page = pfn_to_page(*table >> PAGE_SHIFT);
+ *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE;
+ *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
+ *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE);
+ rc = 0;
+ } else {
+ rc = -EAGAIN;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
+
+/**
+ * gmap_shadow_pgt - instantiate a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pgt: parent gmap address of the page table to get shadowed
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory,
+ * -EFAULT if an address in the parent gmap could not be resolved and
+ *
+ * Called with gmap->mm->mmap_sem in read
+ */
+int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
+ int fake)
+{
+ unsigned long raddr, origin;
+ unsigned long *s_pgt, *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
+ /* Allocate a shadow page table */
+ page = page_table_alloc_pgste(sg->mm);
+ if (!page)
+ return -ENOMEM;
+ page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
+ if (fake)
+ page->index |= GMAP_SHADOW_FAKE_TABLE;
+ s_pgt = (unsigned long *) page_to_phys(page);
+ /* Install shadow page table */
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
+ if (!table) {
+ rc = -EAGAIN; /* Race with unshadow */
+ goto out_free;
+ }
+ if (!(*table & _SEGMENT_ENTRY_INVALID)) {
+ rc = 0; /* Already established */
+ goto out_free;
+ } else if (*table & _SEGMENT_ENTRY_ORIGIN) {
+ rc = -EAGAIN; /* Race with shadow */
+ goto out_free;
+ }
+ /* mark as invalid as long as the parent table is not protected */
+ *table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
+ (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
+ list_add(&page->lru, &sg->pt_list);
+ if (fake) {
+ /* nothing to protect for fake tables */
+ *table &= ~_SEGMENT_ENTRY_INVALID;
+ spin_unlock(&sg->guest_table_lock);
+ return 0;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ /* Make pgt read-only in parent gmap page table (not the pgste) */
+ raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT;
+ origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
+ rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ);
+ spin_lock(&sg->guest_table_lock);
+ if (!rc) {
+ table = gmap_table_walk(sg, saddr, 1);
+ if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) !=
+ (unsigned long) s_pgt)
+ rc = -EAGAIN; /* Race with unshadow */
+ else
+ *table &= ~_SEGMENT_ENTRY_INVALID;
+ } else {
+ gmap_unshadow_pgt(sg, raddr);
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+out_free:
+ spin_unlock(&sg->guest_table_lock);
+ page_table_free_pgste(page);
+ return rc;
+
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
+
+/**
+ * gmap_shadow_page - create a shadow page mapping
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pte: pte in parent gmap address space to get shadowed
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
+{
+ struct gmap *parent;
+ struct gmap_rmap *rmap;
+ unsigned long vmaddr, paddr;
+ spinlock_t *ptl;
+ pte_t *sptep, *tptep;
+ int prot;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ parent = sg->parent;
+ prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
+
+ rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+ if (!rmap)
+ return -ENOMEM;
+ rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
+
+ while (1) {
+ paddr = pte_val(pte) & PAGE_MASK;
+ vmaddr = __gmap_translate(parent, paddr);
+ if (IS_ERR_VALUE(vmaddr)) {
+ rc = vmaddr;
+ break;
+ }
+ rc = radix_tree_preload(GFP_KERNEL);
if (rc)
break;
- /* Walk the process page table, lock and get pte pointer */
- ptep = get_locked_pte(gmap->mm, addr, &ptl);
- VM_BUG_ON(!ptep);
- /* Set notification bit in the pgste of the pte */
- if ((pte_val(*ptep) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
- ptep_set_notify(gmap->mm, addr, ptep);
- gaddr += PAGE_SIZE;
- len -= PAGE_SIZE;
+ rc = -EAGAIN;
+ sptep = gmap_pte_op_walk(parent, paddr, &ptl);
+ if (sptep) {
+ spin_lock(&sg->guest_table_lock);
+ /* Get page table pointer */
+ tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
+ if (!tptep) {
+ spin_unlock(&sg->guest_table_lock);
+ gmap_pte_op_end(ptl);
+ radix_tree_preload_end();
+ break;
+ }
+ rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
+ if (rc > 0) {
+ /* Success and a new mapping */
+ gmap_insert_rmap(sg, vmaddr, rmap);
+ rmap = NULL;
+ rc = 0;
+ }
+ gmap_pte_op_end(ptl);
+ spin_unlock(&sg->guest_table_lock);
}
- pte_unmap_unlock(ptep, ptl);
+ radix_tree_preload_end();
+ if (!rc)
+ break;
+ rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
+ if (rc)
+ break;
}
- up_read(&gmap->mm->mmap_sem);
+ kfree(rmap);
return rc;
}
-EXPORT_SYMBOL_GPL(gmap_ipte_notify);
+EXPORT_SYMBOL_GPL(gmap_shadow_page);
+
+/**
+ * gmap_shadow_notify - handle notifications for shadow gmap
+ *
+ * Called with sg->parent->shadow_lock.
+ */
+static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
+ unsigned long offset, pte_t *pte)
+{
+ struct gmap_rmap *rmap, *rnext, *head;
+ unsigned long gaddr, start, end, bits, raddr;
+ unsigned long *table;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ spin_lock(&sg->parent->guest_table_lock);
+ table = radix_tree_lookup(&sg->parent->host_to_guest,
+ vmaddr >> PMD_SHIFT);
+ gaddr = table ? __gmap_segment_gaddr(table) + offset : 0;
+ spin_unlock(&sg->parent->guest_table_lock);
+ if (!table)
+ return;
+
+ spin_lock(&sg->guest_table_lock);
+ if (sg->removed) {
+ spin_unlock(&sg->guest_table_lock);
+ return;
+ }
+ /* Check for top level table */
+ start = sg->orig_asce & _ASCE_ORIGIN;
+ end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096;
+ if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
+ gaddr < end) {
+ /* The complete shadow table has to go */
+ gmap_unshadow(sg);
+ spin_unlock(&sg->guest_table_lock);
+ list_del(&sg->list);
+ gmap_put(sg);
+ return;
+ }
+ /* Remove the page table tree from on specific entry */
+ head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> 12);
+ gmap_for_each_rmap_safe(rmap, rnext, head) {
+ bits = rmap->raddr & _SHADOW_RMAP_MASK;
+ raddr = rmap->raddr ^ bits;
+ switch (bits) {
+ case _SHADOW_RMAP_REGION1:
+ gmap_unshadow_r2t(sg, raddr);
+ break;
+ case _SHADOW_RMAP_REGION2:
+ gmap_unshadow_r3t(sg, raddr);
+ break;
+ case _SHADOW_RMAP_REGION3:
+ gmap_unshadow_sgt(sg, raddr);
+ break;
+ case _SHADOW_RMAP_SEGMENT:
+ gmap_unshadow_pgt(sg, raddr);
+ break;
+ case _SHADOW_RMAP_PGTABLE:
+ gmap_unshadow_page(sg, raddr);
+ break;
+ }
+ kfree(rmap);
+ }
+ spin_unlock(&sg->guest_table_lock);
+}
/**
* ptep_notify - call all invalidation callbacks for a specific pte.
* @mm: pointer to the process mm_struct
* @addr: virtual address in the process address space
* @pte: pointer to the page table entry
+ * @bits: bits from the pgste that caused the notify call
*
* This function is assumed to be called with the page table lock held
* for the pte to notify.
*/
-void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
+void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
+ pte_t *pte, unsigned long bits)
{
unsigned long offset, gaddr;
unsigned long *table;
- struct gmap_notifier *nb;
- struct gmap *gmap;
+ struct gmap *gmap, *sg, *next;
offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
offset = offset * (4096 / sizeof(pte_t));
- spin_lock(&gmap_notifier_lock);
- list_for_each_entry(gmap, &mm->context.gmap_list, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
+ if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
+ spin_lock(&gmap->shadow_lock);
+ list_for_each_entry_safe(sg, next,
+ &gmap->children, list)
+ gmap_shadow_notify(sg, vmaddr, offset, pte);
+ spin_unlock(&gmap->shadow_lock);
+ }
+ if (!(bits & PGSTE_IN_BIT))
+ continue;
+ spin_lock(&gmap->guest_table_lock);
table = radix_tree_lookup(&gmap->host_to_guest,
vmaddr >> PMD_SHIFT);
- if (!table)
- continue;
- gaddr = __gmap_segment_gaddr(table) + offset;
- list_for_each_entry(nb, &gmap_notifier_list, list)
- nb->notifier_call(gmap, gaddr);
+ if (table)
+ gaddr = __gmap_segment_gaddr(table) + offset;
+ spin_unlock(&gmap->guest_table_lock);
+ if (table)
+ gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
}
- spin_unlock(&gmap_notifier_lock);
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(ptep_notify);
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
index a8a6765f1..adb0c34bf 100644
--- a/arch/s390/mm/gup.c
+++ b/arch/s390/mm/gup.c
@@ -128,6 +128,44 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
return 1;
}
+static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ struct page *head, *page;
+ unsigned long mask;
+ int refs;
+
+ mask = (write ? _REGION_ENTRY_PROTECT : 0) | _REGION_ENTRY_INVALID;
+ if ((pud_val(pud) & mask) != 0)
+ return 0;
+ VM_BUG_ON(!pfn_valid(pud_pfn(pud)));
+
+ refs = 0;
+ head = pud_page(pud);
+ page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ do {
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
+ pages[*nr] = page;
+ (*nr)++;
+ page++;
+ refs++;
+ } while (addr += PAGE_SIZE, addr != end);
+
+ if (!page_cache_add_speculative(head, refs)) {
+ *nr -= refs;
+ return 0;
+ }
+
+ if (unlikely(pud_val(pud) != pud_val(*pudp))) {
+ *nr -= refs;
+ while (refs--)
+ put_page(head);
+ return 0;
+ }
+
+ return 1;
+}
+
static inline int gup_pud_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
@@ -144,7 +182,12 @@ static inline int gup_pud_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
next = pud_addr_end(addr, end);
if (pud_none(pud))
return 0;
- if (!gup_pmd_range(pudp, pud, addr, next, write, pages, nr))
+ if (unlikely(pud_large(pud))) {
+ if (!gup_huge_pud(pudp, pud, addr, next, write, pages,
+ nr))
+ return 0;
+ } else if (!gup_pmd_range(pudp, pud, addr, next, write, pages,
+ nr))
return 0;
} while (pudp++, addr = next, addr != end);
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index 1b5e8983f..cd404aa39 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -1,19 +1,28 @@
/*
* IBM System z Huge TLB Page Support for Kernel.
*
- * Copyright IBM Corp. 2007
+ * Copyright IBM Corp. 2007,2016
* Author(s): Gerald Schaefer <gerald.schaefer@de.ibm.com>
*/
+#define KMSG_COMPONENT "hugetlb"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
#include <linux/mm.h>
#include <linux/hugetlb.h>
-static inline pmd_t __pte_to_pmd(pte_t pte)
+/*
+ * If the bit selected by single-bit bitmask "a" is set within "x", move
+ * it to the position indicated by single-bit bitmask "b".
+ */
+#define move_set_bit(x, a, b) (((x) & (a)) >> ilog2(a) << ilog2(b))
+
+static inline unsigned long __pte_to_rste(pte_t pte)
{
- pmd_t pmd;
+ unsigned long rste;
/*
- * Convert encoding pte bits pmd bits
+ * Convert encoding pte bits pmd / pud bits
* lIR.uswrdy.p dy..R...I...wr
* empty 010.000000.0 -> 00..0...1...00
* prot-none, clean, old 111.000000.1 -> 00..1...1...00
@@ -33,25 +42,40 @@ static inline pmd_t __pte_to_pmd(pte_t pte)
* u unused, l large
*/
if (pte_present(pte)) {
- pmd_val(pmd) = pte_val(pte) & PAGE_MASK;
- pmd_val(pmd) |= (pte_val(pte) & _PAGE_READ) >> 4;
- pmd_val(pmd) |= (pte_val(pte) & _PAGE_WRITE) >> 4;
- pmd_val(pmd) |= (pte_val(pte) & _PAGE_INVALID) >> 5;
- pmd_val(pmd) |= (pte_val(pte) & _PAGE_PROTECT);
- pmd_val(pmd) |= (pte_val(pte) & _PAGE_DIRTY) << 10;
- pmd_val(pmd) |= (pte_val(pte) & _PAGE_YOUNG) << 10;
- pmd_val(pmd) |= (pte_val(pte) & _PAGE_SOFT_DIRTY) << 13;
+ rste = pte_val(pte) & PAGE_MASK;
+ rste |= move_set_bit(pte_val(pte), _PAGE_READ,
+ _SEGMENT_ENTRY_READ);
+ rste |= move_set_bit(pte_val(pte), _PAGE_WRITE,
+ _SEGMENT_ENTRY_WRITE);
+ rste |= move_set_bit(pte_val(pte), _PAGE_INVALID,
+ _SEGMENT_ENTRY_INVALID);
+ rste |= move_set_bit(pte_val(pte), _PAGE_PROTECT,
+ _SEGMENT_ENTRY_PROTECT);
+ rste |= move_set_bit(pte_val(pte), _PAGE_DIRTY,
+ _SEGMENT_ENTRY_DIRTY);
+ rste |= move_set_bit(pte_val(pte), _PAGE_YOUNG,
+ _SEGMENT_ENTRY_YOUNG);
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ rste |= move_set_bit(pte_val(pte), _PAGE_SOFT_DIRTY,
+ _SEGMENT_ENTRY_SOFT_DIRTY);
+#endif
} else
- pmd_val(pmd) = _SEGMENT_ENTRY_INVALID;
- return pmd;
+ rste = _SEGMENT_ENTRY_INVALID;
+ return rste;
}
-static inline pte_t __pmd_to_pte(pmd_t pmd)
+static inline pte_t __rste_to_pte(unsigned long rste)
{
+ int present;
pte_t pte;
+ if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
+ present = pud_present(__pud(rste));
+ else
+ present = pmd_present(__pmd(rste));
+
/*
- * Convert encoding pmd bits pte bits
+ * Convert encoding pmd / pud bits pte bits
* dy..R...I...wr lIR.uswrdy.p
* empty 00..0...1...00 -> 010.000000.0
* prot-none, clean, old 00..1...1...00 -> 111.000000.1
@@ -70,16 +94,25 @@ static inline pte_t __pmd_to_pte(pmd_t pmd)
* SW-bits: p present, y young, d dirty, r read, w write, s special,
* u unused, l large
*/
- if (pmd_present(pmd)) {
- pte_val(pte) = pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN_LARGE;
+ if (present) {
+ pte_val(pte) = rste & _SEGMENT_ENTRY_ORIGIN_LARGE;
pte_val(pte) |= _PAGE_LARGE | _PAGE_PRESENT;
- pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_READ) << 4;
- pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_WRITE) << 4;
- pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_INVALID) << 5;
- pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_PROTECT);
- pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) >> 10;
- pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG) >> 10;
- pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_SOFT_DIRTY) >> 13;
+ pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_READ,
+ _PAGE_READ);
+ pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE,
+ _PAGE_WRITE);
+ pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID,
+ _PAGE_INVALID);
+ pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT,
+ _PAGE_PROTECT);
+ pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY,
+ _PAGE_DIRTY);
+ pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG,
+ _PAGE_YOUNG);
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY,
+ _PAGE_DIRTY);
+#endif
} else
pte_val(pte) = _PAGE_INVALID;
return pte;
@@ -88,27 +121,33 @@ static inline pte_t __pmd_to_pte(pmd_t pmd)
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
- pmd_t pmd = __pte_to_pmd(pte);
-
- pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE;
- *(pmd_t *) ptep = pmd;
+ unsigned long rste = __pte_to_rste(pte);
+
+ /* Set correct table type for 2G hugepages */
+ if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
+ rste |= _REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE;
+ else
+ rste |= _SEGMENT_ENTRY_LARGE;
+ pte_val(*ptep) = rste;
}
pte_t huge_ptep_get(pte_t *ptep)
{
- pmd_t pmd = *(pmd_t *) ptep;
-
- return __pmd_to_pte(pmd);
+ return __rste_to_pte(pte_val(*ptep));
}
pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
+ pte_t pte = huge_ptep_get(ptep);
pmd_t *pmdp = (pmd_t *) ptep;
- pmd_t old;
+ pud_t *pudp = (pud_t *) ptep;
- old = pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
- return __pmd_to_pte(old);
+ if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
+ pudp_xchg_direct(mm, addr, pudp, __pud(_REGION3_ENTRY_EMPTY));
+ else
+ pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
+ return pte;
}
pte_t *huge_pte_alloc(struct mm_struct *mm,
@@ -120,8 +159,12 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
pgdp = pgd_offset(mm, addr);
pudp = pud_alloc(mm, pgdp, addr);
- if (pudp)
- pmdp = pmd_alloc(mm, pudp, addr);
+ if (pudp) {
+ if (sz == PUD_SIZE)
+ return (pte_t *) pudp;
+ else if (sz == PMD_SIZE)
+ pmdp = pmd_alloc(mm, pudp, addr);
+ }
return (pte_t *) pmdp;
}
@@ -134,8 +177,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
pgdp = pgd_offset(mm, addr);
if (pgd_present(*pgdp)) {
pudp = pud_offset(pgdp, addr);
- if (pud_present(*pudp))
+ if (pud_present(*pudp)) {
+ if (pud_large(*pudp))
+ return (pte_t *) pudp;
pmdp = pmd_offset(pudp, addr);
+ }
}
return (pte_t *) pmdp;
}
@@ -147,5 +193,34 @@ int pmd_huge(pmd_t pmd)
int pud_huge(pud_t pud)
{
- return 0;
+ return pud_large(pud);
+}
+
+struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+ pud_t *pud, int flags)
+{
+ if (flags & FOLL_GET)
+ return NULL;
+
+ return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
+}
+
+static __init int setup_hugepagesz(char *opt)
+{
+ unsigned long size;
+ char *string = opt;
+
+ size = memparse(opt, &opt);
+ if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) {
+ hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
+ } else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) {
+ hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+ } else {
+ pr_err("hugepagesz= specifies an unsupported page size %s\n",
+ string);
+ return 0;
+ }
+ return 1;
}
+__setup("hugepagesz=", setup_hugepagesz);
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 2489b2e91..f56a39bd8 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -40,7 +40,7 @@
#include <asm/ctl_reg.h>
#include <asm/sclp.h>
-pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__((__aligned__(PAGE_SIZE)));
+pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(.bss..swapper_pg_dir);
unsigned long empty_zero_page, zero_page_mask;
EXPORT_SYMBOL(empty_zero_page);
@@ -111,17 +111,16 @@ void __init paging_init(void)
void mark_rodata_ro(void)
{
- /* Text and rodata are already protected. Nothing to do here. */
- pr_info("Write protecting the kernel read-only data: %luk\n",
- ((unsigned long)&_eshared - (unsigned long)&_stext) >> 10);
+ unsigned long size = __end_ro_after_init - __start_ro_after_init;
+
+ set_memory_ro((unsigned long)__start_ro_after_init, size >> PAGE_SHIFT);
+ pr_info("Write protected read-only-after-init data: %luk\n", size >> 10);
}
void __init mem_init(void)
{
- if (MACHINE_HAS_TLB_LC)
- cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask);
+ cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask);
cpumask_set_cpu(0, mm_cpumask(&init_mm));
- atomic_set(&init_mm.context.attach_count, 1);
set_max_mapnr(max_low_pfn);
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c
index a90d45e9d..3330ea124 100644
--- a/arch/s390/mm/page-states.c
+++ b/arch/s390/mm/page-states.c
@@ -34,20 +34,25 @@ static int __init cmma(char *str)
}
__setup("cmma=", cmma);
-void __init cmma_init(void)
+static inline int cmma_test_essa(void)
{
register unsigned long tmp asm("0") = 0;
register int rc asm("1") = -EOPNOTSUPP;
- if (!cmma_flag)
- return;
asm volatile(
" .insn rrf,0xb9ab0000,%1,%1,0,0\n"
"0: la %0,0\n"
"1:\n"
EX_TABLE(0b,1b)
: "+&d" (rc), "+&d" (tmp));
- if (rc)
+ return rc;
+}
+
+void __init cmma_init(void)
+{
+ if (!cmma_flag)
+ return;
+ if (cmma_test_essa())
cmma_flag = 0;
}
diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
index f2a5c29a9..af7cf28cf 100644
--- a/arch/s390/mm/pageattr.c
+++ b/arch/s390/mm/pageattr.c
@@ -10,7 +10,6 @@
#include <asm/pgtable.h>
#include <asm/page.h>
-#if PAGE_DEFAULT_KEY
static inline unsigned long sske_frame(unsigned long addr, unsigned char skey)
{
asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],9,0"
@@ -22,6 +21,8 @@ void __storage_key_init_range(unsigned long start, unsigned long end)
{
unsigned long boundary, size;
+ if (!PAGE_DEFAULT_KEY)
+ return;
while (start < end) {
if (MACHINE_HAS_EDAT1) {
/* set storage keys for a 1MB frame */
@@ -38,56 +39,256 @@ void __storage_key_init_range(unsigned long start, unsigned long end)
start += PAGE_SIZE;
}
}
-#endif
-static pte_t *walk_page_table(unsigned long addr)
+#ifdef CONFIG_PROC_FS
+atomic_long_t direct_pages_count[PG_DIRECT_MAP_MAX];
+
+void arch_report_meminfo(struct seq_file *m)
{
- pgd_t *pgdp;
- pud_t *pudp;
+ seq_printf(m, "DirectMap4k: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_4K]) << 2);
+ seq_printf(m, "DirectMap1M: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_1M]) << 10);
+ seq_printf(m, "DirectMap2G: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_2G]) << 21);
+}
+#endif /* CONFIG_PROC_FS */
+
+static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr,
+ unsigned long dtt)
+{
+ unsigned long table, mask;
+
+ mask = 0;
+ if (MACHINE_HAS_EDAT2) {
+ switch (dtt) {
+ case CRDTE_DTT_REGION3:
+ mask = ~(PTRS_PER_PUD * sizeof(pud_t) - 1);
+ break;
+ case CRDTE_DTT_SEGMENT:
+ mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
+ break;
+ case CRDTE_DTT_PAGE:
+ mask = ~(PTRS_PER_PTE * sizeof(pte_t) - 1);
+ break;
+ }
+ table = (unsigned long)old & mask;
+ crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce);
+ } else if (MACHINE_HAS_IDTE) {
+ cspg(old, *old, new);
+ } else {
+ csp((unsigned int *)old + 1, *old, new);
+ }
+}
+
+struct cpa {
+ unsigned int set_ro : 1;
+ unsigned int clear_ro : 1;
+};
+
+static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end,
+ struct cpa cpa)
+{
+ pte_t *ptep, new;
+
+ ptep = pte_offset(pmdp, addr);
+ do {
+ if (pte_none(*ptep))
+ return -EINVAL;
+ if (cpa.set_ro)
+ new = pte_wrprotect(*ptep);
+ else if (cpa.clear_ro)
+ new = pte_mkwrite(pte_mkdirty(*ptep));
+ pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE);
+ ptep++;
+ addr += PAGE_SIZE;
+ cond_resched();
+ } while (addr < end);
+ return 0;
+}
+
+static int split_pmd_page(pmd_t *pmdp, unsigned long addr)
+{
+ unsigned long pte_addr, prot;
+ pte_t *pt_dir, *ptep;
+ pmd_t new;
+ int i, ro;
+
+ pt_dir = vmem_pte_alloc();
+ if (!pt_dir)
+ return -ENOMEM;
+ pte_addr = pmd_pfn(*pmdp) << PAGE_SHIFT;
+ ro = !!(pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT);
+ prot = pgprot_val(ro ? PAGE_KERNEL_RO : PAGE_KERNEL);
+ ptep = pt_dir;
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte_val(*ptep) = pte_addr | prot;
+ pte_addr += PAGE_SIZE;
+ ptep++;
+ }
+ pmd_val(new) = __pa(pt_dir) | _SEGMENT_ENTRY;
+ pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT);
+ update_page_count(PG_DIRECT_MAP_4K, PTRS_PER_PTE);
+ update_page_count(PG_DIRECT_MAP_1M, -1);
+ return 0;
+}
+
+static void modify_pmd_page(pmd_t *pmdp, unsigned long addr, struct cpa cpa)
+{
+ pmd_t new;
+
+ if (cpa.set_ro)
+ new = pmd_wrprotect(*pmdp);
+ else if (cpa.clear_ro)
+ new = pmd_mkwrite(pmd_mkdirty(*pmdp));
+ pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT);
+}
+
+static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end,
+ struct cpa cpa)
+{
+ unsigned long next;
pmd_t *pmdp;
- pte_t *ptep;
+ int rc = 0;
- pgdp = pgd_offset_k(addr);
- if (pgd_none(*pgdp))
- return NULL;
- pudp = pud_offset(pgdp, addr);
- if (pud_none(*pudp) || pud_large(*pudp))
- return NULL;
pmdp = pmd_offset(pudp, addr);
- if (pmd_none(*pmdp) || pmd_large(*pmdp))
- return NULL;
- ptep = pte_offset_kernel(pmdp, addr);
- if (pte_none(*ptep))
- return NULL;
- return ptep;
+ do {
+ if (pmd_none(*pmdp))
+ return -EINVAL;
+ next = pmd_addr_end(addr, end);
+ if (pmd_large(*pmdp)) {
+ if (addr & ~PMD_MASK || addr + PMD_SIZE > next) {
+ rc = split_pmd_page(pmdp, addr);
+ if (rc)
+ return rc;
+ continue;
+ }
+ modify_pmd_page(pmdp, addr, cpa);
+ } else {
+ rc = walk_pte_level(pmdp, addr, next, cpa);
+ if (rc)
+ return rc;
+ }
+ pmdp++;
+ addr = next;
+ cond_resched();
+ } while (addr < end);
+ return rc;
}
-static void change_page_attr(unsigned long addr, int numpages,
- pte_t (*set) (pte_t))
+static int split_pud_page(pud_t *pudp, unsigned long addr)
{
- pte_t *ptep;
- int i;
+ unsigned long pmd_addr, prot;
+ pmd_t *pm_dir, *pmdp;
+ pud_t new;
+ int i, ro;
- for (i = 0; i < numpages; i++) {
- ptep = walk_page_table(addr);
- if (WARN_ON_ONCE(!ptep))
- break;
- *ptep = set(*ptep);
- addr += PAGE_SIZE;
+ pm_dir = vmem_pmd_alloc();
+ if (!pm_dir)
+ return -ENOMEM;
+ pmd_addr = pud_pfn(*pudp) << PAGE_SHIFT;
+ ro = !!(pud_val(*pudp) & _REGION_ENTRY_PROTECT);
+ prot = pgprot_val(ro ? SEGMENT_KERNEL_RO : SEGMENT_KERNEL);
+ pmdp = pm_dir;
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd_val(*pmdp) = pmd_addr | prot;
+ pmd_addr += PMD_SIZE;
+ pmdp++;
}
- __tlb_flush_kernel();
+ pud_val(new) = __pa(pm_dir) | _REGION3_ENTRY;
+ pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3);
+ update_page_count(PG_DIRECT_MAP_1M, PTRS_PER_PMD);
+ update_page_count(PG_DIRECT_MAP_2G, -1);
+ return 0;
+}
+
+static void modify_pud_page(pud_t *pudp, unsigned long addr, struct cpa cpa)
+{
+ pud_t new;
+
+ if (cpa.set_ro)
+ new = pud_wrprotect(*pudp);
+ else if (cpa.clear_ro)
+ new = pud_mkwrite(pud_mkdirty(*pudp));
+ pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3);
+}
+
+static int walk_pud_level(pgd_t *pgd, unsigned long addr, unsigned long end,
+ struct cpa cpa)
+{
+ unsigned long next;
+ pud_t *pudp;
+ int rc = 0;
+
+ pudp = pud_offset(pgd, addr);
+ do {
+ if (pud_none(*pudp))
+ return -EINVAL;
+ next = pud_addr_end(addr, end);
+ if (pud_large(*pudp)) {
+ if (addr & ~PUD_MASK || addr + PUD_SIZE > next) {
+ rc = split_pud_page(pudp, addr);
+ if (rc)
+ break;
+ continue;
+ }
+ modify_pud_page(pudp, addr, cpa);
+ } else {
+ rc = walk_pmd_level(pudp, addr, next, cpa);
+ }
+ pudp++;
+ addr = next;
+ cond_resched();
+ } while (addr < end && !rc);
+ return rc;
+}
+
+static DEFINE_MUTEX(cpa_mutex);
+
+static int change_page_attr(unsigned long addr, unsigned long end,
+ struct cpa cpa)
+{
+ unsigned long next;
+ int rc = -EINVAL;
+ pgd_t *pgdp;
+
+ if (addr == end)
+ return 0;
+ if (end >= MODULES_END)
+ return -EINVAL;
+ mutex_lock(&cpa_mutex);
+ pgdp = pgd_offset_k(addr);
+ do {
+ if (pgd_none(*pgdp))
+ break;
+ next = pgd_addr_end(addr, end);
+ rc = walk_pud_level(pgdp, addr, next, cpa);
+ if (rc)
+ break;
+ cond_resched();
+ } while (pgdp++, addr = next, addr < end && !rc);
+ mutex_unlock(&cpa_mutex);
+ return rc;
}
int set_memory_ro(unsigned long addr, int numpages)
{
- change_page_attr(addr, numpages, pte_wrprotect);
- return 0;
+ struct cpa cpa = {
+ .set_ro = 1,
+ };
+
+ addr &= PAGE_MASK;
+ return change_page_attr(addr, addr + numpages * PAGE_SIZE, cpa);
}
int set_memory_rw(unsigned long addr, int numpages)
{
- change_page_attr(addr, numpages, pte_mkwrite);
- return 0;
+ struct cpa cpa = {
+ .clear_ro = 1,
+ };
+
+ addr &= PAGE_MASK;
+ return change_page_attr(addr, addr + numpages * PAGE_SIZE, cpa);
}
/* not possible */
@@ -138,7 +339,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
nr = min(numpages - i, nr);
if (enable) {
for (j = 0; j < nr; j++) {
- pte_val(*pte) = __pa(address);
+ pte_val(*pte) = address | pgprot_val(PAGE_KERNEL);
address += PAGE_SIZE;
pte++;
}
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index e2565d2d0..995f78532 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -137,6 +137,29 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
return new;
}
+#ifdef CONFIG_PGSTE
+
+struct page *page_table_alloc_pgste(struct mm_struct *mm)
+{
+ struct page *page;
+ unsigned long *table;
+
+ page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
+ if (page) {
+ table = (unsigned long *) page_to_phys(page);
+ clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
+ clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
+ }
+ return page;
+}
+
+void page_table_free_pgste(struct page *page)
+{
+ __free_page(page);
+}
+
+#endif /* CONFIG_PGSTE */
+
/*
* page table entry allocation/free routines.
*/
@@ -149,7 +172,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
/* Try to get a fragment of a 4K page as a 2K page table */
if (!mm_alloc_pgste(mm)) {
table = NULL;
- spin_lock_bh(&mm->context.list_lock);
+ spin_lock_bh(&mm->context.pgtable_lock);
if (!list_empty(&mm->context.pgtable_list)) {
page = list_first_entry(&mm->context.pgtable_list,
struct page, lru);
@@ -164,7 +187,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
list_del(&page->lru);
}
}
- spin_unlock_bh(&mm->context.list_lock);
+ spin_unlock_bh(&mm->context.pgtable_lock);
if (table)
return table;
}
@@ -187,9 +210,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
/* Return the first 2K fragment of the page */
atomic_set(&page->_mapcount, 1);
clear_table(table, _PAGE_INVALID, PAGE_SIZE);
- spin_lock_bh(&mm->context.list_lock);
+ spin_lock_bh(&mm->context.pgtable_lock);
list_add(&page->lru, &mm->context.pgtable_list);
- spin_unlock_bh(&mm->context.list_lock);
+ spin_unlock_bh(&mm->context.pgtable_lock);
}
return table;
}
@@ -203,13 +226,13 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
if (!mm_alloc_pgste(mm)) {
/* Free 2K page table fragment of a 4K page */
bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
- spin_lock_bh(&mm->context.list_lock);
+ spin_lock_bh(&mm->context.pgtable_lock);
mask = atomic_xor_bits(&page->_mapcount, 1U << bit);
if (mask & 3)
list_add(&page->lru, &mm->context.pgtable_list);
else
list_del(&page->lru);
- spin_unlock_bh(&mm->context.list_lock);
+ spin_unlock_bh(&mm->context.pgtable_lock);
if (mask != 0)
return;
}
@@ -235,13 +258,13 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
return;
}
bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
- spin_lock_bh(&mm->context.list_lock);
+ spin_lock_bh(&mm->context.pgtable_lock);
mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit);
if (mask & 3)
list_add_tail(&page->lru, &mm->context.pgtable_list);
else
list_del(&page->lru);
- spin_unlock_bh(&mm->context.list_lock);
+ spin_unlock_bh(&mm->context.pgtable_lock);
table = (unsigned long *) (__pa(table) | (1U << bit));
tlb_remove_table(tlb, table);
}
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index ebb4f8711..5f092015a 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -27,40 +27,37 @@
static inline pte_t ptep_flush_direct(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
- int active, count;
pte_t old;
old = *ptep;
if (unlikely(pte_val(old) & _PAGE_INVALID))
return old;
- active = (mm == current->active_mm) ? 1 : 0;
- count = atomic_add_return(0x10000, &mm->context.attach_count);
- if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active &&
+ atomic_inc(&mm->context.flush_count);
+ if (MACHINE_HAS_TLB_LC &&
cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
__ptep_ipte_local(addr, ptep);
else
__ptep_ipte(addr, ptep);
- atomic_sub(0x10000, &mm->context.attach_count);
+ atomic_dec(&mm->context.flush_count);
return old;
}
static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
- int active, count;
pte_t old;
old = *ptep;
if (unlikely(pte_val(old) & _PAGE_INVALID))
return old;
- active = (mm == current->active_mm) ? 1 : 0;
- count = atomic_add_return(0x10000, &mm->context.attach_count);
- if ((count & 0xffff) <= active) {
+ atomic_inc(&mm->context.flush_count);
+ if (cpumask_equal(&mm->context.cpu_attach_mask,
+ cpumask_of(smp_processor_id()))) {
pte_val(*ptep) |= _PAGE_INVALID;
mm->context.flush_mm = 1;
} else
__ptep_ipte(addr, ptep);
- atomic_sub(0x10000, &mm->context.attach_count);
+ atomic_dec(&mm->context.flush_count);
return old;
}
@@ -70,7 +67,6 @@ static inline pgste_t pgste_get_lock(pte_t *ptep)
#ifdef CONFIG_PGSTE
unsigned long old;
- preempt_disable();
asm(
" lg %0,%2\n"
"0: lgr %1,%0\n"
@@ -93,7 +89,6 @@ static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste)
: "=Q" (ptep[PTRS_PER_PTE])
: "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE])
: "cc", "memory");
- preempt_enable();
#endif
}
@@ -179,14 +174,17 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
return pgste;
}
-static inline pgste_t pgste_ipte_notify(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep, pgste_t pgste)
+static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
- if (pgste_val(pgste) & PGSTE_IN_BIT) {
- pgste_val(pgste) &= ~PGSTE_IN_BIT;
- ptep_notify(mm, addr, ptep);
+ unsigned long bits;
+
+ bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
+ if (bits) {
+ pgste_val(pgste) ^= bits;
+ ptep_notify(mm, addr, ptep, bits);
}
#endif
return pgste;
@@ -199,7 +197,7 @@ static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
if (mm_has_pgste(mm)) {
pgste = pgste_get_lock(ptep);
- pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+ pgste = pgste_pte_notify(mm, addr, ptep, pgste);
}
return pgste;
}
@@ -230,9 +228,11 @@ pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr,
pgste_t pgste;
pte_t old;
+ preempt_disable();
pgste = ptep_xchg_start(mm, addr, ptep);
old = ptep_flush_direct(mm, addr, ptep);
ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
+ preempt_enable();
return old;
}
EXPORT_SYMBOL(ptep_xchg_direct);
@@ -243,9 +243,11 @@ pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr,
pgste_t pgste;
pte_t old;
+ preempt_disable();
pgste = ptep_xchg_start(mm, addr, ptep);
old = ptep_flush_lazy(mm, addr, ptep);
ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
+ preempt_enable();
return old;
}
EXPORT_SYMBOL(ptep_xchg_lazy);
@@ -256,6 +258,7 @@ pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
pgste_t pgste;
pte_t old;
+ preempt_disable();
pgste = ptep_xchg_start(mm, addr, ptep);
old = ptep_flush_lazy(mm, addr, ptep);
if (mm_has_pgste(mm)) {
@@ -279,13 +282,13 @@ void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
} else {
*ptep = pte;
}
+ preempt_enable();
}
EXPORT_SYMBOL(ptep_modify_prot_commit);
static inline pmd_t pmdp_flush_direct(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp)
{
- int active, count;
pmd_t old;
old = *pmdp;
@@ -295,36 +298,34 @@ static inline pmd_t pmdp_flush_direct(struct mm_struct *mm,
__pmdp_csp(pmdp);
return old;
}
- active = (mm == current->active_mm) ? 1 : 0;
- count = atomic_add_return(0x10000, &mm->context.attach_count);
- if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active &&
+ atomic_inc(&mm->context.flush_count);
+ if (MACHINE_HAS_TLB_LC &&
cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
__pmdp_idte_local(addr, pmdp);
else
__pmdp_idte(addr, pmdp);
- atomic_sub(0x10000, &mm->context.attach_count);
+ atomic_dec(&mm->context.flush_count);
return old;
}
static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp)
{
- int active, count;
pmd_t old;
old = *pmdp;
if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
return old;
- active = (mm == current->active_mm) ? 1 : 0;
- count = atomic_add_return(0x10000, &mm->context.attach_count);
- if ((count & 0xffff) <= active) {
+ atomic_inc(&mm->context.flush_count);
+ if (cpumask_equal(&mm->context.cpu_attach_mask,
+ cpumask_of(smp_processor_id()))) {
pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID;
mm->context.flush_mm = 1;
} else if (MACHINE_HAS_IDTE)
__pmdp_idte(addr, pmdp);
else
__pmdp_csp(pmdp);
- atomic_sub(0x10000, &mm->context.attach_count);
+ atomic_dec(&mm->context.flush_count);
return old;
}
@@ -333,8 +334,10 @@ pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
{
pmd_t old;
+ preempt_disable();
old = pmdp_flush_direct(mm, addr, pmdp);
*pmdp = new;
+ preempt_enable();
return old;
}
EXPORT_SYMBOL(pmdp_xchg_direct);
@@ -344,12 +347,53 @@ pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr,
{
pmd_t old;
+ preempt_disable();
old = pmdp_flush_lazy(mm, addr, pmdp);
*pmdp = new;
+ preempt_enable();
return old;
}
EXPORT_SYMBOL(pmdp_xchg_lazy);
+static inline pud_t pudp_flush_direct(struct mm_struct *mm,
+ unsigned long addr, pud_t *pudp)
+{
+ pud_t old;
+
+ old = *pudp;
+ if (pud_val(old) & _REGION_ENTRY_INVALID)
+ return old;
+ if (!MACHINE_HAS_IDTE) {
+ /*
+ * Invalid bit position is the same for pmd and pud, so we can
+ * re-use _pmd_csp() here
+ */
+ __pmdp_csp((pmd_t *) pudp);
+ return old;
+ }
+ atomic_inc(&mm->context.flush_count);
+ if (MACHINE_HAS_TLB_LC &&
+ cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
+ __pudp_idte_local(addr, pudp);
+ else
+ __pudp_idte(addr, pudp);
+ atomic_dec(&mm->context.flush_count);
+ return old;
+}
+
+pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, pud_t new)
+{
+ pud_t old;
+
+ preempt_disable();
+ old = pudp_flush_direct(mm, addr, pudp);
+ *pudp = new;
+ preempt_enable();
+ return old;
+}
+EXPORT_SYMBOL(pudp_xchg_direct);
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
@@ -398,20 +442,108 @@ void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
pgste_t pgste;
/* the mm_has_pgste() check is done in set_pte_at() */
+ preempt_disable();
pgste = pgste_get_lock(ptep);
pgste_val(pgste) &= ~_PGSTE_GPS_ZERO;
pgste_set_key(ptep, pgste, entry, mm);
pgste = pgste_set_pte(ptep, pgste, entry);
pgste_set_unlock(ptep, pgste);
+ preempt_enable();
}
void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
pgste_t pgste;
+ preempt_disable();
pgste = pgste_get_lock(ptep);
pgste_val(pgste) |= PGSTE_IN_BIT;
pgste_set_unlock(ptep, pgste);
+ preempt_enable();
+}
+
+/**
+ * ptep_force_prot - change access rights of a locked pte
+ * @mm: pointer to the process mm_struct
+ * @addr: virtual address in the guest address space
+ * @ptep: pointer to the page table entry
+ * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bit: pgste bit to set (e.g. for notification)
+ *
+ * Returns 0 if the access rights were changed and -EAGAIN if the current
+ * and requested access rights are incompatible.
+ */
+int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, int prot, unsigned long bit)
+{
+ pte_t entry;
+ pgste_t pgste;
+ int pte_i, pte_p;
+
+ pgste = pgste_get_lock(ptep);
+ entry = *ptep;
+ /* Check pte entry after all locks have been acquired */
+ pte_i = pte_val(entry) & _PAGE_INVALID;
+ pte_p = pte_val(entry) & _PAGE_PROTECT;
+ if ((pte_i && (prot != PROT_NONE)) ||
+ (pte_p && (prot & PROT_WRITE))) {
+ pgste_set_unlock(ptep, pgste);
+ return -EAGAIN;
+ }
+ /* Change access rights and set pgste bit */
+ if (prot == PROT_NONE && !pte_i) {
+ ptep_flush_direct(mm, addr, ptep);
+ pgste = pgste_update_all(entry, pgste, mm);
+ pte_val(entry) |= _PAGE_INVALID;
+ }
+ if (prot == PROT_READ && !pte_p) {
+ ptep_flush_direct(mm, addr, ptep);
+ pte_val(entry) &= ~_PAGE_INVALID;
+ pte_val(entry) |= _PAGE_PROTECT;
+ }
+ pgste_val(pgste) |= bit;
+ pgste = pgste_set_pte(ptep, pgste, entry);
+ pgste_set_unlock(ptep, pgste);
+ return 0;
+}
+
+int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
+ pte_t *sptep, pte_t *tptep, pte_t pte)
+{
+ pgste_t spgste, tpgste;
+ pte_t spte, tpte;
+ int rc = -EAGAIN;
+
+ if (!(pte_val(*tptep) & _PAGE_INVALID))
+ return 0; /* already shadowed */
+ spgste = pgste_get_lock(sptep);
+ spte = *sptep;
+ if (!(pte_val(spte) & _PAGE_INVALID) &&
+ !((pte_val(spte) & _PAGE_PROTECT) &&
+ !(pte_val(pte) & _PAGE_PROTECT))) {
+ pgste_val(spgste) |= PGSTE_VSIE_BIT;
+ tpgste = pgste_get_lock(tptep);
+ pte_val(tpte) = (pte_val(spte) & PAGE_MASK) |
+ (pte_val(pte) & _PAGE_PROTECT);
+ /* don't touch the storage key - it belongs to parent pgste */
+ tpgste = pgste_set_pte(tptep, tpgste, tpte);
+ pgste_set_unlock(tptep, tpgste);
+ rc = 1;
+ }
+ pgste_set_unlock(sptep, spgste);
+ return rc;
+}
+
+void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
+{
+ pgste_t pgste;
+
+ pgste = pgste_get_lock(ptep);
+ /* notifier is called by the caller */
+ ptep_flush_direct(mm, saddr, ptep);
+ /* don't touch the storage key - it belongs to parent pgste */
+ pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
+ pgste_set_unlock(ptep, pgste);
}
static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
@@ -434,6 +566,7 @@ void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
pte_t pte;
/* Zap unused and logically-zero pages */
+ preempt_disable();
pgste = pgste_get_lock(ptep);
pgstev = pgste_val(pgste);
pte = *ptep;
@@ -446,6 +579,7 @@ void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
if (reset)
pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
pgste_set_unlock(ptep, pgste);
+ preempt_enable();
}
void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -454,6 +588,7 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
pgste_t pgste;
/* Clear storage key */
+ preempt_disable();
pgste = pgste_get_lock(ptep);
pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
PGSTE_GR_BIT | PGSTE_GC_BIT);
@@ -461,6 +596,7 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
pgste_set_unlock(ptep, pgste);
+ preempt_enable();
}
/*
@@ -483,7 +619,7 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr)
pgste_val(pgste) &= ~PGSTE_UC_BIT;
pte = *ptep;
if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
- pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+ pgste = pgste_pte_notify(mm, addr, ptep, pgste);
__ptep_ipte(addr, ptep);
if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
pte_val(pte) |= _PAGE_PROTECT;
@@ -506,12 +642,9 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
pgste_t old, new;
pte_t *ptep;
- down_read(&mm->mmap_sem);
ptep = get_locked_pte(mm, addr, &ptl);
- if (unlikely(!ptep)) {
- up_read(&mm->mmap_sem);
+ if (unlikely(!ptep))
return -EFAULT;
- }
new = old = pgste_get_lock(ptep);
pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
@@ -538,45 +671,100 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
pgste_set_unlock(ptep, new);
pte_unmap_unlock(ptep, ptl);
- up_read(&mm->mmap_sem);
return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);
-unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
+/**
+ * Conditionally set a guest storage key (handling csske).
+ * oldkey will be updated when either mr or mc is set and a pointer is given.
+ *
+ * Returns 0 if a guests storage key update wasn't necessary, 1 if the guest
+ * storage key was updated and -EFAULT on access errors.
+ */
+int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+ unsigned char key, unsigned char *oldkey,
+ bool nq, bool mr, bool mc)
+{
+ unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT;
+ int rc;
+
+ /* we can drop the pgste lock between getting and setting the key */
+ if (mr | mc) {
+ rc = get_guest_storage_key(current->mm, addr, &tmp);
+ if (rc)
+ return rc;
+ if (oldkey)
+ *oldkey = tmp;
+ if (!mr)
+ mask |= _PAGE_REFERENCED;
+ if (!mc)
+ mask |= _PAGE_CHANGED;
+ if (!((tmp ^ key) & mask))
+ return 0;
+ }
+ rc = set_guest_storage_key(current->mm, addr, key, nq);
+ return rc < 0 ? rc : 1;
+}
+EXPORT_SYMBOL(cond_set_guest_storage_key);
+
+/**
+ * Reset a guest reference bit (rrbe), returning the reference and changed bit.
+ *
+ * Returns < 0 in case of error, otherwise the cc to be reported to the guest.
+ */
+int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
{
- unsigned char key;
spinlock_t *ptl;
- pgste_t pgste;
+ pgste_t old, new;
pte_t *ptep;
+ int cc = 0;
- down_read(&mm->mmap_sem);
ptep = get_locked_pte(mm, addr, &ptl);
- if (unlikely(!ptep)) {
- up_read(&mm->mmap_sem);
+ if (unlikely(!ptep))
return -EFAULT;
- }
- pgste = pgste_get_lock(ptep);
- if (pte_val(*ptep) & _PAGE_INVALID) {
- key = (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
- key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56;
- key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48;
- key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48;
- } else {
- key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
+ new = old = pgste_get_lock(ptep);
+ /* Reset guest reference bit only */
+ pgste_val(new) &= ~PGSTE_GR_BIT;
- /* Reflect guest's logical view, not physical */
- if (pgste_val(pgste) & PGSTE_GR_BIT)
- key |= _PAGE_REFERENCED;
- if (pgste_val(pgste) & PGSTE_GC_BIT)
- key |= _PAGE_CHANGED;
+ if (!(pte_val(*ptep) & _PAGE_INVALID)) {
+ cc = page_reset_referenced(pte_val(*ptep) & PAGE_MASK);
+ /* Merge real referenced bit into host-set */
+ pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT;
}
+ /* Reflect guest's logical view, not physical */
+ cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49;
+ /* Changing the guest storage key is considered a change of the page */
+ if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT)
+ pgste_val(new) |= PGSTE_UC_BIT;
+
+ pgste_set_unlock(ptep, new);
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+}
+EXPORT_SYMBOL(reset_guest_reference_bit);
+
+int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+ unsigned char *key)
+{
+ spinlock_t *ptl;
+ pgste_t pgste;
+ pte_t *ptep;
+ ptep = get_locked_pte(mm, addr, &ptl);
+ if (unlikely(!ptep))
+ return -EFAULT;
+
+ pgste = pgste_get_lock(ptep);
+ *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
+ if (!(pte_val(*ptep) & _PAGE_INVALID))
+ *key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
+ /* Reflect guest's logical view, not physical */
+ *key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
pgste_set_unlock(ptep, pgste);
pte_unmap_unlock(ptep, ptl);
- up_read(&mm->mmap_sem);
- return key;
+ return 0;
}
EXPORT_SYMBOL(get_guest_storage_key);
#endif
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index d48cf25cf..184829276 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -11,6 +11,7 @@
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/memblock.h>
+#include <asm/cacheflush.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/setup.h>
@@ -29,9 +30,11 @@ static LIST_HEAD(mem_segs);
static void __ref *vmem_alloc_pages(unsigned int order)
{
+ unsigned long size = PAGE_SIZE << order;
+
if (slab_is_available())
return (void *)__get_free_pages(GFP_KERNEL, order);
- return alloc_bootmem_pages((1 << order) * PAGE_SIZE);
+ return alloc_bootmem_align(size, size);
}
static inline pud_t *vmem_pud_alloc(void)
@@ -45,7 +48,7 @@ static inline pud_t *vmem_pud_alloc(void)
return pud;
}
-static inline pmd_t *vmem_pmd_alloc(void)
+pmd_t *vmem_pmd_alloc(void)
{
pmd_t *pmd = NULL;
@@ -56,7 +59,7 @@ static inline pmd_t *vmem_pmd_alloc(void)
return pmd;
}
-static pte_t __ref *vmem_pte_alloc(void)
+pte_t __ref *vmem_pte_alloc(void)
{
pte_t *pte;
@@ -75,8 +78,9 @@ static pte_t __ref *vmem_pte_alloc(void)
/*
* Add a physical memory range to the 1:1 mapping.
*/
-static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
+static int vmem_add_mem(unsigned long start, unsigned long size)
{
+ unsigned long pages4k, pages1m, pages2g;
unsigned long end = start + size;
unsigned long address = start;
pgd_t *pg_dir;
@@ -85,6 +89,7 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
pte_t *pt_dir;
int ret = -ENOMEM;
+ pages4k = pages1m = pages2g = 0;
while (address < end) {
pg_dir = pgd_offset_k(address);
if (pgd_none(*pg_dir)) {
@@ -97,10 +102,9 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address &&
!(address & ~PUD_MASK) && (address + PUD_SIZE <= end) &&
!debug_pagealloc_enabled()) {
- pud_val(*pu_dir) = __pa(address) |
- _REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE |
- (ro ? _REGION_ENTRY_PROTECT : 0);
+ pud_val(*pu_dir) = address | pgprot_val(REGION3_KERNEL);
address += PUD_SIZE;
+ pages2g++;
continue;
}
if (pud_none(*pu_dir)) {
@@ -113,11 +117,9 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address &&
!(address & ~PMD_MASK) && (address + PMD_SIZE <= end) &&
!debug_pagealloc_enabled()) {
- pmd_val(*pm_dir) = __pa(address) |
- _SEGMENT_ENTRY | _SEGMENT_ENTRY_LARGE |
- _SEGMENT_ENTRY_YOUNG |
- (ro ? _SEGMENT_ENTRY_PROTECT : 0);
+ pmd_val(*pm_dir) = address | pgprot_val(SEGMENT_KERNEL);
address += PMD_SIZE;
+ pages1m++;
continue;
}
if (pmd_none(*pm_dir)) {
@@ -128,12 +130,15 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
}
pt_dir = pte_offset_kernel(pm_dir, address);
- pte_val(*pt_dir) = __pa(address) |
- pgprot_val(ro ? PAGE_KERNEL_RO : PAGE_KERNEL);
+ pte_val(*pt_dir) = address | pgprot_val(PAGE_KERNEL);
address += PAGE_SIZE;
+ pages4k++;
}
ret = 0;
out:
+ update_page_count(PG_DIRECT_MAP_4K, pages4k);
+ update_page_count(PG_DIRECT_MAP_1M, pages1m);
+ update_page_count(PG_DIRECT_MAP_2G, pages2g);
return ret;
}
@@ -143,15 +148,15 @@ out:
*/
static void vmem_remove_range(unsigned long start, unsigned long size)
{
+ unsigned long pages4k, pages1m, pages2g;
unsigned long end = start + size;
unsigned long address = start;
pgd_t *pg_dir;
pud_t *pu_dir;
pmd_t *pm_dir;
pte_t *pt_dir;
- pte_t pte;
- pte_val(pte) = _PAGE_INVALID;
+ pages4k = pages1m = pages2g = 0;
while (address < end) {
pg_dir = pgd_offset_k(address);
if (pgd_none(*pg_dir)) {
@@ -166,6 +171,7 @@ static void vmem_remove_range(unsigned long start, unsigned long size)
if (pud_large(*pu_dir)) {
pud_clear(pu_dir);
address += PUD_SIZE;
+ pages2g++;
continue;
}
pm_dir = pmd_offset(pu_dir, address);
@@ -176,13 +182,18 @@ static void vmem_remove_range(unsigned long start, unsigned long size)
if (pmd_large(*pm_dir)) {
pmd_clear(pm_dir);
address += PMD_SIZE;
+ pages1m++;
continue;
}
pt_dir = pte_offset_kernel(pm_dir, address);
- *pt_dir = pte;
+ pte_clear(&init_mm, address, pt_dir);
address += PAGE_SIZE;
+ pages4k++;
}
flush_tlb_kernel_range(start, end);
+ update_page_count(PG_DIRECT_MAP_4K, -pages4k);
+ update_page_count(PG_DIRECT_MAP_1M, -pages1m);
+ update_page_count(PG_DIRECT_MAP_2G, -pages2g);
}
/*
@@ -341,7 +352,7 @@ int vmem_add_mapping(unsigned long start, unsigned long size)
if (ret)
goto out_free;
- ret = vmem_add_mem(start, size, 0);
+ ret = vmem_add_mem(start, size);
if (ret)
goto out_remove;
goto out;
@@ -362,31 +373,13 @@ out:
*/
void __init vmem_map_init(void)
{
- unsigned long ro_start, ro_end;
+ unsigned long size = _eshared - _stext;
struct memblock_region *reg;
- phys_addr_t start, end;
- ro_start = PFN_ALIGN((unsigned long)&_stext);
- ro_end = (unsigned long)&_eshared & PAGE_MASK;
- for_each_memblock(memory, reg) {
- start = reg->base;
- end = reg->base + reg->size;
- if (start >= ro_end || end <= ro_start)
- vmem_add_mem(start, end - start, 0);
- else if (start >= ro_start && end <= ro_end)
- vmem_add_mem(start, end - start, 1);
- else if (start >= ro_start) {
- vmem_add_mem(start, ro_end - start, 1);
- vmem_add_mem(ro_end, end - ro_end, 0);
- } else if (end < ro_end) {
- vmem_add_mem(start, ro_start - start, 0);
- vmem_add_mem(ro_start, end - ro_start, 1);
- } else {
- vmem_add_mem(start, ro_start - start, 0);
- vmem_add_mem(ro_start, ro_end - ro_start, 1);
- vmem_add_mem(ro_end, end - ro_end, 0);
- }
- }
+ for_each_memblock(memory, reg)
+ vmem_add_mem(reg->base, reg->size);
+ set_memory_ro((unsigned long)_stext, size >> PAGE_SHIFT);
+ pr_info("Write protected kernel read-only data: %luk\n", size >> 10);
}
/*