Diffstat (limited to 'arch/arm/kvm')
-rw-r--r-- | arch/arm/kvm/Kconfig   |   7
-rw-r--r-- | arch/arm/kvm/Makefile  |  11
-rw-r--r-- | arch/arm/kvm/arm.c     | 157
-rw-r--r-- | arch/arm/kvm/mmio.c    |  24
-rw-r--r-- | arch/arm/kvm/mmu.c     | 396
5 files changed, 339 insertions, 256 deletions
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
index 95a000515..02abfff68 100644
--- a/arch/arm/kvm/Kconfig
+++ b/arch/arm/kvm/Kconfig
@@ -46,6 +46,13 @@ config KVM_ARM_HOST
 	---help---
 	  Provides host support for ARM processors.
 
+config KVM_NEW_VGIC
+	bool "New VGIC implementation"
+	depends on KVM
+	default y
+	---help---
+	  uses the new VGIC implementation
+
 source drivers/vhost/Kconfig
 
 endif # VIRTUALIZATION
diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
index eb1bf4309..a596b58f6 100644
--- a/arch/arm/kvm/Makefile
+++ b/arch/arm/kvm/Makefile
@@ -21,7 +21,18 @@ obj-$(CONFIG_KVM_ARM_HOST) += hyp/
 obj-y += kvm-arm.o init.o interrupts.o
 obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
 obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o
+
+ifeq ($(CONFIG_KVM_NEW_VGIC),y)
+obj-y += $(KVM)/arm/vgic/vgic.o
+obj-y += $(KVM)/arm/vgic/vgic-init.o
+obj-y += $(KVM)/arm/vgic/vgic-irqfd.o
+obj-y += $(KVM)/arm/vgic/vgic-v2.o
+obj-y += $(KVM)/arm/vgic/vgic-mmio.o
+obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o
+obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o
+else
 obj-y += $(KVM)/arm/vgic.o
 obj-y += $(KVM)/arm/vgic-v2.o
 obj-y += $(KVM)/arm/vgic-v2-emul.o
+endif
 obj-y += $(KVM)/arm/arch_timer.o
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 72b11d91e..f1bde7c4e 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -16,7 +16,6 @@
  * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  */
 
-#include <linux/cpu.h>
 #include <linux/cpu_pm.h>
 #include <linux/errno.h>
 #include <linux/err.h>
@@ -66,6 +65,8 @@ static DEFINE_SPINLOCK(kvm_vmid_lock);
 
 static bool vgic_present;
 
+static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
+
 static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu)
 {
 	BUG_ON(preemptible());
@@ -90,11 +91,6 @@ struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
 	return &kvm_arm_running_vcpu;
 }
 
-int kvm_arch_hardware_enable(void)
-{
-	return 0;
-}
-
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
 {
 	return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
@@ -449,7 +445,7 @@ static void update_vttbr(struct kvm *kvm)
 	kvm_next_vmid &= (1 << kvm_vmid_bits) - 1;
 
 	/* update vttbr to be used with the new vmid */
-	pgd_phys = virt_to_phys(kvm_get_hwpgd(kvm));
+	pgd_phys = virt_to_phys(kvm->arch.pgd);
 	BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK);
 	vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits);
 	kvm->arch.vttbr = pgd_phys | vmid;
@@ -460,7 +456,7 @@
 static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
 {
 	struct kvm *kvm = vcpu->kvm;
-	int ret;
+	int ret = 0;
 
 	if (likely(vcpu->arch.has_run_once))
 		return 0;
@@ -483,9 +479,9 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
 	 * interrupts from the virtual timer with a userspace gic.
 	 */
 	if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
-		kvm_timer_enable(kvm);
+		ret = kvm_timer_enable(vcpu);
 
-	return 0;
+	return ret;
 }
 
 bool kvm_arch_intc_initialized(struct kvm *kvm)
@@ -493,30 +489,37 @@ bool kvm_arch_intc_initialized(struct kvm *kvm)
 	return vgic_initialized(kvm);
 }
 
-static void kvm_arm_halt_guest(struct kvm *kvm) __maybe_unused;
-static void kvm_arm_resume_guest(struct kvm *kvm) __maybe_unused;
-
-static void kvm_arm_halt_guest(struct kvm *kvm)
+void kvm_arm_halt_guest(struct kvm *kvm)
 {
 	int i;
 	struct kvm_vcpu *vcpu;
 
 	kvm_for_each_vcpu(i, vcpu, kvm)
 		vcpu->arch.pause = true;
-	force_vm_exit(cpu_all_mask);
+	kvm_make_all_cpus_request(kvm, KVM_REQ_VCPU_EXIT);
+}
+
+void kvm_arm_halt_vcpu(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.pause = true;
+	kvm_vcpu_kick(vcpu);
 }
 
-static void kvm_arm_resume_guest(struct kvm *kvm)
+void kvm_arm_resume_vcpu(struct kvm_vcpu *vcpu)
+{
+	struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
+
+	vcpu->arch.pause = false;
+	swake_up(wq);
+}
+
+void kvm_arm_resume_guest(struct kvm *kvm)
 {
 	int i;
 	struct kvm_vcpu *vcpu;
 
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
-
-		vcpu->arch.pause = false;
-		swake_up(wq);
-	}
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		kvm_arm_resume_vcpu(vcpu);
 }
 
 static void vcpu_sleep(struct kvm_vcpu *vcpu)
@@ -1034,11 +1037,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	}
 }
 
-static void cpu_init_stage2(void *dummy)
-{
-	__cpu_init_stage2();
-}
-
 static void cpu_init_hyp_mode(void *dummy)
 {
 	phys_addr_t boot_pgd_ptr;
@@ -1066,43 +1064,87 @@
 static void cpu_hyp_reinit(void)
 {
 	if (is_kernel_in_hyp_mode()) {
 		/*
-		 * cpu_init_stage2() is safe to call even if the PM
+		 * __cpu_init_stage2() is safe to call even if the PM
 		 * event was cancelled before the CPU was reset.
 		 */
-		cpu_init_stage2(NULL);
+		__cpu_init_stage2();
 	} else {
 		if (__hyp_get_vectors() == hyp_default_vectors)
 			cpu_init_hyp_mode(NULL);
 	}
 }
 
-static int hyp_init_cpu_notify(struct notifier_block *self,
-			       unsigned long action, void *cpu)
+static void cpu_hyp_reset(void)
 {
-	switch (action) {
-	case CPU_STARTING:
-	case CPU_STARTING_FROZEN:
+	phys_addr_t boot_pgd_ptr;
+	phys_addr_t phys_idmap_start;
+
+	if (!is_kernel_in_hyp_mode()) {
+		boot_pgd_ptr = kvm_mmu_get_boot_httbr();
+		phys_idmap_start = kvm_get_idmap_start();
+
+		__cpu_reset_hyp_mode(boot_pgd_ptr, phys_idmap_start);
+	}
+}
+
+static void _kvm_arch_hardware_enable(void *discard)
+{
+	if (!__this_cpu_read(kvm_arm_hardware_enabled)) {
 		cpu_hyp_reinit();
+		__this_cpu_write(kvm_arm_hardware_enabled, 1);
 	}
+}
+
+int kvm_arch_hardware_enable(void)
+{
+	_kvm_arch_hardware_enable(NULL);
+	return 0;
+}
 
-	return NOTIFY_OK;
+static void _kvm_arch_hardware_disable(void *discard)
+{
+	if (__this_cpu_read(kvm_arm_hardware_enabled)) {
+		cpu_hyp_reset();
+		__this_cpu_write(kvm_arm_hardware_enabled, 0);
+	}
 }
 
-static struct notifier_block hyp_init_cpu_nb = {
-	.notifier_call = hyp_init_cpu_notify,
-};
+void kvm_arch_hardware_disable(void)
+{
+	_kvm_arch_hardware_disable(NULL);
+}
 
 #ifdef CONFIG_CPU_PM
 static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
 				    unsigned long cmd,
 				    void *v)
 {
-	if (cmd == CPU_PM_EXIT) {
-		cpu_hyp_reinit();
+	/*
+	 * kvm_arm_hardware_enabled is left with its old value over
+	 * PM_ENTER->PM_EXIT. It is used to indicate PM_EXIT should
+	 * re-enable hyp.
+	 */
+	switch (cmd) {
+	case CPU_PM_ENTER:
+		if (__this_cpu_read(kvm_arm_hardware_enabled))
+			/*
+			 * don't update kvm_arm_hardware_enabled here
+			 * so that the hardware will be re-enabled
+			 * when we resume. See below.
+			 */
+			cpu_hyp_reset();
+
+		return NOTIFY_OK;
-	}
+	case CPU_PM_EXIT:
+		if (__this_cpu_read(kvm_arm_hardware_enabled))
+			/* The hardware was enabled before suspend. */
+			cpu_hyp_reinit();
 
-	return NOTIFY_DONE;
+		return NOTIFY_OK;
+
+	default:
+		return NOTIFY_DONE;
+	}
 }
 
 static struct notifier_block hyp_init_cpu_pm_nb = {
@@ -1144,16 +1186,12 @@ static int init_common_resources(void)
 
 static int init_subsystems(void)
 {
-	int err;
+	int err = 0;
 
 	/*
-	 * Register CPU Hotplug notifier
+	 * Enable hardware so that subsystem initialisation can access EL2.
 	 */
-	err = register_cpu_notifier(&hyp_init_cpu_nb);
-	if (err) {
-		kvm_err("Cannot register KVM init CPU notifier (%d)\n", err);
-		return err;
-	}
+	on_each_cpu(_kvm_arch_hardware_enable, NULL, 1);
 
 	/*
 	 * Register CPU lower-power notifier
@@ -1171,9 +1209,10 @@ static int init_subsystems(void)
 	case -ENODEV:
 	case -ENXIO:
 		vgic_present = false;
+		err = 0;
 		break;
 	default:
-		return err;
+		goto out;
 	}
 
 	/*
@@ -1181,12 +1220,15 @@ static int init_subsystems(void)
 	 */
 	err = kvm_timer_hyp_init();
 	if (err)
-		return err;
+		goto out;
 
 	kvm_perf_init();
 	kvm_coproc_table_init();
 
-	return 0;
+out:
+	on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
+
+	return err;
 }
 
@@ -1199,17 +1241,11 @@ static void teardown_hyp_mode(void)
 	free_hyp_pgds();
 	for_each_possible_cpu(cpu)
 		free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
-	unregister_cpu_notifier(&hyp_init_cpu_nb);
 	hyp_cpu_pm_exit();
 }
 
 static int init_vhe_mode(void)
 {
-	/*
-	 * Execute the init code on each CPU.
-	 */
-	on_each_cpu(cpu_init_stage2, NULL, 1);
-
 	/* set size of VMID supported by CPU */
 	kvm_vmid_bits = kvm_get_vmid_bits();
 	kvm_info("%d-bit VMID\n", kvm_vmid_bits);
@@ -1296,11 +1332,6 @@ static int init_hyp_mode(void)
 		}
 	}
 
-	/*
-	 * Execute the init code on each CPU.
-	 */
-	on_each_cpu(cpu_init_hyp_mode, NULL, 1);
-
 #ifndef CONFIG_HOTPLUG_CPU
 	free_boot_hyp_pgd();
 #endif
diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c
index 0f6600f05..10f80a6c7 100644
--- a/arch/arm/kvm/mmio.c
+++ b/arch/arm/kvm/mmio.c
@@ -23,7 +23,7 @@
 
 #include "trace.h"
 
-static void mmio_write_buf(char *buf, unsigned int len, unsigned long data)
+void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data)
 {
 	void *datap = NULL;
 	union {
@@ -55,7 +55,7 @@ static void mmio_write_buf(char *buf, unsigned int len, unsigned long data)
 	memcpy(buf, datap, len);
 }
 
-static unsigned long mmio_read_buf(char *buf, unsigned int len)
+unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len)
 {
 	unsigned long data = 0;
 	union {
@@ -66,7 +66,7 @@ static unsigned long mmio_read_buf(char *buf, unsigned int len)
 
 	switch (len) {
 	case 1:
-		data = buf[0];
+		data = *(u8 *)buf;
 		break;
 	case 2:
 		memcpy(&tmp.hword, buf, len);
@@ -87,11 +87,10 @@ static unsigned long mmio_read_buf(char *buf, unsigned int len)
 
 /**
  * kvm_handle_mmio_return -- Handle MMIO loads after user space emulation
+ *			     or in-kernel IO emulation
+ *
  * @vcpu: The VCPU pointer
  * @run:  The VCPU run struct containing the mmio data
- *
- * This should only be called after returning from userspace for MMIO load
- * emulation.
  */
 int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
 {
@@ -104,7 +103,7 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		if (len > sizeof(unsigned long))
 			return -EINVAL;
 
-		data = mmio_read_buf(run->mmio.data, len);
+		data = kvm_mmio_read_buf(run->mmio.data, len);
 
 		if (vcpu->arch.mmio_decode.sign_extend &&
 		    len < sizeof(unsigned long)) {
@@ -190,7 +189,7 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
 					       len);
 
 		trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, data);
-		mmio_write_buf(data_buf, len, data);
+		kvm_mmio_write_buf(data_buf, len, data);
 
 		ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len,
 				       data_buf);
@@ -206,18 +205,19 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
 	run->mmio.is_write	= is_write;
 	run->mmio.phys_addr	= fault_ipa;
 	run->mmio.len		= len;
-	if (is_write)
-		memcpy(run->mmio.data, data_buf, len);
 
 	if (!ret) {
 		/* We handled the access successfully in the kernel. */
+		if (!is_write)
+			memcpy(run->mmio.data, data_buf, len);
 		vcpu->stat.mmio_exit_kernel++;
 		kvm_handle_mmio_return(vcpu, run);
 		return 1;
-	} else {
-		vcpu->stat.mmio_exit_user++;
 	}
 
+	if (is_write)
+		memcpy(run->mmio.data, data_buf, len);
+	vcpu->stat.mmio_exit_user++;
 	run->exit_reason	= KVM_EXIT_MMIO;
 	return 0;
 }
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index dea1452a8..45c43aecb 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -43,11 +43,9 @@ static unsigned long hyp_idmap_start;
 static unsigned long hyp_idmap_end;
 static phys_addr_t hyp_idmap_vector;
 
+#define S2_PGD_SIZE	(PTRS_PER_S2_PGD * sizeof(pgd_t))
 #define hyp_pgd_order get_order(PTRS_PER_PGD * sizeof(pgd_t))
 
-#define kvm_pmd_huge(_x)	(pmd_huge(_x) || pmd_trans_huge(_x))
-#define kvm_pud_huge(_x)	pud_huge(_x)
-
 #define KVM_S2PTE_FLAG_IS_IOMAP		(1UL << 0)
 #define KVM_S2_FLAG_LOGGING_ACTIVE	(1UL << 1)
@@ -69,14 +67,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
 
 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
-	/*
-	 * This function also gets called when dealing with HYP page
-	 * tables. As HYP doesn't have an associated struct kvm (and
-	 * the HYP page tables are fairly static), we don't do
-	 * anything there.
-	 */
-	if (kvm)
-		kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
+	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
 }
 
@@ -115,7 +106,7 @@ static bool kvm_is_device_pfn(unsigned long pfn)
  */
 static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
 {
-	if (!kvm_pmd_huge(*pmd))
+	if (!pmd_thp_or_huge(*pmd))
 		return;
 
 	pmd_clear(pmd);
@@ -155,29 +146,29 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
 	return p;
 }
 
-static void clear_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
+static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr)
 {
-	pud_t *pud_table __maybe_unused = pud_offset(pgd, 0);
-	pgd_clear(pgd);
+	pud_t *pud_table __maybe_unused = stage2_pud_offset(pgd, 0UL);
+	stage2_pgd_clear(pgd);
 	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	pud_free(NULL, pud_table);
+	stage2_pud_free(pud_table);
 	put_page(virt_to_page(pgd));
 }
 
-static void clear_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
+static void clear_stage2_pud_entry(struct kvm *kvm, pud_t *pud, phys_addr_t addr)
 {
-	pmd_t *pmd_table = pmd_offset(pud, 0);
-	VM_BUG_ON(pud_huge(*pud));
-	pud_clear(pud);
+	pmd_t *pmd_table __maybe_unused = stage2_pmd_offset(pud, 0);
+	VM_BUG_ON(stage2_pud_huge(*pud));
+	stage2_pud_clear(pud);
 	kvm_tlb_flush_vmid_ipa(kvm, addr);
-	pmd_free(NULL, pmd_table);
+	stage2_pmd_free(pmd_table);
 	put_page(virt_to_page(pud));
 }
 
-static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
+static void clear_stage2_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
 {
 	pte_t *pte_table = pte_offset_kernel(pmd, 0);
-	VM_BUG_ON(kvm_pmd_huge(*pmd));
+	VM_BUG_ON(pmd_thp_or_huge(*pmd));
 	pmd_clear(pmd);
 	kvm_tlb_flush_vmid_ipa(kvm, addr);
 	pte_free_kernel(NULL, pte_table);
@@ -204,7 +195,7 @@ static void clear_pmd_entry(struct kvm *kvm, pmd_t *pmd, phys_addr_t addr)
 * the corresponding TLBs, we call kvm_flush_dcache_p*() to make sure
 * the IO subsystem will never hit in the cache.
 */
-static void unmap_ptes(struct kvm *kvm, pmd_t *pmd,
+static void unmap_stage2_ptes(struct kvm *kvm, pmd_t *pmd,
 		       phys_addr_t addr, phys_addr_t end)
 {
 	phys_addr_t start_addr = addr;
@@ -226,21 +217,21 @@ static void unmap_ptes(struct kvm *kvm, pmd_t *pmd,
 		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
-	if (kvm_pte_table_empty(kvm, start_pte))
-		clear_pmd_entry(kvm, pmd, start_addr);
+	if (stage2_pte_table_empty(start_pte))
+		clear_stage2_pmd_entry(kvm, pmd, start_addr);
 }
 
-static void unmap_pmds(struct kvm *kvm, pud_t *pud,
+static void unmap_stage2_pmds(struct kvm *kvm, pud_t *pud,
 		       phys_addr_t addr, phys_addr_t end)
 {
 	phys_addr_t next, start_addr = addr;
 	pmd_t *pmd, *start_pmd;
 
-	start_pmd = pmd = pmd_offset(pud, addr);
+	start_pmd = pmd = stage2_pmd_offset(pud, addr);
 	do {
-		next = kvm_pmd_addr_end(addr, end);
+		next = stage2_pmd_addr_end(addr, end);
 		if (!pmd_none(*pmd)) {
-			if (kvm_pmd_huge(*pmd)) {
+			if (pmd_thp_or_huge(*pmd)) {
 				pmd_t old_pmd = *pmd;
 
 				pmd_clear(pmd);
@@ -250,57 +241,64 @@ static void unmap_pmds(struct kvm *kvm, pud_t *pud,
 
 				put_page(virt_to_page(pmd));
 			} else {
-				unmap_ptes(kvm, pmd, addr, next);
+				unmap_stage2_ptes(kvm, pmd, addr, next);
 			}
 		}
 	} while (pmd++, addr = next, addr != end);
 
-	if (kvm_pmd_table_empty(kvm, start_pmd))
-		clear_pud_entry(kvm, pud, start_addr);
+	if (stage2_pmd_table_empty(start_pmd))
+		clear_stage2_pud_entry(kvm, pud, start_addr);
 }
 
-static void unmap_puds(struct kvm *kvm, pgd_t *pgd,
+static void unmap_stage2_puds(struct kvm *kvm, pgd_t *pgd,
 		       phys_addr_t addr, phys_addr_t end)
 {
 	phys_addr_t next, start_addr = addr;
 	pud_t *pud, *start_pud;
 
-	start_pud = pud = pud_offset(pgd, addr);
+	start_pud = pud = stage2_pud_offset(pgd, addr);
 	do {
-		next = kvm_pud_addr_end(addr, end);
-		if (!pud_none(*pud)) {
-			if (pud_huge(*pud)) {
+		next = stage2_pud_addr_end(addr, end);
+		if (!stage2_pud_none(*pud)) {
+			if (stage2_pud_huge(*pud)) {
 				pud_t old_pud = *pud;
 
-				pud_clear(pud);
+				stage2_pud_clear(pud);
 				kvm_tlb_flush_vmid_ipa(kvm, addr);
-
 				kvm_flush_dcache_pud(old_pud);
-
 				put_page(virt_to_page(pud));
 			} else {
-				unmap_pmds(kvm, pud, addr, next);
+				unmap_stage2_pmds(kvm, pud, addr, next);
 			}
 		}
 	} while (pud++, addr = next, addr != end);
 
-	if (kvm_pud_table_empty(kvm, start_pud))
-		clear_pgd_entry(kvm, pgd, start_addr);
+	if (stage2_pud_table_empty(start_pud))
+		clear_stage2_pgd_entry(kvm, pgd, start_addr);
 }
 
-
-static void unmap_range(struct kvm *kvm, pgd_t *pgdp,
-			phys_addr_t start, u64 size)
+/**
+ * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
+ * @kvm:   The VM pointer
+ * @start: The intermediate physical base address of the range to unmap
+ * @size:  The size of the area to unmap
+ *
+ * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
+ * be called while holding mmu_lock (unless for freeing the stage2 pgd before
+ * destroying the VM), otherwise another faulting VCPU may come in and mess
+ * with things behind our backs.
+ */
+static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
 {
 	pgd_t *pgd;
 	phys_addr_t addr = start, end = start + size;
 	phys_addr_t next;
 
-	pgd = pgdp + kvm_pgd_index(addr);
+	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
 	do {
-		next = kvm_pgd_addr_end(addr, end);
-		if (!pgd_none(*pgd))
-			unmap_puds(kvm, pgd, addr, next);
+		next = stage2_pgd_addr_end(addr, end);
+		if (!stage2_pgd_none(*pgd))
+			unmap_stage2_puds(kvm, pgd, addr, next);
 	} while (pgd++, addr = next, addr != end);
 }
 
@@ -322,11 +320,11 @@ static void stage2_flush_pmds(struct kvm *kvm, pud_t *pud,
 	pmd_t *pmd;
 	phys_addr_t next;
 
-	pmd = pmd_offset(pud, addr);
+	pmd = stage2_pmd_offset(pud, addr);
 	do {
-		next = kvm_pmd_addr_end(addr, end);
+		next = stage2_pmd_addr_end(addr, end);
 		if (!pmd_none(*pmd)) {
-			if (kvm_pmd_huge(*pmd))
+			if (pmd_thp_or_huge(*pmd))
 				kvm_flush_dcache_pmd(*pmd);
 			else
 				stage2_flush_ptes(kvm, pmd, addr, next);
@@ -340,11 +338,11 @@ static void stage2_flush_puds(struct kvm *kvm, pgd_t *pgd,
 	pud_t *pud;
 	phys_addr_t next;
 
-	pud = pud_offset(pgd, addr);
+	pud = stage2_pud_offset(pgd, addr);
 	do {
-		next = kvm_pud_addr_end(addr, end);
-		if (!pud_none(*pud)) {
-			if (pud_huge(*pud))
+		next = stage2_pud_addr_end(addr, end);
+		if (!stage2_pud_none(*pud)) {
+			if (stage2_pud_huge(*pud))
 				kvm_flush_dcache_pud(*pud);
 			else
 				stage2_flush_pmds(kvm, pud, addr, next);
@@ -360,9 +358,9 @@ static void stage2_flush_memslot(struct kvm *kvm,
 	phys_addr_t next;
 	pgd_t *pgd;
 
-	pgd = kvm->arch.pgd + kvm_pgd_index(addr);
+	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
 	do {
-		next = kvm_pgd_addr_end(addr, end);
+		next = stage2_pgd_addr_end(addr, end);
 		stage2_flush_puds(kvm, pgd, addr, next);
 	} while (pgd++, addr = next, addr != end);
 }
@@ -391,6 +389,100 @@ static void stage2_flush_vm(struct kvm *kvm)
 	srcu_read_unlock(&kvm->srcu, idx);
 }
 
+static void clear_hyp_pgd_entry(pgd_t *pgd)
+{
+	pud_t *pud_table __maybe_unused = pud_offset(pgd, 0UL);
+	pgd_clear(pgd);
+	pud_free(NULL, pud_table);
+	put_page(virt_to_page(pgd));
+}
+
+static void clear_hyp_pud_entry(pud_t *pud)
+{
+	pmd_t *pmd_table __maybe_unused = pmd_offset(pud, 0);
+	VM_BUG_ON(pud_huge(*pud));
+	pud_clear(pud);
+	pmd_free(NULL, pmd_table);
+	put_page(virt_to_page(pud));
+}
+
+static void clear_hyp_pmd_entry(pmd_t *pmd)
+{
+	pte_t *pte_table = pte_offset_kernel(pmd, 0);
+	VM_BUG_ON(pmd_thp_or_huge(*pmd));
+	pmd_clear(pmd);
+	pte_free_kernel(NULL, pte_table);
+	put_page(virt_to_page(pmd));
+}
+
+static void unmap_hyp_ptes(pmd_t *pmd, phys_addr_t addr, phys_addr_t end)
+{
+	pte_t *pte, *start_pte;
+
+	start_pte = pte = pte_offset_kernel(pmd, addr);
+	do {
+		if (!pte_none(*pte)) {
+			kvm_set_pte(pte, __pte(0));
+			put_page(virt_to_page(pte));
+		}
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+
+	if (hyp_pte_table_empty(start_pte))
+		clear_hyp_pmd_entry(pmd);
+}
+
+static void unmap_hyp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
+{
+	phys_addr_t next;
+	pmd_t *pmd, *start_pmd;
+
+	start_pmd = pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		/* Hyp doesn't use huge pmds */
+		if (!pmd_none(*pmd))
+			unmap_hyp_ptes(pmd, addr, next);
+	} while (pmd++, addr = next, addr != end);
+
+	if (hyp_pmd_table_empty(start_pmd))
+		clear_hyp_pud_entry(pud);
+}
+
+static void unmap_hyp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
+{
+	phys_addr_t next;
+	pud_t *pud, *start_pud;
+
+	start_pud = pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		/* Hyp doesn't use huge puds */
+		if (!pud_none(*pud))
+			unmap_hyp_pmds(pud, addr, next);
+	} while (pud++, addr = next, addr != end);
+
+	if (hyp_pud_table_empty(start_pud))
+		clear_hyp_pgd_entry(pgd);
+}
+
+static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
+{
+	pgd_t *pgd;
+	phys_addr_t addr = start, end = start + size;
+	phys_addr_t next;
+
+	/*
+	 * We don't unmap anything from HYP, except at the hyp tear down.
+	 * Hence, we don't have to invalidate the TLBs here.
+	 */
+	pgd = pgdp + pgd_index(addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		if (!pgd_none(*pgd))
+			unmap_hyp_puds(pgd, addr, next);
+	} while (pgd++, addr = next, addr != end);
+}
+
 /**
 * free_boot_hyp_pgd - free HYP boot page tables
 *
@@ -401,14 +493,14 @@ void free_boot_hyp_pgd(void)
 	mutex_lock(&kvm_hyp_pgd_mutex);
 
 	if (boot_hyp_pgd) {
-		unmap_range(NULL, boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
-		unmap_range(NULL, boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
+		unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
+		unmap_hyp_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
 		free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
 		boot_hyp_pgd = NULL;
 	}
 
 	if (hyp_pgd)
-		unmap_range(NULL, hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
+		unmap_hyp_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
 
 	mutex_unlock(&kvm_hyp_pgd_mutex);
 }
@@ -433,9 +525,9 @@ void free_hyp_pgds(void)
 
 	if (hyp_pgd) {
 		for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
-			unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+			unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
 		for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
-			unmap_range(NULL, hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+			unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
 
 		free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
 		hyp_pgd = NULL;
@@ -645,20 +737,6 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
 				     __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
 }
 
-/* Free the HW pgd, one page at a time */
-static void kvm_free_hwpgd(void *hwpgd)
-{
-	free_pages_exact(hwpgd, kvm_get_hwpgd_size());
-}
-
-/* Allocate the HW PGD, making sure that each page gets its own refcount */
-static void *kvm_alloc_hwpgd(void)
-{
-	unsigned int size = kvm_get_hwpgd_size();
-
-	return alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
-}
-
 /**
 * kvm_alloc_stage2_pgd - allocate level-1 table for stage-2 translation.
 * @kvm:	The KVM struct pointer for the VM.
@@ -673,81 +751,22 @@ static void *kvm_alloc_hwpgd(void)
 int kvm_alloc_stage2_pgd(struct kvm *kvm)
 {
 	pgd_t *pgd;
-	void *hwpgd;
 
 	if (kvm->arch.pgd != NULL) {
 		kvm_err("kvm_arch already initialized?\n");
 		return -EINVAL;
 	}
 
-	hwpgd = kvm_alloc_hwpgd();
-	if (!hwpgd)
+	/* Allocate the HW PGD, making sure that each page gets its own refcount */
+	pgd = alloc_pages_exact(S2_PGD_SIZE, GFP_KERNEL | __GFP_ZERO);
+	if (!pgd)
 		return -ENOMEM;
 
-	/* When the kernel uses more levels of page tables than the
-	 * guest, we allocate a fake PGD and pre-populate it to point
-	 * to the next-level page table, which will be the real
-	 * initial page table pointed to by the VTTBR.
-	 *
-	 * When KVM_PREALLOC_LEVEL==2, we allocate a single page for
-	 * the PMD and the kernel will use folded pud.
-	 * When KVM_PREALLOC_LEVEL==1, we allocate 2 consecutive PUD
-	 * pages.
-	 */
-	if (KVM_PREALLOC_LEVEL > 0) {
-		int i;
-
-		/*
-		 * Allocate fake pgd for the page table manipulation macros to
-		 * work.  This is not used by the hardware and we have no
-		 * alignment requirement for this allocation.
-		 */
-		pgd = kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t),
-				GFP_KERNEL | __GFP_ZERO);
-
-		if (!pgd) {
-			kvm_free_hwpgd(hwpgd);
-			return -ENOMEM;
-		}
-
-		/* Plug the HW PGD into the fake one. */
-		for (i = 0; i < PTRS_PER_S2_PGD; i++) {
-			if (KVM_PREALLOC_LEVEL == 1)
-				pgd_populate(NULL, pgd + i,
-					     (pud_t *)hwpgd + i * PTRS_PER_PUD);
-			else if (KVM_PREALLOC_LEVEL == 2)
-				pud_populate(NULL, pud_offset(pgd, 0) + i,
-					     (pmd_t *)hwpgd + i * PTRS_PER_PMD);
-		}
-	} else {
-		/*
-		 * Allocate actual first-level Stage-2 page table used by the
-		 * hardware for Stage-2 page table walks.
-		 */
-		pgd = (pgd_t *)hwpgd;
-	}
-
 	kvm_clean_pgd(pgd);
 	kvm->arch.pgd = pgd;
-
 	return 0;
 }
 
-/**
- * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
- * @kvm:   The VM pointer
- * @start: The intermediate physical base address of the range to unmap
- * @size:  The size of the area to unmap
- *
- * Clear a range of stage-2 mappings, lowering the various ref-counts. Must
- * be called while holding mmu_lock (unless for freeing the stage2 pgd before
- * destroying the VM), otherwise another faulting VCPU may come in and mess
- * with things behind our backs.
- */
-static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
-{
-	unmap_range(kvm, kvm->arch.pgd, start, size);
-}
-
 static void stage2_unmap_memslot(struct kvm *kvm,
 				 struct kvm_memory_slot *memslot)
 {
@@ -830,10 +849,8 @@ void kvm_free_stage2_pgd(struct kvm *kvm)
 		return;
 
 	unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE);
-	kvm_free_hwpgd(kvm_get_hwpgd(kvm));
-	if (KVM_PREALLOC_LEVEL > 0)
-		kfree(kvm->arch.pgd);
-
+	/* Free the HW pgd, one page at a time */
+	free_pages_exact(kvm->arch.pgd, S2_PGD_SIZE);
 	kvm->arch.pgd = NULL;
 }
 
@@ -843,16 +860,16 @@ static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache
 	pgd_t *pgd;
 	pud_t *pud;
 
-	pgd = kvm->arch.pgd + kvm_pgd_index(addr);
-	if (WARN_ON(pgd_none(*pgd))) {
+	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
+	if (WARN_ON(stage2_pgd_none(*pgd))) {
 		if (!cache)
 			return NULL;
 		pud = mmu_memory_cache_alloc(cache);
-		pgd_populate(NULL, pgd, pud);
+		stage2_pgd_populate(pgd, pud);
 		get_page(virt_to_page(pgd));
 	}
 
-	return pud_offset(pgd, addr);
+	return stage2_pud_offset(pgd, addr);
 }
 
@@ -862,15 +879,15 @@ static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache
 	pmd_t *pmd;
 
 	pud = stage2_get_pud(kvm, cache, addr);
-	if (pud_none(*pud)) {
+	if (stage2_pud_none(*pud)) {
 		if (!cache)
 			return NULL;
 		pmd = mmu_memory_cache_alloc(cache);
-		pud_populate(NULL, pud, pmd);
+		stage2_pud_populate(pud, pmd);
 		get_page(virt_to_page(pud));
 	}
 
-	return pmd_offset(pud, addr);
+	return stage2_pmd_offset(pud, addr);
 }
 
 static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
@@ -960,6 +977,27 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
 	return 0;
 }
 
+#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+static int stage2_ptep_test_and_clear_young(pte_t *pte)
+{
+	if (pte_young(*pte)) {
+		*pte = pte_mkold(*pte);
+		return 1;
+	}
+	return 0;
+}
+#else
+static int stage2_ptep_test_and_clear_young(pte_t *pte)
+{
+	return __ptep_test_and_clear_young(pte);
+}
+#endif
+
+static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
+{
+	return stage2_ptep_test_and_clear_young((pte_t *)pmd);
+}
+
 /**
 * kvm_phys_addr_ioremap - map a device range to guest IPA
 *
@@ -983,7 +1021,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 		pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);
 
 		if (writable)
-			kvm_set_s2pte_writable(&pte);
+			pte = kvm_s2pte_mkwrite(pte);
 
 		ret = mmu_topup_memory_cache(&cache, KVM_MMU_CACHE_MIN_PAGES,
 						KVM_NR_MEM_OBJS);
@@ -1083,12 +1121,12 @@ static void stage2_wp_pmds(pud_t *pud, phys_addr_t addr, phys_addr_t end)
 	pmd_t *pmd;
 	phys_addr_t next;
 
-	pmd = pmd_offset(pud, addr);
+	pmd = stage2_pmd_offset(pud, addr);
 
 	do {
-		next = kvm_pmd_addr_end(addr, end);
+		next = stage2_pmd_addr_end(addr, end);
 		if (!pmd_none(*pmd)) {
-			if (kvm_pmd_huge(*pmd)) {
+			if (pmd_thp_or_huge(*pmd)) {
 				if (!kvm_s2pmd_readonly(pmd))
 					kvm_set_s2pmd_readonly(pmd);
 			} else {
@@ -1111,12 +1149,12 @@ static void stage2_wp_puds(pgd_t *pgd, phys_addr_t addr, phys_addr_t end)
 	pud_t *pud;
 	phys_addr_t next;
 
-	pud = pud_offset(pgd, addr);
+	pud = stage2_pud_offset(pgd, addr);
 	do {
-		next = kvm_pud_addr_end(addr, end);
-		if (!pud_none(*pud)) {
+		next = stage2_pud_addr_end(addr, end);
+		if (!stage2_pud_none(*pud)) {
 			/* TODO:PUD not supported, revisit later if supported */
-			BUG_ON(kvm_pud_huge(*pud));
+			BUG_ON(stage2_pud_huge(*pud));
 			stage2_wp_pmds(pud, addr, next);
 		}
 	} while (pud++, addr = next, addr != end);
@@ -1133,7 +1171,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 	pgd_t *pgd;
 	phys_addr_t next;
 
-	pgd = kvm->arch.pgd + kvm_pgd_index(addr);
+	pgd = kvm->arch.pgd + stage2_pgd_index(addr);
 	do {
 		/*
 		 * Release kvm_mmu_lock periodically if the memory region is
@@ -1145,8 +1183,8 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 		if (need_resched() || spin_needbreak(&kvm->mmu_lock))
 			cond_resched_lock(&kvm->mmu_lock);
 
-		next = kvm_pgd_addr_end(addr, end);
-		if (pgd_present(*pgd))
+		next = stage2_pgd_addr_end(addr, end);
+		if (stage2_pgd_present(*pgd))
 			stage2_wp_puds(pgd, addr, next);
 	} while (pgd++, addr = next, addr != end);
 }
@@ -1325,7 +1363,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		pmd_t new_pmd = pfn_pmd(pfn, mem_type);
 		new_pmd = pmd_mkhuge(new_pmd);
 		if (writable) {
-			kvm_set_s2pmd_writable(&new_pmd);
+			new_pmd = kvm_s2pmd_mkwrite(new_pmd);
 			kvm_set_pfn_dirty(pfn);
 		}
 		coherent_cache_guest_page(vcpu, pfn, PMD_SIZE, fault_ipa_uncached);
@@ -1334,7 +1372,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		pte_t new_pte = pfn_pte(pfn, mem_type);
 
 		if (writable) {
-			kvm_set_s2pte_writable(&new_pte);
+			new_pte = kvm_s2pte_mkwrite(new_pte);
 			kvm_set_pfn_dirty(pfn);
 			mark_page_dirty(kvm, gfn);
 		}
@@ -1353,6 +1391,8 @@ out_unlock:
 * Resolve the access fault by making the page young again.
 * Note that because the faulting entry is guaranteed not to be
 * cached in the TLB, we don't need to invalidate anything.
+ * Only the HW Access Flag updates are supported for Stage 2 (no DBM),
+ * so there is no need for atomic (pte|pmd)_mkyoung operations.
 */
 static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
 {
@@ -1369,7 +1409,7 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
 	if (!pmd || pmd_none(*pmd))	/* Nothing there */
 		goto out;
 
-	if (kvm_pmd_huge(*pmd)) {	/* THP, HugeTLB */
+	if (pmd_thp_or_huge(*pmd)) {	/* THP, HugeTLB */
 		*pmd = pmd_mkyoung(*pmd);
 		pfn = pmd_pfn(*pmd);
 		pfn_valid = true;
@@ -1593,25 +1633,14 @@ static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
 	if (!pmd || pmd_none(*pmd))	/* Nothing there */
 		return 0;
 
-	if (kvm_pmd_huge(*pmd)) {	/* THP, HugeTLB */
-		if (pmd_young(*pmd)) {
-			*pmd = pmd_mkold(*pmd);
-			return 1;
-		}
-
-		return 0;
-	}
+	if (pmd_thp_or_huge(*pmd))	/* THP, HugeTLB */
+		return stage2_pmdp_test_and_clear_young(pmd);
 
 	pte = pte_offset_kernel(pmd, gpa);
 	if (pte_none(*pte))
 		return 0;
 
-	if (pte_young(*pte)) {
-		*pte = pte_mkold(*pte);	/* Just a page... */
-		return 1;
-	}
-
-	return 0;
+	return stage2_ptep_test_and_clear_young(pte);
 }
 
 static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
@@ -1623,7 +1652,7 @@ static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data)
 	if (!pmd || pmd_none(*pmd))	/* Nothing there */
 		return 0;
 
-	if (kvm_pmd_huge(*pmd))		/* THP, HugeTLB */
+	if (pmd_thp_or_huge(*pmd))	/* THP, HugeTLB */
 		return pmd_young(*pmd);
 
 	pte = pte_offset_kernel(pmd, gpa);
@@ -1671,6 +1700,11 @@ phys_addr_t kvm_get_idmap_vector(void)
 	return hyp_idmap_vector;
 }
 
+phys_addr_t kvm_get_idmap_start(void)
+{
+	return hyp_idmap_start;
+}
+
 int kvm_mmu_init(void)
 {
 	int err;