diff options
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/Makefile | 3 | ||||
-rw-r--r-- | arch/x86/kvm/assigned-dev.c | 14 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.c | 78 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.h | 17 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 37 | ||||
-rw-r--r-- | arch/x86/kvm/hyperv.c | 55 | ||||
-rw-r--r-- | arch/x86/kvm/i8254.c | 348 | ||||
-rw-r--r-- | arch/x86/kvm/i8254.h | 17 | ||||
-rw-r--r-- | arch/x86/kvm/ioapic.c | 30 | ||||
-rw-r--r-- | arch/x86/kvm/ioapic.h | 17 | ||||
-rw-r--r-- | arch/x86/kvm/irq.c | 9 | ||||
-rw-r--r-- | arch/x86/kvm/irq.h | 8 | ||||
-rw-r--r-- | arch/x86/kvm/irq_comm.c | 27 | ||||
-rw-r--r-- | arch/x86/kvm/kvm_cache_regs.h | 5 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 164 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.h | 17 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 624 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 50 | ||||
-rw-r--r-- | arch/x86/kvm/mtrr.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/page_track.c | 227 | ||||
-rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 67 | ||||
-rw-r--r-- | arch/x86/kvm/pmu.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 11 | ||||
-rw-r--r-- | arch/x86/kvm/trace.h | 12 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 146 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 185 | ||||
-rw-r--r-- | arch/x86/kvm/x86.h | 19 |
27 files changed, 1454 insertions, 737 deletions
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index a1ff508bb..464fa477a 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -13,9 +13,10 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ - hyperv.o + hyperv.o page_track.o kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o + kvm-intel-y += vmx.o pmu_intel.o kvm-amd-y += svm.o pmu_amd.o diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c index 9dc091acd..308b8597c 100644 --- a/arch/x86/kvm/assigned-dev.c +++ b/arch/x86/kvm/assigned-dev.c @@ -51,11 +51,9 @@ struct kvm_assigned_dev_kernel { static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, int assigned_dev_id) { - struct list_head *ptr; struct kvm_assigned_dev_kernel *match; - list_for_each(ptr, head) { - match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); + list_for_each_entry(match, head, list) { if (match->assigned_dev_id == assigned_dev_id) return match; } @@ -373,14 +371,10 @@ static void kvm_free_assigned_device(struct kvm *kvm, void kvm_free_all_assigned_devices(struct kvm *kvm) { - struct list_head *ptr, *ptr2; - struct kvm_assigned_dev_kernel *assigned_dev; - - list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { - assigned_dev = list_entry(ptr, - struct kvm_assigned_dev_kernel, - list); + struct kvm_assigned_dev_kernel *assigned_dev, *tmp; + list_for_each_entry_safe(assigned_dev, tmp, + &kvm->arch.assigned_dev_head, list) { kvm_free_assigned_device(kvm, assigned_dev); } } diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 6525e926f..bbbaa802d 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -46,11 +46,18 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) return ret; } +bool kvm_mpx_supported(void) +{ + return ((host_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) + && kvm_x86_ops->mpx_supported()); +} +EXPORT_SYMBOL_GPL(kvm_mpx_supported); + u64 kvm_supported_xcr0(void) { u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0; - if (!kvm_x86_ops->mpx_supported()) + if (!kvm_mpx_supported()) xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); return xcr0; @@ -81,6 +88,16 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) apic->lapic_timer.timer_mode_mask = 1 << 17; } + best = kvm_find_cpuid_entry(vcpu, 7, 0); + if (best) { + /* Update OSPKE bit */ + if (boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) { + best->ecx &= ~F(OSPKE); + if (kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) + best->ecx |= F(OSPKE); + } + } + best = kvm_find_cpuid_entry(vcpu, 0xD, 0); if (!best) { vcpu->arch.guest_supported_xcr0 = 0; @@ -97,8 +114,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - vcpu->arch.eager_fpu = use_eager_fpu() || guest_cpuid_has_mpx(vcpu); - if (vcpu->arch.eager_fpu) + if (use_eager_fpu()) kvm_x86_ops->fpu_activate(vcpu); /* @@ -295,11 +311,11 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, #endif unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; - unsigned f_mpx = kvm_x86_ops->mpx_supported() ? F(MPX) : 0; + unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0; unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; /* cpuid 1.edx */ - const u32 kvm_supported_word0_x86_features = + const u32 kvm_cpuid_1_edx_x86_features = F(FPU) | F(VME) | F(DE) | F(PSE) | F(TSC) | F(MSR) | F(PAE) | F(MCE) | F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | @@ -309,7 +325,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | 0 /* HTT, TM, Reserved, PBE */; /* cpuid 0x80000001.edx */ - const u32 kvm_supported_word1_x86_features = + const u32 kvm_cpuid_8000_0001_edx_x86_features = F(FPU) | F(VME) | F(DE) | F(PSE) | F(TSC) | F(MSR) | F(PAE) | F(MCE) | F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | @@ -319,7 +335,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ - const u32 kvm_supported_word4_x86_features = + const u32 kvm_cpuid_1_ecx_x86_features = /* NOTE: MONITOR (and MWAIT) are emulated as NOP, * but *not* advertised to guests via CPUID ! */ F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | @@ -331,29 +347,32 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | F(F16C) | F(RDRAND); /* cpuid 0x80000001.ecx */ - const u32 kvm_supported_word6_x86_features = + const u32 kvm_cpuid_8000_0001_ecx_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); /* cpuid 0xC0000001.edx */ - const u32 kvm_supported_word5_x86_features = + const u32 kvm_cpuid_C000_0001_edx_x86_features = F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | F(PMM) | F(PMM_EN); /* cpuid 7.0.ebx */ - const u32 kvm_supported_word9_x86_features = + const u32 kvm_cpuid_7_0_ebx_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT); /* cpuid 0xD.1.eax */ - const u32 kvm_supported_word10_x86_features = + const u32 kvm_cpuid_D_1_eax_x86_features = F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves; + /* cpuid 7.0.ecx*/ + const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/; + /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -370,10 +389,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->eax = min(entry->eax, (u32)0xd); break; case 1: - entry->edx &= kvm_supported_word0_x86_features; - cpuid_mask(&entry->edx, 0); - entry->ecx &= kvm_supported_word4_x86_features; - cpuid_mask(&entry->ecx, 4); + entry->edx &= kvm_cpuid_1_edx_x86_features; + cpuid_mask(&entry->edx, CPUID_1_EDX); + entry->ecx &= kvm_cpuid_1_ecx_x86_features; + cpuid_mask(&entry->ecx, CPUID_1_ECX); /* we support x2apic emulation even if host does not support * it since we emulate x2apic in software */ entry->ecx |= F(X2APIC); @@ -427,14 +446,20 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* Mask ebx against host capability word 9 */ if (index == 0) { - entry->ebx &= kvm_supported_word9_x86_features; - cpuid_mask(&entry->ebx, 9); + entry->ebx &= kvm_cpuid_7_0_ebx_x86_features; + cpuid_mask(&entry->ebx, CPUID_7_0_EBX); // TSC_ADJUST is emulated entry->ebx |= F(TSC_ADJUST); - } else + entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; + cpuid_mask(&entry->ecx, CPUID_7_ECX); + /* PKU is not yet implemented for shadow paging. */ + if (!tdp_enabled) + entry->ecx &= ~F(PKU); + } else { entry->ebx = 0; + entry->ecx = 0; + } entry->eax = 0; - entry->ecx = 0; entry->edx = 0; break; } @@ -508,7 +533,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, do_cpuid_1_ent(&entry[i], function, idx); if (idx == 1) { - entry[i].eax &= kvm_supported_word10_x86_features; + entry[i].eax &= kvm_cpuid_D_1_eax_x86_features; + cpuid_mask(&entry[i].eax, CPUID_D_1_EAX); entry[i].ebx = 0; if (entry[i].eax & (F(XSAVES)|F(XSAVEC))) entry[i].ebx = @@ -558,10 +584,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->eax = min(entry->eax, 0x8000001a); break; case 0x80000001: - entry->edx &= kvm_supported_word1_x86_features; - cpuid_mask(&entry->edx, 1); - entry->ecx &= kvm_supported_word6_x86_features; - cpuid_mask(&entry->ecx, 6); + entry->edx &= kvm_cpuid_8000_0001_edx_x86_features; + cpuid_mask(&entry->edx, CPUID_8000_0001_EDX); + entry->ecx &= kvm_cpuid_8000_0001_ecx_x86_features; + cpuid_mask(&entry->ecx, CPUID_8000_0001_ECX); break; case 0x80000007: /* Advanced power management */ /* invariant TSC is CPUID.80000007H:EDX[8] */ @@ -594,8 +620,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->eax = min(entry->eax, 0xC0000004); break; case 0xC0000001: - entry->edx &= kvm_supported_word5_x86_features; - cpuid_mask(&entry->edx, 5); + entry->edx &= kvm_cpuid_C000_0001_edx_x86_features; + cpuid_mask(&entry->edx, CPUID_C000_0001_EDX); break; case 3: /* Processor serial number */ case 5: /* MONITOR/MWAIT */ diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index c8eda1498..e17a74b1d 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -5,6 +5,7 @@ #include <asm/cpu.h> int kvm_update_cpuid(struct kvm_vcpu *vcpu); +bool kvm_mpx_supported(void); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, @@ -79,6 +80,14 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); } +static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ecx & bit(X86_FEATURE_PKU)); +} + static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -135,14 +144,6 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_RTM)); } -static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 7, 0); - return best && (best->ebx & bit(X86_FEATURE_MPX)); -} - static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b9b09fec1..a2f24af3c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -309,23 +309,29 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); -#define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t" +#define FOP_FUNC(name) \ + ".align " __stringify(FASTOP_SIZE) " \n\t" \ + ".type " name ", @function \n\t" \ + name ":\n\t" + #define FOP_RET "ret \n\t" #define FOP_START(op) \ extern void em_##op(struct fastop *fake); \ asm(".pushsection .text, \"ax\" \n\t" \ ".global em_" #op " \n\t" \ - FOP_ALIGN \ - "em_" #op ": \n\t" + FOP_FUNC("em_" #op) #define FOP_END \ ".popsection") -#define FOPNOP() FOP_ALIGN FOP_RET +#define FOPNOP() \ + FOP_FUNC(__stringify(__UNIQUE_ID(nop))) \ + FOP_RET #define FOP1E(op, dst) \ - FOP_ALIGN "10: " #op " %" #dst " \n\t" FOP_RET + FOP_FUNC(#op "_" #dst) \ + "10: " #op " %" #dst " \n\t" FOP_RET #define FOP1EEX(op, dst) \ FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception) @@ -357,7 +363,8 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); FOP_END #define FOP2E(op, dst, src) \ - FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET + FOP_FUNC(#op "_" #dst "_" #src) \ + #op " %" #src ", %" #dst " \n\t" FOP_RET #define FASTOP2(op) \ FOP_START(op) \ @@ -395,7 +402,8 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); FOP_END #define FOP3E(op, dst, src, src2) \ - FOP_ALIGN #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET + FOP_FUNC(#op "_" #dst "_" #src "_" #src2) \ + #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET /* 3-operand, word-only, src2=cl */ #define FASTOP3WCL(op) \ @@ -407,7 +415,12 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); FOP_END /* Special case for SETcc - 1 instruction per cc */ -#define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t" +#define FOP_SETCC(op) \ + ".align 4 \n\t" \ + ".type " #op ", @function \n\t" \ + #op ": \n\t" \ + #op " %al \n\t" \ + FOP_RET asm(".global kvm_fastop_exception \n" "kvm_fastop_exception: xor %esi, %esi; ret"); @@ -956,7 +969,7 @@ static int em_bsr_c(struct x86_emulate_ctxt *ctxt) return fastop(ctxt, em_bsr); } -static u8 test_cc(unsigned int condition, unsigned long flags) +static __always_inline u8 test_cc(unsigned int condition, unsigned long flags) { u8 rc; void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf); @@ -5097,13 +5110,17 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) { + register void *__sp asm(_ASM_SP); ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; + if (!(ctxt->d & ByteOp)) fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; + asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), - [fastop]"+S"(fop) + [fastop]"+S"(fop), "+r"(__sp) : "c"(ctxt->src2.val)); + ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); if (!fop) /* exception is returned in fop variable */ return emulate_de(ctxt); diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index c58ba6717..01bd7b7a6 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1043,6 +1043,27 @@ bool kvm_hv_hypercall_enabled(struct kvm *kvm) return kvm->arch.hyperv.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; } +static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) +{ + bool longmode; + + longmode = is_64_bit_mode(vcpu); + if (longmode) + kvm_register_write(vcpu, VCPU_REGS_RAX, result); + else { + kvm_register_write(vcpu, VCPU_REGS_RDX, result >> 32); + kvm_register_write(vcpu, VCPU_REGS_RAX, result & 0xffffffff); + } +} + +static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + + kvm_hv_hypercall_set_result(vcpu, run->hyperv.u.hcall.result); + return 1; +} + int kvm_hv_hypercall(struct kvm_vcpu *vcpu) { u64 param, ingpa, outgpa, ret; @@ -1055,7 +1076,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) */ if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); - return 0; + return 1; } longmode = is_64_bit_mode(vcpu); @@ -1083,22 +1104,38 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); + /* Hypercall continuation is not supported yet */ + if (rep_cnt || rep_idx) { + res = HV_STATUS_INVALID_HYPERCALL_CODE; + goto set_result; + } + switch (code) { - case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT: + case HVCALL_NOTIFY_LONG_SPIN_WAIT: kvm_vcpu_on_spin(vcpu); break; + case HVCALL_POST_MESSAGE: + case HVCALL_SIGNAL_EVENT: + /* don't bother userspace if it has no way to handle it */ + if (!vcpu_to_synic(vcpu)->active) { + res = HV_STATUS_INVALID_HYPERCALL_CODE; + break; + } + vcpu->run->exit_reason = KVM_EXIT_HYPERV; + vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL; + vcpu->run->hyperv.u.hcall.input = param; + vcpu->run->hyperv.u.hcall.params[0] = ingpa; + vcpu->run->hyperv.u.hcall.params[1] = outgpa; + vcpu->arch.complete_userspace_io = + kvm_hv_hypercall_complete_userspace; + return 0; default: res = HV_STATUS_INVALID_HYPERCALL_CODE; break; } +set_result: ret = res | (((u64)rep_done & 0xfff) << 32); - if (longmode) { - kvm_register_write(vcpu, VCPU_REGS_RAX, ret); - } else { - kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32); - kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff); - } - + kvm_hv_hypercall_set_result(vcpu, ret); return 1; } diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index ab5318727..a4bf5b45d 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -51,32 +51,9 @@ #define RW_STATE_WORD0 3 #define RW_STATE_WORD1 4 -/* Compute with 96 bit intermediate result: (a*b)/c */ -static u64 muldiv64(u64 a, u32 b, u32 c) +static void pit_set_gate(struct kvm_pit *pit, int channel, u32 val) { - union { - u64 ll; - struct { - u32 low, high; - } l; - } u, res; - u64 rl, rh; - - u.ll = a; - rl = (u64)u.l.low * (u64)b; - rh = (u64)u.l.high * (u64)b; - rh += (rl >> 32); - res.l.high = div64_u64(rh, c); - res.l.low = div64_u64(((mod_64(rh, c) << 32) + (rl & 0xffffffff)), c); - return res.ll; -} - -static void pit_set_gate(struct kvm *kvm, int channel, u32 val) -{ - struct kvm_kpit_channel_state *c = - &kvm->arch.vpit->pit_state.channels[channel]; - - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); + struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; switch (c->mode) { default: @@ -97,18 +74,16 @@ static void pit_set_gate(struct kvm *kvm, int channel, u32 val) c->gate = val; } -static int pit_get_gate(struct kvm *kvm, int channel) +static int pit_get_gate(struct kvm_pit *pit, int channel) { - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - - return kvm->arch.vpit->pit_state.channels[channel].gate; + return pit->pit_state.channels[channel].gate; } -static s64 __kpit_elapsed(struct kvm *kvm) +static s64 __kpit_elapsed(struct kvm_pit *pit) { s64 elapsed; ktime_t remaining; - struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; + struct kvm_kpit_state *ps = &pit->pit_state; if (!ps->period) return 0; @@ -128,26 +103,23 @@ static s64 __kpit_elapsed(struct kvm *kvm) return elapsed; } -static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c, +static s64 kpit_elapsed(struct kvm_pit *pit, struct kvm_kpit_channel_state *c, int channel) { if (channel == 0) - return __kpit_elapsed(kvm); + return __kpit_elapsed(pit); return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); } -static int pit_get_count(struct kvm *kvm, int channel) +static int pit_get_count(struct kvm_pit *pit, int channel) { - struct kvm_kpit_channel_state *c = - &kvm->arch.vpit->pit_state.channels[channel]; + struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; s64 d, t; int counter; - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - - t = kpit_elapsed(kvm, c, channel); - d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); + t = kpit_elapsed(pit, c, channel); + d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC); switch (c->mode) { case 0: @@ -167,17 +139,14 @@ static int pit_get_count(struct kvm *kvm, int channel) return counter; } -static int pit_get_out(struct kvm *kvm, int channel) +static int pit_get_out(struct kvm_pit *pit, int channel) { - struct kvm_kpit_channel_state *c = - &kvm->arch.vpit->pit_state.channels[channel]; + struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; s64 d, t; int out; - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); - - t = kpit_elapsed(kvm, c, channel); - d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); + t = kpit_elapsed(pit, c, channel); + d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC); switch (c->mode) { default: @@ -202,29 +171,23 @@ static int pit_get_out(struct kvm *kvm, int channel) return out; } -static void pit_latch_count(struct kvm *kvm, int channel) +static void pit_latch_count(struct kvm_pit *pit, int channel) { - struct kvm_kpit_channel_state *c = - &kvm->arch.vpit->pit_state.channels[channel]; - - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); + struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; if (!c->count_latched) { - c->latched_count = pit_get_count(kvm, channel); + c->latched_count = pit_get_count(pit, channel); c->count_latched = c->rw_mode; } } -static void pit_latch_status(struct kvm *kvm, int channel) +static void pit_latch_status(struct kvm_pit *pit, int channel) { - struct kvm_kpit_channel_state *c = - &kvm->arch.vpit->pit_state.channels[channel]; - - WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); + struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel]; if (!c->status_latched) { /* TODO: Return NULL COUNT (bit 6). */ - c->status = ((pit_get_out(kvm, channel) << 7) | + c->status = ((pit_get_out(pit, channel) << 7) | (c->rw_mode << 4) | (c->mode << 1) | c->bcd); @@ -232,26 +195,24 @@ static void pit_latch_status(struct kvm *kvm, int channel) } } +static inline struct kvm_pit *pit_state_to_pit(struct kvm_kpit_state *ps) +{ + return container_of(ps, struct kvm_pit, pit_state); +} + static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, irq_ack_notifier); - int value; - - spin_lock(&ps->inject_lock); - value = atomic_dec_return(&ps->pending); - if (value < 0) - /* spurious acks can be generated if, for example, the - * PIC is being reset. Handle it gracefully here - */ - atomic_inc(&ps->pending); - else if (value > 0 && ps->reinject) - /* in this case, we had multiple outstanding pit interrupts - * that we needed to inject. Reinject - */ - queue_kthread_work(&ps->pit->worker, &ps->pit->expired); - ps->irq_ack = 1; - spin_unlock(&ps->inject_lock); + struct kvm_pit *pit = pit_state_to_pit(ps); + + atomic_set(&ps->irq_ack, 1); + /* irq_ack should be set before pending is read. Order accesses with + * inc(pending) in pit_timer_fn and xchg(irq_ack, 0) in pit_do_work. + */ + smp_mb(); + if (atomic_dec_if_positive(&ps->pending) > 0) + queue_kthread_work(&pit->worker, &pit->expired); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -282,44 +243,33 @@ static void pit_do_work(struct kthread_work *work) struct kvm_vcpu *vcpu; int i; struct kvm_kpit_state *ps = &pit->pit_state; - int inject = 0; - /* Try to inject pending interrupts when - * last one has been acked. + if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0)) + return; + + kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false); + kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false); + + /* + * Provides NMI watchdog support via Virtual Wire mode. + * The route is: PIT -> LVT0 in NMI mode. + * + * Note: Our Virtual Wire implementation does not follow + * the MP specification. We propagate a PIT interrupt to all + * VCPUs and only when LVT0 is in NMI mode. The interrupt can + * also be simultaneously delivered through PIC and IOAPIC. */ - spin_lock(&ps->inject_lock); - if (!ps->reinject) - inject = 1; - else if (ps->irq_ack) { - ps->irq_ack = 0; - inject = 1; - } - spin_unlock(&ps->inject_lock); - if (inject) { - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1, false); - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0, false); - - /* - * Provides NMI watchdog support via Virtual Wire mode. - * The route is: PIT -> PIC -> LVT0 in NMI mode. - * - * Note: Our Virtual Wire implementation is simplified, only - * propagating PIT interrupts to all VCPUs when they have set - * LVT0 to NMI delivery. Other PIC interrupts are just sent to - * VCPU0, and only if its LVT0 is in EXTINT mode. - */ - if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0) - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_apic_nmi_wd_deliver(vcpu); - } + if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0) + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_apic_nmi_wd_deliver(vcpu); } static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) { struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer); - struct kvm_pit *pt = ps->kvm->arch.vpit; + struct kvm_pit *pt = pit_state_to_pit(ps); - if (ps->reinject) + if (atomic_read(&ps->reinject)) atomic_inc(&ps->pending); queue_kthread_work(&pt->worker, &pt->expired); @@ -331,30 +281,54 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) return HRTIMER_NORESTART; } -static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) +static inline void kvm_pit_reset_reinject(struct kvm_pit *pit) +{ + atomic_set(&pit->pit_state.pending, 0); + atomic_set(&pit->pit_state.irq_ack, 1); +} + +void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject) +{ + struct kvm_kpit_state *ps = &pit->pit_state; + struct kvm *kvm = pit->kvm; + + if (atomic_read(&ps->reinject) == reinject) + return; + + if (reinject) { + /* The initial state is preserved while ps->reinject == 0. */ + kvm_pit_reset_reinject(pit); + kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier); + kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + } else { + kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier); + kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + } + + atomic_set(&ps->reinject, reinject); +} + +static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period) { - struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; + struct kvm_kpit_state *ps = &pit->pit_state; + struct kvm *kvm = pit->kvm; s64 interval; if (!ioapic_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) return; - interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); + interval = mul_u64_u32_div(val, NSEC_PER_SEC, KVM_PIT_FREQ); pr_debug("create pit timer, interval is %llu nsec\n", interval); /* TODO The new value only affected after the retriggered */ hrtimer_cancel(&ps->timer); - flush_kthread_work(&ps->pit->expired); + flush_kthread_work(&pit->expired); ps->period = interval; ps->is_periodic = is_period; - ps->timer.function = pit_timer_fn; - ps->kvm = ps->pit->kvm; - - atomic_set(&ps->pending, 0); - ps->irq_ack = 1; + kvm_pit_reset_reinject(pit); /* * Do not allow the guest to program periodic timers with small @@ -377,11 +351,9 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) HRTIMER_MODE_ABS); } -static void pit_load_count(struct kvm *kvm, int channel, u32 val) +static void pit_load_count(struct kvm_pit *pit, int channel, u32 val) { - struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; - - WARN_ON(!mutex_is_locked(&ps->lock)); + struct kvm_kpit_state *ps = &pit->pit_state; pr_debug("load_count val is %d, channel is %d\n", val, channel); @@ -406,29 +378,33 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) case 1: /* FIXME: enhance mode 4 precision */ case 4: - create_pit_timer(kvm, val, 0); + create_pit_timer(pit, val, 0); break; case 2: case 3: - create_pit_timer(kvm, val, 1); + create_pit_timer(pit, val, 1); break; default: - destroy_pit_timer(kvm->arch.vpit); + destroy_pit_timer(pit); } } -void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start) +void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val, + int hpet_legacy_start) { u8 saved_mode; + + WARN_ON_ONCE(!mutex_is_locked(&pit->pit_state.lock)); + if (hpet_legacy_start) { /* save existing mode for later reenablement */ WARN_ON(channel != 0); - saved_mode = kvm->arch.vpit->pit_state.channels[0].mode; - kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */ - pit_load_count(kvm, channel, val); - kvm->arch.vpit->pit_state.channels[0].mode = saved_mode; + saved_mode = pit->pit_state.channels[0].mode; + pit->pit_state.channels[0].mode = 0xff; /* disable timer */ + pit_load_count(pit, channel, val); + pit->pit_state.channels[0].mode = saved_mode; } else { - pit_load_count(kvm, channel, val); + pit_load_count(pit, channel, val); } } @@ -454,7 +430,6 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu, { struct kvm_pit *pit = dev_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; - struct kvm *kvm = pit->kvm; int channel, access; struct kvm_kpit_channel_state *s; u32 val = *(u32 *) data; @@ -478,9 +453,9 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu, s = &pit_state->channels[channel]; if (val & (2 << channel)) { if (!(val & 0x20)) - pit_latch_count(kvm, channel); + pit_latch_count(pit, channel); if (!(val & 0x10)) - pit_latch_status(kvm, channel); + pit_latch_status(pit, channel); } } } else { @@ -488,7 +463,7 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu, s = &pit_state->channels[channel]; access = (val >> 4) & KVM_PIT_CHANNEL_MASK; if (access == 0) { - pit_latch_count(kvm, channel); + pit_latch_count(pit, channel); } else { s->rw_mode = access; s->read_state = access; @@ -505,17 +480,17 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu, switch (s->write_state) { default: case RW_STATE_LSB: - pit_load_count(kvm, addr, val); + pit_load_count(pit, addr, val); break; case RW_STATE_MSB: - pit_load_count(kvm, addr, val << 8); + pit_load_count(pit, addr, val << 8); break; case RW_STATE_WORD0: s->write_latch = val; s->write_state = RW_STATE_WORD1; break; case RW_STATE_WORD1: - pit_load_count(kvm, addr, s->write_latch | (val << 8)); + pit_load_count(pit, addr, s->write_latch | (val << 8)); s->write_state = RW_STATE_WORD0; break; } @@ -531,7 +506,6 @@ static int pit_ioport_read(struct kvm_vcpu *vcpu, { struct kvm_pit *pit = dev_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; - struct kvm *kvm = pit->kvm; int ret, count; struct kvm_kpit_channel_state *s; if (!pit_in_range(addr)) @@ -568,20 +542,20 @@ static int pit_ioport_read(struct kvm_vcpu *vcpu, switch (s->read_state) { default: case RW_STATE_LSB: - count = pit_get_count(kvm, addr); + count = pit_get_count(pit, addr); ret = count & 0xff; break; case RW_STATE_MSB: - count = pit_get_count(kvm, addr); + count = pit_get_count(pit, addr); ret = (count >> 8) & 0xff; break; case RW_STATE_WORD0: - count = pit_get_count(kvm, addr); + count = pit_get_count(pit, addr); ret = count & 0xff; s->read_state = RW_STATE_WORD1; break; case RW_STATE_WORD1: - count = pit_get_count(kvm, addr); + count = pit_get_count(pit, addr); ret = (count >> 8) & 0xff; s->read_state = RW_STATE_WORD0; break; @@ -602,14 +576,13 @@ static int speaker_ioport_write(struct kvm_vcpu *vcpu, { struct kvm_pit *pit = speaker_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; - struct kvm *kvm = pit->kvm; u32 val = *(u32 *) data; if (addr != KVM_SPEAKER_BASE_ADDRESS) return -EOPNOTSUPP; mutex_lock(&pit_state->lock); pit_state->speaker_data_on = (val >> 1) & 1; - pit_set_gate(kvm, 2, val & 1); + pit_set_gate(pit, 2, val & 1); mutex_unlock(&pit_state->lock); return 0; } @@ -620,7 +593,6 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu, { struct kvm_pit *pit = speaker_to_pit(this); struct kvm_kpit_state *pit_state = &pit->pit_state; - struct kvm *kvm = pit->kvm; unsigned int refresh_clock; int ret; if (addr != KVM_SPEAKER_BASE_ADDRESS) @@ -630,8 +602,8 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu, refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; mutex_lock(&pit_state->lock); - ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(kvm, 2) | - (pit_get_out(kvm, 2) << 5) | (refresh_clock << 4)); + ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(pit, 2) | + (pit_get_out(pit, 2) << 5) | (refresh_clock << 4)); if (len > sizeof(ret)) len = sizeof(ret); memcpy(data, (char *)&ret, len); @@ -639,33 +611,28 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu, return 0; } -void kvm_pit_reset(struct kvm_pit *pit) +static void kvm_pit_reset(struct kvm_pit *pit) { int i; struct kvm_kpit_channel_state *c; - mutex_lock(&pit->pit_state.lock); pit->pit_state.flags = 0; for (i = 0; i < 3; i++) { c = &pit->pit_state.channels[i]; c->mode = 0xff; c->gate = (i != 2); - pit_load_count(pit->kvm, i, 0); + pit_load_count(pit, i, 0); } - mutex_unlock(&pit->pit_state.lock); - atomic_set(&pit->pit_state.pending, 0); - pit->pit_state.irq_ack = 1; + kvm_pit_reset_reinject(pit); } static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask) { struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier); - if (!mask) { - atomic_set(&pit->pit_state.pending, 0); - pit->pit_state.irq_ack = 1; - } + if (!mask) + kvm_pit_reset_reinject(pit); } static const struct kvm_io_device_ops pit_dev_ops = { @@ -692,14 +659,10 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) return NULL; pit->irq_source_id = kvm_request_irq_source_id(kvm); - if (pit->irq_source_id < 0) { - kfree(pit); - return NULL; - } + if (pit->irq_source_id < 0) + goto fail_request; mutex_init(&pit->pit_state.lock); - mutex_lock(&pit->pit_state.lock); - spin_lock_init(&pit->pit_state.inject_lock); pid = get_pid(task_tgid(current)); pid_nr = pid_vnr(pid); @@ -708,36 +671,30 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) init_kthread_worker(&pit->worker); pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker, "kvm-pit/%d", pid_nr); - if (IS_ERR(pit->worker_task)) { - mutex_unlock(&pit->pit_state.lock); - kvm_free_irq_source_id(kvm, pit->irq_source_id); - kfree(pit); - return NULL; - } + if (IS_ERR(pit->worker_task)) + goto fail_kthread; + init_kthread_work(&pit->expired, pit_do_work); - kvm->arch.vpit = pit; pit->kvm = kvm; pit_state = &pit->pit_state; - pit_state->pit = pit; hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + pit_state->timer.function = pit_timer_fn; + pit_state->irq_ack_notifier.gsi = 0; pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; - kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); - pit_state->reinject = true; - mutex_unlock(&pit->pit_state.lock); + pit->mask_notifier.func = pit_mask_notifer; kvm_pit_reset(pit); - pit->mask_notifier.func = pit_mask_notifer; - kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); + kvm_pit_set_reinject(pit, true); kvm_iodevice_init(&pit->dev, &pit_dev_ops); ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS, KVM_PIT_MEM_LENGTH, &pit->dev); if (ret < 0) - goto fail; + goto fail_register_pit; if (flags & KVM_PIT_SPEAKER_DUMMY) { kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); @@ -745,42 +702,35 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) KVM_SPEAKER_BASE_ADDRESS, 4, &pit->speaker_dev); if (ret < 0) - goto fail_unregister; + goto fail_register_speaker; } return pit; -fail_unregister: +fail_register_speaker: kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); - -fail: - kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); - kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); - kvm_free_irq_source_id(kvm, pit->irq_source_id); +fail_register_pit: + kvm_pit_set_reinject(pit, false); kthread_stop(pit->worker_task); +fail_kthread: + kvm_free_irq_source_id(kvm, pit->irq_source_id); +fail_request: kfree(pit); return NULL; } void kvm_free_pit(struct kvm *kvm) { - struct hrtimer *timer; - - if (kvm->arch.vpit) { - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev); - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &kvm->arch.vpit->speaker_dev); - kvm_unregister_irq_mask_notifier(kvm, 0, - &kvm->arch.vpit->mask_notifier); - kvm_unregister_irq_ack_notifier(kvm, - &kvm->arch.vpit->pit_state.irq_ack_notifier); - mutex_lock(&kvm->arch.vpit->pit_state.lock); - timer = &kvm->arch.vpit->pit_state.timer; - hrtimer_cancel(timer); - flush_kthread_work(&kvm->arch.vpit->expired); - kthread_stop(kvm->arch.vpit->worker_task); - kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); - mutex_unlock(&kvm->arch.vpit->pit_state.lock); - kfree(kvm->arch.vpit); + struct kvm_pit *pit = kvm->arch.vpit; + + if (pit) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); + kvm_pit_set_reinject(pit, false); + hrtimer_cancel(&pit->pit_state.timer); + flush_kthread_work(&pit->expired); + kthread_stop(pit->worker_task); + kvm_free_irq_source_id(kvm, pit->irq_source_id); + kfree(pit); } } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index c84990b42..2f5af0798 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -22,19 +22,18 @@ struct kvm_kpit_channel_state { }; struct kvm_kpit_state { + /* All members before "struct mutex lock" are protected by the lock. */ struct kvm_kpit_channel_state channels[3]; u32 flags; bool is_periodic; s64 period; /* unit: ns */ struct hrtimer timer; - atomic_t pending; /* accumulated triggered timers */ - bool reinject; - struct kvm *kvm; u32 speaker_data_on; + struct mutex lock; - struct kvm_pit *pit; - spinlock_t inject_lock; - unsigned long irq_ack; + atomic_t reinject; + atomic_t pending; /* accumulated triggered timers */ + atomic_t irq_ack; struct kvm_irq_ack_notifier irq_ack_notifier; }; @@ -57,9 +56,11 @@ struct kvm_pit { #define KVM_MAX_PIT_INTR_INTERVAL HZ / 100 #define KVM_PIT_CHANNEL_MASK 0x3 -void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start); struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); void kvm_free_pit(struct kvm *kvm); -void kvm_pit_reset(struct kvm_pit *pit); + +void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val, + int hpet_legacy_start); +void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject); #endif diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 1facfd60b..9db47090e 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -94,7 +94,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic) { ioapic->rtc_status.pending_eoi = 0; - bitmap_zero(ioapic->rtc_status.dest_map, KVM_MAX_VCPUS); + bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPUS); } static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic); @@ -117,16 +117,16 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) return; new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector); - old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); + old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map); if (new_val == old_val) return; if (new_val) { - __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); + __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map); ioapic->rtc_status.pending_eoi++; } else { - __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map); + __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map); ioapic->rtc_status.pending_eoi--; rtc_status_pending_eoi_check_valid(ioapic); } @@ -156,7 +156,8 @@ static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic) static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu) { - if (test_and_clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map)) { + if (test_and_clear_bit(vcpu->vcpu_id, + ioapic->rtc_status.dest_map.map)) { --ioapic->rtc_status.pending_eoi; rtc_status_pending_eoi_check_valid(ioapic); } @@ -236,10 +237,17 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr) void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) { struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; + struct dest_map *dest_map = &ioapic->rtc_status.dest_map; union kvm_ioapic_redirect_entry *e; int index; spin_lock(&ioapic->lock); + + /* Make sure we see any missing RTC EOI */ + if (test_bit(vcpu->vcpu_id, dest_map->map)) + __set_bit(dest_map->vectors[vcpu->vcpu_id], + ioapic_handled_vectors); + for (index = 0; index < IOAPIC_NUM_PINS; index++) { e = &ioapic->redirtbl[index]; if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG || @@ -346,7 +354,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) */ BUG_ON(ioapic->rtc_status.pending_eoi != 0); ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, - ioapic->rtc_status.dest_map); + &ioapic->rtc_status.dest_map); ioapic->rtc_status.pending_eoi = (ret < 0 ? 0 : ret); } else ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL); @@ -407,8 +415,14 @@ static void kvm_ioapic_eoi_inject_work(struct work_struct *work) static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, struct kvm_ioapic *ioapic, int vector, int trigger_mode) { - int i; + struct dest_map *dest_map = &ioapic->rtc_status.dest_map; struct kvm_lapic *apic = vcpu->arch.apic; + int i; + + /* RTC special handling */ + if (test_bit(vcpu->vcpu_id, dest_map->map) && + vector == dest_map->vectors[vcpu->vcpu_id]) + rtc_irq_eoi(ioapic, vcpu); for (i = 0; i < IOAPIC_NUM_PINS; i++) { union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; @@ -416,8 +430,6 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, if (ent->fields.vector != vector) continue; - if (i == RTC_GSI) - rtc_irq_eoi(ioapic, vcpu); /* * We are dropping lock while calling ack notifiers because ack * notifier callbacks for assigned devices call into IOAPIC diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 2d16dc251..7d2692a49 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -40,9 +40,21 @@ struct kvm_vcpu; #define RTC_GSI -1U #endif +struct dest_map { + /* vcpu bitmap where IRQ has been sent */ + DECLARE_BITMAP(map, KVM_MAX_VCPUS); + + /* + * Vector sent to a given vcpu, only valid when + * the vcpu's bit in map is set + */ + u8 vectors[KVM_MAX_VCPUS]; +}; + + struct rtc_status { int pending_eoi; - DECLARE_BITMAP(dest_map, KVM_MAX_VCPUS); + struct dest_map dest_map; }; union kvm_ioapic_redirect_entry { @@ -118,7 +130,8 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id, int level, bool line_status); void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, unsigned long *dest_map); + struct kvm_lapic_irq *irq, + struct dest_map *dest_map); int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 3982b479b..95fcc7b13 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -33,7 +33,10 @@ */ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { - return apic_has_pending_timer(vcpu); + if (lapic_in_kernel(vcpu)) + return apic_has_pending_timer(vcpu); + + return 0; } EXPORT_SYMBOL(kvm_cpu_has_pending_timer); @@ -137,8 +140,8 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { - kvm_inject_apic_timer_irqs(vcpu); - /* TODO: PIT, RTC etc. */ + if (lapic_in_kernel(vcpu)) + kvm_inject_apic_timer_irqs(vcpu); } EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index ae5c78f23..61ebdc13a 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -109,14 +109,6 @@ static inline int irqchip_in_kernel(struct kvm *kvm) return ret; } -static inline int lapic_in_kernel(struct kvm_vcpu *vcpu) -{ - /* Same as irqchip_in_kernel(vcpu->kvm), but with less - * pointer chasing and no unnecessary memory barriers. - */ - return vcpu->arch.apic != NULL; -} - void kvm_pic_reset(struct kvm_kpic_state *s); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 8fc89efb5..54ead79e4 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -34,6 +34,7 @@ #include "lapic.h" #include "hyperv.h" +#include "x86.h" static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, @@ -53,10 +54,12 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, } int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, unsigned long *dest_map) + struct kvm_lapic_irq *irq, struct dest_map *dest_map) { int i, r = -1; struct kvm_vcpu *vcpu, *lowest = NULL; + unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)]; + unsigned int dest_vcpus = 0; if (irq->dest_mode == 0 && irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) { @@ -67,6 +70,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map)) return r; + memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap)); + kvm_for_each_vcpu(i, vcpu, kvm) { if (!kvm_apic_present(vcpu)) continue; @@ -80,13 +85,25 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, r = 0; r += kvm_apic_set_irq(vcpu, irq, dest_map); } else if (kvm_lapic_enabled(vcpu)) { - if (!lowest) - lowest = vcpu; - else if (kvm_apic_compare_prio(vcpu, lowest) < 0) - lowest = vcpu; + if (!kvm_vector_hashing_enabled()) { + if (!lowest) + lowest = vcpu; + else if (kvm_apic_compare_prio(vcpu, lowest) < 0) + lowest = vcpu; + } else { + __set_bit(i, dest_vcpu_bitmap); + dest_vcpus++; + } } } + if (dest_vcpus != 0) { + int idx = kvm_vector_to_index(irq->vector, dest_vcpus, + dest_vcpu_bitmap, KVM_MAX_VCPUS); + + lowest = kvm_get_vcpu(kvm, idx); + } + if (lowest) r = kvm_apic_set_irq(lowest, irq, dest_map); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index e1e89ee4a..762cdf259 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -84,6 +84,11 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); } +static inline u32 kvm_read_pkru(struct kvm_vcpu *vcpu) +{ + return kvm_x86_ops->get_pkru(vcpu); +} + static inline void enter_guest_mode(struct kvm_vcpu *vcpu) { vcpu->arch.hflags |= HF_GUEST_MASK; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3a045f39e..1a2da0e5a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -281,7 +281,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *feat; u32 v = APIC_VERSION; - if (!kvm_vcpu_has_lapic(vcpu)) + if (!lapic_in_kernel(vcpu)) return; feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); @@ -475,26 +475,20 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { - int highest_irr; - /* This may race with setting of irr in __apic_accept_irq() and * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq * will cause vmexit immediately and the value will be recalculated * on the next vmentry. */ - if (!kvm_vcpu_has_lapic(vcpu)) - return 0; - highest_irr = apic_find_highest_irr(vcpu->arch.apic); - - return highest_irr; + return apic_find_highest_irr(vcpu->arch.apic); } static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode, - unsigned long *dest_map); + struct dest_map *dest_map); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, - unsigned long *dest_map) + struct dest_map *dest_map) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -675,8 +669,33 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, } } +int kvm_vector_to_index(u32 vector, u32 dest_vcpus, + const unsigned long *bitmap, u32 bitmap_size) +{ + u32 mod; + int i, idx = -1; + + mod = vector % dest_vcpus; + + for (i = 0; i <= mod; i++) { + idx = find_next_bit(bitmap, bitmap_size, idx + 1); + BUG_ON(idx == bitmap_size); + } + + return idx; +} + +static void kvm_apic_disabled_lapic_found(struct kvm *kvm) +{ + if (!kvm->arch.disabled_lapic_found) { + kvm->arch.disabled_lapic_found = true; + printk(KERN_INFO + "Disabled LAPIC found during irq injection\n"); + } +} + bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map) + struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map) { struct kvm_apic_map *map; unsigned long bitmap = 1; @@ -727,21 +746,42 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, dst = map->logical_map[cid]; - if (kvm_lowest_prio_delivery(irq)) { + if (!kvm_lowest_prio_delivery(irq)) + goto set_irq; + + if (!kvm_vector_hashing_enabled()) { int l = -1; for_each_set_bit(i, &bitmap, 16) { if (!dst[i]) continue; if (l < 0) l = i; - else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0) + else if (kvm_apic_compare_prio(dst[i]->vcpu, + dst[l]->vcpu) < 0) l = i; } - bitmap = (l >= 0) ? 1 << l : 0; + } else { + int idx; + unsigned int dest_vcpus; + + dest_vcpus = hweight16(bitmap); + if (dest_vcpus == 0) + goto out; + + idx = kvm_vector_to_index(irq->vector, + dest_vcpus, &bitmap, 16); + + if (!dst[idx]) { + kvm_apic_disabled_lapic_found(kvm); + goto out; + } + + bitmap = (idx >= 0) ? 1 << idx : 0; } } +set_irq: for_each_set_bit(i, &bitmap, 16) { if (!dst[i]) continue; @@ -754,6 +794,20 @@ out: return ret; } +/* + * This routine tries to handler interrupts in posted mode, here is how + * it deals with different cases: + * - For single-destination interrupts, handle it in posted mode + * - Else if vector hashing is enabled and it is a lowest-priority + * interrupt, handle it in posted mode and use the following mechanism + * to find the destinaiton vCPU. + * 1. For lowest-priority interrupts, store all the possible + * destination vCPUs in an array. + * 2. Use "guest vector % max number of destination vCPUs" to find + * the right destination vCPU in the array for the lowest-priority + * interrupt. + * - Otherwise, use remapped mode to inject the interrupt. + */ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu) { @@ -795,16 +849,37 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, if (cid >= ARRAY_SIZE(map->logical_map)) goto out; - for_each_set_bit(i, &bitmap, 16) { - dst = map->logical_map[cid][i]; - if (++r == 2) + if (kvm_vector_hashing_enabled() && + kvm_lowest_prio_delivery(irq)) { + int idx; + unsigned int dest_vcpus; + + dest_vcpus = hweight16(bitmap); + if (dest_vcpus == 0) goto out; - } - if (dst && kvm_apic_present(dst->vcpu)) + idx = kvm_vector_to_index(irq->vector, dest_vcpus, + &bitmap, 16); + + dst = map->logical_map[cid][idx]; + if (!dst) { + kvm_apic_disabled_lapic_found(kvm); + goto out; + } + *dest_vcpu = dst->vcpu; - else - goto out; + } else { + for_each_set_bit(i, &bitmap, 16) { + dst = map->logical_map[cid][i]; + if (++r == 2) + goto out; + } + + if (dst && kvm_apic_present(dst->vcpu)) + *dest_vcpu = dst->vcpu; + else + goto out; + } } ret = true; @@ -819,7 +894,7 @@ out: */ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, int vector, int level, int trig_mode, - unsigned long *dest_map) + struct dest_map *dest_map) { int result = 0; struct kvm_vcpu *vcpu = apic->vcpu; @@ -839,8 +914,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, result = 1; - if (dest_map) - __set_bit(vcpu->vcpu_id, dest_map); + if (dest_map) { + __set_bit(vcpu->vcpu_id, dest_map->map); + dest_map->vectors[vcpu->vcpu_id] = vector; + } if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) { if (trig_mode) @@ -1239,7 +1316,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; u64 guest_tsc, tsc_deadline; - if (!kvm_vcpu_has_lapic(vcpu)) + if (!lapic_in_kernel(vcpu)) return; if (apic->lapic_timer.expired_tscdeadline == 0) @@ -1292,7 +1369,7 @@ static void start_apic_timer(struct kvm_lapic *apic) hrtimer_start(&apic->lapic_timer.timer, ktime_add_ns(now, apic->lapic_timer.period), - HRTIMER_MODE_ABS); + HRTIMER_MODE_ABS_PINNED); apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" PRIx64 ", " @@ -1325,7 +1402,7 @@ static void start_apic_timer(struct kvm_lapic *apic) expire = ktime_add_ns(now, ns); expire = ktime_sub_ns(expire, lapic_timer_advance_ns); hrtimer_start(&apic->lapic_timer.timer, - expire, HRTIMER_MODE_ABS); + expire, HRTIMER_MODE_ABS_PINNED); } else apic_timer_expired(apic); @@ -1515,8 +1592,7 @@ static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) { - if (kvm_vcpu_has_lapic(vcpu)) - apic_reg_write(vcpu->arch.apic, APIC_EOI, 0); + apic_reg_write(vcpu->arch.apic, APIC_EOI, 0); } EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); @@ -1566,7 +1642,7 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) || + if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) return 0; @@ -1577,7 +1653,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) || + if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) return; @@ -1590,9 +1666,6 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!kvm_vcpu_has_lapic(vcpu)) - return; - apic_set_tpr(apic, ((cr8 & 0x0f) << 4) | (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4)); } @@ -1601,9 +1674,6 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) { u64 tpr; - if (!kvm_vcpu_has_lapic(vcpu)) - return 0; - tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI); return (tpr & 0xf0) >> 4; @@ -1728,8 +1798,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (kvm_vcpu_has_lapic(vcpu) && apic_enabled(apic) && - apic_lvt_enabled(apic, APIC_LVTT)) + if (apic_enabled(apic) && apic_lvt_enabled(apic, APIC_LVTT)) return atomic_read(&apic->lapic_timer.pending); return 0; @@ -1799,7 +1868,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) apic->vcpu = vcpu; hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); + HRTIMER_MODE_ABS_PINNED); apic->lapic_timer.timer.function = apic_timer_fn; /* @@ -1826,7 +1895,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; int highest_irr; - if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic)) + if (!apic_enabled(apic)) return -1; apic_update_ppr(apic); @@ -1854,9 +1923,6 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!kvm_vcpu_has_lapic(vcpu)) - return; - if (atomic_read(&apic->lapic_timer.pending) > 0) { kvm_apic_local_deliver(apic, APIC_LVTT); if (apic_lvtt_tscdeadline(apic)) @@ -1932,12 +1998,12 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) { struct hrtimer *timer; - if (!kvm_vcpu_has_lapic(vcpu)) + if (!lapic_in_kernel(vcpu)) return; timer = &vcpu->arch.apic->lapic_timer.timer; if (hrtimer_cancel(timer)) - hrtimer_start_expires(timer, HRTIMER_MODE_ABS); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); } /* @@ -2105,7 +2171,7 @@ int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!kvm_vcpu_has_lapic(vcpu)) + if (!lapic_in_kernel(vcpu)) return 1; /* if this is ICR write vector before command */ @@ -2119,7 +2185,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) struct kvm_lapic *apic = vcpu->arch.apic; u32 low, high = 0; - if (!kvm_vcpu_has_lapic(vcpu)) + if (!lapic_in_kernel(vcpu)) return 1; if (apic_reg_read(apic, reg, 4, &low)) @@ -2151,7 +2217,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) u8 sipi_vector; unsigned long pe; - if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events) + if (!lapic_in_kernel(vcpu) || !apic->pending_events) return; /* diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 41bdb35b4..f71183e50 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -42,6 +42,9 @@ struct kvm_lapic { unsigned long pending_events; unsigned int sipi_vector; }; + +struct dest_map; + int kvm_create_lapic(struct kvm_vcpu *vcpu); void kvm_free_lapic(struct kvm_vcpu *vcpu); @@ -60,11 +63,11 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu); void __kvm_apic_update_irr(u32 *pir, void *regs); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, - unsigned long *dest_map); + struct dest_map *dest_map); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map); + struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map); u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info); @@ -103,7 +106,7 @@ static inline u32 kvm_apic_get_reg(struct kvm_lapic *apic, int reg_off) extern struct static_key kvm_no_apic_vcpu; -static inline bool kvm_vcpu_has_lapic(struct kvm_vcpu *vcpu) +static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu) { if (static_key_false(&kvm_no_apic_vcpu)) return vcpu->arch.apic; @@ -130,7 +133,7 @@ static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic) static inline bool kvm_apic_present(struct kvm_vcpu *vcpu) { - return kvm_vcpu_has_lapic(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic); + return lapic_in_kernel(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic); } static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu) @@ -150,7 +153,7 @@ static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu) static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) { - return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events; + return lapic_in_kernel(vcpu) && vcpu->arch.apic->pending_events; } static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) @@ -161,7 +164,7 @@ static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) { - return kvm_vcpu_has_lapic(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } static inline int kvm_apic_id(struct kvm_lapic *apic) @@ -175,4 +178,6 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu); bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); +int kvm_vector_to_index(u32 vector, u32 dest_vcpus, + const unsigned long *bitmap, u32 bitmap_size); #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1e7a49bfc..b6f50e8b0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -41,6 +41,7 @@ #include <asm/cmpxchg.h> #include <asm/io.h> #include <asm/vmx.h> +#include <asm/kvm_page_track.h> /* * When setting this variable to true it enables Two-Dimensional-Paging @@ -478,7 +479,7 @@ static bool spte_is_locklessly_modifiable(u64 spte) static bool spte_has_volatile_bits(u64 spte) { /* - * Always atomicly update spte if it can be updated + * Always atomically update spte if it can be updated * out of mmu-lock, it can ensure dirty bit is not lost, * also, it can help us to get a stable is_writable_pte() * to ensure tlb flush is not missed. @@ -549,15 +550,22 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) /* * For the spte updated out of mmu-lock is safe, since - * we always atomicly update it, see the comments in + * we always atomically update it, see the comments in * spte_has_volatile_bits(). */ if (spte_is_locklessly_modifiable(old_spte) && !is_writable_pte(new_spte)) ret = true; - if (!shadow_accessed_mask) + if (!shadow_accessed_mask) { + /* + * We don't set page dirty when dropping non-writable spte. + * So do it now if the new spte is becoming non-writable. + */ + if (ret) + kvm_set_pfn_dirty(spte_to_pfn(old_spte)); return ret; + } /* * Flush TLB when accessed/dirty bits are changed in the page tables, @@ -604,7 +612,8 @@ static int mmu_spte_clear_track_bits(u64 *sptep) if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); - if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) + if (old_spte & (shadow_dirty_mask ? shadow_dirty_mask : + PT_WRITABLE_MASK)) kvm_set_pfn_dirty(pfn); return 1; } @@ -631,12 +640,12 @@ static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) * kvm_flush_remote_tlbs() IPI to all active vcpus. */ local_irq_disable(); - vcpu->mode = READING_SHADOW_PAGE_TABLES; + /* * Make sure a following spte read is not reordered ahead of the write * to vcpu->mode. */ - smp_mb(); + smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); } static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) @@ -646,8 +655,7 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) * reads to sptes. If it does, kvm_commit_zap_page() can see us * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. */ - smp_mb(); - vcpu->mode = OUTSIDE_GUEST_MODE; + smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); local_irq_enable(); } @@ -776,62 +784,85 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, return &slot->arch.lpage_info[level - 2][idx]; } +static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot, + gfn_t gfn, int count) +{ + struct kvm_lpage_info *linfo; + int i; + + for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { + linfo = lpage_info_slot(gfn, slot, i); + linfo->disallow_lpage += count; + WARN_ON(linfo->disallow_lpage < 0); + } +} + +void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) +{ + update_gfn_disallow_lpage_count(slot, gfn, 1); +} + +void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) +{ + update_gfn_disallow_lpage_count(slot, gfn, -1); +} + static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; - struct kvm_lpage_info *linfo; gfn_t gfn; - int i; + kvm->arch.indirect_shadow_pages++; gfn = sp->gfn; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); - for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { - linfo = lpage_info_slot(gfn, slot, i); - linfo->write_count += 1; - } - kvm->arch.indirect_shadow_pages++; + + /* the non-leaf shadow pages are keeping readonly. */ + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + return kvm_slot_page_track_add_page(kvm, slot, gfn, + KVM_PAGE_TRACK_WRITE); + + kvm_mmu_gfn_disallow_lpage(slot, gfn); } static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; struct kvm_memory_slot *slot; - struct kvm_lpage_info *linfo; gfn_t gfn; - int i; + kvm->arch.indirect_shadow_pages--; gfn = sp->gfn; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); - for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { - linfo = lpage_info_slot(gfn, slot, i); - linfo->write_count -= 1; - WARN_ON(linfo->write_count < 0); - } - kvm->arch.indirect_shadow_pages--; + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + return kvm_slot_page_track_remove_page(kvm, slot, gfn, + KVM_PAGE_TRACK_WRITE); + + kvm_mmu_gfn_allow_lpage(slot, gfn); } -static int __has_wrprotected_page(gfn_t gfn, int level, - struct kvm_memory_slot *slot) +static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, + struct kvm_memory_slot *slot) { struct kvm_lpage_info *linfo; if (slot) { linfo = lpage_info_slot(gfn, slot, level); - return linfo->write_count; + return !!linfo->disallow_lpage; } - return 1; + return true; } -static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level) +static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn, + int level) { struct kvm_memory_slot *slot; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - return __has_wrprotected_page(gfn, level, slot); + return __mmu_gfn_lpage_is_disallowed(gfn, level, slot); } static int host_mapping_level(struct kvm *kvm, gfn_t gfn) @@ -897,7 +928,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, max_level = min(kvm_x86_ops->get_lpage_level(), host_level); for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) - if (__has_wrprotected_page(large_gfn, level, slot)) + if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot)) break; return level - 1; @@ -1323,23 +1354,29 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) +bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, + struct kvm_memory_slot *slot, u64 gfn) { - struct kvm_memory_slot *slot; struct kvm_rmap_head *rmap_head; int i; bool write_protected = false; - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { rmap_head = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(vcpu->kvm, rmap_head, true); + write_protected |= __rmap_write_protect(kvm, rmap_head, true); } return write_protected; } +static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) +{ + struct kvm_memory_slot *slot; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn); +} + static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head) { u64 *sptep; @@ -1754,7 +1791,7 @@ static void mark_unsync(u64 *spte) static int nonpaging_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - return 1; + return 0; } static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) @@ -1840,13 +1877,16 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, return nr_unsync_leaf; } +#define INVALID_INDEX (-1) + static int mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_mmu_pages *pvec) { + pvec->nr = 0; if (!sp->unsync_children) return 0; - mmu_pages_add(pvec, sp, 0); + mmu_pages_add(pvec, sp, INVALID_INDEX); return __mmu_unsync_walk(sp, pvec); } @@ -1883,37 +1923,35 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, if ((_sp)->role.direct || (_sp)->role.invalid) {} else /* @sp->gfn should be write-protected at the call site */ -static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - struct list_head *invalid_list, bool clear_unsync) +static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + struct list_head *invalid_list) { if (sp->role.cr4_pae != !!is_pae(vcpu)) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); - return 1; + return false; } - if (clear_unsync) - kvm_unlink_unsync_page(vcpu->kvm, sp); - - if (vcpu->arch.mmu.sync_page(vcpu, sp)) { + if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); - return 1; + return false; } - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - return 0; + return true; } -static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) +static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu, + struct list_head *invalid_list, + bool remote_flush, bool local_flush) { - LIST_HEAD(invalid_list); - int ret; - - ret = __kvm_sync_page(vcpu, sp, &invalid_list, false); - if (ret) - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + if (!list_empty(invalid_list)) { + kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list); + return; + } - return ret; + if (remote_flush) + kvm_flush_remote_tlbs(vcpu->kvm); + else if (local_flush) + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } #ifdef CONFIG_KVM_MMU_AUDIT @@ -1923,46 +1961,38 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } static void mmu_audit_disable(void) { } #endif -static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, +static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { - return __kvm_sync_page(vcpu, sp, invalid_list, true); + kvm_unlink_unsync_page(vcpu->kvm, sp); + return __kvm_sync_page(vcpu, sp, invalid_list); } /* @gfn should be write-protected at the call site */ -static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) +static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, + struct list_head *invalid_list) { struct kvm_mmu_page *s; - LIST_HEAD(invalid_list); - bool flush = false; + bool ret = false; for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { if (!s->unsync) continue; WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); - kvm_unlink_unsync_page(vcpu->kvm, s); - if ((s->role.cr4_pae != !!is_pae(vcpu)) || - (vcpu->arch.mmu.sync_page(vcpu, s))) { - kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); - continue; - } - flush = true; + ret |= kvm_sync_page(vcpu, s, invalid_list); } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - if (flush) - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + return ret; } struct mmu_page_path { - struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; - unsigned int idx[PT64_ROOT_LEVEL-1]; + struct kvm_mmu_page *parent[PT64_ROOT_LEVEL]; + unsigned int idx[PT64_ROOT_LEVEL]; }; #define for_each_sp(pvec, sp, parents, i) \ - for (i = mmu_pages_next(&pvec, &parents, -1), \ - sp = pvec.page[i].sp; \ + for (i = mmu_pages_first(&pvec, &parents); \ i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ i = mmu_pages_next(&pvec, &parents, i)) @@ -1974,19 +2004,43 @@ static int mmu_pages_next(struct kvm_mmu_pages *pvec, for (n = i+1; n < pvec->nr; n++) { struct kvm_mmu_page *sp = pvec->page[n].sp; + unsigned idx = pvec->page[n].idx; + int level = sp->role.level; - if (sp->role.level == PT_PAGE_TABLE_LEVEL) { - parents->idx[0] = pvec->page[n].idx; - return n; - } + parents->idx[level-1] = idx; + if (level == PT_PAGE_TABLE_LEVEL) + break; - parents->parent[sp->role.level-2] = sp; - parents->idx[sp->role.level-1] = pvec->page[n].idx; + parents->parent[level-2] = sp; } return n; } +static int mmu_pages_first(struct kvm_mmu_pages *pvec, + struct mmu_page_path *parents) +{ + struct kvm_mmu_page *sp; + int level; + + if (pvec->nr == 0) + return 0; + + WARN_ON(pvec->page[0].idx != INVALID_INDEX); + + sp = pvec->page[0].sp; + level = sp->role.level; + WARN_ON(level == PT_PAGE_TABLE_LEVEL); + + parents->parent[level-2] = sp; + + /* Also set up a sentinel. Further entries in pvec are all + * children of sp, so this element is never overwritten. + */ + parents->parent[level-1] = NULL; + return mmu_pages_next(pvec, parents, 0); +} + static void mmu_pages_clear_parents(struct mmu_page_path *parents) { struct kvm_mmu_page *sp; @@ -1994,22 +2048,14 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents) do { unsigned int idx = parents->idx[level]; - sp = parents->parent[level]; if (!sp) return; + WARN_ON(idx == INVALID_INDEX); clear_unsync_child_bit(sp, idx); level++; - } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children); -} - -static void kvm_mmu_pages_init(struct kvm_mmu_page *parent, - struct mmu_page_path *parents, - struct kvm_mmu_pages *pvec) -{ - parents->parent[parent->role.level-1] = NULL; - pvec->nr = 0; + } while (!sp->unsync_children); } static void mmu_sync_children(struct kvm_vcpu *vcpu, @@ -2020,30 +2066,36 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, struct mmu_page_path parents; struct kvm_mmu_pages pages; LIST_HEAD(invalid_list); + bool flush = false; - kvm_mmu_pages_init(parent, &parents, &pages); while (mmu_unsync_walk(parent, &pages)) { bool protected = false; for_each_sp(pages, sp, parents, i) protected |= rmap_write_protect(vcpu, sp->gfn); - if (protected) + if (protected) { kvm_flush_remote_tlbs(vcpu->kvm); + flush = false; + } for_each_sp(pages, sp, parents, i) { - kvm_sync_page(vcpu, sp, &invalid_list); + flush |= kvm_sync_page(vcpu, sp, &invalid_list); mmu_pages_clear_parents(&parents); } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - cond_resched_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_pages_init(parent, &parents, &pages); + if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) { + kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); + cond_resched_lock(&vcpu->kvm->mmu_lock); + flush = false; + } } + + kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); } static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) { - sp->write_flooding_count = 0; + atomic_set(&sp->write_flooding_count, 0); } static void clear_sp_write_flooding_count(u64 *spte) @@ -2069,6 +2121,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, unsigned quadrant; struct kvm_mmu_page *sp; bool need_sync = false; + bool flush = false; + LIST_HEAD(invalid_list); role = vcpu->arch.mmu.base_role; role.level = level; @@ -2092,8 +2146,16 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (sp->role.word != role.word) continue; - if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) - break; + if (sp->unsync) { + /* The page is good, but __kvm_sync_page might still end + * up zapping it. If so, break in order to rebuild it. + */ + if (!__kvm_sync_page(vcpu, sp, &invalid_list)) + break; + + WARN_ON(!list_empty(&invalid_list)); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } if (sp->unsync_children) kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); @@ -2112,16 +2174,24 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, hlist_add_head(&sp->hash_link, &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); if (!direct) { - if (rmap_write_protect(vcpu, gfn)) + /* + * we should do write protection before syncing pages + * otherwise the content of the synced shadow page may + * be inconsistent with guest page table. + */ + account_shadowed(vcpu->kvm, sp); + if (level == PT_PAGE_TABLE_LEVEL && + rmap_write_protect(vcpu, gfn)) kvm_flush_remote_tlbs(vcpu->kvm); - if (level > PT_PAGE_TABLE_LEVEL && need_sync) - kvm_sync_pages(vcpu, gfn); - account_shadowed(vcpu->kvm, sp); + if (level > PT_PAGE_TABLE_LEVEL && need_sync) + flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); } sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; clear_page(sp->spt); trace_kvm_mmu_get_page(sp, true); + + kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); return sp; } @@ -2269,7 +2339,6 @@ static int mmu_zap_unsync_children(struct kvm *kvm, if (parent->role.level == PT_PAGE_TABLE_LEVEL) return 0; - kvm_mmu_pages_init(parent, &parents, &pages); while (mmu_unsync_walk(parent, &pages)) { struct kvm_mmu_page *sp; @@ -2278,7 +2347,6 @@ static int mmu_zap_unsync_children(struct kvm *kvm, mmu_pages_clear_parents(&parents); zapped++; } - kvm_mmu_pages_init(parent, &parents, &pages); } return zapped; @@ -2329,14 +2397,13 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, return; /* - * wmb: make sure everyone sees our modifications to the page tables - * rmb: make sure we see changes to vcpu->mode - */ - smp_mb(); - - /* - * Wait for all vcpus to exit guest mode and/or lockless shadow - * page table walks. + * We need to make sure everyone sees our modifications to + * the page tables and see changes to vcpu->mode here. The barrier + * in the kvm_flush_remote_tlbs() achieves this. This pairs + * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end. + * + * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit + * guest mode and/or lockless shadow page table walks. */ kvm_flush_remote_tlbs(kvm); @@ -2354,8 +2421,8 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, if (list_empty(&kvm->arch.active_mmu_pages)) return false; - sp = list_entry(kvm->arch.active_mmu_pages.prev, - struct kvm_mmu_page, link); + sp = list_last_entry(&kvm->arch.active_mmu_pages, + struct kvm_mmu_page, link); kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); return true; @@ -2408,7 +2475,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); -static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { trace_kvm_mmu_unsync_page(sp); ++vcpu->kvm->stat.mmu_unsync; @@ -2417,37 +2484,26 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) kvm_mmu_mark_parents_unsync(sp); } -static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) +static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, + bool can_unsync) { - struct kvm_mmu_page *s; - - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { - if (s->unsync) - continue; - WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); - __kvm_unsync_page(vcpu, s); - } -} + struct kvm_mmu_page *sp; -static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, - bool can_unsync) -{ - struct kvm_mmu_page *s; - bool need_unsync = false; + if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) + return true; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (!can_unsync) - return 1; + return true; - if (s->role.level != PT_PAGE_TABLE_LEVEL) - return 1; + if (sp->unsync) + continue; - if (!s->unsync) - need_unsync = true; + WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL); + kvm_unsync_page(vcpu, sp); } - if (need_unsync) - kvm_unsync_pages(vcpu, gfn); - return 0; + + return false; } static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) @@ -2503,7 +2559,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * be fixed if guest refault. */ if (level > PT_PAGE_TABLE_LEVEL && - has_wrprotected_page(vcpu, gfn, level)) + mmu_gfn_lpage_is_disallowed(vcpu, gfn, level)) goto done; spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; @@ -2767,8 +2823,8 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, */ if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && - PageTransCompound(pfn_to_page(pfn)) && - !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) { + PageTransCompoundMap(pfn_to_page(pfn)) && + !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { unsigned long mask; /* * mmu_notifier_retry was successful and we hold the @@ -2796,20 +2852,16 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, kvm_pfn_t pfn, unsigned access, int *ret_val) { - bool ret = true; - /* The pfn is invalid, report the error! */ if (unlikely(is_error_pfn(pfn))) { *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); - goto exit; + return true; } if (unlikely(is_noslot_pfn(pfn))) vcpu_cache_mmio_info(vcpu, gva, gfn, access); - ret = false; -exit: - return ret; + return false; } static bool page_fault_can_be_fast(u32 error_code) @@ -3273,7 +3325,7 @@ static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level) return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level); } -static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) +static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) { if (direct) return vcpu_match_mmio_gpa(vcpu, addr); @@ -3332,7 +3384,7 @@ int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) u64 spte; bool reserved; - if (quickly_check_mmio_pf(vcpu, addr, direct)) + if (mmio_info_in_cache(vcpu, addr, direct)) return RET_MMIO_PF_EMULATE; reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); @@ -3362,20 +3414,53 @@ int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) } EXPORT_SYMBOL_GPL(handle_mmio_page_fault); +static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, + u32 error_code, gfn_t gfn) +{ + if (unlikely(error_code & PFERR_RSVD_MASK)) + return false; + + if (!(error_code & PFERR_PRESENT_MASK) || + !(error_code & PFERR_WRITE_MASK)) + return false; + + /* + * guest is writing the page which is write tracked which can + * not be fixed by page fault handler. + */ + if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE)) + return true; + + return false; +} + +static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) +{ + struct kvm_shadow_walk_iterator iterator; + u64 spte; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + + walk_shadow_page_lockless_begin(vcpu); + for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { + clear_sp_write_flooding_count(iterator.sptep); + if (!is_shadow_present_pte(spte)) + break; + } + walk_shadow_page_lockless_end(vcpu); +} + static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, bool prefault) { - gfn_t gfn; + gfn_t gfn = gva >> PAGE_SHIFT; int r; pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); - if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, gva, true); - - if (likely(r != RET_MMIO_PF_INVALID)) - return r; - } + if (page_fault_handle_page_track(vcpu, error_code, gfn)) + return 1; r = mmu_topup_memory_caches(vcpu); if (r) @@ -3383,7 +3468,6 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - gfn = gva >> PAGE_SHIFT; return nonpaging_map(vcpu, gva & PAGE_MASK, error_code, gfn, prefault); @@ -3460,12 +3544,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); - if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, gpa, true); - - if (likely(r != RET_MMIO_PF_INVALID)) - return r; - } + if (page_fault_handle_page_track(vcpu, error_code, gfn)) + return 1; r = mmu_topup_memory_caches(vcpu); if (r) @@ -3558,13 +3638,24 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, return false; } -static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) +static inline bool is_last_gpte(struct kvm_mmu *mmu, + unsigned level, unsigned gpte) { - unsigned index; + /* + * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set + * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means + * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then. + */ + gpte |= level - PT_PAGE_TABLE_LEVEL - 1; + + /* + * The RHS has bit 7 set iff level < mmu->last_nonleaf_level. + * If it is clear, there are no large pages at this level, so clear + * PT_PAGE_SIZE_MASK in gpte if that is the case. + */ + gpte &= level - mmu->last_nonleaf_level; - index = level - 1; - index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2); - return mmu->last_pte_bitmap & (1 << index); + return gpte & PT_PAGE_SIZE_MASK; } #define PTTYPE_EPT 18 /* arbitrary */ @@ -3838,22 +3929,88 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, } } -static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) -{ - u8 map; - unsigned level, root_level = mmu->root_level; - const unsigned ps_set_index = 1 << 2; /* bit 2 of index: ps */ - - if (root_level == PT32E_ROOT_LEVEL) - --root_level; - /* PT_PAGE_TABLE_LEVEL always terminates */ - map = 1 | (1 << ps_set_index); - for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) { - if (level <= PT_PDPE_LEVEL - && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu))) - map |= 1 << (ps_set_index | (level - 1)); +/* +* PKU is an additional mechanism by which the paging controls access to +* user-mode addresses based on the value in the PKRU register. Protection +* key violations are reported through a bit in the page fault error code. +* Unlike other bits of the error code, the PK bit is not known at the +* call site of e.g. gva_to_gpa; it must be computed directly in +* permission_fault based on two bits of PKRU, on some machine state (CR4, +* CR0, EFER, CPL), and on other bits of the error code and the page tables. +* +* In particular the following conditions come from the error code, the +* page tables and the machine state: +* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1 +* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch) +* - PK is always zero if U=0 in the page tables +* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access. +* +* The PKRU bitmask caches the result of these four conditions. The error +* code (minus the P bit) and the page table's U bit form an index into the +* PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed +* with the two bits of the PKRU register corresponding to the protection key. +* For the first three conditions above the bits will be 00, thus masking +* away both AD and WD. For all reads or if the last condition holds, WD +* only will be masked away. +*/ +static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + bool ept) +{ + unsigned bit; + bool wp; + + if (ept) { + mmu->pkru_mask = 0; + return; + } + + /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */ + if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) { + mmu->pkru_mask = 0; + return; + } + + wp = is_write_protection(vcpu); + + for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) { + unsigned pfec, pkey_bits; + bool check_pkey, check_write, ff, uf, wf, pte_user; + + pfec = bit << 1; + ff = pfec & PFERR_FETCH_MASK; + uf = pfec & PFERR_USER_MASK; + wf = pfec & PFERR_WRITE_MASK; + + /* PFEC.RSVD is replaced by ACC_USER_MASK. */ + pte_user = pfec & PFERR_RSVD_MASK; + + /* + * Only need to check the access which is not an + * instruction fetch and is to a user page. + */ + check_pkey = (!ff && pte_user); + /* + * write access is controlled by PKRU if it is a + * user access or CR0.WP = 1. + */ + check_write = check_pkey && wf && (uf || wp); + + /* PKRU.AD stops both read and write access. */ + pkey_bits = !!check_pkey; + /* PKRU.WD stops write access. */ + pkey_bits |= (!!check_write) << 1; + + mmu->pkru_mask |= (pkey_bits & 3) << pfec; } - mmu->last_pte_bitmap = map; +} + +static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +{ + unsigned root_level = mmu->root_level; + + mmu->last_nonleaf_level = root_level; + if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu)) + mmu->last_nonleaf_level++; } static void paging64_init_context_common(struct kvm_vcpu *vcpu, @@ -3865,7 +4022,8 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context, false); - update_last_pte_bitmap(vcpu, context); + update_pkru_bitmask(vcpu, context, false); + update_last_nonleaf_level(vcpu, context); MMU_WARN_ON(!is_pae(vcpu)); context->page_fault = paging64_page_fault; @@ -3892,7 +4050,8 @@ static void paging32_init_context(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context, false); - update_last_pte_bitmap(vcpu, context); + update_pkru_bitmask(vcpu, context, false); + update_last_nonleaf_level(vcpu, context); context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; @@ -3950,7 +4109,8 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, context, false); - update_last_pte_bitmap(vcpu, context); + update_pkru_bitmask(vcpu, context, false); + update_last_nonleaf_level(vcpu, context); reset_tdp_shadow_zero_bits_mask(vcpu, context); } @@ -4002,6 +4162,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) context->direct_map = false; update_permission_bitmask(vcpu, context, true); + update_pkru_bitmask(vcpu, context, true); reset_rsvds_bits_mask_ept(vcpu, context, execonly); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } @@ -4056,7 +4217,8 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, g_context, false); - update_last_pte_bitmap(vcpu, g_context); + update_pkru_bitmask(vcpu, g_context, false); + update_last_nonleaf_level(vcpu, g_context); } static void init_kvm_mmu(struct kvm_vcpu *vcpu) @@ -4127,18 +4289,6 @@ static bool need_remote_flush(u64 old, u64 new) return (old & ~new & PT64_PERM_MASK) != 0; } -static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, - bool remote_flush, bool local_flush) -{ - if (zap_page) - return; - - if (remote_flush) - kvm_flush_remote_tlbs(vcpu->kvm); - else if (local_flush) - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); -} - static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, const u8 *new, int *bytes) { @@ -4188,7 +4338,8 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp) if (sp->role.level == PT_PAGE_TABLE_LEVEL) return false; - return ++sp->write_flooding_count >= 3; + atomic_inc(&sp->write_flooding_count); + return atomic_read(&sp->write_flooding_count) >= 3; } /* @@ -4250,15 +4401,15 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) return spte; } -void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) +static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, + const u8 *new, int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; LIST_HEAD(invalid_list); u64 entry, gentry, *spte; int npte; - bool remote_flush, local_flush, zap_page; + bool remote_flush, local_flush; union kvm_mmu_page_role mask = { }; mask.cr0_wp = 1; @@ -4275,7 +4426,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) return; - zap_page = remote_flush = local_flush = false; + remote_flush = local_flush = false; pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); @@ -4295,8 +4446,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (detect_write_misaligned(sp, gpa, bytes) || detect_write_flooding(sp)) { - zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, - &invalid_list); + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); ++vcpu->kvm->stat.mmu_flooded; continue; } @@ -4318,8 +4468,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, ++spte; } } - mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush); kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); spin_unlock(&vcpu->kvm->mmu_lock); } @@ -4356,32 +4505,34 @@ static void make_mmu_pages_available(struct kvm_vcpu *vcpu) kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } -static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr) -{ - if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu)) - return vcpu_match_mmio_gpa(vcpu, addr); - - return vcpu_match_mmio_gva(vcpu, addr); -} - int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, void *insn, int insn_len) { int r, emulation_type = EMULTYPE_RETRY; enum emulation_result er; + bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu); + + if (unlikely(error_code & PFERR_RSVD_MASK)) { + r = handle_mmio_page_fault(vcpu, cr2, direct); + if (r == RET_MMIO_PF_EMULATE) { + emulation_type = 0; + goto emulate; + } + if (r == RET_MMIO_PF_RETRY) + return 1; + if (r < 0) + return r; + } r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); if (r < 0) - goto out; - - if (!r) { - r = 1; - goto out; - } + return r; + if (!r) + return 1; - if (is_mmio_page_fault(vcpu, cr2)) + if (mmio_info_in_cache(vcpu, cr2, direct)) emulation_type = 0; - +emulate: er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len); switch (er) { @@ -4395,8 +4546,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, default: BUG(); } -out: - return r; } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); @@ -4465,6 +4614,21 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) init_kvm_mmu(vcpu); } +void kvm_mmu_init_vm(struct kvm *kvm) +{ + struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; + + node->track_write = kvm_mmu_pte_write; + kvm_page_track_register_notifier(kvm, node); +} + +void kvm_mmu_uninit_vm(struct kvm *kvm) +{ + struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; + + kvm_page_track_unregister_notifier(kvm, node); +} + /* The return value indicates if tlb flush on all vcpus is needed. */ typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head); @@ -4621,7 +4785,7 @@ restart: */ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && - PageTransCompound(pfn_to_page(pfn))) { + PageTransCompoundMap(pfn_to_page(pfn))) { drop_spte(kvm, sptep); need_tlb_flush = 1; goto restart; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 55ffb7b0f..66b33b96a 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -10,10 +10,11 @@ #define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) #define PT_WRITABLE_SHIFT 1 +#define PT_USER_SHIFT 2 #define PT_PRESENT_MASK (1ULL << 0) #define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) -#define PT_USER_MASK (1ULL << 2) +#define PT_USER_MASK (1ULL << PT_USER_SHIFT) #define PT_PWT_MASK (1ULL << 3) #define PT_PCD_MASK (1ULL << 4) #define PT_ACCESSED_SHIFT 5 @@ -141,11 +142,16 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) } /* - * Will a fault with a given page-fault error code (pfec) cause a permission - * fault with the given access (in ACC_* format)? + * Check if a given access (described through the I/D, W/R and U/S bits of a + * page fault error code pfec) causes a permission fault with the given PTE + * access rights (in ACC_* format). + * + * Return zero if the access does not fault; return the page fault error code + * if the access faults. */ -static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - unsigned pte_access, unsigned pfec) +static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + unsigned pte_access, unsigned pte_pkey, + unsigned pfec) { int cpl = kvm_x86_ops->get_cpl(vcpu); unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); @@ -166,12 +172,38 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC); int index = (pfec >> 1) + (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); - - WARN_ON(pfec & PFERR_RSVD_MASK); - - return (mmu->permissions[index] >> pte_access) & 1; + bool fault = (mmu->permissions[index] >> pte_access) & 1; + u32 errcode = PFERR_PRESENT_MASK; + + WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK)); + if (unlikely(mmu->pkru_mask)) { + u32 pkru_bits, offset; + + /* + * PKRU defines 32 bits, there are 16 domains and 2 + * attribute bits per domain in pkru. pte_pkey is the + * index of the protection domain, so pte_pkey * 2 is + * is the index of the first bit for the domain. + */ + pkru_bits = (kvm_read_pkru(vcpu) >> (pte_pkey * 2)) & 3; + + /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */ + offset = (pfec & ~1) + + ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT)); + + pkru_bits &= mmu->pkru_mask >> offset; + errcode |= -pkru_bits & PFERR_PK_MASK; + fault |= (pkru_bits != 0); + } + + return -(u32)fault & errcode; } void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); + +void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); +void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); +bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, + struct kvm_memory_slot *slot, u64 gfn); #endif diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 3f8c73211..c146f3c26 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -44,8 +44,6 @@ static bool msr_mtrr_valid(unsigned msr) case MSR_MTRRdefType: case MSR_IA32_CR_PAT: return true; - case 0x2f8: - return true; } return false; } diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c new file mode 100644 index 000000000..b431539c3 --- /dev/null +++ b/arch/x86/kvm/page_track.c @@ -0,0 +1,227 @@ +/* + * Support KVM gust page tracking + * + * This feature allows us to track page access in guest. Currently, only + * write access is tracked. + * + * Copyright(C) 2015 Intel Corporation. + * + * Author: + * Xiao Guangrong <guangrong.xiao@linux.intel.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/kvm_host.h> +#include <asm/kvm_host.h> +#include <asm/kvm_page_track.h> + +#include "mmu.h" + +void kvm_page_track_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + int i; + + for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) + if (!dont || free->arch.gfn_track[i] != + dont->arch.gfn_track[i]) { + kvfree(free->arch.gfn_track[i]); + free->arch.gfn_track[i] = NULL; + } +} + +int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, + unsigned long npages) +{ + int i; + + for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { + slot->arch.gfn_track[i] = kvm_kvzalloc(npages * + sizeof(*slot->arch.gfn_track[i])); + if (!slot->arch.gfn_track[i]) + goto track_free; + } + + return 0; + +track_free: + kvm_page_track_free_memslot(slot, NULL); + return -ENOMEM; +} + +static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode) +{ + if (mode < 0 || mode >= KVM_PAGE_TRACK_MAX) + return false; + + return true; +} + +static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn, + enum kvm_page_track_mode mode, short count) +{ + int index, val; + + index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); + + val = slot->arch.gfn_track[mode][index]; + + if (WARN_ON(val + count < 0 || val + count > USHRT_MAX)) + return; + + slot->arch.gfn_track[mode][index] += count; +} + +/* + * add guest page to the tracking pool so that corresponding access on that + * page will be intercepted. + * + * It should be called under the protection both of mmu-lock and kvm->srcu + * or kvm->slots_lock. + * + * @kvm: the guest instance we are interested in. + * @slot: the @gfn belongs to. + * @gfn: the guest page. + * @mode: tracking mode, currently only write track is supported. + */ +void kvm_slot_page_track_add_page(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn, + enum kvm_page_track_mode mode) +{ + + if (WARN_ON(!page_track_mode_is_valid(mode))) + return; + + update_gfn_track(slot, gfn, mode, 1); + + /* + * new track stops large page mapping for the + * tracked page. + */ + kvm_mmu_gfn_disallow_lpage(slot, gfn); + + if (mode == KVM_PAGE_TRACK_WRITE) + if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn)) + kvm_flush_remote_tlbs(kvm); +} + +/* + * remove the guest page from the tracking pool which stops the interception + * of corresponding access on that page. It is the opposed operation of + * kvm_slot_page_track_add_page(). + * + * It should be called under the protection both of mmu-lock and kvm->srcu + * or kvm->slots_lock. + * + * @kvm: the guest instance we are interested in. + * @slot: the @gfn belongs to. + * @gfn: the guest page. + * @mode: tracking mode, currently only write track is supported. + */ +void kvm_slot_page_track_remove_page(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn, + enum kvm_page_track_mode mode) +{ + if (WARN_ON(!page_track_mode_is_valid(mode))) + return; + + update_gfn_track(slot, gfn, mode, -1); + + /* + * allow large page mapping for the tracked page + * after the tracker is gone. + */ + kvm_mmu_gfn_allow_lpage(slot, gfn); +} + +/* + * check if the corresponding access on the specified guest page is tracked. + */ +bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, + enum kvm_page_track_mode mode) +{ + struct kvm_memory_slot *slot; + int index; + + if (WARN_ON(!page_track_mode_is_valid(mode))) + return false; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if (!slot) + return false; + + index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); + return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]); +} + +void kvm_page_track_init(struct kvm *kvm) +{ + struct kvm_page_track_notifier_head *head; + + head = &kvm->arch.track_notifier_head; + init_srcu_struct(&head->track_srcu); + INIT_HLIST_HEAD(&head->track_notifier_list); +} + +/* + * register the notifier so that event interception for the tracked guest + * pages can be received. + */ +void +kvm_page_track_register_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n) +{ + struct kvm_page_track_notifier_head *head; + + head = &kvm->arch.track_notifier_head; + + spin_lock(&kvm->mmu_lock); + hlist_add_head_rcu(&n->node, &head->track_notifier_list); + spin_unlock(&kvm->mmu_lock); +} + +/* + * stop receiving the event interception. It is the opposed operation of + * kvm_page_track_register_notifier(). + */ +void +kvm_page_track_unregister_notifier(struct kvm *kvm, + struct kvm_page_track_notifier_node *n) +{ + struct kvm_page_track_notifier_head *head; + + head = &kvm->arch.track_notifier_head; + + spin_lock(&kvm->mmu_lock); + hlist_del_rcu(&n->node); + spin_unlock(&kvm->mmu_lock); + synchronize_srcu(&head->track_srcu); +} + +/* + * Notify the node that write access is intercepted and write emulation is + * finished at this time. + * + * The node should figure out if the written page is the one that node is + * interested in by itself. + */ +void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, + int bytes) +{ + struct kvm_page_track_notifier_head *head; + struct kvm_page_track_notifier_node *n; + int idx; + + head = &vcpu->kvm->arch.track_notifier_head; + + if (hlist_empty(&head->track_notifier_list)) + return; + + idx = srcu_read_lock(&head->track_srcu); + hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) + if (n->track_write) + n->track_write(vcpu, gpa, new, bytes); + srcu_read_unlock(&head->track_srcu, idx); +} diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 2ce4f05e8..bc019f70e 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -189,8 +189,11 @@ static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) | ACC_USER_MASK; #else - access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; - access &= ~(gpte >> PT64_NX_SHIFT); + BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK); + BUILD_BUG_ON(ACC_EXEC_MASK != 1); + access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK); + /* Combine NX with P (which is set here) to get ACC_EXEC_MASK. */ + access ^= (gpte >> PT64_NX_SHIFT); #endif return access; @@ -254,6 +257,17 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, return 0; } +static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte) +{ + unsigned pkeys = 0; +#if PTTYPE == 64 + pte_t pte = {.pte = gpte}; + + pkeys = pte_flags_pkey(pte_flags(pte)); +#endif + return pkeys; +} + /* * Fetch a guest pte for a guest virtual address */ @@ -265,7 +279,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, pt_element_t pte; pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; - unsigned index, pt_access, pte_access, accessed_dirty; + unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey; gpa_t pte_gpa; int offset; const int write_fault = access & PFERR_WRITE_MASK; @@ -346,7 +360,7 @@ retry_walk: goto error; if (unlikely(is_rsvd_bits_set(mmu, pte, walker->level))) { - errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK; + errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK; goto error; } @@ -356,10 +370,10 @@ retry_walk: walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); - if (unlikely(permission_fault(vcpu, mmu, pte_access, access))) { - errcode |= PFERR_PRESENT_MASK; + pte_pkey = FNAME(gpte_pkeys)(vcpu, pte); + errcode = permission_fault(vcpu, mmu, pte_access, pte_pkey, access); + if (unlikely(errcode)) goto error; - } gfn = gpte_to_gfn_lvl(pte, walker->level); gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT; @@ -702,24 +716,17 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); - if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, addr, mmu_is_nested(vcpu)); - if (likely(r != RET_MMIO_PF_INVALID)) - return r; - - /* - * page fault with PFEC.RSVD = 1 is caused by shadow - * page fault, should not be used to walk guest page - * table. - */ - error_code &= ~PFERR_RSVD_MASK; - }; - r = mmu_topup_memory_caches(vcpu); if (r) return r; /* + * If PFEC.RSVD is set, this is a shadow page fault. + * The bit needs to be cleared before walking guest page tables. + */ + error_code &= ~PFERR_RSVD_MASK; + + /* * Look up the guest pte for the faulting address. */ r = FNAME(walk_addr)(&walker, vcpu, addr, error_code); @@ -735,6 +742,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return 0; } + if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) { + shadow_page_table_clear_flood(vcpu, addr); + return 1; + } + vcpu->arch.write_fault_to_shadow_pgtable = false; is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, @@ -945,9 +957,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, sizeof(pt_element_t))) - return -EINVAL; + return 0; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { + /* + * Update spte before increasing tlbs_dirty to make + * sure no tlb flush is lost after spte is zapped; see + * the comments in kvm_flush_remote_tlbs(). + */ + smp_wmb(); vcpu->kvm->tlbs_dirty++; continue; } @@ -963,6 +981,11 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i]); + /* + * The same as above where we are doing + * prefetch_invalid_gpte(). + */ + smp_wmb(); vcpu->kvm->tlbs_dirty++; continue; } @@ -977,7 +1000,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) host_writable); } - return !nr_present; + return nr_present; } #undef pt_element_t diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 31aa2c85d..06ce377dc 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -257,7 +257,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) { - if (vcpu->arch.apic) + if (lapic_in_kernel(vcpu)) kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c13a64b7d..31346a3f2 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1280,6 +1280,11 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_svm(vcpu)->vmcb->save.rflags = rflags; } +static u32 svm_get_pkru(struct kvm_vcpu *vcpu) +{ + return 0; +} + static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { switch (reg) { @@ -1858,8 +1863,7 @@ static int halt_interception(struct vcpu_svm *svm) static int vmmcall_interception(struct vcpu_svm *svm) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - kvm_emulate_hypercall(&svm->vcpu); - return 1; + return kvm_emulate_hypercall(&svm->vcpu); } static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) @@ -4348,6 +4352,9 @@ static struct kvm_x86_ops svm_x86_ops = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + + .get_pkru = svm_get_pkru, + .fpu_activate = svm_fpu_activate, .fpu_deactivate = svm_fpu_deactivate, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index ad9f6a23f..2f1ea2f61 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -996,11 +996,13 @@ TRACE_EVENT(kvm_enter_smm, * Tracepoint for VT-d posted-interrupts. */ TRACE_EVENT(kvm_pi_irte_update, - TP_PROTO(unsigned int vcpu_id, unsigned int gsi, - unsigned int gvec, u64 pi_desc_addr, bool set), - TP_ARGS(vcpu_id, gsi, gvec, pi_desc_addr, set), + TP_PROTO(unsigned int host_irq, unsigned int vcpu_id, + unsigned int gsi, unsigned int gvec, + u64 pi_desc_addr, bool set), + TP_ARGS(host_irq, vcpu_id, gsi, gvec, pi_desc_addr, set), TP_STRUCT__entry( + __field( unsigned int, host_irq ) __field( unsigned int, vcpu_id ) __field( unsigned int, gsi ) __field( unsigned int, gvec ) @@ -1009,6 +1011,7 @@ TRACE_EVENT(kvm_pi_irte_update, ), TP_fast_assign( + __entry->host_irq = host_irq; __entry->vcpu_id = vcpu_id; __entry->gsi = gsi; __entry->gvec = gvec; @@ -1016,9 +1019,10 @@ TRACE_EVENT(kvm_pi_irte_update, __entry->set = set; ), - TP_printk("VT-d PI is %s for this irq, vcpu %u, gsi: 0x%x, " + TP_printk("VT-d PI is %s for irq %u, vcpu %u, gsi: 0x%x, " "gvec: 0x%x, pi_desc_addr: 0x%llx", __entry->set ? "enabled and being updated" : "disabled", + __entry->host_irq, __entry->vcpu_id, __entry->gsi, __entry->gvec, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 539062e24..faf52bac1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -598,6 +598,10 @@ struct vcpu_vmx { struct page *pml_pg; u64 current_tsc_ratio; + + bool guest_pkru_valid; + u32 guest_pkru; + u32 host_pkru; }; enum segment_cache_field { @@ -863,7 +867,6 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); -static bool vmx_mpx_supported(void); static bool vmx_xsaves_supported(void); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, @@ -963,25 +966,36 @@ static const u32 vmx_msr_index[] = { MSR_EFER, MSR_TSC_AUX, MSR_STAR, }; -static inline bool is_page_fault(u32 intr_info) +static inline bool is_exception_n(u32 intr_info, u8 vector) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); + (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK); +} + +static inline bool is_debug(u32 intr_info) +{ + return is_exception_n(intr_info, DB_VECTOR); +} + +static inline bool is_breakpoint(u32 intr_info) +{ + return is_exception_n(intr_info, BP_VECTOR); +} + +static inline bool is_page_fault(u32 intr_info) +{ + return is_exception_n(intr_info, PF_VECTOR); } static inline bool is_no_device(u32 intr_info) { - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | - INTR_INFO_VALID_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); + return is_exception_n(intr_info, NM_VECTOR); } static inline bool is_invalid_opcode(u32 intr_info) { - return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | - INTR_INFO_VALID_MASK)) == - (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); + return is_exception_n(intr_info, UD_VECTOR); } static inline bool is_external_interrupt(u32 intr_info) @@ -2097,6 +2111,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) } while (cmpxchg(&pi_desc->control, old.control, new.control) != old.control); } + /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. @@ -2157,6 +2172,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } vmx_vcpu_pi_load(vcpu, cpu); + vmx->host_pkru = read_pkru(); } static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) @@ -2276,6 +2292,11 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmcs_writel(GUEST_RFLAGS, rflags); } +static u32 vmx_get_pkru(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->guest_pkru; +} + static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) { u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); @@ -2605,7 +2626,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; - if (vmx_mpx_supported()) + if (kvm_mpx_supported()) vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; /* We support free control of debug control saving. */ @@ -2626,7 +2647,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) VM_ENTRY_LOAD_IA32_PAT; vmx->nested.nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); - if (vmx_mpx_supported()) + if (kvm_mpx_supported()) vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* We support free control of debug control loading. */ @@ -2877,7 +2898,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; case MSR_IA32_BNDCFGS: - if (!vmx_mpx_supported()) + if (!kvm_mpx_supported()) return 1; msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; @@ -2954,7 +2975,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_BNDCFGS: - if (!vmx_mpx_supported()) + if (!kvm_mpx_supported()) return 1; vmcs_write64(GUEST_BNDCFGS, data); break; @@ -3082,6 +3103,8 @@ static __init int vmx_disabled_by_bios(void) static void kvm_cpu_vmxon(u64 addr) { + intel_pt_handle_vmx(1); + asm volatile (ASM_VMX_VMXON_RAX : : "a"(&addr), "m"(addr) : "memory", "cc"); @@ -3151,6 +3174,8 @@ static void vmclear_local_loaded_vmcss(void) static void kvm_cpu_vmxoff(void) { asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); + + intel_pt_handle_vmx(0); } static void hardware_disable(void) @@ -3427,7 +3452,7 @@ static void init_vmcs_shadow_fields(void) for (i = j = 0; i < max_shadow_read_write_fields; i++) { switch (shadow_read_write_fields[i]) { case GUEST_BNDCFGS: - if (!vmx_mpx_supported()) + if (!kvm_mpx_supported()) continue; break; default: @@ -3883,13 +3908,17 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!enable_unrestricted_guest && !is_paging(vcpu)) /* - * SMEP/SMAP is disabled if CPU is in non-paging mode in - * hardware. However KVM always uses paging mode without - * unrestricted guest. - * To emulate this behavior, SMEP/SMAP needs to be manually - * disabled when guest switches to non-paging mode. + * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in + * hardware. To emulate this behavior, SMEP/SMAP/PKU needs + * to be manually disabled when guest switches to non-paging + * mode. + * + * If !enable_unrestricted_guest, the CPU is always running + * with CR0.PG=1 and CR4 needs to be modified. + * If enable_unrestricted_guest, the CPU automatically + * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. */ - hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); @@ -5021,8 +5050,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vmx_set_cr0(vcpu, cr0); /* enter rmode */ vmx->vcpu.arch.cr0 = cr0; + vmx_set_cr0(vcpu, cr0); /* enter rmode */ vmx_set_cr4(vcpu, 0); vmx_set_efer(vcpu, 0); vmx_fpu_activate(vcpu); @@ -5503,7 +5532,7 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) return kvm_set_cr4(vcpu, val); } -/* called to set cr0 as approriate for clts instruction exit. */ +/* called to set cr0 as appropriate for clts instruction exit. */ static void handle_clts(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu)) { @@ -5636,11 +5665,8 @@ static int handle_dr(struct kvm_vcpu *vcpu) } if (vcpu->guest_debug == 0) { - u32 cpu_based_vm_exec_control; - - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_MOV_DR_EXITING); /* * No more DR vmexits; force a reload of the debug registers @@ -5677,8 +5703,6 @@ static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) { - u32 cpu_based_vm_exec_control; - get_debugreg(vcpu->arch.db[0], 0); get_debugreg(vcpu->arch.db[1], 1); get_debugreg(vcpu->arch.db[2], 2); @@ -5687,10 +5711,7 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; - - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING); } static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) @@ -5775,8 +5796,7 @@ static int handle_halt(struct kvm_vcpu *vcpu) static int handle_vmcall(struct kvm_vcpu *vcpu) { - kvm_emulate_hypercall(vcpu); - return 1; + return kvm_emulate_hypercall(vcpu); } static int handle_invd(struct kvm_vcpu *vcpu) @@ -6463,8 +6483,8 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) { /* Recycle the least recently used VMCS. */ - item = list_entry(vmx->nested.vmcs02_pool.prev, - struct vmcs02_list, list); + item = list_last_entry(&vmx->nested.vmcs02_pool, + struct vmcs02_list, list); item->vmptr = vmx->nested.current_vmptr; list_move(&item->list, &vmx->nested.vmcs02_pool); return &item->vmcs02; @@ -7251,7 +7271,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) /* The value to write might be 32 or 64 bits, depending on L1's long * mode, and eventually we need to write that into a field of several * possible lengths. The code below first zero-extends the value to 64 - * bit (field_value), and then copies only the approriate number of + * bit (field_value), and then copies only the appropriate number of * bits into the vmcs12 field. */ u64 field_value = 0; @@ -7787,6 +7807,13 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) else if (is_no_device(intr_info) && !(vmcs12->guest_cr0 & X86_CR0_TS)) return false; + else if (is_debug(intr_info) && + vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + return false; + else if (is_breakpoint(intr_info) && + vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + return false; return vmcs12->exception_bitmap & (1u << (intr_info & INTR_INFO_VECTOR_MASK)); case EXIT_REASON_EXTERNAL_INTERRUPT: @@ -8391,6 +8418,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) { u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + register void *__sp asm(_ASM_SP); /* * If external interrupt exists, IF bit is set in rflags/eflags on the @@ -8423,8 +8451,9 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) "call *%[entry]\n\t" : #ifdef CONFIG_X86_64 - [sp]"=&r"(tmp) + [sp]"=&r"(tmp), #endif + "+r"(__sp) : [entry]"r"(entry), [ss]"i"(__KERNEL_DS), @@ -8625,6 +8654,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); + if (vmx->guest_pkru_valid) + __write_pkru(vmx->guest_pkru); + atomic_switch_perf_msrs(vmx); debugctlmsr = get_debugctlmsr(); @@ -8765,6 +8797,20 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); /* + * eager fpu is enabled if PKEY is supported and CR4 is switched + * back on host, so it is safe to read guest PKRU from current + * XSAVE. + */ + if (boot_cpu_has(X86_FEATURE_OSPKE)) { + vmx->guest_pkru = __read_pkru(); + if (vmx->guest_pkru != vmx->host_pkru) { + vmx->guest_pkru_valid = true; + __write_pkru(vmx->host_pkru); + } else + vmx->guest_pkru_valid = false; + } + + /* * the KVM_REQ_EVENT optimization bit is only on for one entry, and if * we did not inject a still-pending event to L1 now because of * nested_run_pending, we need to re-enable this bit. @@ -10291,7 +10337,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); - if (vmx_mpx_supported()) + if (kvm_mpx_supported()) vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); if (nested_cpu_has_xsaves(vmcs12)) vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP); @@ -10799,13 +10845,26 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, */ kvm_set_msi_irq(e, &irq); - if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { + /* + * Make sure the IRTE is in remapped mode if + * we don't handle it in posted mode. + */ + ret = irq_set_vcpu_affinity(host_irq, NULL); + if (ret < 0) { + printk(KERN_INFO + "failed to back to remapped mode, irq: %u\n", + host_irq); + goto out; + } + continue; + } vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); vcpu_info.vector = irq.vector; - trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi, + trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi, vcpu_info.vector, vcpu_info.pi_desc_addr, set); if (set) @@ -10875,6 +10934,9 @@ static struct kvm_x86_ops vmx_x86_ops = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + + .get_pkru = vmx_get_pkru, + .fpu_activate = vmx_fpu_activate, .fpu_deactivate = vmx_fpu_deactivate, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ac4963c38..9b7798c7b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -123,6 +123,9 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); unsigned int __read_mostly lapic_timer_advance_ns = 0; module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); +static bool __read_mostly vector_hashing = true; +module_param(vector_hashing, bool, S_IRUGO); + static bool __read_mostly backwards_tsc_observed = false; #define KVM_NR_SHARED_MSRS 16 @@ -719,7 +722,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | - X86_CR4_SMEP | X86_CR4_SMAP; + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; if (cr4 & CR4_RESERVED_BITS) return 1; @@ -736,6 +739,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE)) return 1; + if (!guest_cpuid_has_pku(vcpu) && (cr4 & X86_CR4_PKE)) + return 1; + if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; @@ -761,7 +767,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); - if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) + if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) kvm_update_cpuid(vcpu); return 0; @@ -1195,17 +1201,11 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) static uint32_t div_frac(uint32_t dividend, uint32_t divisor) { - uint32_t quotient, remainder; - - /* Don't try to replace with do_div(), this one calculates - * "(dividend << 32) / divisor" */ - __asm__ ( "divl %4" - : "=a" (quotient), "=d" (remainder) - : "0" (0), "1" (dividend), "r" (divisor) ); - return quotient; + do_shl32_div32(dividend, divisor); + return dividend; } -static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, +static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz, s8 *pshift, u32 *pmultiplier) { uint64_t scaled64; @@ -1213,8 +1213,8 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, uint64_t tps64; uint32_t tps32; - tps64 = base_khz * 1000LL; - scaled64 = scaled_khz * 1000LL; + tps64 = base_hz; + scaled64 = scaled_hz; while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { tps64 >>= 1; shift--; @@ -1232,8 +1232,8 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, *pshift = shift; *pmultiplier = div_frac(scaled64, tps32); - pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n", - __func__, base_khz, scaled_khz, shift, *pmultiplier); + pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n", + __func__, base_hz, scaled_hz, shift, *pmultiplier); } #ifdef CONFIG_X86_64 @@ -1292,23 +1292,23 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) return 0; } -static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) +static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) { u32 thresh_lo, thresh_hi; int use_scaling = 0; /* tsc_khz can be zero if TSC calibration fails */ - if (this_tsc_khz == 0) { + if (user_tsc_khz == 0) { /* set tsc_scaling_ratio to a safe value */ vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; return -1; } /* Compute a scale to convert nanoseconds in TSC cycles */ - kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, + kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC, &vcpu->arch.virtual_tsc_shift, &vcpu->arch.virtual_tsc_mult); - vcpu->arch.virtual_tsc_khz = this_tsc_khz; + vcpu->arch.virtual_tsc_khz = user_tsc_khz; /* * Compute the variation in TSC rate which is acceptable @@ -1318,11 +1318,11 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) */ thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); - if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) { - pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi); + if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) { + pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi); use_scaling = 1; } - return set_tsc_khz(vcpu, this_tsc_khz, use_scaling); + return set_tsc_khz(vcpu, user_tsc_khz, use_scaling); } static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) @@ -1561,7 +1561,7 @@ static cycle_t read_tsc(void) /* * GCC likes to generate cmov here, but this branch is extremely - * predictable (it's just a funciton of time and the likely is + * predictable (it's just a function of time and the likely is * very likely) and there's a data dependence, so force GCC * to generate a branch instead. I don't barrier() because * we don't actually need a barrier, and if this function @@ -1715,7 +1715,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) static int kvm_guest_time_update(struct kvm_vcpu *v) { - unsigned long flags, this_tsc_khz, tgt_tsc_khz; + unsigned long flags, tgt_tsc_khz; struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_arch *ka = &v->kvm->arch; s64 kernel_ns; @@ -1741,8 +1741,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - this_tsc_khz = __this_cpu_read(cpu_tsc_khz); - if (unlikely(this_tsc_khz == 0)) { + tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz); + if (unlikely(tgt_tsc_khz == 0)) { local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); return 1; @@ -1777,13 +1777,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) if (!vcpu->pv_time_enabled) return 0; - if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { - tgt_tsc_khz = kvm_has_tsc_control ? - vcpu->virtual_tsc_khz : this_tsc_khz; - kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz, + if (kvm_has_tsc_control) + tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz); + + if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { + kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL, &vcpu->hv_clock.tsc_shift, &vcpu->hv_clock.tsc_to_system_mul); - vcpu->hw_tsc_khz = this_tsc_khz; + vcpu->hw_tsc_khz = tgt_tsc_khz; } /* With all the info we got, fill in the values */ @@ -2751,7 +2752,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); - vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -2987,7 +2987,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && - kvm_vcpu_has_lapic(vcpu)) + lapic_in_kernel(vcpu)) vcpu->arch.apic->sipi_vector = events->sipi_vector; if (events->flags & KVM_VCPUEVENT_VALID_SMM) { @@ -3000,7 +3000,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; else vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; - if (kvm_vcpu_has_lapic(vcpu)) { + if (lapic_in_kernel(vcpu)) { if (events->smi.latched_init) set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); else @@ -3240,7 +3240,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, switch (ioctl) { case KVM_GET_LAPIC: { r = -EINVAL; - if (!vcpu->arch.apic) + if (!lapic_in_kernel(vcpu)) goto out; u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); @@ -3258,7 +3258,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } case KVM_SET_LAPIC: { r = -EINVAL; - if (!vcpu->arch.apic) + if (!lapic_in_kernel(vcpu)) goto out; u.lapic = memdup_user(argp, sizeof(*u.lapic)); if (IS_ERR(u.lapic)) @@ -3605,20 +3605,26 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) { - mutex_lock(&kvm->arch.vpit->pit_state.lock); - memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); - mutex_unlock(&kvm->arch.vpit->pit_state.lock); + struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state; + + BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels)); + + mutex_lock(&kps->lock); + memcpy(ps, &kps->channels, sizeof(*ps)); + mutex_unlock(&kps->lock); return 0; } static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) { int i; - mutex_lock(&kvm->arch.vpit->pit_state.lock); - memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); + struct kvm_pit *pit = kvm->arch.vpit; + + mutex_lock(&pit->pit_state.lock); + memcpy(&pit->pit_state.channels, ps, sizeof(*ps)); for (i = 0; i < 3; i++) - kvm_pit_load_count(kvm, i, ps->channels[i].count, 0); - mutex_unlock(&kvm->arch.vpit->pit_state.lock); + kvm_pit_load_count(pit, i, ps->channels[i].count, 0); + mutex_unlock(&pit->pit_state.lock); return 0; } @@ -3638,29 +3644,39 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) int start = 0; int i; u32 prev_legacy, cur_legacy; - mutex_lock(&kvm->arch.vpit->pit_state.lock); - prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; + struct kvm_pit *pit = kvm->arch.vpit; + + mutex_lock(&pit->pit_state.lock); + prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; if (!prev_legacy && cur_legacy) start = 1; - memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, - sizeof(kvm->arch.vpit->pit_state.channels)); - kvm->arch.vpit->pit_state.flags = ps->flags; + memcpy(&pit->pit_state.channels, &ps->channels, + sizeof(pit->pit_state.channels)); + pit->pit_state.flags = ps->flags; for (i = 0; i < 3; i++) - kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count, + kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count, start && i == 0); - mutex_unlock(&kvm->arch.vpit->pit_state.lock); + mutex_unlock(&pit->pit_state.lock); return 0; } static int kvm_vm_ioctl_reinject(struct kvm *kvm, struct kvm_reinject_control *control) { - if (!kvm->arch.vpit) + struct kvm_pit *pit = kvm->arch.vpit; + + if (!pit) return -ENXIO; - mutex_lock(&kvm->arch.vpit->pit_state.lock); - kvm->arch.vpit->pit_state.reinject = control->pit_reinject; - mutex_unlock(&kvm->arch.vpit->pit_state.lock); + + /* pit->pit_state.lock was overloaded to prevent userspace from getting + * an inconsistent state after running multiple KVM_REINJECT_CONTROL + * ioctls in parallel. Use a separate lock if that ioctl isn't rare. + */ + mutex_lock(&pit->pit_state.lock); + kvm_pit_set_reinject(pit, control->pit_reinject); + mutex_unlock(&pit->pit_state.lock); + return 0; } @@ -4093,7 +4109,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, do { n = min(len, 8); - if (!(vcpu->arch.apic && + if (!(lapic_in_kernel(vcpu) && !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v)) && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v)) break; @@ -4113,7 +4129,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) do { n = min(len, 8); - if (!(vcpu->arch.apic && + if (!(lapic_in_kernel(vcpu) && !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev, addr, n, v)) && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v)) @@ -4312,9 +4328,14 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0) | (write ? PFERR_WRITE_MASK : 0); + /* + * currently PKRU is only applied to ept enabled guest so + * there is no pkey in EPT page table for L1 guest or EPT + * shadow page table for L2 guest. + */ if (vcpu_match_mmio_gva(vcpu, gva) && !permission_fault(vcpu, vcpu->arch.walk_mmu, - vcpu->arch.access, access)) { + vcpu->arch.access, 0, access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); @@ -4346,7 +4367,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes); if (ret < 0) return 0; - kvm_mmu_pte_write(vcpu, gpa, val, bytes); + kvm_page_track_write(vcpu, gpa, val, bytes); return 1; } @@ -4604,7 +4625,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, return X86EMUL_CMPXCHG_FAILED; kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); - kvm_mmu_pte_write(vcpu, gpa, new, bytes); + kvm_page_track_write(vcpu, gpa, new, bytes); return X86EMUL_CONTINUE; @@ -6010,7 +6031,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) if (!kvm_x86_ops->update_cr8_intercept) return; - if (!vcpu->arch.apic) + if (!lapic_in_kernel(vcpu)) return; if (vcpu->arch.apicv_active) @@ -6572,8 +6593,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - /* We should set ->mode before check ->requests, - * see the comment in make_all_cpus_request. + /* + * We should set ->mode before check ->requests, + * Please see the comment in kvm_make_all_cpus_request. + * This also orders the write to mode from any reads + * to the page tables done while the VCPU is running. + * Please see the comment in kvm_flush_remote_tlbs. */ smp_mb__after_srcu_read_unlock(); @@ -7040,7 +7065,7 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - if (!kvm_vcpu_has_lapic(vcpu) && + if (!lapic_in_kernel(vcpu) && mp_state->mp_state != KVM_MP_STATE_RUNNABLE) return -EINVAL; @@ -7111,7 +7136,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); - if (sregs->cr4 & X86_CR4_OSXSAVE) + if (sregs->cr4 & (X86_CR4_OSXSAVE | X86_CR4_PKE)) kvm_update_cpuid(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); @@ -7313,7 +7338,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) * Every 255 times fpu_counter rolls over to 0; a guest that uses * the FPU in bursts will revert to loading it on demand. */ - if (!vcpu->arch.eager_fpu) { + if (!use_eager_fpu()) { if (++vcpu->fpu_counter < 5) kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); } @@ -7592,6 +7617,7 @@ bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) } struct static_key kvm_no_apic_vcpu __read_mostly; +EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu); int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { @@ -7723,6 +7749,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); + kvm_page_track_init(kvm); + kvm_mmu_init_vm(kvm); + return 0; } @@ -7849,6 +7878,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); kfree(rcu_dereference_check(kvm->arch.apic_map, 1)); + kvm_mmu_uninit_vm(kvm); } void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, @@ -7870,6 +7900,8 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, free->arch.lpage_info[i - 1] = NULL; } } + + kvm_page_track_free_memslot(free, dont); } int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, @@ -7878,6 +7910,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, int i; for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { + struct kvm_lpage_info *linfo; unsigned long ugfn; int lpages; int level = i + 1; @@ -7892,15 +7925,16 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, if (i == 0) continue; - slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages * - sizeof(*slot->arch.lpage_info[i - 1])); - if (!slot->arch.lpage_info[i - 1]) + linfo = kvm_kvzalloc(lpages * sizeof(*linfo)); + if (!linfo) goto out_free; + slot->arch.lpage_info[i - 1] = linfo; + if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->arch.lpage_info[i - 1][0].write_count = 1; + linfo[0].disallow_lpage = 1; if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1; + linfo[lpages - 1].disallow_lpage = 1; ugfn = slot->userspace_addr >> PAGE_SHIFT; /* * If the gfn and userspace address are not aligned wrt each @@ -7912,10 +7946,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned long j; for (j = 0; j < lpages; ++j) - slot->arch.lpage_info[i - 1][j].write_count = 1; + linfo[j].disallow_lpage = 1; } } + if (kvm_page_track_create_memslot(slot, npages)) + goto out_free; + return 0; out_free: @@ -8369,6 +8406,12 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set); } +bool kvm_vector_hashing_enabled(void) +{ + return vector_hashing; +} +EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled); + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index f2afa5fe4..7ce3634ab 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -179,10 +179,12 @@ int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int page_num); +bool kvm_vector_hashing_enabled(void); #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ - | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512) + | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ + | XFEATURE_MASK_PKRU) extern u64 host_xcr0; extern u64 kvm_supported_xcr0(void); @@ -192,4 +194,19 @@ extern unsigned int min_timer_period_us; extern unsigned int lapic_timer_advance_ns; extern struct static_key kvm_no_apic_vcpu; + +/* Same "calling convention" as do_div: + * - divide (n << 32) by base + * - put result in n + * - return remainder + */ +#define do_shl32_div32(n, base) \ + ({ \ + u32 __quot, __rem; \ + asm("divl %2" : "=a" (__quot), "=d" (__rem) \ + : "rm" (base), "0" (0), "1" ((u32) n)); \ + n = __quot; \ + __rem; \ + }) + #endif |