summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/vmx.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r--arch/x86/kvm/vmx.c146
1 files changed, 104 insertions, 42 deletions
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 539062e24..faf52bac1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -598,6 +598,10 @@ struct vcpu_vmx {
struct page *pml_pg;
u64 current_tsc_ratio;
+
+ bool guest_pkru_valid;
+ u32 guest_pkru;
+ u32 host_pkru;
};
enum segment_cache_field {
@@ -863,7 +867,6 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
static u64 construct_eptp(unsigned long root_hpa);
static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
-static bool vmx_mpx_supported(void);
static bool vmx_xsaves_supported(void);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
@@ -963,25 +966,36 @@ static const u32 vmx_msr_index[] = {
MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};
-static inline bool is_page_fault(u32 intr_info)
+static inline bool is_exception_n(u32 intr_info, u8 vector)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
+ (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
+}
+
+static inline bool is_debug(u32 intr_info)
+{
+ return is_exception_n(intr_info, DB_VECTOR);
+}
+
+static inline bool is_breakpoint(u32 intr_info)
+{
+ return is_exception_n(intr_info, BP_VECTOR);
+}
+
+static inline bool is_page_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, PF_VECTOR);
}
static inline bool is_no_device(u32 intr_info)
{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
- INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
+ return is_exception_n(intr_info, NM_VECTOR);
}
static inline bool is_invalid_opcode(u32 intr_info)
{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
- INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
+ return is_exception_n(intr_info, UD_VECTOR);
}
static inline bool is_external_interrupt(u32 intr_info)
@@ -2097,6 +2111,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
} while (cmpxchg(&pi_desc->control, old.control,
new.control) != old.control);
}
+
/*
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
@@ -2157,6 +2172,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
}
vmx_vcpu_pi_load(vcpu, cpu);
+ vmx->host_pkru = read_pkru();
}
static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
@@ -2276,6 +2292,11 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
vmcs_writel(GUEST_RFLAGS, rflags);
}
+static u32 vmx_get_pkru(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->guest_pkru;
+}
+
static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
@@ -2605,7 +2626,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
- if (vmx_mpx_supported())
+ if (kvm_mpx_supported())
vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
/* We support free control of debug control saving. */
@@ -2626,7 +2647,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
VM_ENTRY_LOAD_IA32_PAT;
vmx->nested.nested_vmx_entry_ctls_high |=
(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
- if (vmx_mpx_supported())
+ if (kvm_mpx_supported())
vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
/* We support free control of debug control loading. */
@@ -2877,7 +2898,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
break;
case MSR_IA32_BNDCFGS:
- if (!vmx_mpx_supported())
+ if (!kvm_mpx_supported())
return 1;
msr_info->data = vmcs_read64(GUEST_BNDCFGS);
break;
@@ -2954,7 +2975,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
case MSR_IA32_BNDCFGS:
- if (!vmx_mpx_supported())
+ if (!kvm_mpx_supported())
return 1;
vmcs_write64(GUEST_BNDCFGS, data);
break;
@@ -3082,6 +3103,8 @@ static __init int vmx_disabled_by_bios(void)
static void kvm_cpu_vmxon(u64 addr)
{
+ intel_pt_handle_vmx(1);
+
asm volatile (ASM_VMX_VMXON_RAX
: : "a"(&addr), "m"(addr)
: "memory", "cc");
@@ -3151,6 +3174,8 @@ static void vmclear_local_loaded_vmcss(void)
static void kvm_cpu_vmxoff(void)
{
asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
+
+ intel_pt_handle_vmx(0);
}
static void hardware_disable(void)
@@ -3427,7 +3452,7 @@ static void init_vmcs_shadow_fields(void)
for (i = j = 0; i < max_shadow_read_write_fields; i++) {
switch (shadow_read_write_fields[i]) {
case GUEST_BNDCFGS:
- if (!vmx_mpx_supported())
+ if (!kvm_mpx_supported())
continue;
break;
default:
@@ -3883,13 +3908,17 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (!enable_unrestricted_guest && !is_paging(vcpu))
/*
- * SMEP/SMAP is disabled if CPU is in non-paging mode in
- * hardware. However KVM always uses paging mode without
- * unrestricted guest.
- * To emulate this behavior, SMEP/SMAP needs to be manually
- * disabled when guest switches to non-paging mode.
+ * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
+ * hardware. To emulate this behavior, SMEP/SMAP/PKU needs
+ * to be manually disabled when guest switches to non-paging
+ * mode.
+ *
+ * If !enable_unrestricted_guest, the CPU is always running
+ * with CR0.PG=1 and CR4 needs to be modified.
+ * If enable_unrestricted_guest, the CPU automatically
+ * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
*/
- hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
+ hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
vmcs_writel(CR4_READ_SHADOW, cr4);
vmcs_writel(GUEST_CR4, hw_cr4);
@@ -5021,8 +5050,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
- vmx_set_cr0(vcpu, cr0); /* enter rmode */
vmx->vcpu.arch.cr0 = cr0;
+ vmx_set_cr0(vcpu, cr0); /* enter rmode */
vmx_set_cr4(vcpu, 0);
vmx_set_efer(vcpu, 0);
vmx_fpu_activate(vcpu);
@@ -5503,7 +5532,7 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
return kvm_set_cr4(vcpu, val);
}
-/* called to set cr0 as approriate for clts instruction exit. */
+/* called to set cr0 as appropriate for clts instruction exit. */
static void handle_clts(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu)) {
@@ -5636,11 +5665,8 @@ static int handle_dr(struct kvm_vcpu *vcpu)
}
if (vcpu->guest_debug == 0) {
- u32 cpu_based_vm_exec_control;
-
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+ vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_MOV_DR_EXITING);
/*
* No more DR vmexits; force a reload of the debug registers
@@ -5677,8 +5703,6 @@ static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
- u32 cpu_based_vm_exec_control;
-
get_debugreg(vcpu->arch.db[0], 0);
get_debugreg(vcpu->arch.db[1], 1);
get_debugreg(vcpu->arch.db[2], 2);
@@ -5687,10 +5711,7 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
-
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+ vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
}
static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
@@ -5775,8 +5796,7 @@ static int handle_halt(struct kvm_vcpu *vcpu)
static int handle_vmcall(struct kvm_vcpu *vcpu)
{
- kvm_emulate_hypercall(vcpu);
- return 1;
+ return kvm_emulate_hypercall(vcpu);
}
static int handle_invd(struct kvm_vcpu *vcpu)
@@ -6463,8 +6483,8 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
/* Recycle the least recently used VMCS. */
- item = list_entry(vmx->nested.vmcs02_pool.prev,
- struct vmcs02_list, list);
+ item = list_last_entry(&vmx->nested.vmcs02_pool,
+ struct vmcs02_list, list);
item->vmptr = vmx->nested.current_vmptr;
list_move(&item->list, &vmx->nested.vmcs02_pool);
return &item->vmcs02;
@@ -7251,7 +7271,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
/* The value to write might be 32 or 64 bits, depending on L1's long
* mode, and eventually we need to write that into a field of several
* possible lengths. The code below first zero-extends the value to 64
- * bit (field_value), and then copies only the approriate number of
+ * bit (field_value), and then copies only the appropriate number of
* bits into the vmcs12 field.
*/
u64 field_value = 0;
@@ -7787,6 +7807,13 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
else if (is_no_device(intr_info) &&
!(vmcs12->guest_cr0 & X86_CR0_TS))
return false;
+ else if (is_debug(intr_info) &&
+ vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+ return false;
+ else if (is_breakpoint(intr_info) &&
+ vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+ return false;
return vmcs12->exception_bitmap &
(1u << (intr_info & INTR_INFO_VECTOR_MASK));
case EXIT_REASON_EXTERNAL_INTERRUPT:
@@ -8391,6 +8418,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
{
u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ register void *__sp asm(_ASM_SP);
/*
* If external interrupt exists, IF bit is set in rflags/eflags on the
@@ -8423,8 +8451,9 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
"call *%[entry]\n\t"
:
#ifdef CONFIG_X86_64
- [sp]"=&r"(tmp)
+ [sp]"=&r"(tmp),
#endif
+ "+r"(__sp)
:
[entry]"r"(entry),
[ss]"i"(__KERNEL_DS),
@@ -8625,6 +8654,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
+ if (vmx->guest_pkru_valid)
+ __write_pkru(vmx->guest_pkru);
+
atomic_switch_perf_msrs(vmx);
debugctlmsr = get_debugctlmsr();
@@ -8765,6 +8797,20 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
/*
+ * eager fpu is enabled if PKEY is supported and CR4 is switched
+ * back on host, so it is safe to read guest PKRU from current
+ * XSAVE.
+ */
+ if (boot_cpu_has(X86_FEATURE_OSPKE)) {
+ vmx->guest_pkru = __read_pkru();
+ if (vmx->guest_pkru != vmx->host_pkru) {
+ vmx->guest_pkru_valid = true;
+ __write_pkru(vmx->host_pkru);
+ } else
+ vmx->guest_pkru_valid = false;
+ }
+
+ /*
* the KVM_REQ_EVENT optimization bit is only on for one entry, and if
* we did not inject a still-pending event to L1 now because of
* nested_run_pending, we need to re-enable this bit.
@@ -10291,7 +10337,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
- if (vmx_mpx_supported())
+ if (kvm_mpx_supported())
vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
if (nested_cpu_has_xsaves(vmcs12))
vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
@@ -10799,13 +10845,26 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
*/
kvm_set_msi_irq(e, &irq);
- if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu))
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+ /*
+ * Make sure the IRTE is in remapped mode if
+ * we don't handle it in posted mode.
+ */
+ ret = irq_set_vcpu_affinity(host_irq, NULL);
+ if (ret < 0) {
+ printk(KERN_INFO
+ "failed to back to remapped mode, irq: %u\n",
+ host_irq);
+ goto out;
+ }
+
continue;
+ }
vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
vcpu_info.vector = irq.vector;
- trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi,
+ trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
vcpu_info.vector, vcpu_info.pi_desc_addr, set);
if (set)
@@ -10875,6 +10934,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
.cache_reg = vmx_cache_reg,
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
+
+ .get_pkru = vmx_get_pkru,
+
.fpu_activate = vmx_fpu_activate,
.fpu_deactivate = vmx_fpu_deactivate,