summaryrefslogtreecommitdiff
path: root/arch/x86/kvm/x86.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r--arch/x86/kvm/x86.c468
1 files changed, 359 insertions, 109 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 43609af03..97592e190 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -51,6 +51,8 @@
#include <linux/pci.h>
#include <linux/timekeeper_internal.h>
#include <linux/pvclock_gtod.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/irqbypass.h>
#include <trace/events/kvm.h>
#define CREATE_TRACE_POINTS
@@ -64,6 +66,7 @@
#include <asm/fpu/internal.h> /* Ugh! */
#include <asm/pvclock.h>
#include <asm/div64.h>
+#include <asm/irq_remapping.h>
#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32
@@ -90,10 +93,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
-struct kvm_x86_ops *kvm_x86_ops;
+struct kvm_x86_ops *kvm_x86_ops __read_mostly;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
-static bool ignore_msrs = 0;
+static bool __read_mostly ignore_msrs = 0;
module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
unsigned int min_timer_period_us = 500;
@@ -102,20 +105,25 @@ module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
static bool __read_mostly kvmclock_periodic_sync = true;
module_param(kvmclock_periodic_sync, bool, S_IRUGO);
-bool kvm_has_tsc_control;
+bool __read_mostly kvm_has_tsc_control;
EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
-u32 kvm_max_guest_tsc_khz;
+u32 __read_mostly kvm_max_guest_tsc_khz;
EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
+u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits;
+EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
+u64 __read_mostly kvm_max_tsc_scaling_ratio;
+EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
+static u64 __read_mostly kvm_default_tsc_scaling_ratio;
/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
-static u32 tsc_tolerance_ppm = 250;
+static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
/* lapic timer advance (tscdeadline mode only) in nanoseconds */
-unsigned int lapic_timer_advance_ns = 0;
+unsigned int __read_mostly lapic_timer_advance_ns = 0;
module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
-static bool backwards_tsc_observed = false;
+static bool __read_mostly backwards_tsc_observed = false;
#define KVM_NR_SHARED_MSRS 16
@@ -665,9 +673,9 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
if (index != XCR_XFEATURE_ENABLED_MASK)
return 1;
- if (!(xcr0 & XSTATE_FP))
+ if (!(xcr0 & XFEATURE_MASK_FP))
return 1;
- if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
+ if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
return 1;
/*
@@ -675,23 +683,24 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
* saving. However, xcr0 bit 0 is always set, even if the
* emulated CPU does not support XSAVE (see fx_init).
*/
- valid_bits = vcpu->arch.guest_supported_xcr0 | XSTATE_FP;
+ valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
if (xcr0 & ~valid_bits)
return 1;
- if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR)))
+ if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
+ (!(xcr0 & XFEATURE_MASK_BNDCSR)))
return 1;
- if (xcr0 & XSTATE_AVX512) {
- if (!(xcr0 & XSTATE_YMM))
+ if (xcr0 & XFEATURE_MASK_AVX512) {
+ if (!(xcr0 & XFEATURE_MASK_YMM))
return 1;
- if ((xcr0 & XSTATE_AVX512) != XSTATE_AVX512)
+ if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
return 1;
}
kvm_put_guest_xcr0(vcpu);
vcpu->arch.xcr0 = xcr0;
- if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK)
+ if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
kvm_update_cpuid(vcpu);
return 0;
}
@@ -790,7 +799,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
if (cr8 & CR8_RESERVED_BITS)
return 1;
- if (irqchip_in_kernel(vcpu->kvm))
+ if (lapic_in_kernel(vcpu))
kvm_lapic_set_tpr(vcpu, cr8);
else
vcpu->arch.cr8 = cr8;
@@ -800,7 +809,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr8);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
- if (irqchip_in_kernel(vcpu->kvm))
+ if (lapic_in_kernel(vcpu))
return kvm_lapic_get_cr8(vcpu);
else
return vcpu->arch.cr8;
@@ -954,6 +963,9 @@ static u32 emulated_msrs[] = {
HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
+ HV_X64_MSR_RESET,
+ HV_X64_MSR_VP_INDEX,
+ HV_X64_MSR_VP_RUNTIME,
HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
MSR_KVM_PV_EOI_EN,
@@ -1242,14 +1254,53 @@ static u32 adjust_tsc_khz(u32 khz, s32 ppm)
return v;
}
-static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
+static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
+{
+ u64 ratio;
+
+ /* Guest TSC same frequency as host TSC? */
+ if (!scale) {
+ vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+ return 0;
+ }
+
+ /* TSC scaling supported? */
+ if (!kvm_has_tsc_control) {
+ if (user_tsc_khz > tsc_khz) {
+ vcpu->arch.tsc_catchup = 1;
+ vcpu->arch.tsc_always_catchup = 1;
+ return 0;
+ } else {
+ WARN(1, "user requested TSC rate below hardware speed\n");
+ return -1;
+ }
+ }
+
+ /* TSC scaling required - calculate ratio */
+ ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
+ user_tsc_khz, tsc_khz);
+
+ if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) {
+ WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
+ user_tsc_khz);
+ return -1;
+ }
+
+ vcpu->arch.tsc_scaling_ratio = ratio;
+ return 0;
+}
+
+static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
{
u32 thresh_lo, thresh_hi;
int use_scaling = 0;
/* tsc_khz can be zero if TSC calibration fails */
- if (this_tsc_khz == 0)
- return;
+ if (this_tsc_khz == 0) {
+ /* set tsc_scaling_ratio to a safe value */
+ vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+ return -1;
+ }
/* Compute a scale to convert nanoseconds in TSC cycles */
kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
@@ -1269,7 +1320,7 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
use_scaling = 1;
}
- kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
+ return set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
}
static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
@@ -1315,6 +1366,48 @@ static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
}
+/*
+ * Multiply tsc by a fixed point number represented by ratio.
+ *
+ * The most significant 64-N bits (mult) of ratio represent the
+ * integral part of the fixed point number; the remaining N bits
+ * (frac) represent the fractional part, ie. ratio represents a fixed
+ * point number (mult + frac * 2^(-N)).
+ *
+ * N equals to kvm_tsc_scaling_ratio_frac_bits.
+ */
+static inline u64 __scale_tsc(u64 ratio, u64 tsc)
+{
+ return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
+}
+
+u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
+{
+ u64 _tsc = tsc;
+ u64 ratio = vcpu->arch.tsc_scaling_ratio;
+
+ if (ratio != kvm_default_tsc_scaling_ratio)
+ _tsc = __scale_tsc(ratio, tsc);
+
+ return _tsc;
+}
+EXPORT_SYMBOL_GPL(kvm_scale_tsc);
+
+static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
+{
+ u64 tsc;
+
+ tsc = kvm_scale_tsc(vcpu, rdtsc());
+
+ return target_tsc - tsc;
+}
+
+u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
+{
+ return kvm_x86_ops->read_l1_tsc(vcpu, kvm_scale_tsc(vcpu, host_tsc));
+}
+EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
+
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
struct kvm *kvm = vcpu->kvm;
@@ -1326,7 +1419,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
u64 data = msr->data;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
- offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
+ offset = kvm_compute_tsc_offset(vcpu, data);
ns = get_kernel_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
@@ -1383,7 +1476,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
} else {
u64 delta = nsec_to_cycles(vcpu, elapsed);
data += delta;
- offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
+ offset = kvm_compute_tsc_offset(vcpu, data);
pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
}
matched = true;
@@ -1440,6 +1533,20 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
EXPORT_SYMBOL_GPL(kvm_write_tsc);
+static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
+ s64 adjustment)
+{
+ kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment);
+}
+
+static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
+{
+ if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
+ WARN_ON(adjustment < 0);
+ adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
+ kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment);
+}
+
#ifdef CONFIG_X86_64
static cycle_t read_tsc(void)
@@ -1601,7 +1708,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
- unsigned long flags, this_tsc_khz;
+ unsigned long flags, this_tsc_khz, tgt_tsc_khz;
struct kvm_vcpu_arch *vcpu = &v->arch;
struct kvm_arch *ka = &v->kvm->arch;
s64 kernel_ns;
@@ -1638,7 +1745,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
kernel_ns = get_kernel_ns();
}
- tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc);
+ tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
/*
* We may have to catch up the TSC to match elapsed wall clock
@@ -1664,7 +1771,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
return 0;
if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
- kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
+ tgt_tsc_khz = kvm_has_tsc_control ?
+ vcpu->virtual_tsc_khz : this_tsc_khz;
+ kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz,
&vcpu->hv_clock.tsc_shift,
&vcpu->hv_clock.tsc_to_system_mul);
vcpu->hw_tsc_khz = this_tsc_khz;
@@ -1899,6 +2008,8 @@ static void accumulate_steal_time(struct kvm_vcpu *vcpu)
static void record_steal_time(struct kvm_vcpu *vcpu)
{
+ accumulate_steal_time(vcpu);
+
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
@@ -2049,12 +2160,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!(data & KVM_MSR_ENABLED))
break;
- vcpu->arch.st.last_steal = current->sched_info.run_delay;
-
- preempt_disable();
- accumulate_steal_time(vcpu);
- preempt_enable();
-
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
break;
@@ -2450,6 +2555,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ENABLE_CAP_VM:
case KVM_CAP_DISABLE_QUIRKS:
case KVM_CAP_SET_BOOT_CPU_ID:
+ case KVM_CAP_SPLIT_IRQCHIP:
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
@@ -2613,7 +2719,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (tsc_delta < 0)
mark_tsc_unstable("KVM discovered backwards TSC");
if (check_tsc_unstable()) {
- u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu,
+ u64 offset = kvm_compute_tsc_offset(vcpu,
vcpu->arch.last_guest_tsc);
kvm_x86_ops->write_tsc_offset(vcpu, offset);
vcpu->arch.tsc_catchup = 1;
@@ -2629,7 +2735,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
vcpu->cpu = cpu;
}
- accumulate_steal_time(vcpu);
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
}
@@ -2658,17 +2763,50 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
return 0;
}
+static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
+{
+ return (!lapic_in_kernel(vcpu) ||
+ kvm_apic_accept_pic_intr(vcpu));
+}
+
+/*
+ * if userspace requested an interrupt window, check that the
+ * interrupt window is open.
+ *
+ * No need to exit to userspace if we already have an interrupt queued.
+ */
+static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
+{
+ return kvm_arch_interrupt_allowed(vcpu) &&
+ !kvm_cpu_has_interrupt(vcpu) &&
+ !kvm_event_needs_reinjection(vcpu) &&
+ kvm_cpu_accept_dm_intr(vcpu);
+}
+
static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
struct kvm_interrupt *irq)
{
if (irq->irq >= KVM_NR_INTERRUPTS)
return -EINVAL;
- if (irqchip_in_kernel(vcpu->kvm))
+
+ if (!irqchip_in_kernel(vcpu->kvm)) {
+ kvm_queue_interrupt(vcpu, irq->irq, false);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return 0;
+ }
+
+ /*
+ * With in-kernel LAPIC, we only use this to inject EXTINT, so
+ * fail for in-kernel 8259.
+ */
+ if (pic_in_kernel(vcpu->kvm))
return -ENXIO;
- kvm_queue_interrupt(vcpu, irq->irq, false);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+ if (vcpu->arch.pending_external_vector != -1)
+ return -EEXIST;
+ vcpu->arch.pending_external_vector = irq->irq;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
return 0;
}
@@ -2907,7 +3045,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
* Copy each region from the possibly compacted offset to the
* non-compacted offset.
*/
- valid = xstate_bv & ~XSTATE_FPSSE;
+ valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
while (valid) {
u64 feature = valid & -valid;
int index = fls64(feature) - 1;
@@ -2945,7 +3083,7 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
* Copy each region from the non-compacted offset to the
* possibly compacted offset.
*/
- valid = xstate_bv & ~XSTATE_FPSSE;
+ valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
while (valid) {
u64 feature = valid & -valid;
int index = fls64(feature) - 1;
@@ -2973,7 +3111,7 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
&vcpu->arch.guest_fpu.state.fxsave,
sizeof(struct fxregs_state));
*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
- XSTATE_FPSSE;
+ XFEATURE_MASK_FPSSE;
}
}
@@ -2993,7 +3131,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
return -EINVAL;
load_xsave(vcpu, (u8 *)guest_xsave->region);
} else {
- if (xstate_bv & ~XSTATE_FPSSE)
+ if (xstate_bv & ~XFEATURE_MASK_FPSSE)
return -EINVAL;
memcpy(&vcpu->arch.guest_fpu.state.fxsave,
guest_xsave->region, sizeof(struct fxregs_state));
@@ -3177,7 +3315,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_vapic_addr va;
r = -EINVAL;
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!lapic_in_kernel(vcpu))
goto out;
r = -EFAULT;
if (copy_from_user(&va, argp, sizeof va))
@@ -3304,9 +3442,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (user_tsc_khz == 0)
user_tsc_khz = tsc_khz;
- kvm_set_tsc_khz(vcpu, user_tsc_khz);
+ if (!kvm_set_tsc_khz(vcpu, user_tsc_khz))
+ r = 0;
- r = 0;
goto out;
}
case KVM_GET_TSC_KHZ: {
@@ -3426,41 +3564,38 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
- int r = 0;
-
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- return r;
+ return 0;
}
static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
{
- int r = 0;
-
+ int i;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
- kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(kvm, i, ps->channels[i].count, 0);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- return r;
+ return 0;
}
static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
{
- int r = 0;
-
mutex_lock(&kvm->arch.vpit->pit_state.lock);
memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
sizeof(ps->channels));
ps->flags = kvm->arch.vpit->pit_state.flags;
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
memset(&ps->reserved, 0, sizeof(ps->reserved));
- return r;
+ return 0;
}
static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
{
- int r = 0, start = 0;
+ int start = 0;
+ int i;
u32 prev_legacy, cur_legacy;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
@@ -3470,9 +3605,11 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
sizeof(kvm->arch.vpit->pit_state.channels));
kvm->arch.vpit->pit_state.flags = ps->flags;
- kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
+ for (i = 0; i < 3; i++)
+ kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count,
+ start && i == 0);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- return r;
+ return 0;
}
static int kvm_vm_ioctl_reinject(struct kvm *kvm,
@@ -3557,6 +3694,28 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
kvm->arch.disabled_quirks = cap->args[0];
r = 0;
break;
+ case KVM_CAP_SPLIT_IRQCHIP: {
+ mutex_lock(&kvm->lock);
+ r = -EINVAL;
+ if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
+ goto split_irqchip_unlock;
+ r = -EEXIST;
+ if (irqchip_in_kernel(kvm))
+ goto split_irqchip_unlock;
+ if (atomic_read(&kvm->online_vcpus))
+ goto split_irqchip_unlock;
+ r = kvm_setup_empty_irq_routing(kvm);
+ if (r)
+ goto split_irqchip_unlock;
+ /* Pairs with irqchip_in_kernel. */
+ smp_wmb();
+ kvm->arch.irqchip_split = true;
+ kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
+ r = 0;
+split_irqchip_unlock:
+ mutex_unlock(&kvm->lock);
+ break;
+ }
default:
r = -EINVAL;
break;
@@ -3670,7 +3829,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
r = -ENXIO;
- if (!irqchip_in_kernel(kvm))
+ if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
goto get_irqchip_out;
r = kvm_vm_ioctl_get_irqchip(kvm, chip);
if (r)
@@ -3694,7 +3853,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
r = -ENXIO;
- if (!irqchip_in_kernel(kvm))
+ if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
goto set_irqchip_out;
r = kvm_vm_ioctl_set_irqchip(kvm, chip);
if (r)
@@ -5678,7 +5837,7 @@ void kvm_arch_exit(void)
int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
{
++vcpu->stat.halt_exits;
- if (irqchip_in_kernel(vcpu->kvm)) {
+ if (lapic_in_kernel(vcpu)) {
vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
return 1;
} else {
@@ -5777,17 +5936,10 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
}
-/*
- * Check if userspace requested an interrupt window, and that the
- * interrupt window is open.
- *
- * No need to exit to userspace if we already have an interrupt queued.
- */
static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
{
- return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
- vcpu->run->request_interrupt_window &&
- kvm_arch_interrupt_allowed(vcpu));
+ return vcpu->run->request_interrupt_window &&
+ likely(!pic_in_kernel(vcpu->kvm));
}
static void post_kvm_run_save(struct kvm_vcpu *vcpu)
@@ -5798,13 +5950,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
- if (irqchip_in_kernel(vcpu->kvm))
- kvm_run->ready_for_interrupt_injection = 1;
- else
- kvm_run->ready_for_interrupt_injection =
- kvm_arch_interrupt_allowed(vcpu) &&
- !kvm_cpu_has_interrupt(vcpu) &&
- !kvm_event_needs_reinjection(vcpu);
+ kvm_run->ready_for_interrupt_injection =
+ pic_in_kernel(vcpu->kvm) ||
+ kvm_vcpu_ready_for_interrupt_injection(vcpu);
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -6155,18 +6303,18 @@ static void process_smi(struct kvm_vcpu *vcpu)
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
- u64 eoi_exit_bitmap[4];
- u32 tmr[8];
-
if (!kvm_apic_hw_enabled(vcpu->arch.apic))
return;
- memset(eoi_exit_bitmap, 0, 32);
- memset(tmr, 0, 32);
+ memset(vcpu->arch.eoi_exit_bitmap, 0, 256 / 8);
- kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr);
- kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
- kvm_apic_update_tmr(vcpu, tmr);
+ if (irqchip_split(vcpu->kvm))
+ kvm_scan_ioapic_routes(vcpu, vcpu->arch.eoi_exit_bitmap);
+ else {
+ kvm_x86_ops->sync_pir_to_irr(vcpu);
+ kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap);
+ }
+ kvm_x86_ops->load_eoi_exitmap(vcpu);
}
static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
@@ -6179,7 +6327,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
{
struct page *page = NULL;
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!lapic_in_kernel(vcpu))
return;
if (!kvm_x86_ops->set_apic_access_page_addr)
@@ -6217,8 +6365,10 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
- bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
- vcpu->run->request_interrupt_window;
+ bool req_int_win =
+ dm_request_for_irq_injection(vcpu) &&
+ kvm_cpu_accept_dm_intr(vcpu);
+
bool req_immediate_exit = false;
if (vcpu->requests) {
@@ -6269,6 +6419,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_pmu_handle_event(vcpu);
if (kvm_check_request(KVM_REQ_PMI, vcpu))
kvm_pmu_deliver_pmi(vcpu);
+ if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
+ BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
+ if (test_bit(vcpu->arch.pending_ioapic_eoi,
+ (void *) vcpu->arch.eoi_exit_bitmap)) {
+ vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
+ vcpu->run->eoi.vector =
+ vcpu->arch.pending_ioapic_eoi;
+ r = 0;
+ goto out;
+ }
+ }
if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
vcpu_scan_ioapic(vcpu);
if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
@@ -6279,6 +6440,26 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
r = 0;
goto out;
}
+ if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
+ r = 0;
+ goto out;
+ }
+ }
+
+ /*
+ * KVM_REQ_EVENT is not set when posted interrupts are set by
+ * VT-d hardware, so we have to update RVI unconditionally.
+ */
+ if (kvm_lapic_enabled(vcpu)) {
+ /*
+ * Update architecture specific hints for APIC
+ * virtual interrupt delivery.
+ */
+ if (kvm_x86_ops->hwapic_irr_update)
+ kvm_x86_ops->hwapic_irr_update(vcpu,
+ kvm_lapic_find_highest_irr(vcpu));
}
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -6297,13 +6478,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_x86_ops->enable_irq_window(vcpu);
if (kvm_lapic_enabled(vcpu)) {
- /*
- * Update architecture specific hints for APIC
- * virtual interrupt delivery.
- */
- if (kvm_x86_ops->hwapic_irr_update)
- kvm_x86_ops->hwapic_irr_update(vcpu,
- kvm_lapic_find_highest_irr(vcpu));
update_cr8_intercept(vcpu);
kvm_lapic_sync_to_vapic(vcpu);
}
@@ -6346,6 +6520,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (req_immediate_exit)
smp_send_reschedule(vcpu->cpu);
+ trace_kvm_entry(vcpu->vcpu_id);
+ wait_lapic_expire(vcpu);
__kvm_guest_enter();
if (unlikely(vcpu->arch.switch_db_regs)) {
@@ -6358,8 +6534,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
}
- trace_kvm_entry(vcpu->vcpu_id);
- wait_lapic_expire(vcpu);
kvm_x86_ops->run(vcpu);
/*
@@ -6387,8 +6561,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (hw_breakpoint_active())
hw_breakpoint_restore();
- vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
- rdtsc());
+ vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
@@ -6439,10 +6612,15 @@ out:
static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
- if (!kvm_arch_vcpu_runnable(vcpu)) {
+ if (!kvm_arch_vcpu_runnable(vcpu) &&
+ (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+
+ if (kvm_x86_ops->post_block)
+ kvm_x86_ops->post_block(vcpu);
+
if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
return 1;
}
@@ -6479,10 +6657,12 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
for (;;) {
- if (kvm_vcpu_running(vcpu))
+ if (kvm_vcpu_running(vcpu)) {
r = vcpu_enter_guest(vcpu);
- else
+ } else {
r = vcpu_block(kvm, vcpu);
+ }
+
if (r <= 0)
break;
@@ -6490,9 +6670,10 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
if (kvm_cpu_has_pending_timer(vcpu))
kvm_inject_pending_timer_irqs(vcpu);
- if (dm_request_for_irq_injection(vcpu)) {
- r = -EINTR;
- vcpu->run->exit_reason = KVM_EXIT_INTR;
+ if (dm_request_for_irq_injection(vcpu) &&
+ kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
+ r = 0;
+ vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
++vcpu->stat.request_irq_exits;
break;
}
@@ -6619,7 +6800,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
/* re-sync apic's tpr */
- if (!irqchip_in_kernel(vcpu->kvm)) {
+ if (!lapic_in_kernel(vcpu)) {
if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
r = -EINVAL;
goto out;
@@ -6943,7 +7124,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
*/
kvm_set_rflags(vcpu, rflags);
- kvm_x86_ops->update_db_bp_intercept(vcpu);
+ kvm_x86_ops->update_bp_intercept(vcpu);
r = 0;
@@ -7017,7 +7198,7 @@ static void fx_init(struct kvm_vcpu *vcpu)
/*
* Ensure guest xcr0 is valid for loading
*/
- vcpu->arch.xcr0 = XSTATE_FP;
+ vcpu->arch.xcr0 = XFEATURE_MASK_FP;
vcpu->arch.cr0 |= X86_CR0_ET;
}
@@ -7292,6 +7473,20 @@ int kvm_arch_hardware_setup(void)
if (r != 0)
return r;
+ if (kvm_has_tsc_control) {
+ /*
+ * Make sure the user can only configure tsc_khz values that
+ * fit into a signed integer.
+ * A min value is not calculated needed because it will always
+ * be 1 on all machines.
+ */
+ u64 max = min(0x7fffffffULL,
+ __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz));
+ kvm_max_guest_tsc_khz = max;
+
+ kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
+ }
+
kvm_init_msr_list();
return 0;
}
@@ -7319,7 +7514,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
{
- return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
+ return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu);
}
struct static_key kvm_no_apic_vcpu __read_mostly;
@@ -7388,6 +7583,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
kvm_async_pf_hash_reset(vcpu);
kvm_pmu_init(vcpu);
+ vcpu->arch.pending_external_vector = -1;
+
return 0;
fail_free_mce_banks:
@@ -7413,7 +7610,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
kvm_mmu_destroy(vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
free_page((unsigned long)vcpu->arch.pio_data);
- if (!irqchip_in_kernel(vcpu->kvm))
+ if (!lapic_in_kernel(vcpu))
static_key_slow_dec(&kvm_no_apic_vcpu);
}
@@ -8040,7 +8237,59 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+ if (kvm_x86_ops->update_pi_irte) {
+ irqfd->producer = prod;
+ return kvm_x86_ops->update_pi_irte(irqfd->kvm,
+ prod->irq, irqfd->gsi, 1);
+ }
+
+ return -EINVAL;
+}
+
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+ int ret;
+ struct kvm_kernel_irqfd *irqfd =
+ container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+ if (!kvm_x86_ops->update_pi_irte) {
+ WARN_ON(irqfd->producer != NULL);
+ return;
+ }
+
+ WARN_ON(irqfd->producer != prod);
+ irqfd->producer = NULL;
+
+ /*
+ * When producer of consumer is unregistered, we change back to
+ * remapped mode, so we can re-use the current implementation
+ * when the irq is masked/disabed or the consumer side (KVM
+ * int this case doesn't want to receive the interrupts.
+ */
+ ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+ if (ret)
+ printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
+ " fails: %d\n", irqfd->consumer.token, ret);
+}
+
+int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
+ uint32_t guest_irq, bool set)
+{
+ if (!kvm_x86_ops->update_pi_irte)
+ return -EINVAL;
+
+ return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
+}
+
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
@@ -8055,3 +8304,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);