27 files changed, 1454 insertions, 737 deletions
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index a1ff508bb..464fa477a 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -13,9 +13,10 @@ kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
 
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
 			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
-			   hyperv.o
+			   hyperv.o page_track.o
 
 kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)	+= assigned-dev.o iommu.o
+
 kvm-intel-y		+= vmx.o pmu_intel.o
 kvm-amd-y		+= svm.o pmu_amd.o
 
diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c
index 9dc091acd..308b8597c 100644
--- a/arch/x86/kvm/assigned-dev.c
+++ b/arch/x86/kvm/assigned-dev.c
@@ -51,11 +51,9 @@ struct kvm_assigned_dev_kernel {
 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
 						      int assigned_dev_id)
 {
-	struct list_head *ptr;
 	struct kvm_assigned_dev_kernel *match;
 
-	list_for_each(ptr, head) {
-		match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
+	list_for_each_entry(match, head, list) {
 		if (match->assigned_dev_id == assigned_dev_id)
 			return match;
 	}
@@ -373,14 +371,10 @@ static void kvm_free_assigned_device(struct kvm *kvm,
 
 void kvm_free_all_assigned_devices(struct kvm *kvm)
 {
-	struct list_head *ptr, *ptr2;
-	struct kvm_assigned_dev_kernel *assigned_dev;
-
-	list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
-		assigned_dev = list_entry(ptr,
-					  struct kvm_assigned_dev_kernel,
-					  list);
+	struct kvm_assigned_dev_kernel *assigned_dev, *tmp;
 
+	list_for_each_entry_safe(assigned_dev, tmp,
+				 &kvm->arch.assigned_dev_head, list) {
 		kvm_free_assigned_device(kvm, assigned_dev);
 	}
 }
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 6525e926f..bbbaa802d 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -46,11 +46,18 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted)
 	return ret;
 }
 
+bool kvm_mpx_supported(void)
+{
+	return ((host_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR))
+		 && kvm_x86_ops->mpx_supported());
+}
+EXPORT_SYMBOL_GPL(kvm_mpx_supported);
+
 u64 kvm_supported_xcr0(void)
 {
 	u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0;
 
-	if (!kvm_x86_ops->mpx_supported())
+	if (!kvm_mpx_supported())
 		xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
 
 	return xcr0;
@@ -81,6 +88,16 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
 			apic->lapic_timer.timer_mode_mask = 1 << 17;
 	}
 
+	best = kvm_find_cpuid_entry(vcpu, 7, 0);
+	if (best) {
+		/* Update OSPKE bit */
+		if (boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) {
+			best->ecx &= ~F(OSPKE);
+			if (kvm_read_cr4_bits(vcpu, X86_CR4_PKE))
+				best->ecx |= F(OSPKE);
+		}
+	}
+
 	best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
 	if (!best) {
 		vcpu->arch.guest_supported_xcr0 = 0;
@@ -97,8 +114,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
 	if (best && (best->eax & (F(XSAVES) | F(XSAVEC))))
 		best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
 
-	vcpu->arch.eager_fpu = use_eager_fpu() || guest_cpuid_has_mpx(vcpu);
-	if (vcpu->arch.eager_fpu)
+	if (use_eager_fpu())
 		kvm_x86_ops->fpu_activate(vcpu);
 
 	/*
@@ -295,11 +311,11 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 #endif
 	unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
 	unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
-	unsigned f_mpx = kvm_x86_ops->mpx_supported() ? F(MPX) : 0;
+	unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0;
 	unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
 
 	/* cpuid 1.edx */
-	const u32 kvm_supported_word0_x86_features =
+	const u32 kvm_cpuid_1_edx_x86_features =
 		F(FPU) | F(VME) | F(DE) | F(PSE) |
 		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
 		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
@@ -309,7 +325,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
 		0 /* HTT, TM, Reserved, PBE */;
 	/* cpuid 0x80000001.edx */
-	const u32 kvm_supported_word1_x86_features =
+	const u32 kvm_cpuid_8000_0001_edx_x86_features =
 		F(FPU) | F(VME) | F(DE) | F(PSE) |
 		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
 		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
@@ -319,7 +335,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
 		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
 	/* cpuid 1.ecx */
-	const u32 kvm_supported_word4_x86_features =
+	const u32 kvm_cpuid_1_ecx_x86_features =
 		/* NOTE: MONITOR (and MWAIT) are emulated as NOP,
 		 * but *not* advertised to guests via CPUID ! */
 		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
@@ -331,29 +347,32 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
 		F(F16C) | F(RDRAND);
 	/* cpuid 0x80000001.ecx */
-	const u32 kvm_supported_word6_x86_features =
+	const u32 kvm_cpuid_8000_0001_ecx_x86_features =
 		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
 		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
 		F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
 		0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
 
 	/* cpuid 0xC0000001.edx */
-	const u32 kvm_supported_word5_x86_features =
+	const u32 kvm_cpuid_C000_0001_edx_x86_features =
 		F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
 		F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
 		F(PMM) | F(PMM_EN);
 
 	/* cpuid 7.0.ebx */
-	const u32 kvm_supported_word9_x86_features =
+	const u32 kvm_cpuid_7_0_ebx_x86_features =
 		F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
 		F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
 		F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
 		F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT);
 
 	/* cpuid 0xD.1.eax */
-	const u32 kvm_supported_word10_x86_features =
+	const u32 kvm_cpuid_D_1_eax_x86_features =
 		F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves;
 
+	/* cpuid 7.0.ecx*/
+	const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/;
+
 	/* all calls to cpuid_count() should be made on the same cpu */
 	get_cpu();
 
@@ -370,10 +389,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		entry->eax = min(entry->eax, (u32)0xd);
 		break;
 	case 1:
-		entry->edx &= kvm_supported_word0_x86_features;
-		cpuid_mask(&entry->edx, 0);
-		entry->ecx &= kvm_supported_word4_x86_features;
-		cpuid_mask(&entry->ecx, 4);
+		entry->edx &= kvm_cpuid_1_edx_x86_features;
+		cpuid_mask(&entry->edx, CPUID_1_EDX);
+		entry->ecx &= kvm_cpuid_1_ecx_x86_features;
+		cpuid_mask(&entry->ecx, CPUID_1_ECX);
 		/* we support x2apic emulation even if host does not support
 		 * it since we emulate x2apic in software */
 		entry->ecx |= F(X2APIC);
@@ -427,14 +446,20 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 		/* Mask ebx against host capability word 9 */
 		if (index == 0) {
-			entry->ebx &= kvm_supported_word9_x86_features;
-			cpuid_mask(&entry->ebx, 9);
+			entry->ebx &= kvm_cpuid_7_0_ebx_x86_features;
+			cpuid_mask(&entry->ebx, CPUID_7_0_EBX);
 			// TSC_ADJUST is emulated
 			entry->ebx |= F(TSC_ADJUST);
-		} else
+			entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
+			cpuid_mask(&entry->ecx, CPUID_7_ECX);
+			/* PKU is not yet implemented for shadow paging. */
+			if (!tdp_enabled)
+				entry->ecx &= ~F(PKU);
+		} else {
 			entry->ebx = 0;
+			entry->ecx = 0;
+		}
 		entry->eax = 0;
-		entry->ecx = 0;
 		entry->edx = 0;
 		break;
 	}
@@ -508,7 +533,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 			do_cpuid_1_ent(&entry[i], function, idx);
 			if (idx == 1) {
-				entry[i].eax &= kvm_supported_word10_x86_features;
+				entry[i].eax &= kvm_cpuid_D_1_eax_x86_features;
+				cpuid_mask(&entry[i].eax, CPUID_D_1_EAX);
 				entry[i].ebx = 0;
 				if (entry[i].eax & (F(XSAVES)|F(XSAVEC)))
 					entry[i].ebx =
@@ -558,10 +584,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		entry->eax = min(entry->eax, 0x8000001a);
 		break;
 	case 0x80000001:
-		entry->edx &= kvm_supported_word1_x86_features;
-		cpuid_mask(&entry->edx, 1);
-		entry->ecx &= kvm_supported_word6_x86_features;
-		cpuid_mask(&entry->ecx, 6);
+		entry->edx &= kvm_cpuid_8000_0001_edx_x86_features;
+		cpuid_mask(&entry->edx, CPUID_8000_0001_EDX);
+		entry->ecx &= kvm_cpuid_8000_0001_ecx_x86_features;
+		cpuid_mask(&entry->ecx, CPUID_8000_0001_ECX);
 		break;
 	case 0x80000007: /* Advanced power management */
 		/* invariant TSC is CPUID.80000007H:EDX[8] */
@@ -594,8 +620,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		entry->eax = min(entry->eax, 0xC0000004);
 		break;
 	case 0xC0000001:
-		entry->edx &= kvm_supported_word5_x86_features;
-		cpuid_mask(&entry->edx, 5);
+		entry->edx &= kvm_cpuid_C000_0001_edx_x86_features;
+		cpuid_mask(&entry->edx, CPUID_C000_0001_EDX);
 		break;
 	case 3: /* Processor serial number */
 	case 5: /* MONITOR/MWAIT */
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index c8eda1498..e17a74b1d 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -5,6 +5,7 @@
 #include <asm/cpu.h>
 
 int kvm_update_cpuid(struct kvm_vcpu *vcpu);
+bool kvm_mpx_supported(void);
 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
 					      u32 function, u32 index);
 int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
@@ -79,6 +80,14 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
 	return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
 }
 
+static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 7, 0);
+	return best && (best->ecx & bit(X86_FEATURE_PKU));
+}
+
 static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
@@ -135,14 +144,6 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu)
 	return best && (best->ebx & bit(X86_FEATURE_RTM));
 }
 
-static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu)
-{
-	struct kvm_cpuid_entry2 *best;
-
-	best = kvm_find_cpuid_entry(vcpu, 7, 0);
-	return best && (best->ebx & bit(X86_FEATURE_MPX));
-}
-
 static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b9b09fec1..a2f24af3c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -309,23 +309,29 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
 
 static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
 
-#define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t"
+#define FOP_FUNC(name) \
+	".align " __stringify(FASTOP_SIZE) " \n\t" \
+	".type " name ", @function \n\t" \
+	name ":\n\t"
+
 #define FOP_RET   "ret \n\t"
 
 #define FOP_START(op) \
 	extern void em_##op(struct fastop *fake); \
 	asm(".pushsection .text, \"ax\" \n\t" \
 	    ".global em_" #op " \n\t" \
-            FOP_ALIGN \
-	    "em_" #op ": \n\t"
+	    FOP_FUNC("em_" #op)
 
 #define FOP_END \
 	    ".popsection")
 
-#define FOPNOP() FOP_ALIGN FOP_RET
+#define FOPNOP() \
+	FOP_FUNC(__stringify(__UNIQUE_ID(nop))) \
+	FOP_RET
 
 #define FOP1E(op,  dst) \
-	FOP_ALIGN "10: " #op " %" #dst " \n\t" FOP_RET
+	FOP_FUNC(#op "_" #dst) \
+	"10: " #op " %" #dst " \n\t" FOP_RET
 
 #define FOP1EEX(op,  dst) \
 	FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception)
@@ -357,7 +363,8 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
 	FOP_END
 
 #define FOP2E(op,  dst, src)	   \
-	FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET
+	FOP_FUNC(#op "_" #dst "_" #src) \
+	#op " %" #src ", %" #dst " \n\t" FOP_RET
 
 #define FASTOP2(op) \
 	FOP_START(op) \
@@ -395,7 +402,8 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
 	FOP_END
 
 #define FOP3E(op,  dst, src, src2) \
-	FOP_ALIGN #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET
+	FOP_FUNC(#op "_" #dst "_" #src "_" #src2) \
+	#op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET
 
 /* 3-operand, word-only, src2=cl */
 #define FASTOP3WCL(op) \
@@ -407,7 +415,12 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
 	FOP_END
 
 /* Special case for SETcc - 1 instruction per cc */
-#define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t"
+#define FOP_SETCC(op) \
+	".align 4 \n\t" \
+	".type " #op ", @function \n\t" \
+	#op ": \n\t" \
+	#op " %al \n\t" \
+	FOP_RET
 
 asm(".global kvm_fastop_exception \n"
     "kvm_fastop_exception: xor %esi, %esi; ret");
@@ -956,7 +969,7 @@ static int em_bsr_c(struct x86_emulate_ctxt *ctxt)
 	return fastop(ctxt, em_bsr);
 }
 
-static u8 test_cc(unsigned int condition, unsigned long flags)
+static __always_inline u8 test_cc(unsigned int condition, unsigned long flags)
 {
 	u8 rc;
 	void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf);
@@ -5097,13 +5110,17 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
 
 static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
 {
+	register void *__sp asm(_ASM_SP);
 	ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
+
 	if (!(ctxt->d & ByteOp))
 		fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE;
+
 	asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n"
 	    : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
-	      [fastop]"+S"(fop)
+	      [fastop]"+S"(fop), "+r"(__sp)
 	    : "c"(ctxt->src2.val));
+
 	ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
 	if (!fop) /* exception is returned in fop variable */
 		return emulate_de(ctxt);
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index c58ba6717..01bd7b7a6 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1043,6 +1043,27 @@ bool kvm_hv_hypercall_enabled(struct kvm *kvm)
 	return kvm->arch.hyperv.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
 }
 
+static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
+{
+	bool longmode;
+
+	longmode = is_64_bit_mode(vcpu);
+	if (longmode)
+		kvm_register_write(vcpu, VCPU_REGS_RAX, result);
+	else {
+		kvm_register_write(vcpu, VCPU_REGS_RDX, result >> 32);
+		kvm_register_write(vcpu, VCPU_REGS_RAX, result & 0xffffffff);
+	}
+}
+
+static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
+{
+	struct kvm_run *run = vcpu->run;
+
+	kvm_hv_hypercall_set_result(vcpu, run->hyperv.u.hcall.result);
+	return 1;
+}
+
 int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 {
 	u64 param, ingpa, outgpa, ret;
@@ -1055,7 +1076,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 	 */
 	if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
 		kvm_queue_exception(vcpu, UD_VECTOR);
-		return 0;
+		return 1;
 	}
 
 	longmode = is_64_bit_mode(vcpu);
@@ -1083,22 +1104,38 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 
 	trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
 
+	/* Hypercall continuation is not supported yet */
+	if (rep_cnt || rep_idx) {
+		res = HV_STATUS_INVALID_HYPERCALL_CODE;
+		goto set_result;
+	}
+
 	switch (code) {
-	case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
+	case HVCALL_NOTIFY_LONG_SPIN_WAIT:
 		kvm_vcpu_on_spin(vcpu);
 		break;
+	case HVCALL_POST_MESSAGE:
+	case HVCALL_SIGNAL_EVENT:
+		/* don't bother userspace if it has no way to handle it */
+		if (!vcpu_to_synic(vcpu)->active) {
+			res = HV_STATUS_INVALID_HYPERCALL_CODE;
+			break;
+		}
+		vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+		vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
+		vcpu->run->hyperv.u.hcall.input = param;
+		vcpu->run->hyperv.u.hcall.params[0] = ingpa;
+		vcpu->run->hyperv.u.hcall.params[1] = outgpa;
+		vcpu->arch.complete_userspace_io =
+				kvm_hv_hypercall_complete_userspace;
+		return 0;
 	default:
 		res = HV_STATUS_INVALID_HYPERCALL_CODE;
 		break;
 	}
 
+set_result:
 	ret = res | (((u64)rep_done & 0xfff) << 32);
-	if (longmode) {
-		kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
-	} else {
-		kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
-		kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
-	}
-
+	kvm_hv_hypercall_set_result(vcpu, ret);
 	return 1;
 }
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index ab5318727..a4bf5b45d 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -51,32 +51,9 @@
 #define RW_STATE_WORD0 3
 #define RW_STATE_WORD1 4
 
-/* Compute with 96 bit intermediate result: (a*b)/c */
-static u64 muldiv64(u64 a, u32 b, u32 c)
+static void pit_set_gate(struct kvm_pit *pit, int channel, u32 val)
 {
-	union {
-		u64 ll;
-		struct {
-			u32 low, high;
-		} l;
-	} u, res;
-	u64 rl, rh;
-
-	u.ll = a;
-	rl = (u64)u.l.low * (u64)b;
-	rh = (u64)u.l.high * (u64)b;
-	rh += (rl >> 32);
-	res.l.high = div64_u64(rh, c);
-	res.l.low = div64_u64(((mod_64(rh, c) << 32) + (rl & 0xffffffff)), c);
-	return res.ll;
-}
-
-static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
-{
-	struct kvm_kpit_channel_state *c =
-		&kvm->arch.vpit->pit_state.channels[channel];
-
-	WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+	struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
 
 	switch (c->mode) {
 	default:
@@ -97,18 +74,16 @@ static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
 	c->gate = val;
 }
 
-static int pit_get_gate(struct kvm *kvm, int channel)
+static int pit_get_gate(struct kvm_pit *pit, int channel)
 {
-	WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
-	return kvm->arch.vpit->pit_state.channels[channel].gate;
+	return pit->pit_state.channels[channel].gate;
 }
 
-static s64 __kpit_elapsed(struct kvm *kvm)
+static s64 __kpit_elapsed(struct kvm_pit *pit)
 {
 	s64 elapsed;
 	ktime_t remaining;
-	struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+	struct kvm_kpit_state *ps = &pit->pit_state;
 
 	if (!ps->period)
 		return 0;
@@ -128,26 +103,23 @@ static s64 __kpit_elapsed(struct kvm *kvm)
 	return elapsed;
 }
 
-static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c,
+static s64 kpit_elapsed(struct kvm_pit *pit, struct kvm_kpit_channel_state *c,
 			int channel)
 {
 	if (channel == 0)
-		return __kpit_elapsed(kvm);
+		return __kpit_elapsed(pit);
 
 	return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
 }
 
-static int pit_get_count(struct kvm *kvm, int channel)
+static int pit_get_count(struct kvm_pit *pit, int channel)
 {
-	struct kvm_kpit_channel_state *c =
-		&kvm->arch.vpit->pit_state.channels[channel];
+	struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
 	s64 d, t;
 	int counter;
 
-	WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
-	t = kpit_elapsed(kvm, c, channel);
-	d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+	t = kpit_elapsed(pit, c, channel);
+	d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC);
 
 	switch (c->mode) {
 	case 0:
@@ -167,17 +139,14 @@ static int pit_get_count(struct kvm *kvm, int channel)
 	return counter;
 }
 
-static int pit_get_out(struct kvm *kvm, int channel)
+static int pit_get_out(struct kvm_pit *pit, int channel)
 {
-	struct kvm_kpit_channel_state *c =
-		&kvm->arch.vpit->pit_state.channels[channel];
+	struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
 	s64 d, t;
 	int out;
 
-	WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
-
-	t = kpit_elapsed(kvm, c, channel);
-	d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+	t = kpit_elapsed(pit, c, channel);
+	d = mul_u64_u32_div(t, KVM_PIT_FREQ, NSEC_PER_SEC);
 
 	switch (c->mode) {
 	default:
@@ -202,29 +171,23 @@ static int pit_get_out(struct kvm *kvm, int channel)
 	return out;
 }
 
-static void pit_latch_count(struct kvm *kvm, int channel)
+static void pit_latch_count(struct kvm_pit *pit, int channel)
 {
-	struct kvm_kpit_channel_state *c =
-		&kvm->arch.vpit->pit_state.channels[channel];
-
-	WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+	struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
 
 	if (!c->count_latched) {
-		c->latched_count = pit_get_count(kvm, channel);
+		c->latched_count = pit_get_count(pit, channel);
 		c->count_latched = c->rw_mode;
 	}
 }
 
-static void pit_latch_status(struct kvm *kvm, int channel)
+static void pit_latch_status(struct kvm_pit *pit, int channel)
 {
-	struct kvm_kpit_channel_state *c =
-		&kvm->arch.vpit->pit_state.channels[channel];
-
-	WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+	struct kvm_kpit_channel_state *c = &pit->pit_state.channels[channel];
 
 	if (!c->status_latched) {
 		/* TODO: Return NULL COUNT (bit 6). */
-		c->status = ((pit_get_out(kvm, channel) << 7) |
+		c->status = ((pit_get_out(pit, channel) << 7) |
 				(c->rw_mode << 4) |
 				(c->mode << 1) |
 				c->bcd);
@@ -232,26 +195,24 @@ static void pit_latch_status(struct kvm *kvm, int channel)
 	}
 }
 
+static inline struct kvm_pit *pit_state_to_pit(struct kvm_kpit_state *ps)
+{
+	return container_of(ps, struct kvm_pit, pit_state);
+}
+
 static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
 {
 	struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
 						 irq_ack_notifier);
-	int value;
-
-	spin_lock(&ps->inject_lock);
-	value = atomic_dec_return(&ps->pending);
-	if (value < 0)
-		/* spurious acks can be generated if, for example, the
-		 * PIC is being reset.  Handle it gracefully here
-		 */
-		atomic_inc(&ps->pending);
-	else if (value > 0 && ps->reinject)
-		/* in this case, we had multiple outstanding pit interrupts
-		 * that we needed to inject.  Reinject
-		 */
-		queue_kthread_work(&ps->pit->worker, &ps->pit->expired);
-	ps->irq_ack = 1;
-	spin_unlock(&ps->inject_lock);
+	struct kvm_pit *pit = pit_state_to_pit(ps);
+
+	atomic_set(&ps->irq_ack, 1);
+	/* irq_ack should be set before pending is read.  Order accesses with
+	 * inc(pending) in pit_timer_fn and xchg(irq_ack, 0) in pit_do_work.
+	 */
+	smp_mb();
+	if (atomic_dec_if_positive(&ps->pending) > 0)
+		queue_kthread_work(&pit->worker, &pit->expired);
 }
 
 void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -282,44 +243,33 @@ static void pit_do_work(struct kthread_work *work)
 	struct kvm_vcpu *vcpu;
 	int i;
 	struct kvm_kpit_state *ps = &pit->pit_state;
-	int inject = 0;
 
-	/* Try to inject pending interrupts when
-	 * last one has been acked.
+	if (atomic_read(&ps->reinject) && !atomic_xchg(&ps->irq_ack, 0))
+		return;
+
+	kvm_set_irq(kvm, pit->irq_source_id, 0, 1, false);
+	kvm_set_irq(kvm, pit->irq_source_id, 0, 0, false);
+
+	/*
+	 * Provides NMI watchdog support via Virtual Wire mode.
+	 * The route is: PIT -> LVT0 in NMI mode.
+	 *
+	 * Note: Our Virtual Wire implementation does not follow
+	 * the MP specification.  We propagate a PIT interrupt to all
+	 * VCPUs and only when LVT0 is in NMI mode.  The interrupt can
+	 * also be simultaneously delivered through PIC and IOAPIC.
 	 */
-	spin_lock(&ps->inject_lock);
-	if (!ps->reinject)
-		inject = 1;
-	else if (ps->irq_ack) {
-		ps->irq_ack = 0;
-		inject = 1;
-	}
-	spin_unlock(&ps->inject_lock);
-	if (inject) {
-		kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1, false);
-		kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0, false);
-
-		/*
-		 * Provides NMI watchdog support via Virtual Wire mode.
-		 * The route is: PIT -> PIC -> LVT0 in NMI mode.
-		 *
-		 * Note: Our Virtual Wire implementation is simplified, only
-		 * propagating PIT interrupts to all VCPUs when they have set
-		 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
-		 * VCPU0, and only if its LVT0 is in EXTINT mode.
-		 */
-		if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0)
-			kvm_for_each_vcpu(i, vcpu, kvm)
-				kvm_apic_nmi_wd_deliver(vcpu);
-	}
+	if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0)
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			kvm_apic_nmi_wd_deliver(vcpu);
 }
 
 static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
 {
 	struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer);
-	struct kvm_pit *pt = ps->kvm->arch.vpit;
+	struct kvm_pit *pt = pit_state_to_pit(ps);
 
-	if (ps->reinject)
+	if (atomic_read(&ps->reinject))
 		atomic_inc(&ps->pending);
 
 	queue_kthread_work(&pt->worker, &pt->expired);
@@ -331,30 +281,54 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
 		return HRTIMER_NORESTART;
 }
 
-static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
+static inline void kvm_pit_reset_reinject(struct kvm_pit *pit)
+{
+	atomic_set(&pit->pit_state.pending, 0);
+	atomic_set(&pit->pit_state.irq_ack, 1);
+}
+
+void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
+{
+	struct kvm_kpit_state *ps = &pit->pit_state;
+	struct kvm *kvm = pit->kvm;
+
+	if (atomic_read(&ps->reinject) == reinject)
+		return;
+
+	if (reinject) {
+		/* The initial state is preserved while ps->reinject == 0. */
+		kvm_pit_reset_reinject(pit);
+		kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
+		kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+	} else {
+		kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
+		kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+	}
+
+	atomic_set(&ps->reinject, reinject);
+}
+
+static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period)
 {
-	struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+	struct kvm_kpit_state *ps = &pit->pit_state;
+	struct kvm *kvm = pit->kvm;
 	s64 interval;
 
 	if (!ioapic_in_kernel(kvm) ||
 	    ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
 		return;
 
-	interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
+	interval = mul_u64_u32_div(val, NSEC_PER_SEC, KVM_PIT_FREQ);
 
 	pr_debug("create pit timer, interval is %llu nsec\n", interval);
 
 	/* TODO The new value only affected after the retriggered */
 	hrtimer_cancel(&ps->timer);
-	flush_kthread_work(&ps->pit->expired);
+	flush_kthread_work(&pit->expired);
 	ps->period = interval;
 	ps->is_periodic = is_period;
 
-	ps->timer.function = pit_timer_fn;
-	ps->kvm = ps->pit->kvm;
-
-	atomic_set(&ps->pending, 0);
-	ps->irq_ack = 1;
+	kvm_pit_reset_reinject(pit);
 
 	/*
 	 * Do not allow the guest to program periodic timers with small
@@ -377,11 +351,9 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
 		      HRTIMER_MODE_ABS);
 }
 
-static void pit_load_count(struct kvm *kvm, int channel, u32 val)
+static void pit_load_count(struct kvm_pit *pit, int channel, u32 val)
 {
-	struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
-
-	WARN_ON(!mutex_is_locked(&ps->lock));
+	struct kvm_kpit_state *ps = &pit->pit_state;
 
 	pr_debug("load_count val is %d, channel is %d\n", val, channel);
 
@@ -406,29 +378,33 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
 	case 1:
         /* FIXME: enhance mode 4 precision */
 	case 4:
-		create_pit_timer(kvm, val, 0);
+		create_pit_timer(pit, val, 0);
 		break;
 	case 2:
 	case 3:
-		create_pit_timer(kvm, val, 1);
+		create_pit_timer(pit, val, 1);
 		break;
 	default:
-		destroy_pit_timer(kvm->arch.vpit);
+		destroy_pit_timer(pit);
 	}
 }
 
-void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start)
+void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
+		int hpet_legacy_start)
 {
 	u8 saved_mode;
+
+	WARN_ON_ONCE(!mutex_is_locked(&pit->pit_state.lock));
+
 	if (hpet_legacy_start) {
 		/* save existing mode for later reenablement */
 		WARN_ON(channel != 0);
-		saved_mode = kvm->arch.vpit->pit_state.channels[0].mode;
-		kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */
-		pit_load_count(kvm, channel, val);
-		kvm->arch.vpit->pit_state.channels[0].mode = saved_mode;
+		saved_mode = pit->pit_state.channels[0].mode;
+		pit->pit_state.channels[0].mode = 0xff; /* disable timer */
+		pit_load_count(pit, channel, val);
+		pit->pit_state.channels[0].mode = saved_mode;
 	} else {
-		pit_load_count(kvm, channel, val);
+		pit_load_count(pit, channel, val);
 	}
 }
 
@@ -454,7 +430,6 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu,
 {
 	struct kvm_pit *pit = dev_to_pit(this);
 	struct kvm_kpit_state *pit_state = &pit->pit_state;
-	struct kvm *kvm = pit->kvm;
 	int channel, access;
 	struct kvm_kpit_channel_state *s;
 	u32 val = *(u32 *) data;
@@ -478,9 +453,9 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu,
 				s = &pit_state->channels[channel];
 				if (val & (2 << channel)) {
 					if (!(val & 0x20))
-						pit_latch_count(kvm, channel);
+						pit_latch_count(pit, channel);
 					if (!(val & 0x10))
-						pit_latch_status(kvm, channel);
+						pit_latch_status(pit, channel);
 				}
 			}
 		} else {
@@ -488,7 +463,7 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu,
 			s = &pit_state->channels[channel];
 			access = (val >> 4) & KVM_PIT_CHANNEL_MASK;
 			if (access == 0) {
-				pit_latch_count(kvm, channel);
+				pit_latch_count(pit, channel);
 			} else {
 				s->rw_mode = access;
 				s->read_state = access;
@@ -505,17 +480,17 @@ static int pit_ioport_write(struct kvm_vcpu *vcpu,
 		switch (s->write_state) {
 		default:
 		case RW_STATE_LSB:
-			pit_load_count(kvm, addr, val);
+			pit_load_count(pit, addr, val);
 			break;
 		case RW_STATE_MSB:
-			pit_load_count(kvm, addr, val << 8);
+			pit_load_count(pit, addr, val << 8);
 			break;
 		case RW_STATE_WORD0:
 			s->write_latch = val;
 			s->write_state = RW_STATE_WORD1;
 			break;
 		case RW_STATE_WORD1:
-			pit_load_count(kvm, addr, s->write_latch | (val << 8));
+			pit_load_count(pit, addr, s->write_latch | (val << 8));
 			s->write_state = RW_STATE_WORD0;
 			break;
 		}
@@ -531,7 +506,6 @@ static int pit_ioport_read(struct kvm_vcpu *vcpu,
 {
 	struct kvm_pit *pit = dev_to_pit(this);
 	struct kvm_kpit_state *pit_state = &pit->pit_state;
-	struct kvm *kvm = pit->kvm;
 	int ret, count;
 	struct kvm_kpit_channel_state *s;
 	if (!pit_in_range(addr))
@@ -568,20 +542,20 @@ static int pit_ioport_read(struct kvm_vcpu *vcpu,
 		switch (s->read_state) {
 		default:
 		case RW_STATE_LSB:
-			count = pit_get_count(kvm, addr);
+			count = pit_get_count(pit, addr);
 			ret = count & 0xff;
 			break;
 		case RW_STATE_MSB:
-			count = pit_get_count(kvm, addr);
+			count = pit_get_count(pit, addr);
 			ret = (count >> 8) & 0xff;
 			break;
 		case RW_STATE_WORD0:
-			count = pit_get_count(kvm, addr);
+			count = pit_get_count(pit, addr);
 			ret = count & 0xff;
 			s->read_state = RW_STATE_WORD1;
 			break;
 		case RW_STATE_WORD1:
-			count = pit_get_count(kvm, addr);
+			count = pit_get_count(pit, addr);
 			ret = (count >> 8) & 0xff;
 			s->read_state = RW_STATE_WORD0;
 			break;
@@ -602,14 +576,13 @@ static int speaker_ioport_write(struct kvm_vcpu *vcpu,
 {
 	struct kvm_pit *pit = speaker_to_pit(this);
 	struct kvm_kpit_state *pit_state = &pit->pit_state;
-	struct kvm *kvm = pit->kvm;
 	u32 val = *(u32 *) data;
 	if (addr != KVM_SPEAKER_BASE_ADDRESS)
 		return -EOPNOTSUPP;
 
 	mutex_lock(&pit_state->lock);
 	pit_state->speaker_data_on = (val >> 1) & 1;
-	pit_set_gate(kvm, 2, val & 1);
+	pit_set_gate(pit, 2, val & 1);
 	mutex_unlock(&pit_state->lock);
 	return 0;
 }
@@ -620,7 +593,6 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu,
 {
 	struct kvm_pit *pit = speaker_to_pit(this);
 	struct kvm_kpit_state *pit_state = &pit->pit_state;
-	struct kvm *kvm = pit->kvm;
 	unsigned int refresh_clock;
 	int ret;
 	if (addr != KVM_SPEAKER_BASE_ADDRESS)
@@ -630,8 +602,8 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu,
 	refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1;
 
 	mutex_lock(&pit_state->lock);
-	ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(kvm, 2) |
-		(pit_get_out(kvm, 2) << 5) | (refresh_clock << 4));
+	ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(pit, 2) |
+		(pit_get_out(pit, 2) << 5) | (refresh_clock << 4));
 	if (len > sizeof(ret))
 		len = sizeof(ret);
 	memcpy(data, (char *)&ret, len);
@@ -639,33 +611,28 @@ static int speaker_ioport_read(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-void kvm_pit_reset(struct kvm_pit *pit)
+static void kvm_pit_reset(struct kvm_pit *pit)
 {
 	int i;
 	struct kvm_kpit_channel_state *c;
 
-	mutex_lock(&pit->pit_state.lock);
 	pit->pit_state.flags = 0;
 	for (i = 0; i < 3; i++) {
 		c = &pit->pit_state.channels[i];
 		c->mode = 0xff;
 		c->gate = (i != 2);
-		pit_load_count(pit->kvm, i, 0);
+		pit_load_count(pit, i, 0);
 	}
-	mutex_unlock(&pit->pit_state.lock);
 
-	atomic_set(&pit->pit_state.pending, 0);
-	pit->pit_state.irq_ack = 1;
+	kvm_pit_reset_reinject(pit);
 }
 
 static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
 {
 	struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
 
-	if (!mask) {
-		atomic_set(&pit->pit_state.pending, 0);
-		pit->pit_state.irq_ack = 1;
-	}
+	if (!mask)
+		kvm_pit_reset_reinject(pit);
 }
 
 static const struct kvm_io_device_ops pit_dev_ops = {
@@ -692,14 +659,10 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 		return NULL;
 
 	pit->irq_source_id = kvm_request_irq_source_id(kvm);
-	if (pit->irq_source_id < 0) {
-		kfree(pit);
-		return NULL;
-	}
+	if (pit->irq_source_id < 0)
+		goto fail_request;
 
 	mutex_init(&pit->pit_state.lock);
-	mutex_lock(&pit->pit_state.lock);
-	spin_lock_init(&pit->pit_state.inject_lock);
 
 	pid = get_pid(task_tgid(current));
 	pid_nr = pid_vnr(pid);
@@ -708,36 +671,30 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 	init_kthread_worker(&pit->worker);
 	pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker,
 				       "kvm-pit/%d", pid_nr);
-	if (IS_ERR(pit->worker_task)) {
-		mutex_unlock(&pit->pit_state.lock);
-		kvm_free_irq_source_id(kvm, pit->irq_source_id);
-		kfree(pit);
-		return NULL;
-	}
+	if (IS_ERR(pit->worker_task))
+		goto fail_kthread;
+
 	init_kthread_work(&pit->expired, pit_do_work);
 
-	kvm->arch.vpit = pit;
 	pit->kvm = kvm;
 
 	pit_state = &pit->pit_state;
-	pit_state->pit = pit;
 	hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	pit_state->timer.function = pit_timer_fn;
+
 	pit_state->irq_ack_notifier.gsi = 0;
 	pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
-	kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
-	pit_state->reinject = true;
-	mutex_unlock(&pit->pit_state.lock);
+	pit->mask_notifier.func = pit_mask_notifer;
 
 	kvm_pit_reset(pit);
 
-	pit->mask_notifier.func = pit_mask_notifer;
-	kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
+	kvm_pit_set_reinject(pit, true);
 
 	kvm_iodevice_init(&pit->dev, &pit_dev_ops);
 	ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS,
 				      KVM_PIT_MEM_LENGTH, &pit->dev);
 	if (ret < 0)
-		goto fail;
+		goto fail_register_pit;
 
 	if (flags & KVM_PIT_SPEAKER_DUMMY) {
 		kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
@@ -745,42 +702,35 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 					      KVM_SPEAKER_BASE_ADDRESS, 4,
 					      &pit->speaker_dev);
 		if (ret < 0)
-			goto fail_unregister;
+			goto fail_register_speaker;
 	}
 
 	return pit;
 
-fail_unregister:
+fail_register_speaker:
 	kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
-
-fail:
-	kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
-	kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
-	kvm_free_irq_source_id(kvm, pit->irq_source_id);
+fail_register_pit:
+	kvm_pit_set_reinject(pit, false);
 	kthread_stop(pit->worker_task);
+fail_kthread:
+	kvm_free_irq_source_id(kvm, pit->irq_source_id);
+fail_request:
 	kfree(pit);
 	return NULL;
 }
 
 void kvm_free_pit(struct kvm *kvm)
 {
-	struct hrtimer *timer;
-
-	if (kvm->arch.vpit) {
-		kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev);
-		kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
-					      &kvm->arch.vpit->speaker_dev);
-		kvm_unregister_irq_mask_notifier(kvm, 0,
-					       &kvm->arch.vpit->mask_notifier);
-		kvm_unregister_irq_ack_notifier(kvm,
-				&kvm->arch.vpit->pit_state.irq_ack_notifier);
-		mutex_lock(&kvm->arch.vpit->pit_state.lock);
-		timer = &kvm->arch.vpit->pit_state.timer;
-		hrtimer_cancel(timer);
-		flush_kthread_work(&kvm->arch.vpit->expired);
-		kthread_stop(kvm->arch.vpit->worker_task);
-		kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
-		mutex_unlock(&kvm->arch.vpit->pit_state.lock);
-		kfree(kvm->arch.vpit);
+	struct kvm_pit *pit = kvm->arch.vpit;
+
+	if (pit) {
+		kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
+		kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev);
+		kvm_pit_set_reinject(pit, false);
+		hrtimer_cancel(&pit->pit_state.timer);
+		flush_kthread_work(&pit->expired);
+		kthread_stop(pit->worker_task);
+		kvm_free_irq_source_id(kvm, pit->irq_source_id);
+		kfree(pit);
 	}
 }
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index c84990b42..2f5af0798 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -22,19 +22,18 @@ struct kvm_kpit_channel_state {
 };
 
 struct kvm_kpit_state {
+	/* All members before "struct mutex lock" are protected by the lock. */
 	struct kvm_kpit_channel_state channels[3];
 	u32 flags;
 	bool is_periodic;
 	s64 period; 				/* unit: ns */
 	struct hrtimer timer;
-	atomic_t pending;			/* accumulated triggered timers */
-	bool reinject;
-	struct kvm *kvm;
 	u32    speaker_data_on;
+
 	struct mutex lock;
-	struct kvm_pit *pit;
-	spinlock_t inject_lock;
-	unsigned long irq_ack;
+	atomic_t reinject;
+	atomic_t pending; /* accumulated triggered timers */
+	atomic_t irq_ack;
 	struct kvm_irq_ack_notifier irq_ack_notifier;
 };
 
@@ -57,9 +56,11 @@ struct kvm_pit {
 #define KVM_MAX_PIT_INTR_INTERVAL   HZ / 100
 #define KVM_PIT_CHANNEL_MASK	    0x3
 
-void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start);
 struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
 void kvm_free_pit(struct kvm *kvm);
-void kvm_pit_reset(struct kvm_pit *pit);
+
+void kvm_pit_load_count(struct kvm_pit *pit, int channel, u32 val,
+		int hpet_legacy_start);
+void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject);
 
 #endif
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 1facfd60b..9db47090e 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -94,7 +94,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
 static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic)
 {
 	ioapic->rtc_status.pending_eoi = 0;
-	bitmap_zero(ioapic->rtc_status.dest_map, KVM_MAX_VCPUS);
+	bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPUS);
 }
 
 static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic);
@@ -117,16 +117,16 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
 		return;
 
 	new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector);
-	old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
+	old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map);
 
 	if (new_val == old_val)
 		return;
 
 	if (new_val) {
-		__set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
+		__set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map);
 		ioapic->rtc_status.pending_eoi++;
 	} else {
-		__clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
+		__clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map);
 		ioapic->rtc_status.pending_eoi--;
 		rtc_status_pending_eoi_check_valid(ioapic);
 	}
@@ -156,7 +156,8 @@ static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic)
 
 static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu)
 {
-	if (test_and_clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map)) {
+	if (test_and_clear_bit(vcpu->vcpu_id,
+			       ioapic->rtc_status.dest_map.map)) {
 		--ioapic->rtc_status.pending_eoi;
 		rtc_status_pending_eoi_check_valid(ioapic);
 	}
@@ -236,10 +237,17 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
 void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors)
 {
 	struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+	struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
 	union kvm_ioapic_redirect_entry *e;
 	int index;
 
 	spin_lock(&ioapic->lock);
+
+	/* Make sure we see any missing RTC EOI */
+	if (test_bit(vcpu->vcpu_id, dest_map->map))
+		__set_bit(dest_map->vectors[vcpu->vcpu_id],
+			  ioapic_handled_vectors);
+
 	for (index = 0; index < IOAPIC_NUM_PINS; index++) {
 		e = &ioapic->redirtbl[index];
 		if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
@@ -346,7 +354,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
 		 */
 		BUG_ON(ioapic->rtc_status.pending_eoi != 0);
 		ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe,
-				ioapic->rtc_status.dest_map);
+					       &ioapic->rtc_status.dest_map);
 		ioapic->rtc_status.pending_eoi = (ret < 0 ? 0 : ret);
 	} else
 		ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL);
@@ -407,8 +415,14 @@ static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
 static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
 			struct kvm_ioapic *ioapic, int vector, int trigger_mode)
 {
-	int i;
+	struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
 	struct kvm_lapic *apic = vcpu->arch.apic;
+	int i;
+
+	/* RTC special handling */
+	if (test_bit(vcpu->vcpu_id, dest_map->map) &&
+	    vector == dest_map->vectors[vcpu->vcpu_id])
+		rtc_irq_eoi(ioapic, vcpu);
 
 	for (i = 0; i < IOAPIC_NUM_PINS; i++) {
 		union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
@@ -416,8 +430,6 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
 		if (ent->fields.vector != vector)
 			continue;
 
-		if (i == RTC_GSI)
-			rtc_irq_eoi(ioapic, vcpu);
 		/*
 		 * We are dropping lock while calling ack notifiers because ack
 		 * notifier callbacks for assigned devices call into IOAPIC
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 2d16dc251..7d2692a49 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -40,9 +40,21 @@ struct kvm_vcpu;
 #define RTC_GSI -1U
 #endif
 
+struct dest_map {
+	/* vcpu bitmap where IRQ has been sent */
+	DECLARE_BITMAP(map, KVM_MAX_VCPUS);
+
+	/*
+	 * Vector sent to a given vcpu, only valid when
+	 * the vcpu's bit in map is set
+	 */
+	u8 vectors[KVM_MAX_VCPUS];
+};
+
+
 struct rtc_status {
 	int pending_eoi;
-	DECLARE_BITMAP(dest_map, KVM_MAX_VCPUS);
+	struct dest_map dest_map;
 };
 
 union kvm_ioapic_redirect_entry {
@@ -118,7 +130,8 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
 		       int level, bool line_status);
 void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
 int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
-		struct kvm_lapic_irq *irq, unsigned long *dest_map);
+			     struct kvm_lapic_irq *irq,
+			     struct dest_map *dest_map);
 int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
 int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
 void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 3982b479b..95fcc7b13 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -33,7 +33,10 @@
  */
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
-	return apic_has_pending_timer(vcpu);
+	if (lapic_in_kernel(vcpu))
+		return apic_has_pending_timer(vcpu);
+
+	return 0;
 }
 EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
 
@@ -137,8 +140,8 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
 
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 {
-	kvm_inject_apic_timer_irqs(vcpu);
-	/* TODO: PIT, RTC etc. */
+	if (lapic_in_kernel(vcpu))
+		kvm_inject_apic_timer_irqs(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
 
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index ae5c78f23..61ebdc13a 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -109,14 +109,6 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
 	return ret;
 }
 
-static inline int lapic_in_kernel(struct kvm_vcpu *vcpu)
-{
-	/* Same as irqchip_in_kernel(vcpu->kvm), but with less
-	 * pointer chasing and no unnecessary memory barriers.
-	 */
-	return vcpu->arch.apic != NULL;
-}
-
 void kvm_pic_reset(struct kvm_kpic_state *s);
 
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 8fc89efb5..54ead79e4 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -34,6 +34,7 @@
 #include "lapic.h"
 
 #include "hyperv.h"
+#include "x86.h"
 
 static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
 			   struct kvm *kvm, int irq_source_id, int level,
@@ -53,10 +54,12 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
 }
 
 int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
-		struct kvm_lapic_irq *irq, unsigned long *dest_map)
+		struct kvm_lapic_irq *irq, struct dest_map *dest_map)
 {
 	int i, r = -1;
 	struct kvm_vcpu *vcpu, *lowest = NULL;
+	unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
+	unsigned int dest_vcpus = 0;
 
 	if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
 			kvm_lowest_prio_delivery(irq)) {
@@ -67,6 +70,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 	if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
 		return r;
 
+	memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
+
 	kvm_for_each_vcpu(i, vcpu, kvm) {
 		if (!kvm_apic_present(vcpu))
 			continue;
@@ -80,13 +85,25 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 				r = 0;
 			r += kvm_apic_set_irq(vcpu, irq, dest_map);
 		} else if (kvm_lapic_enabled(vcpu)) {
-			if (!lowest)
-				lowest = vcpu;
-			else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
-				lowest = vcpu;
+			if (!kvm_vector_hashing_enabled()) {
+				if (!lowest)
+					lowest = vcpu;
+				else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
+					lowest = vcpu;
+			} else {
+				__set_bit(i, dest_vcpu_bitmap);
+				dest_vcpus++;
+			}
 		}
 	}
 
+	if (dest_vcpus != 0) {
+		int idx = kvm_vector_to_index(irq->vector, dest_vcpus,
+					dest_vcpu_bitmap, KVM_MAX_VCPUS);
+
+		lowest = kvm_get_vcpu(kvm, idx);
+	}
+
 	if (lowest)
 		r = kvm_apic_set_irq(lowest, irq, dest_map);
 
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index e1e89ee4a..762cdf259 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -84,6 +84,11 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
 		| ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
 }
 
+static inline u32 kvm_read_pkru(struct kvm_vcpu *vcpu)
+{
+	return kvm_x86_ops->get_pkru(vcpu);
+}
+
 static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.hflags |= HF_GUEST_MASK;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 3a045f39e..1a2da0e5a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -281,7 +281,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
 	struct kvm_cpuid_entry2 *feat;
 	u32 v = APIC_VERSION;
 
-	if (!kvm_vcpu_has_lapic(vcpu))
+	if (!lapic_in_kernel(vcpu))
 		return;
 
 	feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
@@ -475,26 +475,20 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
 
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
 {
-	int highest_irr;
-
 	/* This may race with setting of irr in __apic_accept_irq() and
 	 * value returned may be wrong, but kvm_vcpu_kick() in __apic_accept_irq
 	 * will cause vmexit immediately and the value will be recalculated
 	 * on the next vmentry.
 	 */
-	if (!kvm_vcpu_has_lapic(vcpu))
-		return 0;
-	highest_irr = apic_find_highest_irr(vcpu->arch.apic);
-
-	return highest_irr;
+	return apic_find_highest_irr(vcpu->arch.apic);
 }
 
 static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 			     int vector, int level, int trig_mode,
-			     unsigned long *dest_map);
+			     struct dest_map *dest_map);
 
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
-		unsigned long *dest_map)
+		     struct dest_map *dest_map)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
@@ -675,8 +669,33 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 	}
 }
 
+int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
+		       const unsigned long *bitmap, u32 bitmap_size)
+{
+	u32 mod;
+	int i, idx = -1;
+
+	mod = vector % dest_vcpus;
+
+	for (i = 0; i <= mod; i++) {
+		idx = find_next_bit(bitmap, bitmap_size, idx + 1);
+		BUG_ON(idx == bitmap_size);
+	}
+
+	return idx;
+}
+
+static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
+{
+	if (!kvm->arch.disabled_lapic_found) {
+		kvm->arch.disabled_lapic_found = true;
+		printk(KERN_INFO
+		       "Disabled LAPIC found during irq injection\n");
+	}
+}
+
 bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
-		struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map)
+		struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
 {
 	struct kvm_apic_map *map;
 	unsigned long bitmap = 1;
@@ -727,21 +746,42 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 
 		dst = map->logical_map[cid];
 
-		if (kvm_lowest_prio_delivery(irq)) {
+		if (!kvm_lowest_prio_delivery(irq))
+			goto set_irq;
+
+		if (!kvm_vector_hashing_enabled()) {
 			int l = -1;
 			for_each_set_bit(i, &bitmap, 16) {
 				if (!dst[i])
 					continue;
 				if (l < 0)
 					l = i;
-				else if (kvm_apic_compare_prio(dst[i]->vcpu, dst[l]->vcpu) < 0)
+				else if (kvm_apic_compare_prio(dst[i]->vcpu,
+							dst[l]->vcpu) < 0)
 					l = i;
 			}
-
 			bitmap = (l >= 0) ? 1 << l : 0;
+		} else {
+			int idx;
+			unsigned int dest_vcpus;
+
+			dest_vcpus = hweight16(bitmap);
+			if (dest_vcpus == 0)
+				goto out;
+
+			idx = kvm_vector_to_index(irq->vector,
+				dest_vcpus, &bitmap, 16);
+
+			if (!dst[idx]) {
+				kvm_apic_disabled_lapic_found(kvm);
+				goto out;
+			}
+
+			bitmap = (idx >= 0) ? 1 << idx : 0;
 		}
 	}
 
+set_irq:
 	for_each_set_bit(i, &bitmap, 16) {
 		if (!dst[i])
 			continue;
@@ -754,6 +794,20 @@ out:
 	return ret;
 }
 
+/*
+ * This routine tries to handler interrupts in posted mode, here is how
+ * it deals with different cases:
+ * - For single-destination interrupts, handle it in posted mode
+ * - Else if vector hashing is enabled and it is a lowest-priority
+ *   interrupt, handle it in posted mode and use the following mechanism
+ *   to find the destinaiton vCPU.
+ *	1. For lowest-priority interrupts, store all the possible
+ *	   destination vCPUs in an array.
+ *	2. Use "guest vector % max number of destination vCPUs" to find
+ *	   the right destination vCPU in the array for the lowest-priority
+ *	   interrupt.
+ * - Otherwise, use remapped mode to inject the interrupt.
+ */
 bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
 			struct kvm_vcpu **dest_vcpu)
 {
@@ -795,16 +849,37 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
 		if (cid >= ARRAY_SIZE(map->logical_map))
 			goto out;
 
-		for_each_set_bit(i, &bitmap, 16) {
-			dst = map->logical_map[cid][i];
-			if (++r == 2)
+		if (kvm_vector_hashing_enabled() &&
+				kvm_lowest_prio_delivery(irq)) {
+			int idx;
+			unsigned int dest_vcpus;
+
+			dest_vcpus = hweight16(bitmap);
+			if (dest_vcpus == 0)
 				goto out;
-		}
 
-		if (dst && kvm_apic_present(dst->vcpu))
+			idx = kvm_vector_to_index(irq->vector, dest_vcpus,
+						  &bitmap, 16);
+
+			dst = map->logical_map[cid][idx];
+			if (!dst) {
+				kvm_apic_disabled_lapic_found(kvm);
+				goto out;
+			}
+
 			*dest_vcpu = dst->vcpu;
-		else
-			goto out;
+		} else {
+			for_each_set_bit(i, &bitmap, 16) {
+				dst = map->logical_map[cid][i];
+				if (++r == 2)
+					goto out;
+			}
+
+			if (dst && kvm_apic_present(dst->vcpu))
+				*dest_vcpu = dst->vcpu;
+			else
+				goto out;
+		}
 	}
 
 	ret = true;
@@ -819,7 +894,7 @@ out:
  */
 static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 			     int vector, int level, int trig_mode,
-			     unsigned long *dest_map)
+			     struct dest_map *dest_map)
 {
 	int result = 0;
 	struct kvm_vcpu *vcpu = apic->vcpu;
@@ -839,8 +914,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 
 		result = 1;
 
-		if (dest_map)
-			__set_bit(vcpu->vcpu_id, dest_map);
+		if (dest_map) {
+			__set_bit(vcpu->vcpu_id, dest_map->map);
+			dest_map->vectors[vcpu->vcpu_id] = vector;
+		}
 
 		if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
 			if (trig_mode)
@@ -1239,7 +1316,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	u64 guest_tsc, tsc_deadline;
 
-	if (!kvm_vcpu_has_lapic(vcpu))
+	if (!lapic_in_kernel(vcpu))
 		return;
 
 	if (apic->lapic_timer.expired_tscdeadline == 0)
@@ -1292,7 +1369,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 
 		hrtimer_start(&apic->lapic_timer.timer,
 			      ktime_add_ns(now, apic->lapic_timer.period),
-			      HRTIMER_MODE_ABS);
+			      HRTIMER_MODE_ABS_PINNED);
 
 		apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
 			   PRIx64 ", "
@@ -1325,7 +1402,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 			expire = ktime_add_ns(now, ns);
 			expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
 			hrtimer_start(&apic->lapic_timer.timer,
-				      expire, HRTIMER_MODE_ABS);
+				      expire, HRTIMER_MODE_ABS_PINNED);
 		} else
 			apic_timer_expired(apic);
 
@@ -1515,8 +1592,7 @@ static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
 
 void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
 {
-	if (kvm_vcpu_has_lapic(vcpu))
-		apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
+	apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
 
@@ -1566,7 +1642,7 @@ u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
-	if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+	if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) ||
 			apic_lvtt_period(apic))
 		return 0;
 
@@ -1577,7 +1653,7 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
-	if (!kvm_vcpu_has_lapic(vcpu) || apic_lvtt_oneshot(apic) ||
+	if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) ||
 			apic_lvtt_period(apic))
 		return;
 
@@ -1590,9 +1666,6 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
-	if (!kvm_vcpu_has_lapic(vcpu))
-		return;
-
 	apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
 		     | (kvm_apic_get_reg(apic, APIC_TASKPRI) & 4));
 }
@@ -1601,9 +1674,6 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
 {
 	u64 tpr;
 
-	if (!kvm_vcpu_has_lapic(vcpu))
-		return 0;
-
 	tpr = (u64) kvm_apic_get_reg(vcpu->arch.apic, APIC_TASKPRI);
 
 	return (tpr & 0xf0) >> 4;
@@ -1728,8 +1798,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
-	if (kvm_vcpu_has_lapic(vcpu) && apic_enabled(apic) &&
-			apic_lvt_enabled(apic, APIC_LVTT))
+	if (apic_enabled(apic) && apic_lvt_enabled(apic, APIC_LVTT))
 		return atomic_read(&apic->lapic_timer.pending);
 
 	return 0;
@@ -1799,7 +1868,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
 	apic->vcpu = vcpu;
 
 	hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
-		     HRTIMER_MODE_ABS);
+		     HRTIMER_MODE_ABS_PINNED);
 	apic->lapic_timer.timer.function = apic_timer_fn;
 
 	/*
@@ -1826,7 +1895,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	int highest_irr;
 
-	if (!kvm_vcpu_has_lapic(vcpu) || !apic_enabled(apic))
+	if (!apic_enabled(apic))
 		return -1;
 
 	apic_update_ppr(apic);
@@ -1854,9 +1923,6 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
-	if (!kvm_vcpu_has_lapic(vcpu))
-		return;
-
 	if (atomic_read(&apic->lapic_timer.pending) > 0) {
 		kvm_apic_local_deliver(apic, APIC_LVTT);
 		if (apic_lvtt_tscdeadline(apic))
@@ -1932,12 +1998,12 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
 {
 	struct hrtimer *timer;
 
-	if (!kvm_vcpu_has_lapic(vcpu))
+	if (!lapic_in_kernel(vcpu))
 		return;
 
 	timer = &vcpu->arch.apic->lapic_timer.timer;
 	if (hrtimer_cancel(timer))
-		hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
 }
 
 /*
@@ -2105,7 +2171,7 @@ int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
-	if (!kvm_vcpu_has_lapic(vcpu))
+	if (!lapic_in_kernel(vcpu))
 		return 1;
 
 	/* if this is ICR write vector before command */
@@ -2119,7 +2185,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	u32 low, high = 0;
 
-	if (!kvm_vcpu_has_lapic(vcpu))
+	if (!lapic_in_kernel(vcpu))
 		return 1;
 
 	if (apic_reg_read(apic, reg, 4, &low))
@@ -2151,7 +2217,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
 	u8 sipi_vector;
 	unsigned long pe;
 
-	if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events)
+	if (!lapic_in_kernel(vcpu) || !apic->pending_events)
 		return;
 
 	/*
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 41bdb35b4..f71183e50 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -42,6 +42,9 @@ struct kvm_lapic {
 	unsigned long pending_events;
 	unsigned int sipi_vector;
 };
+
+struct dest_map;
+
 int kvm_create_lapic(struct kvm_vcpu *vcpu);
 void kvm_free_lapic(struct kvm_vcpu *vcpu);
 
@@ -60,11 +63,11 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu);
 void __kvm_apic_update_irr(u32 *pir, void *regs);
 void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
-		unsigned long *dest_map);
+		     struct dest_map *dest_map);
 int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
 
 bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
-		struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map);
+		struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map);
 
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
 int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
@@ -103,7 +106,7 @@ static inline u32 kvm_apic_get_reg(struct kvm_lapic *apic, int reg_off)
 
 extern struct static_key kvm_no_apic_vcpu;
 
-static inline bool kvm_vcpu_has_lapic(struct kvm_vcpu *vcpu)
+static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu)
 {
 	if (static_key_false(&kvm_no_apic_vcpu))
 		return vcpu->arch.apic;
@@ -130,7 +133,7 @@ static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic)
 
 static inline bool kvm_apic_present(struct kvm_vcpu *vcpu)
 {
-	return kvm_vcpu_has_lapic(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic);
+	return lapic_in_kernel(vcpu) && kvm_apic_hw_enabled(vcpu->arch.apic);
 }
 
 static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
@@ -150,7 +153,7 @@ static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu)
 
 static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
 {
-	return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events;
+	return lapic_in_kernel(vcpu) && vcpu->arch.apic->pending_events;
 }
 
 static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
@@ -161,7 +164,7 @@ static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
 
 static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
 {
-	return kvm_vcpu_has_lapic(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+	return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
 }
 
 static inline int kvm_apic_id(struct kvm_lapic *apic)
@@ -175,4 +178,6 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu);
 
 bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
 			struct kvm_vcpu **dest_vcpu);
+int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
+			const unsigned long *bitmap, u32 bitmap_size);
 #endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1e7a49bfc..b6f50e8b0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -41,6 +41,7 @@
 #include <asm/cmpxchg.h>
 #include <asm/io.h>
 #include <asm/vmx.h>
+#include <asm/kvm_page_track.h>
 
 /*
  * When setting this variable to true it enables Two-Dimensional-Paging
@@ -478,7 +479,7 @@ static bool spte_is_locklessly_modifiable(u64 spte)
 static bool spte_has_volatile_bits(u64 spte)
 {
 	/*
-	 * Always atomicly update spte if it can be updated
+	 * Always atomically update spte if it can be updated
 	 * out of mmu-lock, it can ensure dirty bit is not lost,
 	 * also, it can help us to get a stable is_writable_pte()
 	 * to ensure tlb flush is not missed.
@@ -549,15 +550,22 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
 
 	/*
 	 * For the spte updated out of mmu-lock is safe, since
-	 * we always atomicly update it, see the comments in
+	 * we always atomically update it, see the comments in
 	 * spte_has_volatile_bits().
 	 */
 	if (spte_is_locklessly_modifiable(old_spte) &&
 	      !is_writable_pte(new_spte))
 		ret = true;
 
-	if (!shadow_accessed_mask)
+	if (!shadow_accessed_mask) {
+		/*
+		 * We don't set page dirty when dropping non-writable spte.
+		 * So do it now if the new spte is becoming non-writable.
+		 */
+		if (ret)
+			kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 		return ret;
+	}
 
 	/*
 	 * Flush TLB when accessed/dirty bits are changed in the page tables,
@@ -604,7 +612,8 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
 
 	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
 		kvm_set_pfn_accessed(pfn);
-	if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
+	if (old_spte & (shadow_dirty_mask ? shadow_dirty_mask :
+					    PT_WRITABLE_MASK))
 		kvm_set_pfn_dirty(pfn);
 	return 1;
 }
@@ -631,12 +640,12 @@ static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
 	 * kvm_flush_remote_tlbs() IPI to all active vcpus.
 	 */
 	local_irq_disable();
-	vcpu->mode = READING_SHADOW_PAGE_TABLES;
+
 	/*
 	 * Make sure a following spte read is not reordered ahead of the write
 	 * to vcpu->mode.
 	 */
-	smp_mb();
+	smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
 }
 
 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
@@ -646,8 +655,7 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
 	 * reads to sptes.  If it does, kvm_commit_zap_page() can see us
 	 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
 	 */
-	smp_mb();
-	vcpu->mode = OUTSIDE_GUEST_MODE;
+	smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
 	local_irq_enable();
 }
 
@@ -776,62 +784,85 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
 	return &slot->arch.lpage_info[level - 2][idx];
 }
 
+static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
+					    gfn_t gfn, int count)
+{
+	struct kvm_lpage_info *linfo;
+	int i;
+
+	for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
+		linfo = lpage_info_slot(gfn, slot, i);
+		linfo->disallow_lpage += count;
+		WARN_ON(linfo->disallow_lpage < 0);
+	}
+}
+
+void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+	update_gfn_disallow_lpage_count(slot, gfn, 1);
+}
+
+void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+	update_gfn_disallow_lpage_count(slot, gfn, -1);
+}
+
 static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *slot;
-	struct kvm_lpage_info *linfo;
 	gfn_t gfn;
-	int i;
 
+	kvm->arch.indirect_shadow_pages++;
 	gfn = sp->gfn;
 	slots = kvm_memslots_for_spte_role(kvm, sp->role);
 	slot = __gfn_to_memslot(slots, gfn);
-	for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
-		linfo = lpage_info_slot(gfn, slot, i);
-		linfo->write_count += 1;
-	}
-	kvm->arch.indirect_shadow_pages++;
+
+	/* the non-leaf shadow pages are keeping readonly. */
+	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+		return kvm_slot_page_track_add_page(kvm, slot, gfn,
+						    KVM_PAGE_TRACK_WRITE);
+
+	kvm_mmu_gfn_disallow_lpage(slot, gfn);
 }
 
 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *slot;
-	struct kvm_lpage_info *linfo;
 	gfn_t gfn;
-	int i;
 
+	kvm->arch.indirect_shadow_pages--;
 	gfn = sp->gfn;
 	slots = kvm_memslots_for_spte_role(kvm, sp->role);
 	slot = __gfn_to_memslot(slots, gfn);
-	for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
-		linfo = lpage_info_slot(gfn, slot, i);
-		linfo->write_count -= 1;
-		WARN_ON(linfo->write_count < 0);
-	}
-	kvm->arch.indirect_shadow_pages--;
+	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+		return kvm_slot_page_track_remove_page(kvm, slot, gfn,
+						       KVM_PAGE_TRACK_WRITE);
+
+	kvm_mmu_gfn_allow_lpage(slot, gfn);
 }
 
-static int __has_wrprotected_page(gfn_t gfn, int level,
-				  struct kvm_memory_slot *slot)
+static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
+					  struct kvm_memory_slot *slot)
 {
 	struct kvm_lpage_info *linfo;
 
 	if (slot) {
 		linfo = lpage_info_slot(gfn, slot, level);
-		return linfo->write_count;
+		return !!linfo->disallow_lpage;
 	}
 
-	return 1;
+	return true;
 }
 
-static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
+static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
+					int level)
 {
 	struct kvm_memory_slot *slot;
 
 	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
-	return __has_wrprotected_page(gfn, level, slot);
+	return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
 }
 
 static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
@@ -897,7 +928,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
 	max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
 
 	for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
-		if (__has_wrprotected_page(large_gfn, level, slot))
+		if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
 			break;
 
 	return level - 1;
@@ -1323,23 +1354,29 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 		kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
 }
 
-static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+				    struct kvm_memory_slot *slot, u64 gfn)
 {
-	struct kvm_memory_slot *slot;
 	struct kvm_rmap_head *rmap_head;
 	int i;
 	bool write_protected = false;
 
-	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
-
 	for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
 		rmap_head = __gfn_to_rmap(gfn, i, slot);
-		write_protected |= __rmap_write_protect(vcpu->kvm, rmap_head, true);
+		write_protected |= __rmap_write_protect(kvm, rmap_head, true);
 	}
 
 	return write_protected;
 }
 
+static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+{
+	struct kvm_memory_slot *slot;
+
+	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+	return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
+}
+
 static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
 {
 	u64 *sptep;
@@ -1754,7 +1791,7 @@ static void mark_unsync(u64 *spte)
 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
 			       struct kvm_mmu_page *sp)
 {
-	return 1;
+	return 0;
 }
 
 static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
@@ -1840,13 +1877,16 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
 	return nr_unsync_leaf;
 }
 
+#define INVALID_INDEX (-1)
+
 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
 			   struct kvm_mmu_pages *pvec)
 {
+	pvec->nr = 0;
 	if (!sp->unsync_children)
 		return 0;
 
-	mmu_pages_add(pvec, sp, 0);
+	mmu_pages_add(pvec, sp, INVALID_INDEX);
 	return __mmu_unsync_walk(sp, pvec);
 }
 
@@ -1883,37 +1923,35 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 		if ((_sp)->role.direct || (_sp)->role.invalid) {} else
 
 /* @sp->gfn should be write-protected at the call site */
-static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-			   struct list_head *invalid_list, bool clear_unsync)
+static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			    struct list_head *invalid_list)
 {
 	if (sp->role.cr4_pae != !!is_pae(vcpu)) {
 		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
-		return 1;
+		return false;
 	}
 
-	if (clear_unsync)
-		kvm_unlink_unsync_page(vcpu->kvm, sp);
-
-	if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
+	if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
 		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
-		return 1;
+		return false;
 	}
 
-	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-	return 0;
+	return true;
 }
 
-static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
-				   struct kvm_mmu_page *sp)
+static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
+				 struct list_head *invalid_list,
+				 bool remote_flush, bool local_flush)
 {
-	LIST_HEAD(invalid_list);
-	int ret;
-
-	ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
-	if (ret)
-		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+	if (!list_empty(invalid_list)) {
+		kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list);
+		return;
+	}
 
-	return ret;
+	if (remote_flush)
+		kvm_flush_remote_tlbs(vcpu->kvm);
+	else if (local_flush)
+		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 }
 
 #ifdef CONFIG_KVM_MMU_AUDIT
@@ -1923,46 +1961,38 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
 static void mmu_audit_disable(void) { }
 #endif
 
-static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			 struct list_head *invalid_list)
 {
-	return __kvm_sync_page(vcpu, sp, invalid_list, true);
+	kvm_unlink_unsync_page(vcpu->kvm, sp);
+	return __kvm_sync_page(vcpu, sp, invalid_list);
 }
 
 /* @gfn should be write-protected at the call site */
-static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
+static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
+			   struct list_head *invalid_list)
 {
 	struct kvm_mmu_page *s;
-	LIST_HEAD(invalid_list);
-	bool flush = false;
+	bool ret = false;
 
 	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
 		if (!s->unsync)
 			continue;
 
 		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
-		kvm_unlink_unsync_page(vcpu->kvm, s);
-		if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
-			(vcpu->arch.mmu.sync_page(vcpu, s))) {
-			kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
-			continue;
-		}
-		flush = true;
+		ret |= kvm_sync_page(vcpu, s, invalid_list);
 	}
 
-	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-	if (flush)
-		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+	return ret;
 }
 
 struct mmu_page_path {
-	struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
-	unsigned int idx[PT64_ROOT_LEVEL-1];
+	struct kvm_mmu_page *parent[PT64_ROOT_LEVEL];
+	unsigned int idx[PT64_ROOT_LEVEL];
 };
 
 #define for_each_sp(pvec, sp, parents, i)			\
-		for (i = mmu_pages_next(&pvec, &parents, -1),	\
-			sp = pvec.page[i].sp;			\
+		for (i = mmu_pages_first(&pvec, &parents);	\
 			i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});	\
 			i = mmu_pages_next(&pvec, &parents, i))
 
@@ -1974,19 +2004,43 @@ static int mmu_pages_next(struct kvm_mmu_pages *pvec,
 
 	for (n = i+1; n < pvec->nr; n++) {
 		struct kvm_mmu_page *sp = pvec->page[n].sp;
+		unsigned idx = pvec->page[n].idx;
+		int level = sp->role.level;
 
-		if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
-			parents->idx[0] = pvec->page[n].idx;
-			return n;
-		}
+		parents->idx[level-1] = idx;
+		if (level == PT_PAGE_TABLE_LEVEL)
+			break;
 
-		parents->parent[sp->role.level-2] = sp;
-		parents->idx[sp->role.level-1] = pvec->page[n].idx;
+		parents->parent[level-2] = sp;
 	}
 
 	return n;
 }
 
+static int mmu_pages_first(struct kvm_mmu_pages *pvec,
+			   struct mmu_page_path *parents)
+{
+	struct kvm_mmu_page *sp;
+	int level;
+
+	if (pvec->nr == 0)
+		return 0;
+
+	WARN_ON(pvec->page[0].idx != INVALID_INDEX);
+
+	sp = pvec->page[0].sp;
+	level = sp->role.level;
+	WARN_ON(level == PT_PAGE_TABLE_LEVEL);
+
+	parents->parent[level-2] = sp;
+
+	/* Also set up a sentinel.  Further entries in pvec are all
+	 * children of sp, so this element is never overwritten.
+	 */
+	parents->parent[level-1] = NULL;
+	return mmu_pages_next(pvec, parents, 0);
+}
+
 static void mmu_pages_clear_parents(struct mmu_page_path *parents)
 {
 	struct kvm_mmu_page *sp;
@@ -1994,22 +2048,14 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)
 
 	do {
 		unsigned int idx = parents->idx[level];
-
 		sp = parents->parent[level];
 		if (!sp)
 			return;
 
+		WARN_ON(idx == INVALID_INDEX);
 		clear_unsync_child_bit(sp, idx);
 		level++;
-	} while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
-}
-
-static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
-			       struct mmu_page_path *parents,
-			       struct kvm_mmu_pages *pvec)
-{
-	parents->parent[parent->role.level-1] = NULL;
-	pvec->nr = 0;
+	} while (!sp->unsync_children);
 }
 
 static void mmu_sync_children(struct kvm_vcpu *vcpu,
@@ -2020,30 +2066,36 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
 	struct mmu_page_path parents;
 	struct kvm_mmu_pages pages;
 	LIST_HEAD(invalid_list);
+	bool flush = false;
 
-	kvm_mmu_pages_init(parent, &parents, &pages);
 	while (mmu_unsync_walk(parent, &pages)) {
 		bool protected = false;
 
 		for_each_sp(pages, sp, parents, i)
 			protected |= rmap_write_protect(vcpu, sp->gfn);
 
-		if (protected)
+		if (protected) {
 			kvm_flush_remote_tlbs(vcpu->kvm);
+			flush = false;
+		}
 
 		for_each_sp(pages, sp, parents, i) {
-			kvm_sync_page(vcpu, sp, &invalid_list);
+			flush |= kvm_sync_page(vcpu, sp, &invalid_list);
 			mmu_pages_clear_parents(&parents);
 		}
-		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-		cond_resched_lock(&vcpu->kvm->mmu_lock);
-		kvm_mmu_pages_init(parent, &parents, &pages);
+		if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
+			kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+			cond_resched_lock(&vcpu->kvm->mmu_lock);
+			flush = false;
+		}
 	}
+
+	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
 }
 
 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
 {
-	sp->write_flooding_count = 0;
+	atomic_set(&sp->write_flooding_count,  0);
 }
 
 static void clear_sp_write_flooding_count(u64 *spte)
@@ -2069,6 +2121,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	unsigned quadrant;
 	struct kvm_mmu_page *sp;
 	bool need_sync = false;
+	bool flush = false;
+	LIST_HEAD(invalid_list);
 
 	role = vcpu->arch.mmu.base_role;
 	role.level = level;
@@ -2092,8 +2146,16 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		if (sp->role.word != role.word)
 			continue;
 
-		if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
-			break;
+		if (sp->unsync) {
+			/* The page is good, but __kvm_sync_page might still end
+			 * up zapping it.  If so, break in order to rebuild it.
+			 */
+			if (!__kvm_sync_page(vcpu, sp, &invalid_list))
+				break;
+
+			WARN_ON(!list_empty(&invalid_list));
+			kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+		}
 
 		if (sp->unsync_children)
 			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
@@ -2112,16 +2174,24 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	hlist_add_head(&sp->hash_link,
 		&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
 	if (!direct) {
-		if (rmap_write_protect(vcpu, gfn))
+		/*
+		 * we should do write protection before syncing pages
+		 * otherwise the content of the synced shadow page may
+		 * be inconsistent with guest page table.
+		 */
+		account_shadowed(vcpu->kvm, sp);
+		if (level == PT_PAGE_TABLE_LEVEL &&
+		      rmap_write_protect(vcpu, gfn))
 			kvm_flush_remote_tlbs(vcpu->kvm);
-		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
-			kvm_sync_pages(vcpu, gfn);
 
-		account_shadowed(vcpu->kvm, sp);
+		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
+			flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
 	}
 	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
 	clear_page(sp->spt);
 	trace_kvm_mmu_get_page(sp, true);
+
+	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
 	return sp;
 }
 
@@ -2269,7 +2339,6 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
 	if (parent->role.level == PT_PAGE_TABLE_LEVEL)
 		return 0;
 
-	kvm_mmu_pages_init(parent, &parents, &pages);
 	while (mmu_unsync_walk(parent, &pages)) {
 		struct kvm_mmu_page *sp;
 
@@ -2278,7 +2347,6 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
 			mmu_pages_clear_parents(&parents);
 			zapped++;
 		}
-		kvm_mmu_pages_init(parent, &parents, &pages);
 	}
 
 	return zapped;
@@ -2329,14 +2397,13 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 		return;
 
 	/*
-	 * wmb: make sure everyone sees our modifications to the page tables
-	 * rmb: make sure we see changes to vcpu->mode
-	 */
-	smp_mb();
-
-	/*
-	 * Wait for all vcpus to exit guest mode and/or lockless shadow
-	 * page table walks.
+	 * We need to make sure everyone sees our modifications to
+	 * the page tables and see changes to vcpu->mode here. The barrier
+	 * in the kvm_flush_remote_tlbs() achieves this. This pairs
+	 * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
+	 *
+	 * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
+	 * guest mode and/or lockless shadow page table walks.
 	 */
 	kvm_flush_remote_tlbs(kvm);
 
@@ -2354,8 +2421,8 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
 	if (list_empty(&kvm->arch.active_mmu_pages))
 		return false;
 
-	sp = list_entry(kvm->arch.active_mmu_pages.prev,
-			struct kvm_mmu_page, link);
+	sp = list_last_entry(&kvm->arch.active_mmu_pages,
+			     struct kvm_mmu_page, link);
 	kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
 
 	return true;
@@ -2408,7 +2475,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
 
-static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
 	trace_kvm_mmu_unsync_page(sp);
 	++vcpu->kvm->stat.mmu_unsync;
@@ -2417,37 +2484,26 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 	kvm_mmu_mark_parents_unsync(sp);
 }
 
-static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
+static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+				   bool can_unsync)
 {
-	struct kvm_mmu_page *s;
-
-	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
-		if (s->unsync)
-			continue;
-		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
-		__kvm_unsync_page(vcpu, s);
-	}
-}
+	struct kvm_mmu_page *sp;
 
-static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
-				  bool can_unsync)
-{
-	struct kvm_mmu_page *s;
-	bool need_unsync = false;
+	if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+		return true;
 
-	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
+	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
 		if (!can_unsync)
-			return 1;
+			return true;
 
-		if (s->role.level != PT_PAGE_TABLE_LEVEL)
-			return 1;
+		if (sp->unsync)
+			continue;
 
-		if (!s->unsync)
-			need_unsync = true;
+		WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
+		kvm_unsync_page(vcpu, sp);
 	}
-	if (need_unsync)
-		kvm_unsync_pages(vcpu, gfn);
-	return 0;
+
+	return false;
 }
 
 static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
@@ -2503,7 +2559,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		 * be fixed if guest refault.
 		 */
 		if (level > PT_PAGE_TABLE_LEVEL &&
-		    has_wrprotected_page(vcpu, gfn, level))
+		    mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
 			goto done;
 
 		spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
@@ -2767,8 +2823,8 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 	 */
 	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
 	    level == PT_PAGE_TABLE_LEVEL &&
-	    PageTransCompound(pfn_to_page(pfn)) &&
-	    !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
+	    PageTransCompoundMap(pfn_to_page(pfn)) &&
+	    !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
 		unsigned long mask;
 		/*
 		 * mmu_notifier_retry was successful and we hold the
@@ -2796,20 +2852,16 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
 				kvm_pfn_t pfn, unsigned access, int *ret_val)
 {
-	bool ret = true;
-
 	/* The pfn is invalid, report the error! */
 	if (unlikely(is_error_pfn(pfn))) {
 		*ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
-		goto exit;
+		return true;
 	}
 
 	if (unlikely(is_noslot_pfn(pfn)))
 		vcpu_cache_mmio_info(vcpu, gva, gfn, access);
 
-	ret = false;
-exit:
-	return ret;
+	return false;
 }
 
 static bool page_fault_can_be_fast(u32 error_code)
@@ -3273,7 +3325,7 @@ static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
 	return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
 }
 
-static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 {
 	if (direct)
 		return vcpu_match_mmio_gpa(vcpu, addr);
@@ -3332,7 +3384,7 @@ int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 	u64 spte;
 	bool reserved;
 
-	if (quickly_check_mmio_pf(vcpu, addr, direct))
+	if (mmio_info_in_cache(vcpu, addr, direct))
 		return RET_MMIO_PF_EMULATE;
 
 	reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
@@ -3362,20 +3414,53 @@ int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 }
 EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
 
+static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
+					 u32 error_code, gfn_t gfn)
+{
+	if (unlikely(error_code & PFERR_RSVD_MASK))
+		return false;
+
+	if (!(error_code & PFERR_PRESENT_MASK) ||
+	      !(error_code & PFERR_WRITE_MASK))
+		return false;
+
+	/*
+	 * guest is writing the page which is write tracked which can
+	 * not be fixed by page fault handler.
+	 */
+	if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+		return true;
+
+	return false;
+}
+
+static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
+{
+	struct kvm_shadow_walk_iterator iterator;
+	u64 spte;
+
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		return;
+
+	walk_shadow_page_lockless_begin(vcpu);
+	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+		clear_sp_write_flooding_count(iterator.sptep);
+		if (!is_shadow_present_pte(spte))
+			break;
+	}
+	walk_shadow_page_lockless_end(vcpu);
+}
+
 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 				u32 error_code, bool prefault)
 {
-	gfn_t gfn;
+	gfn_t gfn = gva >> PAGE_SHIFT;
 	int r;
 
 	pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
 
-	if (unlikely(error_code & PFERR_RSVD_MASK)) {
-		r = handle_mmio_page_fault(vcpu, gva, true);
-
-		if (likely(r != RET_MMIO_PF_INVALID))
-			return r;
-	}
+	if (page_fault_handle_page_track(vcpu, error_code, gfn))
+		return 1;
 
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
@@ -3383,7 +3468,6 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 
 	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
-	gfn = gva >> PAGE_SHIFT;
 
 	return nonpaging_map(vcpu, gva & PAGE_MASK,
 			     error_code, gfn, prefault);
@@ -3460,12 +3544,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 
 	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
-	if (unlikely(error_code & PFERR_RSVD_MASK)) {
-		r = handle_mmio_page_fault(vcpu, gpa, true);
-
-		if (likely(r != RET_MMIO_PF_INVALID))
-			return r;
-	}
+	if (page_fault_handle_page_track(vcpu, error_code, gfn))
+		return 1;
 
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
@@ -3558,13 +3638,24 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
 	return false;
 }
 
-static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
+static inline bool is_last_gpte(struct kvm_mmu *mmu,
+				unsigned level, unsigned gpte)
 {
-	unsigned index;
+	/*
+	 * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
+	 * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
+	 * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
+	 */
+	gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
+
+	/*
+	 * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
+	 * If it is clear, there are no large pages at this level, so clear
+	 * PT_PAGE_SIZE_MASK in gpte if that is the case.
+	 */
+	gpte &= level - mmu->last_nonleaf_level;
 
-	index = level - 1;
-	index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2);
-	return mmu->last_pte_bitmap & (1 << index);
+	return gpte & PT_PAGE_SIZE_MASK;
 }
 
 #define PTTYPE_EPT 18 /* arbitrary */
@@ -3838,22 +3929,88 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
 	}
 }
 
-static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
-{
-	u8 map;
-	unsigned level, root_level = mmu->root_level;
-	const unsigned ps_set_index = 1 << 2;  /* bit 2 of index: ps */
-
-	if (root_level == PT32E_ROOT_LEVEL)
-		--root_level;
-	/* PT_PAGE_TABLE_LEVEL always terminates */
-	map = 1 | (1 << ps_set_index);
-	for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) {
-		if (level <= PT_PDPE_LEVEL
-		    && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu)))
-			map |= 1 << (ps_set_index | (level - 1));
+/*
+* PKU is an additional mechanism by which the paging controls access to
+* user-mode addresses based on the value in the PKRU register.  Protection
+* key violations are reported through a bit in the page fault error code.
+* Unlike other bits of the error code, the PK bit is not known at the
+* call site of e.g. gva_to_gpa; it must be computed directly in
+* permission_fault based on two bits of PKRU, on some machine state (CR4,
+* CR0, EFER, CPL), and on other bits of the error code and the page tables.
+*
+* In particular the following conditions come from the error code, the
+* page tables and the machine state:
+* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
+* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
+* - PK is always zero if U=0 in the page tables
+* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
+*
+* The PKRU bitmask caches the result of these four conditions.  The error
+* code (minus the P bit) and the page table's U bit form an index into the
+* PKRU bitmask.  Two bits of the PKRU bitmask are then extracted and ANDed
+* with the two bits of the PKRU register corresponding to the protection key.
+* For the first three conditions above the bits will be 00, thus masking
+* away both AD and WD.  For all reads or if the last condition holds, WD
+* only will be masked away.
+*/
+static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+				bool ept)
+{
+	unsigned bit;
+	bool wp;
+
+	if (ept) {
+		mmu->pkru_mask = 0;
+		return;
+	}
+
+	/* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
+	if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
+		mmu->pkru_mask = 0;
+		return;
+	}
+
+	wp = is_write_protection(vcpu);
+
+	for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
+		unsigned pfec, pkey_bits;
+		bool check_pkey, check_write, ff, uf, wf, pte_user;
+
+		pfec = bit << 1;
+		ff = pfec & PFERR_FETCH_MASK;
+		uf = pfec & PFERR_USER_MASK;
+		wf = pfec & PFERR_WRITE_MASK;
+
+		/* PFEC.RSVD is replaced by ACC_USER_MASK. */
+		pte_user = pfec & PFERR_RSVD_MASK;
+
+		/*
+		 * Only need to check the access which is not an
+		 * instruction fetch and is to a user page.
+		 */
+		check_pkey = (!ff && pte_user);
+		/*
+		 * write access is controlled by PKRU if it is a
+		 * user access or CR0.WP = 1.
+		 */
+		check_write = check_pkey && wf && (uf || wp);
+
+		/* PKRU.AD stops both read and write access. */
+		pkey_bits = !!check_pkey;
+		/* PKRU.WD stops write access. */
+		pkey_bits |= (!!check_write) << 1;
+
+		mmu->pkru_mask |= (pkey_bits & 3) << pfec;
 	}
-	mmu->last_pte_bitmap = map;
+}
+
+static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+{
+	unsigned root_level = mmu->root_level;
+
+	mmu->last_nonleaf_level = root_level;
+	if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
+		mmu->last_nonleaf_level++;
 }
 
 static void paging64_init_context_common(struct kvm_vcpu *vcpu,
@@ -3865,7 +4022,8 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
 
 	reset_rsvds_bits_mask(vcpu, context);
 	update_permission_bitmask(vcpu, context, false);
-	update_last_pte_bitmap(vcpu, context);
+	update_pkru_bitmask(vcpu, context, false);
+	update_last_nonleaf_level(vcpu, context);
 
 	MMU_WARN_ON(!is_pae(vcpu));
 	context->page_fault = paging64_page_fault;
@@ -3892,7 +4050,8 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
 
 	reset_rsvds_bits_mask(vcpu, context);
 	update_permission_bitmask(vcpu, context, false);
-	update_last_pte_bitmap(vcpu, context);
+	update_pkru_bitmask(vcpu, context, false);
+	update_last_nonleaf_level(vcpu, context);
 
 	context->page_fault = paging32_page_fault;
 	context->gva_to_gpa = paging32_gva_to_gpa;
@@ -3950,7 +4109,8 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	}
 
 	update_permission_bitmask(vcpu, context, false);
-	update_last_pte_bitmap(vcpu, context);
+	update_pkru_bitmask(vcpu, context, false);
+	update_last_nonleaf_level(vcpu, context);
 	reset_tdp_shadow_zero_bits_mask(vcpu, context);
 }
 
@@ -4002,6 +4162,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
 	context->direct_map = false;
 
 	update_permission_bitmask(vcpu, context, true);
+	update_pkru_bitmask(vcpu, context, true);
 	reset_rsvds_bits_mask_ept(vcpu, context, execonly);
 	reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
 }
@@ -4056,7 +4217,8 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 	}
 
 	update_permission_bitmask(vcpu, g_context, false);
-	update_last_pte_bitmap(vcpu, g_context);
+	update_pkru_bitmask(vcpu, g_context, false);
+	update_last_nonleaf_level(vcpu, g_context);
 }
 
 static void init_kvm_mmu(struct kvm_vcpu *vcpu)
@@ -4127,18 +4289,6 @@ static bool need_remote_flush(u64 old, u64 new)
 	return (old & ~new & PT64_PERM_MASK) != 0;
 }
 
-static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
-				    bool remote_flush, bool local_flush)
-{
-	if (zap_page)
-		return;
-
-	if (remote_flush)
-		kvm_flush_remote_tlbs(vcpu->kvm);
-	else if (local_flush)
-		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-}
-
 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
 				    const u8 *new, int *bytes)
 {
@@ -4188,7 +4338,8 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp)
 	if (sp->role.level == PT_PAGE_TABLE_LEVEL)
 		return false;
 
-	return ++sp->write_flooding_count >= 3;
+	atomic_inc(&sp->write_flooding_count);
+	return atomic_read(&sp->write_flooding_count) >= 3;
 }
 
 /*
@@ -4250,15 +4401,15 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
 	return spte;
 }
 
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-		       const u8 *new, int bytes)
+static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+			      const u8 *new, int bytes)
 {
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	struct kvm_mmu_page *sp;
 	LIST_HEAD(invalid_list);
 	u64 entry, gentry, *spte;
 	int npte;
-	bool remote_flush, local_flush, zap_page;
+	bool remote_flush, local_flush;
 	union kvm_mmu_page_role mask = { };
 
 	mask.cr0_wp = 1;
@@ -4275,7 +4426,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
 		return;
 
-	zap_page = remote_flush = local_flush = false;
+	remote_flush = local_flush = false;
 
 	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
 
@@ -4295,8 +4446,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
 		if (detect_write_misaligned(sp, gpa, bytes) ||
 		      detect_write_flooding(sp)) {
-			zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
-						     &invalid_list);
+			kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
 			++vcpu->kvm->stat.mmu_flooded;
 			continue;
 		}
@@ -4318,8 +4468,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 			++spte;
 		}
 	}
-	mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
-	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+	kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
 	kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 }
@@ -4356,32 +4505,34 @@ static void make_mmu_pages_available(struct kvm_vcpu *vcpu)
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 }
 
-static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr)
-{
-	if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu))
-		return vcpu_match_mmio_gpa(vcpu, addr);
-
-	return vcpu_match_mmio_gva(vcpu, addr);
-}
-
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
 		       void *insn, int insn_len)
 {
 	int r, emulation_type = EMULTYPE_RETRY;
 	enum emulation_result er;
+	bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu);
+
+	if (unlikely(error_code & PFERR_RSVD_MASK)) {
+		r = handle_mmio_page_fault(vcpu, cr2, direct);
+		if (r == RET_MMIO_PF_EMULATE) {
+			emulation_type = 0;
+			goto emulate;
+		}
+		if (r == RET_MMIO_PF_RETRY)
+			return 1;
+		if (r < 0)
+			return r;
+	}
 
 	r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
 	if (r < 0)
-		goto out;
-
-	if (!r) {
-		r = 1;
-		goto out;
-	}
+		return r;
+	if (!r)
+		return 1;
 
-	if (is_mmio_page_fault(vcpu, cr2))
+	if (mmio_info_in_cache(vcpu, cr2, direct))
 		emulation_type = 0;
-
+emulate:
 	er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
 
 	switch (er) {
@@ -4395,8 +4546,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
 	default:
 		BUG();
 	}
-out:
-	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
 
@@ -4465,6 +4614,21 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu)
 	init_kvm_mmu(vcpu);
 }
 
+void kvm_mmu_init_vm(struct kvm *kvm)
+{
+	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+
+	node->track_write = kvm_mmu_pte_write;
+	kvm_page_track_register_notifier(kvm, node);
+}
+
+void kvm_mmu_uninit_vm(struct kvm *kvm)
+{
+	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+
+	kvm_page_track_unregister_notifier(kvm, node);
+}
+
 /* The return value indicates if tlb flush on all vcpus is needed. */
 typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
 
@@ -4621,7 +4785,7 @@ restart:
 		 */
 		if (sp->role.direct &&
 			!kvm_is_reserved_pfn(pfn) &&
-			PageTransCompound(pfn_to_page(pfn))) {
+			PageTransCompoundMap(pfn_to_page(pfn))) {
 			drop_spte(kvm, sptep);
 			need_tlb_flush = 1;
 			goto restart;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 55ffb7b0f..66b33b96a 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -10,10 +10,11 @@
 #define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
 
 #define PT_WRITABLE_SHIFT 1
+#define PT_USER_SHIFT 2
 
 #define PT_PRESENT_MASK (1ULL << 0)
 #define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
-#define PT_USER_MASK (1ULL << 2)
+#define PT_USER_MASK (1ULL << PT_USER_SHIFT)
 #define PT_PWT_MASK (1ULL << 3)
 #define PT_PCD_MASK (1ULL << 4)
 #define PT_ACCESSED_SHIFT 5
@@ -141,11 +142,16 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu)
 }
 
 /*
- * Will a fault with a given page-fault error code (pfec) cause a permission
- * fault with the given access (in ACC_* format)?
+ * Check if a given access (described through the I/D, W/R and U/S bits of a
+ * page fault error code pfec) causes a permission fault with the given PTE
+ * access rights (in ACC_* format).
+ *
+ * Return zero if the access does not fault; return the page fault error code
+ * if the access faults.
  */
-static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
-				    unsigned pte_access, unsigned pfec)
+static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+				  unsigned pte_access, unsigned pte_pkey,
+				  unsigned pfec)
 {
 	int cpl = kvm_x86_ops->get_cpl(vcpu);
 	unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
@@ -166,12 +172,38 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 	unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC);
 	int index = (pfec >> 1) +
 		    (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1));
-
-	WARN_ON(pfec & PFERR_RSVD_MASK);
-
-	return (mmu->permissions[index] >> pte_access) & 1;
+	bool fault = (mmu->permissions[index] >> pte_access) & 1;
+	u32 errcode = PFERR_PRESENT_MASK;
+
+	WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK));
+	if (unlikely(mmu->pkru_mask)) {
+		u32 pkru_bits, offset;
+
+		/*
+		* PKRU defines 32 bits, there are 16 domains and 2
+		* attribute bits per domain in pkru.  pte_pkey is the
+		* index of the protection domain, so pte_pkey * 2 is
+		* is the index of the first bit for the domain.
+		*/
+		pkru_bits = (kvm_read_pkru(vcpu) >> (pte_pkey * 2)) & 3;
+
+		/* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
+		offset = (pfec & ~1) +
+			((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));
+
+		pkru_bits &= mmu->pkru_mask >> offset;
+		errcode |= -pkru_bits & PFERR_PK_MASK;
+		fault |= (pkru_bits != 0);
+	}
+
+	return -(u32)fault & errcode;
 }
 
 void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
+
+void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
+void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
+bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+				    struct kvm_memory_slot *slot, u64 gfn);
 #endif
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index 3f8c73211..c146f3c26 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -44,8 +44,6 @@ static bool msr_mtrr_valid(unsigned msr)
 	case MSR_MTRRdefType:
 	case MSR_IA32_CR_PAT:
 		return true;
-	case 0x2f8:
-		return true;
 	}
 	return false;
 }
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
new file mode 100644
index 000000000..b431539c3
--- /dev/null
+++ b/arch/x86/kvm/page_track.c
@@ -0,0 +1,227 @@
+/*
+ * Support KVM gust page tracking
+ *
+ * This feature allows us to track page access in guest. Currently, only
+ * write access is tracked.
+ *
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Author:
+ *   Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_host.h>
+#include <asm/kvm_page_track.h>
+
+#include "mmu.h"
+
+void kvm_page_track_free_memslot(struct kvm_memory_slot *free,
+				 struct kvm_memory_slot *dont)
+{
+	int i;
+
+	for (i = 0; i < KVM_PAGE_TRACK_MAX; i++)
+		if (!dont || free->arch.gfn_track[i] !=
+		      dont->arch.gfn_track[i]) {
+			kvfree(free->arch.gfn_track[i]);
+			free->arch.gfn_track[i] = NULL;
+		}
+}
+
+int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
+				  unsigned long npages)
+{
+	int  i;
+
+	for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
+		slot->arch.gfn_track[i] = kvm_kvzalloc(npages *
+					    sizeof(*slot->arch.gfn_track[i]));
+		if (!slot->arch.gfn_track[i])
+			goto track_free;
+	}
+
+	return 0;
+
+track_free:
+	kvm_page_track_free_memslot(slot, NULL);
+	return -ENOMEM;
+}
+
+static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode)
+{
+	if (mode < 0 || mode >= KVM_PAGE_TRACK_MAX)
+		return false;
+
+	return true;
+}
+
+static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn,
+			     enum kvm_page_track_mode mode, short count)
+{
+	int index, val;
+
+	index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
+
+	val = slot->arch.gfn_track[mode][index];
+
+	if (WARN_ON(val + count < 0 || val + count > USHRT_MAX))
+		return;
+
+	slot->arch.gfn_track[mode][index] += count;
+}
+
+/*
+ * add guest page to the tracking pool so that corresponding access on that
+ * page will be intercepted.
+ *
+ * It should be called under the protection both of mmu-lock and kvm->srcu
+ * or kvm->slots_lock.
+ *
+ * @kvm: the guest instance we are interested in.
+ * @slot: the @gfn belongs to.
+ * @gfn: the guest page.
+ * @mode: tracking mode, currently only write track is supported.
+ */
+void kvm_slot_page_track_add_page(struct kvm *kvm,
+				  struct kvm_memory_slot *slot, gfn_t gfn,
+				  enum kvm_page_track_mode mode)
+{
+
+	if (WARN_ON(!page_track_mode_is_valid(mode)))
+		return;
+
+	update_gfn_track(slot, gfn, mode, 1);
+
+	/*
+	 * new track stops large page mapping for the
+	 * tracked page.
+	 */
+	kvm_mmu_gfn_disallow_lpage(slot, gfn);
+
+	if (mode == KVM_PAGE_TRACK_WRITE)
+		if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn))
+			kvm_flush_remote_tlbs(kvm);
+}
+
+/*
+ * remove the guest page from the tracking pool which stops the interception
+ * of corresponding access on that page. It is the opposed operation of
+ * kvm_slot_page_track_add_page().
+ *
+ * It should be called under the protection both of mmu-lock and kvm->srcu
+ * or kvm->slots_lock.
+ *
+ * @kvm: the guest instance we are interested in.
+ * @slot: the @gfn belongs to.
+ * @gfn: the guest page.
+ * @mode: tracking mode, currently only write track is supported.
+ */
+void kvm_slot_page_track_remove_page(struct kvm *kvm,
+				     struct kvm_memory_slot *slot, gfn_t gfn,
+				     enum kvm_page_track_mode mode)
+{
+	if (WARN_ON(!page_track_mode_is_valid(mode)))
+		return;
+
+	update_gfn_track(slot, gfn, mode, -1);
+
+	/*
+	 * allow large page mapping for the tracked page
+	 * after the tracker is gone.
+	 */
+	kvm_mmu_gfn_allow_lpage(slot, gfn);
+}
+
+/*
+ * check if the corresponding access on the specified guest page is tracked.
+ */
+bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
+			      enum kvm_page_track_mode mode)
+{
+	struct kvm_memory_slot *slot;
+	int index;
+
+	if (WARN_ON(!page_track_mode_is_valid(mode)))
+		return false;
+
+	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+	if (!slot)
+		return false;
+
+	index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
+	return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]);
+}
+
+void kvm_page_track_init(struct kvm *kvm)
+{
+	struct kvm_page_track_notifier_head *head;
+
+	head = &kvm->arch.track_notifier_head;
+	init_srcu_struct(&head->track_srcu);
+	INIT_HLIST_HEAD(&head->track_notifier_list);
+}
+
+/*
+ * register the notifier so that event interception for the tracked guest
+ * pages can be received.
+ */
+void
+kvm_page_track_register_notifier(struct kvm *kvm,
+				 struct kvm_page_track_notifier_node *n)
+{
+	struct kvm_page_track_notifier_head *head;
+
+	head = &kvm->arch.track_notifier_head;
+
+	spin_lock(&kvm->mmu_lock);
+	hlist_add_head_rcu(&n->node, &head->track_notifier_list);
+	spin_unlock(&kvm->mmu_lock);
+}
+
+/*
+ * stop receiving the event interception. It is the opposed operation of
+ * kvm_page_track_register_notifier().
+ */
+void
+kvm_page_track_unregister_notifier(struct kvm *kvm,
+				   struct kvm_page_track_notifier_node *n)
+{
+	struct kvm_page_track_notifier_head *head;
+
+	head = &kvm->arch.track_notifier_head;
+
+	spin_lock(&kvm->mmu_lock);
+	hlist_del_rcu(&n->node);
+	spin_unlock(&kvm->mmu_lock);
+	synchronize_srcu(&head->track_srcu);
+}
+
+/*
+ * Notify the node that write access is intercepted and write emulation is
+ * finished at this time.
+ *
+ * The node should figure out if the written page is the one that node is
+ * interested in by itself.
+ */
+void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
+			  int bytes)
+{
+	struct kvm_page_track_notifier_head *head;
+	struct kvm_page_track_notifier_node *n;
+	int idx;
+
+	head = &vcpu->kvm->arch.track_notifier_head;
+
+	if (hlist_empty(&head->track_notifier_list))
+		return;
+
+	idx = srcu_read_lock(&head->track_srcu);
+	hlist_for_each_entry_rcu(n, &head->track_notifier_list, node)
+		if (n->track_write)
+			n->track_write(vcpu, gpa, new, bytes);
+	srcu_read_unlock(&head->track_srcu, idx);
+}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 2ce4f05e8..bc019f70e 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -189,8 +189,11 @@ static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
 		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
 		ACC_USER_MASK;
 #else
-	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
-	access &= ~(gpte >> PT64_NX_SHIFT);
+	BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
+	BUILD_BUG_ON(ACC_EXEC_MASK != 1);
+	access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK);
+	/* Combine NX with P (which is set here) to get ACC_EXEC_MASK.  */
+	access ^= (gpte >> PT64_NX_SHIFT);
 #endif
 
 	return access;
@@ -254,6 +257,17 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+	unsigned pkeys = 0;
+#if PTTYPE == 64
+	pte_t pte = {.pte = gpte};
+
+	pkeys = pte_flags_pkey(pte_flags(pte));
+#endif
+	return pkeys;
+}
+
 /*
  * Fetch a guest pte for a guest virtual address
  */
@@ -265,7 +279,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	pt_element_t pte;
 	pt_element_t __user *uninitialized_var(ptep_user);
 	gfn_t table_gfn;
-	unsigned index, pt_access, pte_access, accessed_dirty;
+	unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey;
 	gpa_t pte_gpa;
 	int offset;
 	const int write_fault = access & PFERR_WRITE_MASK;
@@ -346,7 +360,7 @@ retry_walk:
 			goto error;
 
 		if (unlikely(is_rsvd_bits_set(mmu, pte, walker->level))) {
-			errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
+			errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
 			goto error;
 		}
 
@@ -356,10 +370,10 @@ retry_walk:
 		walker->ptes[walker->level - 1] = pte;
 	} while (!is_last_gpte(mmu, walker->level, pte));
 
-	if (unlikely(permission_fault(vcpu, mmu, pte_access, access))) {
-		errcode |= PFERR_PRESENT_MASK;
+	pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
+	errcode = permission_fault(vcpu, mmu, pte_access, pte_pkey, access);
+	if (unlikely(errcode))
 		goto error;
-	}
 
 	gfn = gpte_to_gfn_lvl(pte, walker->level);
 	gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
@@ -702,24 +716,17 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 
 	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 
-	if (unlikely(error_code & PFERR_RSVD_MASK)) {
-		r = handle_mmio_page_fault(vcpu, addr, mmu_is_nested(vcpu));
-		if (likely(r != RET_MMIO_PF_INVALID))
-			return r;
-
-		/*
-		 * page fault with PFEC.RSVD  = 1 is caused by shadow
-		 * page fault, should not be used to walk guest page
-		 * table.
-		 */
-		error_code &= ~PFERR_RSVD_MASK;
-	};
-
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		return r;
 
 	/*
+	 * If PFEC.RSVD is set, this is a shadow page fault.
+	 * The bit needs to be cleared before walking guest page tables.
+	 */
+	error_code &= ~PFERR_RSVD_MASK;
+
+	/*
 	 * Look up the guest pte for the faulting address.
 	 */
 	r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);
@@ -735,6 +742,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		return 0;
 	}
 
+	if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
+		shadow_page_table_clear_flood(vcpu, addr);
+		return 1;
+	}
+
 	vcpu->arch.write_fault_to_shadow_pgtable = false;
 
 	is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
@@ -945,9 +957,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
 		if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
 					       sizeof(pt_element_t)))
-			return -EINVAL;
+			return 0;
 
 		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
+			/*
+			 * Update spte before increasing tlbs_dirty to make
+			 * sure no tlb flush is lost after spte is zapped; see
+			 * the comments in kvm_flush_remote_tlbs().
+			 */
+			smp_wmb();
 			vcpu->kvm->tlbs_dirty++;
 			continue;
 		}
@@ -963,6 +981,11 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
 		if (gfn != sp->gfns[i]) {
 			drop_spte(vcpu->kvm, &sp->spt[i]);
+			/*
+			 * The same as above where we are doing
+			 * prefetch_invalid_gpte().
+			 */
+			smp_wmb();
 			vcpu->kvm->tlbs_dirty++;
 			continue;
 		}
@@ -977,7 +1000,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 			 host_writable);
 	}
 
-	return !nr_present;
+	return nr_present;
 }
 
 #undef pt_element_t
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 31aa2c85d..06ce377dc 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -257,7 +257,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
 
 void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
 {
-	if (vcpu->arch.apic)
+	if (lapic_in_kernel(vcpu))
 		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
 }
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c13a64b7d..31346a3f2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1280,6 +1280,11 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 	to_svm(vcpu)->vmcb->save.rflags = rflags;
 }
 
+static u32 svm_get_pkru(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 {
 	switch (reg) {
@@ -1858,8 +1863,7 @@ static int halt_interception(struct vcpu_svm *svm)
 static int vmmcall_interception(struct vcpu_svm *svm)
 {
 	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
-	kvm_emulate_hypercall(&svm->vcpu);
-	return 1;
+	return kvm_emulate_hypercall(&svm->vcpu);
 }
 
 static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
@@ -4348,6 +4352,9 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.cache_reg = svm_cache_reg,
 	.get_rflags = svm_get_rflags,
 	.set_rflags = svm_set_rflags,
+
+	.get_pkru = svm_get_pkru,
+
 	.fpu_activate = svm_fpu_activate,
 	.fpu_deactivate = svm_fpu_deactivate,
 
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index ad9f6a23f..2f1ea2f61 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -996,11 +996,13 @@ TRACE_EVENT(kvm_enter_smm,
  * Tracepoint for VT-d posted-interrupts.
  */
 TRACE_EVENT(kvm_pi_irte_update,
-	TP_PROTO(unsigned int vcpu_id, unsigned int gsi,
-		 unsigned int gvec, u64 pi_desc_addr, bool set),
-	TP_ARGS(vcpu_id, gsi, gvec, pi_desc_addr, set),
+	TP_PROTO(unsigned int host_irq, unsigned int vcpu_id,
+		 unsigned int gsi, unsigned int gvec,
+		 u64 pi_desc_addr, bool set),
+	TP_ARGS(host_irq, vcpu_id, gsi, gvec, pi_desc_addr, set),
 
 	TP_STRUCT__entry(
+		__field(	unsigned int,	host_irq	)
 		__field(	unsigned int,	vcpu_id		)
 		__field(	unsigned int,	gsi		)
 		__field(	unsigned int,	gvec		)
@@ -1009,6 +1011,7 @@ TRACE_EVENT(kvm_pi_irte_update,
 	),
 
 	TP_fast_assign(
+		__entry->host_irq	= host_irq;
 		__entry->vcpu_id	= vcpu_id;
 		__entry->gsi		= gsi;
 		__entry->gvec		= gvec;
@@ -1016,9 +1019,10 @@ TRACE_EVENT(kvm_pi_irte_update,
 		__entry->set		= set;
 	),
 
-	TP_printk("VT-d PI is %s for this irq, vcpu %u, gsi: 0x%x, "
+	TP_printk("VT-d PI is %s for irq %u, vcpu %u, gsi: 0x%x, "
 		  "gvec: 0x%x, pi_desc_addr: 0x%llx",
 		  __entry->set ? "enabled and being updated" : "disabled",
+		  __entry->host_irq,
 		  __entry->vcpu_id,
 		  __entry->gsi,
 		  __entry->gvec,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 539062e24..faf52bac1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -598,6 +598,10 @@ struct vcpu_vmx {
 	struct page *pml_pg;
 
 	u64 current_tsc_ratio;
+
+	bool guest_pkru_valid;
+	u32 guest_pkru;
+	u32 host_pkru;
 };
 
 enum segment_cache_field {
@@ -863,7 +867,6 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
-static bool vmx_mpx_supported(void);
 static bool vmx_xsaves_supported(void);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
@@ -963,25 +966,36 @@ static const u32 vmx_msr_index[] = {
 	MSR_EFER, MSR_TSC_AUX, MSR_STAR,
 };
 
-static inline bool is_page_fault(u32 intr_info)
+static inline bool is_exception_n(u32 intr_info, u8 vector)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 			     INTR_INFO_VALID_MASK)) ==
-		(INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
+		(INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
+}
+
+static inline bool is_debug(u32 intr_info)
+{
+	return is_exception_n(intr_info, DB_VECTOR);
+}
+
+static inline bool is_breakpoint(u32 intr_info)
+{
+	return is_exception_n(intr_info, BP_VECTOR);
+}
+
+static inline bool is_page_fault(u32 intr_info)
+{
+	return is_exception_n(intr_info, PF_VECTOR);
 }
 
 static inline bool is_no_device(u32 intr_info)
 {
-	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
-			     INTR_INFO_VALID_MASK)) ==
-		(INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
+	return is_exception_n(intr_info, NM_VECTOR);
 }
 
 static inline bool is_invalid_opcode(u32 intr_info)
 {
-	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
-			     INTR_INFO_VALID_MASK)) ==
-		(INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
+	return is_exception_n(intr_info, UD_VECTOR);
 }
 
 static inline bool is_external_interrupt(u32 intr_info)
@@ -2097,6 +2111,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 	} while (cmpxchg(&pi_desc->control, old.control,
 			new.control) != old.control);
 }
+
 /*
  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
  * vcpu mutex is already taken.
@@ -2157,6 +2172,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 
 	vmx_vcpu_pi_load(vcpu, cpu);
+	vmx->host_pkru = read_pkru();
 }
 
 static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
@@ -2276,6 +2292,11 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 	vmcs_writel(GUEST_RFLAGS, rflags);
 }
 
+static u32 vmx_get_pkru(struct kvm_vcpu *vcpu)
+{
+	return to_vmx(vcpu)->guest_pkru;
+}
+
 static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
 {
 	u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
@@ -2605,7 +2626,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
 		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
 
-	if (vmx_mpx_supported())
+	if (kvm_mpx_supported())
 		vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
 
 	/* We support free control of debug control saving. */
@@ -2626,7 +2647,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 		VM_ENTRY_LOAD_IA32_PAT;
 	vmx->nested.nested_vmx_entry_ctls_high |=
 		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
-	if (vmx_mpx_supported())
+	if (kvm_mpx_supported())
 		vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
 
 	/* We support free control of debug control loading. */
@@ -2877,7 +2898,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
 		break;
 	case MSR_IA32_BNDCFGS:
-		if (!vmx_mpx_supported())
+		if (!kvm_mpx_supported())
 			return 1;
 		msr_info->data = vmcs_read64(GUEST_BNDCFGS);
 		break;
@@ -2954,7 +2975,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		vmcs_writel(GUEST_SYSENTER_ESP, data);
 		break;
 	case MSR_IA32_BNDCFGS:
-		if (!vmx_mpx_supported())
+		if (!kvm_mpx_supported())
 			return 1;
 		vmcs_write64(GUEST_BNDCFGS, data);
 		break;
@@ -3082,6 +3103,8 @@ static __init int vmx_disabled_by_bios(void)
 
 static void kvm_cpu_vmxon(u64 addr)
 {
+	intel_pt_handle_vmx(1);
+
 	asm volatile (ASM_VMX_VMXON_RAX
 			: : "a"(&addr), "m"(addr)
 			: "memory", "cc");
@@ -3151,6 +3174,8 @@ static void vmclear_local_loaded_vmcss(void)
 static void kvm_cpu_vmxoff(void)
 {
 	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
+
+	intel_pt_handle_vmx(0);
 }
 
 static void hardware_disable(void)
@@ -3427,7 +3452,7 @@ static void init_vmcs_shadow_fields(void)
 	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
 		switch (shadow_read_write_fields[i]) {
 		case GUEST_BNDCFGS:
-			if (!vmx_mpx_supported())
+			if (!kvm_mpx_supported())
 				continue;
 			break;
 		default:
@@ -3883,13 +3908,17 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 
 	if (!enable_unrestricted_guest && !is_paging(vcpu))
 		/*
-		 * SMEP/SMAP is disabled if CPU is in non-paging mode in
-		 * hardware.  However KVM always uses paging mode without
-		 * unrestricted guest.
-		 * To emulate this behavior, SMEP/SMAP needs to be manually
-		 * disabled when guest switches to non-paging mode.
+		 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
+		 * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
+		 * to be manually disabled when guest switches to non-paging
+		 * mode.
+		 *
+		 * If !enable_unrestricted_guest, the CPU is always running
+		 * with CR0.PG=1 and CR4 needs to be modified.
+		 * If enable_unrestricted_guest, the CPU automatically
+		 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
 		 */
-		hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
+		hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
 
 	vmcs_writel(CR4_READ_SHADOW, cr4);
 	vmcs_writel(GUEST_CR4, hw_cr4);
@@ -5021,8 +5050,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 
 	cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
-	vmx_set_cr0(vcpu, cr0); /* enter rmode */
 	vmx->vcpu.arch.cr0 = cr0;
+	vmx_set_cr0(vcpu, cr0); /* enter rmode */
 	vmx_set_cr4(vcpu, 0);
 	vmx_set_efer(vcpu, 0);
 	vmx_fpu_activate(vcpu);
@@ -5503,7 +5532,7 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
 		return kvm_set_cr4(vcpu, val);
 }
 
-/* called to set cr0 as approriate for clts instruction exit. */
+/* called to set cr0 as appropriate for clts instruction exit. */
 static void handle_clts(struct kvm_vcpu *vcpu)
 {
 	if (is_guest_mode(vcpu)) {
@@ -5636,11 +5665,8 @@ static int handle_dr(struct kvm_vcpu *vcpu)
 	}
 
 	if (vcpu->guest_debug == 0) {
-		u32 cpu_based_vm_exec_control;
-
-		cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-		cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING;
-		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+		vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+				CPU_BASED_MOV_DR_EXITING);
 
 		/*
 		 * No more DR vmexits; force a reload of the debug registers
@@ -5677,8 +5703,6 @@ static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
 
 static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
 {
-	u32 cpu_based_vm_exec_control;
-
 	get_debugreg(vcpu->arch.db[0], 0);
 	get_debugreg(vcpu->arch.db[1], 1);
 	get_debugreg(vcpu->arch.db[2], 2);
@@ -5687,10 +5711,7 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
 	vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
 
 	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
-
-	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-	cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING;
-	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+	vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
 }
 
 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
@@ -5775,8 +5796,7 @@ static int handle_halt(struct kvm_vcpu *vcpu)
 
 static int handle_vmcall(struct kvm_vcpu *vcpu)
 {
-	kvm_emulate_hypercall(vcpu);
-	return 1;
+	return kvm_emulate_hypercall(vcpu);
 }
 
 static int handle_invd(struct kvm_vcpu *vcpu)
@@ -6463,8 +6483,8 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
 
 	if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
 		/* Recycle the least recently used VMCS. */
-		item = list_entry(vmx->nested.vmcs02_pool.prev,
-			struct vmcs02_list, list);
+		item = list_last_entry(&vmx->nested.vmcs02_pool,
+				       struct vmcs02_list, list);
 		item->vmptr = vmx->nested.current_vmptr;
 		list_move(&item->list, &vmx->nested.vmcs02_pool);
 		return &item->vmcs02;
@@ -7251,7 +7271,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 	/* The value to write might be 32 or 64 bits, depending on L1's long
 	 * mode, and eventually we need to write that into a field of several
 	 * possible lengths. The code below first zero-extends the value to 64
-	 * bit (field_value), and then copies only the approriate number of
+	 * bit (field_value), and then copies only the appropriate number of
 	 * bits into the vmcs12 field.
 	 */
 	u64 field_value = 0;
@@ -7787,6 +7807,13 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 		else if (is_no_device(intr_info) &&
 			 !(vmcs12->guest_cr0 & X86_CR0_TS))
 			return false;
+		else if (is_debug(intr_info) &&
+			 vcpu->guest_debug &
+			 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+			return false;
+		else if (is_breakpoint(intr_info) &&
+			 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+			return false;
 		return vmcs12->exception_bitmap &
 				(1u << (intr_info & INTR_INFO_VECTOR_MASK));
 	case EXIT_REASON_EXTERNAL_INTERRUPT:
@@ -8391,6 +8418,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
 static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 {
 	u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	register void *__sp asm(_ASM_SP);
 
 	/*
 	 * If external interrupt exists, IF bit is set in rflags/eflags on the
@@ -8423,8 +8451,9 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 			"call *%[entry]\n\t"
 			:
 #ifdef CONFIG_X86_64
-			[sp]"=&r"(tmp)
+			[sp]"=&r"(tmp),
 #endif
+			"+r"(__sp)
 			:
 			[entry]"r"(entry),
 			[ss]"i"(__KERNEL_DS),
@@ -8625,6 +8654,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 		vmx_set_interrupt_shadow(vcpu, 0);
 
+	if (vmx->guest_pkru_valid)
+		__write_pkru(vmx->guest_pkru);
+
 	atomic_switch_perf_msrs(vmx);
 	debugctlmsr = get_debugctlmsr();
 
@@ -8765,6 +8797,20 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
 
 	/*
+	 * eager fpu is enabled if PKEY is supported and CR4 is switched
+	 * back on host, so it is safe to read guest PKRU from current
+	 * XSAVE.
+	 */
+	if (boot_cpu_has(X86_FEATURE_OSPKE)) {
+		vmx->guest_pkru = __read_pkru();
+		if (vmx->guest_pkru != vmx->host_pkru) {
+			vmx->guest_pkru_valid = true;
+			__write_pkru(vmx->host_pkru);
+		} else
+			vmx->guest_pkru_valid = false;
+	}
+
+	/*
 	 * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
 	 * we did not inject a still-pending event to L1 now because of
 	 * nested_run_pending, we need to re-enable this bit.
@@ -10291,7 +10337,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
 	vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
 	vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
-	if (vmx_mpx_supported())
+	if (kvm_mpx_supported())
 		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
 	if (nested_cpu_has_xsaves(vmcs12))
 		vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
@@ -10799,13 +10845,26 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
 		 */
 
 		kvm_set_msi_irq(e, &irq);
-		if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu))
+		if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+			/*
+			 * Make sure the IRTE is in remapped mode if
+			 * we don't handle it in posted mode.
+			 */
+			ret = irq_set_vcpu_affinity(host_irq, NULL);
+			if (ret < 0) {
+				printk(KERN_INFO
+				   "failed to back to remapped mode, irq: %u\n",
+				   host_irq);
+				goto out;
+			}
+
 			continue;
+		}
 
 		vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
 		vcpu_info.vector = irq.vector;
 
-		trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi,
+		trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
 				vcpu_info.vector, vcpu_info.pi_desc_addr, set);
 
 		if (set)
@@ -10875,6 +10934,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.cache_reg = vmx_cache_reg,
 	.get_rflags = vmx_get_rflags,
 	.set_rflags = vmx_set_rflags,
+
+	.get_pkru = vmx_get_pkru,
+
 	.fpu_activate = vmx_fpu_activate,
 	.fpu_deactivate = vmx_fpu_deactivate,
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ac4963c38..9b7798c7b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -123,6 +123,9 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
 unsigned int __read_mostly lapic_timer_advance_ns = 0;
 module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
 
+static bool __read_mostly vector_hashing = true;
+module_param(vector_hashing, bool, S_IRUGO);
+
 static bool __read_mostly backwards_tsc_observed = false;
 
 #define KVM_NR_SHARED_MSRS 16
@@ -719,7 +722,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
 	unsigned long old_cr4 = kvm_read_cr4(vcpu);
 	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
-				   X86_CR4_SMEP | X86_CR4_SMAP;
+				   X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
 
 	if (cr4 & CR4_RESERVED_BITS)
 		return 1;
@@ -736,6 +739,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE))
 		return 1;
 
+	if (!guest_cpuid_has_pku(vcpu) && (cr4 & X86_CR4_PKE))
+		return 1;
+
 	if (is_long_mode(vcpu)) {
 		if (!(cr4 & X86_CR4_PAE))
 			return 1;
@@ -761,7 +767,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	    (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
 		kvm_mmu_reset_context(vcpu);
 
-	if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
+	if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
 		kvm_update_cpuid(vcpu);
 
 	return 0;
@@ -1195,17 +1201,11 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 
 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
 {
-	uint32_t quotient, remainder;
-
-	/* Don't try to replace with do_div(), this one calculates
-	 * "(dividend << 32) / divisor" */
-	__asm__ ( "divl %4"
-		  : "=a" (quotient), "=d" (remainder)
-		  : "0" (0), "1" (dividend), "r" (divisor) );
-	return quotient;
+	do_shl32_div32(dividend, divisor);
+	return dividend;
 }
 
-static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
+static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
 			       s8 *pshift, u32 *pmultiplier)
 {
 	uint64_t scaled64;
@@ -1213,8 +1213,8 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
 	uint64_t tps64;
 	uint32_t tps32;
 
-	tps64 = base_khz * 1000LL;
-	scaled64 = scaled_khz * 1000LL;
+	tps64 = base_hz;
+	scaled64 = scaled_hz;
 	while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
 		tps64 >>= 1;
 		shift--;
@@ -1232,8 +1232,8 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
 	*pshift = shift;
 	*pmultiplier = div_frac(scaled64, tps32);
 
-	pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
-		 __func__, base_khz, scaled_khz, shift, *pmultiplier);
+	pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
+		 __func__, base_hz, scaled_hz, shift, *pmultiplier);
 }
 
 #ifdef CONFIG_X86_64
@@ -1292,23 +1292,23 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
 	return 0;
 }
 
-static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
+static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
 {
 	u32 thresh_lo, thresh_hi;
 	int use_scaling = 0;
 
 	/* tsc_khz can be zero if TSC calibration fails */
-	if (this_tsc_khz == 0) {
+	if (user_tsc_khz == 0) {
 		/* set tsc_scaling_ratio to a safe value */
 		vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
 		return -1;
 	}
 
 	/* Compute a scale to convert nanoseconds in TSC cycles */
-	kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
+	kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
 			   &vcpu->arch.virtual_tsc_shift,
 			   &vcpu->arch.virtual_tsc_mult);
-	vcpu->arch.virtual_tsc_khz = this_tsc_khz;
+	vcpu->arch.virtual_tsc_khz = user_tsc_khz;
 
 	/*
 	 * Compute the variation in TSC rate which is acceptable
@@ -1318,11 +1318,11 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
 	 */
 	thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
 	thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
-	if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
-		pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
+	if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
+		pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
 		use_scaling = 1;
 	}
-	return set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
+	return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
 }
 
 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
@@ -1561,7 +1561,7 @@ static cycle_t read_tsc(void)
 
 	/*
 	 * GCC likes to generate cmov here, but this branch is extremely
-	 * predictable (it's just a funciton of time and the likely is
+	 * predictable (it's just a function of time and the likely is
 	 * very likely) and there's a data dependence, so force GCC
 	 * to generate a branch instead.  I don't barrier() because
 	 * we don't actually need a barrier, and if this function
@@ -1715,7 +1715,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 
 static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
-	unsigned long flags, this_tsc_khz, tgt_tsc_khz;
+	unsigned long flags, tgt_tsc_khz;
 	struct kvm_vcpu_arch *vcpu = &v->arch;
 	struct kvm_arch *ka = &v->kvm->arch;
 	s64 kernel_ns;
@@ -1741,8 +1741,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
-	this_tsc_khz = __this_cpu_read(cpu_tsc_khz);
-	if (unlikely(this_tsc_khz == 0)) {
+	tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
+	if (unlikely(tgt_tsc_khz == 0)) {
 		local_irq_restore(flags);
 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
 		return 1;
@@ -1777,13 +1777,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	if (!vcpu->pv_time_enabled)
 		return 0;
 
-	if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
-		tgt_tsc_khz = kvm_has_tsc_control ?
-			vcpu->virtual_tsc_khz : this_tsc_khz;
-		kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz,
+	if (kvm_has_tsc_control)
+		tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
+
+	if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
+		kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
 				   &vcpu->hv_clock.tsc_shift,
 				   &vcpu->hv_clock.tsc_to_system_mul);
-		vcpu->hw_tsc_khz = this_tsc_khz;
+		vcpu->hw_tsc_khz = tgt_tsc_khz;
 	}
 
 	/* With all the info we got, fill in the values */
@@ -2751,7 +2752,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 
 	kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
-	vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -2987,7 +2987,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 	kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
 
 	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
-	    kvm_vcpu_has_lapic(vcpu))
+	    lapic_in_kernel(vcpu))
 		vcpu->arch.apic->sipi_vector = events->sipi_vector;
 
 	if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
@@ -3000,7 +3000,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 			vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
 		else
 			vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
-		if (kvm_vcpu_has_lapic(vcpu)) {
+		if (lapic_in_kernel(vcpu)) {
 			if (events->smi.latched_init)
 				set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
 			else
@@ -3240,7 +3240,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	switch (ioctl) {
 	case KVM_GET_LAPIC: {
 		r = -EINVAL;
-		if (!vcpu->arch.apic)
+		if (!lapic_in_kernel(vcpu))
 			goto out;
 		u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
 
@@ -3258,7 +3258,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	}
 	case KVM_SET_LAPIC: {
 		r = -EINVAL;
-		if (!vcpu->arch.apic)
+		if (!lapic_in_kernel(vcpu))
 			goto out;
 		u.lapic = memdup_user(argp, sizeof(*u.lapic));
 		if (IS_ERR(u.lapic))
@@ -3605,20 +3605,26 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 
 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
 {
-	mutex_lock(&kvm->arch.vpit->pit_state.lock);
-	memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
-	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+	struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
+
+	BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
+
+	mutex_lock(&kps->lock);
+	memcpy(ps, &kps->channels, sizeof(*ps));
+	mutex_unlock(&kps->lock);
 	return 0;
 }
 
 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
 {
 	int i;
-	mutex_lock(&kvm->arch.vpit->pit_state.lock);
-	memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
+	struct kvm_pit *pit = kvm->arch.vpit;
+
+	mutex_lock(&pit->pit_state.lock);
+	memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
 	for (i = 0; i < 3; i++)
-		kvm_pit_load_count(kvm, i, ps->channels[i].count, 0);
-	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+		kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
+	mutex_unlock(&pit->pit_state.lock);
 	return 0;
 }
 
@@ -3638,29 +3644,39 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
 	int start = 0;
 	int i;
 	u32 prev_legacy, cur_legacy;
-	mutex_lock(&kvm->arch.vpit->pit_state.lock);
-	prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+	struct kvm_pit *pit = kvm->arch.vpit;
+
+	mutex_lock(&pit->pit_state.lock);
+	prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
 	cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
 	if (!prev_legacy && cur_legacy)
 		start = 1;
-	memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
-	       sizeof(kvm->arch.vpit->pit_state.channels));
-	kvm->arch.vpit->pit_state.flags = ps->flags;
+	memcpy(&pit->pit_state.channels, &ps->channels,
+	       sizeof(pit->pit_state.channels));
+	pit->pit_state.flags = ps->flags;
 	for (i = 0; i < 3; i++)
-		kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count,
+		kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
 				   start && i == 0);
-	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+	mutex_unlock(&pit->pit_state.lock);
 	return 0;
 }
 
 static int kvm_vm_ioctl_reinject(struct kvm *kvm,
 				 struct kvm_reinject_control *control)
 {
-	if (!kvm->arch.vpit)
+	struct kvm_pit *pit = kvm->arch.vpit;
+
+	if (!pit)
 		return -ENXIO;
-	mutex_lock(&kvm->arch.vpit->pit_state.lock);
-	kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
-	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+
+	/* pit->pit_state.lock was overloaded to prevent userspace from getting
+	 * an inconsistent state after running multiple KVM_REINJECT_CONTROL
+	 * ioctls in parallel.  Use a separate lock if that ioctl isn't rare.
+	 */
+	mutex_lock(&pit->pit_state.lock);
+	kvm_pit_set_reinject(pit, control->pit_reinject);
+	mutex_unlock(&pit->pit_state.lock);
+
 	return 0;
 }
 
@@ -4093,7 +4109,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
 
 	do {
 		n = min(len, 8);
-		if (!(vcpu->arch.apic &&
+		if (!(lapic_in_kernel(vcpu) &&
 		      !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
 		    && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
 			break;
@@ -4113,7 +4129,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 
 	do {
 		n = min(len, 8);
-		if (!(vcpu->arch.apic &&
+		if (!(lapic_in_kernel(vcpu) &&
 		      !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
 					 addr, n, v))
 		    && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
@@ -4312,9 +4328,14 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
 	u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
 		| (write ? PFERR_WRITE_MASK : 0);
 
+	/*
+	 * currently PKRU is only applied to ept enabled guest so
+	 * there is no pkey in EPT page table for L1 guest or EPT
+	 * shadow page table for L2 guest.
+	 */
 	if (vcpu_match_mmio_gva(vcpu, gva)
 	    && !permission_fault(vcpu, vcpu->arch.walk_mmu,
-				 vcpu->arch.access, access)) {
+				 vcpu->arch.access, 0, access)) {
 		*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
 					(gva & (PAGE_SIZE - 1));
 		trace_vcpu_match_mmio(gva, *gpa, write, false);
@@ -4346,7 +4367,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 	ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
 	if (ret < 0)
 		return 0;
-	kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+	kvm_page_track_write(vcpu, gpa, val, bytes);
 	return 1;
 }
 
@@ -4604,7 +4625,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
 		return X86EMUL_CMPXCHG_FAILED;
 
 	kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
-	kvm_mmu_pte_write(vcpu, gpa, new, bytes);
+	kvm_page_track_write(vcpu, gpa, new, bytes);
 
 	return X86EMUL_CONTINUE;
 
@@ -6010,7 +6031,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
 	if (!kvm_x86_ops->update_cr8_intercept)
 		return;
 
-	if (!vcpu->arch.apic)
+	if (!lapic_in_kernel(vcpu))
 		return;
 
 	if (vcpu->arch.apicv_active)
@@ -6572,8 +6593,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 
-	/* We should set ->mode before check ->requests,
-	 * see the comment in make_all_cpus_request.
+	/*
+	 * We should set ->mode before check ->requests,
+	 * Please see the comment in kvm_make_all_cpus_request.
+	 * This also orders the write to mode from any reads
+	 * to the page tables done while the VCPU is running.
+	 * Please see the comment in kvm_flush_remote_tlbs.
 	 */
 	smp_mb__after_srcu_read_unlock();
 
@@ -7040,7 +7065,7 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
-	if (!kvm_vcpu_has_lapic(vcpu) &&
+	if (!lapic_in_kernel(vcpu) &&
 	    mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
 		return -EINVAL;
 
@@ -7111,7 +7136,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 
 	mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
 	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
-	if (sregs->cr4 & X86_CR4_OSXSAVE)
+	if (sregs->cr4 & (X86_CR4_OSXSAVE | X86_CR4_PKE))
 		kvm_update_cpuid(vcpu);
 
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
@@ -7313,7 +7338,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 	 * Every 255 times fpu_counter rolls over to 0; a guest that uses
 	 * the FPU in bursts will revert to loading it on demand.
 	 */
-	if (!vcpu->arch.eager_fpu) {
+	if (!use_eager_fpu()) {
 		if (++vcpu->fpu_counter < 5)
 			kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
 	}
@@ -7592,6 +7617,7 @@ bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
 }
 
 struct static_key kvm_no_apic_vcpu __read_mostly;
+EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
 
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
@@ -7723,6 +7749,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
 	INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
 
+	kvm_page_track_init(kvm);
+	kvm_mmu_init_vm(kvm);
+
 	return 0;
 }
 
@@ -7849,6 +7878,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	kfree(kvm->arch.vioapic);
 	kvm_free_vcpus(kvm);
 	kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+	kvm_mmu_uninit_vm(kvm);
 }
 
 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
@@ -7870,6 +7900,8 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 			free->arch.lpage_info[i - 1] = NULL;
 		}
 	}
+
+	kvm_page_track_free_memslot(free, dont);
 }
 
 int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
@@ -7878,6 +7910,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 	int i;
 
 	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+		struct kvm_lpage_info *linfo;
 		unsigned long ugfn;
 		int lpages;
 		int level = i + 1;
@@ -7892,15 +7925,16 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 		if (i == 0)
 			continue;
 
-		slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
-					sizeof(*slot->arch.lpage_info[i - 1]));
-		if (!slot->arch.lpage_info[i - 1])
+		linfo = kvm_kvzalloc(lpages * sizeof(*linfo));
+		if (!linfo)
 			goto out_free;
 
+		slot->arch.lpage_info[i - 1] = linfo;
+
 		if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
-			slot->arch.lpage_info[i - 1][0].write_count = 1;
+			linfo[0].disallow_lpage = 1;
 		if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
-			slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
+			linfo[lpages - 1].disallow_lpage = 1;
 		ugfn = slot->userspace_addr >> PAGE_SHIFT;
 		/*
 		 * If the gfn and userspace address are not aligned wrt each
@@ -7912,10 +7946,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 			unsigned long j;
 
 			for (j = 0; j < lpages; ++j)
-				slot->arch.lpage_info[i - 1][j].write_count = 1;
+				linfo[j].disallow_lpage = 1;
 		}
 	}
 
+	if (kvm_page_track_create_memslot(slot, npages))
+		goto out_free;
+
 	return 0;
 
 out_free:
@@ -8369,6 +8406,12 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
 	return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
 }
 
+bool kvm_vector_hashing_enabled(void)
+{
+	return vector_hashing;
+}
+EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
+
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f2afa5fe4..7ce3634ab 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -179,10 +179,12 @@ int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
 int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
 bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
 					  int page_num);
+bool kvm_vector_hashing_enabled(void);
 
 #define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
 				| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
-				| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512)
+				| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
+				| XFEATURE_MASK_PKRU)
 extern u64 host_xcr0;
 
 extern u64 kvm_supported_xcr0(void);
@@ -192,4 +194,19 @@ extern unsigned int min_timer_period_us;
 extern unsigned int lapic_timer_advance_ns;
 
 extern struct static_key kvm_no_apic_vcpu;
+
+/* Same "calling convention" as do_div:
+ * - divide (n << 32) by base
+ * - put result in n
+ * - return remainder
+ */
+#define do_shl32_div32(n, base)					\
+	({							\
+	    u32 __quot, __rem;					\
+	    asm("divl %2" : "=a" (__quot), "=d" (__rem)		\
+			: "rm" (base), "0" (0), "1" ((u32) n));	\
+	    n = __quot;						\
+	    __rem;						\
+	 })
+
 #endif