Initial import

author: André Fabian Silva Delgado <emulatorman@parabola.nu> 2015-08-05 17:04:01 -0300
committer: André Fabian Silva Delgado <emulatorman@parabola.nu> 2015-08-05 17:04:01 -0300
commit: 57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch)
tree: 5e910f0e82173f4ef4f51111366a3f1299037a7b /arch/powerpc/kvm
57 files changed, 33297 insertions, 0 deletions
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
new file mode 100644
index 000000000..3caec2c42
--- /dev/null
+++ b/arch/powerpc/kvm/Kconfig
@@ -0,0 +1,198 @@
+#
+# KVM configuration
+#
+
+source "virt/kvm/Kconfig"
+
+menuconfig VIRTUALIZATION
+	bool "Virtualization"
+	---help---
+	  Say Y here to get to see options for using your Linux host to run
+	  other operating systems inside virtual machines (guests).
+	  This option alone does not add any kernel code.
+
+	  If you say N, all options in this submenu will be skipped and
+	  disabled.
+
+if VIRTUALIZATION
+
+config KVM
+	bool
+	select PREEMPT_NOTIFIERS
+	select ANON_INODES
+	select HAVE_KVM_EVENTFD
+	select SRCU
+
+config KVM_BOOK3S_HANDLER
+	bool
+
+config KVM_BOOK3S_32_HANDLER
+	bool
+	select KVM_BOOK3S_HANDLER
+	select KVM_MMIO
+
+config KVM_BOOK3S_64_HANDLER
+	bool
+	select KVM_BOOK3S_HANDLER
+
+config KVM_BOOK3S_PR_POSSIBLE
+	bool
+	select KVM_MMIO
+	select MMU_NOTIFIER
+
+config KVM_BOOK3S_HV_POSSIBLE
+	bool
+
+config KVM_BOOK3S_32
+	tristate "KVM support for PowerPC book3s_32 processors"
+	depends on PPC_BOOK3S_32 && !SMP && !PTE_64BIT
+	select KVM
+	select KVM_BOOK3S_32_HANDLER
+	select KVM_BOOK3S_PR_POSSIBLE
+	---help---
+	  Support running unmodified book3s_32 guest kernels
+	  in virtual machines on book3s_32 host processors.
+
+	  This module provides access to the hardware capabilities through
+	  a character device node named /dev/kvm.
+
+	  If unsure, say N.
+
+config KVM_BOOK3S_64
+	tristate "KVM support for PowerPC book3s_64 processors"
+	depends on PPC_BOOK3S_64
+	select KVM_BOOK3S_64_HANDLER
+	select KVM
+	select KVM_BOOK3S_PR_POSSIBLE if !KVM_BOOK3S_HV_POSSIBLE
+	---help---
+	  Support running unmodified book3s_64 and book3s_32 guest kernels
+	  in virtual machines on book3s_64 host processors.
+
+	  This module provides access to the hardware capabilities through
+	  a character device node named /dev/kvm.
+
+	  If unsure, say N.
+
+config KVM_BOOK3S_64_HV
+	tristate "KVM support for POWER7 and PPC970 using hypervisor mode in host"
+	depends on KVM_BOOK3S_64 && PPC_POWERNV
+	select KVM_BOOK3S_HV_POSSIBLE
+	select MMU_NOTIFIER
+	select CMA
+	---help---
+	  Support running unmodified book3s_64 guest kernels in
+	  virtual machines on POWER7 and PPC970 processors that have
+	  hypervisor mode available to the host.
+
+	  If you say Y here, KVM will use the hardware virtualization
+	  facilities of POWER7 (and later) processors, meaning that
+	  guest operating systems will run at full hardware speed
+	  using supervisor and user modes.  However, this also means
+	  that KVM is not usable under PowerVM (pHyp), is only usable
+	  on POWER7 (or later) processors and PPC970-family processors,
+	  and cannot emulate a different processor from the host processor.
+
+	  If unsure, say N.
+
+config KVM_BOOK3S_64_PR
+	tristate "KVM support without using hypervisor mode in host"
+	depends on KVM_BOOK3S_64
+	select KVM_BOOK3S_PR_POSSIBLE
+	---help---
+	  Support running guest kernels in virtual machines on processors
+	  without using hypervisor mode in the host, by running the
+	  guest in user mode (problem state) and emulating all
+	  privileged instructions and registers.
+
+	  This is not as fast as using hypervisor mode, but works on
+	  machines where hypervisor mode is not available or not usable,
+	  and can emulate processors that are different from the host
+	  processor, including emulating 32-bit processors on a 64-bit
+	  host.
+
+config KVM_BOOK3S_HV_EXIT_TIMING
+	bool "Detailed timing for hypervisor real-mode code"
+	depends on KVM_BOOK3S_HV_POSSIBLE && DEBUG_FS
+	---help---
+	  Calculate time taken for each vcpu in the real-mode guest entry,
+	  exit, and interrupt handling code, plus time spent in the guest
+	  and in nap mode due to idle (cede) while other threads are still
+	  in the guest.  The total, minimum and maximum times in nanoseconds
+	  together with the number of executions are reported in debugfs in
+	  kvm/vm#/vcpu#/timings.  The overhead is of the order of 30 - 40
+	  ns per exit on POWER8.
+
+	  If unsure, say N.
+
+config KVM_BOOKE_HV
+	bool
+
+config KVM_EXIT_TIMING
+	bool "Detailed exit timing"
+	depends on KVM_E500V2 || KVM_E500MC
+	---help---
+	  Calculate elapsed time for every exit/enter cycle. A per-vcpu
+	  report is available in debugfs kvm/vm#_vcpu#_timing.
+	  The overhead is relatively small, however it is not recommended for
+	  production environments.
+
+	  If unsure, say N.
+
+config KVM_E500V2
+	bool "KVM support for PowerPC E500v2 processors"
+	depends on E500 && !PPC_E500MC
+	select KVM
+	select KVM_MMIO
+	select MMU_NOTIFIER
+	---help---
+	  Support running unmodified E500 guest kernels in virtual machines on
+	  E500v2 host processors.
+
+	  This module provides access to the hardware capabilities through
+	  a character device node named /dev/kvm.
+
+	  If unsure, say N.
+
+config KVM_E500MC
+	bool "KVM support for PowerPC E500MC/E5500/E6500 processors"
+	depends on PPC_E500MC
+	select KVM
+	select KVM_MMIO
+	select KVM_BOOKE_HV
+	select MMU_NOTIFIER
+	---help---
+	  Support running unmodified E500MC/E5500/E6500 guest kernels in
+	  virtual machines on E500MC/E5500/E6500 host processors.
+
+	  This module provides access to the hardware capabilities through
+	  a character device node named /dev/kvm.
+
+	  If unsure, say N.
+
+config KVM_MPIC
+	bool "KVM in-kernel MPIC emulation"
+	depends on KVM && E500
+	select HAVE_KVM_IRQCHIP
+	select HAVE_KVM_IRQFD
+	select HAVE_KVM_IRQ_ROUTING
+	select HAVE_KVM_MSI
+	help
+	  Enable support for emulating MPIC devices inside the
+          host kernel, rather than relying on userspace to emulate.
+          Currently, support is limited to certain versions of
+          Freescale's MPIC implementation.
+
+config KVM_XICS
+	bool "KVM in-kernel XICS emulation"
+	depends on KVM_BOOK3S_64 && !KVM_MPIC
+	select HAVE_KVM_IRQCHIP
+	select HAVE_KVM_IRQFD
+	default y
+	---help---
+	  Include support for the XICS (eXternal Interrupt Controller
+	  Specification) interrupt controller architecture used on
+	  IBM POWER (pSeries) servers.
+
+source drivers/vhost/Kconfig
+
+endif # VIRTUALIZATION
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
new file mode 100644
index 000000000..0570eef83
--- /dev/null
+++ b/arch/powerpc/kvm/Makefile
@@ -0,0 +1,128 @@
+#
+# Makefile for Kernel-based Virtual Machine module
+#
+
+subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
+
+ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
+KVM := ../../../virt/kvm
+
+common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
+		$(KVM)/eventfd.o
+
+CFLAGS_e500_mmu.o := -I.
+CFLAGS_e500_mmu_host.o := -I.
+CFLAGS_emulate.o  := -I.
+CFLAGS_emulate_loadstore.o  := -I.
+
+common-objs-y += powerpc.o emulate.o emulate_loadstore.o
+obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
+obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o
+
+AFLAGS_booke_interrupts.o := -I$(obj)
+
+kvm-e500-objs := \
+	$(common-objs-y) \
+	booke.o \
+	booke_emulate.o \
+	booke_interrupts.o \
+	e500.o \
+	e500_mmu.o \
+	e500_mmu_host.o \
+	e500_emulate.o
+kvm-objs-$(CONFIG_KVM_E500V2) := $(kvm-e500-objs)
+
+kvm-e500mc-objs := \
+	$(common-objs-y) \
+	booke.o \
+	booke_emulate.o \
+	bookehv_interrupts.o \
+	e500mc.o \
+	e500_mmu.o \
+	e500_mmu_host.o \
+	e500_emulate.o
+kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs)
+
+kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) := \
+	book3s_64_vio_hv.o
+
+kvm-pr-y := \
+	fpu.o \
+	emulate.o \
+	book3s_paired_singles.o \
+	book3s_pr.o \
+	book3s_pr_papr.o \
+	book3s_emulate.o \
+	book3s_interrupts.o \
+	book3s_mmu_hpte.o \
+	book3s_64_mmu_host.o \
+	book3s_64_mmu.o \
+	book3s_32_mmu.o
+
+ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
+kvm-book3s_64-module-objs := \
+	$(KVM)/coalesced_mmio.o
+
+kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
+	book3s_rmhandlers.o
+endif
+
+kvm-hv-y += \
+	book3s_hv.o \
+	book3s_hv_interrupts.o \
+	book3s_64_mmu_hv.o
+
+kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \
+	book3s_hv_rm_xics.o
+
+ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
+	book3s_hv_rmhandlers.o \
+	book3s_hv_rm_mmu.o \
+	book3s_hv_ras.o \
+	book3s_hv_builtin.o \
+	$(kvm-book3s_64-builtin-xics-objs-y)
+endif
+
+kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
+	book3s_xics.o
+
+kvm-book3s_64-module-objs += \
+	$(KVM)/kvm_main.o \
+	$(KVM)/eventfd.o \
+	powerpc.o \
+	emulate_loadstore.o \
+	book3s.o \
+	book3s_64_vio.o \
+	book3s_rtas.o \
+	$(kvm-book3s_64-objs-y)
+
+kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)
+
+kvm-book3s_32-objs := \
+	$(common-objs-y) \
+	fpu.o \
+	book3s_paired_singles.o \
+	book3s.o \
+	book3s_pr.o \
+	book3s_emulate.o \
+	book3s_interrupts.o \
+	book3s_mmu_hpte.o \
+	book3s_32_mmu_host.o \
+	book3s_32_mmu.o
+kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs)
+
+kvm-objs-$(CONFIG_KVM_MPIC) += mpic.o
+kvm-objs-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
+
+kvm-objs := $(kvm-objs-m) $(kvm-objs-y)
+
+obj-$(CONFIG_KVM_E500V2) += kvm.o
+obj-$(CONFIG_KVM_E500MC) += kvm.o
+obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o
+obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o
+
+obj-$(CONFIG_KVM_BOOK3S_64_PR) += kvm-pr.o
+obj-$(CONFIG_KVM_BOOK3S_64_HV) += kvm-hv.o
+
+obj-y += $(kvm-book3s_64-builtin-objs-y)
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
new file mode 100644
index 000000000..453a8a47a
--- /dev/null
+++ b/arch/powerpc/kvm/book3s.c
@@ -0,0 +1,944 @@
+/*
+ * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *    Alexander Graf <agraf@suse.de>
+ *    Kevin Wolf <mail@kevin-wolf.de>
+ *
+ * Description:
+ * This file is derived from arch/powerpc/kvm/44x.c,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu_context.h>
+#include <asm/page.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+
+#include "book3s.h"
+#include "trace.h"
+
+#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+
+/* #define EXIT_DEBUG */
+
+struct kvm_stats_debugfs_item debugfs_entries[] = {
+	{ "exits",       VCPU_STAT(sum_exits) },
+	{ "mmio",        VCPU_STAT(mmio_exits) },
+	{ "sig",         VCPU_STAT(signal_exits) },
+	{ "sysc",        VCPU_STAT(syscall_exits) },
+	{ "inst_emu",    VCPU_STAT(emulated_inst_exits) },
+	{ "dec",         VCPU_STAT(dec_exits) },
+	{ "ext_intr",    VCPU_STAT(ext_intr_exits) },
+	{ "queue_intr",  VCPU_STAT(queue_intr) },
+	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll), },
+	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
+	{ "pf_storage",  VCPU_STAT(pf_storage) },
+	{ "sp_storage",  VCPU_STAT(sp_storage) },
+	{ "pf_instruc",  VCPU_STAT(pf_instruc) },
+	{ "sp_instruc",  VCPU_STAT(sp_instruc) },
+	{ "ld",          VCPU_STAT(ld) },
+	{ "ld_slow",     VCPU_STAT(ld_slow) },
+	{ "st",          VCPU_STAT(st) },
+	{ "st_slow",     VCPU_STAT(st_slow) },
+	{ NULL }
+};
+
+void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) {
+		ulong pc = kvmppc_get_pc(vcpu);
+		if ((pc & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)
+			kvmppc_set_pc(vcpu, pc & ~SPLIT_HACK_MASK);
+		vcpu->arch.hflags &= ~BOOK3S_HFLAG_SPLIT_HACK;
+	}
+}
+EXPORT_SYMBOL_GPL(kvmppc_unfixup_split_real);
+
+static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
+{
+	if (!is_kvmppc_hv_enabled(vcpu->kvm))
+		return to_book3s(vcpu)->hior;
+	return 0;
+}
+
+static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
+			unsigned long pending_now, unsigned long old_pending)
+{
+	if (is_kvmppc_hv_enabled(vcpu->kvm))
+		return;
+	if (pending_now)
+		kvmppc_set_int_pending(vcpu, 1);
+	else if (old_pending)
+		kvmppc_set_int_pending(vcpu, 0);
+}
+
+static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
+{
+	ulong crit_raw;
+	ulong crit_r1;
+	bool crit;
+
+	if (is_kvmppc_hv_enabled(vcpu->kvm))
+		return false;
+
+	crit_raw = kvmppc_get_critical(vcpu);
+	crit_r1 = kvmppc_get_gpr(vcpu, 1);
+
+	/* Truncate crit indicators in 32 bit mode */
+	if (!(kvmppc_get_msr(vcpu) & MSR_SF)) {
+		crit_raw &= 0xffffffff;
+		crit_r1 &= 0xffffffff;
+	}
+
+	/* Critical section when crit == r1 */
+	crit = (crit_raw == crit_r1);
+	/* ... and we're in supervisor mode */
+	crit = crit && !(kvmppc_get_msr(vcpu) & MSR_PR);
+
+	return crit;
+}
+
+void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags)
+{
+	kvmppc_unfixup_split_real(vcpu);
+	kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu));
+	kvmppc_set_srr1(vcpu, kvmppc_get_msr(vcpu) | flags);
+	kvmppc_set_pc(vcpu, kvmppc_interrupt_offset(vcpu) + vec);
+	vcpu->arch.mmu.reset_msr(vcpu);
+}
+
+static int kvmppc_book3s_vec2irqprio(unsigned int vec)
+{
+	unsigned int prio;
+
+	switch (vec) {
+	case 0x100: prio = BOOK3S_IRQPRIO_SYSTEM_RESET;		break;
+	case 0x200: prio = BOOK3S_IRQPRIO_MACHINE_CHECK;	break;
+	case 0x300: prio = BOOK3S_IRQPRIO_DATA_STORAGE;		break;
+	case 0x380: prio = BOOK3S_IRQPRIO_DATA_SEGMENT;		break;
+	case 0x400: prio = BOOK3S_IRQPRIO_INST_STORAGE;		break;
+	case 0x480: prio = BOOK3S_IRQPRIO_INST_SEGMENT;		break;
+	case 0x500: prio = BOOK3S_IRQPRIO_EXTERNAL;		break;
+	case 0x501: prio = BOOK3S_IRQPRIO_EXTERNAL_LEVEL;	break;
+	case 0x600: prio = BOOK3S_IRQPRIO_ALIGNMENT;		break;
+	case 0x700: prio = BOOK3S_IRQPRIO_PROGRAM;		break;
+	case 0x800: prio = BOOK3S_IRQPRIO_FP_UNAVAIL;		break;
+	case 0x900: prio = BOOK3S_IRQPRIO_DECREMENTER;		break;
+	case 0xc00: prio = BOOK3S_IRQPRIO_SYSCALL;		break;
+	case 0xd00: prio = BOOK3S_IRQPRIO_DEBUG;		break;
+	case 0xf20: prio = BOOK3S_IRQPRIO_ALTIVEC;		break;
+	case 0xf40: prio = BOOK3S_IRQPRIO_VSX;			break;
+	case 0xf60: prio = BOOK3S_IRQPRIO_FAC_UNAVAIL;		break;
+	default:    prio = BOOK3S_IRQPRIO_MAX;			break;
+	}
+
+	return prio;
+}
+
+void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
+					  unsigned int vec)
+{
+	unsigned long old_pending = vcpu->arch.pending_exceptions;
+
+	clear_bit(kvmppc_book3s_vec2irqprio(vec),
+		  &vcpu->arch.pending_exceptions);
+
+	kvmppc_update_int_pending(vcpu, vcpu->arch.pending_exceptions,
+				  old_pending);
+}
+
+void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
+{
+	vcpu->stat.queue_intr++;
+
+	set_bit(kvmppc_book3s_vec2irqprio(vec),
+		&vcpu->arch.pending_exceptions);
+#ifdef EXIT_DEBUG
+	printk(KERN_INFO "Queueing interrupt %x\n", vec);
+#endif
+}
+EXPORT_SYMBOL_GPL(kvmppc_book3s_queue_irqprio);
+
+void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags)
+{
+	/* might as well deliver this straight away */
+	kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_PROGRAM, flags);
+}
+EXPORT_SYMBOL_GPL(kvmppc_core_queue_program);
+
+void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu)
+{
+	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER);
+}
+EXPORT_SYMBOL_GPL(kvmppc_core_queue_dec);
+
+int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu)
+{
+	return test_bit(BOOK3S_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions);
+}
+EXPORT_SYMBOL_GPL(kvmppc_core_pending_dec);
+
+void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu)
+{
+	kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER);
+}
+EXPORT_SYMBOL_GPL(kvmppc_core_dequeue_dec);
+
+void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
+                                struct kvm_interrupt *irq)
+{
+	unsigned int vec = BOOK3S_INTERRUPT_EXTERNAL;
+
+	if (irq->irq == KVM_INTERRUPT_SET_LEVEL)
+		vec = BOOK3S_INTERRUPT_EXTERNAL_LEVEL;
+
+	kvmppc_book3s_queue_irqprio(vcpu, vec);
+}
+
+void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu)
+{
+	kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
+	kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+}
+
+void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar,
+				    ulong flags)
+{
+	kvmppc_set_dar(vcpu, dar);
+	kvmppc_set_dsisr(vcpu, flags);
+	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE);
+}
+
+void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, ulong flags)
+{
+	u64 msr = kvmppc_get_msr(vcpu);
+	msr &= ~(SRR1_ISI_NOPT | SRR1_ISI_N_OR_G | SRR1_ISI_PROT);
+	msr |= flags & (SRR1_ISI_NOPT | SRR1_ISI_N_OR_G | SRR1_ISI_PROT);
+	kvmppc_set_msr_fast(vcpu, msr);
+	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE);
+}
+
+int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
+{
+	int deliver = 1;
+	int vec = 0;
+	bool crit = kvmppc_critical_section(vcpu);
+
+	switch (priority) {
+	case BOOK3S_IRQPRIO_DECREMENTER:
+		deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit;
+		vec = BOOK3S_INTERRUPT_DECREMENTER;
+		break;
+	case BOOK3S_IRQPRIO_EXTERNAL:
+	case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
+		deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit;
+		vec = BOOK3S_INTERRUPT_EXTERNAL;
+		break;
+	case BOOK3S_IRQPRIO_SYSTEM_RESET:
+		vec = BOOK3S_INTERRUPT_SYSTEM_RESET;
+		break;
+	case BOOK3S_IRQPRIO_MACHINE_CHECK:
+		vec = BOOK3S_INTERRUPT_MACHINE_CHECK;
+		break;
+	case BOOK3S_IRQPRIO_DATA_STORAGE:
+		vec = BOOK3S_INTERRUPT_DATA_STORAGE;
+		break;
+	case BOOK3S_IRQPRIO_INST_STORAGE:
+		vec = BOOK3S_INTERRUPT_INST_STORAGE;
+		break;
+	case BOOK3S_IRQPRIO_DATA_SEGMENT:
+		vec = BOOK3S_INTERRUPT_DATA_SEGMENT;
+		break;
+	case BOOK3S_IRQPRIO_INST_SEGMENT:
+		vec = BOOK3S_INTERRUPT_INST_SEGMENT;
+		break;
+	case BOOK3S_IRQPRIO_ALIGNMENT:
+		vec = BOOK3S_INTERRUPT_ALIGNMENT;
+		break;
+	case BOOK3S_IRQPRIO_PROGRAM:
+		vec = BOOK3S_INTERRUPT_PROGRAM;
+		break;
+	case BOOK3S_IRQPRIO_VSX:
+		vec = BOOK3S_INTERRUPT_VSX;
+		break;
+	case BOOK3S_IRQPRIO_ALTIVEC:
+		vec = BOOK3S_INTERRUPT_ALTIVEC;
+		break;
+	case BOOK3S_IRQPRIO_FP_UNAVAIL:
+		vec = BOOK3S_INTERRUPT_FP_UNAVAIL;
+		break;
+	case BOOK3S_IRQPRIO_SYSCALL:
+		vec = BOOK3S_INTERRUPT_SYSCALL;
+		break;
+	case BOOK3S_IRQPRIO_DEBUG:
+		vec = BOOK3S_INTERRUPT_TRACE;
+		break;
+	case BOOK3S_IRQPRIO_PERFORMANCE_MONITOR:
+		vec = BOOK3S_INTERRUPT_PERFMON;
+		break;
+	case BOOK3S_IRQPRIO_FAC_UNAVAIL:
+		vec = BOOK3S_INTERRUPT_FAC_UNAVAIL;
+		break;
+	default:
+		deliver = 0;
+		printk(KERN_ERR "KVM: Unknown interrupt: 0x%x\n", priority);
+		break;
+	}
+
+#if 0
+	printk(KERN_INFO "Deliver interrupt 0x%x? %x\n", vec, deliver);
+#endif
+
+	if (deliver)
+		kvmppc_inject_interrupt(vcpu, vec, 0);
+
+	return deliver;
+}
+
+/*
+ * This function determines if an irqprio should be cleared once issued.
+ */
+static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority)
+{
+	switch (priority) {
+		case BOOK3S_IRQPRIO_DECREMENTER:
+			/* DEC interrupts get cleared by mtdec */
+			return false;
+		case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
+			/* External interrupts get cleared by userspace */
+			return false;
+	}
+
+	return true;
+}
+
+int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
+{
+	unsigned long *pending = &vcpu->arch.pending_exceptions;
+	unsigned long old_pending = vcpu->arch.pending_exceptions;
+	unsigned int priority;
+
+#ifdef EXIT_DEBUG
+	if (vcpu->arch.pending_exceptions)
+		printk(KERN_EMERG "KVM: Check pending: %lx\n", vcpu->arch.pending_exceptions);
+#endif
+	priority = __ffs(*pending);
+	while (priority < BOOK3S_IRQPRIO_MAX) {
+		if (kvmppc_book3s_irqprio_deliver(vcpu, priority) &&
+		    clear_irqprio(vcpu, priority)) {
+			clear_bit(priority, &vcpu->arch.pending_exceptions);
+			break;
+		}
+
+		priority = find_next_bit(pending,
+					 BITS_PER_BYTE * sizeof(*pending),
+					 priority + 1);
+	}
+
+	/* Tell the guest about our interrupt status */
+	kvmppc_update_int_pending(vcpu, *pending, old_pending);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_core_prepare_to_enter);
+
+pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing,
+			bool *writable)
+{
+	ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM;
+	gfn_t gfn = gpa >> PAGE_SHIFT;
+
+	if (!(kvmppc_get_msr(vcpu) & MSR_SF))
+		mp_pa = (uint32_t)mp_pa;
+
+	/* Magic page override */
+	gpa &= ~0xFFFULL;
+	if (unlikely(mp_pa) && unlikely((gpa & KVM_PAM) == mp_pa)) {
+		ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
+		pfn_t pfn;
+
+		pfn = (pfn_t)virt_to_phys((void*)shared_page) >> PAGE_SHIFT;
+		get_page(pfn_to_page(pfn));
+		if (writable)
+			*writable = true;
+		return pfn;
+	}
+
+	return gfn_to_pfn_prot(vcpu->kvm, gfn, writing, writable);
+}
+EXPORT_SYMBOL_GPL(kvmppc_gpa_to_pfn);
+
+int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, enum xlate_instdata xlid,
+		 enum xlate_readwrite xlrw, struct kvmppc_pte *pte)
+{
+	bool data = (xlid == XLATE_DATA);
+	bool iswrite = (xlrw == XLATE_WRITE);
+	int relocated = (kvmppc_get_msr(vcpu) & (data ? MSR_DR : MSR_IR));
+	int r;
+
+	if (relocated) {
+		r = vcpu->arch.mmu.xlate(vcpu, eaddr, pte, data, iswrite);
+	} else {
+		pte->eaddr = eaddr;
+		pte->raddr = eaddr & KVM_PAM;
+		pte->vpage = VSID_REAL | eaddr >> 12;
+		pte->may_read = true;
+		pte->may_write = true;
+		pte->may_execute = true;
+		r = 0;
+
+		if ((kvmppc_get_msr(vcpu) & (MSR_IR | MSR_DR)) == MSR_DR &&
+		    !data) {
+			if ((vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) &&
+			    ((eaddr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS))
+			pte->raddr &= ~SPLIT_HACK_MASK;
+		}
+	}
+
+	return r;
+}
+
+int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, enum instruction_type type,
+					 u32 *inst)
+{
+	ulong pc = kvmppc_get_pc(vcpu);
+	int r;
+
+	if (type == INST_SC)
+		pc -= 4;
+
+	r = kvmppc_ld(vcpu, &pc, sizeof(u32), inst, false);
+	if (r == EMULATE_DONE)
+		return r;
+	else
+		return EMULATE_AGAIN;
+}
+EXPORT_SYMBOL_GPL(kvmppc_load_last_inst);
+
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
+int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
+void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+				  struct kvm_sregs *sregs)
+{
+	return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs);
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+				  struct kvm_sregs *sregs)
+{
+	return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs);
+}
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+	int i;
+
+	regs->pc = kvmppc_get_pc(vcpu);
+	regs->cr = kvmppc_get_cr(vcpu);
+	regs->ctr = kvmppc_get_ctr(vcpu);
+	regs->lr = kvmppc_get_lr(vcpu);
+	regs->xer = kvmppc_get_xer(vcpu);
+	regs->msr = kvmppc_get_msr(vcpu);
+	regs->srr0 = kvmppc_get_srr0(vcpu);
+	regs->srr1 = kvmppc_get_srr1(vcpu);
+	regs->pid = vcpu->arch.pid;
+	regs->sprg0 = kvmppc_get_sprg0(vcpu);
+	regs->sprg1 = kvmppc_get_sprg1(vcpu);
+	regs->sprg2 = kvmppc_get_sprg2(vcpu);
+	regs->sprg3 = kvmppc_get_sprg3(vcpu);
+	regs->sprg4 = kvmppc_get_sprg4(vcpu);
+	regs->sprg5 = kvmppc_get_sprg5(vcpu);
+	regs->sprg6 = kvmppc_get_sprg6(vcpu);
+	regs->sprg7 = kvmppc_get_sprg7(vcpu);
+
+	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
+		regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
+
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+	int i;
+
+	kvmppc_set_pc(vcpu, regs->pc);
+	kvmppc_set_cr(vcpu, regs->cr);
+	kvmppc_set_ctr(vcpu, regs->ctr);
+	kvmppc_set_lr(vcpu, regs->lr);
+	kvmppc_set_xer(vcpu, regs->xer);
+	kvmppc_set_msr(vcpu, regs->msr);
+	kvmppc_set_srr0(vcpu, regs->srr0);
+	kvmppc_set_srr1(vcpu, regs->srr1);
+	kvmppc_set_sprg0(vcpu, regs->sprg0);
+	kvmppc_set_sprg1(vcpu, regs->sprg1);
+	kvmppc_set_sprg2(vcpu, regs->sprg2);
+	kvmppc_set_sprg3(vcpu, regs->sprg3);
+	kvmppc_set_sprg4(vcpu, regs->sprg4);
+	kvmppc_set_sprg5(vcpu, regs->sprg5);
+	kvmppc_set_sprg6(vcpu, regs->sprg6);
+	kvmppc_set_sprg7(vcpu, regs->sprg7);
+
+	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
+		kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
+
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+	return -ENOTSUPP;
+}
+
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+	return -ENOTSUPP;
+}
+
+int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
+			union kvmppc_one_reg *val)
+{
+	int r = 0;
+	long int i;
+
+	r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, id, val);
+	if (r == -EINVAL) {
+		r = 0;
+		switch (id) {
+		case KVM_REG_PPC_DAR:
+			*val = get_reg_val(id, kvmppc_get_dar(vcpu));
+			break;
+		case KVM_REG_PPC_DSISR:
+			*val = get_reg_val(id, kvmppc_get_dsisr(vcpu));
+			break;
+		case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
+			i = id - KVM_REG_PPC_FPR0;
+			*val = get_reg_val(id, VCPU_FPR(vcpu, i));
+			break;
+		case KVM_REG_PPC_FPSCR:
+			*val = get_reg_val(id, vcpu->arch.fp.fpscr);
+			break;
+#ifdef CONFIG_VSX
+		case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31:
+			if (cpu_has_feature(CPU_FTR_VSX)) {
+				i = id - KVM_REG_PPC_VSR0;
+				val->vsxval[0] = vcpu->arch.fp.fpr[i][0];
+				val->vsxval[1] = vcpu->arch.fp.fpr[i][1];
+			} else {
+				r = -ENXIO;
+			}
+			break;
+#endif /* CONFIG_VSX */
+		case KVM_REG_PPC_DEBUG_INST:
+			*val = get_reg_val(id, INS_TW);
+			break;
+#ifdef CONFIG_KVM_XICS
+		case KVM_REG_PPC_ICP_STATE:
+			if (!vcpu->arch.icp) {
+				r = -ENXIO;
+				break;
+			}
+			*val = get_reg_val(id, kvmppc_xics_get_icp(vcpu));
+			break;
+#endif /* CONFIG_KVM_XICS */
+		case KVM_REG_PPC_FSCR:
+			*val = get_reg_val(id, vcpu->arch.fscr);
+			break;
+		case KVM_REG_PPC_TAR:
+			*val = get_reg_val(id, vcpu->arch.tar);
+			break;
+		case KVM_REG_PPC_EBBHR:
+			*val = get_reg_val(id, vcpu->arch.ebbhr);
+			break;
+		case KVM_REG_PPC_EBBRR:
+			*val = get_reg_val(id, vcpu->arch.ebbrr);
+			break;
+		case KVM_REG_PPC_BESCR:
+			*val = get_reg_val(id, vcpu->arch.bescr);
+			break;
+		case KVM_REG_PPC_VTB:
+			*val = get_reg_val(id, vcpu->arch.vtb);
+			break;
+		case KVM_REG_PPC_IC:
+			*val = get_reg_val(id, vcpu->arch.ic);
+			break;
+		default:
+			r = -EINVAL;
+			break;
+		}
+	}
+
+	return r;
+}
+
+int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
+			union kvmppc_one_reg *val)
+{
+	int r = 0;
+	long int i;
+
+	r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, id, val);
+	if (r == -EINVAL) {
+		r = 0;
+		switch (id) {
+		case KVM_REG_PPC_DAR:
+			kvmppc_set_dar(vcpu, set_reg_val(id, *val));
+			break;
+		case KVM_REG_PPC_DSISR:
+			kvmppc_set_dsisr(vcpu, set_reg_val(id, *val));
+			break;
+		case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
+			i = id - KVM_REG_PPC_FPR0;
+			VCPU_FPR(vcpu, i) = set_reg_val(id, *val);
+			break;
+		case KVM_REG_PPC_FPSCR:
+			vcpu->arch.fp.fpscr = set_reg_val(id, *val);
+			break;
+#ifdef CONFIG_VSX
+		case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31:
+			if (cpu_has_feature(CPU_FTR_VSX)) {
+				i = id - KVM_REG_PPC_VSR0;
+				vcpu->arch.fp.fpr[i][0] = val->vsxval[0];
+				vcpu->arch.fp.fpr[i][1] = val->vsxval[1];
+			} else {
+				r = -ENXIO;
+			}
+			break;
+#endif /* CONFIG_VSX */
+#ifdef CONFIG_KVM_XICS
+		case KVM_REG_PPC_ICP_STATE:
+			if (!vcpu->arch.icp) {
+				r = -ENXIO;
+				break;
+			}
+			r = kvmppc_xics_set_icp(vcpu,
+						set_reg_val(id, *val));
+			break;
+#endif /* CONFIG_KVM_XICS */
+		case KVM_REG_PPC_FSCR:
+			vcpu->arch.fscr = set_reg_val(id, *val);
+			break;
+		case KVM_REG_PPC_TAR:
+			vcpu->arch.tar = set_reg_val(id, *val);
+			break;
+		case KVM_REG_PPC_EBBHR:
+			vcpu->arch.ebbhr = set_reg_val(id, *val);
+			break;
+		case KVM_REG_PPC_EBBRR:
+			vcpu->arch.ebbrr = set_reg_val(id, *val);
+			break;
+		case KVM_REG_PPC_BESCR:
+			vcpu->arch.bescr = set_reg_val(id, *val);
+			break;
+		case KVM_REG_PPC_VTB:
+			vcpu->arch.vtb = set_reg_val(id, *val);
+			break;
+		case KVM_REG_PPC_IC:
+			vcpu->arch.ic = set_reg_val(id, *val);
+			break;
+		default:
+			r = -EINVAL;
+			break;
+		}
+	}
+
+	return r;
+}
+
+void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	vcpu->kvm->arch.kvm_ops->vcpu_load(vcpu, cpu);
+}
+
+void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+{
+	vcpu->kvm->arch.kvm_ops->vcpu_put(vcpu);
+}
+
+void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
+{
+	vcpu->kvm->arch.kvm_ops->set_msr(vcpu, msr);
+}
+EXPORT_SYMBOL_GPL(kvmppc_set_msr);
+
+int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+	return vcpu->kvm->arch.kvm_ops->vcpu_run(kvm_run, vcpu);
+}
+
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+                                  struct kvm_translation *tr)
+{
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+					struct kvm_guest_debug *dbg)
+{
+	vcpu->guest_debug = dbg->control;
+	return 0;
+}
+
+void kvmppc_decrementer_func(struct kvm_vcpu *vcpu)
+{
+	kvmppc_core_queue_dec(vcpu);
+	kvm_vcpu_kick(vcpu);
+}
+
+struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+	return kvm->arch.kvm_ops->vcpu_create(kvm, id);
+}
+
+void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+{
+	vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu);
+}
+
+int kvmppc_core_check_requests(struct kvm_vcpu *vcpu)
+{
+	return vcpu->kvm->arch.kvm_ops->check_requests(vcpu);
+}
+
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
+{
+	return kvm->arch.kvm_ops->get_dirty_log(kvm, log);
+}
+
+void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
+			      struct kvm_memory_slot *dont)
+{
+	kvm->arch.kvm_ops->free_memslot(free, dont);
+}
+
+int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
+			       unsigned long npages)
+{
+	return kvm->arch.kvm_ops->create_memslot(slot, npages);
+}
+
+void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
+{
+	kvm->arch.kvm_ops->flush_memslot(kvm, memslot);
+}
+
+int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				struct kvm_memory_slot *memslot,
+				struct kvm_userspace_memory_region *mem)
+{
+	return kvm->arch.kvm_ops->prepare_memory_region(kvm, memslot, mem);
+}
+
+void kvmppc_core_commit_memory_region(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem,
+				const struct kvm_memory_slot *old)
+{
+	kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old);
+}
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+	return kvm->arch.kvm_ops->unmap_hva(kvm, hva);
+}
+EXPORT_SYMBOL_GPL(kvm_unmap_hva);
+
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+	return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end);
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+	return kvm->arch.kvm_ops->age_hva(kvm, start, end);
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	return kvm->arch.kvm_ops->test_age_hva(kvm, hva);
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+	kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte);
+}
+
+void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+	vcpu->kvm->arch.kvm_ops->mmu_destroy(vcpu);
+}
+
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+
+#ifdef CONFIG_PPC64
+	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+	INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
+#endif
+
+	return kvm->arch.kvm_ops->init_vm(kvm);
+}
+
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+	kvm->arch.kvm_ops->destroy_vm(kvm);
+
+#ifdef CONFIG_PPC64
+	kvmppc_rtas_tokens_free(kvm);
+	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
+#endif
+}
+
+int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu)
+{
+	unsigned long size = kvmppc_get_gpr(vcpu, 4);
+	unsigned long addr = kvmppc_get_gpr(vcpu, 5);
+	u64 buf;
+	int ret;
+
+	if (!is_power_of_2(size) || (size > sizeof(buf)))
+		return H_TOO_HARD;
+
+	ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, size, &buf);
+	if (ret != 0)
+		return H_TOO_HARD;
+
+	switch (size) {
+	case 1:
+		kvmppc_set_gpr(vcpu, 4, *(u8 *)&buf);
+		break;
+
+	case 2:
+		kvmppc_set_gpr(vcpu, 4, be16_to_cpu(*(__be16 *)&buf));
+		break;
+
+	case 4:
+		kvmppc_set_gpr(vcpu, 4, be32_to_cpu(*(__be32 *)&buf));
+		break;
+
+	case 8:
+		kvmppc_set_gpr(vcpu, 4, be64_to_cpu(*(__be64 *)&buf));
+		break;
+
+	default:
+		BUG();
+	}
+
+	return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_load);
+
+int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu)
+{
+	unsigned long size = kvmppc_get_gpr(vcpu, 4);
+	unsigned long addr = kvmppc_get_gpr(vcpu, 5);
+	unsigned long val = kvmppc_get_gpr(vcpu, 6);
+	u64 buf;
+	int ret;
+
+	switch (size) {
+	case 1:
+		*(u8 *)&buf = val;
+		break;
+
+	case 2:
+		*(__be16 *)&buf = cpu_to_be16(val);
+		break;
+
+	case 4:
+		*(__be32 *)&buf = cpu_to_be32(val);
+		break;
+
+	case 8:
+		*(__be64 *)&buf = cpu_to_be64(val);
+		break;
+
+	default:
+		return H_TOO_HARD;
+	}
+
+	ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, size, &buf);
+	if (ret != 0)
+		return H_TOO_HARD;
+
+	return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_store);
+
+int kvmppc_core_check_processor_compat(void)
+{
+	/*
+	 * We always return 0 for book3s. We check
+	 * for compatability while loading the HV
+	 * or PR module
+	 */
+	return 0;
+}
+
+int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hcall)
+{
+	return kvm->arch.kvm_ops->hcall_implemented(hcall);
+}
+
+static int kvmppc_book3s_init(void)
+{
+	int r;
+
+	r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+	if (r)
+		return r;
+#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
+	r = kvmppc_book3s_init_pr();
+#endif
+	return r;
+
+}
+
+static void kvmppc_book3s_exit(void)
+{
+#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
+	kvmppc_book3s_exit_pr();
+#endif
+	kvm_exit();
+}
+
+module_init(kvmppc_book3s_init);
+module_exit(kvmppc_book3s_exit);
+
+/* On 32bit this is our one and only kernel module */
+#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
+MODULE_ALIAS_MISCDEV(KVM_MINOR);
+MODULE_ALIAS("devname:kvm");
+#endif
diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h
new file mode 100644
index 000000000..d2b3ec088
--- /dev/null
+++ b/arch/powerpc/kvm/book3s.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright IBM Corporation, 2013
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License or (at your optional) any later version of the license.
+ *
+ */
+
+#ifndef __POWERPC_KVM_BOOK3S_H__
+#define __POWERPC_KVM_BOOK3S_H__
+
+extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
+					 struct kvm_memory_slot *memslot);
+extern int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva);
+extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start,
+				  unsigned long end);
+extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long start,
+			  unsigned long end);
+extern int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva);
+extern void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte);
+
+extern void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu);
+extern int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
+				     unsigned int inst, int *advance);
+extern int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu,
+					int sprn, ulong spr_val);
+extern int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu,
+					int sprn, ulong *spr_val);
+extern int kvmppc_book3s_init_pr(void);
+extern void kvmppc_book3s_exit_pr(void);
+
+#endif
diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c
new file mode 100644
index 000000000..a2eb6d354
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_32_mmu.c
@@ -0,0 +1,430 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+
+/* #define DEBUG_MMU */
+/* #define DEBUG_MMU_PTE */
+/* #define DEBUG_MMU_PTE_IP 0xfff14c40 */
+
+#ifdef DEBUG_MMU
+#define dprintk(X...) printk(KERN_INFO X)
+#else
+#define dprintk(X...) do { } while(0)
+#endif
+
+#ifdef DEBUG_MMU_PTE
+#define dprintk_pte(X...) printk(KERN_INFO X)
+#else
+#define dprintk_pte(X...) do { } while(0)
+#endif
+
+#define PTEG_FLAG_ACCESSED	0x00000100
+#define PTEG_FLAG_DIRTY		0x00000080
+#ifndef SID_SHIFT
+#define SID_SHIFT		28
+#endif
+
+static inline bool check_debug_ip(struct kvm_vcpu *vcpu)
+{
+#ifdef DEBUG_MMU_PTE_IP
+	return vcpu->arch.pc == DEBUG_MMU_PTE_IP;
+#else
+	return true;
+#endif
+}
+
+static inline u32 sr_vsid(u32 sr_raw)
+{
+	return sr_raw & 0x0fffffff;
+}
+
+static inline bool sr_valid(u32 sr_raw)
+{
+	return (sr_raw & 0x80000000) ? false : true;
+}
+
+static inline bool sr_ks(u32 sr_raw)
+{
+	return (sr_raw & 0x40000000) ? true: false;
+}
+
+static inline bool sr_kp(u32 sr_raw)
+{
+	return (sr_raw & 0x20000000) ? true: false;
+}
+
+static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr,
+					  struct kvmppc_pte *pte, bool data,
+					  bool iswrite);
+static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
+					     u64 *vsid);
+
+static u32 find_sr(struct kvm_vcpu *vcpu, gva_t eaddr)
+{
+	return kvmppc_get_sr(vcpu, (eaddr >> 28) & 0xf);
+}
+
+static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
+					 bool data)
+{
+	u64 vsid;
+	struct kvmppc_pte pte;
+
+	if (!kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, &pte, data, false))
+		return pte.vpage;
+
+	kvmppc_mmu_book3s_32_esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid);
+	return (((u64)eaddr >> 12) & 0xffff) | (vsid << 16);
+}
+
+static void kvmppc_mmu_book3s_32_reset_msr(struct kvm_vcpu *vcpu)
+{
+	kvmppc_set_msr(vcpu, 0);
+}
+
+static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvm_vcpu *vcpu,
+				      u32 sre, gva_t eaddr,
+				      bool primary)
+{
+	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+	u32 page, hash, pteg, htabmask;
+	hva_t r;
+
+	page = (eaddr & 0x0FFFFFFF) >> 12;
+	htabmask = ((vcpu_book3s->sdr1 & 0x1FF) << 16) | 0xFFC0;
+
+	hash = ((sr_vsid(sre) ^ page) << 6);
+	if (!primary)
+		hash = ~hash;
+	hash &= htabmask;
+
+	pteg = (vcpu_book3s->sdr1 & 0xffff0000) | hash;
+
+	dprintk("MMU: pc=0x%lx eaddr=0x%lx sdr1=0x%llx pteg=0x%x vsid=0x%x\n",
+		kvmppc_get_pc(vcpu), eaddr, vcpu_book3s->sdr1, pteg,
+		sr_vsid(sre));
+
+	r = gfn_to_hva(vcpu->kvm, pteg >> PAGE_SHIFT);
+	if (kvm_is_error_hva(r))
+		return r;
+	return r | (pteg & ~PAGE_MASK);
+}
+
+static u32 kvmppc_mmu_book3s_32_get_ptem(u32 sre, gva_t eaddr, bool primary)
+{
+	return ((eaddr & 0x0fffffff) >> 22) | (sr_vsid(sre) << 7) |
+	       (primary ? 0 : 0x40) | 0x80000000;
+}
+
+static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr,
+					  struct kvmppc_pte *pte, bool data,
+					  bool iswrite)
+{
+	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+	struct kvmppc_bat *bat;
+	int i;
+
+	for (i = 0; i < 8; i++) {
+		if (data)
+			bat = &vcpu_book3s->dbat[i];
+		else
+			bat = &vcpu_book3s->ibat[i];
+
+		if (kvmppc_get_msr(vcpu) & MSR_PR) {
+			if (!bat->vp)
+				continue;
+		} else {
+			if (!bat->vs)
+				continue;
+		}
+
+		if (check_debug_ip(vcpu))
+		{
+			dprintk_pte("%cBAT %02d: 0x%lx - 0x%x (0x%x)\n",
+				    data ? 'd' : 'i', i, eaddr, bat->bepi,
+				    bat->bepi_mask);
+		}
+		if ((eaddr & bat->bepi_mask) == bat->bepi) {
+			u64 vsid;
+			kvmppc_mmu_book3s_32_esid_to_vsid(vcpu,
+				eaddr >> SID_SHIFT, &vsid);
+			vsid <<= 16;
+			pte->vpage = (((u64)eaddr >> 12) & 0xffff) | vsid;
+
+			pte->raddr = bat->brpn | (eaddr & ~bat->bepi_mask);
+			pte->may_read = bat->pp;
+			pte->may_write = bat->pp > 1;
+			pte->may_execute = true;
+			if (!pte->may_read) {
+				printk(KERN_INFO "BAT is not readable!\n");
+				continue;
+			}
+			if (iswrite && !pte->may_write) {
+				dprintk_pte("BAT is read-only!\n");
+				continue;
+			}
+
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
+				     struct kvmppc_pte *pte, bool data,
+				     bool iswrite, bool primary)
+{
+	u32 sre;
+	hva_t ptegp;
+	u32 pteg[16];
+	u32 pte0, pte1;
+	u32 ptem = 0;
+	int i;
+	int found = 0;
+
+	sre = find_sr(vcpu, eaddr);
+
+	dprintk_pte("SR 0x%lx: vsid=0x%x, raw=0x%x\n", eaddr >> 28,
+		    sr_vsid(sre), sre);
+
+	pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data);
+
+	ptegp = kvmppc_mmu_book3s_32_get_pteg(vcpu, sre, eaddr, primary);
+	if (kvm_is_error_hva(ptegp)) {
+		printk(KERN_INFO "KVM: Invalid PTEG!\n");
+		goto no_page_found;
+	}
+
+	ptem = kvmppc_mmu_book3s_32_get_ptem(sre, eaddr, primary);
+
+	if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) {
+		printk(KERN_ERR "KVM: Can't copy data from 0x%lx!\n", ptegp);
+		goto no_page_found;
+	}
+
+	for (i=0; i<16; i+=2) {
+		pte0 = be32_to_cpu(pteg[i]);
+		pte1 = be32_to_cpu(pteg[i + 1]);
+		if (ptem == pte0) {
+			u8 pp;
+
+			pte->raddr = (pte1 & ~(0xFFFULL)) | (eaddr & 0xFFF);
+			pp = pte1 & 3;
+
+			if ((sr_kp(sre) &&  (kvmppc_get_msr(vcpu) & MSR_PR)) ||
+			    (sr_ks(sre) && !(kvmppc_get_msr(vcpu) & MSR_PR)))
+				pp |= 4;
+
+			pte->may_write = false;
+			pte->may_read = false;
+			pte->may_execute = true;
+			switch (pp) {
+				case 0:
+				case 1:
+				case 2:
+				case 6:
+					pte->may_write = true;
+				case 3:
+				case 5:
+				case 7:
+					pte->may_read = true;
+					break;
+			}
+
+			dprintk_pte("MMU: Found PTE -> %x %x - %x\n",
+				    pte0, pte1, pp);
+			found = 1;
+			break;
+		}
+	}
+
+	/* Update PTE C and A bits, so the guest's swapper knows we used the
+	   page */
+	if (found) {
+		u32 pte_r = pte1;
+		char __user *addr = (char __user *) (ptegp + (i+1) * sizeof(u32));
+
+		/*
+		 * Use single-byte writes to update the HPTE, to
+		 * conform to what real hardware does.
+		 */
+		if (pte->may_read && !(pte_r & PTEG_FLAG_ACCESSED)) {
+			pte_r |= PTEG_FLAG_ACCESSED;
+			put_user(pte_r >> 8, addr + 2);
+		}
+		if (iswrite && pte->may_write && !(pte_r & PTEG_FLAG_DIRTY)) {
+			pte_r |= PTEG_FLAG_DIRTY;
+			put_user(pte_r, addr + 3);
+		}
+		if (!pte->may_read || (iswrite && !pte->may_write))
+			return -EPERM;
+		return 0;
+	}
+
+no_page_found:
+
+	if (check_debug_ip(vcpu)) {
+		dprintk_pte("KVM MMU: No PTE found (sdr1=0x%llx ptegp=0x%lx)\n",
+			    to_book3s(vcpu)->sdr1, ptegp);
+		for (i=0; i<16; i+=2) {
+			dprintk_pte("   %02d: 0x%x - 0x%x (0x%x)\n",
+				    i, be32_to_cpu(pteg[i]),
+				    be32_to_cpu(pteg[i+1]), ptem);
+		}
+	}
+
+	return -ENOENT;
+}
+
+static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+				      struct kvmppc_pte *pte, bool data,
+				      bool iswrite)
+{
+	int r;
+	ulong mp_ea = vcpu->arch.magic_page_ea;
+
+	pte->eaddr = eaddr;
+	pte->page_size = MMU_PAGE_4K;
+
+	/* Magic page override */
+	if (unlikely(mp_ea) &&
+	    unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) &&
+	    !(kvmppc_get_msr(vcpu) & MSR_PR)) {
+		pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data);
+		pte->raddr = vcpu->arch.magic_page_pa | (pte->raddr & 0xfff);
+		pte->raddr &= KVM_PAM;
+		pte->may_execute = true;
+		pte->may_read = true;
+		pte->may_write = true;
+
+		return 0;
+	}
+
+	r = kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, pte, data, iswrite);
+	if (r < 0)
+		r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte,
+						   data, iswrite, true);
+	if (r == -ENOENT)
+		r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte,
+						   data, iswrite, false);
+
+	return r;
+}
+
+
+static u32 kvmppc_mmu_book3s_32_mfsrin(struct kvm_vcpu *vcpu, u32 srnum)
+{
+	return kvmppc_get_sr(vcpu, srnum);
+}
+
+static void kvmppc_mmu_book3s_32_mtsrin(struct kvm_vcpu *vcpu, u32 srnum,
+					ulong value)
+{
+	kvmppc_set_sr(vcpu, srnum, value);
+	kvmppc_mmu_map_segment(vcpu, srnum << SID_SHIFT);
+}
+
+static void kvmppc_mmu_book3s_32_tlbie(struct kvm_vcpu *vcpu, ulong ea, bool large)
+{
+	int i;
+	struct kvm_vcpu *v;
+
+	/* flush this VA on all cpus */
+	kvm_for_each_vcpu(i, v, vcpu->kvm)
+		kvmppc_mmu_pte_flush(v, ea, 0x0FFFF000);
+}
+
+static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
+					     u64 *vsid)
+{
+	ulong ea = esid << SID_SHIFT;
+	u32 sr;
+	u64 gvsid = esid;
+	u64 msr = kvmppc_get_msr(vcpu);
+
+	if (msr & (MSR_DR|MSR_IR)) {
+		sr = find_sr(vcpu, ea);
+		if (sr_valid(sr))
+			gvsid = sr_vsid(sr);
+	}
+
+	/* In case we only have one of MSR_IR or MSR_DR set, let's put
+	   that in the real-mode context (and hope RM doesn't access
+	   high memory) */
+	switch (msr & (MSR_DR|MSR_IR)) {
+	case 0:
+		*vsid = VSID_REAL | esid;
+		break;
+	case MSR_IR:
+		*vsid = VSID_REAL_IR | gvsid;
+		break;
+	case MSR_DR:
+		*vsid = VSID_REAL_DR | gvsid;
+		break;
+	case MSR_DR|MSR_IR:
+		if (sr_valid(sr))
+			*vsid = sr_vsid(sr);
+		else
+			*vsid = VSID_BAT | gvsid;
+		break;
+	default:
+		BUG();
+	}
+
+	if (msr & MSR_PR)
+		*vsid |= VSID_PR;
+
+	return 0;
+}
+
+static bool kvmppc_mmu_book3s_32_is_dcbz32(struct kvm_vcpu *vcpu)
+{
+	return true;
+}
+
+
+void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
+
+	mmu->mtsrin = kvmppc_mmu_book3s_32_mtsrin;
+	mmu->mfsrin = kvmppc_mmu_book3s_32_mfsrin;
+	mmu->xlate = kvmppc_mmu_book3s_32_xlate;
+	mmu->reset_msr = kvmppc_mmu_book3s_32_reset_msr;
+	mmu->tlbie = kvmppc_mmu_book3s_32_tlbie;
+	mmu->esid_to_vsid = kvmppc_mmu_book3s_32_esid_to_vsid;
+	mmu->ea_to_vp = kvmppc_mmu_book3s_32_ea_to_vp;
+	mmu->is_dcbz32 = kvmppc_mmu_book3s_32_is_dcbz32;
+
+	mmu->slbmte = NULL;
+	mmu->slbmfee = NULL;
+	mmu->slbmfev = NULL;
+	mmu->slbie = NULL;
+	mmu->slbia = NULL;
+}
diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
new file mode 100644
index 000000000..2035d16a9
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -0,0 +1,409 @@
+/*
+ * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *     Alexander Graf <agraf@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash32.h>
+#include <asm/machdep.h>
+#include <asm/mmu_context.h>
+#include <asm/hw_irq.h>
+
+/* #define DEBUG_MMU */
+/* #define DEBUG_SR */
+
+#ifdef DEBUG_MMU
+#define dprintk_mmu(a, ...) printk(KERN_INFO a, __VA_ARGS__)
+#else
+#define dprintk_mmu(a, ...) do { } while(0)
+#endif
+
+#ifdef DEBUG_SR
+#define dprintk_sr(a, ...) printk(KERN_INFO a, __VA_ARGS__)
+#else
+#define dprintk_sr(a, ...) do { } while(0)
+#endif
+
+#if PAGE_SHIFT != 12
+#error Unknown page size
+#endif
+
+#ifdef CONFIG_SMP
+#error XXX need to grab mmu_hash_lock
+#endif
+
+#ifdef CONFIG_PTE_64BIT
+#error Only 32 bit pages are supported for now
+#endif
+
+static ulong htab;
+static u32 htabmask;
+
+void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
+{
+	volatile u32 *pteg;
+
+	/* Remove from host HTAB */
+	pteg = (u32*)pte->slot;
+	pteg[0] = 0;
+
+	/* And make sure it's gone from the TLB too */
+	asm volatile ("sync");
+	asm volatile ("tlbie %0" : : "r" (pte->pte.eaddr) : "memory");
+	asm volatile ("sync");
+	asm volatile ("tlbsync");
+}
+
+/* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using
+ * a hash, so we don't waste cycles on looping */
+static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
+{
+	return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK));
+}
+
+
+static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
+{
+	struct kvmppc_sid_map *map;
+	u16 sid_map_mask;
+
+	if (kvmppc_get_msr(vcpu) & MSR_PR)
+		gvsid |= VSID_PR;
+
+	sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
+	map = &to_book3s(vcpu)->sid_map[sid_map_mask];
+	if (map->guest_vsid == gvsid) {
+		dprintk_sr("SR: Searching 0x%llx -> 0x%llx\n",
+			    gvsid, map->host_vsid);
+		return map;
+	}
+
+	map = &to_book3s(vcpu)->sid_map[SID_MAP_MASK - sid_map_mask];
+	if (map->guest_vsid == gvsid) {
+		dprintk_sr("SR: Searching 0x%llx -> 0x%llx\n",
+			    gvsid, map->host_vsid);
+		return map;
+	}
+
+	dprintk_sr("SR: Searching 0x%llx -> not found\n", gvsid);
+	return NULL;
+}
+
+static u32 *kvmppc_mmu_get_pteg(struct kvm_vcpu *vcpu, u32 vsid, u32 eaddr,
+				bool primary)
+{
+	u32 page, hash;
+	ulong pteg = htab;
+
+	page = (eaddr & ~ESID_MASK) >> 12;
+
+	hash = ((vsid ^ page) << 6);
+	if (!primary)
+		hash = ~hash;
+
+	hash &= htabmask;
+
+	pteg |= hash;
+
+	dprintk_mmu("htab: %lx | hash: %x | htabmask: %x | pteg: %lx\n",
+		htab, hash, htabmask, pteg);
+
+	return (u32*)pteg;
+}
+
+extern char etext[];
+
+int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte,
+			bool iswrite)
+{
+	pfn_t hpaddr;
+	u64 vpn;
+	u64 vsid;
+	struct kvmppc_sid_map *map;
+	volatile u32 *pteg;
+	u32 eaddr = orig_pte->eaddr;
+	u32 pteg0, pteg1;
+	register int rr = 0;
+	bool primary = false;
+	bool evict = false;
+	struct hpte_cache *pte;
+	int r = 0;
+	bool writable;
+
+	/* Get host physical address for gpa */
+	hpaddr = kvmppc_gpa_to_pfn(vcpu, orig_pte->raddr, iswrite, &writable);
+	if (is_error_noslot_pfn(hpaddr)) {
+		printk(KERN_INFO "Couldn't get guest page for gpa %lx!\n",
+				 orig_pte->raddr);
+		r = -EINVAL;
+		goto out;
+	}
+	hpaddr <<= PAGE_SHIFT;
+
+	/* and write the mapping ea -> hpa into the pt */
+	vcpu->arch.mmu.esid_to_vsid(vcpu, orig_pte->eaddr >> SID_SHIFT, &vsid);
+	map = find_sid_vsid(vcpu, vsid);
+	if (!map) {
+		kvmppc_mmu_map_segment(vcpu, eaddr);
+		map = find_sid_vsid(vcpu, vsid);
+	}
+	BUG_ON(!map);
+
+	vsid = map->host_vsid;
+	vpn = (vsid << (SID_SHIFT - VPN_SHIFT)) |
+		((eaddr & ~ESID_MASK) >> VPN_SHIFT);
+next_pteg:
+	if (rr == 16) {
+		primary = !primary;
+		evict = true;
+		rr = 0;
+	}
+
+	pteg = kvmppc_mmu_get_pteg(vcpu, vsid, eaddr, primary);
+
+	/* not evicting yet */
+	if (!evict && (pteg[rr] & PTE_V)) {
+		rr += 2;
+		goto next_pteg;
+	}
+
+	dprintk_mmu("KVM: old PTEG: %p (%d)\n", pteg, rr);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[0], pteg[1]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[2], pteg[3]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[4], pteg[5]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[6], pteg[7]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[8], pteg[9]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[10], pteg[11]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[12], pteg[13]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[14], pteg[15]);
+
+	pteg0 = ((eaddr & 0x0fffffff) >> 22) | (vsid << 7) | PTE_V |
+		(primary ? 0 : PTE_SEC);
+	pteg1 = hpaddr | PTE_M | PTE_R | PTE_C;
+
+	if (orig_pte->may_write && writable) {
+		pteg1 |= PP_RWRW;
+		mark_page_dirty(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT);
+	} else {
+		pteg1 |= PP_RWRX;
+	}
+
+	if (orig_pte->may_execute)
+		kvmppc_mmu_flush_icache(hpaddr >> PAGE_SHIFT);
+
+	local_irq_disable();
+
+	if (pteg[rr]) {
+		pteg[rr] = 0;
+		asm volatile ("sync");
+	}
+	pteg[rr + 1] = pteg1;
+	pteg[rr] = pteg0;
+	asm volatile ("sync");
+
+	local_irq_enable();
+
+	dprintk_mmu("KVM: new PTEG: %p\n", pteg);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[0], pteg[1]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[2], pteg[3]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[4], pteg[5]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[6], pteg[7]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[8], pteg[9]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[10], pteg[11]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[12], pteg[13]);
+	dprintk_mmu("KVM:   %08x - %08x\n", pteg[14], pteg[15]);
+
+
+	/* Now tell our Shadow PTE code about the new page */
+
+	pte = kvmppc_mmu_hpte_cache_next(vcpu);
+	if (!pte) {
+		kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT);
+		r = -EAGAIN;
+		goto out;
+	}
+
+	dprintk_mmu("KVM: %c%c Map 0x%llx: [%lx] 0x%llx (0x%llx) -> %lx\n",
+		    orig_pte->may_write ? 'w' : '-',
+		    orig_pte->may_execute ? 'x' : '-',
+		    orig_pte->eaddr, (ulong)pteg, vpn,
+		    orig_pte->vpage, hpaddr);
+
+	pte->slot = (ulong)&pteg[rr];
+	pte->host_vpn = vpn;
+	pte->pte = *orig_pte;
+	pte->pfn = hpaddr >> PAGE_SHIFT;
+
+	kvmppc_mmu_hpte_cache_map(vcpu, pte);
+
+	kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT);
+out:
+	return r;
+}
+
+void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
+{
+	kvmppc_mmu_pte_vflush(vcpu, pte->vpage, 0xfffffffffULL);
+}
+
+static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
+{
+	struct kvmppc_sid_map *map;
+	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+	u16 sid_map_mask;
+	static int backwards_map = 0;
+
+	if (kvmppc_get_msr(vcpu) & MSR_PR)
+		gvsid |= VSID_PR;
+
+	/* We might get collisions that trap in preceding order, so let's
+	   map them differently */
+
+	sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
+	if (backwards_map)
+		sid_map_mask = SID_MAP_MASK - sid_map_mask;
+
+	map = &to_book3s(vcpu)->sid_map[sid_map_mask];
+
+	/* Make sure we're taking the other map next time */
+	backwards_map = !backwards_map;
+
+	/* Uh-oh ... out of mappings. Let's flush! */
+	if (vcpu_book3s->vsid_next >= VSID_POOL_SIZE) {
+		vcpu_book3s->vsid_next = 0;
+		memset(vcpu_book3s->sid_map, 0,
+		       sizeof(struct kvmppc_sid_map) * SID_MAP_NUM);
+		kvmppc_mmu_pte_flush(vcpu, 0, 0);
+		kvmppc_mmu_flush_segments(vcpu);
+	}
+	map->host_vsid = vcpu_book3s->vsid_pool[vcpu_book3s->vsid_next];
+	vcpu_book3s->vsid_next++;
+
+	map->guest_vsid = gvsid;
+	map->valid = true;
+
+	return map;
+}
+
+int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)
+{
+	u32 esid = eaddr >> SID_SHIFT;
+	u64 gvsid;
+	u32 sr;
+	struct kvmppc_sid_map *map;
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	int r = 0;
+
+	if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) {
+		/* Invalidate an entry */
+		svcpu->sr[esid] = SR_INVALID;
+		r = -ENOENT;
+		goto out;
+	}
+
+	map = find_sid_vsid(vcpu, gvsid);
+	if (!map)
+		map = create_sid_map(vcpu, gvsid);
+
+	map->guest_esid = esid;
+	sr = map->host_vsid | SR_KP;
+	svcpu->sr[esid] = sr;
+
+	dprintk_sr("MMU: mtsr %d, 0x%x\n", esid, sr);
+
+out:
+	svcpu_put(svcpu);
+	return r;
+}
+
+void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
+{
+	int i;
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+
+	dprintk_sr("MMU: flushing all segments (%d)\n", ARRAY_SIZE(svcpu->sr));
+	for (i = 0; i < ARRAY_SIZE(svcpu->sr); i++)
+		svcpu->sr[i] = SR_INVALID;
+
+	svcpu_put(svcpu);
+}
+
+void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu)
+{
+	int i;
+
+	kvmppc_mmu_hpte_destroy(vcpu);
+	preempt_disable();
+	for (i = 0; i < SID_CONTEXTS; i++)
+		__destroy_context(to_book3s(vcpu)->context_id[i]);
+	preempt_enable();
+}
+
+/* From mm/mmu_context_hash32.c */
+#define CTX_TO_VSID(c, id)	((((c) * (897 * 16)) + (id * 0x111)) & 0xffffff)
+
+int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	int err;
+	ulong sdr1;
+	int i;
+	int j;
+
+	for (i = 0; i < SID_CONTEXTS; i++) {
+		err = __init_new_context();
+		if (err < 0)
+			goto init_fail;
+		vcpu3s->context_id[i] = err;
+
+		/* Remember context id for this combination */
+		for (j = 0; j < 16; j++)
+			vcpu3s->vsid_pool[(i * 16) + j] = CTX_TO_VSID(err, j);
+	}
+
+	vcpu3s->vsid_next = 0;
+
+	/* Remember where the HTAB is */
+	asm ( "mfsdr1 %0" : "=r"(sdr1) );
+	htabmask = ((sdr1 & 0x1FF) << 16) | 0xFFC0;
+	htab = (ulong)__va(sdr1 & 0xffff0000);
+
+	kvmppc_mmu_hpte_init(vcpu);
+
+	return 0;
+
+init_fail:
+	for (j = 0; j < i; j++) {
+		if (!vcpu3s->context_id[j])
+			continue;
+
+		__destroy_context(to_book3s(vcpu)->context_id[j]);
+	}
+
+	return -1;
+}
diff --git a/arch/powerpc/kvm/book3s_32_sr.S b/arch/powerpc/kvm/book3s_32_sr.S
new file mode 100644
index 000000000..7e06a6fc8
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_32_sr.S
@@ -0,0 +1,143 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+/******************************************************************************
+ *                                                                            *
+ *                               Entry code                                   *
+ *                                                                            *
+ *****************************************************************************/
+
+.macro LOAD_GUEST_SEGMENTS
+
+	/* Required state:
+	 *
+	 * MSR = ~IR|DR
+	 * R1 = host R1
+	 * R2 = host R2
+	 * R3 = shadow vcpu
+	 * all other volatile GPRS = free except R4, R6
+	 * SVCPU[CR]  = guest CR
+	 * SVCPU[XER] = guest XER
+	 * SVCPU[CTR] = guest CTR
+	 * SVCPU[LR]  = guest LR
+	 */
+
+#define XCHG_SR(n)	lwz	r9, (SVCPU_SR+(n*4))(r3);  \
+			mtsr	n, r9
+
+	XCHG_SR(0)
+	XCHG_SR(1)
+	XCHG_SR(2)
+	XCHG_SR(3)
+	XCHG_SR(4)
+	XCHG_SR(5)
+	XCHG_SR(6)
+	XCHG_SR(7)
+	XCHG_SR(8)
+	XCHG_SR(9)
+	XCHG_SR(10)
+	XCHG_SR(11)
+	XCHG_SR(12)
+	XCHG_SR(13)
+	XCHG_SR(14)
+	XCHG_SR(15)
+
+	/* Clear BATs. */
+
+#define KVM_KILL_BAT(n, reg)		\
+        mtspr   SPRN_IBAT##n##U,reg;	\
+        mtspr   SPRN_IBAT##n##L,reg;	\
+        mtspr   SPRN_DBAT##n##U,reg;	\
+        mtspr   SPRN_DBAT##n##L,reg;	\
+
+        li	r9, 0
+	KVM_KILL_BAT(0, r9)
+	KVM_KILL_BAT(1, r9)
+	KVM_KILL_BAT(2, r9)
+	KVM_KILL_BAT(3, r9)
+
+.endm
+
+/******************************************************************************
+ *                                                                            *
+ *                               Exit code                                    *
+ *                                                                            *
+ *****************************************************************************/
+
+.macro LOAD_HOST_SEGMENTS
+
+	/* Register usage at this point:
+	 *
+	 * R1         = host R1
+	 * R2         = host R2
+	 * R12        = exit handler id
+	 * R13        = shadow vcpu - SHADOW_VCPU_OFF
+	 * SVCPU.*    = guest *
+	 * SVCPU[CR]  = guest CR
+	 * SVCPU[XER] = guest XER
+	 * SVCPU[CTR] = guest CTR
+	 * SVCPU[LR]  = guest LR
+	 *
+	 */
+
+	/* Restore BATs */
+
+	/* We only overwrite the upper part, so we only restoree
+	   the upper part. */
+#define KVM_LOAD_BAT(n, reg, RA, RB)	\
+	lwz	RA,(n*16)+0(reg);	\
+	lwz	RB,(n*16)+4(reg);	\
+	mtspr	SPRN_IBAT##n##U,RA;	\
+	mtspr	SPRN_IBAT##n##L,RB;	\
+	lwz	RA,(n*16)+8(reg);	\
+	lwz	RB,(n*16)+12(reg);	\
+	mtspr	SPRN_DBAT##n##U,RA;	\
+	mtspr	SPRN_DBAT##n##L,RB;	\
+
+	lis     r9, BATS@ha
+	addi    r9, r9, BATS@l
+	tophys(r9, r9)
+	KVM_LOAD_BAT(0, r9, r10, r11)
+	KVM_LOAD_BAT(1, r9, r10, r11)
+	KVM_LOAD_BAT(2, r9, r10, r11)
+	KVM_LOAD_BAT(3, r9, r10, r11)
+
+	/* Restore Segment Registers */
+
+	/* 0xc - 0xf */
+
+        li      r0, 4
+        mtctr   r0
+	LOAD_REG_IMMEDIATE(r3, 0x20000000 | (0x111 * 0xc))
+        lis     r4, 0xc000
+3:      mtsrin  r3, r4
+        addi    r3, r3, 0x111     /* increment VSID */
+        addis   r4, r4, 0x1000    /* address of next segment */
+        bdnz    3b
+
+	/* 0x0 - 0xb */
+
+	/* 'current->mm' needs to be in r4 */
+	tophys(r4, r2)
+	lwz	r4, MM(r4)
+	tophys(r4, r4)
+	/* This only clobbers r0, r3, r4 and r5 */
+	bl	switch_mmu_context
+
+.endm
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
new file mode 100644
index 000000000..774a253ca
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -0,0 +1,675 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+
+/* #define DEBUG_MMU */
+
+#ifdef DEBUG_MMU
+#define dprintk(X...) printk(KERN_INFO X)
+#else
+#define dprintk(X...) do { } while(0)
+#endif
+
+static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu)
+{
+	kvmppc_set_msr(vcpu, vcpu->arch.intr_msr);
+}
+
+static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe(
+				struct kvm_vcpu *vcpu,
+				gva_t eaddr)
+{
+	int i;
+	u64 esid = GET_ESID(eaddr);
+	u64 esid_1t = GET_ESID_1T(eaddr);
+
+	for (i = 0; i < vcpu->arch.slb_nr; i++) {
+		u64 cmp_esid = esid;
+
+		if (!vcpu->arch.slb[i].valid)
+			continue;
+
+		if (vcpu->arch.slb[i].tb)
+			cmp_esid = esid_1t;
+
+		if (vcpu->arch.slb[i].esid == cmp_esid)
+			return &vcpu->arch.slb[i];
+	}
+
+	dprintk("KVM: No SLB entry found for 0x%lx [%llx | %llx]\n",
+		eaddr, esid, esid_1t);
+	for (i = 0; i < vcpu->arch.slb_nr; i++) {
+	    if (vcpu->arch.slb[i].vsid)
+		dprintk("  %d: %c%c%c %llx %llx\n", i,
+			vcpu->arch.slb[i].valid ? 'v' : ' ',
+			vcpu->arch.slb[i].large ? 'l' : ' ',
+			vcpu->arch.slb[i].tb    ? 't' : ' ',
+			vcpu->arch.slb[i].esid,
+			vcpu->arch.slb[i].vsid);
+	}
+
+	return NULL;
+}
+
+static int kvmppc_slb_sid_shift(struct kvmppc_slb *slbe)
+{
+	return slbe->tb ? SID_SHIFT_1T : SID_SHIFT;
+}
+
+static u64 kvmppc_slb_offset_mask(struct kvmppc_slb *slbe)
+{
+	return (1ul << kvmppc_slb_sid_shift(slbe)) - 1;
+}
+
+static u64 kvmppc_slb_calc_vpn(struct kvmppc_slb *slb, gva_t eaddr)
+{
+	eaddr &= kvmppc_slb_offset_mask(slb);
+
+	return (eaddr >> VPN_SHIFT) |
+		((slb->vsid) << (kvmppc_slb_sid_shift(slb) - VPN_SHIFT));
+}
+
+static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
+					 bool data)
+{
+	struct kvmppc_slb *slb;
+
+	slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr);
+	if (!slb)
+		return 0;
+
+	return kvmppc_slb_calc_vpn(slb, eaddr);
+}
+
+static int mmu_pagesize(int mmu_pg)
+{
+	switch (mmu_pg) {
+	case MMU_PAGE_64K:
+		return 16;
+	case MMU_PAGE_16M:
+		return 24;
+	}
+	return 12;
+}
+
+static int kvmppc_mmu_book3s_64_get_pagesize(struct kvmppc_slb *slbe)
+{
+	return mmu_pagesize(slbe->base_page_size);
+}
+
+static u32 kvmppc_mmu_book3s_64_get_page(struct kvmppc_slb *slbe, gva_t eaddr)
+{
+	int p = kvmppc_mmu_book3s_64_get_pagesize(slbe);
+
+	return ((eaddr & kvmppc_slb_offset_mask(slbe)) >> p);
+}
+
+static hva_t kvmppc_mmu_book3s_64_get_pteg(struct kvm_vcpu *vcpu,
+				struct kvmppc_slb *slbe, gva_t eaddr,
+				bool second)
+{
+	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+	u64 hash, pteg, htabsize;
+	u32 ssize;
+	hva_t r;
+	u64 vpn;
+
+	htabsize = ((1 << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1);
+
+	vpn = kvmppc_slb_calc_vpn(slbe, eaddr);
+	ssize = slbe->tb ? MMU_SEGSIZE_1T : MMU_SEGSIZE_256M;
+	hash = hpt_hash(vpn, kvmppc_mmu_book3s_64_get_pagesize(slbe), ssize);
+	if (second)
+		hash = ~hash;
+	hash &= ((1ULL << 39ULL) - 1ULL);
+	hash &= htabsize;
+	hash <<= 7ULL;
+
+	pteg = vcpu_book3s->sdr1 & 0xfffffffffffc0000ULL;
+	pteg |= hash;
+
+	dprintk("MMU: page=0x%x sdr1=0x%llx pteg=0x%llx vsid=0x%llx\n",
+		page, vcpu_book3s->sdr1, pteg, slbe->vsid);
+
+	/* When running a PAPR guest, SDR1 contains a HVA address instead
+           of a GPA */
+	if (vcpu->arch.papr_enabled)
+		r = pteg;
+	else
+		r = gfn_to_hva(vcpu->kvm, pteg >> PAGE_SHIFT);
+
+	if (kvm_is_error_hva(r))
+		return r;
+	return r | (pteg & ~PAGE_MASK);
+}
+
+static u64 kvmppc_mmu_book3s_64_get_avpn(struct kvmppc_slb *slbe, gva_t eaddr)
+{
+	int p = kvmppc_mmu_book3s_64_get_pagesize(slbe);
+	u64 avpn;
+
+	avpn = kvmppc_mmu_book3s_64_get_page(slbe, eaddr);
+	avpn |= slbe->vsid << (kvmppc_slb_sid_shift(slbe) - p);
+
+	if (p < 16)
+		avpn >>= ((80 - p) - 56) - 8;	/* 16 - p */
+	else
+		avpn <<= p - 16;
+
+	return avpn;
+}
+
+/*
+ * Return page size encoded in the second word of a HPTE, or
+ * -1 for an invalid encoding for the base page size indicated by
+ * the SLB entry.  This doesn't handle mixed pagesize segments yet.
+ */
+static int decode_pagesize(struct kvmppc_slb *slbe, u64 r)
+{
+	switch (slbe->base_page_size) {
+	case MMU_PAGE_64K:
+		if ((r & 0xf000) == 0x1000)
+			return MMU_PAGE_64K;
+		break;
+	case MMU_PAGE_16M:
+		if ((r & 0xff000) == 0)
+			return MMU_PAGE_16M;
+		break;
+	}
+	return -1;
+}
+
+static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+				      struct kvmppc_pte *gpte, bool data,
+				      bool iswrite)
+{
+	struct kvmppc_slb *slbe;
+	hva_t ptegp;
+	u64 pteg[16];
+	u64 avpn = 0;
+	u64 v, r;
+	u64 v_val, v_mask;
+	u64 eaddr_mask;
+	int i;
+	u8 pp, key = 0;
+	bool found = false;
+	bool second = false;
+	int pgsize;
+	ulong mp_ea = vcpu->arch.magic_page_ea;
+
+	/* Magic page override */
+	if (unlikely(mp_ea) &&
+	    unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) &&
+	    !(kvmppc_get_msr(vcpu) & MSR_PR)) {
+		gpte->eaddr = eaddr;
+		gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data);
+		gpte->raddr = vcpu->arch.magic_page_pa | (gpte->raddr & 0xfff);
+		gpte->raddr &= KVM_PAM;
+		gpte->may_execute = true;
+		gpte->may_read = true;
+		gpte->may_write = true;
+		gpte->page_size = MMU_PAGE_4K;
+
+		return 0;
+	}
+
+	slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr);
+	if (!slbe)
+		goto no_seg_found;
+
+	avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr);
+	v_val = avpn & HPTE_V_AVPN;
+
+	if (slbe->tb)
+		v_val |= SLB_VSID_B_1T;
+	if (slbe->large)
+		v_val |= HPTE_V_LARGE;
+	v_val |= HPTE_V_VALID;
+
+	v_mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_LARGE | HPTE_V_VALID |
+		HPTE_V_SECONDARY;
+
+	pgsize = slbe->large ? MMU_PAGE_16M : MMU_PAGE_4K;
+
+	mutex_lock(&vcpu->kvm->arch.hpt_mutex);
+
+do_second:
+	ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu, slbe, eaddr, second);
+	if (kvm_is_error_hva(ptegp))
+		goto no_page_found;
+
+	if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) {
+		printk(KERN_ERR "KVM can't copy data from 0x%lx!\n", ptegp);
+		goto no_page_found;
+	}
+
+	if ((kvmppc_get_msr(vcpu) & MSR_PR) && slbe->Kp)
+		key = 4;
+	else if (!(kvmppc_get_msr(vcpu) & MSR_PR) && slbe->Ks)
+		key = 4;
+
+	for (i=0; i<16; i+=2) {
+		u64 pte0 = be64_to_cpu(pteg[i]);
+		u64 pte1 = be64_to_cpu(pteg[i + 1]);
+
+		/* Check all relevant fields of 1st dword */
+		if ((pte0 & v_mask) == v_val) {
+			/* If large page bit is set, check pgsize encoding */
+			if (slbe->large &&
+			    (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE)) {
+				pgsize = decode_pagesize(slbe, pte1);
+				if (pgsize < 0)
+					continue;
+			}
+			found = true;
+			break;
+		}
+	}
+
+	if (!found) {
+		if (second)
+			goto no_page_found;
+		v_val |= HPTE_V_SECONDARY;
+		second = true;
+		goto do_second;
+	}
+
+	v = be64_to_cpu(pteg[i]);
+	r = be64_to_cpu(pteg[i+1]);
+	pp = (r & HPTE_R_PP) | key;
+	if (r & HPTE_R_PP0)
+		pp |= 8;
+
+	gpte->eaddr = eaddr;
+	gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data);
+
+	eaddr_mask = (1ull << mmu_pagesize(pgsize)) - 1;
+	gpte->raddr = (r & HPTE_R_RPN & ~eaddr_mask) | (eaddr & eaddr_mask);
+	gpte->page_size = pgsize;
+	gpte->may_execute = ((r & HPTE_R_N) ? false : true);
+	if (unlikely(vcpu->arch.disable_kernel_nx) &&
+	    !(kvmppc_get_msr(vcpu) & MSR_PR))
+		gpte->may_execute = true;
+	gpte->may_read = false;
+	gpte->may_write = false;
+
+	switch (pp) {
+	case 0:
+	case 1:
+	case 2:
+	case 6:
+		gpte->may_write = true;
+		/* fall through */
+	case 3:
+	case 5:
+	case 7:
+	case 10:
+		gpte->may_read = true;
+		break;
+	}
+
+	dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx "
+		"-> 0x%lx\n",
+		eaddr, avpn, gpte->vpage, gpte->raddr);
+
+	/* Update PTE R and C bits, so the guest's swapper knows we used the
+	 * page */
+	if (gpte->may_read && !(r & HPTE_R_R)) {
+		/*
+		 * Set the accessed flag.
+		 * We have to write this back with a single byte write
+		 * because another vcpu may be accessing this on
+		 * non-PAPR platforms such as mac99, and this is
+		 * what real hardware does.
+		 */
+                char __user *addr = (char __user *) (ptegp + (i + 1) * sizeof(u64));
+		r |= HPTE_R_R;
+		put_user(r >> 8, addr + 6);
+	}
+	if (iswrite && gpte->may_write && !(r & HPTE_R_C)) {
+		/* Set the dirty flag */
+		/* Use a single byte write */
+                char __user *addr = (char __user *) (ptegp + (i + 1) * sizeof(u64));
+		r |= HPTE_R_C;
+		put_user(r, addr + 7);
+	}
+
+	mutex_unlock(&vcpu->kvm->arch.hpt_mutex);
+
+	if (!gpte->may_read || (iswrite && !gpte->may_write))
+		return -EPERM;
+	return 0;
+
+no_page_found:
+	mutex_unlock(&vcpu->kvm->arch.hpt_mutex);
+	return -ENOENT;
+
+no_seg_found:
+	dprintk("KVM MMU: Trigger segment fault\n");
+	return -EINVAL;
+}
+
+static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb)
+{
+	struct kvmppc_vcpu_book3s *vcpu_book3s;
+	u64 esid, esid_1t;
+	int slb_nr;
+	struct kvmppc_slb *slbe;
+
+	dprintk("KVM MMU: slbmte(0x%llx, 0x%llx)\n", rs, rb);
+
+	vcpu_book3s = to_book3s(vcpu);
+
+	esid = GET_ESID(rb);
+	esid_1t = GET_ESID_1T(rb);
+	slb_nr = rb & 0xfff;
+
+	if (slb_nr > vcpu->arch.slb_nr)
+		return;
+
+	slbe = &vcpu->arch.slb[slb_nr];
+
+	slbe->large = (rs & SLB_VSID_L) ? 1 : 0;
+	slbe->tb    = (rs & SLB_VSID_B_1T) ? 1 : 0;
+	slbe->esid  = slbe->tb ? esid_1t : esid;
+	slbe->vsid  = (rs & ~SLB_VSID_B) >> (kvmppc_slb_sid_shift(slbe) - 16);
+	slbe->valid = (rb & SLB_ESID_V) ? 1 : 0;
+	slbe->Ks    = (rs & SLB_VSID_KS) ? 1 : 0;
+	slbe->Kp    = (rs & SLB_VSID_KP) ? 1 : 0;
+	slbe->nx    = (rs & SLB_VSID_N) ? 1 : 0;
+	slbe->class = (rs & SLB_VSID_C) ? 1 : 0;
+
+	slbe->base_page_size = MMU_PAGE_4K;
+	if (slbe->large) {
+		if (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE) {
+			switch (rs & SLB_VSID_LP) {
+			case SLB_VSID_LP_00:
+				slbe->base_page_size = MMU_PAGE_16M;
+				break;
+			case SLB_VSID_LP_01:
+				slbe->base_page_size = MMU_PAGE_64K;
+				break;
+			}
+		} else
+			slbe->base_page_size = MMU_PAGE_16M;
+	}
+
+	slbe->orige = rb & (ESID_MASK | SLB_ESID_V);
+	slbe->origv = rs;
+
+	/* Map the new segment */
+	kvmppc_mmu_map_segment(vcpu, esid << SID_SHIFT);
+}
+
+static u64 kvmppc_mmu_book3s_64_slbmfee(struct kvm_vcpu *vcpu, u64 slb_nr)
+{
+	struct kvmppc_slb *slbe;
+
+	if (slb_nr > vcpu->arch.slb_nr)
+		return 0;
+
+	slbe = &vcpu->arch.slb[slb_nr];
+
+	return slbe->orige;
+}
+
+static u64 kvmppc_mmu_book3s_64_slbmfev(struct kvm_vcpu *vcpu, u64 slb_nr)
+{
+	struct kvmppc_slb *slbe;
+
+	if (slb_nr > vcpu->arch.slb_nr)
+		return 0;
+
+	slbe = &vcpu->arch.slb[slb_nr];
+
+	return slbe->origv;
+}
+
+static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea)
+{
+	struct kvmppc_slb *slbe;
+	u64 seg_size;
+
+	dprintk("KVM MMU: slbie(0x%llx)\n", ea);
+
+	slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea);
+
+	if (!slbe)
+		return;
+
+	dprintk("KVM MMU: slbie(0x%llx, 0x%llx)\n", ea, slbe->esid);
+
+	slbe->valid = false;
+	slbe->orige = 0;
+	slbe->origv = 0;
+
+	seg_size = 1ull << kvmppc_slb_sid_shift(slbe);
+	kvmppc_mmu_flush_segment(vcpu, ea & ~(seg_size - 1), seg_size);
+}
+
+static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu)
+{
+	int i;
+
+	dprintk("KVM MMU: slbia()\n");
+
+	for (i = 1; i < vcpu->arch.slb_nr; i++) {
+		vcpu->arch.slb[i].valid = false;
+		vcpu->arch.slb[i].orige = 0;
+		vcpu->arch.slb[i].origv = 0;
+	}
+
+	if (kvmppc_get_msr(vcpu) & MSR_IR) {
+		kvmppc_mmu_flush_segments(vcpu);
+		kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
+	}
+}
+
+static void kvmppc_mmu_book3s_64_mtsrin(struct kvm_vcpu *vcpu, u32 srnum,
+					ulong value)
+{
+	u64 rb = 0, rs = 0;
+
+	/*
+	 * According to Book3 2.01 mtsrin is implemented as:
+	 *
+	 * The SLB entry specified by (RB)32:35 is loaded from register
+	 * RS, as follows.
+	 *
+	 * SLBE Bit	Source			SLB Field
+	 *
+	 * 0:31		0x0000_0000		ESID-0:31
+	 * 32:35	(RB)32:35		ESID-32:35
+	 * 36		0b1			V
+	 * 37:61	0x00_0000|| 0b0		VSID-0:24
+	 * 62:88	(RS)37:63		VSID-25:51
+	 * 89:91	(RS)33:35		Ks Kp N
+	 * 92		(RS)36			L ((RS)36 must be 0b0)
+	 * 93		0b0			C
+	 */
+
+	dprintk("KVM MMU: mtsrin(0x%x, 0x%lx)\n", srnum, value);
+
+	/* ESID = srnum */
+	rb |= (srnum & 0xf) << 28;
+	/* Set the valid bit */
+	rb |= 1 << 27;
+	/* Index = ESID */
+	rb |= srnum;
+
+	/* VSID = VSID */
+	rs |= (value & 0xfffffff) << 12;
+	/* flags = flags */
+	rs |= ((value >> 28) & 0x7) << 9;
+
+	kvmppc_mmu_book3s_64_slbmte(vcpu, rs, rb);
+}
+
+static void kvmppc_mmu_book3s_64_tlbie(struct kvm_vcpu *vcpu, ulong va,
+				       bool large)
+{
+	u64 mask = 0xFFFFFFFFFULL;
+	long i;
+	struct kvm_vcpu *v;
+
+	dprintk("KVM MMU: tlbie(0x%lx)\n", va);
+
+	/*
+	 * The tlbie instruction changed behaviour starting with
+	 * POWER6.  POWER6 and later don't have the large page flag
+	 * in the instruction but in the RB value, along with bits
+	 * indicating page and segment sizes.
+	 */
+	if (vcpu->arch.hflags & BOOK3S_HFLAG_NEW_TLBIE) {
+		/* POWER6 or later */
+		if (va & 1) {		/* L bit */
+			if ((va & 0xf000) == 0x1000)
+				mask = 0xFFFFFFFF0ULL;	/* 64k page */
+			else
+				mask = 0xFFFFFF000ULL;	/* 16M page */
+		}
+	} else {
+		/* older processors, e.g. PPC970 */
+		if (large)
+			mask = 0xFFFFFF000ULL;
+	}
+	/* flush this VA on all vcpus */
+	kvm_for_each_vcpu(i, v, vcpu->kvm)
+		kvmppc_mmu_pte_vflush(v, va >> 12, mask);
+}
+
+#ifdef CONFIG_PPC_64K_PAGES
+static int segment_contains_magic_page(struct kvm_vcpu *vcpu, ulong esid)
+{
+	ulong mp_ea = vcpu->arch.magic_page_ea;
+
+	return mp_ea && !(kvmppc_get_msr(vcpu) & MSR_PR) &&
+		(mp_ea >> SID_SHIFT) == esid;
+}
+#endif
+
+static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
+					     u64 *vsid)
+{
+	ulong ea = esid << SID_SHIFT;
+	struct kvmppc_slb *slb;
+	u64 gvsid = esid;
+	ulong mp_ea = vcpu->arch.magic_page_ea;
+	int pagesize = MMU_PAGE_64K;
+	u64 msr = kvmppc_get_msr(vcpu);
+
+	if (msr & (MSR_DR|MSR_IR)) {
+		slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea);
+		if (slb) {
+			gvsid = slb->vsid;
+			pagesize = slb->base_page_size;
+			if (slb->tb) {
+				gvsid <<= SID_SHIFT_1T - SID_SHIFT;
+				gvsid |= esid & ((1ul << (SID_SHIFT_1T - SID_SHIFT)) - 1);
+				gvsid |= VSID_1T;
+			}
+		}
+	}
+
+	switch (msr & (MSR_DR|MSR_IR)) {
+	case 0:
+		gvsid = VSID_REAL | esid;
+		break;
+	case MSR_IR:
+		gvsid |= VSID_REAL_IR;
+		break;
+	case MSR_DR:
+		gvsid |= VSID_REAL_DR;
+		break;
+	case MSR_DR|MSR_IR:
+		if (!slb)
+			goto no_slb;
+
+		break;
+	default:
+		BUG();
+		break;
+	}
+
+#ifdef CONFIG_PPC_64K_PAGES
+	/*
+	 * Mark this as a 64k segment if the host is using
+	 * 64k pages, the host MMU supports 64k pages and
+	 * the guest segment page size is >= 64k,
+	 * but not if this segment contains the magic page.
+	 */
+	if (pagesize >= MMU_PAGE_64K &&
+	    mmu_psize_defs[MMU_PAGE_64K].shift &&
+	    !segment_contains_magic_page(vcpu, esid))
+		gvsid |= VSID_64K;
+#endif
+
+	if (kvmppc_get_msr(vcpu) & MSR_PR)
+		gvsid |= VSID_PR;
+
+	*vsid = gvsid;
+	return 0;
+
+no_slb:
+	/* Catch magic page case */
+	if (unlikely(mp_ea) &&
+	    unlikely(esid == (mp_ea >> SID_SHIFT)) &&
+	    !(kvmppc_get_msr(vcpu) & MSR_PR)) {
+		*vsid = VSID_REAL | esid;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+static bool kvmppc_mmu_book3s_64_is_dcbz32(struct kvm_vcpu *vcpu)
+{
+	return (to_book3s(vcpu)->hid[5] & 0x80);
+}
+
+void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
+
+	mmu->mfsrin = NULL;
+	mmu->mtsrin = kvmppc_mmu_book3s_64_mtsrin;
+	mmu->slbmte = kvmppc_mmu_book3s_64_slbmte;
+	mmu->slbmfee = kvmppc_mmu_book3s_64_slbmfee;
+	mmu->slbmfev = kvmppc_mmu_book3s_64_slbmfev;
+	mmu->slbie = kvmppc_mmu_book3s_64_slbie;
+	mmu->slbia = kvmppc_mmu_book3s_64_slbia;
+	mmu->xlate = kvmppc_mmu_book3s_64_xlate;
+	mmu->reset_msr = kvmppc_mmu_book3s_64_reset_msr;
+	mmu->tlbie = kvmppc_mmu_book3s_64_tlbie;
+	mmu->esid_to_vsid = kvmppc_mmu_book3s_64_esid_to_vsid;
+	mmu->ea_to_vp = kvmppc_mmu_book3s_64_ea_to_vp;
+	mmu->is_dcbz32 = kvmppc_mmu_book3s_64_is_dcbz32;
+
+	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
+}
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
new file mode 100644
index 000000000..b982d925c
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -0,0 +1,403 @@
+/*
+ * Copyright (C) 2009 SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *     Alexander Graf <agraf@suse.de>
+ *     Kevin Wolf <mail@kevin-wolf.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/machdep.h>
+#include <asm/mmu_context.h>
+#include <asm/hw_irq.h>
+#include "trace_pr.h"
+
+#define PTE_SIZE 12
+
+void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
+{
+	ppc_md.hpte_invalidate(pte->slot, pte->host_vpn,
+			       pte->pagesize, pte->pagesize, MMU_SEGSIZE_256M,
+			       false);
+}
+
+/* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using
+ * a hash, so we don't waste cycles on looping */
+static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
+{
+	return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^
+		     ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK));
+}
+
+
+static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
+{
+	struct kvmppc_sid_map *map;
+	u16 sid_map_mask;
+
+	if (kvmppc_get_msr(vcpu) & MSR_PR)
+		gvsid |= VSID_PR;
+
+	sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
+	map = &to_book3s(vcpu)->sid_map[sid_map_mask];
+	if (map->valid && (map->guest_vsid == gvsid)) {
+		trace_kvm_book3s_slb_found(gvsid, map->host_vsid);
+		return map;
+	}
+
+	map = &to_book3s(vcpu)->sid_map[SID_MAP_MASK - sid_map_mask];
+	if (map->valid && (map->guest_vsid == gvsid)) {
+		trace_kvm_book3s_slb_found(gvsid, map->host_vsid);
+		return map;
+	}
+
+	trace_kvm_book3s_slb_fail(sid_map_mask, gvsid);
+	return NULL;
+}
+
+int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte,
+			bool iswrite)
+{
+	unsigned long vpn;
+	pfn_t hpaddr;
+	ulong hash, hpteg;
+	u64 vsid;
+	int ret;
+	int rflags = 0x192;
+	int vflags = 0;
+	int attempt = 0;
+	struct kvmppc_sid_map *map;
+	int r = 0;
+	int hpsize = MMU_PAGE_4K;
+	bool writable;
+	unsigned long mmu_seq;
+	struct kvm *kvm = vcpu->kvm;
+	struct hpte_cache *cpte;
+	unsigned long gfn = orig_pte->raddr >> PAGE_SHIFT;
+	unsigned long pfn;
+
+	/* used to check for invalidations in progress */
+	mmu_seq = kvm->mmu_notifier_seq;
+	smp_rmb();
+
+	/* Get host physical address for gpa */
+	pfn = kvmppc_gpa_to_pfn(vcpu, orig_pte->raddr, iswrite, &writable);
+	if (is_error_noslot_pfn(pfn)) {
+		printk(KERN_INFO "Couldn't get guest page for gpa %lx!\n",
+		       orig_pte->raddr);
+		r = -EINVAL;
+		goto out;
+	}
+	hpaddr = pfn << PAGE_SHIFT;
+
+	/* and write the mapping ea -> hpa into the pt */
+	vcpu->arch.mmu.esid_to_vsid(vcpu, orig_pte->eaddr >> SID_SHIFT, &vsid);
+	map = find_sid_vsid(vcpu, vsid);
+	if (!map) {
+		ret = kvmppc_mmu_map_segment(vcpu, orig_pte->eaddr);
+		WARN_ON(ret < 0);
+		map = find_sid_vsid(vcpu, vsid);
+	}
+	if (!map) {
+		printk(KERN_ERR "KVM: Segment map for 0x%llx (0x%lx) failed\n",
+				vsid, orig_pte->eaddr);
+		WARN_ON(true);
+		r = -EINVAL;
+		goto out;
+	}
+
+	vpn = hpt_vpn(orig_pte->eaddr, map->host_vsid, MMU_SEGSIZE_256M);
+
+	kvm_set_pfn_accessed(pfn);
+	if (!orig_pte->may_write || !writable)
+		rflags |= PP_RXRX;
+	else {
+		mark_page_dirty(vcpu->kvm, gfn);
+		kvm_set_pfn_dirty(pfn);
+	}
+
+	if (!orig_pte->may_execute)
+		rflags |= HPTE_R_N;
+	else
+		kvmppc_mmu_flush_icache(pfn);
+
+	/*
+	 * Use 64K pages if possible; otherwise, on 64K page kernels,
+	 * we need to transfer 4 more bits from guest real to host real addr.
+	 */
+	if (vsid & VSID_64K)
+		hpsize = MMU_PAGE_64K;
+	else
+		hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK);
+
+	hash = hpt_hash(vpn, mmu_psize_defs[hpsize].shift, MMU_SEGSIZE_256M);
+
+	cpte = kvmppc_mmu_hpte_cache_next(vcpu);
+
+	spin_lock(&kvm->mmu_lock);
+	if (!cpte || mmu_notifier_retry(kvm, mmu_seq)) {
+		r = -EAGAIN;
+		goto out_unlock;
+	}
+
+map_again:
+	hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
+
+	/* In case we tried normal mapping already, let's nuke old entries */
+	if (attempt > 1)
+		if (ppc_md.hpte_remove(hpteg) < 0) {
+			r = -1;
+			goto out_unlock;
+		}
+
+	ret = ppc_md.hpte_insert(hpteg, vpn, hpaddr, rflags, vflags,
+				 hpsize, hpsize, MMU_SEGSIZE_256M);
+
+	if (ret < 0) {
+		/* If we couldn't map a primary PTE, try a secondary */
+		hash = ~hash;
+		vflags ^= HPTE_V_SECONDARY;
+		attempt++;
+		goto map_again;
+	} else {
+		trace_kvm_book3s_64_mmu_map(rflags, hpteg,
+					    vpn, hpaddr, orig_pte);
+
+		/* The ppc_md code may give us a secondary entry even though we
+		   asked for a primary. Fix up. */
+		if ((ret & _PTEIDX_SECONDARY) && !(vflags & HPTE_V_SECONDARY)) {
+			hash = ~hash;
+			hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
+		}
+
+		cpte->slot = hpteg + (ret & 7);
+		cpte->host_vpn = vpn;
+		cpte->pte = *orig_pte;
+		cpte->pfn = pfn;
+		cpte->pagesize = hpsize;
+
+		kvmppc_mmu_hpte_cache_map(vcpu, cpte);
+		cpte = NULL;
+	}
+
+out_unlock:
+	spin_unlock(&kvm->mmu_lock);
+	kvm_release_pfn_clean(pfn);
+	if (cpte)
+		kvmppc_mmu_hpte_cache_free(cpte);
+
+out:
+	return r;
+}
+
+void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
+{
+	u64 mask = 0xfffffffffULL;
+	u64 vsid;
+
+	vcpu->arch.mmu.esid_to_vsid(vcpu, pte->eaddr >> SID_SHIFT, &vsid);
+	if (vsid & VSID_64K)
+		mask = 0xffffffff0ULL;
+	kvmppc_mmu_pte_vflush(vcpu, pte->vpage, mask);
+}
+
+static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
+{
+	struct kvmppc_sid_map *map;
+	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+	u16 sid_map_mask;
+	static int backwards_map = 0;
+
+	if (kvmppc_get_msr(vcpu) & MSR_PR)
+		gvsid |= VSID_PR;
+
+	/* We might get collisions that trap in preceding order, so let's
+	   map them differently */
+
+	sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
+	if (backwards_map)
+		sid_map_mask = SID_MAP_MASK - sid_map_mask;
+
+	map = &to_book3s(vcpu)->sid_map[sid_map_mask];
+
+	/* Make sure we're taking the other map next time */
+	backwards_map = !backwards_map;
+
+	/* Uh-oh ... out of mappings. Let's flush! */
+	if (vcpu_book3s->proto_vsid_next == vcpu_book3s->proto_vsid_max) {
+		vcpu_book3s->proto_vsid_next = vcpu_book3s->proto_vsid_first;
+		memset(vcpu_book3s->sid_map, 0,
+		       sizeof(struct kvmppc_sid_map) * SID_MAP_NUM);
+		kvmppc_mmu_pte_flush(vcpu, 0, 0);
+		kvmppc_mmu_flush_segments(vcpu);
+	}
+	map->host_vsid = vsid_scramble(vcpu_book3s->proto_vsid_next++, 256M);
+
+	map->guest_vsid = gvsid;
+	map->valid = true;
+
+	trace_kvm_book3s_slb_map(sid_map_mask, gvsid, map->host_vsid);
+
+	return map;
+}
+
+static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid)
+{
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	int i;
+	int max_slb_size = 64;
+	int found_inval = -1;
+	int r;
+
+	/* Are we overwriting? */
+	for (i = 0; i < svcpu->slb_max; i++) {
+		if (!(svcpu->slb[i].esid & SLB_ESID_V))
+			found_inval = i;
+		else if ((svcpu->slb[i].esid & ESID_MASK) == esid) {
+			r = i;
+			goto out;
+		}
+	}
+
+	/* Found a spare entry that was invalidated before */
+	if (found_inval >= 0) {
+		r = found_inval;
+		goto out;
+	}
+
+	/* No spare invalid entry, so create one */
+
+	if (mmu_slb_size < 64)
+		max_slb_size = mmu_slb_size;
+
+	/* Overflowing -> purge */
+	if ((svcpu->slb_max) == max_slb_size)
+		kvmppc_mmu_flush_segments(vcpu);
+
+	r = svcpu->slb_max;
+	svcpu->slb_max++;
+
+out:
+	svcpu_put(svcpu);
+	return r;
+}
+
+int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)
+{
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	u64 esid = eaddr >> SID_SHIFT;
+	u64 slb_esid = (eaddr & ESID_MASK) | SLB_ESID_V;
+	u64 slb_vsid = SLB_VSID_USER;
+	u64 gvsid;
+	int slb_index;
+	struct kvmppc_sid_map *map;
+	int r = 0;
+
+	slb_index = kvmppc_mmu_next_segment(vcpu, eaddr & ESID_MASK);
+
+	if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) {
+		/* Invalidate an entry */
+		svcpu->slb[slb_index].esid = 0;
+		r = -ENOENT;
+		goto out;
+	}
+
+	map = find_sid_vsid(vcpu, gvsid);
+	if (!map)
+		map = create_sid_map(vcpu, gvsid);
+
+	map->guest_esid = esid;
+
+	slb_vsid |= (map->host_vsid << 12);
+	slb_vsid &= ~SLB_VSID_KP;
+	slb_esid |= slb_index;
+
+#ifdef CONFIG_PPC_64K_PAGES
+	/* Set host segment base page size to 64K if possible */
+	if (gvsid & VSID_64K)
+		slb_vsid |= mmu_psize_defs[MMU_PAGE_64K].sllp;
+#endif
+
+	svcpu->slb[slb_index].esid = slb_esid;
+	svcpu->slb[slb_index].vsid = slb_vsid;
+
+	trace_kvm_book3s_slbmte(slb_vsid, slb_esid);
+
+out:
+	svcpu_put(svcpu);
+	return r;
+}
+
+void kvmppc_mmu_flush_segment(struct kvm_vcpu *vcpu, ulong ea, ulong seg_size)
+{
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	ulong seg_mask = -seg_size;
+	int i;
+
+	for (i = 0; i < svcpu->slb_max; i++) {
+		if ((svcpu->slb[i].esid & SLB_ESID_V) &&
+		    (svcpu->slb[i].esid & seg_mask) == ea) {
+			/* Invalidate this entry */
+			svcpu->slb[i].esid = 0;
+		}
+	}
+
+	svcpu_put(svcpu);
+}
+
+void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	svcpu->slb_max = 0;
+	svcpu->slb[0].esid = 0;
+	svcpu_put(svcpu);
+}
+
+void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu)
+{
+	kvmppc_mmu_hpte_destroy(vcpu);
+	__destroy_context(to_book3s(vcpu)->context_id[0]);
+}
+
+int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	int err;
+
+	err = __init_new_context();
+	if (err < 0)
+		return -1;
+	vcpu3s->context_id[0] = err;
+
+	vcpu3s->proto_vsid_max = ((u64)(vcpu3s->context_id[0] + 1)
+				  << ESID_BITS) - 1;
+	vcpu3s->proto_vsid_first = (u64)vcpu3s->context_id[0] << ESID_BITS;
+	vcpu3s->proto_vsid_next = vcpu3s->proto_vsid_first;
+
+	kvmppc_mmu_hpte_init(vcpu);
+
+	return 0;
+}
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
new file mode 100644
index 000000000..1a4acf8bf
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -0,0 +1,1637 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <linux/vmalloc.h>
+#include <linux/srcu.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/debugfs.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+#include <asm/cputable.h>
+
+#include "trace_hv.h"
+
+/* Power architecture requires HPT is at least 256kB */
+#define PPC_MIN_HPT_ORDER	18
+
+static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
+				long pte_index, unsigned long pteh,
+				unsigned long ptel, unsigned long *pte_idx_ret);
+static void kvmppc_rmap_reset(struct kvm *kvm);
+
+long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
+{
+	unsigned long hpt = 0;
+	struct revmap_entry *rev;
+	struct page *page = NULL;
+	long order = KVM_DEFAULT_HPT_ORDER;
+
+	if (htab_orderp) {
+		order = *htab_orderp;
+		if (order < PPC_MIN_HPT_ORDER)
+			order = PPC_MIN_HPT_ORDER;
+	}
+
+	kvm->arch.hpt_cma_alloc = 0;
+	page = kvm_alloc_hpt(1ul << (order - PAGE_SHIFT));
+	if (page) {
+		hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
+		memset((void *)hpt, 0, (1ul << order));
+		kvm->arch.hpt_cma_alloc = 1;
+	}
+
+	/* Lastly try successively smaller sizes from the page allocator */
+	while (!hpt && order > PPC_MIN_HPT_ORDER) {
+		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
+				       __GFP_NOWARN, order - PAGE_SHIFT);
+		if (!hpt)
+			--order;
+	}
+
+	if (!hpt)
+		return -ENOMEM;
+
+	kvm->arch.hpt_virt = hpt;
+	kvm->arch.hpt_order = order;
+	/* HPTEs are 2**4 bytes long */
+	kvm->arch.hpt_npte = 1ul << (order - 4);
+	/* 128 (2**7) bytes in each HPTEG */
+	kvm->arch.hpt_mask = (1ul << (order - 7)) - 1;
+
+	/* Allocate reverse map array */
+	rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte);
+	if (!rev) {
+		pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n");
+		goto out_freehpt;
+	}
+	kvm->arch.revmap = rev;
+	kvm->arch.sdr1 = __pa(hpt) | (order - 18);
+
+	pr_info("KVM guest htab at %lx (order %ld), LPID %x\n",
+		hpt, order, kvm->arch.lpid);
+
+	if (htab_orderp)
+		*htab_orderp = order;
+	return 0;
+
+ out_freehpt:
+	if (kvm->arch.hpt_cma_alloc)
+		kvm_release_hpt(page, 1 << (order - PAGE_SHIFT));
+	else
+		free_pages(hpt, order - PAGE_SHIFT);
+	return -ENOMEM;
+}
+
+long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
+{
+	long err = -EBUSY;
+	long order;
+
+	mutex_lock(&kvm->lock);
+	if (kvm->arch.hpte_setup_done) {
+		kvm->arch.hpte_setup_done = 0;
+		/* order hpte_setup_done vs. vcpus_running */
+		smp_mb();
+		if (atomic_read(&kvm->arch.vcpus_running)) {
+			kvm->arch.hpte_setup_done = 1;
+			goto out;
+		}
+	}
+	if (kvm->arch.hpt_virt) {
+		order = kvm->arch.hpt_order;
+		/* Set the entire HPT to 0, i.e. invalid HPTEs */
+		memset((void *)kvm->arch.hpt_virt, 0, 1ul << order);
+		/*
+		 * Reset all the reverse-mapping chains for all memslots
+		 */
+		kvmppc_rmap_reset(kvm);
+		/* Ensure that each vcpu will flush its TLB on next entry. */
+		cpumask_setall(&kvm->arch.need_tlb_flush);
+		*htab_orderp = order;
+		err = 0;
+	} else {
+		err = kvmppc_alloc_hpt(kvm, htab_orderp);
+		order = *htab_orderp;
+	}
+ out:
+	mutex_unlock(&kvm->lock);
+	return err;
+}
+
+void kvmppc_free_hpt(struct kvm *kvm)
+{
+	kvmppc_free_lpid(kvm->arch.lpid);
+	vfree(kvm->arch.revmap);
+	if (kvm->arch.hpt_cma_alloc)
+		kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt),
+				1 << (kvm->arch.hpt_order - PAGE_SHIFT));
+	else
+		free_pages(kvm->arch.hpt_virt,
+			   kvm->arch.hpt_order - PAGE_SHIFT);
+}
+
+/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
+static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize)
+{
+	return (pgsize > 0x1000) ? HPTE_V_LARGE : 0;
+}
+
+/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */
+static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize)
+{
+	return (pgsize == 0x10000) ? 0x1000 : 0;
+}
+
+void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
+		     unsigned long porder)
+{
+	unsigned long i;
+	unsigned long npages;
+	unsigned long hp_v, hp_r;
+	unsigned long addr, hash;
+	unsigned long psize;
+	unsigned long hp0, hp1;
+	unsigned long idx_ret;
+	long ret;
+	struct kvm *kvm = vcpu->kvm;
+
+	psize = 1ul << porder;
+	npages = memslot->npages >> (porder - PAGE_SHIFT);
+
+	/* VRMA can't be > 1TB */
+	if (npages > 1ul << (40 - porder))
+		npages = 1ul << (40 - porder);
+	/* Can't use more than 1 HPTE per HPTEG */
+	if (npages > kvm->arch.hpt_mask + 1)
+		npages = kvm->arch.hpt_mask + 1;
+
+	hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
+		HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
+	hp1 = hpte1_pgsize_encoding(psize) |
+		HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;
+
+	for (i = 0; i < npages; ++i) {
+		addr = i << porder;
+		/* can't use hpt_hash since va > 64 bits */
+		hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask;
+		/*
+		 * We assume that the hash table is empty and no
+		 * vcpus are using it at this stage.  Since we create
+		 * at most one HPTE per HPTEG, we just assume entry 7
+		 * is available and use it.
+		 */
+		hash = (hash << 3) + 7;
+		hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
+		hp_r = hp1 | addr;
+		ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r,
+						 &idx_ret);
+		if (ret != H_SUCCESS) {
+			pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
+			       addr, ret);
+			break;
+		}
+	}
+}
+
+int kvmppc_mmu_hv_init(void)
+{
+	unsigned long host_lpid, rsvd_lpid;
+
+	if (!cpu_has_feature(CPU_FTR_HVMODE))
+		return -EINVAL;
+
+	/* POWER7 has 10-bit LPIDs (12-bit in POWER8) */
+	host_lpid = mfspr(SPRN_LPID);
+	rsvd_lpid = LPID_RSVD;
+
+	kvmppc_init_lpid(rsvd_lpid + 1);
+
+	kvmppc_claim_lpid(host_lpid);
+	/* rsvd_lpid is reserved for use in partition switching */
+	kvmppc_claim_lpid(rsvd_lpid);
+
+	return 0;
+}
+
+static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
+{
+	unsigned long msr = vcpu->arch.intr_msr;
+
+	/* If transactional, change to suspend mode on IRQ delivery */
+	if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr))
+		msr |= MSR_TS_S;
+	else
+		msr |= vcpu->arch.shregs.msr & MSR_TS_MASK;
+	kvmppc_set_msr(vcpu, msr);
+}
+
+long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
+				long pte_index, unsigned long pteh,
+				unsigned long ptel, unsigned long *pte_idx_ret)
+{
+	long ret;
+
+	/* Protect linux PTE lookup from page table destruction */
+	rcu_read_lock_sched();	/* this disables preemption too */
+	ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel,
+				current->mm->pgd, false, pte_idx_ret);
+	rcu_read_unlock_sched();
+	if (ret == H_TOO_HARD) {
+		/* this can't happen */
+		pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n");
+		ret = H_RESOURCE;	/* or something */
+	}
+	return ret;
+
+}
+
+static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
+							 gva_t eaddr)
+{
+	u64 mask;
+	int i;
+
+	for (i = 0; i < vcpu->arch.slb_nr; i++) {
+		if (!(vcpu->arch.slb[i].orige & SLB_ESID_V))
+			continue;
+
+		if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T)
+			mask = ESID_MASK_1T;
+		else
+			mask = ESID_MASK;
+
+		if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0)
+			return &vcpu->arch.slb[i];
+	}
+	return NULL;
+}
+
+static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
+			unsigned long ea)
+{
+	unsigned long ra_mask;
+
+	ra_mask = hpte_page_size(v, r) - 1;
+	return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
+}
+
+static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+			struct kvmppc_pte *gpte, bool data, bool iswrite)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvmppc_slb *slbe;
+	unsigned long slb_v;
+	unsigned long pp, key;
+	unsigned long v, gr;
+	__be64 *hptep;
+	int index;
+	int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);
+
+	/* Get SLB entry */
+	if (virtmode) {
+		slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
+		if (!slbe)
+			return -EINVAL;
+		slb_v = slbe->origv;
+	} else {
+		/* real mode access */
+		slb_v = vcpu->kvm->arch.vrma_slb_v;
+	}
+
+	preempt_disable();
+	/* Find the HPTE in the hash table */
+	index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v,
+					 HPTE_V_VALID | HPTE_V_ABSENT);
+	if (index < 0) {
+		preempt_enable();
+		return -ENOENT;
+	}
+	hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
+	v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
+	gr = kvm->arch.revmap[index].guest_rpte;
+
+	unlock_hpte(hptep, v);
+	preempt_enable();
+
+	gpte->eaddr = eaddr;
+	gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);
+
+	/* Get PP bits and key for permission check */
+	pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
+	key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
+	key &= slb_v;
+
+	/* Calculate permissions */
+	gpte->may_read = hpte_read_permission(pp, key);
+	gpte->may_write = hpte_write_permission(pp, key);
+	gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G));
+
+	/* Storage key permission check for POWER7 */
+	if (data && virtmode) {
+		int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr);
+		if (amrfield & 1)
+			gpte->may_read = 0;
+		if (amrfield & 2)
+			gpte->may_write = 0;
+	}
+
+	/* Get the guest physical address */
+	gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr);
+	return 0;
+}
+
+/*
+ * Quick test for whether an instruction is a load or a store.
+ * If the instruction is a load or a store, then this will indicate
+ * which it is, at least on server processors.  (Embedded processors
+ * have some external PID instructions that don't follow the rule
+ * embodied here.)  If the instruction isn't a load or store, then
+ * this doesn't return anything useful.
+ */
+static int instruction_is_store(unsigned int instr)
+{
+	unsigned int mask;
+
+	mask = 0x10000000;
+	if ((instr & 0xfc000000) == 0x7c000000)
+		mask = 0x100;		/* major opcode 31 */
+	return (instr & mask) != 0;
+}
+
+static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
+				  unsigned long gpa, gva_t ea, int is_store)
+{
+	u32 last_inst;
+
+	/*
+	 * If we fail, we just return to the guest and try executing it again.
+	 */
+	if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
+		EMULATE_DONE)
+		return RESUME_GUEST;
+
+	/*
+	 * WARNING: We do not know for sure whether the instruction we just
+	 * read from memory is the same that caused the fault in the first
+	 * place.  If the instruction we read is neither an load or a store,
+	 * then it can't access memory, so we don't need to worry about
+	 * enforcing access permissions.  So, assuming it is a load or
+	 * store, we just check that its direction (load or store) is
+	 * consistent with the original fault, since that's what we
+	 * checked the access permissions against.  If there is a mismatch
+	 * we just return and retry the instruction.
+	 */
+
+	if (instruction_is_store(last_inst) != !!is_store)
+		return RESUME_GUEST;
+
+	/*
+	 * Emulated accesses are emulated by looking at the hash for
+	 * translation once, then performing the access later. The
+	 * translation could be invalidated in the meantime in which
+	 * point performing the subsequent memory access on the old
+	 * physical address could possibly be a security hole for the
+	 * guest (but not the host).
+	 *
+	 * This is less of an issue for MMIO stores since they aren't
+	 * globally visible. It could be an issue for MMIO loads to
+	 * a certain extent but we'll ignore it for now.
+	 */
+
+	vcpu->arch.paddr_accessed = gpa;
+	vcpu->arch.vaddr_accessed = ea;
+	return kvmppc_emulate_mmio(run, vcpu);
+}
+
+int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
+				unsigned long ea, unsigned long dsisr)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long hpte[3], r;
+	__be64 *hptep;
+	unsigned long mmu_seq, psize, pte_size;
+	unsigned long gpa_base, gfn_base;
+	unsigned long gpa, gfn, hva, pfn;
+	struct kvm_memory_slot *memslot;
+	unsigned long *rmap;
+	struct revmap_entry *rev;
+	struct page *page, *pages[1];
+	long index, ret, npages;
+	unsigned long is_io;
+	unsigned int writing, write_ok;
+	struct vm_area_struct *vma;
+	unsigned long rcbits;
+
+	/*
+	 * Real-mode code has already searched the HPT and found the
+	 * entry we're interested in.  Lock the entry and check that
+	 * it hasn't changed.  If it has, just return and re-execute the
+	 * instruction.
+	 */
+	if (ea != vcpu->arch.pgfault_addr)
+		return RESUME_GUEST;
+	index = vcpu->arch.pgfault_index;
+	hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
+	rev = &kvm->arch.revmap[index];
+	preempt_disable();
+	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
+		cpu_relax();
+	hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
+	hpte[1] = be64_to_cpu(hptep[1]);
+	hpte[2] = r = rev->guest_rpte;
+	unlock_hpte(hptep, hpte[0]);
+	preempt_enable();
+
+	if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
+	    hpte[1] != vcpu->arch.pgfault_hpte[1])
+		return RESUME_GUEST;
+
+	/* Translate the logical address and get the page */
+	psize = hpte_page_size(hpte[0], r);
+	gpa_base = r & HPTE_R_RPN & ~(psize - 1);
+	gfn_base = gpa_base >> PAGE_SHIFT;
+	gpa = gpa_base | (ea & (psize - 1));
+	gfn = gpa >> PAGE_SHIFT;
+	memslot = gfn_to_memslot(kvm, gfn);
+
+	trace_kvm_page_fault_enter(vcpu, hpte, memslot, ea, dsisr);
+
+	/* No memslot means it's an emulated MMIO region */
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
+		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
+					      dsisr & DSISR_ISSTORE);
+
+	/*
+	 * This should never happen, because of the slot_is_aligned()
+	 * check in kvmppc_do_h_enter().
+	 */
+	if (gfn_base < memslot->base_gfn)
+		return -EFAULT;
+
+	/* used to check for invalidations in progress */
+	mmu_seq = kvm->mmu_notifier_seq;
+	smp_rmb();
+
+	ret = -EFAULT;
+	is_io = 0;
+	pfn = 0;
+	page = NULL;
+	pte_size = PAGE_SIZE;
+	writing = (dsisr & DSISR_ISSTORE) != 0;
+	/* If writing != 0, then the HPTE must allow writing, if we get here */
+	write_ok = writing;
+	hva = gfn_to_hva_memslot(memslot, gfn);
+	npages = get_user_pages_fast(hva, 1, writing, pages);
+	if (npages < 1) {
+		/* Check if it's an I/O mapping */
+		down_read(&current->mm->mmap_sem);
+		vma = find_vma(current->mm, hva);
+		if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end &&
+		    (vma->vm_flags & VM_PFNMAP)) {
+			pfn = vma->vm_pgoff +
+				((hva - vma->vm_start) >> PAGE_SHIFT);
+			pte_size = psize;
+			is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot));
+			write_ok = vma->vm_flags & VM_WRITE;
+		}
+		up_read(&current->mm->mmap_sem);
+		if (!pfn)
+			goto out_put;
+	} else {
+		page = pages[0];
+		pfn = page_to_pfn(page);
+		if (PageHuge(page)) {
+			page = compound_head(page);
+			pte_size <<= compound_order(page);
+		}
+		/* if the guest wants write access, see if that is OK */
+		if (!writing && hpte_is_writable(r)) {
+			pte_t *ptep, pte;
+			unsigned long flags;
+			/*
+			 * We need to protect against page table destruction
+			 * hugepage split and collapse.
+			 */
+			local_irq_save(flags);
+			ptep = find_linux_pte_or_hugepte(current->mm->pgd,
+							 hva, NULL);
+			if (ptep) {
+				pte = kvmppc_read_update_linux_pte(ptep, 1);
+				if (pte_write(pte))
+					write_ok = 1;
+			}
+			local_irq_restore(flags);
+		}
+	}
+
+	if (psize > pte_size)
+		goto out_put;
+
+	/* Check WIMG vs. the actual page we're accessing */
+	if (!hpte_cache_flags_ok(r, is_io)) {
+		if (is_io)
+			goto out_put;
+
+		/*
+		 * Allow guest to map emulated device memory as
+		 * uncacheable, but actually make it cacheable.
+		 */
+		r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M;
+	}
+
+	/*
+	 * Set the HPTE to point to pfn.
+	 * Since the pfn is at PAGE_SIZE granularity, make sure we
+	 * don't mask out lower-order bits if psize < PAGE_SIZE.
+	 */
+	if (psize < PAGE_SIZE)
+		psize = PAGE_SIZE;
+	r = (r & ~(HPTE_R_PP0 - psize)) | ((pfn << PAGE_SHIFT) & ~(psize - 1));
+	if (hpte_is_writable(r) && !write_ok)
+		r = hpte_make_readonly(r);
+	ret = RESUME_GUEST;
+	preempt_disable();
+	while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
+		cpu_relax();
+	if ((be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK) != hpte[0] ||
+		be64_to_cpu(hptep[1]) != hpte[1] ||
+		rev->guest_rpte != hpte[2])
+		/* HPTE has been changed under us; let the guest retry */
+		goto out_unlock;
+	hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
+
+	/* Always put the HPTE in the rmap chain for the page base address */
+	rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn];
+	lock_rmap(rmap);
+
+	/* Check if we might have been invalidated; let the guest retry if so */
+	ret = RESUME_GUEST;
+	if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
+		unlock_rmap(rmap);
+		goto out_unlock;
+	}
+
+	/* Only set R/C in real HPTE if set in both *rmap and guest_rpte */
+	rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
+	r &= rcbits | ~(HPTE_R_R | HPTE_R_C);
+
+	if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) {
+		/* HPTE was previously valid, so we need to invalidate it */
+		unlock_rmap(rmap);
+		hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
+		kvmppc_invalidate_hpte(kvm, hptep, index);
+		/* don't lose previous R and C bits */
+		r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
+	} else {
+		kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
+	}
+
+	hptep[1] = cpu_to_be64(r);
+	eieio();
+	__unlock_hpte(hptep, hpte[0]);
+	asm volatile("ptesync" : : : "memory");
+	preempt_enable();
+	if (page && hpte_is_writable(r))
+		SetPageDirty(page);
+
+ out_put:
+	trace_kvm_page_fault_exit(vcpu, hpte, ret);
+
+	if (page) {
+		/*
+		 * We drop pages[0] here, not page because page might
+		 * have been set to the head page of a compound, but
+		 * we have to drop the reference on the correct tail
+		 * page to match the get inside gup()
+		 */
+		put_page(pages[0]);
+	}
+	return ret;
+
+ out_unlock:
+	__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
+	preempt_enable();
+	goto out_put;
+}
+
+static void kvmppc_rmap_reset(struct kvm *kvm)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int srcu_idx;
+
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	slots = kvm->memslots;
+	kvm_for_each_memslot(memslot, slots) {
+		/*
+		 * This assumes it is acceptable to lose reference and
+		 * change bits across a reset.
+		 */
+		memset(memslot->arch.rmap, 0,
+		       memslot->npages * sizeof(*memslot->arch.rmap));
+	}
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+}
+
+static int kvm_handle_hva_range(struct kvm *kvm,
+				unsigned long start,
+				unsigned long end,
+				int (*handler)(struct kvm *kvm,
+					       unsigned long *rmapp,
+					       unsigned long gfn))
+{
+	int ret;
+	int retval = 0;
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+
+	slots = kvm_memslots(kvm);
+	kvm_for_each_memslot(memslot, slots) {
+		unsigned long hva_start, hva_end;
+		gfn_t gfn, gfn_end;
+
+		hva_start = max(start, memslot->userspace_addr);
+		hva_end = min(end, memslot->userspace_addr +
+					(memslot->npages << PAGE_SHIFT));
+		if (hva_start >= hva_end)
+			continue;
+		/*
+		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
+		 * {gfn, gfn+1, ..., gfn_end-1}.
+		 */
+		gfn = hva_to_gfn_memslot(hva_start, memslot);
+		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
+
+		for (; gfn < gfn_end; ++gfn) {
+			gfn_t gfn_offset = gfn - memslot->base_gfn;
+
+			ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn);
+			retval |= ret;
+		}
+	}
+
+	return retval;
+}
+
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+			  int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+					 unsigned long gfn))
+{
+	return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
+}
+
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+			   unsigned long gfn)
+{
+	struct revmap_entry *rev = kvm->arch.revmap;
+	unsigned long h, i, j;
+	__be64 *hptep;
+	unsigned long ptel, psize, rcbits;
+
+	for (;;) {
+		lock_rmap(rmapp);
+		if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
+			unlock_rmap(rmapp);
+			break;
+		}
+
+		/*
+		 * To avoid an ABBA deadlock with the HPTE lock bit,
+		 * we can't spin on the HPTE lock while holding the
+		 * rmap chain lock.
+		 */
+		i = *rmapp & KVMPPC_RMAP_INDEX;
+		hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
+		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
+			/* unlock rmap before spinning on the HPTE lock */
+			unlock_rmap(rmapp);
+			while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
+				cpu_relax();
+			continue;
+		}
+		j = rev[i].forw;
+		if (j == i) {
+			/* chain is now empty */
+			*rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
+		} else {
+			/* remove i from chain */
+			h = rev[i].back;
+			rev[h].forw = j;
+			rev[j].back = h;
+			rev[i].forw = rev[i].back = i;
+			*rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
+		}
+
+		/* Now check and modify the HPTE */
+		ptel = rev[i].guest_rpte;
+		psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel);
+		if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
+		    hpte_rpn(ptel, psize) == gfn) {
+			hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
+			kvmppc_invalidate_hpte(kvm, hptep, i);
+			/* Harvest R and C */
+			rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
+			*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
+			if (rcbits & ~rev[i].guest_rpte) {
+				rev[i].guest_rpte = ptel | rcbits;
+				note_hpte_modification(kvm, &rev[i]);
+			}
+		}
+		unlock_rmap(rmapp);
+		__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
+	}
+	return 0;
+}
+
+int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva)
+{
+	kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+	return 0;
+}
+
+int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+	kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp);
+	return 0;
+}
+
+void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
+				  struct kvm_memory_slot *memslot)
+{
+	unsigned long *rmapp;
+	unsigned long gfn;
+	unsigned long n;
+
+	rmapp = memslot->arch.rmap;
+	gfn = memslot->base_gfn;
+	for (n = memslot->npages; n; --n) {
+		/*
+		 * Testing the present bit without locking is OK because
+		 * the memslot has been marked invalid already, and hence
+		 * no new HPTEs referencing this page can be created,
+		 * thus the present bit can't go from 0 to 1.
+		 */
+		if (*rmapp & KVMPPC_RMAP_PRESENT)
+			kvm_unmap_rmapp(kvm, rmapp, gfn);
+		++rmapp;
+		++gfn;
+	}
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+			 unsigned long gfn)
+{
+	struct revmap_entry *rev = kvm->arch.revmap;
+	unsigned long head, i, j;
+	__be64 *hptep;
+	int ret = 0;
+
+ retry:
+	lock_rmap(rmapp);
+	if (*rmapp & KVMPPC_RMAP_REFERENCED) {
+		*rmapp &= ~KVMPPC_RMAP_REFERENCED;
+		ret = 1;
+	}
+	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
+		unlock_rmap(rmapp);
+		return ret;
+	}
+
+	i = head = *rmapp & KVMPPC_RMAP_INDEX;
+	do {
+		hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
+		j = rev[i].forw;
+
+		/* If this HPTE isn't referenced, ignore it */
+		if (!(be64_to_cpu(hptep[1]) & HPTE_R_R))
+			continue;
+
+		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
+			/* unlock rmap before spinning on the HPTE lock */
+			unlock_rmap(rmapp);
+			while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
+				cpu_relax();
+			goto retry;
+		}
+
+		/* Now check and modify the HPTE */
+		if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
+		    (be64_to_cpu(hptep[1]) & HPTE_R_R)) {
+			kvmppc_clear_ref_hpte(kvm, hptep, i);
+			if (!(rev[i].guest_rpte & HPTE_R_R)) {
+				rev[i].guest_rpte |= HPTE_R_R;
+				note_hpte_modification(kvm, &rev[i]);
+			}
+			ret = 1;
+		}
+		__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
+	} while ((i = j) != head);
+
+	unlock_rmap(rmapp);
+	return ret;
+}
+
+int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+	return kvm_handle_hva_range(kvm, start, end, kvm_age_rmapp);
+}
+
+static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+			      unsigned long gfn)
+{
+	struct revmap_entry *rev = kvm->arch.revmap;
+	unsigned long head, i, j;
+	unsigned long *hp;
+	int ret = 1;
+
+	if (*rmapp & KVMPPC_RMAP_REFERENCED)
+		return 1;
+
+	lock_rmap(rmapp);
+	if (*rmapp & KVMPPC_RMAP_REFERENCED)
+		goto out;
+
+	if (*rmapp & KVMPPC_RMAP_PRESENT) {
+		i = head = *rmapp & KVMPPC_RMAP_INDEX;
+		do {
+			hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4));
+			j = rev[i].forw;
+			if (be64_to_cpu(hp[1]) & HPTE_R_R)
+				goto out;
+		} while ((i = j) != head);
+	}
+	ret = 0;
+
+ out:
+	unlock_rmap(rmapp);
+	return ret;
+}
+
+int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva)
+{
+	return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp);
+}
+
+void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+	kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
+}
+
+static int vcpus_running(struct kvm *kvm)
+{
+	return atomic_read(&kvm->arch.vcpus_running) != 0;
+}
+
+/*
+ * Returns the number of system pages that are dirty.
+ * This can be more than 1 if we find a huge-page HPTE.
+ */
+static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
+{
+	struct revmap_entry *rev = kvm->arch.revmap;
+	unsigned long head, i, j;
+	unsigned long n;
+	unsigned long v, r;
+	__be64 *hptep;
+	int npages_dirty = 0;
+
+ retry:
+	lock_rmap(rmapp);
+	if (*rmapp & KVMPPC_RMAP_CHANGED) {
+		*rmapp &= ~KVMPPC_RMAP_CHANGED;
+		npages_dirty = 1;
+	}
+	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
+		unlock_rmap(rmapp);
+		return npages_dirty;
+	}
+
+	i = head = *rmapp & KVMPPC_RMAP_INDEX;
+	do {
+		unsigned long hptep1;
+		hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
+		j = rev[i].forw;
+
+		/*
+		 * Checking the C (changed) bit here is racy since there
+		 * is no guarantee about when the hardware writes it back.
+		 * If the HPTE is not writable then it is stable since the
+		 * page can't be written to, and we would have done a tlbie
+		 * (which forces the hardware to complete any writeback)
+		 * when making the HPTE read-only.
+		 * If vcpus are running then this call is racy anyway
+		 * since the page could get dirtied subsequently, so we
+		 * expect there to be a further call which would pick up
+		 * any delayed C bit writeback.
+		 * Otherwise we need to do the tlbie even if C==0 in
+		 * order to pick up any delayed writeback of C.
+		 */
+		hptep1 = be64_to_cpu(hptep[1]);
+		if (!(hptep1 & HPTE_R_C) &&
+		    (!hpte_is_writable(hptep1) || vcpus_running(kvm)))
+			continue;
+
+		if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
+			/* unlock rmap before spinning on the HPTE lock */
+			unlock_rmap(rmapp);
+			while (hptep[0] & cpu_to_be64(HPTE_V_HVLOCK))
+				cpu_relax();
+			goto retry;
+		}
+
+		/* Now check and modify the HPTE */
+		if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) {
+			__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
+			continue;
+		}
+
+		/* need to make it temporarily absent so C is stable */
+		hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
+		kvmppc_invalidate_hpte(kvm, hptep, i);
+		v = be64_to_cpu(hptep[0]);
+		r = be64_to_cpu(hptep[1]);
+		if (r & HPTE_R_C) {
+			hptep[1] = cpu_to_be64(r & ~HPTE_R_C);
+			if (!(rev[i].guest_rpte & HPTE_R_C)) {
+				rev[i].guest_rpte |= HPTE_R_C;
+				note_hpte_modification(kvm, &rev[i]);
+			}
+			n = hpte_page_size(v, r);
+			n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT;
+			if (n > npages_dirty)
+				npages_dirty = n;
+			eieio();
+		}
+		v &= ~HPTE_V_ABSENT;
+		v |= HPTE_V_VALID;
+		__unlock_hpte(hptep, v);
+	} while ((i = j) != head);
+
+	unlock_rmap(rmapp);
+	return npages_dirty;
+}
+
+static void harvest_vpa_dirty(struct kvmppc_vpa *vpa,
+			      struct kvm_memory_slot *memslot,
+			      unsigned long *map)
+{
+	unsigned long gfn;
+
+	if (!vpa->dirty || !vpa->pinned_addr)
+		return;
+	gfn = vpa->gpa >> PAGE_SHIFT;
+	if (gfn < memslot->base_gfn ||
+	    gfn >= memslot->base_gfn + memslot->npages)
+		return;
+
+	vpa->dirty = false;
+	if (map)
+		__set_bit_le(gfn - memslot->base_gfn, map);
+}
+
+long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			     unsigned long *map)
+{
+	unsigned long i, j;
+	unsigned long *rmapp;
+	struct kvm_vcpu *vcpu;
+
+	preempt_disable();
+	rmapp = memslot->arch.rmap;
+	for (i = 0; i < memslot->npages; ++i) {
+		int npages = kvm_test_clear_dirty_npages(kvm, rmapp);
+		/*
+		 * Note that if npages > 0 then i must be a multiple of npages,
+		 * since we always put huge-page HPTEs in the rmap chain
+		 * corresponding to their page base address.
+		 */
+		if (npages && map)
+			for (j = i; npages; ++j, --npages)
+				__set_bit_le(j, map);
+		++rmapp;
+	}
+
+	/* Harvest dirty bits from VPA and DTL updates */
+	/* Note: we never modify the SLB shadow buffer areas */
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		spin_lock(&vcpu->arch.vpa_update_lock);
+		harvest_vpa_dirty(&vcpu->arch.vpa, memslot, map);
+		harvest_vpa_dirty(&vcpu->arch.dtl, memslot, map);
+		spin_unlock(&vcpu->arch.vpa_update_lock);
+	}
+	preempt_enable();
+	return 0;
+}
+
+void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
+			    unsigned long *nb_ret)
+{
+	struct kvm_memory_slot *memslot;
+	unsigned long gfn = gpa >> PAGE_SHIFT;
+	struct page *page, *pages[1];
+	int npages;
+	unsigned long hva, offset;
+	int srcu_idx;
+
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	memslot = gfn_to_memslot(kvm, gfn);
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
+		goto err;
+	hva = gfn_to_hva_memslot(memslot, gfn);
+	npages = get_user_pages_fast(hva, 1, 1, pages);
+	if (npages < 1)
+		goto err;
+	page = pages[0];
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+	offset = gpa & (PAGE_SIZE - 1);
+	if (nb_ret)
+		*nb_ret = PAGE_SIZE - offset;
+	return page_address(page) + offset;
+
+ err:
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+	return NULL;
+}
+
+void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
+			     bool dirty)
+{
+	struct page *page = virt_to_page(va);
+	struct kvm_memory_slot *memslot;
+	unsigned long gfn;
+	unsigned long *rmap;
+	int srcu_idx;
+
+	put_page(page);
+
+	if (!dirty)
+		return;
+
+	/* We need to mark this page dirty in the rmap chain */
+	gfn = gpa >> PAGE_SHIFT;
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	memslot = gfn_to_memslot(kvm, gfn);
+	if (memslot) {
+		rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
+		lock_rmap(rmap);
+		*rmap |= KVMPPC_RMAP_CHANGED;
+		unlock_rmap(rmap);
+	}
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+}
+
+/*
+ * Functions for reading and writing the hash table via reads and
+ * writes on a file descriptor.
+ *
+ * Reads return the guest view of the hash table, which has to be
+ * pieced together from the real hash table and the guest_rpte
+ * values in the revmap array.
+ *
+ * On writes, each HPTE written is considered in turn, and if it
+ * is valid, it is written to the HPT as if an H_ENTER with the
+ * exact flag set was done.  When the invalid count is non-zero
+ * in the header written to the stream, the kernel will make
+ * sure that that many HPTEs are invalid, and invalidate them
+ * if not.
+ */
+
+struct kvm_htab_ctx {
+	unsigned long	index;
+	unsigned long	flags;
+	struct kvm	*kvm;
+	int		first_pass;
+};
+
+#define HPTE_SIZE	(2 * sizeof(unsigned long))
+
+/*
+ * Returns 1 if this HPT entry has been modified or has pending
+ * R/C bit changes.
+ */
+static int hpte_dirty(struct revmap_entry *revp, __be64 *hptp)
+{
+	unsigned long rcbits_unset;
+
+	if (revp->guest_rpte & HPTE_GR_MODIFIED)
+		return 1;
+
+	/* Also need to consider changes in reference and changed bits */
+	rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
+	if ((be64_to_cpu(hptp[0]) & HPTE_V_VALID) &&
+	    (be64_to_cpu(hptp[1]) & rcbits_unset))
+		return 1;
+
+	return 0;
+}
+
+static long record_hpte(unsigned long flags, __be64 *hptp,
+			unsigned long *hpte, struct revmap_entry *revp,
+			int want_valid, int first_pass)
+{
+	unsigned long v, r;
+	unsigned long rcbits_unset;
+	int ok = 1;
+	int valid, dirty;
+
+	/* Unmodified entries are uninteresting except on the first pass */
+	dirty = hpte_dirty(revp, hptp);
+	if (!first_pass && !dirty)
+		return 0;
+
+	valid = 0;
+	if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) {
+		valid = 1;
+		if ((flags & KVM_GET_HTAB_BOLTED_ONLY) &&
+		    !(be64_to_cpu(hptp[0]) & HPTE_V_BOLTED))
+			valid = 0;
+	}
+	if (valid != want_valid)
+		return 0;
+
+	v = r = 0;
+	if (valid || dirty) {
+		/* lock the HPTE so it's stable and read it */
+		preempt_disable();
+		while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
+			cpu_relax();
+		v = be64_to_cpu(hptp[0]);
+
+		/* re-evaluate valid and dirty from synchronized HPTE value */
+		valid = !!(v & HPTE_V_VALID);
+		dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
+
+		/* Harvest R and C into guest view if necessary */
+		rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
+		if (valid && (rcbits_unset & be64_to_cpu(hptp[1]))) {
+			revp->guest_rpte |= (be64_to_cpu(hptp[1]) &
+				(HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED;
+			dirty = 1;
+		}
+
+		if (v & HPTE_V_ABSENT) {
+			v &= ~HPTE_V_ABSENT;
+			v |= HPTE_V_VALID;
+			valid = 1;
+		}
+		if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED))
+			valid = 0;
+
+		r = revp->guest_rpte;
+		/* only clear modified if this is the right sort of entry */
+		if (valid == want_valid && dirty) {
+			r &= ~HPTE_GR_MODIFIED;
+			revp->guest_rpte = r;
+		}
+		unlock_hpte(hptp, be64_to_cpu(hptp[0]));
+		preempt_enable();
+		if (!(valid == want_valid && (first_pass || dirty)))
+			ok = 0;
+	}
+	hpte[0] = cpu_to_be64(v);
+	hpte[1] = cpu_to_be64(r);
+	return ok;
+}
+
+static ssize_t kvm_htab_read(struct file *file, char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	struct kvm_htab_ctx *ctx = file->private_data;
+	struct kvm *kvm = ctx->kvm;
+	struct kvm_get_htab_header hdr;
+	__be64 *hptp;
+	struct revmap_entry *revp;
+	unsigned long i, nb, nw;
+	unsigned long __user *lbuf;
+	struct kvm_get_htab_header __user *hptr;
+	unsigned long flags;
+	int first_pass;
+	unsigned long hpte[2];
+
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+
+	first_pass = ctx->first_pass;
+	flags = ctx->flags;
+
+	i = ctx->index;
+	hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
+	revp = kvm->arch.revmap + i;
+	lbuf = (unsigned long __user *)buf;
+
+	nb = 0;
+	while (nb + sizeof(hdr) + HPTE_SIZE < count) {
+		/* Initialize header */
+		hptr = (struct kvm_get_htab_header __user *)buf;
+		hdr.n_valid = 0;
+		hdr.n_invalid = 0;
+		nw = nb;
+		nb += sizeof(hdr);
+		lbuf = (unsigned long __user *)(buf + sizeof(hdr));
+
+		/* Skip uninteresting entries, i.e. clean on not-first pass */
+		if (!first_pass) {
+			while (i < kvm->arch.hpt_npte &&
+			       !hpte_dirty(revp, hptp)) {
+				++i;
+				hptp += 2;
+				++revp;
+			}
+		}
+		hdr.index = i;
+
+		/* Grab a series of valid entries */
+		while (i < kvm->arch.hpt_npte &&
+		       hdr.n_valid < 0xffff &&
+		       nb + HPTE_SIZE < count &&
+		       record_hpte(flags, hptp, hpte, revp, 1, first_pass)) {
+			/* valid entry, write it out */
+			++hdr.n_valid;
+			if (__put_user(hpte[0], lbuf) ||
+			    __put_user(hpte[1], lbuf + 1))
+				return -EFAULT;
+			nb += HPTE_SIZE;
+			lbuf += 2;
+			++i;
+			hptp += 2;
+			++revp;
+		}
+		/* Now skip invalid entries while we can */
+		while (i < kvm->arch.hpt_npte &&
+		       hdr.n_invalid < 0xffff &&
+		       record_hpte(flags, hptp, hpte, revp, 0, first_pass)) {
+			/* found an invalid entry */
+			++hdr.n_invalid;
+			++i;
+			hptp += 2;
+			++revp;
+		}
+
+		if (hdr.n_valid || hdr.n_invalid) {
+			/* write back the header */
+			if (__copy_to_user(hptr, &hdr, sizeof(hdr)))
+				return -EFAULT;
+			nw = nb;
+			buf = (char __user *)lbuf;
+		} else {
+			nb = nw;
+		}
+
+		/* Check if we've wrapped around the hash table */
+		if (i >= kvm->arch.hpt_npte) {
+			i = 0;
+			ctx->first_pass = 0;
+			break;
+		}
+	}
+
+	ctx->index = i;
+
+	return nb;
+}
+
+static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
+			      size_t count, loff_t *ppos)
+{
+	struct kvm_htab_ctx *ctx = file->private_data;
+	struct kvm *kvm = ctx->kvm;
+	struct kvm_get_htab_header hdr;
+	unsigned long i, j;
+	unsigned long v, r;
+	unsigned long __user *lbuf;
+	__be64 *hptp;
+	unsigned long tmp[2];
+	ssize_t nb;
+	long int err, ret;
+	int hpte_setup;
+
+	if (!access_ok(VERIFY_READ, buf, count))
+		return -EFAULT;
+
+	/* lock out vcpus from running while we're doing this */
+	mutex_lock(&kvm->lock);
+	hpte_setup = kvm->arch.hpte_setup_done;
+	if (hpte_setup) {
+		kvm->arch.hpte_setup_done = 0;	/* temporarily */
+		/* order hpte_setup_done vs. vcpus_running */
+		smp_mb();
+		if (atomic_read(&kvm->arch.vcpus_running)) {
+			kvm->arch.hpte_setup_done = 1;
+			mutex_unlock(&kvm->lock);
+			return -EBUSY;
+		}
+	}
+
+	err = 0;
+	for (nb = 0; nb + sizeof(hdr) <= count; ) {
+		err = -EFAULT;
+		if (__copy_from_user(&hdr, buf, sizeof(hdr)))
+			break;
+
+		err = 0;
+		if (nb + hdr.n_valid * HPTE_SIZE > count)
+			break;
+
+		nb += sizeof(hdr);
+		buf += sizeof(hdr);
+
+		err = -EINVAL;
+		i = hdr.index;
+		if (i >= kvm->arch.hpt_npte ||
+		    i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte)
+			break;
+
+		hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
+		lbuf = (unsigned long __user *)buf;
+		for (j = 0; j < hdr.n_valid; ++j) {
+			__be64 hpte_v;
+			__be64 hpte_r;
+
+			err = -EFAULT;
+			if (__get_user(hpte_v, lbuf) ||
+			    __get_user(hpte_r, lbuf + 1))
+				goto out;
+			v = be64_to_cpu(hpte_v);
+			r = be64_to_cpu(hpte_r);
+			err = -EINVAL;
+			if (!(v & HPTE_V_VALID))
+				goto out;
+			lbuf += 2;
+			nb += HPTE_SIZE;
+
+			if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
+				kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
+			err = -EIO;
+			ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r,
+							 tmp);
+			if (ret != H_SUCCESS) {
+				pr_err("kvm_htab_write ret %ld i=%ld v=%lx "
+				       "r=%lx\n", ret, i, v, r);
+				goto out;
+			}
+			if (!hpte_setup && is_vrma_hpte(v)) {
+				unsigned long psize = hpte_base_page_size(v, r);
+				unsigned long senc = slb_pgsize_encoding(psize);
+				unsigned long lpcr;
+
+				kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
+					(VRMA_VSID << SLB_VSID_SHIFT_1T);
+				lpcr = senc << (LPCR_VRMASD_SH - 4);
+				kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
+				hpte_setup = 1;
+			}
+			++i;
+			hptp += 2;
+		}
+
+		for (j = 0; j < hdr.n_invalid; ++j) {
+			if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
+				kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
+			++i;
+			hptp += 2;
+		}
+		err = 0;
+	}
+
+ out:
+	/* Order HPTE updates vs. hpte_setup_done */
+	smp_wmb();
+	kvm->arch.hpte_setup_done = hpte_setup;
+	mutex_unlock(&kvm->lock);
+
+	if (err)
+		return err;
+	return nb;
+}
+
+static int kvm_htab_release(struct inode *inode, struct file *filp)
+{
+	struct kvm_htab_ctx *ctx = filp->private_data;
+
+	filp->private_data = NULL;
+	if (!(ctx->flags & KVM_GET_HTAB_WRITE))
+		atomic_dec(&ctx->kvm->arch.hpte_mod_interest);
+	kvm_put_kvm(ctx->kvm);
+	kfree(ctx);
+	return 0;
+}
+
+static const struct file_operations kvm_htab_fops = {
+	.read		= kvm_htab_read,
+	.write		= kvm_htab_write,
+	.llseek		= default_llseek,
+	.release	= kvm_htab_release,
+};
+
+int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
+{
+	int ret;
+	struct kvm_htab_ctx *ctx;
+	int rwflag;
+
+	/* reject flags we don't recognize */
+	if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE))
+		return -EINVAL;
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+	kvm_get_kvm(kvm);
+	ctx->kvm = kvm;
+	ctx->index = ghf->start_index;
+	ctx->flags = ghf->flags;
+	ctx->first_pass = 1;
+
+	rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
+	ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC);
+	if (ret < 0) {
+		kvm_put_kvm(kvm);
+		return ret;
+	}
+
+	if (rwflag == O_RDONLY) {
+		mutex_lock(&kvm->slots_lock);
+		atomic_inc(&kvm->arch.hpte_mod_interest);
+		/* make sure kvmppc_do_h_enter etc. see the increment */
+		synchronize_srcu_expedited(&kvm->srcu);
+		mutex_unlock(&kvm->slots_lock);
+	}
+
+	return ret;
+}
+
+struct debugfs_htab_state {
+	struct kvm	*kvm;
+	struct mutex	mutex;
+	unsigned long	hpt_index;
+	int		chars_left;
+	int		buf_index;
+	char		buf[64];
+};
+
+static int debugfs_htab_open(struct inode *inode, struct file *file)
+{
+	struct kvm *kvm = inode->i_private;
+	struct debugfs_htab_state *p;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	kvm_get_kvm(kvm);
+	p->kvm = kvm;
+	mutex_init(&p->mutex);
+	file->private_data = p;
+
+	return nonseekable_open(inode, file);
+}
+
+static int debugfs_htab_release(struct inode *inode, struct file *file)
+{
+	struct debugfs_htab_state *p = file->private_data;
+
+	kvm_put_kvm(p->kvm);
+	kfree(p);
+	return 0;
+}
+
+static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
+				 size_t len, loff_t *ppos)
+{
+	struct debugfs_htab_state *p = file->private_data;
+	ssize_t ret, r;
+	unsigned long i, n;
+	unsigned long v, hr, gr;
+	struct kvm *kvm;
+	__be64 *hptp;
+
+	ret = mutex_lock_interruptible(&p->mutex);
+	if (ret)
+		return ret;
+
+	if (p->chars_left) {
+		n = p->chars_left;
+		if (n > len)
+			n = len;
+		r = copy_to_user(buf, p->buf + p->buf_index, n);
+		n -= r;
+		p->chars_left -= n;
+		p->buf_index += n;
+		buf += n;
+		len -= n;
+		ret = n;
+		if (r) {
+			if (!n)
+				ret = -EFAULT;
+			goto out;
+		}
+	}
+
+	kvm = p->kvm;
+	i = p->hpt_index;
+	hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
+	for (; len != 0 && i < kvm->arch.hpt_npte; ++i, hptp += 2) {
+		if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)))
+			continue;
+
+		/* lock the HPTE so it's stable and read it */
+		preempt_disable();
+		while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
+			cpu_relax();
+		v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK;
+		hr = be64_to_cpu(hptp[1]);
+		gr = kvm->arch.revmap[i].guest_rpte;
+		unlock_hpte(hptp, v);
+		preempt_enable();
+
+		if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
+			continue;
+
+		n = scnprintf(p->buf, sizeof(p->buf),
+			      "%6lx %.16lx %.16lx %.16lx\n",
+			      i, v, hr, gr);
+		p->chars_left = n;
+		if (n > len)
+			n = len;
+		r = copy_to_user(buf, p->buf, n);
+		n -= r;
+		p->chars_left -= n;
+		p->buf_index = n;
+		buf += n;
+		len -= n;
+		ret += n;
+		if (r) {
+			if (!ret)
+				ret = -EFAULT;
+			goto out;
+		}
+	}
+	p->hpt_index = i;
+
+ out:
+	mutex_unlock(&p->mutex);
+	return ret;
+}
+
+ssize_t debugfs_htab_write(struct file *file, const char __user *buf,
+			   size_t len, loff_t *ppos)
+{
+	return -EACCES;
+}
+
+static const struct file_operations debugfs_htab_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = debugfs_htab_open,
+	.release = debugfs_htab_release,
+	.read	 = debugfs_htab_read,
+	.write	 = debugfs_htab_write,
+	.llseek	 = generic_file_llseek,
+};
+
+void kvmppc_mmu_debugfs_init(struct kvm *kvm)
+{
+	kvm->arch.htab_dentry = debugfs_create_file("htab", 0400,
+						    kvm->arch.debugfs_dir, kvm,
+						    &debugfs_htab_fops);
+}
+
+void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
+
+	vcpu->arch.slb_nr = 32;		/* POWER7/POWER8 */
+
+	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
+	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
+
+	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
+}
diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S
new file mode 100644
index 000000000..3589c4e3d
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_slb.S
@@ -0,0 +1,153 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#define SHADOW_SLB_ENTRY_LEN	0x10
+#define OFFSET_ESID(x)		(SHADOW_SLB_ENTRY_LEN * x)
+#define OFFSET_VSID(x)		((SHADOW_SLB_ENTRY_LEN * x) + 8)
+
+/******************************************************************************
+ *                                                                            *
+ *                               Entry code                                   *
+ *                                                                            *
+ *****************************************************************************/
+
+.macro LOAD_GUEST_SEGMENTS
+
+	/* Required state:
+	 *
+	 * MSR = ~IR|DR
+	 * R13 = PACA
+	 * R1 = host R1
+	 * R2 = host R2
+	 * R3 = shadow vcpu
+	 * all other volatile GPRS = free except R4, R6
+	 * SVCPU[CR]  = guest CR
+	 * SVCPU[XER] = guest XER
+	 * SVCPU[CTR] = guest CTR
+	 * SVCPU[LR]  = guest LR
+	 */
+
+BEGIN_FW_FTR_SECTION
+
+	/* Declare SLB shadow as 0 entries big */
+
+	ld	r11, PACA_SLBSHADOWPTR(r13)
+	li	r8, 0
+	stb	r8, 3(r11)
+
+END_FW_FTR_SECTION_IFSET(FW_FEATURE_LPAR)
+
+	/* Flush SLB */
+
+	li	r10, 0
+	slbmte	r10, r10
+	slbia
+
+	/* Fill SLB with our shadow */
+
+	lbz	r12, SVCPU_SLB_MAX(r3)
+	mulli	r12, r12, 16
+	addi	r12, r12, SVCPU_SLB
+	add	r12, r12, r3
+
+	/* for (r11 = kvm_slb; r11 < kvm_slb + kvm_slb_size; r11+=slb_entry) */
+	li	r11, SVCPU_SLB
+	add	r11, r11, r3
+
+slb_loop_enter:
+
+	ld	r10, 0(r11)
+
+	andis.	r9, r10, SLB_ESID_V@h
+	beq	slb_loop_enter_skip
+
+	ld	r9, 8(r11)
+	slbmte	r9, r10
+
+slb_loop_enter_skip:
+	addi	r11, r11, 16
+	cmpd	cr0, r11, r12
+	blt	slb_loop_enter
+
+slb_do_enter:
+
+.endm
+
+/******************************************************************************
+ *                                                                            *
+ *                               Exit code                                    *
+ *                                                                            *
+ *****************************************************************************/
+
+.macro LOAD_HOST_SEGMENTS
+
+	/* Register usage at this point:
+	 *
+	 * R1         = host R1
+	 * R2         = host R2
+	 * R12        = exit handler id
+	 * R13        = shadow vcpu - SHADOW_VCPU_OFF [=PACA on PPC64]
+	 * SVCPU.*    = guest *
+	 * SVCPU[CR]  = guest CR
+	 * SVCPU[XER] = guest XER
+	 * SVCPU[CTR] = guest CTR
+	 * SVCPU[LR]  = guest LR
+	 *
+	 */
+
+	/* Remove all SLB entries that are in use. */
+
+	li	r0, r0
+	slbmte	r0, r0
+	slbia
+
+	/* Restore bolted entries from the shadow */
+
+	ld	r11, PACA_SLBSHADOWPTR(r13)
+
+BEGIN_FW_FTR_SECTION
+
+	/* Declare SLB shadow as SLB_NUM_BOLTED entries big */
+
+	li	r8, SLB_NUM_BOLTED
+	stb	r8, 3(r11)
+
+END_FW_FTR_SECTION_IFSET(FW_FEATURE_LPAR)
+
+	/* Manually load all entries from shadow SLB */
+
+	li	r8, SLBSHADOW_SAVEAREA
+	li	r7, SLBSHADOW_SAVEAREA + 8
+
+	.rept	SLB_NUM_BOLTED
+	LDX_BE	r10, r11, r8
+	cmpdi	r10, 0
+	beq	1f
+	LDX_BE	r9, r11, r7
+	slbmte	r9, r10
+1:	addi	r7, r7, SHADOW_SLB_ENTRY_LEN
+	addi	r8, r8, SHADOW_SLB_ENTRY_LEN
+	.endr
+
+	isync
+	sync
+
+slb_do_exit:
+
+.endm
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
new file mode 100644
index 000000000..54cf9bc94
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -0,0 +1,150 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <linux/list.h>
+#include <linux/anon_inodes.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+#include <asm/kvm_host.h>
+#include <asm/udbg.h>
+
+#define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))
+
+static long kvmppc_stt_npages(unsigned long window_size)
+{
+	return ALIGN((window_size >> SPAPR_TCE_SHIFT)
+		     * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
+}
+
+static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
+{
+	struct kvm *kvm = stt->kvm;
+	int i;
+
+	mutex_lock(&kvm->lock);
+	list_del(&stt->list);
+	for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
+		__free_page(stt->pages[i]);
+	kfree(stt);
+	mutex_unlock(&kvm->lock);
+
+	kvm_put_kvm(kvm);
+}
+
+static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
+	struct page *page;
+
+	if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
+		return VM_FAULT_SIGBUS;
+
+	page = stt->pages[vmf->pgoff];
+	get_page(page);
+	vmf->page = page;
+	return 0;
+}
+
+static const struct vm_operations_struct kvm_spapr_tce_vm_ops = {
+	.fault = kvm_spapr_tce_fault,
+};
+
+static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &kvm_spapr_tce_vm_ops;
+	return 0;
+}
+
+static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
+{
+	struct kvmppc_spapr_tce_table *stt = filp->private_data;
+
+	release_spapr_tce_table(stt);
+	return 0;
+}
+
+static const struct file_operations kvm_spapr_tce_fops = {
+	.mmap           = kvm_spapr_tce_mmap,
+	.release	= kvm_spapr_tce_release,
+};
+
+long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
+				   struct kvm_create_spapr_tce *args)
+{
+	struct kvmppc_spapr_tce_table *stt = NULL;
+	long npages;
+	int ret = -ENOMEM;
+	int i;
+
+	/* Check this LIOBN hasn't been previously allocated */
+	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
+		if (stt->liobn == args->liobn)
+			return -EBUSY;
+	}
+
+	npages = kvmppc_stt_npages(args->window_size);
+
+	stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
+		      GFP_KERNEL);
+	if (!stt)
+		goto fail;
+
+	stt->liobn = args->liobn;
+	stt->window_size = args->window_size;
+	stt->kvm = kvm;
+
+	for (i = 0; i < npages; i++) {
+		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		if (!stt->pages[i])
+			goto fail;
+	}
+
+	kvm_get_kvm(kvm);
+
+	mutex_lock(&kvm->lock);
+	list_add(&stt->list, &kvm->arch.spapr_tce_tables);
+
+	mutex_unlock(&kvm->lock);
+
+	return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
+				stt, O_RDWR | O_CLOEXEC);
+
+fail:
+	if (stt) {
+		for (i = 0; i < npages; i++)
+			if (stt->pages[i])
+				__free_page(stt->pages[i]);
+
+		kfree(stt);
+	}
+	return ret;
+}
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
new file mode 100644
index 000000000..89e96b3e0
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -0,0 +1,105 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <linux/list.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+#include <asm/kvm_host.h>
+#include <asm/udbg.h>
+
+#define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))
+
+/* WARNING: This will be called in real-mode on HV KVM and virtual
+ *          mode on PR KVM
+ */
+long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+		      unsigned long ioba, unsigned long tce)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvmppc_spapr_tce_table *stt;
+
+	/* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
+	/* 	    liobn, ioba, tce); */
+
+	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
+		if (stt->liobn == liobn) {
+			unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
+			struct page *page;
+			u64 *tbl;
+
+			/* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p  window_size=0x%x\n", */
+			/* 	    liobn, stt, stt->window_size); */
+			if (ioba >= stt->window_size)
+				return H_PARAMETER;
+
+			page = stt->pages[idx / TCES_PER_PAGE];
+			tbl = (u64 *)page_address(page);
+
+			/* FIXME: Need to validate the TCE itself */
+			/* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */
+			tbl[idx % TCES_PER_PAGE] = tce;
+			return H_SUCCESS;
+		}
+	}
+
+	/* Didn't find the liobn, punt it to userspace */
+	return H_TOO_HARD;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_put_tce);
+
+long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+		      unsigned long ioba)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvmppc_spapr_tce_table *stt;
+
+	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
+		if (stt->liobn == liobn) {
+			unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
+			struct page *page;
+			u64 *tbl;
+
+			if (ioba >= stt->window_size)
+				return H_PARAMETER;
+
+			page = stt->pages[idx / TCES_PER_PAGE];
+			tbl = (u64 *)page_address(page);
+
+			vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE];
+			return H_SUCCESS;
+		}
+	}
+
+	/* Didn't find the liobn, punt it to userspace */
+	return H_TOO_HARD;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_get_tce);
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
new file mode 100644
index 000000000..5a2bc4b0d
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -0,0 +1,696 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/kvm_ppc.h>
+#include <asm/disassemble.h>
+#include <asm/kvm_book3s.h>
+#include <asm/reg.h>
+#include <asm/switch_to.h>
+#include <asm/time.h>
+
+#define OP_19_XOP_RFID		18
+#define OP_19_XOP_RFI		50
+
+#define OP_31_XOP_MFMSR		83
+#define OP_31_XOP_MTMSR		146
+#define OP_31_XOP_MTMSRD	178
+#define OP_31_XOP_MTSR		210
+#define OP_31_XOP_MTSRIN	242
+#define OP_31_XOP_TLBIEL	274
+#define OP_31_XOP_TLBIE		306
+/* Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */
+#define OP_31_XOP_FAKE_SC1	308
+#define OP_31_XOP_SLBMTE	402
+#define OP_31_XOP_SLBIE		434
+#define OP_31_XOP_SLBIA		498
+#define OP_31_XOP_MFSR		595
+#define OP_31_XOP_MFSRIN	659
+#define OP_31_XOP_DCBA		758
+#define OP_31_XOP_SLBMFEV	851
+#define OP_31_XOP_EIOIO		854
+#define OP_31_XOP_SLBMFEE	915
+
+/* DCBZ is actually 1014, but we patch it to 1010 so we get a trap */
+#define OP_31_XOP_DCBZ		1010
+
+#define OP_LFS			48
+#define OP_LFD			50
+#define OP_STFS			52
+#define OP_STFD			54
+
+#define SPRN_GQR0		912
+#define SPRN_GQR1		913
+#define SPRN_GQR2		914
+#define SPRN_GQR3		915
+#define SPRN_GQR4		916
+#define SPRN_GQR5		917
+#define SPRN_GQR6		918
+#define SPRN_GQR7		919
+
+/* Book3S_32 defines mfsrin(v) - but that messes up our abstract
+ * function pointers, so let's just disable the define. */
+#undef mfsrin
+
+enum priv_level {
+	PRIV_PROBLEM = 0,
+	PRIV_SUPER = 1,
+	PRIV_HYPER = 2,
+};
+
+static bool spr_allowed(struct kvm_vcpu *vcpu, enum priv_level level)
+{
+	/* PAPR VMs only access supervisor SPRs */
+	if (vcpu->arch.papr_enabled && (level > PRIV_SUPER))
+		return false;
+
+	/* Limit user space to its own small SPR set */
+	if ((kvmppc_get_msr(vcpu) & MSR_PR) && level > PRIV_PROBLEM)
+		return false;
+
+	return true;
+}
+
+int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
+			      unsigned int inst, int *advance)
+{
+	int emulated = EMULATE_DONE;
+	int rt = get_rt(inst);
+	int rs = get_rs(inst);
+	int ra = get_ra(inst);
+	int rb = get_rb(inst);
+	u32 inst_sc = 0x44000002;
+
+	switch (get_op(inst)) {
+	case 0:
+		emulated = EMULATE_FAIL;
+		if ((kvmppc_get_msr(vcpu) & MSR_LE) &&
+		    (inst == swab32(inst_sc))) {
+			/*
+			 * This is the byte reversed syscall instruction of our
+			 * hypercall handler. Early versions of LE Linux didn't
+			 * swap the instructions correctly and ended up in
+			 * illegal instructions.
+			 * Just always fail hypercalls on these broken systems.
+			 */
+			kvmppc_set_gpr(vcpu, 3, EV_UNIMPLEMENTED);
+			kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4);
+			emulated = EMULATE_DONE;
+		}
+		break;
+	case 19:
+		switch (get_xop(inst)) {
+		case OP_19_XOP_RFID:
+		case OP_19_XOP_RFI:
+			kvmppc_set_pc(vcpu, kvmppc_get_srr0(vcpu));
+			kvmppc_set_msr(vcpu, kvmppc_get_srr1(vcpu));
+			*advance = 0;
+			break;
+
+		default:
+			emulated = EMULATE_FAIL;
+			break;
+		}
+		break;
+	case 31:
+		switch (get_xop(inst)) {
+		case OP_31_XOP_MFMSR:
+			kvmppc_set_gpr(vcpu, rt, kvmppc_get_msr(vcpu));
+			break;
+		case OP_31_XOP_MTMSRD:
+		{
+			ulong rs_val = kvmppc_get_gpr(vcpu, rs);
+			if (inst & 0x10000) {
+				ulong new_msr = kvmppc_get_msr(vcpu);
+				new_msr &= ~(MSR_RI | MSR_EE);
+				new_msr |= rs_val & (MSR_RI | MSR_EE);
+				kvmppc_set_msr_fast(vcpu, new_msr);
+			} else
+				kvmppc_set_msr(vcpu, rs_val);
+			break;
+		}
+		case OP_31_XOP_MTMSR:
+			kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, rs));
+			break;
+		case OP_31_XOP_MFSR:
+		{
+			int srnum;
+
+			srnum = kvmppc_get_field(inst, 12 + 32, 15 + 32);
+			if (vcpu->arch.mmu.mfsrin) {
+				u32 sr;
+				sr = vcpu->arch.mmu.mfsrin(vcpu, srnum);
+				kvmppc_set_gpr(vcpu, rt, sr);
+			}
+			break;
+		}
+		case OP_31_XOP_MFSRIN:
+		{
+			int srnum;
+
+			srnum = (kvmppc_get_gpr(vcpu, rb) >> 28) & 0xf;
+			if (vcpu->arch.mmu.mfsrin) {
+				u32 sr;
+				sr = vcpu->arch.mmu.mfsrin(vcpu, srnum);
+				kvmppc_set_gpr(vcpu, rt, sr);
+			}
+			break;
+		}
+		case OP_31_XOP_MTSR:
+			vcpu->arch.mmu.mtsrin(vcpu,
+				(inst >> 16) & 0xf,
+				kvmppc_get_gpr(vcpu, rs));
+			break;
+		case OP_31_XOP_MTSRIN:
+			vcpu->arch.mmu.mtsrin(vcpu,
+				(kvmppc_get_gpr(vcpu, rb) >> 28) & 0xf,
+				kvmppc_get_gpr(vcpu, rs));
+			break;
+		case OP_31_XOP_TLBIE:
+		case OP_31_XOP_TLBIEL:
+		{
+			bool large = (inst & 0x00200000) ? true : false;
+			ulong addr = kvmppc_get_gpr(vcpu, rb);
+			vcpu->arch.mmu.tlbie(vcpu, addr, large);
+			break;
+		}
+#ifdef CONFIG_PPC_BOOK3S_64
+		case OP_31_XOP_FAKE_SC1:
+		{
+			/* SC 1 papr hypercalls */
+			ulong cmd = kvmppc_get_gpr(vcpu, 3);
+			int i;
+
+		        if ((kvmppc_get_msr(vcpu) & MSR_PR) ||
+			    !vcpu->arch.papr_enabled) {
+				emulated = EMULATE_FAIL;
+				break;
+			}
+
+			if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE)
+				break;
+
+			run->papr_hcall.nr = cmd;
+			for (i = 0; i < 9; ++i) {
+				ulong gpr = kvmppc_get_gpr(vcpu, 4 + i);
+				run->papr_hcall.args[i] = gpr;
+			}
+
+			run->exit_reason = KVM_EXIT_PAPR_HCALL;
+			vcpu->arch.hcall_needed = 1;
+			emulated = EMULATE_EXIT_USER;
+			break;
+		}
+#endif
+		case OP_31_XOP_EIOIO:
+			break;
+		case OP_31_XOP_SLBMTE:
+			if (!vcpu->arch.mmu.slbmte)
+				return EMULATE_FAIL;
+
+			vcpu->arch.mmu.slbmte(vcpu,
+					kvmppc_get_gpr(vcpu, rs),
+					kvmppc_get_gpr(vcpu, rb));
+			break;
+		case OP_31_XOP_SLBIE:
+			if (!vcpu->arch.mmu.slbie)
+				return EMULATE_FAIL;
+
+			vcpu->arch.mmu.slbie(vcpu,
+					kvmppc_get_gpr(vcpu, rb));
+			break;
+		case OP_31_XOP_SLBIA:
+			if (!vcpu->arch.mmu.slbia)
+				return EMULATE_FAIL;
+
+			vcpu->arch.mmu.slbia(vcpu);
+			break;
+		case OP_31_XOP_SLBMFEE:
+			if (!vcpu->arch.mmu.slbmfee) {
+				emulated = EMULATE_FAIL;
+			} else {
+				ulong t, rb_val;
+
+				rb_val = kvmppc_get_gpr(vcpu, rb);
+				t = vcpu->arch.mmu.slbmfee(vcpu, rb_val);
+				kvmppc_set_gpr(vcpu, rt, t);
+			}
+			break;
+		case OP_31_XOP_SLBMFEV:
+			if (!vcpu->arch.mmu.slbmfev) {
+				emulated = EMULATE_FAIL;
+			} else {
+				ulong t, rb_val;
+
+				rb_val = kvmppc_get_gpr(vcpu, rb);
+				t = vcpu->arch.mmu.slbmfev(vcpu, rb_val);
+				kvmppc_set_gpr(vcpu, rt, t);
+			}
+			break;
+		case OP_31_XOP_DCBA:
+			/* Gets treated as NOP */
+			break;
+		case OP_31_XOP_DCBZ:
+		{
+			ulong rb_val = kvmppc_get_gpr(vcpu, rb);
+			ulong ra_val = 0;
+			ulong addr, vaddr;
+			u32 zeros[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+			u32 dsisr;
+			int r;
+
+			if (ra)
+				ra_val = kvmppc_get_gpr(vcpu, ra);
+
+			addr = (ra_val + rb_val) & ~31ULL;
+			if (!(kvmppc_get_msr(vcpu) & MSR_SF))
+				addr &= 0xffffffff;
+			vaddr = addr;
+
+			r = kvmppc_st(vcpu, &addr, 32, zeros, true);
+			if ((r == -ENOENT) || (r == -EPERM)) {
+				*advance = 0;
+				kvmppc_set_dar(vcpu, vaddr);
+				vcpu->arch.fault_dar = vaddr;
+
+				dsisr = DSISR_ISSTORE;
+				if (r == -ENOENT)
+					dsisr |= DSISR_NOHPTE;
+				else if (r == -EPERM)
+					dsisr |= DSISR_PROTFAULT;
+
+				kvmppc_set_dsisr(vcpu, dsisr);
+				vcpu->arch.fault_dsisr = dsisr;
+
+				kvmppc_book3s_queue_irqprio(vcpu,
+					BOOK3S_INTERRUPT_DATA_STORAGE);
+			}
+
+			break;
+		}
+		default:
+			emulated = EMULATE_FAIL;
+		}
+		break;
+	default:
+		emulated = EMULATE_FAIL;
+	}
+
+	if (emulated == EMULATE_FAIL)
+		emulated = kvmppc_emulate_paired_single(run, vcpu);
+
+	return emulated;
+}
+
+void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, bool upper,
+                    u32 val)
+{
+	if (upper) {
+		/* Upper BAT */
+		u32 bl = (val >> 2) & 0x7ff;
+		bat->bepi_mask = (~bl << 17);
+		bat->bepi = val & 0xfffe0000;
+		bat->vs = (val & 2) ? 1 : 0;
+		bat->vp = (val & 1) ? 1 : 0;
+		bat->raw = (bat->raw & 0xffffffff00000000ULL) | val;
+	} else {
+		/* Lower BAT */
+		bat->brpn = val & 0xfffe0000;
+		bat->wimg = (val >> 3) & 0xf;
+		bat->pp = val & 3;
+		bat->raw = (bat->raw & 0x00000000ffffffffULL) | ((u64)val << 32);
+	}
+}
+
+static struct kvmppc_bat *kvmppc_find_bat(struct kvm_vcpu *vcpu, int sprn)
+{
+	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+	struct kvmppc_bat *bat;
+
+	switch (sprn) {
+	case SPRN_IBAT0U ... SPRN_IBAT3L:
+		bat = &vcpu_book3s->ibat[(sprn - SPRN_IBAT0U) / 2];
+		break;
+	case SPRN_IBAT4U ... SPRN_IBAT7L:
+		bat = &vcpu_book3s->ibat[4 + ((sprn - SPRN_IBAT4U) / 2)];
+		break;
+	case SPRN_DBAT0U ... SPRN_DBAT3L:
+		bat = &vcpu_book3s->dbat[(sprn - SPRN_DBAT0U) / 2];
+		break;
+	case SPRN_DBAT4U ... SPRN_DBAT7L:
+		bat = &vcpu_book3s->dbat[4 + ((sprn - SPRN_DBAT4U) / 2)];
+		break;
+	default:
+		BUG();
+	}
+
+	return bat;
+}
+
+int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
+{
+	int emulated = EMULATE_DONE;
+
+	switch (sprn) {
+	case SPRN_SDR1:
+		if (!spr_allowed(vcpu, PRIV_HYPER))
+			goto unprivileged;
+		to_book3s(vcpu)->sdr1 = spr_val;
+		break;
+	case SPRN_DSISR:
+		kvmppc_set_dsisr(vcpu, spr_val);
+		break;
+	case SPRN_DAR:
+		kvmppc_set_dar(vcpu, spr_val);
+		break;
+	case SPRN_HIOR:
+		to_book3s(vcpu)->hior = spr_val;
+		break;
+	case SPRN_IBAT0U ... SPRN_IBAT3L:
+	case SPRN_IBAT4U ... SPRN_IBAT7L:
+	case SPRN_DBAT0U ... SPRN_DBAT3L:
+	case SPRN_DBAT4U ... SPRN_DBAT7L:
+	{
+		struct kvmppc_bat *bat = kvmppc_find_bat(vcpu, sprn);
+
+		kvmppc_set_bat(vcpu, bat, !(sprn % 2), (u32)spr_val);
+		/* BAT writes happen so rarely that we're ok to flush
+		 * everything here */
+		kvmppc_mmu_pte_flush(vcpu, 0, 0);
+		kvmppc_mmu_flush_segments(vcpu);
+		break;
+	}
+	case SPRN_HID0:
+		to_book3s(vcpu)->hid[0] = spr_val;
+		break;
+	case SPRN_HID1:
+		to_book3s(vcpu)->hid[1] = spr_val;
+		break;
+	case SPRN_HID2:
+		to_book3s(vcpu)->hid[2] = spr_val;
+		break;
+	case SPRN_HID2_GEKKO:
+		to_book3s(vcpu)->hid[2] = spr_val;
+		/* HID2.PSE controls paired single on gekko */
+		switch (vcpu->arch.pvr) {
+		case 0x00080200:	/* lonestar 2.0 */
+		case 0x00088202:	/* lonestar 2.2 */
+		case 0x70000100:	/* gekko 1.0 */
+		case 0x00080100:	/* gekko 2.0 */
+		case 0x00083203:	/* gekko 2.3a */
+		case 0x00083213:	/* gekko 2.3b */
+		case 0x00083204:	/* gekko 2.4 */
+		case 0x00083214:	/* gekko 2.4e (8SE) - retail HW2 */
+		case 0x00087200:	/* broadway */
+			if (vcpu->arch.hflags & BOOK3S_HFLAG_NATIVE_PS) {
+				/* Native paired singles */
+			} else if (spr_val & (1 << 29)) { /* HID2.PSE */
+				vcpu->arch.hflags |= BOOK3S_HFLAG_PAIRED_SINGLE;
+				kvmppc_giveup_ext(vcpu, MSR_FP);
+			} else {
+				vcpu->arch.hflags &= ~BOOK3S_HFLAG_PAIRED_SINGLE;
+			}
+			break;
+		}
+		break;
+	case SPRN_HID4:
+	case SPRN_HID4_GEKKO:
+		to_book3s(vcpu)->hid[4] = spr_val;
+		break;
+	case SPRN_HID5:
+		to_book3s(vcpu)->hid[5] = spr_val;
+		/* guest HID5 set can change is_dcbz32 */
+		if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
+		    (mfmsr() & MSR_HV))
+			vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
+		break;
+	case SPRN_GQR0:
+	case SPRN_GQR1:
+	case SPRN_GQR2:
+	case SPRN_GQR3:
+	case SPRN_GQR4:
+	case SPRN_GQR5:
+	case SPRN_GQR6:
+	case SPRN_GQR7:
+		to_book3s(vcpu)->gqr[sprn - SPRN_GQR0] = spr_val;
+		break;
+#ifdef CONFIG_PPC_BOOK3S_64
+	case SPRN_FSCR:
+		kvmppc_set_fscr(vcpu, spr_val);
+		break;
+	case SPRN_BESCR:
+		vcpu->arch.bescr = spr_val;
+		break;
+	case SPRN_EBBHR:
+		vcpu->arch.ebbhr = spr_val;
+		break;
+	case SPRN_EBBRR:
+		vcpu->arch.ebbrr = spr_val;
+		break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	case SPRN_TFHAR:
+		vcpu->arch.tfhar = spr_val;
+		break;
+	case SPRN_TEXASR:
+		vcpu->arch.texasr = spr_val;
+		break;
+	case SPRN_TFIAR:
+		vcpu->arch.tfiar = spr_val;
+		break;
+#endif
+#endif
+	case SPRN_ICTC:
+	case SPRN_THRM1:
+	case SPRN_THRM2:
+	case SPRN_THRM3:
+	case SPRN_CTRLF:
+	case SPRN_CTRLT:
+	case SPRN_L2CR:
+	case SPRN_DSCR:
+	case SPRN_MMCR0_GEKKO:
+	case SPRN_MMCR1_GEKKO:
+	case SPRN_PMC1_GEKKO:
+	case SPRN_PMC2_GEKKO:
+	case SPRN_PMC3_GEKKO:
+	case SPRN_PMC4_GEKKO:
+	case SPRN_WPAR_GEKKO:
+	case SPRN_MSSSR0:
+	case SPRN_DABR:
+#ifdef CONFIG_PPC_BOOK3S_64
+	case SPRN_MMCRS:
+	case SPRN_MMCRA:
+	case SPRN_MMCR0:
+	case SPRN_MMCR1:
+	case SPRN_MMCR2:
+#endif
+		break;
+unprivileged:
+	default:
+		printk(KERN_INFO "KVM: invalid SPR write: %d\n", sprn);
+#ifndef DEBUG_SPR
+		emulated = EMULATE_FAIL;
+#endif
+		break;
+	}
+
+	return emulated;
+}
+
+int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
+{
+	int emulated = EMULATE_DONE;
+
+	switch (sprn) {
+	case SPRN_IBAT0U ... SPRN_IBAT3L:
+	case SPRN_IBAT4U ... SPRN_IBAT7L:
+	case SPRN_DBAT0U ... SPRN_DBAT3L:
+	case SPRN_DBAT4U ... SPRN_DBAT7L:
+	{
+		struct kvmppc_bat *bat = kvmppc_find_bat(vcpu, sprn);
+
+		if (sprn % 2)
+			*spr_val = bat->raw >> 32;
+		else
+			*spr_val = bat->raw;
+
+		break;
+	}
+	case SPRN_SDR1:
+		if (!spr_allowed(vcpu, PRIV_HYPER))
+			goto unprivileged;
+		*spr_val = to_book3s(vcpu)->sdr1;
+		break;
+	case SPRN_DSISR:
+		*spr_val = kvmppc_get_dsisr(vcpu);
+		break;
+	case SPRN_DAR:
+		*spr_val = kvmppc_get_dar(vcpu);
+		break;
+	case SPRN_HIOR:
+		*spr_val = to_book3s(vcpu)->hior;
+		break;
+	case SPRN_HID0:
+		*spr_val = to_book3s(vcpu)->hid[0];
+		break;
+	case SPRN_HID1:
+		*spr_val = to_book3s(vcpu)->hid[1];
+		break;
+	case SPRN_HID2:
+	case SPRN_HID2_GEKKO:
+		*spr_val = to_book3s(vcpu)->hid[2];
+		break;
+	case SPRN_HID4:
+	case SPRN_HID4_GEKKO:
+		*spr_val = to_book3s(vcpu)->hid[4];
+		break;
+	case SPRN_HID5:
+		*spr_val = to_book3s(vcpu)->hid[5];
+		break;
+	case SPRN_CFAR:
+	case SPRN_DSCR:
+		*spr_val = 0;
+		break;
+	case SPRN_PURR:
+		/*
+		 * On exit we would have updated purr
+		 */
+		*spr_val = vcpu->arch.purr;
+		break;
+	case SPRN_SPURR:
+		/*
+		 * On exit we would have updated spurr
+		 */
+		*spr_val = vcpu->arch.spurr;
+		break;
+	case SPRN_VTB:
+		*spr_val = vcpu->arch.vtb;
+		break;
+	case SPRN_IC:
+		*spr_val = vcpu->arch.ic;
+		break;
+	case SPRN_GQR0:
+	case SPRN_GQR1:
+	case SPRN_GQR2:
+	case SPRN_GQR3:
+	case SPRN_GQR4:
+	case SPRN_GQR5:
+	case SPRN_GQR6:
+	case SPRN_GQR7:
+		*spr_val = to_book3s(vcpu)->gqr[sprn - SPRN_GQR0];
+		break;
+#ifdef CONFIG_PPC_BOOK3S_64
+	case SPRN_FSCR:
+		*spr_val = vcpu->arch.fscr;
+		break;
+	case SPRN_BESCR:
+		*spr_val = vcpu->arch.bescr;
+		break;
+	case SPRN_EBBHR:
+		*spr_val = vcpu->arch.ebbhr;
+		break;
+	case SPRN_EBBRR:
+		*spr_val = vcpu->arch.ebbrr;
+		break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	case SPRN_TFHAR:
+		*spr_val = vcpu->arch.tfhar;
+		break;
+	case SPRN_TEXASR:
+		*spr_val = vcpu->arch.texasr;
+		break;
+	case SPRN_TFIAR:
+		*spr_val = vcpu->arch.tfiar;
+		break;
+#endif
+#endif
+	case SPRN_THRM1:
+	case SPRN_THRM2:
+	case SPRN_THRM3:
+	case SPRN_CTRLF:
+	case SPRN_CTRLT:
+	case SPRN_L2CR:
+	case SPRN_MMCR0_GEKKO:
+	case SPRN_MMCR1_GEKKO:
+	case SPRN_PMC1_GEKKO:
+	case SPRN_PMC2_GEKKO:
+	case SPRN_PMC3_GEKKO:
+	case SPRN_PMC4_GEKKO:
+	case SPRN_WPAR_GEKKO:
+	case SPRN_MSSSR0:
+	case SPRN_DABR:
+#ifdef CONFIG_PPC_BOOK3S_64
+	case SPRN_MMCRS:
+	case SPRN_MMCRA:
+	case SPRN_MMCR0:
+	case SPRN_MMCR1:
+	case SPRN_MMCR2:
+	case SPRN_TIR:
+#endif
+		*spr_val = 0;
+		break;
+	default:
+unprivileged:
+		printk(KERN_INFO "KVM: invalid SPR read: %d\n", sprn);
+#ifndef DEBUG_SPR
+		emulated = EMULATE_FAIL;
+#endif
+		break;
+	}
+
+	return emulated;
+}
+
+u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst)
+{
+	return make_dsisr(inst);
+}
+
+ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+	/*
+	 * Linux's fix_alignment() assumes that DAR is valid, so can we
+	 */
+	return vcpu->arch.fault_dar;
+#else
+	ulong dar = 0;
+	ulong ra = get_ra(inst);
+	ulong rb = get_rb(inst);
+
+	switch (get_op(inst)) {
+	case OP_LFS:
+	case OP_LFD:
+	case OP_STFD:
+	case OP_STFS:
+		if (ra)
+			dar = kvmppc_get_gpr(vcpu, ra);
+		dar += (s32)((s16)inst);
+		break;
+	case 31:
+		if (ra)
+			dar = kvmppc_get_gpr(vcpu, ra);
+		dar += kvmppc_get_gpr(vcpu, rb);
+		break;
+	default:
+		printk(KERN_INFO "KVM: Unaligned instruction 0x%x\n", inst);
+		break;
+	}
+
+	return dar;
+#endif
+}
diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c
new file mode 100644
index 000000000..0d013fbc2
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_exports.c
@@ -0,0 +1,30 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <linux/export.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline);
+#endif
+#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
+EXPORT_SYMBOL_GPL(kvmppc_entry_trampoline);
+#endif
+
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
new file mode 100644
index 000000000..df81caab7
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -0,0 +1,2766 @@
+/*
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *    Paul Mackerras <paulus@au1.ibm.com>
+ *    Alexander Graf <agraf@suse.de>
+ *    Kevin Wolf <mail@kevin-wolf.de>
+ *
+ * Description: KVM functions specific to running on Book 3S
+ * processors in hypervisor mode (specifically POWER7 and later).
+ *
+ * This file is derived from arch/powerpc/kvm/book3s.c,
+ * by Alexander Graf <agraf@suse.de>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/preempt.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <linux/cpumask.h>
+#include <linux/spinlock.h>
+#include <linux/page-flags.h>
+#include <linux/srcu.h>
+#include <linux/miscdevice.h>
+#include <linux/debugfs.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/cache.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu_context.h>
+#include <asm/lppaca.h>
+#include <asm/processor.h>
+#include <asm/cputhreads.h>
+#include <asm/page.h>
+#include <asm/hvcall.h>
+#include <asm/switch_to.h>
+#include <asm/smp.h>
+#include <asm/dbell.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/module.h>
+
+#include "book3s.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace_hv.h"
+
+/* #define EXIT_DEBUG */
+/* #define EXIT_DEBUG_SIMPLE */
+/* #define EXIT_DEBUG_INT */
+
+/* Used to indicate that a guest page fault needs to be handled */
+#define RESUME_PAGE_FAULT	(RESUME_GUEST | RESUME_FLAG_ARCH1)
+
+/* Used as a "null" value for timebase values */
+#define TB_NIL	(~(u64)0)
+
+static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1);
+
+#if defined(CONFIG_PPC_64K_PAGES)
+#define MPP_BUFFER_ORDER	0
+#elif defined(CONFIG_PPC_4K_PAGES)
+#define MPP_BUFFER_ORDER	3
+#endif
+
+
+static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
+static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
+
+static bool kvmppc_ipi_thread(int cpu)
+{
+	/* On POWER8 for IPIs to threads in the same core, use msgsnd */
+	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+		preempt_disable();
+		if (cpu_first_thread_sibling(cpu) ==
+		    cpu_first_thread_sibling(smp_processor_id())) {
+			unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
+			msg |= cpu_thread_in_core(cpu);
+			smp_mb();
+			__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
+			preempt_enable();
+			return true;
+		}
+		preempt_enable();
+	}
+
+#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
+	if (cpu >= 0 && cpu < nr_cpu_ids && paca[cpu].kvm_hstate.xics_phys) {
+		xics_wake_cpu(cpu);
+		return true;
+	}
+#endif
+
+	return false;
+}
+
+static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
+{
+	int cpu = vcpu->cpu;
+	wait_queue_head_t *wqp;
+
+	wqp = kvm_arch_vcpu_wq(vcpu);
+	if (waitqueue_active(wqp)) {
+		wake_up_interruptible(wqp);
+		++vcpu->stat.halt_wakeup;
+	}
+
+	if (kvmppc_ipi_thread(cpu + vcpu->arch.ptid))
+		return;
+
+	/* CPU points to the first thread of the core */
+	if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu))
+		smp_send_reschedule(cpu);
+}
+
+/*
+ * We use the vcpu_load/put functions to measure stolen time.
+ * Stolen time is counted as time when either the vcpu is able to
+ * run as part of a virtual core, but the task running the vcore
+ * is preempted or sleeping, or when the vcpu needs something done
+ * in the kernel by the task running the vcpu, but that task is
+ * preempted or sleeping.  Those two things have to be counted
+ * separately, since one of the vcpu tasks will take on the job
+ * of running the core, and the other vcpu tasks in the vcore will
+ * sleep waiting for it to do that, but that sleep shouldn't count
+ * as stolen time.
+ *
+ * Hence we accumulate stolen time when the vcpu can run as part of
+ * a vcore using vc->stolen_tb, and the stolen time when the vcpu
+ * needs its task to do other things in the kernel (for example,
+ * service a page fault) in busy_stolen.  We don't accumulate
+ * stolen time for a vcore when it is inactive, or for a vcpu
+ * when it is in state RUNNING or NOTREADY.  NOTREADY is a bit of
+ * a misnomer; it means that the vcpu task is not executing in
+ * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in
+ * the kernel.  We don't have any way of dividing up that time
+ * between time that the vcpu is genuinely stopped, time that
+ * the task is actively working on behalf of the vcpu, and time
+ * that the task is preempted, so we don't count any of it as
+ * stolen.
+ *
+ * Updates to busy_stolen are protected by arch.tbacct_lock;
+ * updates to vc->stolen_tb are protected by the vcore->stoltb_lock
+ * lock.  The stolen times are measured in units of timebase ticks.
+ * (Note that the != TB_NIL checks below are purely defensive;
+ * they should never fail.)
+ */
+
+static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+	unsigned long flags;
+
+	/*
+	 * We can test vc->runner without taking the vcore lock,
+	 * because only this task ever sets vc->runner to this
+	 * vcpu, and once it is set to this vcpu, only this task
+	 * ever sets it to NULL.
+	 */
+	if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) {
+		spin_lock_irqsave(&vc->stoltb_lock, flags);
+		if (vc->preempt_tb != TB_NIL) {
+			vc->stolen_tb += mftb() - vc->preempt_tb;
+			vc->preempt_tb = TB_NIL;
+		}
+		spin_unlock_irqrestore(&vc->stoltb_lock, flags);
+	}
+	spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
+	if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST &&
+	    vcpu->arch.busy_preempt != TB_NIL) {
+		vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt;
+		vcpu->arch.busy_preempt = TB_NIL;
+	}
+	spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
+}
+
+static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+	unsigned long flags;
+
+	if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) {
+		spin_lock_irqsave(&vc->stoltb_lock, flags);
+		vc->preempt_tb = mftb();
+		spin_unlock_irqrestore(&vc->stoltb_lock, flags);
+	}
+	spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
+	if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
+		vcpu->arch.busy_preempt = mftb();
+	spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
+}
+
+static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr)
+{
+	vcpu->arch.shregs.msr = msr;
+	kvmppc_end_cede(vcpu);
+}
+
+void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
+{
+	vcpu->arch.pvr = pvr;
+}
+
+int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
+{
+	unsigned long pcr = 0;
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	if (arch_compat) {
+		switch (arch_compat) {
+		case PVR_ARCH_205:
+			/*
+			 * If an arch bit is set in PCR, all the defined
+			 * higher-order arch bits also have to be set.
+			 */
+			pcr = PCR_ARCH_206 | PCR_ARCH_205;
+			break;
+		case PVR_ARCH_206:
+		case PVR_ARCH_206p:
+			pcr = PCR_ARCH_206;
+			break;
+		case PVR_ARCH_207:
+			break;
+		default:
+			return -EINVAL;
+		}
+
+		if (!cpu_has_feature(CPU_FTR_ARCH_207S)) {
+			/* POWER7 can't emulate POWER8 */
+			if (!(pcr & PCR_ARCH_206))
+				return -EINVAL;
+			pcr &= ~PCR_ARCH_206;
+		}
+	}
+
+	spin_lock(&vc->lock);
+	vc->arch_compat = arch_compat;
+	vc->pcr = pcr;
+	spin_unlock(&vc->lock);
+
+	return 0;
+}
+
+void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
+{
+	int r;
+
+	pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
+	pr_err("pc  = %.16lx  msr = %.16llx  trap = %x\n",
+	       vcpu->arch.pc, vcpu->arch.shregs.msr, vcpu->arch.trap);
+	for (r = 0; r < 16; ++r)
+		pr_err("r%2d = %.16lx  r%d = %.16lx\n",
+		       r, kvmppc_get_gpr(vcpu, r),
+		       r+16, kvmppc_get_gpr(vcpu, r+16));
+	pr_err("ctr = %.16lx  lr  = %.16lx\n",
+	       vcpu->arch.ctr, vcpu->arch.lr);
+	pr_err("srr0 = %.16llx srr1 = %.16llx\n",
+	       vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1);
+	pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
+	       vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
+	pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
+	       vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
+	pr_err("cr = %.8x  xer = %.16lx  dsisr = %.8x\n",
+	       vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.shregs.dsisr);
+	pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
+	pr_err("fault dar = %.16lx dsisr = %.8x\n",
+	       vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
+	pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
+	for (r = 0; r < vcpu->arch.slb_max; ++r)
+		pr_err("  ESID = %.16llx VSID = %.16llx\n",
+		       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
+	pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
+	       vcpu->arch.vcore->lpcr, vcpu->kvm->arch.sdr1,
+	       vcpu->arch.last_inst);
+}
+
+struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
+{
+	int r;
+	struct kvm_vcpu *v, *ret = NULL;
+
+	mutex_lock(&kvm->lock);
+	kvm_for_each_vcpu(r, v, kvm) {
+		if (v->vcpu_id == id) {
+			ret = v;
+			break;
+		}
+	}
+	mutex_unlock(&kvm->lock);
+	return ret;
+}
+
+static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
+{
+	vpa->__old_status |= LPPACA_OLD_SHARED_PROC;
+	vpa->yield_count = cpu_to_be32(1);
+}
+
+static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v,
+		   unsigned long addr, unsigned long len)
+{
+	/* check address is cacheline aligned */
+	if (addr & (L1_CACHE_BYTES - 1))
+		return -EINVAL;
+	spin_lock(&vcpu->arch.vpa_update_lock);
+	if (v->next_gpa != addr || v->len != len) {
+		v->next_gpa = addr;
+		v->len = addr ? len : 0;
+		v->update_pending = 1;
+	}
+	spin_unlock(&vcpu->arch.vpa_update_lock);
+	return 0;
+}
+
+/* Length for a per-processor buffer is passed in at offset 4 in the buffer */
+struct reg_vpa {
+	u32 dummy;
+	union {
+		__be16 hword;
+		__be32 word;
+	} length;
+};
+
+static int vpa_is_registered(struct kvmppc_vpa *vpap)
+{
+	if (vpap->update_pending)
+		return vpap->next_gpa != 0;
+	return vpap->pinned_addr != NULL;
+}
+
+static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
+				       unsigned long flags,
+				       unsigned long vcpuid, unsigned long vpa)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long len, nb;
+	void *va;
+	struct kvm_vcpu *tvcpu;
+	int err;
+	int subfunc;
+	struct kvmppc_vpa *vpap;
+
+	tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
+	if (!tvcpu)
+		return H_PARAMETER;
+
+	subfunc = (flags >> H_VPA_FUNC_SHIFT) & H_VPA_FUNC_MASK;
+	if (subfunc == H_VPA_REG_VPA || subfunc == H_VPA_REG_DTL ||
+	    subfunc == H_VPA_REG_SLB) {
+		/* Registering new area - address must be cache-line aligned */
+		if ((vpa & (L1_CACHE_BYTES - 1)) || !vpa)
+			return H_PARAMETER;
+
+		/* convert logical addr to kernel addr and read length */
+		va = kvmppc_pin_guest_page(kvm, vpa, &nb);
+		if (va == NULL)
+			return H_PARAMETER;
+		if (subfunc == H_VPA_REG_VPA)
+			len = be16_to_cpu(((struct reg_vpa *)va)->length.hword);
+		else
+			len = be32_to_cpu(((struct reg_vpa *)va)->length.word);
+		kvmppc_unpin_guest_page(kvm, va, vpa, false);
+
+		/* Check length */
+		if (len > nb || len < sizeof(struct reg_vpa))
+			return H_PARAMETER;
+	} else {
+		vpa = 0;
+		len = 0;
+	}
+
+	err = H_PARAMETER;
+	vpap = NULL;
+	spin_lock(&tvcpu->arch.vpa_update_lock);
+
+	switch (subfunc) {
+	case H_VPA_REG_VPA:		/* register VPA */
+		if (len < sizeof(struct lppaca))
+			break;
+		vpap = &tvcpu->arch.vpa;
+		err = 0;
+		break;
+
+	case H_VPA_REG_DTL:		/* register DTL */
+		if (len < sizeof(struct dtl_entry))
+			break;
+		len -= len % sizeof(struct dtl_entry);
+
+		/* Check that they have previously registered a VPA */
+		err = H_RESOURCE;
+		if (!vpa_is_registered(&tvcpu->arch.vpa))
+			break;
+
+		vpap = &tvcpu->arch.dtl;
+		err = 0;
+		break;
+
+	case H_VPA_REG_SLB:		/* register SLB shadow buffer */
+		/* Check that they have previously registered a VPA */
+		err = H_RESOURCE;
+		if (!vpa_is_registered(&tvcpu->arch.vpa))
+			break;
+
+		vpap = &tvcpu->arch.slb_shadow;
+		err = 0;
+		break;
+
+	case H_VPA_DEREG_VPA:		/* deregister VPA */
+		/* Check they don't still have a DTL or SLB buf registered */
+		err = H_RESOURCE;
+		if (vpa_is_registered(&tvcpu->arch.dtl) ||
+		    vpa_is_registered(&tvcpu->arch.slb_shadow))
+			break;
+
+		vpap = &tvcpu->arch.vpa;
+		err = 0;
+		break;
+
+	case H_VPA_DEREG_DTL:		/* deregister DTL */
+		vpap = &tvcpu->arch.dtl;
+		err = 0;
+		break;
+
+	case H_VPA_DEREG_SLB:		/* deregister SLB shadow buffer */
+		vpap = &tvcpu->arch.slb_shadow;
+		err = 0;
+		break;
+	}
+
+	if (vpap) {
+		vpap->next_gpa = vpa;
+		vpap->len = len;
+		vpap->update_pending = 1;
+	}
+
+	spin_unlock(&tvcpu->arch.vpa_update_lock);
+
+	return err;
+}
+
+static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap)
+{
+	struct kvm *kvm = vcpu->kvm;
+	void *va;
+	unsigned long nb;
+	unsigned long gpa;
+
+	/*
+	 * We need to pin the page pointed to by vpap->next_gpa,
+	 * but we can't call kvmppc_pin_guest_page under the lock
+	 * as it does get_user_pages() and down_read().  So we
+	 * have to drop the lock, pin the page, then get the lock
+	 * again and check that a new area didn't get registered
+	 * in the meantime.
+	 */
+	for (;;) {
+		gpa = vpap->next_gpa;
+		spin_unlock(&vcpu->arch.vpa_update_lock);
+		va = NULL;
+		nb = 0;
+		if (gpa)
+			va = kvmppc_pin_guest_page(kvm, gpa, &nb);
+		spin_lock(&vcpu->arch.vpa_update_lock);
+		if (gpa == vpap->next_gpa)
+			break;
+		/* sigh... unpin that one and try again */
+		if (va)
+			kvmppc_unpin_guest_page(kvm, va, gpa, false);
+	}
+
+	vpap->update_pending = 0;
+	if (va && nb < vpap->len) {
+		/*
+		 * If it's now too short, it must be that userspace
+		 * has changed the mappings underlying guest memory,
+		 * so unregister the region.
+		 */
+		kvmppc_unpin_guest_page(kvm, va, gpa, false);
+		va = NULL;
+	}
+	if (vpap->pinned_addr)
+		kvmppc_unpin_guest_page(kvm, vpap->pinned_addr, vpap->gpa,
+					vpap->dirty);
+	vpap->gpa = gpa;
+	vpap->pinned_addr = va;
+	vpap->dirty = false;
+	if (va)
+		vpap->pinned_end = va + vpap->len;
+}
+
+static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
+{
+	if (!(vcpu->arch.vpa.update_pending ||
+	      vcpu->arch.slb_shadow.update_pending ||
+	      vcpu->arch.dtl.update_pending))
+		return;
+
+	spin_lock(&vcpu->arch.vpa_update_lock);
+	if (vcpu->arch.vpa.update_pending) {
+		kvmppc_update_vpa(vcpu, &vcpu->arch.vpa);
+		if (vcpu->arch.vpa.pinned_addr)
+			init_vpa(vcpu, vcpu->arch.vpa.pinned_addr);
+	}
+	if (vcpu->arch.dtl.update_pending) {
+		kvmppc_update_vpa(vcpu, &vcpu->arch.dtl);
+		vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr;
+		vcpu->arch.dtl_index = 0;
+	}
+	if (vcpu->arch.slb_shadow.update_pending)
+		kvmppc_update_vpa(vcpu, &vcpu->arch.slb_shadow);
+	spin_unlock(&vcpu->arch.vpa_update_lock);
+}
+
+/*
+ * Return the accumulated stolen time for the vcore up until `now'.
+ * The caller should hold the vcore lock.
+ */
+static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now)
+{
+	u64 p;
+	unsigned long flags;
+
+	spin_lock_irqsave(&vc->stoltb_lock, flags);
+	p = vc->stolen_tb;
+	if (vc->vcore_state != VCORE_INACTIVE &&
+	    vc->preempt_tb != TB_NIL)
+		p += now - vc->preempt_tb;
+	spin_unlock_irqrestore(&vc->stoltb_lock, flags);
+	return p;
+}
+
+static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
+				    struct kvmppc_vcore *vc)
+{
+	struct dtl_entry *dt;
+	struct lppaca *vpa;
+	unsigned long stolen;
+	unsigned long core_stolen;
+	u64 now;
+
+	dt = vcpu->arch.dtl_ptr;
+	vpa = vcpu->arch.vpa.pinned_addr;
+	now = mftb();
+	core_stolen = vcore_stolen_time(vc, now);
+	stolen = core_stolen - vcpu->arch.stolen_logged;
+	vcpu->arch.stolen_logged = core_stolen;
+	spin_lock_irq(&vcpu->arch.tbacct_lock);
+	stolen += vcpu->arch.busy_stolen;
+	vcpu->arch.busy_stolen = 0;
+	spin_unlock_irq(&vcpu->arch.tbacct_lock);
+	if (!dt || !vpa)
+		return;
+	memset(dt, 0, sizeof(struct dtl_entry));
+	dt->dispatch_reason = 7;
+	dt->processor_id = cpu_to_be16(vc->pcpu + vcpu->arch.ptid);
+	dt->timebase = cpu_to_be64(now + vc->tb_offset);
+	dt->enqueue_to_dispatch_time = cpu_to_be32(stolen);
+	dt->srr0 = cpu_to_be64(kvmppc_get_pc(vcpu));
+	dt->srr1 = cpu_to_be64(vcpu->arch.shregs.msr);
+	++dt;
+	if (dt == vcpu->arch.dtl.pinned_end)
+		dt = vcpu->arch.dtl.pinned_addr;
+	vcpu->arch.dtl_ptr = dt;
+	/* order writing *dt vs. writing vpa->dtl_idx */
+	smp_wmb();
+	vpa->dtl_idx = cpu_to_be64(++vcpu->arch.dtl_index);
+	vcpu->arch.dtl.dirty = true;
+}
+
+static bool kvmppc_power8_compatible(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.vcore->arch_compat >= PVR_ARCH_207)
+		return true;
+	if ((!vcpu->arch.vcore->arch_compat) &&
+	    cpu_has_feature(CPU_FTR_ARCH_207S))
+		return true;
+	return false;
+}
+
+static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags,
+			     unsigned long resource, unsigned long value1,
+			     unsigned long value2)
+{
+	switch (resource) {
+	case H_SET_MODE_RESOURCE_SET_CIABR:
+		if (!kvmppc_power8_compatible(vcpu))
+			return H_P2;
+		if (value2)
+			return H_P4;
+		if (mflags)
+			return H_UNSUPPORTED_FLAG_START;
+		/* Guests can't breakpoint the hypervisor */
+		if ((value1 & CIABR_PRIV) == CIABR_PRIV_HYPER)
+			return H_P3;
+		vcpu->arch.ciabr  = value1;
+		return H_SUCCESS;
+	case H_SET_MODE_RESOURCE_SET_DAWR:
+		if (!kvmppc_power8_compatible(vcpu))
+			return H_P2;
+		if (mflags)
+			return H_UNSUPPORTED_FLAG_START;
+		if (value2 & DABRX_HYP)
+			return H_P4;
+		vcpu->arch.dawr  = value1;
+		vcpu->arch.dawrx = value2;
+		return H_SUCCESS;
+	default:
+		return H_TOO_HARD;
+	}
+}
+
+static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
+{
+	struct kvmppc_vcore *vcore = target->arch.vcore;
+
+	/*
+	 * We expect to have been called by the real mode handler
+	 * (kvmppc_rm_h_confer()) which would have directly returned
+	 * H_SUCCESS if the source vcore wasn't idle (e.g. if it may
+	 * have useful work to do and should not confer) so we don't
+	 * recheck that here.
+	 */
+
+	spin_lock(&vcore->lock);
+	if (target->arch.state == KVMPPC_VCPU_RUNNABLE &&
+	    vcore->vcore_state != VCORE_INACTIVE)
+		target = vcore->runner;
+	spin_unlock(&vcore->lock);
+
+	return kvm_vcpu_yield_to(target);
+}
+
+static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu)
+{
+	int yield_count = 0;
+	struct lppaca *lppaca;
+
+	spin_lock(&vcpu->arch.vpa_update_lock);
+	lppaca = (struct lppaca *)vcpu->arch.vpa.pinned_addr;
+	if (lppaca)
+		yield_count = be32_to_cpu(lppaca->yield_count);
+	spin_unlock(&vcpu->arch.vpa_update_lock);
+	return yield_count;
+}
+
+int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
+{
+	unsigned long req = kvmppc_get_gpr(vcpu, 3);
+	unsigned long target, ret = H_SUCCESS;
+	int yield_count;
+	struct kvm_vcpu *tvcpu;
+	int idx, rc;
+
+	if (req <= MAX_HCALL_OPCODE &&
+	    !test_bit(req/4, vcpu->kvm->arch.enabled_hcalls))
+		return RESUME_HOST;
+
+	switch (req) {
+	case H_CEDE:
+		break;
+	case H_PROD:
+		target = kvmppc_get_gpr(vcpu, 4);
+		tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
+		if (!tvcpu) {
+			ret = H_PARAMETER;
+			break;
+		}
+		tvcpu->arch.prodded = 1;
+		smp_mb();
+		if (vcpu->arch.ceded) {
+			if (waitqueue_active(&vcpu->wq)) {
+				wake_up_interruptible(&vcpu->wq);
+				vcpu->stat.halt_wakeup++;
+			}
+		}
+		break;
+	case H_CONFER:
+		target = kvmppc_get_gpr(vcpu, 4);
+		if (target == -1)
+			break;
+		tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
+		if (!tvcpu) {
+			ret = H_PARAMETER;
+			break;
+		}
+		yield_count = kvmppc_get_gpr(vcpu, 5);
+		if (kvmppc_get_yield_count(tvcpu) != yield_count)
+			break;
+		kvm_arch_vcpu_yield_to(tvcpu);
+		break;
+	case H_REGISTER_VPA:
+		ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
+					kvmppc_get_gpr(vcpu, 5),
+					kvmppc_get_gpr(vcpu, 6));
+		break;
+	case H_RTAS:
+		if (list_empty(&vcpu->kvm->arch.rtas_tokens))
+			return RESUME_HOST;
+
+		idx = srcu_read_lock(&vcpu->kvm->srcu);
+		rc = kvmppc_rtas_hcall(vcpu);
+		srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+		if (rc == -ENOENT)
+			return RESUME_HOST;
+		else if (rc == 0)
+			break;
+
+		/* Send the error out to userspace via KVM_RUN */
+		return rc;
+	case H_LOGICAL_CI_LOAD:
+		ret = kvmppc_h_logical_ci_load(vcpu);
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+	case H_LOGICAL_CI_STORE:
+		ret = kvmppc_h_logical_ci_store(vcpu);
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+	case H_SET_MODE:
+		ret = kvmppc_h_set_mode(vcpu, kvmppc_get_gpr(vcpu, 4),
+					kvmppc_get_gpr(vcpu, 5),
+					kvmppc_get_gpr(vcpu, 6),
+					kvmppc_get_gpr(vcpu, 7));
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+	case H_XIRR:
+	case H_CPPR:
+	case H_EOI:
+	case H_IPI:
+	case H_IPOLL:
+	case H_XIRR_X:
+		if (kvmppc_xics_enabled(vcpu)) {
+			ret = kvmppc_xics_hcall(vcpu, req);
+			break;
+		} /* fallthrough */
+	default:
+		return RESUME_HOST;
+	}
+	kvmppc_set_gpr(vcpu, 3, ret);
+	vcpu->arch.hcall_needed = 0;
+	return RESUME_GUEST;
+}
+
+static int kvmppc_hcall_impl_hv(unsigned long cmd)
+{
+	switch (cmd) {
+	case H_CEDE:
+	case H_PROD:
+	case H_CONFER:
+	case H_REGISTER_VPA:
+	case H_SET_MODE:
+	case H_LOGICAL_CI_LOAD:
+	case H_LOGICAL_CI_STORE:
+#ifdef CONFIG_KVM_XICS
+	case H_XIRR:
+	case H_CPPR:
+	case H_EOI:
+	case H_IPI:
+	case H_IPOLL:
+	case H_XIRR_X:
+#endif
+		return 1;
+	}
+
+	/* See if it's in the real-mode table */
+	return kvmppc_hcall_impl_hv_realmode(cmd);
+}
+
+static int kvmppc_emulate_debug_inst(struct kvm_run *run,
+					struct kvm_vcpu *vcpu)
+{
+	u32 last_inst;
+
+	if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
+					EMULATE_DONE) {
+		/*
+		 * Fetch failed, so return to guest and
+		 * try executing it again.
+		 */
+		return RESUME_GUEST;
+	}
+
+	if (last_inst == KVMPPC_INST_SW_BREAKPOINT) {
+		run->exit_reason = KVM_EXIT_DEBUG;
+		run->debug.arch.address = kvmppc_get_pc(vcpu);
+		return RESUME_HOST;
+	} else {
+		kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
+		return RESUME_GUEST;
+	}
+}
+
+static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
+				 struct task_struct *tsk)
+{
+	int r = RESUME_HOST;
+
+	vcpu->stat.sum_exits++;
+
+	run->exit_reason = KVM_EXIT_UNKNOWN;
+	run->ready_for_interrupt_injection = 1;
+	switch (vcpu->arch.trap) {
+	/* We're good on these - the host merely wanted to get our attention */
+	case BOOK3S_INTERRUPT_HV_DECREMENTER:
+		vcpu->stat.dec_exits++;
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_EXTERNAL:
+	case BOOK3S_INTERRUPT_H_DOORBELL:
+		vcpu->stat.ext_intr_exits++;
+		r = RESUME_GUEST;
+		break;
+	/* HMI is hypervisor interrupt and host has handled it. Resume guest.*/
+	case BOOK3S_INTERRUPT_HMI:
+	case BOOK3S_INTERRUPT_PERFMON:
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_MACHINE_CHECK:
+		/*
+		 * Deliver a machine check interrupt to the guest.
+		 * We have to do this, even if the host has handled the
+		 * machine check, because machine checks use SRR0/1 and
+		 * the interrupt might have trashed guest state in them.
+		 */
+		kvmppc_book3s_queue_irqprio(vcpu,
+					    BOOK3S_INTERRUPT_MACHINE_CHECK);
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_PROGRAM:
+	{
+		ulong flags;
+		/*
+		 * Normally program interrupts are delivered directly
+		 * to the guest by the hardware, but we can get here
+		 * as a result of a hypervisor emulation interrupt
+		 * (e40) getting turned into a 700 by BML RTAS.
+		 */
+		flags = vcpu->arch.shregs.msr & 0x1f0000ull;
+		kvmppc_core_queue_program(vcpu, flags);
+		r = RESUME_GUEST;
+		break;
+	}
+	case BOOK3S_INTERRUPT_SYSCALL:
+	{
+		/* hcall - punt to userspace */
+		int i;
+
+		/* hypercall with MSR_PR has already been handled in rmode,
+		 * and never reaches here.
+		 */
+
+		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
+		for (i = 0; i < 9; ++i)
+			run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
+		run->exit_reason = KVM_EXIT_PAPR_HCALL;
+		vcpu->arch.hcall_needed = 1;
+		r = RESUME_HOST;
+		break;
+	}
+	/*
+	 * We get these next two if the guest accesses a page which it thinks
+	 * it has mapped but which is not actually present, either because
+	 * it is for an emulated I/O device or because the corresonding
+	 * host page has been paged out.  Any other HDSI/HISI interrupts
+	 * have been handled already.
+	 */
+	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
+		r = RESUME_PAGE_FAULT;
+		break;
+	case BOOK3S_INTERRUPT_H_INST_STORAGE:
+		vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
+		vcpu->arch.fault_dsisr = 0;
+		r = RESUME_PAGE_FAULT;
+		break;
+	/*
+	 * This occurs if the guest executes an illegal instruction.
+	 * If the guest debug is disabled, generate a program interrupt
+	 * to the guest. If guest debug is enabled, we need to check
+	 * whether the instruction is a software breakpoint instruction.
+	 * Accordingly return to Guest or Host.
+	 */
+	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
+		if (vcpu->arch.emul_inst != KVM_INST_FETCH_FAILED)
+			vcpu->arch.last_inst = kvmppc_need_byteswap(vcpu) ?
+				swab32(vcpu->arch.emul_inst) :
+				vcpu->arch.emul_inst;
+		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
+			r = kvmppc_emulate_debug_inst(run, vcpu);
+		} else {
+			kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
+			r = RESUME_GUEST;
+		}
+		break;
+	/*
+	 * This occurs if the guest (kernel or userspace), does something that
+	 * is prohibited by HFSCR.  We just generate a program interrupt to
+	 * the guest.
+	 */
+	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
+		kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
+		r = RESUME_GUEST;
+		break;
+	default:
+		kvmppc_dump_regs(vcpu);
+		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
+			vcpu->arch.trap, kvmppc_get_pc(vcpu),
+			vcpu->arch.shregs.msr);
+		run->hw.hardware_exit_reason = vcpu->arch.trap;
+		r = RESUME_HOST;
+		break;
+	}
+
+	return r;
+}
+
+static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
+					    struct kvm_sregs *sregs)
+{
+	int i;
+
+	memset(sregs, 0, sizeof(struct kvm_sregs));
+	sregs->pvr = vcpu->arch.pvr;
+	for (i = 0; i < vcpu->arch.slb_max; i++) {
+		sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
+		sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
+	}
+
+	return 0;
+}
+
+static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu,
+					    struct kvm_sregs *sregs)
+{
+	int i, j;
+
+	/* Only accept the same PVR as the host's, since we can't spoof it */
+	if (sregs->pvr != vcpu->arch.pvr)
+		return -EINVAL;
+
+	j = 0;
+	for (i = 0; i < vcpu->arch.slb_nr; i++) {
+		if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
+			vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
+			vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
+			++j;
+		}
+	}
+	vcpu->arch.slb_max = j;
+
+	return 0;
+}
+
+static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr,
+		bool preserve_top32)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+	u64 mask;
+
+	mutex_lock(&kvm->lock);
+	spin_lock(&vc->lock);
+	/*
+	 * If ILE (interrupt little-endian) has changed, update the
+	 * MSR_LE bit in the intr_msr for each vcpu in this vcore.
+	 */
+	if ((new_lpcr & LPCR_ILE) != (vc->lpcr & LPCR_ILE)) {
+		struct kvm_vcpu *vcpu;
+		int i;
+
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			if (vcpu->arch.vcore != vc)
+				continue;
+			if (new_lpcr & LPCR_ILE)
+				vcpu->arch.intr_msr |= MSR_LE;
+			else
+				vcpu->arch.intr_msr &= ~MSR_LE;
+		}
+	}
+
+	/*
+	 * Userspace can only modify DPFD (default prefetch depth),
+	 * ILE (interrupt little-endian) and TC (translation control).
+	 * On POWER8 userspace can also modify AIL (alt. interrupt loc.)
+	 */
+	mask = LPCR_DPFD | LPCR_ILE | LPCR_TC;
+	if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		mask |= LPCR_AIL;
+
+	/* Broken 32-bit version of LPCR must not clear top bits */
+	if (preserve_top32)
+		mask &= 0xFFFFFFFF;
+	vc->lpcr = (vc->lpcr & ~mask) | (new_lpcr & mask);
+	spin_unlock(&vc->lock);
+	mutex_unlock(&kvm->lock);
+}
+
+static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
+				 union kvmppc_one_reg *val)
+{
+	int r = 0;
+	long int i;
+
+	switch (id) {
+	case KVM_REG_PPC_DEBUG_INST:
+		*val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT);
+		break;
+	case KVM_REG_PPC_HIOR:
+		*val = get_reg_val(id, 0);
+		break;
+	case KVM_REG_PPC_DABR:
+		*val = get_reg_val(id, vcpu->arch.dabr);
+		break;
+	case KVM_REG_PPC_DABRX:
+		*val = get_reg_val(id, vcpu->arch.dabrx);
+		break;
+	case KVM_REG_PPC_DSCR:
+		*val = get_reg_val(id, vcpu->arch.dscr);
+		break;
+	case KVM_REG_PPC_PURR:
+		*val = get_reg_val(id, vcpu->arch.purr);
+		break;
+	case KVM_REG_PPC_SPURR:
+		*val = get_reg_val(id, vcpu->arch.spurr);
+		break;
+	case KVM_REG_PPC_AMR:
+		*val = get_reg_val(id, vcpu->arch.amr);
+		break;
+	case KVM_REG_PPC_UAMOR:
+		*val = get_reg_val(id, vcpu->arch.uamor);
+		break;
+	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS:
+		i = id - KVM_REG_PPC_MMCR0;
+		*val = get_reg_val(id, vcpu->arch.mmcr[i]);
+		break;
+	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
+		i = id - KVM_REG_PPC_PMC1;
+		*val = get_reg_val(id, vcpu->arch.pmc[i]);
+		break;
+	case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
+		i = id - KVM_REG_PPC_SPMC1;
+		*val = get_reg_val(id, vcpu->arch.spmc[i]);
+		break;
+	case KVM_REG_PPC_SIAR:
+		*val = get_reg_val(id, vcpu->arch.siar);
+		break;
+	case KVM_REG_PPC_SDAR:
+		*val = get_reg_val(id, vcpu->arch.sdar);
+		break;
+	case KVM_REG_PPC_SIER:
+		*val = get_reg_val(id, vcpu->arch.sier);
+		break;
+	case KVM_REG_PPC_IAMR:
+		*val = get_reg_val(id, vcpu->arch.iamr);
+		break;
+	case KVM_REG_PPC_PSPB:
+		*val = get_reg_val(id, vcpu->arch.pspb);
+		break;
+	case KVM_REG_PPC_DPDES:
+		*val = get_reg_val(id, vcpu->arch.vcore->dpdes);
+		break;
+	case KVM_REG_PPC_DAWR:
+		*val = get_reg_val(id, vcpu->arch.dawr);
+		break;
+	case KVM_REG_PPC_DAWRX:
+		*val = get_reg_val(id, vcpu->arch.dawrx);
+		break;
+	case KVM_REG_PPC_CIABR:
+		*val = get_reg_val(id, vcpu->arch.ciabr);
+		break;
+	case KVM_REG_PPC_CSIGR:
+		*val = get_reg_val(id, vcpu->arch.csigr);
+		break;
+	case KVM_REG_PPC_TACR:
+		*val = get_reg_val(id, vcpu->arch.tacr);
+		break;
+	case KVM_REG_PPC_TCSCR:
+		*val = get_reg_val(id, vcpu->arch.tcscr);
+		break;
+	case KVM_REG_PPC_PID:
+		*val = get_reg_val(id, vcpu->arch.pid);
+		break;
+	case KVM_REG_PPC_ACOP:
+		*val = get_reg_val(id, vcpu->arch.acop);
+		break;
+	case KVM_REG_PPC_WORT:
+		*val = get_reg_val(id, vcpu->arch.wort);
+		break;
+	case KVM_REG_PPC_VPA_ADDR:
+		spin_lock(&vcpu->arch.vpa_update_lock);
+		*val = get_reg_val(id, vcpu->arch.vpa.next_gpa);
+		spin_unlock(&vcpu->arch.vpa_update_lock);
+		break;
+	case KVM_REG_PPC_VPA_SLB:
+		spin_lock(&vcpu->arch.vpa_update_lock);
+		val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa;
+		val->vpaval.length = vcpu->arch.slb_shadow.len;
+		spin_unlock(&vcpu->arch.vpa_update_lock);
+		break;
+	case KVM_REG_PPC_VPA_DTL:
+		spin_lock(&vcpu->arch.vpa_update_lock);
+		val->vpaval.addr = vcpu->arch.dtl.next_gpa;
+		val->vpaval.length = vcpu->arch.dtl.len;
+		spin_unlock(&vcpu->arch.vpa_update_lock);
+		break;
+	case KVM_REG_PPC_TB_OFFSET:
+		*val = get_reg_val(id, vcpu->arch.vcore->tb_offset);
+		break;
+	case KVM_REG_PPC_LPCR:
+	case KVM_REG_PPC_LPCR_64:
+		*val = get_reg_val(id, vcpu->arch.vcore->lpcr);
+		break;
+	case KVM_REG_PPC_PPR:
+		*val = get_reg_val(id, vcpu->arch.ppr);
+		break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	case KVM_REG_PPC_TFHAR:
+		*val = get_reg_val(id, vcpu->arch.tfhar);
+		break;
+	case KVM_REG_PPC_TFIAR:
+		*val = get_reg_val(id, vcpu->arch.tfiar);
+		break;
+	case KVM_REG_PPC_TEXASR:
+		*val = get_reg_val(id, vcpu->arch.texasr);
+		break;
+	case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
+		i = id - KVM_REG_PPC_TM_GPR0;
+		*val = get_reg_val(id, vcpu->arch.gpr_tm[i]);
+		break;
+	case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
+	{
+		int j;
+		i = id - KVM_REG_PPC_TM_VSR0;
+		if (i < 32)
+			for (j = 0; j < TS_FPRWIDTH; j++)
+				val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j];
+		else {
+			if (cpu_has_feature(CPU_FTR_ALTIVEC))
+				val->vval = vcpu->arch.vr_tm.vr[i-32];
+			else
+				r = -ENXIO;
+		}
+		break;
+	}
+	case KVM_REG_PPC_TM_CR:
+		*val = get_reg_val(id, vcpu->arch.cr_tm);
+		break;
+	case KVM_REG_PPC_TM_LR:
+		*val = get_reg_val(id, vcpu->arch.lr_tm);
+		break;
+	case KVM_REG_PPC_TM_CTR:
+		*val = get_reg_val(id, vcpu->arch.ctr_tm);
+		break;
+	case KVM_REG_PPC_TM_FPSCR:
+		*val = get_reg_val(id, vcpu->arch.fp_tm.fpscr);
+		break;
+	case KVM_REG_PPC_TM_AMR:
+		*val = get_reg_val(id, vcpu->arch.amr_tm);
+		break;
+	case KVM_REG_PPC_TM_PPR:
+		*val = get_reg_val(id, vcpu->arch.ppr_tm);
+		break;
+	case KVM_REG_PPC_TM_VRSAVE:
+		*val = get_reg_val(id, vcpu->arch.vrsave_tm);
+		break;
+	case KVM_REG_PPC_TM_VSCR:
+		if (cpu_has_feature(CPU_FTR_ALTIVEC))
+			*val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]);
+		else
+			r = -ENXIO;
+		break;
+	case KVM_REG_PPC_TM_DSCR:
+		*val = get_reg_val(id, vcpu->arch.dscr_tm);
+		break;
+	case KVM_REG_PPC_TM_TAR:
+		*val = get_reg_val(id, vcpu->arch.tar_tm);
+		break;
+#endif
+	case KVM_REG_PPC_ARCH_COMPAT:
+		*val = get_reg_val(id, vcpu->arch.vcore->arch_compat);
+		break;
+	default:
+		r = -EINVAL;
+		break;
+	}
+
+	return r;
+}
+
+static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
+				 union kvmppc_one_reg *val)
+{
+	int r = 0;
+	long int i;
+	unsigned long addr, len;
+
+	switch (id) {
+	case KVM_REG_PPC_HIOR:
+		/* Only allow this to be set to zero */
+		if (set_reg_val(id, *val))
+			r = -EINVAL;
+		break;
+	case KVM_REG_PPC_DABR:
+		vcpu->arch.dabr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_DABRX:
+		vcpu->arch.dabrx = set_reg_val(id, *val) & ~DABRX_HYP;
+		break;
+	case KVM_REG_PPC_DSCR:
+		vcpu->arch.dscr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_PURR:
+		vcpu->arch.purr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_SPURR:
+		vcpu->arch.spurr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_AMR:
+		vcpu->arch.amr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_UAMOR:
+		vcpu->arch.uamor = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS:
+		i = id - KVM_REG_PPC_MMCR0;
+		vcpu->arch.mmcr[i] = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
+		i = id - KVM_REG_PPC_PMC1;
+		vcpu->arch.pmc[i] = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
+		i = id - KVM_REG_PPC_SPMC1;
+		vcpu->arch.spmc[i] = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_SIAR:
+		vcpu->arch.siar = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_SDAR:
+		vcpu->arch.sdar = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_SIER:
+		vcpu->arch.sier = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_IAMR:
+		vcpu->arch.iamr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_PSPB:
+		vcpu->arch.pspb = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_DPDES:
+		vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_DAWR:
+		vcpu->arch.dawr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_DAWRX:
+		vcpu->arch.dawrx = set_reg_val(id, *val) & ~DAWRX_HYP;
+		break;
+	case KVM_REG_PPC_CIABR:
+		vcpu->arch.ciabr = set_reg_val(id, *val);
+		/* Don't allow setting breakpoints in hypervisor code */
+		if ((vcpu->arch.ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
+			vcpu->arch.ciabr &= ~CIABR_PRIV;	/* disable */
+		break;
+	case KVM_REG_PPC_CSIGR:
+		vcpu->arch.csigr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TACR:
+		vcpu->arch.tacr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TCSCR:
+		vcpu->arch.tcscr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_PID:
+		vcpu->arch.pid = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_ACOP:
+		vcpu->arch.acop = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_WORT:
+		vcpu->arch.wort = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_VPA_ADDR:
+		addr = set_reg_val(id, *val);
+		r = -EINVAL;
+		if (!addr && (vcpu->arch.slb_shadow.next_gpa ||
+			      vcpu->arch.dtl.next_gpa))
+			break;
+		r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca));
+		break;
+	case KVM_REG_PPC_VPA_SLB:
+		addr = val->vpaval.addr;
+		len = val->vpaval.length;
+		r = -EINVAL;
+		if (addr && !vcpu->arch.vpa.next_gpa)
+			break;
+		r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len);
+		break;
+	case KVM_REG_PPC_VPA_DTL:
+		addr = val->vpaval.addr;
+		len = val->vpaval.length;
+		r = -EINVAL;
+		if (addr && (len < sizeof(struct dtl_entry) ||
+			     !vcpu->arch.vpa.next_gpa))
+			break;
+		len -= len % sizeof(struct dtl_entry);
+		r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len);
+		break;
+	case KVM_REG_PPC_TB_OFFSET:
+		/* round up to multiple of 2^24 */
+		vcpu->arch.vcore->tb_offset =
+			ALIGN(set_reg_val(id, *val), 1UL << 24);
+		break;
+	case KVM_REG_PPC_LPCR:
+		kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), true);
+		break;
+	case KVM_REG_PPC_LPCR_64:
+		kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), false);
+		break;
+	case KVM_REG_PPC_PPR:
+		vcpu->arch.ppr = set_reg_val(id, *val);
+		break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	case KVM_REG_PPC_TFHAR:
+		vcpu->arch.tfhar = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TFIAR:
+		vcpu->arch.tfiar = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TEXASR:
+		vcpu->arch.texasr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
+		i = id - KVM_REG_PPC_TM_GPR0;
+		vcpu->arch.gpr_tm[i] = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
+	{
+		int j;
+		i = id - KVM_REG_PPC_TM_VSR0;
+		if (i < 32)
+			for (j = 0; j < TS_FPRWIDTH; j++)
+				vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j];
+		else
+			if (cpu_has_feature(CPU_FTR_ALTIVEC))
+				vcpu->arch.vr_tm.vr[i-32] = val->vval;
+			else
+				r = -ENXIO;
+		break;
+	}
+	case KVM_REG_PPC_TM_CR:
+		vcpu->arch.cr_tm = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_LR:
+		vcpu->arch.lr_tm = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_CTR:
+		vcpu->arch.ctr_tm = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_FPSCR:
+		vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_AMR:
+		vcpu->arch.amr_tm = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_PPR:
+		vcpu->arch.ppr_tm = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_VRSAVE:
+		vcpu->arch.vrsave_tm = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_VSCR:
+		if (cpu_has_feature(CPU_FTR_ALTIVEC))
+			vcpu->arch.vr.vscr.u[3] = set_reg_val(id, *val);
+		else
+			r = - ENXIO;
+		break;
+	case KVM_REG_PPC_TM_DSCR:
+		vcpu->arch.dscr_tm = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TM_TAR:
+		vcpu->arch.tar_tm = set_reg_val(id, *val);
+		break;
+#endif
+	case KVM_REG_PPC_ARCH_COMPAT:
+		r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val));
+		break;
+	default:
+		r = -EINVAL;
+		break;
+	}
+
+	return r;
+}
+
+static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
+{
+	struct kvmppc_vcore *vcore;
+
+	vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
+
+	if (vcore == NULL)
+		return NULL;
+
+	INIT_LIST_HEAD(&vcore->runnable_threads);
+	spin_lock_init(&vcore->lock);
+	spin_lock_init(&vcore->stoltb_lock);
+	init_waitqueue_head(&vcore->wq);
+	vcore->preempt_tb = TB_NIL;
+	vcore->lpcr = kvm->arch.lpcr;
+	vcore->first_vcpuid = core * threads_per_subcore;
+	vcore->kvm = kvm;
+
+	vcore->mpp_buffer_is_valid = false;
+
+	if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		vcore->mpp_buffer = (void *)__get_free_pages(
+			GFP_KERNEL|__GFP_ZERO,
+			MPP_BUFFER_ORDER);
+
+	return vcore;
+}
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+static struct debugfs_timings_element {
+	const char *name;
+	size_t offset;
+} timings[] = {
+	{"rm_entry",	offsetof(struct kvm_vcpu, arch.rm_entry)},
+	{"rm_intr",	offsetof(struct kvm_vcpu, arch.rm_intr)},
+	{"rm_exit",	offsetof(struct kvm_vcpu, arch.rm_exit)},
+	{"guest",	offsetof(struct kvm_vcpu, arch.guest_time)},
+	{"cede",	offsetof(struct kvm_vcpu, arch.cede_time)},
+};
+
+#define N_TIMINGS	(sizeof(timings) / sizeof(timings[0]))
+
+struct debugfs_timings_state {
+	struct kvm_vcpu	*vcpu;
+	unsigned int	buflen;
+	char		buf[N_TIMINGS * 100];
+};
+
+static int debugfs_timings_open(struct inode *inode, struct file *file)
+{
+	struct kvm_vcpu *vcpu = inode->i_private;
+	struct debugfs_timings_state *p;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	kvm_get_kvm(vcpu->kvm);
+	p->vcpu = vcpu;
+	file->private_data = p;
+
+	return nonseekable_open(inode, file);
+}
+
+static int debugfs_timings_release(struct inode *inode, struct file *file)
+{
+	struct debugfs_timings_state *p = file->private_data;
+
+	kvm_put_kvm(p->vcpu->kvm);
+	kfree(p);
+	return 0;
+}
+
+static ssize_t debugfs_timings_read(struct file *file, char __user *buf,
+				    size_t len, loff_t *ppos)
+{
+	struct debugfs_timings_state *p = file->private_data;
+	struct kvm_vcpu *vcpu = p->vcpu;
+	char *s, *buf_end;
+	struct kvmhv_tb_accumulator tb;
+	u64 count;
+	loff_t pos;
+	ssize_t n;
+	int i, loops;
+	bool ok;
+
+	if (!p->buflen) {
+		s = p->buf;
+		buf_end = s + sizeof(p->buf);
+		for (i = 0; i < N_TIMINGS; ++i) {
+			struct kvmhv_tb_accumulator *acc;
+
+			acc = (struct kvmhv_tb_accumulator *)
+				((unsigned long)vcpu + timings[i].offset);
+			ok = false;
+			for (loops = 0; loops < 1000; ++loops) {
+				count = acc->seqcount;
+				if (!(count & 1)) {
+					smp_rmb();
+					tb = *acc;
+					smp_rmb();
+					if (count == acc->seqcount) {
+						ok = true;
+						break;
+					}
+				}
+				udelay(1);
+			}
+			if (!ok)
+				snprintf(s, buf_end - s, "%s: stuck\n",
+					timings[i].name);
+			else
+				snprintf(s, buf_end - s,
+					"%s: %llu %llu %llu %llu\n",
+					timings[i].name, count / 2,
+					tb_to_ns(tb.tb_total),
+					tb_to_ns(tb.tb_min),
+					tb_to_ns(tb.tb_max));
+			s += strlen(s);
+		}
+		p->buflen = s - p->buf;
+	}
+
+	pos = *ppos;
+	if (pos >= p->buflen)
+		return 0;
+	if (len > p->buflen - pos)
+		len = p->buflen - pos;
+	n = copy_to_user(buf, p->buf + pos, len);
+	if (n) {
+		if (n == len)
+			return -EFAULT;
+		len -= n;
+	}
+	*ppos = pos + len;
+	return len;
+}
+
+static ssize_t debugfs_timings_write(struct file *file, const char __user *buf,
+				     size_t len, loff_t *ppos)
+{
+	return -EACCES;
+}
+
+static const struct file_operations debugfs_timings_ops = {
+	.owner	 = THIS_MODULE,
+	.open	 = debugfs_timings_open,
+	.release = debugfs_timings_release,
+	.read	 = debugfs_timings_read,
+	.write	 = debugfs_timings_write,
+	.llseek	 = generic_file_llseek,
+};
+
+/* Create a debugfs directory for the vcpu */
+static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id)
+{
+	char buf[16];
+	struct kvm *kvm = vcpu->kvm;
+
+	snprintf(buf, sizeof(buf), "vcpu%u", id);
+	if (IS_ERR_OR_NULL(kvm->arch.debugfs_dir))
+		return;
+	vcpu->arch.debugfs_dir = debugfs_create_dir(buf, kvm->arch.debugfs_dir);
+	if (IS_ERR_OR_NULL(vcpu->arch.debugfs_dir))
+		return;
+	vcpu->arch.debugfs_timings =
+		debugfs_create_file("timings", 0444, vcpu->arch.debugfs_dir,
+				    vcpu, &debugfs_timings_ops);
+}
+
+#else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
+static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id)
+{
+}
+#endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
+
+static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
+						   unsigned int id)
+{
+	struct kvm_vcpu *vcpu;
+	int err = -EINVAL;
+	int core;
+	struct kvmppc_vcore *vcore;
+
+	core = id / threads_per_subcore;
+	if (core >= KVM_MAX_VCORES)
+		goto out;
+
+	err = -ENOMEM;
+	vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+	if (!vcpu)
+		goto out;
+
+	err = kvm_vcpu_init(vcpu, kvm, id);
+	if (err)
+		goto free_vcpu;
+
+	vcpu->arch.shared = &vcpu->arch.shregs;
+#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
+	/*
+	 * The shared struct is never shared on HV,
+	 * so we can always use host endianness
+	 */
+#ifdef __BIG_ENDIAN__
+	vcpu->arch.shared_big_endian = true;
+#else
+	vcpu->arch.shared_big_endian = false;
+#endif
+#endif
+	vcpu->arch.mmcr[0] = MMCR0_FC;
+	vcpu->arch.ctrl = CTRL_RUNLATCH;
+	/* default to host PVR, since we can't spoof it */
+	kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR));
+	spin_lock_init(&vcpu->arch.vpa_update_lock);
+	spin_lock_init(&vcpu->arch.tbacct_lock);
+	vcpu->arch.busy_preempt = TB_NIL;
+	vcpu->arch.intr_msr = MSR_SF | MSR_ME;
+
+	kvmppc_mmu_book3s_hv_init(vcpu);
+
+	vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
+
+	init_waitqueue_head(&vcpu->arch.cpu_run);
+
+	mutex_lock(&kvm->lock);
+	vcore = kvm->arch.vcores[core];
+	if (!vcore) {
+		vcore = kvmppc_vcore_create(kvm, core);
+		kvm->arch.vcores[core] = vcore;
+		kvm->arch.online_vcores++;
+	}
+	mutex_unlock(&kvm->lock);
+
+	if (!vcore)
+		goto free_vcpu;
+
+	spin_lock(&vcore->lock);
+	++vcore->num_threads;
+	spin_unlock(&vcore->lock);
+	vcpu->arch.vcore = vcore;
+	vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
+
+	vcpu->arch.cpu_type = KVM_CPU_3S_64;
+	kvmppc_sanity_check(vcpu);
+
+	debugfs_vcpu_init(vcpu, id);
+
+	return vcpu;
+
+free_vcpu:
+	kmem_cache_free(kvm_vcpu_cache, vcpu);
+out:
+	return ERR_PTR(err);
+}
+
+static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa)
+{
+	if (vpa->pinned_addr)
+		kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa,
+					vpa->dirty);
+}
+
+static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu)
+{
+	spin_lock(&vcpu->arch.vpa_update_lock);
+	unpin_vpa(vcpu->kvm, &vcpu->arch.dtl);
+	unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
+	unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
+	spin_unlock(&vcpu->arch.vpa_update_lock);
+	kvm_vcpu_uninit(vcpu);
+	kmem_cache_free(kvm_vcpu_cache, vcpu);
+}
+
+static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu)
+{
+	/* Indicate we want to get back into the guest */
+	return 1;
+}
+
+static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
+{
+	unsigned long dec_nsec, now;
+
+	now = get_tb();
+	if (now > vcpu->arch.dec_expires) {
+		/* decrementer has already gone negative */
+		kvmppc_core_queue_dec(vcpu);
+		kvmppc_core_prepare_to_enter(vcpu);
+		return;
+	}
+	dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC
+		   / tb_ticks_per_sec;
+	hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
+		      HRTIMER_MODE_REL);
+	vcpu->arch.timer_running = 1;
+}
+
+static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.ceded = 0;
+	if (vcpu->arch.timer_running) {
+		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+		vcpu->arch.timer_running = 0;
+	}
+}
+
+extern void __kvmppc_vcore_entry(void);
+
+static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
+				   struct kvm_vcpu *vcpu)
+{
+	u64 now;
+
+	if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
+		return;
+	spin_lock_irq(&vcpu->arch.tbacct_lock);
+	now = mftb();
+	vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) -
+		vcpu->arch.stolen_logged;
+	vcpu->arch.busy_preempt = now;
+	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
+	spin_unlock_irq(&vcpu->arch.tbacct_lock);
+	--vc->n_runnable;
+	list_del(&vcpu->arch.run_list);
+}
+
+static int kvmppc_grab_hwthread(int cpu)
+{
+	struct paca_struct *tpaca;
+	long timeout = 10000;
+
+	tpaca = &paca[cpu];
+
+	/* Ensure the thread won't go into the kernel if it wakes */
+	tpaca->kvm_hstate.kvm_vcpu = NULL;
+	tpaca->kvm_hstate.napping = 0;
+	smp_wmb();
+	tpaca->kvm_hstate.hwthread_req = 1;
+
+	/*
+	 * If the thread is already executing in the kernel (e.g. handling
+	 * a stray interrupt), wait for it to get back to nap mode.
+	 * The smp_mb() is to ensure that our setting of hwthread_req
+	 * is visible before we look at hwthread_state, so if this
+	 * races with the code at system_reset_pSeries and the thread
+	 * misses our setting of hwthread_req, we are sure to see its
+	 * setting of hwthread_state, and vice versa.
+	 */
+	smp_mb();
+	while (tpaca->kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL) {
+		if (--timeout <= 0) {
+			pr_err("KVM: couldn't grab cpu %d\n", cpu);
+			return -EBUSY;
+		}
+		udelay(1);
+	}
+	return 0;
+}
+
+static void kvmppc_release_hwthread(int cpu)
+{
+	struct paca_struct *tpaca;
+
+	tpaca = &paca[cpu];
+	tpaca->kvm_hstate.hwthread_req = 0;
+	tpaca->kvm_hstate.kvm_vcpu = NULL;
+}
+
+static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
+{
+	int cpu;
+	struct paca_struct *tpaca;
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	if (vcpu->arch.timer_running) {
+		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+		vcpu->arch.timer_running = 0;
+	}
+	cpu = vc->pcpu + vcpu->arch.ptid;
+	tpaca = &paca[cpu];
+	tpaca->kvm_hstate.kvm_vcore = vc;
+	tpaca->kvm_hstate.ptid = vcpu->arch.ptid;
+	vcpu->cpu = vc->pcpu;
+	/* Order stores to hstate.kvm_vcore etc. before store to kvm_vcpu */
+	smp_wmb();
+	tpaca->kvm_hstate.kvm_vcpu = vcpu;
+	if (cpu != smp_processor_id())
+		kvmppc_ipi_thread(cpu);
+}
+
+static void kvmppc_wait_for_nap(void)
+{
+	int cpu = smp_processor_id();
+	int i, loops;
+
+	for (loops = 0; loops < 1000000; ++loops) {
+		/*
+		 * Check if all threads are finished.
+		 * We set the vcpu pointer when starting a thread
+		 * and the thread clears it when finished, so we look
+		 * for any threads that still have a non-NULL vcpu ptr.
+		 */
+		for (i = 1; i < threads_per_subcore; ++i)
+			if (paca[cpu + i].kvm_hstate.kvm_vcpu)
+				break;
+		if (i == threads_per_subcore) {
+			HMT_medium();
+			return;
+		}
+		HMT_low();
+	}
+	HMT_medium();
+	for (i = 1; i < threads_per_subcore; ++i)
+		if (paca[cpu + i].kvm_hstate.kvm_vcpu)
+			pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
+}
+
+/*
+ * Check that we are on thread 0 and that any other threads in
+ * this core are off-line.  Then grab the threads so they can't
+ * enter the kernel.
+ */
+static int on_primary_thread(void)
+{
+	int cpu = smp_processor_id();
+	int thr;
+
+	/* Are we on a primary subcore? */
+	if (cpu_thread_in_subcore(cpu))
+		return 0;
+
+	thr = 0;
+	while (++thr < threads_per_subcore)
+		if (cpu_online(cpu + thr))
+			return 0;
+
+	/* Grab all hw threads so they can't go into the kernel */
+	for (thr = 1; thr < threads_per_subcore; ++thr) {
+		if (kvmppc_grab_hwthread(cpu + thr)) {
+			/* Couldn't grab one; let the others go */
+			do {
+				kvmppc_release_hwthread(cpu + thr);
+			} while (--thr > 0);
+			return 0;
+		}
+	}
+	return 1;
+}
+
+static void kvmppc_start_saving_l2_cache(struct kvmppc_vcore *vc)
+{
+	phys_addr_t phy_addr, mpp_addr;
+
+	phy_addr = (phys_addr_t)virt_to_phys(vc->mpp_buffer);
+	mpp_addr = phy_addr & PPC_MPPE_ADDRESS_MASK;
+
+	mtspr(SPRN_MPPR, mpp_addr | PPC_MPPR_FETCH_ABORT);
+	logmpp(mpp_addr | PPC_LOGMPP_LOG_L2);
+
+	vc->mpp_buffer_is_valid = true;
+}
+
+static void kvmppc_start_restoring_l2_cache(const struct kvmppc_vcore *vc)
+{
+	phys_addr_t phy_addr, mpp_addr;
+
+	phy_addr = virt_to_phys(vc->mpp_buffer);
+	mpp_addr = phy_addr & PPC_MPPE_ADDRESS_MASK;
+
+	/* We must abort any in-progress save operations to ensure
+	 * the table is valid so that prefetch engine knows when to
+	 * stop prefetching. */
+	logmpp(mpp_addr | PPC_LOGMPP_LOG_ABORT);
+	mtspr(SPRN_MPPR, mpp_addr | PPC_MPPR_FETCH_WHOLE_TABLE);
+}
+
+static void prepare_threads(struct kvmppc_vcore *vc)
+{
+	struct kvm_vcpu *vcpu, *vnext;
+
+	list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
+				 arch.run_list) {
+		if (signal_pending(vcpu->arch.run_task))
+			vcpu->arch.ret = -EINTR;
+		else if (vcpu->arch.vpa.update_pending ||
+			 vcpu->arch.slb_shadow.update_pending ||
+			 vcpu->arch.dtl.update_pending)
+			vcpu->arch.ret = RESUME_GUEST;
+		else
+			continue;
+		kvmppc_remove_runnable(vc, vcpu);
+		wake_up(&vcpu->arch.cpu_run);
+	}
+}
+
+static void post_guest_process(struct kvmppc_vcore *vc)
+{
+	u64 now;
+	long ret;
+	struct kvm_vcpu *vcpu, *vnext;
+
+	now = get_tb();
+	list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
+				 arch.run_list) {
+		/* cancel pending dec exception if dec is positive */
+		if (now < vcpu->arch.dec_expires &&
+		    kvmppc_core_pending_dec(vcpu))
+			kvmppc_core_dequeue_dec(vcpu);
+
+		trace_kvm_guest_exit(vcpu);
+
+		ret = RESUME_GUEST;
+		if (vcpu->arch.trap)
+			ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu,
+						    vcpu->arch.run_task);
+
+		vcpu->arch.ret = ret;
+		vcpu->arch.trap = 0;
+
+		if (vcpu->arch.ceded) {
+			if (!is_kvmppc_resume_guest(ret))
+				kvmppc_end_cede(vcpu);
+			else
+				kvmppc_set_timer(vcpu);
+		}
+		if (!is_kvmppc_resume_guest(vcpu->arch.ret)) {
+			kvmppc_remove_runnable(vc, vcpu);
+			wake_up(&vcpu->arch.cpu_run);
+		}
+	}
+}
+
+/*
+ * Run a set of guest threads on a physical core.
+ * Called with vc->lock held.
+ */
+static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
+{
+	struct kvm_vcpu *vcpu, *vnext;
+	int i;
+	int srcu_idx;
+
+	/*
+	 * Remove from the list any threads that have a signal pending
+	 * or need a VPA update done
+	 */
+	prepare_threads(vc);
+
+	/* if the runner is no longer runnable, let the caller pick a new one */
+	if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE)
+		return;
+
+	/*
+	 * Initialize *vc.
+	 */
+	vc->entry_exit_map = 0;
+	vc->preempt_tb = TB_NIL;
+	vc->in_guest = 0;
+	vc->napping_threads = 0;
+	vc->conferring_threads = 0;
+
+	/*
+	 * Make sure we are running on primary threads, and that secondary
+	 * threads are offline.  Also check if the number of threads in this
+	 * guest are greater than the current system threads per guest.
+	 */
+	if ((threads_per_core > 1) &&
+	    ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
+		list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
+					 arch.run_list) {
+			vcpu->arch.ret = -EBUSY;
+			kvmppc_remove_runnable(vc, vcpu);
+			wake_up(&vcpu->arch.cpu_run);
+		}
+		goto out;
+	}
+
+
+	vc->pcpu = smp_processor_id();
+	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
+		kvmppc_start_thread(vcpu);
+		kvmppc_create_dtl_entry(vcpu, vc);
+		trace_kvm_guest_enter(vcpu);
+	}
+
+	/* Set this explicitly in case thread 0 doesn't have a vcpu */
+	get_paca()->kvm_hstate.kvm_vcore = vc;
+	get_paca()->kvm_hstate.ptid = 0;
+
+	vc->vcore_state = VCORE_RUNNING;
+	preempt_disable();
+
+	trace_kvmppc_run_core(vc, 0);
+
+	spin_unlock(&vc->lock);
+
+	kvm_guest_enter();
+
+	srcu_idx = srcu_read_lock(&vc->kvm->srcu);
+
+	if (vc->mpp_buffer_is_valid)
+		kvmppc_start_restoring_l2_cache(vc);
+
+	__kvmppc_vcore_entry();
+
+	spin_lock(&vc->lock);
+
+	if (vc->mpp_buffer)
+		kvmppc_start_saving_l2_cache(vc);
+
+	/* disable sending of IPIs on virtual external irqs */
+	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+		vcpu->cpu = -1;
+	/* wait for secondary threads to finish writing their state to memory */
+	kvmppc_wait_for_nap();
+	for (i = 0; i < threads_per_subcore; ++i)
+		kvmppc_release_hwthread(vc->pcpu + i);
+	/* prevent other vcpu threads from doing kvmppc_start_thread() now */
+	vc->vcore_state = VCORE_EXITING;
+	spin_unlock(&vc->lock);
+
+	srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
+
+	/* make sure updates to secondary vcpu structs are visible now */
+	smp_mb();
+	kvm_guest_exit();
+
+	preempt_enable();
+
+	spin_lock(&vc->lock);
+	post_guest_process(vc);
+
+ out:
+	vc->vcore_state = VCORE_INACTIVE;
+	trace_kvmppc_run_core(vc, 1);
+}
+
+/*
+ * Wait for some other vcpu thread to execute us, and
+ * wake us up when we need to handle something in the host.
+ */
+static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state)
+{
+	DEFINE_WAIT(wait);
+
+	prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
+	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
+		schedule();
+	finish_wait(&vcpu->arch.cpu_run, &wait);
+}
+
+/*
+ * All the vcpus in this vcore are idle, so wait for a decrementer
+ * or external interrupt to one of the vcpus.  vc->lock is held.
+ */
+static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
+{
+	struct kvm_vcpu *vcpu;
+	int do_sleep = 1;
+
+	DEFINE_WAIT(wait);
+
+	prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
+
+	/*
+	 * Check one last time for pending exceptions and ceded state after
+	 * we put ourselves on the wait queue
+	 */
+	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
+		if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) {
+			do_sleep = 0;
+			break;
+		}
+	}
+
+	if (!do_sleep) {
+		finish_wait(&vc->wq, &wait);
+		return;
+	}
+
+	vc->vcore_state = VCORE_SLEEPING;
+	trace_kvmppc_vcore_blocked(vc, 0);
+	spin_unlock(&vc->lock);
+	schedule();
+	finish_wait(&vc->wq, &wait);
+	spin_lock(&vc->lock);
+	vc->vcore_state = VCORE_INACTIVE;
+	trace_kvmppc_vcore_blocked(vc, 1);
+}
+
+static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+	int n_ceded;
+	struct kvmppc_vcore *vc;
+	struct kvm_vcpu *v, *vn;
+
+	trace_kvmppc_run_vcpu_enter(vcpu);
+
+	kvm_run->exit_reason = 0;
+	vcpu->arch.ret = RESUME_GUEST;
+	vcpu->arch.trap = 0;
+	kvmppc_update_vpas(vcpu);
+
+	/*
+	 * Synchronize with other threads in this virtual core
+	 */
+	vc = vcpu->arch.vcore;
+	spin_lock(&vc->lock);
+	vcpu->arch.ceded = 0;
+	vcpu->arch.run_task = current;
+	vcpu->arch.kvm_run = kvm_run;
+	vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
+	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
+	vcpu->arch.busy_preempt = TB_NIL;
+	list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
+	++vc->n_runnable;
+
+	/*
+	 * This happens the first time this is called for a vcpu.
+	 * If the vcore is already running, we may be able to start
+	 * this thread straight away and have it join in.
+	 */
+	if (!signal_pending(current)) {
+		if (vc->vcore_state == VCORE_RUNNING && !VCORE_IS_EXITING(vc)) {
+			kvmppc_create_dtl_entry(vcpu, vc);
+			kvmppc_start_thread(vcpu);
+			trace_kvm_guest_enter(vcpu);
+		} else if (vc->vcore_state == VCORE_SLEEPING) {
+			wake_up(&vc->wq);
+		}
+
+	}
+
+	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
+	       !signal_pending(current)) {
+		if (vc->vcore_state != VCORE_INACTIVE) {
+			spin_unlock(&vc->lock);
+			kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE);
+			spin_lock(&vc->lock);
+			continue;
+		}
+		list_for_each_entry_safe(v, vn, &vc->runnable_threads,
+					 arch.run_list) {
+			kvmppc_core_prepare_to_enter(v);
+			if (signal_pending(v->arch.run_task)) {
+				kvmppc_remove_runnable(vc, v);
+				v->stat.signal_exits++;
+				v->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
+				v->arch.ret = -EINTR;
+				wake_up(&v->arch.cpu_run);
+			}
+		}
+		if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
+			break;
+		n_ceded = 0;
+		list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
+			if (!v->arch.pending_exceptions)
+				n_ceded += v->arch.ceded;
+			else
+				v->arch.ceded = 0;
+		}
+		vc->runner = vcpu;
+		if (n_ceded == vc->n_runnable) {
+			kvmppc_vcore_blocked(vc);
+		} else if (should_resched()) {
+			vc->vcore_state = VCORE_PREEMPT;
+			/* Let something else run */
+			cond_resched_lock(&vc->lock);
+			vc->vcore_state = VCORE_INACTIVE;
+		} else {
+			kvmppc_run_core(vc);
+		}
+		vc->runner = NULL;
+	}
+
+	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
+	       (vc->vcore_state == VCORE_RUNNING ||
+		vc->vcore_state == VCORE_EXITING)) {
+		spin_unlock(&vc->lock);
+		kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE);
+		spin_lock(&vc->lock);
+	}
+
+	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
+		kvmppc_remove_runnable(vc, vcpu);
+		vcpu->stat.signal_exits++;
+		kvm_run->exit_reason = KVM_EXIT_INTR;
+		vcpu->arch.ret = -EINTR;
+	}
+
+	if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) {
+		/* Wake up some vcpu to run the core */
+		v = list_first_entry(&vc->runnable_threads,
+				     struct kvm_vcpu, arch.run_list);
+		wake_up(&v->arch.cpu_run);
+	}
+
+	trace_kvmppc_run_vcpu_exit(vcpu, kvm_run);
+	spin_unlock(&vc->lock);
+	return vcpu->arch.ret;
+}
+
+static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+	int r;
+	int srcu_idx;
+
+	if (!vcpu->arch.sane) {
+		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		return -EINVAL;
+	}
+
+	kvmppc_core_prepare_to_enter(vcpu);
+
+	/* No need to go into the guest when all we'll do is come back out */
+	if (signal_pending(current)) {
+		run->exit_reason = KVM_EXIT_INTR;
+		return -EINTR;
+	}
+
+	atomic_inc(&vcpu->kvm->arch.vcpus_running);
+	/* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */
+	smp_mb();
+
+	/* On the first time here, set up HTAB and VRMA */
+	if (!vcpu->kvm->arch.hpte_setup_done) {
+		r = kvmppc_hv_setup_htab_rma(vcpu);
+		if (r)
+			goto out;
+	}
+
+	flush_fp_to_thread(current);
+	flush_altivec_to_thread(current);
+	flush_vsx_to_thread(current);
+	vcpu->arch.wqp = &vcpu->arch.vcore->wq;
+	vcpu->arch.pgdir = current->mm->pgd;
+	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
+
+	do {
+		r = kvmppc_run_vcpu(run, vcpu);
+
+		if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
+		    !(vcpu->arch.shregs.msr & MSR_PR)) {
+			trace_kvm_hcall_enter(vcpu);
+			r = kvmppc_pseries_do_hcall(vcpu);
+			trace_kvm_hcall_exit(vcpu, r);
+			kvmppc_core_prepare_to_enter(vcpu);
+		} else if (r == RESUME_PAGE_FAULT) {
+			srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+			r = kvmppc_book3s_hv_page_fault(run, vcpu,
+				vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
+			srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+		}
+	} while (is_kvmppc_resume_guest(r));
+
+ out:
+	vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
+	atomic_dec(&vcpu->kvm->arch.vcpus_running);
+	return r;
+}
+
+static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
+				     int linux_psize)
+{
+	struct mmu_psize_def *def = &mmu_psize_defs[linux_psize];
+
+	if (!def->shift)
+		return;
+	(*sps)->page_shift = def->shift;
+	(*sps)->slb_enc = def->sllp;
+	(*sps)->enc[0].page_shift = def->shift;
+	(*sps)->enc[0].pte_enc = def->penc[linux_psize];
+	/*
+	 * Add 16MB MPSS support if host supports it
+	 */
+	if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) {
+		(*sps)->enc[1].page_shift = 24;
+		(*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
+	}
+	(*sps)++;
+}
+
+static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
+					 struct kvm_ppc_smmu_info *info)
+{
+	struct kvm_ppc_one_seg_page_size *sps;
+
+	info->flags = KVM_PPC_PAGE_SIZES_REAL;
+	if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
+		info->flags |= KVM_PPC_1T_SEGMENTS;
+	info->slb_size = mmu_slb_size;
+
+	/* We only support these sizes for now, and no muti-size segments */
+	sps = &info->sps[0];
+	kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K);
+	kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K);
+	kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M);
+
+	return 0;
+}
+
+/*
+ * Get (and clear) the dirty memory log for a memory slot.
+ */
+static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
+					 struct kvm_dirty_log *log)
+{
+	struct kvm_memory_slot *memslot;
+	int r;
+	unsigned long n;
+
+	mutex_lock(&kvm->slots_lock);
+
+	r = -EINVAL;
+	if (log->slot >= KVM_USER_MEM_SLOTS)
+		goto out;
+
+	memslot = id_to_memslot(kvm->memslots, log->slot);
+	r = -ENOENT;
+	if (!memslot->dirty_bitmap)
+		goto out;
+
+	n = kvm_dirty_bitmap_bytes(memslot);
+	memset(memslot->dirty_bitmap, 0, n);
+
+	r = kvmppc_hv_get_dirty_log(kvm, memslot, memslot->dirty_bitmap);
+	if (r)
+		goto out;
+
+	r = -EFAULT;
+	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
+		goto out;
+
+	r = 0;
+out:
+	mutex_unlock(&kvm->slots_lock);
+	return r;
+}
+
+static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free,
+					struct kvm_memory_slot *dont)
+{
+	if (!dont || free->arch.rmap != dont->arch.rmap) {
+		vfree(free->arch.rmap);
+		free->arch.rmap = NULL;
+	}
+}
+
+static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
+					 unsigned long npages)
+{
+	slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
+	if (!slot->arch.rmap)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm,
+					struct kvm_memory_slot *memslot,
+					struct kvm_userspace_memory_region *mem)
+{
+	return 0;
+}
+
+static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem,
+				const struct kvm_memory_slot *old)
+{
+	unsigned long npages = mem->memory_size >> PAGE_SHIFT;
+	struct kvm_memory_slot *memslot;
+
+	if (npages && old->npages) {
+		/*
+		 * If modifying a memslot, reset all the rmap dirty bits.
+		 * If this is a new memslot, we don't need to do anything
+		 * since the rmap array starts out as all zeroes,
+		 * i.e. no pages are dirty.
+		 */
+		memslot = id_to_memslot(kvm->memslots, mem->slot);
+		kvmppc_hv_get_dirty_log(kvm, memslot, NULL);
+	}
+}
+
+/*
+ * Update LPCR values in kvm->arch and in vcores.
+ * Caller must hold kvm->lock.
+ */
+void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask)
+{
+	long int i;
+	u32 cores_done = 0;
+
+	if ((kvm->arch.lpcr & mask) == lpcr)
+		return;
+
+	kvm->arch.lpcr = (kvm->arch.lpcr & ~mask) | lpcr;
+
+	for (i = 0; i < KVM_MAX_VCORES; ++i) {
+		struct kvmppc_vcore *vc = kvm->arch.vcores[i];
+		if (!vc)
+			continue;
+		spin_lock(&vc->lock);
+		vc->lpcr = (vc->lpcr & ~mask) | lpcr;
+		spin_unlock(&vc->lock);
+		if (++cores_done >= kvm->arch.online_vcores)
+			break;
+	}
+}
+
+static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu)
+{
+	return;
+}
+
+static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
+{
+	int err = 0;
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long hva;
+	struct kvm_memory_slot *memslot;
+	struct vm_area_struct *vma;
+	unsigned long lpcr = 0, senc;
+	unsigned long psize, porder;
+	int srcu_idx;
+
+	mutex_lock(&kvm->lock);
+	if (kvm->arch.hpte_setup_done)
+		goto out;	/* another vcpu beat us to it */
+
+	/* Allocate hashed page table (if not done already) and reset it */
+	if (!kvm->arch.hpt_virt) {
+		err = kvmppc_alloc_hpt(kvm, NULL);
+		if (err) {
+			pr_err("KVM: Couldn't alloc HPT\n");
+			goto out;
+		}
+	}
+
+	/* Look up the memslot for guest physical address 0 */
+	srcu_idx = srcu_read_lock(&kvm->srcu);
+	memslot = gfn_to_memslot(kvm, 0);
+
+	/* We must have some memory at 0 by now */
+	err = -EINVAL;
+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
+		goto out_srcu;
+
+	/* Look up the VMA for the start of this memory slot */
+	hva = memslot->userspace_addr;
+	down_read(&current->mm->mmap_sem);
+	vma = find_vma(current->mm, hva);
+	if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO))
+		goto up_out;
+
+	psize = vma_kernel_pagesize(vma);
+	porder = __ilog2(psize);
+
+	up_read(&current->mm->mmap_sem);
+
+	/* We can handle 4k, 64k or 16M pages in the VRMA */
+	err = -EINVAL;
+	if (!(psize == 0x1000 || psize == 0x10000 ||
+	      psize == 0x1000000))
+		goto out_srcu;
+
+	/* Update VRMASD field in the LPCR */
+	senc = slb_pgsize_encoding(psize);
+	kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
+		(VRMA_VSID << SLB_VSID_SHIFT_1T);
+	/* the -4 is to account for senc values starting at 0x10 */
+	lpcr = senc << (LPCR_VRMASD_SH - 4);
+
+	/* Create HPTEs in the hash page table for the VRMA */
+	kvmppc_map_vrma(vcpu, memslot, porder);
+
+	kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
+
+	/* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */
+	smp_wmb();
+	kvm->arch.hpte_setup_done = 1;
+	err = 0;
+ out_srcu:
+	srcu_read_unlock(&kvm->srcu, srcu_idx);
+ out:
+	mutex_unlock(&kvm->lock);
+	return err;
+
+ up_out:
+	up_read(&current->mm->mmap_sem);
+	goto out_srcu;
+}
+
+static int kvmppc_core_init_vm_hv(struct kvm *kvm)
+{
+	unsigned long lpcr, lpid;
+	char buf[32];
+
+	/* Allocate the guest's logical partition ID */
+
+	lpid = kvmppc_alloc_lpid();
+	if ((long)lpid < 0)
+		return -ENOMEM;
+	kvm->arch.lpid = lpid;
+
+	/*
+	 * Since we don't flush the TLB when tearing down a VM,
+	 * and this lpid might have previously been used,
+	 * make sure we flush on each core before running the new VM.
+	 */
+	cpumask_setall(&kvm->arch.need_tlb_flush);
+
+	/* Start out with the default set of hcalls enabled */
+	memcpy(kvm->arch.enabled_hcalls, default_enabled_hcalls,
+	       sizeof(kvm->arch.enabled_hcalls));
+
+	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
+
+	/* Init LPCR for virtual RMA mode */
+	kvm->arch.host_lpid = mfspr(SPRN_LPID);
+	kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
+	lpcr &= LPCR_PECE | LPCR_LPES;
+	lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
+		LPCR_VPM0 | LPCR_VPM1;
+	kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
+		(VRMA_VSID << SLB_VSID_SHIFT_1T);
+	/* On POWER8 turn on online bit to enable PURR/SPURR */
+	if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		lpcr |= LPCR_ONL;
+	kvm->arch.lpcr = lpcr;
+
+	/*
+	 * Track that we now have a HV mode VM active. This blocks secondary
+	 * CPU threads from coming online.
+	 */
+	kvm_hv_vm_activated();
+
+	/*
+	 * Create a debugfs directory for the VM
+	 */
+	snprintf(buf, sizeof(buf), "vm%d", current->pid);
+	kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir);
+	if (!IS_ERR_OR_NULL(kvm->arch.debugfs_dir))
+		kvmppc_mmu_debugfs_init(kvm);
+
+	return 0;
+}
+
+static void kvmppc_free_vcores(struct kvm *kvm)
+{
+	long int i;
+
+	for (i = 0; i < KVM_MAX_VCORES; ++i) {
+		if (kvm->arch.vcores[i] && kvm->arch.vcores[i]->mpp_buffer) {
+			struct kvmppc_vcore *vc = kvm->arch.vcores[i];
+			free_pages((unsigned long)vc->mpp_buffer,
+				   MPP_BUFFER_ORDER);
+		}
+		kfree(kvm->arch.vcores[i]);
+	}
+	kvm->arch.online_vcores = 0;
+}
+
+static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
+{
+	debugfs_remove_recursive(kvm->arch.debugfs_dir);
+
+	kvm_hv_vm_deactivated();
+
+	kvmppc_free_vcores(kvm);
+
+	kvmppc_free_hpt(kvm);
+}
+
+/* We don't need to emulate any privileged instructions or dcbz */
+static int kvmppc_core_emulate_op_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
+				     unsigned int inst, int *advance)
+{
+	return EMULATE_FAIL;
+}
+
+static int kvmppc_core_emulate_mtspr_hv(struct kvm_vcpu *vcpu, int sprn,
+					ulong spr_val)
+{
+	return EMULATE_FAIL;
+}
+
+static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn,
+					ulong *spr_val)
+{
+	return EMULATE_FAIL;
+}
+
+static int kvmppc_core_check_processor_compat_hv(void)
+{
+	if (!cpu_has_feature(CPU_FTR_HVMODE) ||
+	    !cpu_has_feature(CPU_FTR_ARCH_206))
+		return -EIO;
+	return 0;
+}
+
+static long kvm_arch_vm_ioctl_hv(struct file *filp,
+				 unsigned int ioctl, unsigned long arg)
+{
+	struct kvm *kvm __maybe_unused = filp->private_data;
+	void __user *argp = (void __user *)arg;
+	long r;
+
+	switch (ioctl) {
+
+	case KVM_PPC_ALLOCATE_HTAB: {
+		u32 htab_order;
+
+		r = -EFAULT;
+		if (get_user(htab_order, (u32 __user *)argp))
+			break;
+		r = kvmppc_alloc_reset_hpt(kvm, &htab_order);
+		if (r)
+			break;
+		r = -EFAULT;
+		if (put_user(htab_order, (u32 __user *)argp))
+			break;
+		r = 0;
+		break;
+	}
+
+	case KVM_PPC_GET_HTAB_FD: {
+		struct kvm_get_htab_fd ghf;
+
+		r = -EFAULT;
+		if (copy_from_user(&ghf, argp, sizeof(ghf)))
+			break;
+		r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf);
+		break;
+	}
+
+	default:
+		r = -ENOTTY;
+	}
+
+	return r;
+}
+
+/*
+ * List of hcall numbers to enable by default.
+ * For compatibility with old userspace, we enable by default
+ * all hcalls that were implemented before the hcall-enabling
+ * facility was added.  Note this list should not include H_RTAS.
+ */
+static unsigned int default_hcall_list[] = {
+	H_REMOVE,
+	H_ENTER,
+	H_READ,
+	H_PROTECT,
+	H_BULK_REMOVE,
+	H_GET_TCE,
+	H_PUT_TCE,
+	H_SET_DABR,
+	H_SET_XDABR,
+	H_CEDE,
+	H_PROD,
+	H_CONFER,
+	H_REGISTER_VPA,
+#ifdef CONFIG_KVM_XICS
+	H_EOI,
+	H_CPPR,
+	H_IPI,
+	H_IPOLL,
+	H_XIRR,
+	H_XIRR_X,
+#endif
+	0
+};
+
+static void init_default_hcalls(void)
+{
+	int i;
+	unsigned int hcall;
+
+	for (i = 0; default_hcall_list[i]; ++i) {
+		hcall = default_hcall_list[i];
+		WARN_ON(!kvmppc_hcall_impl_hv(hcall));
+		__set_bit(hcall / 4, default_enabled_hcalls);
+	}
+}
+
+static struct kvmppc_ops kvm_ops_hv = {
+	.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
+	.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
+	.get_one_reg = kvmppc_get_one_reg_hv,
+	.set_one_reg = kvmppc_set_one_reg_hv,
+	.vcpu_load   = kvmppc_core_vcpu_load_hv,
+	.vcpu_put    = kvmppc_core_vcpu_put_hv,
+	.set_msr     = kvmppc_set_msr_hv,
+	.vcpu_run    = kvmppc_vcpu_run_hv,
+	.vcpu_create = kvmppc_core_vcpu_create_hv,
+	.vcpu_free   = kvmppc_core_vcpu_free_hv,
+	.check_requests = kvmppc_core_check_requests_hv,
+	.get_dirty_log  = kvm_vm_ioctl_get_dirty_log_hv,
+	.flush_memslot  = kvmppc_core_flush_memslot_hv,
+	.prepare_memory_region = kvmppc_core_prepare_memory_region_hv,
+	.commit_memory_region  = kvmppc_core_commit_memory_region_hv,
+	.unmap_hva = kvm_unmap_hva_hv,
+	.unmap_hva_range = kvm_unmap_hva_range_hv,
+	.age_hva  = kvm_age_hva_hv,
+	.test_age_hva = kvm_test_age_hva_hv,
+	.set_spte_hva = kvm_set_spte_hva_hv,
+	.mmu_destroy  = kvmppc_mmu_destroy_hv,
+	.free_memslot = kvmppc_core_free_memslot_hv,
+	.create_memslot = kvmppc_core_create_memslot_hv,
+	.init_vm =  kvmppc_core_init_vm_hv,
+	.destroy_vm = kvmppc_core_destroy_vm_hv,
+	.get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv,
+	.emulate_op = kvmppc_core_emulate_op_hv,
+	.emulate_mtspr = kvmppc_core_emulate_mtspr_hv,
+	.emulate_mfspr = kvmppc_core_emulate_mfspr_hv,
+	.fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv,
+	.arch_vm_ioctl  = kvm_arch_vm_ioctl_hv,
+	.hcall_implemented = kvmppc_hcall_impl_hv,
+};
+
+static int kvmppc_book3s_init_hv(void)
+{
+	int r;
+	/*
+	 * FIXME!! Do we need to check on all cpus ?
+	 */
+	r = kvmppc_core_check_processor_compat_hv();
+	if (r < 0)
+		return -ENODEV;
+
+	kvm_ops_hv.owner = THIS_MODULE;
+	kvmppc_hv_ops = &kvm_ops_hv;
+
+	init_default_hcalls();
+
+	r = kvmppc_mmu_hv_init();
+	return r;
+}
+
+static void kvmppc_book3s_exit_hv(void)
+{
+	kvmppc_hv_ops = NULL;
+}
+
+module_init(kvmppc_book3s_init_hv);
+module_exit(kvmppc_book3s_exit_hv);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_MISCDEV(KVM_MINOR);
+MODULE_ALIAS("devname:kvm");
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
new file mode 100644
index 000000000..ed2589d45
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -0,0 +1,261 @@
+/*
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/cpu.h>
+#include <linux/kvm_host.h>
+#include <linux/preempt.h>
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+#include <linux/sizes.h>
+#include <linux/cma.h>
+#include <linux/bitops.h>
+
+#include <asm/cputable.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/archrandom.h>
+#include <asm/xics.h>
+#include <asm/dbell.h>
+#include <asm/cputhreads.h>
+
+#define KVM_CMA_CHUNK_ORDER	18
+
+/*
+ * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206)
+ * should be power of 2.
+ */
+#define HPT_ALIGN_PAGES		((1 << 18) >> PAGE_SHIFT) /* 256k */
+/*
+ * By default we reserve 5% of memory for hash pagetable allocation.
+ */
+static unsigned long kvm_cma_resv_ratio = 5;
+
+static struct cma *kvm_cma;
+
+static int __init early_parse_kvm_cma_resv(char *p)
+{
+	pr_debug("%s(%s)\n", __func__, p);
+	if (!p)
+		return -EINVAL;
+	return kstrtoul(p, 0, &kvm_cma_resv_ratio);
+}
+early_param("kvm_cma_resv_ratio", early_parse_kvm_cma_resv);
+
+struct page *kvm_alloc_hpt(unsigned long nr_pages)
+{
+	VM_BUG_ON(order_base_2(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
+
+	return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES));
+}
+EXPORT_SYMBOL_GPL(kvm_alloc_hpt);
+
+void kvm_release_hpt(struct page *page, unsigned long nr_pages)
+{
+	cma_release(kvm_cma, page, nr_pages);
+}
+EXPORT_SYMBOL_GPL(kvm_release_hpt);
+
+/**
+ * kvm_cma_reserve() - reserve area for kvm hash pagetable
+ *
+ * This function reserves memory from early allocator. It should be
+ * called by arch specific code once the memblock allocator
+ * has been activated and all other subsystems have already allocated/reserved
+ * memory.
+ */
+void __init kvm_cma_reserve(void)
+{
+	unsigned long align_size;
+	struct memblock_region *reg;
+	phys_addr_t selected_size = 0;
+
+	/*
+	 * We need CMA reservation only when we are in HV mode
+	 */
+	if (!cpu_has_feature(CPU_FTR_HVMODE))
+		return;
+	/*
+	 * We cannot use memblock_phys_mem_size() here, because
+	 * memblock_analyze() has not been called yet.
+	 */
+	for_each_memblock(memory, reg)
+		selected_size += memblock_region_memory_end_pfn(reg) -
+				 memblock_region_memory_base_pfn(reg);
+
+	selected_size = (selected_size * kvm_cma_resv_ratio / 100) << PAGE_SHIFT;
+	if (selected_size) {
+		pr_debug("%s: reserving %ld MiB for global area\n", __func__,
+			 (unsigned long)selected_size / SZ_1M);
+		align_size = HPT_ALIGN_PAGES << PAGE_SHIFT;
+		cma_declare_contiguous(0, selected_size, 0, align_size,
+			KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, false, &kvm_cma);
+	}
+}
+
+/*
+ * Real-mode H_CONFER implementation.
+ * We check if we are the only vcpu out of this virtual core
+ * still running in the guest and not ceded.  If so, we pop up
+ * to the virtual-mode implementation; if not, just return to
+ * the guest.
+ */
+long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
+			    unsigned int yield_count)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+	int threads_running;
+	int threads_ceded;
+	int threads_conferring;
+	u64 stop = get_tb() + 10 * tb_ticks_per_usec;
+	int rv = H_SUCCESS; /* => don't yield */
+
+	set_bit(vcpu->arch.ptid, &vc->conferring_threads);
+	while ((get_tb() < stop) && !VCORE_IS_EXITING(vc)) {
+		threads_running = VCORE_ENTRY_MAP(vc);
+		threads_ceded = vc->napping_threads;
+		threads_conferring = vc->conferring_threads;
+		if ((threads_ceded | threads_conferring) == threads_running) {
+			rv = H_TOO_HARD; /* => do yield */
+			break;
+		}
+	}
+	clear_bit(vcpu->arch.ptid, &vc->conferring_threads);
+	return rv;
+}
+
+/*
+ * When running HV mode KVM we need to block certain operations while KVM VMs
+ * exist in the system. We use a counter of VMs to track this.
+ *
+ * One of the operations we need to block is onlining of secondaries, so we
+ * protect hv_vm_count with get/put_online_cpus().
+ */
+static atomic_t hv_vm_count;
+
+void kvm_hv_vm_activated(void)
+{
+	get_online_cpus();
+	atomic_inc(&hv_vm_count);
+	put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(kvm_hv_vm_activated);
+
+void kvm_hv_vm_deactivated(void)
+{
+	get_online_cpus();
+	atomic_dec(&hv_vm_count);
+	put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(kvm_hv_vm_deactivated);
+
+bool kvm_hv_mode_active(void)
+{
+	return atomic_read(&hv_vm_count) != 0;
+}
+
+extern int hcall_real_table[], hcall_real_table_end[];
+
+int kvmppc_hcall_impl_hv_realmode(unsigned long cmd)
+{
+	cmd /= 4;
+	if (cmd < hcall_real_table_end - hcall_real_table &&
+	    hcall_real_table[cmd])
+		return 1;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_hcall_impl_hv_realmode);
+
+int kvmppc_hwrng_present(void)
+{
+	return powernv_hwrng_present();
+}
+EXPORT_SYMBOL_GPL(kvmppc_hwrng_present);
+
+long kvmppc_h_random(struct kvm_vcpu *vcpu)
+{
+	if (powernv_get_random_real_mode(&vcpu->arch.gpr[4]))
+		return H_SUCCESS;
+
+	return H_HARDWARE;
+}
+
+static inline void rm_writeb(unsigned long paddr, u8 val)
+{
+	__asm__ __volatile__("stbcix %0,0,%1"
+		: : "r" (val), "r" (paddr) : "memory");
+}
+
+/*
+ * Send an interrupt or message to another CPU.
+ * This can only be called in real mode.
+ * The caller needs to include any barrier needed to order writes
+ * to memory vs. the IPI/message.
+ */
+void kvmhv_rm_send_ipi(int cpu)
+{
+	unsigned long xics_phys;
+
+	/* On POWER8 for IPIs to threads in the same core, use msgsnd */
+	if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+	    cpu_first_thread_sibling(cpu) ==
+	    cpu_first_thread_sibling(raw_smp_processor_id())) {
+		unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
+		msg |= cpu_thread_in_core(cpu);
+		__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
+		return;
+	}
+
+	/* Else poke the target with an IPI */
+	xics_phys = paca[cpu].kvm_hstate.xics_phys;
+	rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
+}
+
+/*
+ * The following functions are called from the assembly code
+ * in book3s_hv_rmhandlers.S.
+ */
+static void kvmhv_interrupt_vcore(struct kvmppc_vcore *vc, int active)
+{
+	int cpu = vc->pcpu;
+
+	/* Order setting of exit map vs. msgsnd/IPI */
+	smp_mb();
+	for (; active; active >>= 1, ++cpu)
+		if (active & 1)
+			kvmhv_rm_send_ipi(cpu);
+}
+
+void kvmhv_commence_exit(int trap)
+{
+	struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
+	int ptid = local_paca->kvm_hstate.ptid;
+	int me, ee;
+
+	/* Set our bit in the threads-exiting-guest map in the 0xff00
+	   bits of vcore->entry_exit_map */
+	me = 0x100 << ptid;
+	do {
+		ee = vc->entry_exit_map;
+	} while (cmpxchg(&vc->entry_exit_map, ee, ee | me) != ee);
+
+	/* Are we the first here? */
+	if ((ee >> 8) != 0)
+		return;
+
+	/*
+	 * Trigger the other threads in this vcore to exit the guest.
+	 * If this is a hypervisor decrementer interrupt then they
+	 * will be already on their way out of the guest.
+	 */
+	if (trap != BOOK3S_INTERRUPT_HV_DECREMENTER)
+		kvmhv_interrupt_vcore(vc, ee & ~(1 << ptid));
+}
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
new file mode 100644
index 000000000..0fdc4a289
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -0,0 +1,158 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * Derived from book3s_interrupts.S, which is:
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include <asm/exception-64s.h>
+#include <asm/ppc-opcode.h>
+
+/*****************************************************************************
+ *                                                                           *
+ *     Guest entry / exit code that is in kernel module memory (vmalloc)     *
+ *                                                                           *
+ ****************************************************************************/
+
+/* Registers:
+ *  none
+ */
+_GLOBAL(__kvmppc_vcore_entry)
+
+	/* Write correct stack frame */
+	mflr	r0
+	std	r0,PPC_LR_STKOFF(r1)
+
+	/* Save host state to the stack */
+	stdu	r1, -SWITCH_FRAME_SIZE(r1)
+
+	/* Save non-volatile registers (r14 - r31) and CR */
+	SAVE_NVGPRS(r1)
+	mfcr	r3
+	std	r3, _CCR(r1)
+
+	/* Save host DSCR */
+	mfspr	r3, SPRN_DSCR
+	std	r3, HSTATE_DSCR(r13)
+
+BEGIN_FTR_SECTION
+	/* Save host DABR */
+	mfspr	r3, SPRN_DABR
+	std	r3, HSTATE_DABR(r13)
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
+
+	/* Hard-disable interrupts */
+	mfmsr   r10
+	std	r10, HSTATE_HOST_MSR(r13)
+	rldicl  r10,r10,48,1
+	rotldi  r10,r10,16
+	mtmsrd  r10,1
+
+	/* Save host PMU registers */
+BEGIN_FTR_SECTION
+	/* Work around P8 PMAE bug */
+	li	r3, -1
+	clrrdi	r3, r3, 10
+	mfspr	r8, SPRN_MMCR2
+	mtspr	SPRN_MMCR2, r3		/* freeze all counters using MMCR2 */
+	isync
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	li	r3, 1
+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
+	mfspr	r7, SPRN_MMCR0		/* save MMCR0 */
+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable interrupts */
+	mfspr	r6, SPRN_MMCRA
+	/* Clear MMCRA in order to disable SDAR updates */
+	li	r5, 0
+	mtspr	SPRN_MMCRA, r5
+	isync
+	ld	r3, PACALPPACAPTR(r13)	/* is the host using the PMU? */
+	lbz	r5, LPPACA_PMCINUSE(r3)
+	cmpwi	r5, 0
+	beq	31f			/* skip if not */
+	mfspr	r5, SPRN_MMCR1
+	mfspr	r9, SPRN_SIAR
+	mfspr	r10, SPRN_SDAR
+	std	r7, HSTATE_MMCR0(r13)
+	std	r5, HSTATE_MMCR1(r13)
+	std	r6, HSTATE_MMCRA(r13)
+	std	r9, HSTATE_SIAR(r13)
+	std	r10, HSTATE_SDAR(r13)
+BEGIN_FTR_SECTION
+	mfspr	r9, SPRN_SIER
+	std	r8, HSTATE_MMCR2(r13)
+	std	r9, HSTATE_SIER(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	mfspr	r3, SPRN_PMC1
+	mfspr	r5, SPRN_PMC2
+	mfspr	r6, SPRN_PMC3
+	mfspr	r7, SPRN_PMC4
+	mfspr	r8, SPRN_PMC5
+	mfspr	r9, SPRN_PMC6
+	stw	r3, HSTATE_PMC1(r13)
+	stw	r5, HSTATE_PMC2(r13)
+	stw	r6, HSTATE_PMC3(r13)
+	stw	r7, HSTATE_PMC4(r13)
+	stw	r8, HSTATE_PMC5(r13)
+	stw	r9, HSTATE_PMC6(r13)
+31:
+
+	/*
+	 * Put whatever is in the decrementer into the
+	 * hypervisor decrementer.
+	 */
+	mfspr	r8,SPRN_DEC
+	mftb	r7
+	mtspr	SPRN_HDEC,r8
+	extsw	r8,r8
+	add	r8,r8,r7
+	std	r8,HSTATE_DECEXP(r13)
+
+	/* Jump to partition switch code */
+	bl	kvmppc_hv_entry_trampoline
+	nop
+
+/*
+ * We return here in virtual mode after the guest exits
+ * with something that we can't handle in real mode.
+ * Interrupts are enabled again at this point.
+ */
+
+	/*
+	 * Register usage at this point:
+	 *
+	 * R1       = host R1
+	 * R2       = host R2
+	 * R12      = exit handler id
+	 * R13      = PACA
+	 */
+
+	/* Restore non-volatile host registers (r14 - r31) and CR */
+	REST_NVGPRS(r1)
+	ld	r4, _CCR(r1)
+	mtcr	r4
+
+	addi    r1, r1, SWITCH_FRAME_SIZE
+	ld	r0, PPC_LR_STKOFF(r1)
+	mtlr	r0
+	blr
diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
new file mode 100644
index 000000000..93b5f5c9b
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -0,0 +1,142 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * Copyright 2012 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/kernel.h>
+#include <asm/opal.h>
+#include <asm/mce.h>
+
+/* SRR1 bits for machine check on POWER7 */
+#define SRR1_MC_LDSTERR		(1ul << (63-42))
+#define SRR1_MC_IFETCH_SH	(63-45)
+#define SRR1_MC_IFETCH_MASK	0x7
+#define SRR1_MC_IFETCH_SLBPAR		2	/* SLB parity error */
+#define SRR1_MC_IFETCH_SLBMULTI		3	/* SLB multi-hit */
+#define SRR1_MC_IFETCH_SLBPARMULTI	4	/* SLB parity + multi-hit */
+#define SRR1_MC_IFETCH_TLBMULTI		5	/* I-TLB multi-hit */
+
+/* DSISR bits for machine check on POWER7 */
+#define DSISR_MC_DERAT_MULTI	0x800		/* D-ERAT multi-hit */
+#define DSISR_MC_TLB_MULTI	0x400		/* D-TLB multi-hit */
+#define DSISR_MC_SLB_PARITY	0x100		/* SLB parity error */
+#define DSISR_MC_SLB_MULTI	0x080		/* SLB multi-hit */
+#define DSISR_MC_SLB_PARMULTI	0x040		/* SLB parity + multi-hit */
+
+/* POWER7 SLB flush and reload */
+static void reload_slb(struct kvm_vcpu *vcpu)
+{
+	struct slb_shadow *slb;
+	unsigned long i, n;
+
+	/* First clear out SLB */
+	asm volatile("slbmte %0,%0; slbia" : : "r" (0));
+
+	/* Do they have an SLB shadow buffer registered? */
+	slb = vcpu->arch.slb_shadow.pinned_addr;
+	if (!slb)
+		return;
+
+	/* Sanity check */
+	n = min_t(u32, be32_to_cpu(slb->persistent), SLB_MIN_SIZE);
+	if ((void *) &slb->save_area[n] > vcpu->arch.slb_shadow.pinned_end)
+		return;
+
+	/* Load up the SLB from that */
+	for (i = 0; i < n; ++i) {
+		unsigned long rb = be64_to_cpu(slb->save_area[i].esid);
+		unsigned long rs = be64_to_cpu(slb->save_area[i].vsid);
+
+		rb = (rb & ~0xFFFul) | i;	/* insert entry number */
+		asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb));
+	}
+}
+
+/*
+ * On POWER7, see if we can handle a machine check that occurred inside
+ * the guest in real mode, without switching to the host partition.
+ *
+ * Returns: 0 => exit guest, 1 => deliver machine check to guest
+ */
+static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu)
+{
+	unsigned long srr1 = vcpu->arch.shregs.msr;
+	struct machine_check_event mce_evt;
+	long handled = 1;
+
+	if (srr1 & SRR1_MC_LDSTERR) {
+		/* error on load/store */
+		unsigned long dsisr = vcpu->arch.shregs.dsisr;
+
+		if (dsisr & (DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI |
+			     DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI)) {
+			/* flush and reload SLB; flushes D-ERAT too */
+			reload_slb(vcpu);
+			dsisr &= ~(DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI |
+				   DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI);
+		}
+		if (dsisr & DSISR_MC_TLB_MULTI) {
+			if (cur_cpu_spec && cur_cpu_spec->flush_tlb)
+				cur_cpu_spec->flush_tlb(TLB_INVAL_SCOPE_LPID);
+			dsisr &= ~DSISR_MC_TLB_MULTI;
+		}
+		/* Any other errors we don't understand? */
+		if (dsisr & 0xffffffffUL)
+			handled = 0;
+	}
+
+	switch ((srr1 >> SRR1_MC_IFETCH_SH) & SRR1_MC_IFETCH_MASK) {
+	case 0:
+		break;
+	case SRR1_MC_IFETCH_SLBPAR:
+	case SRR1_MC_IFETCH_SLBMULTI:
+	case SRR1_MC_IFETCH_SLBPARMULTI:
+		reload_slb(vcpu);
+		break;
+	case SRR1_MC_IFETCH_TLBMULTI:
+		if (cur_cpu_spec && cur_cpu_spec->flush_tlb)
+			cur_cpu_spec->flush_tlb(TLB_INVAL_SCOPE_LPID);
+		break;
+	default:
+		handled = 0;
+	}
+
+	/*
+	 * See if we have already handled the condition in the linux host.
+	 * We assume that if the condition is recovered then linux host
+	 * will have generated an error log event that we will pick
+	 * up and log later.
+	 * Don't release mce event now. We will queue up the event so that
+	 * we can log the MCE event info on host console.
+	 */
+	if (!get_mce_event(&mce_evt, MCE_EVENT_DONTRELEASE))
+		goto out;
+
+	if (mce_evt.version == MCE_V1 &&
+	    (mce_evt.severity == MCE_SEV_NO_ERROR ||
+	     mce_evt.disposition == MCE_DISPOSITION_RECOVERED))
+		handled = 1;
+
+out:
+	/*
+	 * We are now going enter guest either through machine check
+	 * interrupt (for unhandled errors) or will continue from
+	 * current HSRR0 (for handled errors) in guest. Hence
+	 * queue up the event so that we can log it from host console later.
+	 */
+	machine_check_queue_event();
+
+	return handled;
+}
+
+long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu)
+{
+	return kvmppc_realmode_mc_power7(vcpu);
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
new file mode 100644
index 000000000..b027a8973
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -0,0 +1,857 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * Copyright 2010-2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/hugetlb.h>
+#include <linux/module.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+
+/* Translate address of a vmalloc'd thing to a linear map address */
+static void *real_vmalloc_addr(void *x)
+{
+	unsigned long addr = (unsigned long) x;
+	pte_t *p;
+	/*
+	 * assume we don't have huge pages in vmalloc space...
+	 * So don't worry about THP collapse/split. Called
+	 * Only in realmode, hence won't need irq_save/restore.
+	 */
+	p = __find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
+	if (!p || !pte_present(*p))
+		return NULL;
+	addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
+	return __va(addr);
+}
+
+/* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */
+static int global_invalidates(struct kvm *kvm, unsigned long flags)
+{
+	int global;
+
+	/*
+	 * If there is only one vcore, and it's currently running,
+	 * as indicated by local_paca->kvm_hstate.kvm_vcpu being set,
+	 * we can use tlbiel as long as we mark all other physical
+	 * cores as potentially having stale TLB entries for this lpid.
+	 * Otherwise, don't use tlbiel.
+	 */
+	if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcpu)
+		global = 0;
+	else
+		global = 1;
+
+	if (!global) {
+		/* any other core might now have stale TLB entries... */
+		smp_wmb();
+		cpumask_setall(&kvm->arch.need_tlb_flush);
+		cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu,
+				  &kvm->arch.need_tlb_flush);
+	}
+
+	return global;
+}
+
+/*
+ * Add this HPTE into the chain for the real page.
+ * Must be called with the chain locked; it unlocks the chain.
+ */
+void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
+			     unsigned long *rmap, long pte_index, int realmode)
+{
+	struct revmap_entry *head, *tail;
+	unsigned long i;
+
+	if (*rmap & KVMPPC_RMAP_PRESENT) {
+		i = *rmap & KVMPPC_RMAP_INDEX;
+		head = &kvm->arch.revmap[i];
+		if (realmode)
+			head = real_vmalloc_addr(head);
+		tail = &kvm->arch.revmap[head->back];
+		if (realmode)
+			tail = real_vmalloc_addr(tail);
+		rev->forw = i;
+		rev->back = head->back;
+		tail->forw = pte_index;
+		head->back = pte_index;
+	} else {
+		rev->forw = rev->back = pte_index;
+		*rmap = (*rmap & ~KVMPPC_RMAP_INDEX) |
+			pte_index | KVMPPC_RMAP_PRESENT;
+	}
+	unlock_rmap(rmap);
+}
+EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
+
+/* Remove this HPTE from the chain for a real page */
+static void remove_revmap_chain(struct kvm *kvm, long pte_index,
+				struct revmap_entry *rev,
+				unsigned long hpte_v, unsigned long hpte_r)
+{
+	struct revmap_entry *next, *prev;
+	unsigned long gfn, ptel, head;
+	struct kvm_memory_slot *memslot;
+	unsigned long *rmap;
+	unsigned long rcbits;
+
+	rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
+	ptel = rev->guest_rpte |= rcbits;
+	gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
+	memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
+	if (!memslot)
+		return;
+
+	rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
+	lock_rmap(rmap);
+
+	head = *rmap & KVMPPC_RMAP_INDEX;
+	next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]);
+	prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]);
+	next->back = rev->back;
+	prev->forw = rev->forw;
+	if (head == pte_index) {
+		head = rev->forw;
+		if (head == pte_index)
+			*rmap &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
+		else
+			*rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head;
+	}
+	*rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
+	unlock_rmap(rmap);
+}
+
+long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
+		       long pte_index, unsigned long pteh, unsigned long ptel,
+		       pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret)
+{
+	unsigned long i, pa, gpa, gfn, psize;
+	unsigned long slot_fn, hva;
+	__be64 *hpte;
+	struct revmap_entry *rev;
+	unsigned long g_ptel;
+	struct kvm_memory_slot *memslot;
+	unsigned hpage_shift;
+	unsigned long is_io;
+	unsigned long *rmap;
+	pte_t *ptep;
+	unsigned int writing;
+	unsigned long mmu_seq;
+	unsigned long rcbits, irq_flags = 0;
+
+	psize = hpte_page_size(pteh, ptel);
+	if (!psize)
+		return H_PARAMETER;
+	writing = hpte_is_writable(ptel);
+	pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);
+	ptel &= ~HPTE_GR_RESERVED;
+	g_ptel = ptel;
+
+	/* used later to detect if we might have been invalidated */
+	mmu_seq = kvm->mmu_notifier_seq;
+	smp_rmb();
+
+	/* Find the memslot (if any) for this address */
+	gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
+	gfn = gpa >> PAGE_SHIFT;
+	memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
+	pa = 0;
+	is_io = ~0ul;
+	rmap = NULL;
+	if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) {
+		/* Emulated MMIO - mark this with key=31 */
+		pteh |= HPTE_V_ABSENT;
+		ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO;
+		goto do_insert;
+	}
+
+	/* Check if the requested page fits entirely in the memslot. */
+	if (!slot_is_aligned(memslot, psize))
+		return H_PARAMETER;
+	slot_fn = gfn - memslot->base_gfn;
+	rmap = &memslot->arch.rmap[slot_fn];
+
+	/* Translate to host virtual address */
+	hva = __gfn_to_hva_memslot(memslot, gfn);
+	/*
+	 * If we had a page table table change after lookup, we would
+	 * retry via mmu_notifier_retry.
+	 */
+	if (realmode)
+		ptep = __find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift);
+	else {
+		local_irq_save(irq_flags);
+		ptep = find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift);
+	}
+	if (ptep) {
+		pte_t pte;
+		unsigned int host_pte_size;
+
+		if (hpage_shift)
+			host_pte_size = 1ul << hpage_shift;
+		else
+			host_pte_size = PAGE_SIZE;
+		/*
+		 * We should always find the guest page size
+		 * to <= host page size, if host is using hugepage
+		 */
+		if (host_pte_size < psize) {
+			if (!realmode)
+				local_irq_restore(flags);
+			return H_PARAMETER;
+		}
+		pte = kvmppc_read_update_linux_pte(ptep, writing);
+		if (pte_present(pte) && !pte_protnone(pte)) {
+			if (writing && !pte_write(pte))
+				/* make the actual HPTE be read-only */
+				ptel = hpte_make_readonly(ptel);
+			is_io = hpte_cache_bits(pte_val(pte));
+			pa = pte_pfn(pte) << PAGE_SHIFT;
+			pa |= hva & (host_pte_size - 1);
+			pa |= gpa & ~PAGE_MASK;
+		}
+	}
+	if (!realmode)
+		local_irq_restore(irq_flags);
+
+	ptel &= ~(HPTE_R_PP0 - psize);
+	ptel |= pa;
+
+	if (pa)
+		pteh |= HPTE_V_VALID;
+	else
+		pteh |= HPTE_V_ABSENT;
+
+	/* Check WIMG */
+	if (is_io != ~0ul && !hpte_cache_flags_ok(ptel, is_io)) {
+		if (is_io)
+			return H_PARAMETER;
+		/*
+		 * Allow guest to map emulated device memory as
+		 * uncacheable, but actually make it cacheable.
+		 */
+		ptel &= ~(HPTE_R_W|HPTE_R_I|HPTE_R_G);
+		ptel |= HPTE_R_M;
+	}
+
+	/* Find and lock the HPTEG slot to use */
+ do_insert:
+	if (pte_index >= kvm->arch.hpt_npte)
+		return H_PARAMETER;
+	if (likely((flags & H_EXACT) == 0)) {
+		pte_index &= ~7UL;
+		hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+		for (i = 0; i < 8; ++i) {
+			if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0 &&
+			    try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
+					  HPTE_V_ABSENT))
+				break;
+			hpte += 2;
+		}
+		if (i == 8) {
+			/*
+			 * Since try_lock_hpte doesn't retry (not even stdcx.
+			 * failures), it could be that there is a free slot
+			 * but we transiently failed to lock it.  Try again,
+			 * actually locking each slot and checking it.
+			 */
+			hpte -= 16;
+			for (i = 0; i < 8; ++i) {
+				u64 pte;
+				while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
+					cpu_relax();
+				pte = be64_to_cpu(hpte[0]);
+				if (!(pte & (HPTE_V_VALID | HPTE_V_ABSENT)))
+					break;
+				__unlock_hpte(hpte, pte);
+				hpte += 2;
+			}
+			if (i == 8)
+				return H_PTEG_FULL;
+		}
+		pte_index += i;
+	} else {
+		hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+		if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
+				   HPTE_V_ABSENT)) {
+			/* Lock the slot and check again */
+			u64 pte;
+
+			while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
+				cpu_relax();
+			pte = be64_to_cpu(hpte[0]);
+			if (pte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
+				__unlock_hpte(hpte, pte);
+				return H_PTEG_FULL;
+			}
+		}
+	}
+
+	/* Save away the guest's idea of the second HPTE dword */
+	rev = &kvm->arch.revmap[pte_index];
+	if (realmode)
+		rev = real_vmalloc_addr(rev);
+	if (rev) {
+		rev->guest_rpte = g_ptel;
+		note_hpte_modification(kvm, rev);
+	}
+
+	/* Link HPTE into reverse-map chain */
+	if (pteh & HPTE_V_VALID) {
+		if (realmode)
+			rmap = real_vmalloc_addr(rmap);
+		lock_rmap(rmap);
+		/* Check for pending invalidations under the rmap chain lock */
+		if (mmu_notifier_retry(kvm, mmu_seq)) {
+			/* inval in progress, write a non-present HPTE */
+			pteh |= HPTE_V_ABSENT;
+			pteh &= ~HPTE_V_VALID;
+			unlock_rmap(rmap);
+		} else {
+			kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index,
+						realmode);
+			/* Only set R/C in real HPTE if already set in *rmap */
+			rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
+			ptel &= rcbits | ~(HPTE_R_R | HPTE_R_C);
+		}
+	}
+
+	hpte[1] = cpu_to_be64(ptel);
+
+	/* Write the first HPTE dword, unlocking the HPTE and making it valid */
+	eieio();
+	__unlock_hpte(hpte, pteh);
+	asm volatile("ptesync" : : : "memory");
+
+	*pte_idx_ret = pte_index;
+	return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_do_h_enter);
+
+long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
+		    long pte_index, unsigned long pteh, unsigned long ptel)
+{
+	return kvmppc_do_h_enter(vcpu->kvm, flags, pte_index, pteh, ptel,
+				 vcpu->arch.pgdir, true, &vcpu->arch.gpr[4]);
+}
+
+#ifdef __BIG_ENDIAN__
+#define LOCK_TOKEN	(*(u32 *)(&get_paca()->lock_token))
+#else
+#define LOCK_TOKEN	(*(u32 *)(&get_paca()->paca_index))
+#endif
+
+static inline int try_lock_tlbie(unsigned int *lock)
+{
+	unsigned int tmp, old;
+	unsigned int token = LOCK_TOKEN;
+
+	asm volatile("1:lwarx	%1,0,%2\n"
+		     "	cmpwi	cr0,%1,0\n"
+		     "	bne	2f\n"
+		     "  stwcx.	%3,0,%2\n"
+		     "	bne-	1b\n"
+		     "  isync\n"
+		     "2:"
+		     : "=&r" (tmp), "=&r" (old)
+		     : "r" (lock), "r" (token)
+		     : "cc", "memory");
+	return old == 0;
+}
+
+static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues,
+		      long npages, int global, bool need_sync)
+{
+	long i;
+
+	if (global) {
+		while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
+			cpu_relax();
+		if (need_sync)
+			asm volatile("ptesync" : : : "memory");
+		for (i = 0; i < npages; ++i)
+			asm volatile(PPC_TLBIE(%1,%0) : :
+				     "r" (rbvalues[i]), "r" (kvm->arch.lpid));
+		asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+		kvm->arch.tlbie_lock = 0;
+	} else {
+		if (need_sync)
+			asm volatile("ptesync" : : : "memory");
+		for (i = 0; i < npages; ++i)
+			asm volatile("tlbiel %0" : : "r" (rbvalues[i]));
+		asm volatile("ptesync" : : : "memory");
+	}
+}
+
+long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
+			unsigned long pte_index, unsigned long avpn,
+			unsigned long *hpret)
+{
+	__be64 *hpte;
+	unsigned long v, r, rb;
+	struct revmap_entry *rev;
+	u64 pte;
+
+	if (pte_index >= kvm->arch.hpt_npte)
+		return H_PARAMETER;
+	hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
+		cpu_relax();
+	pte = be64_to_cpu(hpte[0]);
+	if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
+	    ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn) ||
+	    ((flags & H_ANDCOND) && (pte & avpn) != 0)) {
+		__unlock_hpte(hpte, pte);
+		return H_NOT_FOUND;
+	}
+
+	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+	v = pte & ~HPTE_V_HVLOCK;
+	if (v & HPTE_V_VALID) {
+		u64 pte1;
+
+		pte1 = be64_to_cpu(hpte[1]);
+		hpte[0] &= ~cpu_to_be64(HPTE_V_VALID);
+		rb = compute_tlbie_rb(v, pte1, pte_index);
+		do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
+		/* Read PTE low word after tlbie to get final R/C values */
+		remove_revmap_chain(kvm, pte_index, rev, v, pte1);
+	}
+	r = rev->guest_rpte & ~HPTE_GR_RESERVED;
+	note_hpte_modification(kvm, rev);
+	unlock_hpte(hpte, 0);
+
+	hpret[0] = v;
+	hpret[1] = r;
+	return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_do_h_remove);
+
+long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
+		     unsigned long pte_index, unsigned long avpn)
+{
+	return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn,
+				  &vcpu->arch.gpr[4]);
+}
+
+long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long *args = &vcpu->arch.gpr[4];
+	__be64 *hp, *hptes[4];
+	unsigned long tlbrb[4];
+	long int i, j, k, n, found, indexes[4];
+	unsigned long flags, req, pte_index, rcbits;
+	int global;
+	long int ret = H_SUCCESS;
+	struct revmap_entry *rev, *revs[4];
+	u64 hp0;
+
+	global = global_invalidates(kvm, 0);
+	for (i = 0; i < 4 && ret == H_SUCCESS; ) {
+		n = 0;
+		for (; i < 4; ++i) {
+			j = i * 2;
+			pte_index = args[j];
+			flags = pte_index >> 56;
+			pte_index &= ((1ul << 56) - 1);
+			req = flags >> 6;
+			flags &= 3;
+			if (req == 3) {		/* no more requests */
+				i = 4;
+				break;
+			}
+			if (req != 1 || flags == 3 ||
+			    pte_index >= kvm->arch.hpt_npte) {
+				/* parameter error */
+				args[j] = ((0xa0 | flags) << 56) + pte_index;
+				ret = H_PARAMETER;
+				break;
+			}
+			hp = (__be64 *) (kvm->arch.hpt_virt + (pte_index << 4));
+			/* to avoid deadlock, don't spin except for first */
+			if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) {
+				if (n)
+					break;
+				while (!try_lock_hpte(hp, HPTE_V_HVLOCK))
+					cpu_relax();
+			}
+			found = 0;
+			hp0 = be64_to_cpu(hp[0]);
+			if (hp0 & (HPTE_V_ABSENT | HPTE_V_VALID)) {
+				switch (flags & 3) {
+				case 0:		/* absolute */
+					found = 1;
+					break;
+				case 1:		/* andcond */
+					if (!(hp0 & args[j + 1]))
+						found = 1;
+					break;
+				case 2:		/* AVPN */
+					if ((hp0 & ~0x7fUL) == args[j + 1])
+						found = 1;
+					break;
+				}
+			}
+			if (!found) {
+				hp[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+				args[j] = ((0x90 | flags) << 56) + pte_index;
+				continue;
+			}
+
+			args[j] = ((0x80 | flags) << 56) + pte_index;
+			rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+			note_hpte_modification(kvm, rev);
+
+			if (!(hp0 & HPTE_V_VALID)) {
+				/* insert R and C bits from PTE */
+				rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
+				args[j] |= rcbits << (56 - 5);
+				hp[0] = 0;
+				continue;
+			}
+
+			/* leave it locked */
+			hp[0] &= ~cpu_to_be64(HPTE_V_VALID);
+			tlbrb[n] = compute_tlbie_rb(be64_to_cpu(hp[0]),
+				be64_to_cpu(hp[1]), pte_index);
+			indexes[n] = j;
+			hptes[n] = hp;
+			revs[n] = rev;
+			++n;
+		}
+
+		if (!n)
+			break;
+
+		/* Now that we've collected a batch, do the tlbies */
+		do_tlbies(kvm, tlbrb, n, global, true);
+
+		/* Read PTE low words after tlbie to get final R/C values */
+		for (k = 0; k < n; ++k) {
+			j = indexes[k];
+			pte_index = args[j] & ((1ul << 56) - 1);
+			hp = hptes[k];
+			rev = revs[k];
+			remove_revmap_chain(kvm, pte_index, rev,
+				be64_to_cpu(hp[0]), be64_to_cpu(hp[1]));
+			rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
+			args[j] |= rcbits << (56 - 5);
+			__unlock_hpte(hp, 0);
+		}
+	}
+
+	return ret;
+}
+
+long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
+		      unsigned long pte_index, unsigned long avpn,
+		      unsigned long va)
+{
+	struct kvm *kvm = vcpu->kvm;
+	__be64 *hpte;
+	struct revmap_entry *rev;
+	unsigned long v, r, rb, mask, bits;
+	u64 pte;
+
+	if (pte_index >= kvm->arch.hpt_npte)
+		return H_PARAMETER;
+
+	hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
+		cpu_relax();
+	pte = be64_to_cpu(hpte[0]);
+	if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
+	    ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn)) {
+		__unlock_hpte(hpte, pte);
+		return H_NOT_FOUND;
+	}
+
+	v = pte;
+	bits = (flags << 55) & HPTE_R_PP0;
+	bits |= (flags << 48) & HPTE_R_KEY_HI;
+	bits |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
+
+	/* Update guest view of 2nd HPTE dword */
+	mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
+		HPTE_R_KEY_HI | HPTE_R_KEY_LO;
+	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+	if (rev) {
+		r = (rev->guest_rpte & ~mask) | bits;
+		rev->guest_rpte = r;
+		note_hpte_modification(kvm, rev);
+	}
+
+	/* Update HPTE */
+	if (v & HPTE_V_VALID) {
+		/*
+		 * If the page is valid, don't let it transition from
+		 * readonly to writable.  If it should be writable, we'll
+		 * take a trap and let the page fault code sort it out.
+		 */
+		pte = be64_to_cpu(hpte[1]);
+		r = (pte & ~mask) | bits;
+		if (hpte_is_writable(r) && !hpte_is_writable(pte))
+			r = hpte_make_readonly(r);
+		/* If the PTE is changing, invalidate it first */
+		if (r != pte) {
+			rb = compute_tlbie_rb(v, r, pte_index);
+			hpte[0] = cpu_to_be64((v & ~HPTE_V_VALID) |
+					      HPTE_V_ABSENT);
+			do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags),
+				  true);
+			hpte[1] = cpu_to_be64(r);
+		}
+	}
+	unlock_hpte(hpte, v & ~HPTE_V_HVLOCK);
+	asm volatile("ptesync" : : : "memory");
+	return H_SUCCESS;
+}
+
+long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
+		   unsigned long pte_index)
+{
+	struct kvm *kvm = vcpu->kvm;
+	__be64 *hpte;
+	unsigned long v, r;
+	int i, n = 1;
+	struct revmap_entry *rev = NULL;
+
+	if (pte_index >= kvm->arch.hpt_npte)
+		return H_PARAMETER;
+	if (flags & H_READ_4) {
+		pte_index &= ~3;
+		n = 4;
+	}
+	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+	for (i = 0; i < n; ++i, ++pte_index) {
+		hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+		v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK;
+		r = be64_to_cpu(hpte[1]);
+		if (v & HPTE_V_ABSENT) {
+			v &= ~HPTE_V_ABSENT;
+			v |= HPTE_V_VALID;
+		}
+		if (v & HPTE_V_VALID) {
+			r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C));
+			r &= ~HPTE_GR_RESERVED;
+		}
+		vcpu->arch.gpr[4 + i * 2] = v;
+		vcpu->arch.gpr[5 + i * 2] = r;
+	}
+	return H_SUCCESS;
+}
+
+void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
+			unsigned long pte_index)
+{
+	unsigned long rb;
+
+	hptep[0] &= ~cpu_to_be64(HPTE_V_VALID);
+	rb = compute_tlbie_rb(be64_to_cpu(hptep[0]), be64_to_cpu(hptep[1]),
+			      pte_index);
+	do_tlbies(kvm, &rb, 1, 1, true);
+}
+EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte);
+
+void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep,
+			   unsigned long pte_index)
+{
+	unsigned long rb;
+	unsigned char rbyte;
+
+	rb = compute_tlbie_rb(be64_to_cpu(hptep[0]), be64_to_cpu(hptep[1]),
+			      pte_index);
+	rbyte = (be64_to_cpu(hptep[1]) & ~HPTE_R_R) >> 8;
+	/* modify only the second-last byte, which contains the ref bit */
+	*((char *)hptep + 14) = rbyte;
+	do_tlbies(kvm, &rb, 1, 1, false);
+}
+EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte);
+
+static int slb_base_page_shift[4] = {
+	24,	/* 16M */
+	16,	/* 64k */
+	34,	/* 16G */
+	20,	/* 1M, unsupported */
+};
+
+/* When called from virtmode, this func should be protected by
+ * preempt_disable(), otherwise, the holding of HPTE_V_HVLOCK
+ * can trigger deadlock issue.
+ */
+long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
+			      unsigned long valid)
+{
+	unsigned int i;
+	unsigned int pshift;
+	unsigned long somask;
+	unsigned long vsid, hash;
+	unsigned long avpn;
+	__be64 *hpte;
+	unsigned long mask, val;
+	unsigned long v, r;
+
+	/* Get page shift, work out hash and AVPN etc. */
+	mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY;
+	val = 0;
+	pshift = 12;
+	if (slb_v & SLB_VSID_L) {
+		mask |= HPTE_V_LARGE;
+		val |= HPTE_V_LARGE;
+		pshift = slb_base_page_shift[(slb_v & SLB_VSID_LP) >> 4];
+	}
+	if (slb_v & SLB_VSID_B_1T) {
+		somask = (1UL << 40) - 1;
+		vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T;
+		vsid ^= vsid << 25;
+	} else {
+		somask = (1UL << 28) - 1;
+		vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT;
+	}
+	hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask;
+	avpn = slb_v & ~(somask >> 16);	/* also includes B */
+	avpn |= (eaddr & somask) >> 16;
+
+	if (pshift >= 24)
+		avpn &= ~((1UL << (pshift - 16)) - 1);
+	else
+		avpn &= ~0x7fUL;
+	val |= avpn;
+
+	for (;;) {
+		hpte = (__be64 *)(kvm->arch.hpt_virt + (hash << 7));
+
+		for (i = 0; i < 16; i += 2) {
+			/* Read the PTE racily */
+			v = be64_to_cpu(hpte[i]) & ~HPTE_V_HVLOCK;
+
+			/* Check valid/absent, hash, segment size and AVPN */
+			if (!(v & valid) || (v & mask) != val)
+				continue;
+
+			/* Lock the PTE and read it under the lock */
+			while (!try_lock_hpte(&hpte[i], HPTE_V_HVLOCK))
+				cpu_relax();
+			v = be64_to_cpu(hpte[i]) & ~HPTE_V_HVLOCK;
+			r = be64_to_cpu(hpte[i+1]);
+
+			/*
+			 * Check the HPTE again, including base page size
+			 */
+			if ((v & valid) && (v & mask) == val &&
+			    hpte_base_page_size(v, r) == (1ul << pshift))
+				/* Return with the HPTE still locked */
+				return (hash << 3) + (i >> 1);
+
+			__unlock_hpte(&hpte[i], v);
+		}
+
+		if (val & HPTE_V_SECONDARY)
+			break;
+		val |= HPTE_V_SECONDARY;
+		hash = hash ^ kvm->arch.hpt_mask;
+	}
+	return -1;
+}
+EXPORT_SYMBOL(kvmppc_hv_find_lock_hpte);
+
+/*
+ * Called in real mode to check whether an HPTE not found fault
+ * is due to accessing a paged-out page or an emulated MMIO page,
+ * or if a protection fault is due to accessing a page that the
+ * guest wanted read/write access to but which we made read-only.
+ * Returns a possibly modified status (DSISR) value if not
+ * (i.e. pass the interrupt to the guest),
+ * -1 to pass the fault up to host kernel mode code, -2 to do that
+ * and also load the instruction word (for MMIO emulation),
+ * or 0 if we should make the guest retry the access.
+ */
+long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
+			  unsigned long slb_v, unsigned int status, bool data)
+{
+	struct kvm *kvm = vcpu->kvm;
+	long int index;
+	unsigned long v, r, gr;
+	__be64 *hpte;
+	unsigned long valid;
+	struct revmap_entry *rev;
+	unsigned long pp, key;
+
+	/* For protection fault, expect to find a valid HPTE */
+	valid = HPTE_V_VALID;
+	if (status & DSISR_NOHPTE)
+		valid |= HPTE_V_ABSENT;
+
+	index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid);
+	if (index < 0) {
+		if (status & DSISR_NOHPTE)
+			return status;	/* there really was no HPTE */
+		return 0;		/* for prot fault, HPTE disappeared */
+	}
+	hpte = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
+	v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK;
+	r = be64_to_cpu(hpte[1]);
+	rev = real_vmalloc_addr(&kvm->arch.revmap[index]);
+	gr = rev->guest_rpte;
+
+	unlock_hpte(hpte, v);
+
+	/* For not found, if the HPTE is valid by now, retry the instruction */
+	if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID))
+		return 0;
+
+	/* Check access permissions to the page */
+	pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
+	key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
+	status &= ~DSISR_NOHPTE;	/* DSISR_NOHPTE == SRR1_ISI_NOPT */
+	if (!data) {
+		if (gr & (HPTE_R_N | HPTE_R_G))
+			return status | SRR1_ISI_N_OR_G;
+		if (!hpte_read_permission(pp, slb_v & key))
+			return status | SRR1_ISI_PROT;
+	} else if (status & DSISR_ISSTORE) {
+		/* check write permission */
+		if (!hpte_write_permission(pp, slb_v & key))
+			return status | DSISR_PROTFAULT;
+	} else {
+		if (!hpte_read_permission(pp, slb_v & key))
+			return status | DSISR_PROTFAULT;
+	}
+
+	/* Check storage key, if applicable */
+	if (data && (vcpu->arch.shregs.msr & MSR_DR)) {
+		unsigned int perm = hpte_get_skey_perm(gr, vcpu->arch.amr);
+		if (status & DSISR_ISSTORE)
+			perm >>= 1;
+		if (perm & 1)
+			return status | DSISR_KEYFAULT;
+	}
+
+	/* Save HPTE info for virtual-mode handler */
+	vcpu->arch.pgfault_addr = addr;
+	vcpu->arch.pgfault_index = index;
+	vcpu->arch.pgfault_hpte[0] = v;
+	vcpu->arch.pgfault_hpte[1] = r;
+
+	/* Check the storage key to see if it is possibly emulated MMIO */
+	if (data && (vcpu->arch.shregs.msr & MSR_IR) &&
+	    (r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) ==
+	    (HPTE_R_KEY_HI | HPTE_R_KEY_LO))
+		return -2;	/* MMIO emulation - load instr word */
+
+	return -1;		/* send fault up to host kernel mode */
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
new file mode 100644
index 000000000..00e45b6d4
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -0,0 +1,627 @@
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/xics.h>
+#include <asm/debug.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+
+#include "book3s_xics.h"
+
+#define DEBUG_PASSUP
+
+static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq);
+
+/* -- ICS routines -- */
+static void ics_rm_check_resend(struct kvmppc_xics *xics,
+				struct kvmppc_ics *ics, struct kvmppc_icp *icp)
+{
+	int i;
+
+	arch_spin_lock(&ics->lock);
+
+	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+		struct ics_irq_state *state = &ics->irq_state[i];
+
+		if (!state->resend)
+			continue;
+
+		arch_spin_unlock(&ics->lock);
+		icp_rm_deliver_irq(xics, icp, state->number);
+		arch_spin_lock(&ics->lock);
+	}
+
+	arch_spin_unlock(&ics->lock);
+}
+
+/* -- ICP routines -- */
+
+static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
+				struct kvm_vcpu *this_vcpu)
+{
+	struct kvmppc_icp *this_icp = this_vcpu->arch.icp;
+	int cpu;
+
+	/* Mark the target VCPU as having an interrupt pending */
+	vcpu->stat.queue_intr++;
+	set_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
+
+	/* Kick self ? Just set MER and return */
+	if (vcpu == this_vcpu) {
+		mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_MER);
+		return;
+	}
+
+	/* Check if the core is loaded, if not, too hard */
+	cpu = vcpu->cpu;
+	if (cpu < 0 || cpu >= nr_cpu_ids) {
+		this_icp->rm_action |= XICS_RM_KICK_VCPU;
+		this_icp->rm_kick_target = vcpu;
+		return;
+	}
+	/* In SMT cpu will always point to thread 0, we adjust it */
+	cpu += vcpu->arch.ptid;
+
+	smp_mb();
+	kvmhv_rm_send_ipi(cpu);
+}
+
+static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu)
+{
+	/* Note: Only called on self ! */
+	clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL,
+		  &vcpu->arch.pending_exceptions);
+	mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER);
+}
+
+static inline bool icp_rm_try_update(struct kvmppc_icp *icp,
+				     union kvmppc_icp_state old,
+				     union kvmppc_icp_state new)
+{
+	struct kvm_vcpu *this_vcpu = local_paca->kvm_hstate.kvm_vcpu;
+	bool success;
+
+	/* Calculate new output value */
+	new.out_ee = (new.xisr && (new.pending_pri < new.cppr));
+
+	/* Attempt atomic update */
+	success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw;
+	if (!success)
+		goto bail;
+
+	/*
+	 * Check for output state update
+	 *
+	 * Note that this is racy since another processor could be updating
+	 * the state already. This is why we never clear the interrupt output
+	 * here, we only ever set it. The clear only happens prior to doing
+	 * an update and only by the processor itself. Currently we do it
+	 * in Accept (H_XIRR) and Up_Cppr (H_XPPR).
+	 *
+	 * We also do not try to figure out whether the EE state has changed,
+	 * we unconditionally set it if the new state calls for it. The reason
+	 * for that is that we opportunistically remove the pending interrupt
+	 * flag when raising CPPR, so we need to set it back here if an
+	 * interrupt is still pending.
+	 */
+	if (new.out_ee)
+		icp_rm_set_vcpu_irq(icp->vcpu, this_vcpu);
+
+	/* Expose the state change for debug purposes */
+	this_vcpu->arch.icp->rm_dbgstate = new;
+	this_vcpu->arch.icp->rm_dbgtgt = icp->vcpu;
+
+ bail:
+	return success;
+}
+
+static inline int check_too_hard(struct kvmppc_xics *xics,
+				 struct kvmppc_icp *icp)
+{
+	return (xics->real_mode_dbg || icp->rm_action) ? H_TOO_HARD : H_SUCCESS;
+}
+
+static void icp_rm_check_resend(struct kvmppc_xics *xics,
+			     struct kvmppc_icp *icp)
+{
+	u32 icsid;
+
+	/* Order this load with the test for need_resend in the caller */
+	smp_rmb();
+	for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) {
+		struct kvmppc_ics *ics = xics->ics[icsid];
+
+		if (!test_and_clear_bit(icsid, icp->resend_map))
+			continue;
+		if (!ics)
+			continue;
+		ics_rm_check_resend(xics, ics, icp);
+	}
+}
+
+static bool icp_rm_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
+			       u32 *reject)
+{
+	union kvmppc_icp_state old_state, new_state;
+	bool success;
+
+	do {
+		old_state = new_state = READ_ONCE(icp->state);
+
+		*reject = 0;
+
+		/* See if we can deliver */
+		success = new_state.cppr > priority &&
+			new_state.mfrr > priority &&
+			new_state.pending_pri > priority;
+
+		/*
+		 * If we can, check for a rejection and perform the
+		 * delivery
+		 */
+		if (success) {
+			*reject = new_state.xisr;
+			new_state.xisr = irq;
+			new_state.pending_pri = priority;
+		} else {
+			/*
+			 * If we failed to deliver we set need_resend
+			 * so a subsequent CPPR state change causes us
+			 * to try a new delivery.
+			 */
+			new_state.need_resend = true;
+		}
+
+	} while (!icp_rm_try_update(icp, old_state, new_state));
+
+	return success;
+}
+
+static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq)
+{
+	struct ics_irq_state *state;
+	struct kvmppc_ics *ics;
+	u32 reject;
+	u16 src;
+
+	/*
+	 * This is used both for initial delivery of an interrupt and
+	 * for subsequent rejection.
+	 *
+	 * Rejection can be racy vs. resends. We have evaluated the
+	 * rejection in an atomic ICP transaction which is now complete,
+	 * so potentially the ICP can already accept the interrupt again.
+	 *
+	 * So we need to retry the delivery. Essentially the reject path
+	 * boils down to a failed delivery. Always.
+	 *
+	 * Now the interrupt could also have moved to a different target,
+	 * thus we may need to re-do the ICP lookup as well
+	 */
+
+ again:
+	/* Get the ICS state and lock it */
+	ics = kvmppc_xics_find_ics(xics, new_irq, &src);
+	if (!ics) {
+		/* Unsafe increment, but this does not need to be accurate */
+		xics->err_noics++;
+		return;
+	}
+	state = &ics->irq_state[src];
+
+	/* Get a lock on the ICS */
+	arch_spin_lock(&ics->lock);
+
+	/* Get our server */
+	if (!icp || state->server != icp->server_num) {
+		icp = kvmppc_xics_find_server(xics->kvm, state->server);
+		if (!icp) {
+			/* Unsafe increment again*/
+			xics->err_noicp++;
+			goto out;
+		}
+	}
+
+	/* Clear the resend bit of that interrupt */
+	state->resend = 0;
+
+	/*
+	 * If masked, bail out
+	 *
+	 * Note: PAPR doesn't mention anything about masked pending
+	 * when doing a resend, only when doing a delivery.
+	 *
+	 * However that would have the effect of losing a masked
+	 * interrupt that was rejected and isn't consistent with
+	 * the whole masked_pending business which is about not
+	 * losing interrupts that occur while masked.
+	 *
+	 * I don't differentiate normal deliveries and resends, this
+	 * implementation will differ from PAPR and not lose such
+	 * interrupts.
+	 */
+	if (state->priority == MASKED) {
+		state->masked_pending = 1;
+		goto out;
+	}
+
+	/*
+	 * Try the delivery, this will set the need_resend flag
+	 * in the ICP as part of the atomic transaction if the
+	 * delivery is not possible.
+	 *
+	 * Note that if successful, the new delivery might have itself
+	 * rejected an interrupt that was "delivered" before we took the
+	 * ics spin lock.
+	 *
+	 * In this case we do the whole sequence all over again for the
+	 * new guy. We cannot assume that the rejected interrupt is less
+	 * favored than the new one, and thus doesn't need to be delivered,
+	 * because by the time we exit icp_rm_try_to_deliver() the target
+	 * processor may well have already consumed & completed it, and thus
+	 * the rejected interrupt might actually be already acceptable.
+	 */
+	if (icp_rm_try_to_deliver(icp, new_irq, state->priority, &reject)) {
+		/*
+		 * Delivery was successful, did we reject somebody else ?
+		 */
+		if (reject && reject != XICS_IPI) {
+			arch_spin_unlock(&ics->lock);
+			new_irq = reject;
+			goto again;
+		}
+	} else {
+		/*
+		 * We failed to deliver the interrupt we need to set the
+		 * resend map bit and mark the ICS state as needing a resend
+		 */
+		set_bit(ics->icsid, icp->resend_map);
+		state->resend = 1;
+
+		/*
+		 * If the need_resend flag got cleared in the ICP some time
+		 * between icp_rm_try_to_deliver() atomic update and now, then
+		 * we know it might have missed the resend_map bit. So we
+		 * retry
+		 */
+		smp_mb();
+		if (!icp->state.need_resend) {
+			arch_spin_unlock(&ics->lock);
+			goto again;
+		}
+	}
+ out:
+	arch_spin_unlock(&ics->lock);
+}
+
+static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			     u8 new_cppr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	bool resend;
+
+	/*
+	 * This handles several related states in one operation:
+	 *
+	 * ICP State: Down_CPPR
+	 *
+	 * Load CPPR with new value and if the XISR is 0
+	 * then check for resends:
+	 *
+	 * ICP State: Resend
+	 *
+	 * If MFRR is more favored than CPPR, check for IPIs
+	 * and notify ICS of a potential resend. This is done
+	 * asynchronously (when used in real mode, we will have
+	 * to exit here).
+	 *
+	 * We do not handle the complete Check_IPI as documented
+	 * here. In the PAPR, this state will be used for both
+	 * Set_MFRR and Down_CPPR. However, we know that we aren't
+	 * changing the MFRR state here so we don't need to handle
+	 * the case of an MFRR causing a reject of a pending irq,
+	 * this will have been handled when the MFRR was set in the
+	 * first place.
+	 *
+	 * Thus we don't have to handle rejects, only resends.
+	 *
+	 * When implementing real mode for HV KVM, resend will lead to
+	 * a H_TOO_HARD return and the whole transaction will be handled
+	 * in virtual mode.
+	 */
+	do {
+		old_state = new_state = READ_ONCE(icp->state);
+
+		/* Down_CPPR */
+		new_state.cppr = new_cppr;
+
+		/*
+		 * Cut down Resend / Check_IPI / IPI
+		 *
+		 * The logic is that we cannot have a pending interrupt
+		 * trumped by an IPI at this point (see above), so we
+		 * know that either the pending interrupt is already an
+		 * IPI (in which case we don't care to override it) or
+		 * it's either more favored than us or non existent
+		 */
+		if (new_state.mfrr < new_cppr &&
+		    new_state.mfrr <= new_state.pending_pri) {
+			new_state.pending_pri = new_state.mfrr;
+			new_state.xisr = XICS_IPI;
+		}
+
+		/* Latch/clear resend bit */
+		resend = new_state.need_resend;
+		new_state.need_resend = 0;
+
+	} while (!icp_rm_try_update(icp, old_state, new_state));
+
+	/*
+	 * Now handle resend checks. Those are asynchronous to the ICP
+	 * state update in HW (ie bus transactions) so we can handle them
+	 * separately here as well.
+	 */
+	if (resend) {
+		icp->n_check_resend++;
+		icp_rm_check_resend(xics, icp);
+	}
+}
+
+
+unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 xirr;
+
+	if (!xics || !xics->real_mode)
+		return H_TOO_HARD;
+
+	/* First clear the interrupt */
+	icp_rm_clr_vcpu_irq(icp->vcpu);
+
+	/*
+	 * ICP State: Accept_Interrupt
+	 *
+	 * Return the pending interrupt (if any) along with the
+	 * current CPPR, then clear the XISR & set CPPR to the
+	 * pending priority
+	 */
+	do {
+		old_state = new_state = READ_ONCE(icp->state);
+
+		xirr = old_state.xisr | (((u32)old_state.cppr) << 24);
+		if (!old_state.xisr)
+			break;
+		new_state.cppr = new_state.pending_pri;
+		new_state.pending_pri = 0xff;
+		new_state.xisr = 0;
+
+	} while (!icp_rm_try_update(icp, old_state, new_state));
+
+	/* Return the result in GPR4 */
+	vcpu->arch.gpr[4] = xirr;
+
+	return check_too_hard(xics, icp);
+}
+
+int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+		    unsigned long mfrr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp, *this_icp = vcpu->arch.icp;
+	u32 reject;
+	bool resend;
+	bool local;
+
+	if (!xics || !xics->real_mode)
+		return H_TOO_HARD;
+
+	local = this_icp->server_num == server;
+	if (local)
+		icp = this_icp;
+	else
+		icp = kvmppc_xics_find_server(vcpu->kvm, server);
+	if (!icp)
+		return H_PARAMETER;
+
+	/*
+	 * ICP state: Set_MFRR
+	 *
+	 * If the CPPR is more favored than the new MFRR, then
+	 * nothing needs to be done as there can be no XISR to
+	 * reject.
+	 *
+	 * ICP state: Check_IPI
+	 *
+	 * If the CPPR is less favored, then we might be replacing
+	 * an interrupt, and thus need to possibly reject it.
+	 *
+	 * ICP State: IPI
+	 *
+	 * Besides rejecting any pending interrupts, we also
+	 * update XISR and pending_pri to mark IPI as pending.
+	 *
+	 * PAPR does not describe this state, but if the MFRR is being
+	 * made less favored than its earlier value, there might be
+	 * a previously-rejected interrupt needing to be resent.
+	 * Ideally, we would want to resend only if
+	 *	prio(pending_interrupt) < mfrr &&
+	 *	prio(pending_interrupt) < cppr
+	 * where pending interrupt is the one that was rejected. But
+	 * we don't have that state, so we simply trigger a resend
+	 * whenever the MFRR is made less favored.
+	 */
+	do {
+		old_state = new_state = READ_ONCE(icp->state);
+
+		/* Set_MFRR */
+		new_state.mfrr = mfrr;
+
+		/* Check_IPI */
+		reject = 0;
+		resend = false;
+		if (mfrr < new_state.cppr) {
+			/* Reject a pending interrupt if not an IPI */
+			if (mfrr <= new_state.pending_pri) {
+				reject = new_state.xisr;
+				new_state.pending_pri = mfrr;
+				new_state.xisr = XICS_IPI;
+			}
+		}
+
+		if (mfrr > old_state.mfrr) {
+			resend = new_state.need_resend;
+			new_state.need_resend = 0;
+		}
+	} while (!icp_rm_try_update(icp, old_state, new_state));
+
+	/* Handle reject in real mode */
+	if (reject && reject != XICS_IPI) {
+		this_icp->n_reject++;
+		icp_rm_deliver_irq(xics, icp, reject);
+	}
+
+	/* Handle resends in real mode */
+	if (resend) {
+		this_icp->n_check_resend++;
+		icp_rm_check_resend(xics, icp);
+	}
+
+	return check_too_hard(xics, this_icp);
+}
+
+int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 reject;
+
+	if (!xics || !xics->real_mode)
+		return H_TOO_HARD;
+
+	/*
+	 * ICP State: Set_CPPR
+	 *
+	 * We can safely compare the new value with the current
+	 * value outside of the transaction as the CPPR is only
+	 * ever changed by the processor on itself
+	 */
+	if (cppr > icp->state.cppr) {
+		icp_rm_down_cppr(xics, icp, cppr);
+		goto bail;
+	} else if (cppr == icp->state.cppr)
+		return H_SUCCESS;
+
+	/*
+	 * ICP State: Up_CPPR
+	 *
+	 * The processor is raising its priority, this can result
+	 * in a rejection of a pending interrupt:
+	 *
+	 * ICP State: Reject_Current
+	 *
+	 * We can remove EE from the current processor, the update
+	 * transaction will set it again if needed
+	 */
+	icp_rm_clr_vcpu_irq(icp->vcpu);
+
+	do {
+		old_state = new_state = READ_ONCE(icp->state);
+
+		reject = 0;
+		new_state.cppr = cppr;
+
+		if (cppr <= new_state.pending_pri) {
+			reject = new_state.xisr;
+			new_state.xisr = 0;
+			new_state.pending_pri = 0xff;
+		}
+
+	} while (!icp_rm_try_update(icp, old_state, new_state));
+
+	/*
+	 * Check for rejects. They are handled by doing a new delivery
+	 * attempt (see comments in icp_rm_deliver_irq).
+	 */
+	if (reject && reject != XICS_IPI) {
+		icp->n_reject++;
+		icp_rm_deliver_irq(xics, icp, reject);
+	}
+ bail:
+	return check_too_hard(xics, icp);
+}
+
+int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u32 irq = xirr & 0x00ffffff;
+	u16 src;
+
+	if (!xics || !xics->real_mode)
+		return H_TOO_HARD;
+
+	/*
+	 * ICP State: EOI
+	 *
+	 * Note: If EOI is incorrectly used by SW to lower the CPPR
+	 * value (ie more favored), we do not check for rejection of
+	 * a pending interrupt, this is a SW error and PAPR sepcifies
+	 * that we don't have to deal with it.
+	 *
+	 * The sending of an EOI to the ICS is handled after the
+	 * CPPR update
+	 *
+	 * ICP State: Down_CPPR which we handle
+	 * in a separate function as it's shared with H_CPPR.
+	 */
+	icp_rm_down_cppr(xics, icp, xirr >> 24);
+
+	/* IPIs have no EOI */
+	if (irq == XICS_IPI)
+		goto bail;
+	/*
+	 * EOI handling: If the interrupt is still asserted, we need to
+	 * resend it. We can take a lockless "peek" at the ICS state here.
+	 *
+	 * "Message" interrupts will never have "asserted" set
+	 */
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		goto bail;
+	state = &ics->irq_state[src];
+
+	/* Still asserted, resend it */
+	if (state->asserted) {
+		icp->n_reject++;
+		icp_rm_deliver_irq(xics, icp, irq);
+	}
+
+	if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) {
+		icp->rm_action |= XICS_RM_NOTIFY_EOI;
+		icp->rm_eoied_irq = irq;
+	}
+ bail:
+	return check_too_hard(xics, icp);
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
new file mode 100644
index 000000000..4d70df26c
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -0,0 +1,2601 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * Derived from book3s_rmhandlers.S and other files, which are:
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/mmu.h>
+#include <asm/page.h>
+#include <asm/ptrace.h>
+#include <asm/hvcall.h>
+#include <asm/asm-offsets.h>
+#include <asm/exception-64s.h>
+#include <asm/kvm_book3s_asm.h>
+#include <asm/mmu-hash64.h>
+#include <asm/tm.h>
+
+#define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
+
+/* Values in HSTATE_NAPPING(r13) */
+#define NAPPING_CEDE	1
+#define NAPPING_NOVCPU	2
+
+/*
+ * Call kvmppc_hv_entry in real mode.
+ * Must be called with interrupts hard-disabled.
+ *
+ * Input Registers:
+ *
+ * LR = return address to continue at after eventually re-enabling MMU
+ */
+_GLOBAL_TOC(kvmppc_hv_entry_trampoline)
+	mflr	r0
+	std	r0, PPC_LR_STKOFF(r1)
+	stdu	r1, -112(r1)
+	mfmsr	r10
+	LOAD_REG_ADDR(r5, kvmppc_call_hv_entry)
+	li	r0,MSR_RI
+	andc	r0,r10,r0
+	li	r6,MSR_IR | MSR_DR
+	andc	r6,r10,r6
+	mtmsrd	r0,1		/* clear RI in MSR */
+	mtsrr0	r5
+	mtsrr1	r6
+	RFI
+
+kvmppc_call_hv_entry:
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	bl	kvmppc_hv_entry
+
+	/* Back from guest - restore host state and return to caller */
+
+BEGIN_FTR_SECTION
+	/* Restore host DABR and DABRX */
+	ld	r5,HSTATE_DABR(r13)
+	li	r6,7
+	mtspr	SPRN_DABR,r5
+	mtspr	SPRN_DABRX,r6
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
+
+	/* Restore SPRG3 */
+	ld	r3,PACA_SPRG_VDSO(r13)
+	mtspr	SPRN_SPRG_VDSO_WRITE,r3
+
+	/* Reload the host's PMU registers */
+	ld	r3, PACALPPACAPTR(r13)	/* is the host using the PMU? */
+	lbz	r4, LPPACA_PMCINUSE(r3)
+	cmpwi	r4, 0
+	beq	23f			/* skip if not */
+BEGIN_FTR_SECTION
+	ld	r3, HSTATE_MMCR0(r13)
+	andi.	r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
+	cmpwi	r4, MMCR0_PMAO
+	beql	kvmppc_fix_pmao
+END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
+	lwz	r3, HSTATE_PMC1(r13)
+	lwz	r4, HSTATE_PMC2(r13)
+	lwz	r5, HSTATE_PMC3(r13)
+	lwz	r6, HSTATE_PMC4(r13)
+	lwz	r8, HSTATE_PMC5(r13)
+	lwz	r9, HSTATE_PMC6(r13)
+	mtspr	SPRN_PMC1, r3
+	mtspr	SPRN_PMC2, r4
+	mtspr	SPRN_PMC3, r5
+	mtspr	SPRN_PMC4, r6
+	mtspr	SPRN_PMC5, r8
+	mtspr	SPRN_PMC6, r9
+	ld	r3, HSTATE_MMCR0(r13)
+	ld	r4, HSTATE_MMCR1(r13)
+	ld	r5, HSTATE_MMCRA(r13)
+	ld	r6, HSTATE_SIAR(r13)
+	ld	r7, HSTATE_SDAR(r13)
+	mtspr	SPRN_MMCR1, r4
+	mtspr	SPRN_MMCRA, r5
+	mtspr	SPRN_SIAR, r6
+	mtspr	SPRN_SDAR, r7
+BEGIN_FTR_SECTION
+	ld	r8, HSTATE_MMCR2(r13)
+	ld	r9, HSTATE_SIER(r13)
+	mtspr	SPRN_MMCR2, r8
+	mtspr	SPRN_SIER, r9
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	mtspr	SPRN_MMCR0, r3
+	isync
+23:
+
+	/*
+	 * Reload DEC.  HDEC interrupts were disabled when
+	 * we reloaded the host's LPCR value.
+	 */
+	ld	r3, HSTATE_DECEXP(r13)
+	mftb	r4
+	subf	r4, r4, r3
+	mtspr	SPRN_DEC, r4
+
+	/*
+	 * For external and machine check interrupts, we need
+	 * to call the Linux handler to process the interrupt.
+	 * We do that by jumping to absolute address 0x500 for
+	 * external interrupts, or the machine_check_fwnmi label
+	 * for machine checks (since firmware might have patched
+	 * the vector area at 0x200).  The [h]rfid at the end of the
+	 * handler will return to the book3s_hv_interrupts.S code.
+	 * For other interrupts we do the rfid to get back
+	 * to the book3s_hv_interrupts.S code here.
+	 */
+	ld	r8, 112+PPC_LR_STKOFF(r1)
+	addi	r1, r1, 112
+	ld	r7, HSTATE_HOST_MSR(r13)
+
+	cmpwi	cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK
+	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
+	beq	11f
+	cmpwi	cr2, r12, BOOK3S_INTERRUPT_HMI
+	beq	cr2, 14f			/* HMI check */
+
+	/* RFI into the highmem handler, or branch to interrupt handler */
+	mfmsr	r6
+	li	r0, MSR_RI
+	andc	r6, r6, r0
+	mtmsrd	r6, 1			/* Clear RI in MSR */
+	mtsrr0	r8
+	mtsrr1	r7
+	beq	cr1, 13f		/* machine check */
+	RFI
+
+	/* On POWER7, we have external interrupts set to use HSRR0/1 */
+11:	mtspr	SPRN_HSRR0, r8
+	mtspr	SPRN_HSRR1, r7
+	ba	0x500
+
+13:	b	machine_check_fwnmi
+
+14:	mtspr	SPRN_HSRR0, r8
+	mtspr	SPRN_HSRR1, r7
+	b	hmi_exception_after_realmode
+
+kvmppc_primary_no_guest:
+	/* We handle this much like a ceded vcpu */
+	/* put the HDEC into the DEC, since HDEC interrupts don't wake us */
+	mfspr	r3, SPRN_HDEC
+	mtspr	SPRN_DEC, r3
+	/*
+	 * Make sure the primary has finished the MMU switch.
+	 * We should never get here on a secondary thread, but
+	 * check it for robustness' sake.
+	 */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+65:	lbz	r0, VCORE_IN_GUEST(r5)
+	cmpwi	r0, 0
+	beq	65b
+	/* Set LPCR. */
+	ld	r8,VCORE_LPCR(r5)
+	mtspr	SPRN_LPCR,r8
+	isync
+	/* set our bit in napping_threads */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	lbz	r7, HSTATE_PTID(r13)
+	li	r0, 1
+	sld	r0, r0, r7
+	addi	r6, r5, VCORE_NAPPING_THREADS
+1:	lwarx	r3, 0, r6
+	or	r3, r3, r0
+	stwcx.	r3, 0, r6
+	bne	1b
+	/* order napping_threads update vs testing entry_exit_map */
+	isync
+	li	r12, 0
+	lwz	r7, VCORE_ENTRY_EXIT(r5)
+	cmpwi	r7, 0x100
+	bge	kvm_novcpu_exit	/* another thread already exiting */
+	li	r3, NAPPING_NOVCPU
+	stb	r3, HSTATE_NAPPING(r13)
+
+	li	r3, 0		/* Don't wake on privileged (OS) doorbell */
+	b	kvm_do_nap
+
+kvm_novcpu_wakeup:
+	ld	r1, HSTATE_HOST_R1(r13)
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	li	r0, 0
+	stb	r0, HSTATE_NAPPING(r13)
+	stb	r0, HSTATE_HWTHREAD_REQ(r13)
+
+	/* check the wake reason */
+	bl	kvmppc_check_wake_reason
+
+	/* see if any other thread is already exiting */
+	lwz	r0, VCORE_ENTRY_EXIT(r5)
+	cmpwi	r0, 0x100
+	bge	kvm_novcpu_exit
+
+	/* clear our bit in napping_threads */
+	lbz	r7, HSTATE_PTID(r13)
+	li	r0, 1
+	sld	r0, r0, r7
+	addi	r6, r5, VCORE_NAPPING_THREADS
+4:	lwarx	r7, 0, r6
+	andc	r7, r7, r0
+	stwcx.	r7, 0, r6
+	bne	4b
+
+	/* See if the wake reason means we need to exit */
+	cmpdi	r3, 0
+	bge	kvm_novcpu_exit
+
+	/* See if our timeslice has expired (HDEC is negative) */
+	mfspr	r0, SPRN_HDEC
+	li	r12, BOOK3S_INTERRUPT_HV_DECREMENTER
+	cmpwi	r0, 0
+	blt	kvm_novcpu_exit
+
+	/* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	cmpdi	r4, 0
+	beq	kvmppc_primary_no_guest
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	addi	r3, r4, VCPU_TB_RMENTRY
+	bl	kvmhv_start_timing
+#endif
+	b	kvmppc_got_guest
+
+kvm_novcpu_exit:
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	cmpdi	r4, 0
+	beq	13f
+	addi	r3, r4, VCPU_TB_RMEXIT
+	bl	kvmhv_accumulate_time
+#endif
+13:	mr	r3, r12
+	stw	r12, 112-4(r1)
+	bl	kvmhv_commence_exit
+	nop
+	lwz	r12, 112-4(r1)
+	b	kvmhv_switch_to_host
+
+/*
+ * We come in here when wakened from nap mode.
+ * Relocation is off and most register values are lost.
+ * r13 points to the PACA.
+ */
+	.globl	kvm_start_guest
+kvm_start_guest:
+
+	/* Set runlatch bit the minute you wake up from nap */
+	mfspr	r0, SPRN_CTRLF
+	ori 	r0, r0, 1
+	mtspr	SPRN_CTRLT, r0
+
+	ld	r2,PACATOC(r13)
+
+	li	r0,KVM_HWTHREAD_IN_KVM
+	stb	r0,HSTATE_HWTHREAD_STATE(r13)
+
+	/* NV GPR values from power7_idle() will no longer be valid */
+	li	r0,1
+	stb	r0,PACA_NAPSTATELOST(r13)
+
+	/* were we napping due to cede? */
+	lbz	r0,HSTATE_NAPPING(r13)
+	cmpwi	r0,NAPPING_CEDE
+	beq	kvm_end_cede
+	cmpwi	r0,NAPPING_NOVCPU
+	beq	kvm_novcpu_wakeup
+
+	ld	r1,PACAEMERGSP(r13)
+	subi	r1,r1,STACK_FRAME_OVERHEAD
+
+	/*
+	 * We weren't napping due to cede, so this must be a secondary
+	 * thread being woken up to run a guest, or being woken up due
+	 * to a stray IPI.  (Or due to some machine check or hypervisor
+	 * maintenance interrupt while the core is in KVM.)
+	 */
+
+	/* Check the wake reason in SRR1 to see why we got here */
+	bl	kvmppc_check_wake_reason
+	cmpdi	r3, 0
+	bge	kvm_no_guest
+
+	/* get vcpu pointer, NULL if we have no vcpu to run */
+	ld	r4,HSTATE_KVM_VCPU(r13)
+	cmpdi	r4,0
+	/* if we have no vcpu to run, go back to sleep */
+	beq	kvm_no_guest
+
+kvm_secondary_got_guest:
+
+	/* Set HSTATE_DSCR(r13) to something sensible */
+	ld	r6, PACA_DSCR(r13)
+	std	r6, HSTATE_DSCR(r13)
+
+	/* Order load of vcore, ptid etc. after load of vcpu */
+	lwsync
+	bl	kvmppc_hv_entry
+
+	/* Back from the guest, go back to nap */
+	/* Clear our vcpu pointer so we don't come back in early */
+	li	r0, 0
+	/*
+	 * Once we clear HSTATE_KVM_VCPU(r13), the code in
+	 * kvmppc_run_core() is going to assume that all our vcpu
+	 * state is visible in memory.  This lwsync makes sure
+	 * that that is true.
+	 */
+	lwsync
+	std	r0, HSTATE_KVM_VCPU(r13)
+
+/*
+ * At this point we have finished executing in the guest.
+ * We need to wait for hwthread_req to become zero, since
+ * we may not turn on the MMU while hwthread_req is non-zero.
+ * While waiting we also need to check if we get given a vcpu to run.
+ */
+kvm_no_guest:
+	lbz	r3, HSTATE_HWTHREAD_REQ(r13)
+	cmpwi	r3, 0
+	bne	53f
+	HMT_MEDIUM
+	li	r0, KVM_HWTHREAD_IN_KERNEL
+	stb	r0, HSTATE_HWTHREAD_STATE(r13)
+	/* need to recheck hwthread_req after a barrier, to avoid race */
+	sync
+	lbz	r3, HSTATE_HWTHREAD_REQ(r13)
+	cmpwi	r3, 0
+	bne	54f
+/*
+ * We jump to power7_wakeup_loss, which will return to the caller
+ * of power7_nap in the powernv cpu offline loop.  The value we
+ * put in r3 becomes the return value for power7_nap.
+ */
+	li	r3, LPCR_PECE0
+	mfspr	r4, SPRN_LPCR
+	rlwimi	r4, r3, 0, LPCR_PECE0 | LPCR_PECE1
+	mtspr	SPRN_LPCR, r4
+	li	r3, 0
+	b	power7_wakeup_loss
+
+53:	HMT_LOW
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	cmpdi	r4, 0
+	beq	kvm_no_guest
+	HMT_MEDIUM
+	b	kvm_secondary_got_guest
+
+54:	li	r0, KVM_HWTHREAD_IN_KVM
+	stb	r0, HSTATE_HWTHREAD_STATE(r13)
+	b	kvm_no_guest
+
+/******************************************************************************
+ *                                                                            *
+ *                               Entry code                                   *
+ *                                                                            *
+ *****************************************************************************/
+
+.global kvmppc_hv_entry
+kvmppc_hv_entry:
+
+	/* Required state:
+	 *
+	 * R4 = vcpu pointer (or NULL)
+	 * MSR = ~IR|DR
+	 * R13 = PACA
+	 * R1 = host R1
+	 * R2 = TOC
+	 * all other volatile GPRS = free
+	 */
+	mflr	r0
+	std	r0, PPC_LR_STKOFF(r1)
+	stdu	r1, -112(r1)
+
+	/* Save R1 in the PACA */
+	std	r1, HSTATE_HOST_R1(r13)
+
+	li	r6, KVM_GUEST_MODE_HOST_HV
+	stb	r6, HSTATE_IN_GUEST(r13)
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	/* Store initial timestamp */
+	cmpdi	r4, 0
+	beq	1f
+	addi	r3, r4, VCPU_TB_RMENTRY
+	bl	kvmhv_start_timing
+1:
+#endif
+	/* Clear out SLB */
+	li	r6,0
+	slbmte	r6,r6
+	slbia
+	ptesync
+
+	/*
+	 * POWER7/POWER8 host -> guest partition switch code.
+	 * We don't have to lock against concurrent tlbies,
+	 * but we do have to coordinate across hardware threads.
+	 */
+	/* Set bit in entry map iff exit map is zero. */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	li	r7, 1
+	lbz	r6, HSTATE_PTID(r13)
+	sld	r7, r7, r6
+	addi	r9, r5, VCORE_ENTRY_EXIT
+21:	lwarx	r3, 0, r9
+	cmpwi	r3, 0x100		/* any threads starting to exit? */
+	bge	secondary_too_late	/* if so we're too late to the party */
+	or	r3, r3, r7
+	stwcx.	r3, 0, r9
+	bne	21b
+
+	/* Primary thread switches to guest partition. */
+	ld	r9,VCORE_KVM(r5)	/* pointer to struct kvm */
+	cmpwi	r6,0
+	bne	10f
+	ld	r6,KVM_SDR1(r9)
+	lwz	r7,KVM_LPID(r9)
+	li	r0,LPID_RSVD		/* switch to reserved LPID */
+	mtspr	SPRN_LPID,r0
+	ptesync
+	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
+	mtspr	SPRN_LPID,r7
+	isync
+
+	/* See if we need to flush the TLB */
+	lhz	r6,PACAPACAINDEX(r13)	/* test_bit(cpu, need_tlb_flush) */
+	clrldi	r7,r6,64-6		/* extract bit number (6 bits) */
+	srdi	r6,r6,6			/* doubleword number */
+	sldi	r6,r6,3			/* address offset */
+	add	r6,r6,r9
+	addi	r6,r6,KVM_NEED_FLUSH	/* dword in kvm->arch.need_tlb_flush */
+	li	r0,1
+	sld	r0,r0,r7
+	ld	r7,0(r6)
+	and.	r7,r7,r0
+	beq	22f
+23:	ldarx	r7,0,r6			/* if set, clear the bit */
+	andc	r7,r7,r0
+	stdcx.	r7,0,r6
+	bne	23b
+	/* Flush the TLB of any entries for this LPID */
+	/* use arch 2.07S as a proxy for POWER8 */
+BEGIN_FTR_SECTION
+	li	r6,512			/* POWER8 has 512 sets */
+FTR_SECTION_ELSE
+	li	r6,128			/* POWER7 has 128 sets */
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S)
+	mtctr	r6
+	li	r7,0x800		/* IS field = 0b10 */
+	ptesync
+28:	tlbiel	r7
+	addi	r7,r7,0x1000
+	bdnz	28b
+	ptesync
+
+	/* Add timebase offset onto timebase */
+22:	ld	r8,VCORE_TB_OFFSET(r5)
+	cmpdi	r8,0
+	beq	37f
+	mftb	r6		/* current host timebase */
+	add	r8,r8,r6
+	mtspr	SPRN_TBU40,r8	/* update upper 40 bits */
+	mftb	r7		/* check if lower 24 bits overflowed */
+	clrldi	r6,r6,40
+	clrldi	r7,r7,40
+	cmpld	r7,r6
+	bge	37f
+	addis	r8,r8,0x100	/* if so, increment upper 40 bits */
+	mtspr	SPRN_TBU40,r8
+
+	/* Load guest PCR value to select appropriate compat mode */
+37:	ld	r7, VCORE_PCR(r5)
+	cmpdi	r7, 0
+	beq	38f
+	mtspr	SPRN_PCR, r7
+38:
+
+BEGIN_FTR_SECTION
+	/* DPDES is shared between threads */
+	ld	r8, VCORE_DPDES(r5)
+	mtspr	SPRN_DPDES, r8
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+	li	r0,1
+	stb	r0,VCORE_IN_GUEST(r5)	/* signal secondaries to continue */
+
+	/* Do we have a guest vcpu to run? */
+10:	cmpdi	r4, 0
+	beq	kvmppc_primary_no_guest
+kvmppc_got_guest:
+
+	/* Load up guest SLB entries */
+	lwz	r5,VCPU_SLB_MAX(r4)
+	cmpwi	r5,0
+	beq	9f
+	mtctr	r5
+	addi	r6,r4,VCPU_SLB
+1:	ld	r8,VCPU_SLB_E(r6)
+	ld	r9,VCPU_SLB_V(r6)
+	slbmte	r9,r8
+	addi	r6,r6,VCPU_SLB_SIZE
+	bdnz	1b
+9:
+	/* Increment yield count if they have a VPA */
+	ld	r3, VCPU_VPA(r4)
+	cmpdi	r3, 0
+	beq	25f
+	li	r6, LPPACA_YIELDCOUNT
+	LWZX_BE	r5, r3, r6
+	addi	r5, r5, 1
+	STWX_BE	r5, r3, r6
+	li	r6, 1
+	stb	r6, VCPU_VPA_DIRTY(r4)
+25:
+
+	/* Save purr/spurr */
+	mfspr	r5,SPRN_PURR
+	mfspr	r6,SPRN_SPURR
+	std	r5,HSTATE_PURR(r13)
+	std	r6,HSTATE_SPURR(r13)
+	ld	r7,VCPU_PURR(r4)
+	ld	r8,VCPU_SPURR(r4)
+	mtspr	SPRN_PURR,r7
+	mtspr	SPRN_SPURR,r8
+
+BEGIN_FTR_SECTION
+	/* Set partition DABR */
+	/* Do this before re-enabling PMU to avoid P7 DABR corruption bug */
+	lwz	r5,VCPU_DABRX(r4)
+	ld	r6,VCPU_DABR(r4)
+	mtspr	SPRN_DABRX,r5
+	mtspr	SPRN_DABR,r6
+	isync
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+	b	skip_tm
+END_FTR_SECTION_IFCLR(CPU_FTR_TM)
+
+	/* Turn on TM/FP/VSX/VMX so we can restore them. */
+	mfmsr	r5
+	li	r6, MSR_TM >> 32
+	sldi	r6, r6, 32
+	or	r5, r5, r6
+	ori	r5, r5, MSR_FP
+	oris	r5, r5, (MSR_VEC | MSR_VSX)@h
+	mtmsrd	r5
+
+	/*
+	 * The user may change these outside of a transaction, so they must
+	 * always be context switched.
+	 */
+	ld	r5, VCPU_TFHAR(r4)
+	ld	r6, VCPU_TFIAR(r4)
+	ld	r7, VCPU_TEXASR(r4)
+	mtspr	SPRN_TFHAR, r5
+	mtspr	SPRN_TFIAR, r6
+	mtspr	SPRN_TEXASR, r7
+
+	ld	r5, VCPU_MSR(r4)
+	rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+	beq	skip_tm	/* TM not active in guest */
+
+	/* Make sure the failure summary is set, otherwise we'll program check
+	 * when we trechkpt.  It's possible that this might have been not set
+	 * on a kvmppc_set_one_reg() call but we shouldn't let this crash the
+	 * host.
+	 */
+	oris	r7, r7, (TEXASR_FS)@h
+	mtspr	SPRN_TEXASR, r7
+
+	/*
+	 * We need to load up the checkpointed state for the guest.
+	 * We need to do this early as it will blow away any GPRs, VSRs and
+	 * some SPRs.
+	 */
+
+	mr	r31, r4
+	addi	r3, r31, VCPU_FPRS_TM
+	bl	load_fp_state
+	addi	r3, r31, VCPU_VRS_TM
+	bl	load_vr_state
+	mr	r4, r31
+	lwz	r7, VCPU_VRSAVE_TM(r4)
+	mtspr	SPRN_VRSAVE, r7
+
+	ld	r5, VCPU_LR_TM(r4)
+	lwz	r6, VCPU_CR_TM(r4)
+	ld	r7, VCPU_CTR_TM(r4)
+	ld	r8, VCPU_AMR_TM(r4)
+	ld	r9, VCPU_TAR_TM(r4)
+	mtlr	r5
+	mtcr	r6
+	mtctr	r7
+	mtspr	SPRN_AMR, r8
+	mtspr	SPRN_TAR, r9
+
+	/*
+	 * Load up PPR and DSCR values but don't put them in the actual SPRs
+	 * till the last moment to avoid running with userspace PPR and DSCR for
+	 * too long.
+	 */
+	ld	r29, VCPU_DSCR_TM(r4)
+	ld	r30, VCPU_PPR_TM(r4)
+
+	std	r2, PACATMSCRATCH(r13) /* Save TOC */
+
+	/* Clear the MSR RI since r1, r13 are all going to be foobar. */
+	li	r5, 0
+	mtmsrd	r5, 1
+
+	/* Load GPRs r0-r28 */
+	reg = 0
+	.rept	29
+	ld	reg, VCPU_GPRS_TM(reg)(r31)
+	reg = reg + 1
+	.endr
+
+	mtspr	SPRN_DSCR, r29
+	mtspr	SPRN_PPR, r30
+
+	/* Load final GPRs */
+	ld	29, VCPU_GPRS_TM(29)(r31)
+	ld	30, VCPU_GPRS_TM(30)(r31)
+	ld	31, VCPU_GPRS_TM(31)(r31)
+
+	/* TM checkpointed state is now setup.  All GPRs are now volatile. */
+	TRECHKPT
+
+	/* Now let's get back the state we need. */
+	HMT_MEDIUM
+	GET_PACA(r13)
+	ld	r29, HSTATE_DSCR(r13)
+	mtspr	SPRN_DSCR, r29
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	ld	r1, HSTATE_HOST_R1(r13)
+	ld	r2, PACATMSCRATCH(r13)
+
+	/* Set the MSR RI since we have our registers back. */
+	li	r5, MSR_RI
+	mtmsrd	r5, 1
+skip_tm:
+#endif
+
+	/* Load guest PMU registers */
+	/* R4 is live here (vcpu pointer) */
+	li	r3, 1
+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
+	isync
+BEGIN_FTR_SECTION
+	ld	r3, VCPU_MMCR(r4)
+	andi.	r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
+	cmpwi	r5, MMCR0_PMAO
+	beql	kvmppc_fix_pmao
+END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
+	lwz	r3, VCPU_PMC(r4)	/* always load up guest PMU registers */
+	lwz	r5, VCPU_PMC + 4(r4)	/* to prevent information leak */
+	lwz	r6, VCPU_PMC + 8(r4)
+	lwz	r7, VCPU_PMC + 12(r4)
+	lwz	r8, VCPU_PMC + 16(r4)
+	lwz	r9, VCPU_PMC + 20(r4)
+	mtspr	SPRN_PMC1, r3
+	mtspr	SPRN_PMC2, r5
+	mtspr	SPRN_PMC3, r6
+	mtspr	SPRN_PMC4, r7
+	mtspr	SPRN_PMC5, r8
+	mtspr	SPRN_PMC6, r9
+	ld	r3, VCPU_MMCR(r4)
+	ld	r5, VCPU_MMCR + 8(r4)
+	ld	r6, VCPU_MMCR + 16(r4)
+	ld	r7, VCPU_SIAR(r4)
+	ld	r8, VCPU_SDAR(r4)
+	mtspr	SPRN_MMCR1, r5
+	mtspr	SPRN_MMCRA, r6
+	mtspr	SPRN_SIAR, r7
+	mtspr	SPRN_SDAR, r8
+BEGIN_FTR_SECTION
+	ld	r5, VCPU_MMCR + 24(r4)
+	ld	r6, VCPU_SIER(r4)
+	lwz	r7, VCPU_PMC + 24(r4)
+	lwz	r8, VCPU_PMC + 28(r4)
+	ld	r9, VCPU_MMCR + 32(r4)
+	mtspr	SPRN_MMCR2, r5
+	mtspr	SPRN_SIER, r6
+	mtspr	SPRN_SPMC1, r7
+	mtspr	SPRN_SPMC2, r8
+	mtspr	SPRN_MMCRS, r9
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	mtspr	SPRN_MMCR0, r3
+	isync
+
+	/* Load up FP, VMX and VSX registers */
+	bl	kvmppc_load_fp
+
+	ld	r14, VCPU_GPR(R14)(r4)
+	ld	r15, VCPU_GPR(R15)(r4)
+	ld	r16, VCPU_GPR(R16)(r4)
+	ld	r17, VCPU_GPR(R17)(r4)
+	ld	r18, VCPU_GPR(R18)(r4)
+	ld	r19, VCPU_GPR(R19)(r4)
+	ld	r20, VCPU_GPR(R20)(r4)
+	ld	r21, VCPU_GPR(R21)(r4)
+	ld	r22, VCPU_GPR(R22)(r4)
+	ld	r23, VCPU_GPR(R23)(r4)
+	ld	r24, VCPU_GPR(R24)(r4)
+	ld	r25, VCPU_GPR(R25)(r4)
+	ld	r26, VCPU_GPR(R26)(r4)
+	ld	r27, VCPU_GPR(R27)(r4)
+	ld	r28, VCPU_GPR(R28)(r4)
+	ld	r29, VCPU_GPR(R29)(r4)
+	ld	r30, VCPU_GPR(R30)(r4)
+	ld	r31, VCPU_GPR(R31)(r4)
+
+	/* Switch DSCR to guest value */
+	ld	r5, VCPU_DSCR(r4)
+	mtspr	SPRN_DSCR, r5
+
+BEGIN_FTR_SECTION
+	/* Skip next section on POWER7 */
+	b	8f
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
+	/* Turn on TM so we can access TFHAR/TFIAR/TEXASR */
+	mfmsr	r8
+	li	r0, 1
+	rldimi	r8, r0, MSR_TM_LG, 63-MSR_TM_LG
+	mtmsrd	r8
+
+	/* Load up POWER8-specific registers */
+	ld	r5, VCPU_IAMR(r4)
+	lwz	r6, VCPU_PSPB(r4)
+	ld	r7, VCPU_FSCR(r4)
+	mtspr	SPRN_IAMR, r5
+	mtspr	SPRN_PSPB, r6
+	mtspr	SPRN_FSCR, r7
+	ld	r5, VCPU_DAWR(r4)
+	ld	r6, VCPU_DAWRX(r4)
+	ld	r7, VCPU_CIABR(r4)
+	ld	r8, VCPU_TAR(r4)
+	mtspr	SPRN_DAWR, r5
+	mtspr	SPRN_DAWRX, r6
+	mtspr	SPRN_CIABR, r7
+	mtspr	SPRN_TAR, r8
+	ld	r5, VCPU_IC(r4)
+	ld	r6, VCPU_VTB(r4)
+	mtspr	SPRN_IC, r5
+	mtspr	SPRN_VTB, r6
+	ld	r8, VCPU_EBBHR(r4)
+	mtspr	SPRN_EBBHR, r8
+	ld	r5, VCPU_EBBRR(r4)
+	ld	r6, VCPU_BESCR(r4)
+	ld	r7, VCPU_CSIGR(r4)
+	ld	r8, VCPU_TACR(r4)
+	mtspr	SPRN_EBBRR, r5
+	mtspr	SPRN_BESCR, r6
+	mtspr	SPRN_CSIGR, r7
+	mtspr	SPRN_TACR, r8
+	ld	r5, VCPU_TCSCR(r4)
+	ld	r6, VCPU_ACOP(r4)
+	lwz	r7, VCPU_GUEST_PID(r4)
+	ld	r8, VCPU_WORT(r4)
+	mtspr	SPRN_TCSCR, r5
+	mtspr	SPRN_ACOP, r6
+	mtspr	SPRN_PID, r7
+	mtspr	SPRN_WORT, r8
+8:
+
+	/*
+	 * Set the decrementer to the guest decrementer.
+	 */
+	ld	r8,VCPU_DEC_EXPIRES(r4)
+	/* r8 is a host timebase value here, convert to guest TB */
+	ld	r5,HSTATE_KVM_VCORE(r13)
+	ld	r6,VCORE_TB_OFFSET(r5)
+	add	r8,r8,r6
+	mftb	r7
+	subf	r3,r7,r8
+	mtspr	SPRN_DEC,r3
+	stw	r3,VCPU_DEC(r4)
+
+	ld	r5, VCPU_SPRG0(r4)
+	ld	r6, VCPU_SPRG1(r4)
+	ld	r7, VCPU_SPRG2(r4)
+	ld	r8, VCPU_SPRG3(r4)
+	mtspr	SPRN_SPRG0, r5
+	mtspr	SPRN_SPRG1, r6
+	mtspr	SPRN_SPRG2, r7
+	mtspr	SPRN_SPRG3, r8
+
+	/* Load up DAR and DSISR */
+	ld	r5, VCPU_DAR(r4)
+	lwz	r6, VCPU_DSISR(r4)
+	mtspr	SPRN_DAR, r5
+	mtspr	SPRN_DSISR, r6
+
+	/* Restore AMR and UAMOR, set AMOR to all 1s */
+	ld	r5,VCPU_AMR(r4)
+	ld	r6,VCPU_UAMOR(r4)
+	li	r7,-1
+	mtspr	SPRN_AMR,r5
+	mtspr	SPRN_UAMOR,r6
+	mtspr	SPRN_AMOR,r7
+
+	/* Restore state of CTRL run bit; assume 1 on entry */
+	lwz	r5,VCPU_CTRL(r4)
+	andi.	r5,r5,1
+	bne	4f
+	mfspr	r6,SPRN_CTRLF
+	clrrdi	r6,r6,1
+	mtspr	SPRN_CTRLT,r6
+4:
+	/* Secondary threads wait for primary to have done partition switch */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	lbz	r6, HSTATE_PTID(r13)
+	cmpwi	r6, 0
+	beq	21f
+	lbz	r0, VCORE_IN_GUEST(r5)
+	cmpwi	r0, 0
+	bne	21f
+	HMT_LOW
+20:	lbz	r0, VCORE_IN_GUEST(r5)
+	cmpwi	r0, 0
+	beq	20b
+	HMT_MEDIUM
+21:
+	/* Set LPCR. */
+	ld	r8,VCORE_LPCR(r5)
+	mtspr	SPRN_LPCR,r8
+	isync
+
+	/* Check if HDEC expires soon */
+	mfspr	r3, SPRN_HDEC
+	cmpwi	r3, 512		/* 1 microsecond */
+	blt	hdec_soon
+
+	ld	r6, VCPU_CTR(r4)
+	lwz	r7, VCPU_XER(r4)
+
+	mtctr	r6
+	mtxer	r7
+
+kvmppc_cede_reentry:		/* r4 = vcpu, r13 = paca */
+	ld	r10, VCPU_PC(r4)
+	ld	r11, VCPU_MSR(r4)
+	ld	r6, VCPU_SRR0(r4)
+	ld	r7, VCPU_SRR1(r4)
+	mtspr	SPRN_SRR0, r6
+	mtspr	SPRN_SRR1, r7
+
+deliver_guest_interrupt:
+	/* r11 = vcpu->arch.msr & ~MSR_HV */
+	rldicl	r11, r11, 63 - MSR_HV_LG, 1
+	rotldi	r11, r11, 1 + MSR_HV_LG
+	ori	r11, r11, MSR_ME
+
+	/* Check if we can deliver an external or decrementer interrupt now */
+	ld	r0, VCPU_PENDING_EXC(r4)
+	rldicl	r0, r0, 64 - BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 63
+	cmpdi	cr1, r0, 0
+	andi.	r8, r11, MSR_EE
+	mfspr	r8, SPRN_LPCR
+	/* Insert EXTERNAL_LEVEL bit into LPCR at the MER bit position */
+	rldimi	r8, r0, LPCR_MER_SH, 63 - LPCR_MER_SH
+	mtspr	SPRN_LPCR, r8
+	isync
+	beq	5f
+	li	r0, BOOK3S_INTERRUPT_EXTERNAL
+	bne	cr1, 12f
+	mfspr	r0, SPRN_DEC
+	cmpwi	r0, 0
+	li	r0, BOOK3S_INTERRUPT_DECREMENTER
+	bge	5f
+
+12:	mtspr	SPRN_SRR0, r10
+	mr	r10,r0
+	mtspr	SPRN_SRR1, r11
+	mr	r9, r4
+	bl	kvmppc_msr_interrupt
+5:
+
+/*
+ * Required state:
+ * R4 = vcpu
+ * R10: value for HSRR0
+ * R11: value for HSRR1
+ * R13 = PACA
+ */
+fast_guest_return:
+	li	r0,0
+	stb	r0,VCPU_CEDED(r4)	/* cancel cede */
+	mtspr	SPRN_HSRR0,r10
+	mtspr	SPRN_HSRR1,r11
+
+	/* Activate guest mode, so faults get handled by KVM */
+	li	r9, KVM_GUEST_MODE_GUEST_HV
+	stb	r9, HSTATE_IN_GUEST(r13)
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	/* Accumulate timing */
+	addi	r3, r4, VCPU_TB_GUEST
+	bl	kvmhv_accumulate_time
+#endif
+
+	/* Enter guest */
+
+BEGIN_FTR_SECTION
+	ld	r5, VCPU_CFAR(r4)
+	mtspr	SPRN_CFAR, r5
+END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
+BEGIN_FTR_SECTION
+	ld	r0, VCPU_PPR(r4)
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
+
+	ld	r5, VCPU_LR(r4)
+	lwz	r6, VCPU_CR(r4)
+	mtlr	r5
+	mtcr	r6
+
+	ld	r1, VCPU_GPR(R1)(r4)
+	ld	r2, VCPU_GPR(R2)(r4)
+	ld	r3, VCPU_GPR(R3)(r4)
+	ld	r5, VCPU_GPR(R5)(r4)
+	ld	r6, VCPU_GPR(R6)(r4)
+	ld	r7, VCPU_GPR(R7)(r4)
+	ld	r8, VCPU_GPR(R8)(r4)
+	ld	r9, VCPU_GPR(R9)(r4)
+	ld	r10, VCPU_GPR(R10)(r4)
+	ld	r11, VCPU_GPR(R11)(r4)
+	ld	r12, VCPU_GPR(R12)(r4)
+	ld	r13, VCPU_GPR(R13)(r4)
+
+BEGIN_FTR_SECTION
+	mtspr	SPRN_PPR, r0
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
+	ld	r0, VCPU_GPR(R0)(r4)
+	ld	r4, VCPU_GPR(R4)(r4)
+
+	hrfid
+	b	.
+
+secondary_too_late:
+	li	r12, 0
+	cmpdi	r4, 0
+	beq	11f
+	stw	r12, VCPU_TRAP(r4)
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	addi	r3, r4, VCPU_TB_RMEXIT
+	bl	kvmhv_accumulate_time
+#endif
+11:	b	kvmhv_switch_to_host
+
+hdec_soon:
+	li	r12, BOOK3S_INTERRUPT_HV_DECREMENTER
+	stw	r12, VCPU_TRAP(r4)
+	mr	r9, r4
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	addi	r3, r4, VCPU_TB_RMEXIT
+	bl	kvmhv_accumulate_time
+#endif
+	b	guest_exit_cont
+
+/******************************************************************************
+ *                                                                            *
+ *                               Exit code                                    *
+ *                                                                            *
+ *****************************************************************************/
+
+/*
+ * We come here from the first-level interrupt handlers.
+ */
+	.globl	kvmppc_interrupt_hv
+kvmppc_interrupt_hv:
+	/*
+	 * Register contents:
+	 * R12		= interrupt vector
+	 * R13		= PACA
+	 * guest CR, R12 saved in shadow VCPU SCRATCH1/0
+	 * guest R13 saved in SPRN_SCRATCH0
+	 */
+	std	r9, HSTATE_SCRATCH2(r13)
+
+	lbz	r9, HSTATE_IN_GUEST(r13)
+	cmpwi	r9, KVM_GUEST_MODE_HOST_HV
+	beq	kvmppc_bad_host_intr
+#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
+	cmpwi	r9, KVM_GUEST_MODE_GUEST
+	ld	r9, HSTATE_SCRATCH2(r13)
+	beq	kvmppc_interrupt_pr
+#endif
+	/* We're now back in the host but in guest MMU context */
+	li	r9, KVM_GUEST_MODE_HOST_HV
+	stb	r9, HSTATE_IN_GUEST(r13)
+
+	ld	r9, HSTATE_KVM_VCPU(r13)
+
+	/* Save registers */
+
+	std	r0, VCPU_GPR(R0)(r9)
+	std	r1, VCPU_GPR(R1)(r9)
+	std	r2, VCPU_GPR(R2)(r9)
+	std	r3, VCPU_GPR(R3)(r9)
+	std	r4, VCPU_GPR(R4)(r9)
+	std	r5, VCPU_GPR(R5)(r9)
+	std	r6, VCPU_GPR(R6)(r9)
+	std	r7, VCPU_GPR(R7)(r9)
+	std	r8, VCPU_GPR(R8)(r9)
+	ld	r0, HSTATE_SCRATCH2(r13)
+	std	r0, VCPU_GPR(R9)(r9)
+	std	r10, VCPU_GPR(R10)(r9)
+	std	r11, VCPU_GPR(R11)(r9)
+	ld	r3, HSTATE_SCRATCH0(r13)
+	lwz	r4, HSTATE_SCRATCH1(r13)
+	std	r3, VCPU_GPR(R12)(r9)
+	stw	r4, VCPU_CR(r9)
+BEGIN_FTR_SECTION
+	ld	r3, HSTATE_CFAR(r13)
+	std	r3, VCPU_CFAR(r9)
+END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
+BEGIN_FTR_SECTION
+	ld	r4, HSTATE_PPR(r13)
+	std	r4, VCPU_PPR(r9)
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
+
+	/* Restore R1/R2 so we can handle faults */
+	ld	r1, HSTATE_HOST_R1(r13)
+	ld	r2, PACATOC(r13)
+
+	mfspr	r10, SPRN_SRR0
+	mfspr	r11, SPRN_SRR1
+	std	r10, VCPU_SRR0(r9)
+	std	r11, VCPU_SRR1(r9)
+	andi.	r0, r12, 2		/* need to read HSRR0/1? */
+	beq	1f
+	mfspr	r10, SPRN_HSRR0
+	mfspr	r11, SPRN_HSRR1
+	clrrdi	r12, r12, 2
+1:	std	r10, VCPU_PC(r9)
+	std	r11, VCPU_MSR(r9)
+
+	GET_SCRATCH0(r3)
+	mflr	r4
+	std	r3, VCPU_GPR(R13)(r9)
+	std	r4, VCPU_LR(r9)
+
+	stw	r12,VCPU_TRAP(r9)
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	addi	r3, r9, VCPU_TB_RMINTR
+	mr	r4, r9
+	bl	kvmhv_accumulate_time
+	ld	r5, VCPU_GPR(R5)(r9)
+	ld	r6, VCPU_GPR(R6)(r9)
+	ld	r7, VCPU_GPR(R7)(r9)
+	ld	r8, VCPU_GPR(R8)(r9)
+#endif
+
+	/* Save HEIR (HV emulation assist reg) in emul_inst
+	   if this is an HEI (HV emulation interrupt, e40) */
+	li	r3,KVM_INST_FETCH_FAILED
+	stw	r3,VCPU_LAST_INST(r9)
+	cmpwi	r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST
+	bne	11f
+	mfspr	r3,SPRN_HEIR
+11:	stw	r3,VCPU_HEIR(r9)
+
+	/* these are volatile across C function calls */
+	mfctr	r3
+	mfxer	r4
+	std	r3, VCPU_CTR(r9)
+	stw	r4, VCPU_XER(r9)
+
+	/* If this is a page table miss then see if it's theirs or ours */
+	cmpwi	r12, BOOK3S_INTERRUPT_H_DATA_STORAGE
+	beq	kvmppc_hdsi
+	cmpwi	r12, BOOK3S_INTERRUPT_H_INST_STORAGE
+	beq	kvmppc_hisi
+
+	/* See if this is a leftover HDEC interrupt */
+	cmpwi	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
+	bne	2f
+	mfspr	r3,SPRN_HDEC
+	cmpwi	r3,0
+	mr	r4,r9
+	bge	fast_guest_return
+2:
+	/* See if this is an hcall we can handle in real mode */
+	cmpwi	r12,BOOK3S_INTERRUPT_SYSCALL
+	beq	hcall_try_real_mode
+
+	/* Hypervisor doorbell - exit only if host IPI flag set */
+	cmpwi	r12, BOOK3S_INTERRUPT_H_DOORBELL
+	bne	3f
+	lbz	r0, HSTATE_HOST_IPI(r13)
+	beq	4f
+	b	guest_exit_cont
+3:
+	/* External interrupt ? */
+	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
+	bne+	guest_exit_cont
+
+	/* External interrupt, first check for host_ipi. If this is
+	 * set, we know the host wants us out so let's do it now
+	 */
+	bl	kvmppc_read_intr
+	cmpdi	r3, 0
+	bgt	guest_exit_cont
+
+	/* Check if any CPU is heading out to the host, if so head out too */
+4:	ld	r5, HSTATE_KVM_VCORE(r13)
+	lwz	r0, VCORE_ENTRY_EXIT(r5)
+	cmpwi	r0, 0x100
+	mr	r4, r9
+	blt	deliver_guest_interrupt
+
+guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
+	/* Save more register state  */
+	mfdar	r6
+	mfdsisr	r7
+	std	r6, VCPU_DAR(r9)
+	stw	r7, VCPU_DSISR(r9)
+	/* don't overwrite fault_dar/fault_dsisr if HDSI */
+	cmpwi	r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
+	beq	mc_cont
+	std	r6, VCPU_FAULT_DAR(r9)
+	stw	r7, VCPU_FAULT_DSISR(r9)
+
+	/* See if it is a machine check */
+	cmpwi	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
+	beq	machine_check_realmode
+mc_cont:
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	addi	r3, r9, VCPU_TB_RMEXIT
+	mr	r4, r9
+	bl	kvmhv_accumulate_time
+#endif
+
+	/* Increment exit count, poke other threads to exit */
+	bl	kvmhv_commence_exit
+	nop
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	lwz	r12, VCPU_TRAP(r9)
+
+	/* Save guest CTRL register, set runlatch to 1 */
+	mfspr	r6,SPRN_CTRLF
+	stw	r6,VCPU_CTRL(r9)
+	andi.	r0,r6,1
+	bne	4f
+	ori	r6,r6,1
+	mtspr	SPRN_CTRLT,r6
+4:
+	/* Read the guest SLB and save it away */
+	lwz	r0,VCPU_SLB_NR(r9)	/* number of entries in SLB */
+	mtctr	r0
+	li	r6,0
+	addi	r7,r9,VCPU_SLB
+	li	r5,0
+1:	slbmfee	r8,r6
+	andis.	r0,r8,SLB_ESID_V@h
+	beq	2f
+	add	r8,r8,r6		/* put index in */
+	slbmfev	r3,r6
+	std	r8,VCPU_SLB_E(r7)
+	std	r3,VCPU_SLB_V(r7)
+	addi	r7,r7,VCPU_SLB_SIZE
+	addi	r5,r5,1
+2:	addi	r6,r6,1
+	bdnz	1b
+	stw	r5,VCPU_SLB_MAX(r9)
+
+	/*
+	 * Save the guest PURR/SPURR
+	 */
+	mfspr	r5,SPRN_PURR
+	mfspr	r6,SPRN_SPURR
+	ld	r7,VCPU_PURR(r9)
+	ld	r8,VCPU_SPURR(r9)
+	std	r5,VCPU_PURR(r9)
+	std	r6,VCPU_SPURR(r9)
+	subf	r5,r7,r5
+	subf	r6,r8,r6
+
+	/*
+	 * Restore host PURR/SPURR and add guest times
+	 * so that the time in the guest gets accounted.
+	 */
+	ld	r3,HSTATE_PURR(r13)
+	ld	r4,HSTATE_SPURR(r13)
+	add	r3,r3,r5
+	add	r4,r4,r6
+	mtspr	SPRN_PURR,r3
+	mtspr	SPRN_SPURR,r4
+
+	/* Save DEC */
+	mfspr	r5,SPRN_DEC
+	mftb	r6
+	extsw	r5,r5
+	add	r5,r5,r6
+	/* r5 is a guest timebase value here, convert to host TB */
+	ld	r3,HSTATE_KVM_VCORE(r13)
+	ld	r4,VCORE_TB_OFFSET(r3)
+	subf	r5,r4,r5
+	std	r5,VCPU_DEC_EXPIRES(r9)
+
+BEGIN_FTR_SECTION
+	b	8f
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
+	/* Save POWER8-specific registers */
+	mfspr	r5, SPRN_IAMR
+	mfspr	r6, SPRN_PSPB
+	mfspr	r7, SPRN_FSCR
+	std	r5, VCPU_IAMR(r9)
+	stw	r6, VCPU_PSPB(r9)
+	std	r7, VCPU_FSCR(r9)
+	mfspr	r5, SPRN_IC
+	mfspr	r6, SPRN_VTB
+	mfspr	r7, SPRN_TAR
+	std	r5, VCPU_IC(r9)
+	std	r6, VCPU_VTB(r9)
+	std	r7, VCPU_TAR(r9)
+	mfspr	r8, SPRN_EBBHR
+	std	r8, VCPU_EBBHR(r9)
+	mfspr	r5, SPRN_EBBRR
+	mfspr	r6, SPRN_BESCR
+	mfspr	r7, SPRN_CSIGR
+	mfspr	r8, SPRN_TACR
+	std	r5, VCPU_EBBRR(r9)
+	std	r6, VCPU_BESCR(r9)
+	std	r7, VCPU_CSIGR(r9)
+	std	r8, VCPU_TACR(r9)
+	mfspr	r5, SPRN_TCSCR
+	mfspr	r6, SPRN_ACOP
+	mfspr	r7, SPRN_PID
+	mfspr	r8, SPRN_WORT
+	std	r5, VCPU_TCSCR(r9)
+	std	r6, VCPU_ACOP(r9)
+	stw	r7, VCPU_GUEST_PID(r9)
+	std	r8, VCPU_WORT(r9)
+8:
+
+	/* Save and reset AMR and UAMOR before turning on the MMU */
+	mfspr	r5,SPRN_AMR
+	mfspr	r6,SPRN_UAMOR
+	std	r5,VCPU_AMR(r9)
+	std	r6,VCPU_UAMOR(r9)
+	li	r6,0
+	mtspr	SPRN_AMR,r6
+
+	/* Switch DSCR back to host value */
+	mfspr	r8, SPRN_DSCR
+	ld	r7, HSTATE_DSCR(r13)
+	std	r8, VCPU_DSCR(r9)
+	mtspr	SPRN_DSCR, r7
+
+	/* Save non-volatile GPRs */
+	std	r14, VCPU_GPR(R14)(r9)
+	std	r15, VCPU_GPR(R15)(r9)
+	std	r16, VCPU_GPR(R16)(r9)
+	std	r17, VCPU_GPR(R17)(r9)
+	std	r18, VCPU_GPR(R18)(r9)
+	std	r19, VCPU_GPR(R19)(r9)
+	std	r20, VCPU_GPR(R20)(r9)
+	std	r21, VCPU_GPR(R21)(r9)
+	std	r22, VCPU_GPR(R22)(r9)
+	std	r23, VCPU_GPR(R23)(r9)
+	std	r24, VCPU_GPR(R24)(r9)
+	std	r25, VCPU_GPR(R25)(r9)
+	std	r26, VCPU_GPR(R26)(r9)
+	std	r27, VCPU_GPR(R27)(r9)
+	std	r28, VCPU_GPR(R28)(r9)
+	std	r29, VCPU_GPR(R29)(r9)
+	std	r30, VCPU_GPR(R30)(r9)
+	std	r31, VCPU_GPR(R31)(r9)
+
+	/* Save SPRGs */
+	mfspr	r3, SPRN_SPRG0
+	mfspr	r4, SPRN_SPRG1
+	mfspr	r5, SPRN_SPRG2
+	mfspr	r6, SPRN_SPRG3
+	std	r3, VCPU_SPRG0(r9)
+	std	r4, VCPU_SPRG1(r9)
+	std	r5, VCPU_SPRG2(r9)
+	std	r6, VCPU_SPRG3(r9)
+
+	/* save FP state */
+	mr	r3, r9
+	bl	kvmppc_save_fp
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+	b	2f
+END_FTR_SECTION_IFCLR(CPU_FTR_TM)
+	/* Turn on TM. */
+	mfmsr	r8
+	li	r0, 1
+	rldimi	r8, r0, MSR_TM_LG, 63-MSR_TM_LG
+	mtmsrd	r8
+
+	ld	r5, VCPU_MSR(r9)
+	rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+	beq	1f	/* TM not active in guest. */
+
+	li	r3, TM_CAUSE_KVM_RESCHED
+
+	/* Clear the MSR RI since r1, r13 are all going to be foobar. */
+	li	r5, 0
+	mtmsrd	r5, 1
+
+	/* All GPRs are volatile at this point. */
+	TRECLAIM(R3)
+
+	/* Temporarily store r13 and r9 so we have some regs to play with */
+	SET_SCRATCH0(r13)
+	GET_PACA(r13)
+	std	r9, PACATMSCRATCH(r13)
+	ld	r9, HSTATE_KVM_VCPU(r13)
+
+	/* Get a few more GPRs free. */
+	std	r29, VCPU_GPRS_TM(29)(r9)
+	std	r30, VCPU_GPRS_TM(30)(r9)
+	std	r31, VCPU_GPRS_TM(31)(r9)
+
+	/* Save away PPR and DSCR soon so don't run with user values. */
+	mfspr	r31, SPRN_PPR
+	HMT_MEDIUM
+	mfspr	r30, SPRN_DSCR
+	ld	r29, HSTATE_DSCR(r13)
+	mtspr	SPRN_DSCR, r29
+
+	/* Save all but r9, r13 & r29-r31 */
+	reg = 0
+	.rept	29
+	.if (reg != 9) && (reg != 13)
+	std	reg, VCPU_GPRS_TM(reg)(r9)
+	.endif
+	reg = reg + 1
+	.endr
+	/* ... now save r13 */
+	GET_SCRATCH0(r4)
+	std	r4, VCPU_GPRS_TM(13)(r9)
+	/* ... and save r9 */
+	ld	r4, PACATMSCRATCH(r13)
+	std	r4, VCPU_GPRS_TM(9)(r9)
+
+	/* Reload stack pointer and TOC. */
+	ld	r1, HSTATE_HOST_R1(r13)
+	ld	r2, PACATOC(r13)
+
+	/* Set MSR RI now we have r1 and r13 back. */
+	li	r5, MSR_RI
+	mtmsrd	r5, 1
+
+	/* Save away checkpinted SPRs. */
+	std	r31, VCPU_PPR_TM(r9)
+	std	r30, VCPU_DSCR_TM(r9)
+	mflr	r5
+	mfcr	r6
+	mfctr	r7
+	mfspr	r8, SPRN_AMR
+	mfspr	r10, SPRN_TAR
+	std	r5, VCPU_LR_TM(r9)
+	stw	r6, VCPU_CR_TM(r9)
+	std	r7, VCPU_CTR_TM(r9)
+	std	r8, VCPU_AMR_TM(r9)
+	std	r10, VCPU_TAR_TM(r9)
+
+	/* Restore r12 as trap number. */
+	lwz	r12, VCPU_TRAP(r9)
+
+	/* Save FP/VSX. */
+	addi	r3, r9, VCPU_FPRS_TM
+	bl	store_fp_state
+	addi	r3, r9, VCPU_VRS_TM
+	bl	store_vr_state
+	mfspr	r6, SPRN_VRSAVE
+	stw	r6, VCPU_VRSAVE_TM(r9)
+1:
+	/*
+	 * We need to save these SPRs after the treclaim so that the software
+	 * error code is recorded correctly in the TEXASR.  Also the user may
+	 * change these outside of a transaction, so they must always be
+	 * context switched.
+	 */
+	mfspr	r5, SPRN_TFHAR
+	mfspr	r6, SPRN_TFIAR
+	mfspr	r7, SPRN_TEXASR
+	std	r5, VCPU_TFHAR(r9)
+	std	r6, VCPU_TFIAR(r9)
+	std	r7, VCPU_TEXASR(r9)
+2:
+#endif
+
+	/* Increment yield count if they have a VPA */
+	ld	r8, VCPU_VPA(r9)	/* do they have a VPA? */
+	cmpdi	r8, 0
+	beq	25f
+	li	r4, LPPACA_YIELDCOUNT
+	LWZX_BE	r3, r8, r4
+	addi	r3, r3, 1
+	STWX_BE	r3, r8, r4
+	li	r3, 1
+	stb	r3, VCPU_VPA_DIRTY(r9)
+25:
+	/* Save PMU registers if requested */
+	/* r8 and cr0.eq are live here */
+BEGIN_FTR_SECTION
+	/*
+	 * POWER8 seems to have a hardware bug where setting
+	 * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE]
+	 * when some counters are already negative doesn't seem
+	 * to cause a performance monitor alert (and hence interrupt).
+	 * The effect of this is that when saving the PMU state,
+	 * if there is no PMU alert pending when we read MMCR0
+	 * before freezing the counters, but one becomes pending
+	 * before we read the counters, we lose it.
+	 * To work around this, we need a way to freeze the counters
+	 * before reading MMCR0.  Normally, freezing the counters
+	 * is done by writing MMCR0 (to set MMCR0[FC]) which
+	 * unavoidably writes MMCR0[PMA0] as well.  On POWER8,
+	 * we can also freeze the counters using MMCR2, by writing
+	 * 1s to all the counter freeze condition bits (there are
+	 * 9 bits each for 6 counters).
+	 */
+	li	r3, -1			/* set all freeze bits */
+	clrrdi	r3, r3, 10
+	mfspr	r10, SPRN_MMCR2
+	mtspr	SPRN_MMCR2, r3
+	isync
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	li	r3, 1
+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
+	mfspr	r4, SPRN_MMCR0		/* save MMCR0 */
+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
+	mfspr	r6, SPRN_MMCRA
+	/* Clear MMCRA in order to disable SDAR updates */
+	li	r7, 0
+	mtspr	SPRN_MMCRA, r7
+	isync
+	beq	21f			/* if no VPA, save PMU stuff anyway */
+	lbz	r7, LPPACA_PMCINUSE(r8)
+	cmpwi	r7, 0			/* did they ask for PMU stuff to be saved? */
+	bne	21f
+	std	r3, VCPU_MMCR(r9)	/* if not, set saved MMCR0 to FC */
+	b	22f
+21:	mfspr	r5, SPRN_MMCR1
+	mfspr	r7, SPRN_SIAR
+	mfspr	r8, SPRN_SDAR
+	std	r4, VCPU_MMCR(r9)
+	std	r5, VCPU_MMCR + 8(r9)
+	std	r6, VCPU_MMCR + 16(r9)
+BEGIN_FTR_SECTION
+	std	r10, VCPU_MMCR + 24(r9)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	std	r7, VCPU_SIAR(r9)
+	std	r8, VCPU_SDAR(r9)
+	mfspr	r3, SPRN_PMC1
+	mfspr	r4, SPRN_PMC2
+	mfspr	r5, SPRN_PMC3
+	mfspr	r6, SPRN_PMC4
+	mfspr	r7, SPRN_PMC5
+	mfspr	r8, SPRN_PMC6
+	stw	r3, VCPU_PMC(r9)
+	stw	r4, VCPU_PMC + 4(r9)
+	stw	r5, VCPU_PMC + 8(r9)
+	stw	r6, VCPU_PMC + 12(r9)
+	stw	r7, VCPU_PMC + 16(r9)
+	stw	r8, VCPU_PMC + 20(r9)
+BEGIN_FTR_SECTION
+	mfspr	r5, SPRN_SIER
+	mfspr	r6, SPRN_SPMC1
+	mfspr	r7, SPRN_SPMC2
+	mfspr	r8, SPRN_MMCRS
+	std	r5, VCPU_SIER(r9)
+	stw	r6, VCPU_PMC + 24(r9)
+	stw	r7, VCPU_PMC + 28(r9)
+	std	r8, VCPU_MMCR + 32(r9)
+	lis	r4, 0x8000
+	mtspr	SPRN_MMCRS, r4
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+22:
+	/* Clear out SLB */
+	li	r5,0
+	slbmte	r5,r5
+	slbia
+	ptesync
+
+	/*
+	 * POWER7/POWER8 guest -> host partition switch code.
+	 * We don't have to lock against tlbies but we do
+	 * have to coordinate the hardware threads.
+	 */
+kvmhv_switch_to_host:
+	/* Secondary threads wait for primary to do partition switch */
+	ld	r5,HSTATE_KVM_VCORE(r13)
+	ld	r4,VCORE_KVM(r5)	/* pointer to struct kvm */
+	lbz	r3,HSTATE_PTID(r13)
+	cmpwi	r3,0
+	beq	15f
+	HMT_LOW
+13:	lbz	r3,VCORE_IN_GUEST(r5)
+	cmpwi	r3,0
+	bne	13b
+	HMT_MEDIUM
+	b	16f
+
+	/* Primary thread waits for all the secondaries to exit guest */
+15:	lwz	r3,VCORE_ENTRY_EXIT(r5)
+	srwi	r0,r3,8
+	clrldi	r3,r3,56
+	cmpw	r3,r0
+	bne	15b
+	isync
+
+	/* Primary thread switches back to host partition */
+	ld	r6,KVM_HOST_SDR1(r4)
+	lwz	r7,KVM_HOST_LPID(r4)
+	li	r8,LPID_RSVD		/* switch to reserved LPID */
+	mtspr	SPRN_LPID,r8
+	ptesync
+	mtspr	SPRN_SDR1,r6		/* switch to partition page table */
+	mtspr	SPRN_LPID,r7
+	isync
+
+BEGIN_FTR_SECTION
+	/* DPDES is shared between threads */
+	mfspr	r7, SPRN_DPDES
+	std	r7, VCORE_DPDES(r5)
+	/* clear DPDES so we don't get guest doorbells in the host */
+	li	r8, 0
+	mtspr	SPRN_DPDES, r8
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+	/* Subtract timebase offset from timebase */
+	ld	r8,VCORE_TB_OFFSET(r5)
+	cmpdi	r8,0
+	beq	17f
+	mftb	r6			/* current guest timebase */
+	subf	r8,r8,r6
+	mtspr	SPRN_TBU40,r8		/* update upper 40 bits */
+	mftb	r7			/* check if lower 24 bits overflowed */
+	clrldi	r6,r6,40
+	clrldi	r7,r7,40
+	cmpld	r7,r6
+	bge	17f
+	addis	r8,r8,0x100		/* if so, increment upper 40 bits */
+	mtspr	SPRN_TBU40,r8
+
+	/* Reset PCR */
+17:	ld	r0, VCORE_PCR(r5)
+	cmpdi	r0, 0
+	beq	18f
+	li	r0, 0
+	mtspr	SPRN_PCR, r0
+18:
+	/* Signal secondary CPUs to continue */
+	stb	r0,VCORE_IN_GUEST(r5)
+	lis	r8,0x7fff		/* MAX_INT@h */
+	mtspr	SPRN_HDEC,r8
+
+16:	ld	r8,KVM_HOST_LPCR(r4)
+	mtspr	SPRN_LPCR,r8
+	isync
+
+	/* load host SLB entries */
+	ld	r8,PACA_SLBSHADOWPTR(r13)
+
+	.rept	SLB_NUM_BOLTED
+	li	r3, SLBSHADOW_SAVEAREA
+	LDX_BE	r5, r8, r3
+	addi	r3, r3, 8
+	LDX_BE	r6, r8, r3
+	andis.	r7,r5,SLB_ESID_V@h
+	beq	1f
+	slbmte	r6,r5
+1:	addi	r8,r8,16
+	.endr
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	/* Finish timing, if we have a vcpu */
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	cmpdi	r4, 0
+	li	r3, 0
+	beq	2f
+	bl	kvmhv_accumulate_time
+2:
+#endif
+	/* Unset guest mode */
+	li	r0, KVM_GUEST_MODE_NONE
+	stb	r0, HSTATE_IN_GUEST(r13)
+
+	ld	r0, 112+PPC_LR_STKOFF(r1)
+	addi	r1, r1, 112
+	mtlr	r0
+	blr
+
+/*
+ * Check whether an HDSI is an HPTE not found fault or something else.
+ * If it is an HPTE not found fault that is due to the guest accessing
+ * a page that they have mapped but which we have paged out, then
+ * we continue on with the guest exit path.  In all other cases,
+ * reflect the HDSI to the guest as a DSI.
+ */
+kvmppc_hdsi:
+	mfspr	r4, SPRN_HDAR
+	mfspr	r6, SPRN_HDSISR
+	/* HPTE not found fault or protection fault? */
+	andis.	r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h
+	beq	1f			/* if not, send it to the guest */
+	andi.	r0, r11, MSR_DR		/* data relocation enabled? */
+	beq	3f
+	clrrdi	r0, r4, 28
+	PPC_SLBFEE_DOT(R5, R0)		/* if so, look up SLB */
+	bne	1f			/* if no SLB entry found */
+4:	std	r4, VCPU_FAULT_DAR(r9)
+	stw	r6, VCPU_FAULT_DSISR(r9)
+
+	/* Search the hash table. */
+	mr	r3, r9			/* vcpu pointer */
+	li	r7, 1			/* data fault */
+	bl	kvmppc_hpte_hv_fault
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	ld	r10, VCPU_PC(r9)
+	ld	r11, VCPU_MSR(r9)
+	li	r12, BOOK3S_INTERRUPT_H_DATA_STORAGE
+	cmpdi	r3, 0			/* retry the instruction */
+	beq	6f
+	cmpdi	r3, -1			/* handle in kernel mode */
+	beq	guest_exit_cont
+	cmpdi	r3, -2			/* MMIO emulation; need instr word */
+	beq	2f
+
+	/* Synthesize a DSI for the guest */
+	ld	r4, VCPU_FAULT_DAR(r9)
+	mr	r6, r3
+1:	mtspr	SPRN_DAR, r4
+	mtspr	SPRN_DSISR, r6
+	mtspr	SPRN_SRR0, r10
+	mtspr	SPRN_SRR1, r11
+	li	r10, BOOK3S_INTERRUPT_DATA_STORAGE
+	bl	kvmppc_msr_interrupt
+fast_interrupt_c_return:
+6:	ld	r7, VCPU_CTR(r9)
+	lwz	r8, VCPU_XER(r9)
+	mtctr	r7
+	mtxer	r8
+	mr	r4, r9
+	b	fast_guest_return
+
+3:	ld	r5, VCPU_KVM(r9)	/* not relocated, use VRMA */
+	ld	r5, KVM_VRMA_SLB_V(r5)
+	b	4b
+
+	/* If this is for emulated MMIO, load the instruction word */
+2:	li	r8, KVM_INST_FETCH_FAILED	/* In case lwz faults */
+
+	/* Set guest mode to 'jump over instruction' so if lwz faults
+	 * we'll just continue at the next IP. */
+	li	r0, KVM_GUEST_MODE_SKIP
+	stb	r0, HSTATE_IN_GUEST(r13)
+
+	/* Do the access with MSR:DR enabled */
+	mfmsr	r3
+	ori	r4, r3, MSR_DR		/* Enable paging for data */
+	mtmsrd	r4
+	lwz	r8, 0(r10)
+	mtmsrd	r3
+
+	/* Store the result */
+	stw	r8, VCPU_LAST_INST(r9)
+
+	/* Unset guest mode. */
+	li	r0, KVM_GUEST_MODE_HOST_HV
+	stb	r0, HSTATE_IN_GUEST(r13)
+	b	guest_exit_cont
+
+/*
+ * Similarly for an HISI, reflect it to the guest as an ISI unless
+ * it is an HPTE not found fault for a page that we have paged out.
+ */
+kvmppc_hisi:
+	andis.	r0, r11, SRR1_ISI_NOPT@h
+	beq	1f
+	andi.	r0, r11, MSR_IR		/* instruction relocation enabled? */
+	beq	3f
+	clrrdi	r0, r10, 28
+	PPC_SLBFEE_DOT(R5, R0)		/* if so, look up SLB */
+	bne	1f			/* if no SLB entry found */
+4:
+	/* Search the hash table. */
+	mr	r3, r9			/* vcpu pointer */
+	mr	r4, r10
+	mr	r6, r11
+	li	r7, 0			/* instruction fault */
+	bl	kvmppc_hpte_hv_fault
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	ld	r10, VCPU_PC(r9)
+	ld	r11, VCPU_MSR(r9)
+	li	r12, BOOK3S_INTERRUPT_H_INST_STORAGE
+	cmpdi	r3, 0			/* retry the instruction */
+	beq	fast_interrupt_c_return
+	cmpdi	r3, -1			/* handle in kernel mode */
+	beq	guest_exit_cont
+
+	/* Synthesize an ISI for the guest */
+	mr	r11, r3
+1:	mtspr	SPRN_SRR0, r10
+	mtspr	SPRN_SRR1, r11
+	li	r10, BOOK3S_INTERRUPT_INST_STORAGE
+	bl	kvmppc_msr_interrupt
+	b	fast_interrupt_c_return
+
+3:	ld	r6, VCPU_KVM(r9)	/* not relocated, use VRMA */
+	ld	r5, KVM_VRMA_SLB_V(r6)
+	b	4b
+
+/*
+ * Try to handle an hcall in real mode.
+ * Returns to the guest if we handle it, or continues on up to
+ * the kernel if we can't (i.e. if we don't have a handler for
+ * it, or if the handler returns H_TOO_HARD).
+ *
+ * r5 - r8 contain hcall args,
+ * r9 = vcpu, r10 = pc, r11 = msr, r12 = trap, r13 = paca
+ */
+hcall_try_real_mode:
+	ld	r3,VCPU_GPR(R3)(r9)
+	andi.	r0,r11,MSR_PR
+	/* sc 1 from userspace - reflect to guest syscall */
+	bne	sc_1_fast_return
+	clrrdi	r3,r3,2
+	cmpldi	r3,hcall_real_table_end - hcall_real_table
+	bge	guest_exit_cont
+	/* See if this hcall is enabled for in-kernel handling */
+	ld	r4, VCPU_KVM(r9)
+	srdi	r0, r3, 8	/* r0 = (r3 / 4) >> 6 */
+	sldi	r0, r0, 3	/* index into kvm->arch.enabled_hcalls[] */
+	add	r4, r4, r0
+	ld	r0, KVM_ENABLED_HCALLS(r4)
+	rlwinm	r4, r3, 32-2, 0x3f	/* r4 = (r3 / 4) & 0x3f */
+	srd	r0, r0, r4
+	andi.	r0, r0, 1
+	beq	guest_exit_cont
+	/* Get pointer to handler, if any, and call it */
+	LOAD_REG_ADDR(r4, hcall_real_table)
+	lwax	r3,r3,r4
+	cmpwi	r3,0
+	beq	guest_exit_cont
+	add	r12,r3,r4
+	mtctr	r12
+	mr	r3,r9		/* get vcpu pointer */
+	ld	r4,VCPU_GPR(R4)(r9)
+	bctrl
+	cmpdi	r3,H_TOO_HARD
+	beq	hcall_real_fallback
+	ld	r4,HSTATE_KVM_VCPU(r13)
+	std	r3,VCPU_GPR(R3)(r4)
+	ld	r10,VCPU_PC(r4)
+	ld	r11,VCPU_MSR(r4)
+	b	fast_guest_return
+
+sc_1_fast_return:
+	mtspr	SPRN_SRR0,r10
+	mtspr	SPRN_SRR1,r11
+	li	r10, BOOK3S_INTERRUPT_SYSCALL
+	bl	kvmppc_msr_interrupt
+	mr	r4,r9
+	b	fast_guest_return
+
+	/* We've attempted a real mode hcall, but it's punted it back
+	 * to userspace.  We need to restore some clobbered volatiles
+	 * before resuming the pass-it-to-qemu path */
+hcall_real_fallback:
+	li	r12,BOOK3S_INTERRUPT_SYSCALL
+	ld	r9, HSTATE_KVM_VCPU(r13)
+
+	b	guest_exit_cont
+
+	.globl	hcall_real_table
+hcall_real_table:
+	.long	0		/* 0 - unused */
+	.long	DOTSYM(kvmppc_h_remove) - hcall_real_table
+	.long	DOTSYM(kvmppc_h_enter) - hcall_real_table
+	.long	DOTSYM(kvmppc_h_read) - hcall_real_table
+	.long	0		/* 0x10 - H_CLEAR_MOD */
+	.long	0		/* 0x14 - H_CLEAR_REF */
+	.long	DOTSYM(kvmppc_h_protect) - hcall_real_table
+	.long	DOTSYM(kvmppc_h_get_tce) - hcall_real_table
+	.long	DOTSYM(kvmppc_h_put_tce) - hcall_real_table
+	.long	0		/* 0x24 - H_SET_SPRG0 */
+	.long	DOTSYM(kvmppc_h_set_dabr) - hcall_real_table
+	.long	0		/* 0x2c */
+	.long	0		/* 0x30 */
+	.long	0		/* 0x34 */
+	.long	0		/* 0x38 */
+	.long	0		/* 0x3c */
+	.long	0		/* 0x40 */
+	.long	0		/* 0x44 */
+	.long	0		/* 0x48 */
+	.long	0		/* 0x4c */
+	.long	0		/* 0x50 */
+	.long	0		/* 0x54 */
+	.long	0		/* 0x58 */
+	.long	0		/* 0x5c */
+	.long	0		/* 0x60 */
+#ifdef CONFIG_KVM_XICS
+	.long	DOTSYM(kvmppc_rm_h_eoi) - hcall_real_table
+	.long	DOTSYM(kvmppc_rm_h_cppr) - hcall_real_table
+	.long	DOTSYM(kvmppc_rm_h_ipi) - hcall_real_table
+	.long	0		/* 0x70 - H_IPOLL */
+	.long	DOTSYM(kvmppc_rm_h_xirr) - hcall_real_table
+#else
+	.long	0		/* 0x64 - H_EOI */
+	.long	0		/* 0x68 - H_CPPR */
+	.long	0		/* 0x6c - H_IPI */
+	.long	0		/* 0x70 - H_IPOLL */
+	.long	0		/* 0x74 - H_XIRR */
+#endif
+	.long	0		/* 0x78 */
+	.long	0		/* 0x7c */
+	.long	0		/* 0x80 */
+	.long	0		/* 0x84 */
+	.long	0		/* 0x88 */
+	.long	0		/* 0x8c */
+	.long	0		/* 0x90 */
+	.long	0		/* 0x94 */
+	.long	0		/* 0x98 */
+	.long	0		/* 0x9c */
+	.long	0		/* 0xa0 */
+	.long	0		/* 0xa4 */
+	.long	0		/* 0xa8 */
+	.long	0		/* 0xac */
+	.long	0		/* 0xb0 */
+	.long	0		/* 0xb4 */
+	.long	0		/* 0xb8 */
+	.long	0		/* 0xbc */
+	.long	0		/* 0xc0 */
+	.long	0		/* 0xc4 */
+	.long	0		/* 0xc8 */
+	.long	0		/* 0xcc */
+	.long	0		/* 0xd0 */
+	.long	0		/* 0xd4 */
+	.long	0		/* 0xd8 */
+	.long	0		/* 0xdc */
+	.long	DOTSYM(kvmppc_h_cede) - hcall_real_table
+	.long	DOTSYM(kvmppc_rm_h_confer) - hcall_real_table
+	.long	0		/* 0xe8 */
+	.long	0		/* 0xec */
+	.long	0		/* 0xf0 */
+	.long	0		/* 0xf4 */
+	.long	0		/* 0xf8 */
+	.long	0		/* 0xfc */
+	.long	0		/* 0x100 */
+	.long	0		/* 0x104 */
+	.long	0		/* 0x108 */
+	.long	0		/* 0x10c */
+	.long	0		/* 0x110 */
+	.long	0		/* 0x114 */
+	.long	0		/* 0x118 */
+	.long	0		/* 0x11c */
+	.long	0		/* 0x120 */
+	.long	DOTSYM(kvmppc_h_bulk_remove) - hcall_real_table
+	.long	0		/* 0x128 */
+	.long	0		/* 0x12c */
+	.long	0		/* 0x130 */
+	.long	DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table
+	.long	0		/* 0x138 */
+	.long	0		/* 0x13c */
+	.long	0		/* 0x140 */
+	.long	0		/* 0x144 */
+	.long	0		/* 0x148 */
+	.long	0		/* 0x14c */
+	.long	0		/* 0x150 */
+	.long	0		/* 0x154 */
+	.long	0		/* 0x158 */
+	.long	0		/* 0x15c */
+	.long	0		/* 0x160 */
+	.long	0		/* 0x164 */
+	.long	0		/* 0x168 */
+	.long	0		/* 0x16c */
+	.long	0		/* 0x170 */
+	.long	0		/* 0x174 */
+	.long	0		/* 0x178 */
+	.long	0		/* 0x17c */
+	.long	0		/* 0x180 */
+	.long	0		/* 0x184 */
+	.long	0		/* 0x188 */
+	.long	0		/* 0x18c */
+	.long	0		/* 0x190 */
+	.long	0		/* 0x194 */
+	.long	0		/* 0x198 */
+	.long	0		/* 0x19c */
+	.long	0		/* 0x1a0 */
+	.long	0		/* 0x1a4 */
+	.long	0		/* 0x1a8 */
+	.long	0		/* 0x1ac */
+	.long	0		/* 0x1b0 */
+	.long	0		/* 0x1b4 */
+	.long	0		/* 0x1b8 */
+	.long	0		/* 0x1bc */
+	.long	0		/* 0x1c0 */
+	.long	0		/* 0x1c4 */
+	.long	0		/* 0x1c8 */
+	.long	0		/* 0x1cc */
+	.long	0		/* 0x1d0 */
+	.long	0		/* 0x1d4 */
+	.long	0		/* 0x1d8 */
+	.long	0		/* 0x1dc */
+	.long	0		/* 0x1e0 */
+	.long	0		/* 0x1e4 */
+	.long	0		/* 0x1e8 */
+	.long	0		/* 0x1ec */
+	.long	0		/* 0x1f0 */
+	.long	0		/* 0x1f4 */
+	.long	0		/* 0x1f8 */
+	.long	0		/* 0x1fc */
+	.long	0		/* 0x200 */
+	.long	0		/* 0x204 */
+	.long	0		/* 0x208 */
+	.long	0		/* 0x20c */
+	.long	0		/* 0x210 */
+	.long	0		/* 0x214 */
+	.long	0		/* 0x218 */
+	.long	0		/* 0x21c */
+	.long	0		/* 0x220 */
+	.long	0		/* 0x224 */
+	.long	0		/* 0x228 */
+	.long	0		/* 0x22c */
+	.long	0		/* 0x230 */
+	.long	0		/* 0x234 */
+	.long	0		/* 0x238 */
+	.long	0		/* 0x23c */
+	.long	0		/* 0x240 */
+	.long	0		/* 0x244 */
+	.long	0		/* 0x248 */
+	.long	0		/* 0x24c */
+	.long	0		/* 0x250 */
+	.long	0		/* 0x254 */
+	.long	0		/* 0x258 */
+	.long	0		/* 0x25c */
+	.long	0		/* 0x260 */
+	.long	0		/* 0x264 */
+	.long	0		/* 0x268 */
+	.long	0		/* 0x26c */
+	.long	0		/* 0x270 */
+	.long	0		/* 0x274 */
+	.long	0		/* 0x278 */
+	.long	0		/* 0x27c */
+	.long	0		/* 0x280 */
+	.long	0		/* 0x284 */
+	.long	0		/* 0x288 */
+	.long	0		/* 0x28c */
+	.long	0		/* 0x290 */
+	.long	0		/* 0x294 */
+	.long	0		/* 0x298 */
+	.long	0		/* 0x29c */
+	.long	0		/* 0x2a0 */
+	.long	0		/* 0x2a4 */
+	.long	0		/* 0x2a8 */
+	.long	0		/* 0x2ac */
+	.long	0		/* 0x2b0 */
+	.long	0		/* 0x2b4 */
+	.long	0		/* 0x2b8 */
+	.long	0		/* 0x2bc */
+	.long	0		/* 0x2c0 */
+	.long	0		/* 0x2c4 */
+	.long	0		/* 0x2c8 */
+	.long	0		/* 0x2cc */
+	.long	0		/* 0x2d0 */
+	.long	0		/* 0x2d4 */
+	.long	0		/* 0x2d8 */
+	.long	0		/* 0x2dc */
+	.long	0		/* 0x2e0 */
+	.long	0		/* 0x2e4 */
+	.long	0		/* 0x2e8 */
+	.long	0		/* 0x2ec */
+	.long	0		/* 0x2f0 */
+	.long	0		/* 0x2f4 */
+	.long	0		/* 0x2f8 */
+	.long	0		/* 0x2fc */
+	.long	DOTSYM(kvmppc_h_random) - hcall_real_table
+	.globl	hcall_real_table_end
+hcall_real_table_end:
+
+_GLOBAL(kvmppc_h_set_xdabr)
+	andi.	r0, r5, DABRX_USER | DABRX_KERNEL
+	beq	6f
+	li	r0, DABRX_USER | DABRX_KERNEL | DABRX_BTI
+	andc.	r0, r5, r0
+	beq	3f
+6:	li	r3, H_PARAMETER
+	blr
+
+_GLOBAL(kvmppc_h_set_dabr)
+	li	r5, DABRX_USER | DABRX_KERNEL
+3:
+BEGIN_FTR_SECTION
+	b	2f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	std	r4,VCPU_DABR(r3)
+	stw	r5, VCPU_DABRX(r3)
+	mtspr	SPRN_DABRX, r5
+	/* Work around P7 bug where DABR can get corrupted on mtspr */
+1:	mtspr	SPRN_DABR,r4
+	mfspr	r5, SPRN_DABR
+	cmpd	r4, r5
+	bne	1b
+	isync
+	li	r3,0
+	blr
+
+	/* Emulate H_SET_DABR/X on P8 for the sake of compat mode guests */
+2:	rlwimi	r5, r4, 5, DAWRX_DR | DAWRX_DW
+	rlwimi	r5, r4, 1, DAWRX_WT
+	clrrdi	r4, r4, 3
+	std	r4, VCPU_DAWR(r3)
+	std	r5, VCPU_DAWRX(r3)
+	mtspr	SPRN_DAWR, r4
+	mtspr	SPRN_DAWRX, r5
+	li	r3, 0
+	blr
+
+_GLOBAL(kvmppc_h_cede)		/* r3 = vcpu pointer, r11 = msr, r13 = paca */
+	ori	r11,r11,MSR_EE
+	std	r11,VCPU_MSR(r3)
+	li	r0,1
+	stb	r0,VCPU_CEDED(r3)
+	sync			/* order setting ceded vs. testing prodded */
+	lbz	r5,VCPU_PRODDED(r3)
+	cmpwi	r5,0
+	bne	kvm_cede_prodded
+	li	r12,0		/* set trap to 0 to say hcall is handled */
+	stw	r12,VCPU_TRAP(r3)
+	li	r0,H_SUCCESS
+	std	r0,VCPU_GPR(R3)(r3)
+
+	/*
+	 * Set our bit in the bitmask of napping threads unless all the
+	 * other threads are already napping, in which case we send this
+	 * up to the host.
+	 */
+	ld	r5,HSTATE_KVM_VCORE(r13)
+	lbz	r6,HSTATE_PTID(r13)
+	lwz	r8,VCORE_ENTRY_EXIT(r5)
+	clrldi	r8,r8,56
+	li	r0,1
+	sld	r0,r0,r6
+	addi	r6,r5,VCORE_NAPPING_THREADS
+31:	lwarx	r4,0,r6
+	or	r4,r4,r0
+	cmpw	r4,r8
+	beq	kvm_cede_exit
+	stwcx.	r4,0,r6
+	bne	31b
+	/* order napping_threads update vs testing entry_exit_map */
+	isync
+	li	r0,NAPPING_CEDE
+	stb	r0,HSTATE_NAPPING(r13)
+	lwz	r7,VCORE_ENTRY_EXIT(r5)
+	cmpwi	r7,0x100
+	bge	33f		/* another thread already exiting */
+
+/*
+ * Although not specifically required by the architecture, POWER7
+ * preserves the following registers in nap mode, even if an SMT mode
+ * switch occurs: SLB entries, PURR, SPURR, AMOR, UAMOR, AMR, SPRG0-3,
+ * DAR, DSISR, DABR, DABRX, DSCR, PMCx, MMCRx, SIAR, SDAR.
+ */
+	/* Save non-volatile GPRs */
+	std	r14, VCPU_GPR(R14)(r3)
+	std	r15, VCPU_GPR(R15)(r3)
+	std	r16, VCPU_GPR(R16)(r3)
+	std	r17, VCPU_GPR(R17)(r3)
+	std	r18, VCPU_GPR(R18)(r3)
+	std	r19, VCPU_GPR(R19)(r3)
+	std	r20, VCPU_GPR(R20)(r3)
+	std	r21, VCPU_GPR(R21)(r3)
+	std	r22, VCPU_GPR(R22)(r3)
+	std	r23, VCPU_GPR(R23)(r3)
+	std	r24, VCPU_GPR(R24)(r3)
+	std	r25, VCPU_GPR(R25)(r3)
+	std	r26, VCPU_GPR(R26)(r3)
+	std	r27, VCPU_GPR(R27)(r3)
+	std	r28, VCPU_GPR(R28)(r3)
+	std	r29, VCPU_GPR(R29)(r3)
+	std	r30, VCPU_GPR(R30)(r3)
+	std	r31, VCPU_GPR(R31)(r3)
+
+	/* save FP state */
+	bl	kvmppc_save_fp
+
+	/*
+	 * Set DEC to the smaller of DEC and HDEC, so that we wake
+	 * no later than the end of our timeslice (HDEC interrupts
+	 * don't wake us from nap).
+	 */
+	mfspr	r3, SPRN_DEC
+	mfspr	r4, SPRN_HDEC
+	mftb	r5
+	cmpw	r3, r4
+	ble	67f
+	mtspr	SPRN_DEC, r4
+67:
+	/* save expiry time of guest decrementer */
+	extsw	r3, r3
+	add	r3, r3, r5
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	ld	r6, VCORE_TB_OFFSET(r5)
+	subf	r3, r6, r3	/* convert to host TB value */
+	std	r3, VCPU_DEC_EXPIRES(r4)
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	addi	r3, r4, VCPU_TB_CEDE
+	bl	kvmhv_accumulate_time
+#endif
+
+	lis	r3, LPCR_PECEDP@h	/* Do wake on privileged doorbell */
+
+	/*
+	 * Take a nap until a decrementer or external or doobell interrupt
+	 * occurs, with PECE1 and PECE0 set in LPCR.
+	 * On POWER8, set PECEDH, and if we are ceding, also set PECEDP.
+	 * Also clear the runlatch bit before napping.
+	 */
+kvm_do_nap:
+	mfspr	r0, SPRN_CTRLF
+	clrrdi	r0, r0, 1
+	mtspr	SPRN_CTRLT, r0
+
+	li	r0,1
+	stb	r0,HSTATE_HWTHREAD_REQ(r13)
+	mfspr	r5,SPRN_LPCR
+	ori	r5,r5,LPCR_PECE0 | LPCR_PECE1
+BEGIN_FTR_SECTION
+	ori	r5, r5, LPCR_PECEDH
+	rlwimi	r5, r3, 0, LPCR_PECEDP
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	mtspr	SPRN_LPCR,r5
+	isync
+	li	r0, 0
+	std	r0, HSTATE_SCRATCH0(r13)
+	ptesync
+	ld	r0, HSTATE_SCRATCH0(r13)
+1:	cmpd	r0, r0
+	bne	1b
+	nap
+	b	.
+
+33:	mr	r4, r3
+	li	r3, 0
+	li	r12, 0
+	b	34f
+
+kvm_end_cede:
+	/* get vcpu pointer */
+	ld	r4, HSTATE_KVM_VCPU(r13)
+
+	/* Woken by external or decrementer interrupt */
+	ld	r1, HSTATE_HOST_R1(r13)
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	addi	r3, r4, VCPU_TB_RMINTR
+	bl	kvmhv_accumulate_time
+#endif
+
+	/* load up FP state */
+	bl	kvmppc_load_fp
+
+	/* Restore guest decrementer */
+	ld	r3, VCPU_DEC_EXPIRES(r4)
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	ld	r6, VCORE_TB_OFFSET(r5)
+	add	r3, r3, r6	/* convert host TB to guest TB value */
+	mftb	r7
+	subf	r3, r7, r3
+	mtspr	SPRN_DEC, r3
+
+	/* Load NV GPRS */
+	ld	r14, VCPU_GPR(R14)(r4)
+	ld	r15, VCPU_GPR(R15)(r4)
+	ld	r16, VCPU_GPR(R16)(r4)
+	ld	r17, VCPU_GPR(R17)(r4)
+	ld	r18, VCPU_GPR(R18)(r4)
+	ld	r19, VCPU_GPR(R19)(r4)
+	ld	r20, VCPU_GPR(R20)(r4)
+	ld	r21, VCPU_GPR(R21)(r4)
+	ld	r22, VCPU_GPR(R22)(r4)
+	ld	r23, VCPU_GPR(R23)(r4)
+	ld	r24, VCPU_GPR(R24)(r4)
+	ld	r25, VCPU_GPR(R25)(r4)
+	ld	r26, VCPU_GPR(R26)(r4)
+	ld	r27, VCPU_GPR(R27)(r4)
+	ld	r28, VCPU_GPR(R28)(r4)
+	ld	r29, VCPU_GPR(R29)(r4)
+	ld	r30, VCPU_GPR(R30)(r4)
+	ld	r31, VCPU_GPR(R31)(r4)
+ 
+	/* Check the wake reason in SRR1 to see why we got here */
+	bl	kvmppc_check_wake_reason
+
+	/* clear our bit in vcore->napping_threads */
+34:	ld	r5,HSTATE_KVM_VCORE(r13)
+	lbz	r7,HSTATE_PTID(r13)
+	li	r0,1
+	sld	r0,r0,r7
+	addi	r6,r5,VCORE_NAPPING_THREADS
+32:	lwarx	r7,0,r6
+	andc	r7,r7,r0
+	stwcx.	r7,0,r6
+	bne	32b
+	li	r0,0
+	stb	r0,HSTATE_NAPPING(r13)
+
+	/* See if the wake reason means we need to exit */
+	stw	r12, VCPU_TRAP(r4)
+	mr	r9, r4
+	cmpdi	r3, 0
+	bgt	guest_exit_cont
+
+	/* see if any other thread is already exiting */
+	lwz	r0,VCORE_ENTRY_EXIT(r5)
+	cmpwi	r0,0x100
+	bge	guest_exit_cont
+
+	b	kvmppc_cede_reentry	/* if not go back to guest */
+
+	/* cede when already previously prodded case */
+kvm_cede_prodded:
+	li	r0,0
+	stb	r0,VCPU_PRODDED(r3)
+	sync			/* order testing prodded vs. clearing ceded */
+	stb	r0,VCPU_CEDED(r3)
+	li	r3,H_SUCCESS
+	blr
+
+	/* we've ceded but we want to give control to the host */
+kvm_cede_exit:
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	b	guest_exit_cont
+
+	/* Try to handle a machine check in real mode */
+machine_check_realmode:
+	mr	r3, r9		/* get vcpu pointer */
+	bl	kvmppc_realmode_machine_check
+	nop
+	cmpdi	r3, 0		/* Did we handle MCE ? */
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	li	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
+	/*
+	 * Deliver unhandled/fatal (e.g. UE) MCE errors to guest through
+	 * machine check interrupt (set HSRR0 to 0x200). And for handled
+	 * errors (no-fatal), just go back to guest execution with current
+	 * HSRR0 instead of exiting guest. This new approach will inject
+	 * machine check to guest for fatal error causing guest to crash.
+	 *
+	 * The old code used to return to host for unhandled errors which
+	 * was causing guest to hang with soft lockups inside guest and
+	 * makes it difficult to recover guest instance.
+	 */
+	ld	r10, VCPU_PC(r9)
+	ld	r11, VCPU_MSR(r9)
+	bne	2f	/* Continue guest execution. */
+	/* If not, deliver a machine check.  SRR0/1 are already set */
+	li	r10, BOOK3S_INTERRUPT_MACHINE_CHECK
+	ld	r11, VCPU_MSR(r9)
+	bl	kvmppc_msr_interrupt
+2:	b	fast_interrupt_c_return
+
+/*
+ * Check the reason we woke from nap, and take appropriate action.
+ * Returns (in r3):
+ *	0 if nothing needs to be done
+ *	1 if something happened that needs to be handled by the host
+ *	-1 if there was a guest wakeup (IPI or msgsnd)
+ *
+ * Also sets r12 to the interrupt vector for any interrupt that needs
+ * to be handled now by the host (0x500 for external interrupt), or zero.
+ * Modifies r0, r6, r7, r8.
+ */
+kvmppc_check_wake_reason:
+	mfspr	r6, SPRN_SRR1
+BEGIN_FTR_SECTION
+	rlwinm	r6, r6, 45-31, 0xf	/* extract wake reason field (P8) */
+FTR_SECTION_ELSE
+	rlwinm	r6, r6, 45-31, 0xe	/* P7 wake reason field is 3 bits */
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S)
+	cmpwi	r6, 8			/* was it an external interrupt? */
+	li	r12, BOOK3S_INTERRUPT_EXTERNAL
+	beq	kvmppc_read_intr	/* if so, see what it was */
+	li	r3, 0
+	li	r12, 0
+	cmpwi	r6, 6			/* was it the decrementer? */
+	beq	0f
+BEGIN_FTR_SECTION
+	cmpwi	r6, 5			/* privileged doorbell? */
+	beq	0f
+	cmpwi	r6, 3			/* hypervisor doorbell? */
+	beq	3f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	li	r3, 1			/* anything else, return 1 */
+0:	blr
+
+	/* hypervisor doorbell */
+3:	li	r12, BOOK3S_INTERRUPT_H_DOORBELL
+	/* see if it's a host IPI */
+	li	r3, 1
+	lbz	r0, HSTATE_HOST_IPI(r13)
+	cmpwi	r0, 0
+	bnelr
+	/* if not, clear it and return -1 */
+	lis	r6, (PPC_DBELL_SERVER << (63-36))@h
+	PPC_MSGCLR(6)
+	li	r3, -1
+	blr
+
+/*
+ * Determine what sort of external interrupt is pending (if any).
+ * Returns:
+ *	0 if no interrupt is pending
+ *	1 if an interrupt is pending that needs to be handled by the host
+ *	-1 if there was a guest wakeup IPI (which has now been cleared)
+ * Modifies r0, r6, r7, r8, returns value in r3.
+ */
+kvmppc_read_intr:
+	/* see if a host IPI is pending */
+	li	r3, 1
+	lbz	r0, HSTATE_HOST_IPI(r13)
+	cmpwi	r0, 0
+	bne	1f
+
+	/* Now read the interrupt from the ICP */
+	ld	r6, HSTATE_XICS_PHYS(r13)
+	li	r7, XICS_XIRR
+	cmpdi	r6, 0
+	beq-	1f
+	lwzcix	r0, r6, r7
+	/*
+	 * Save XIRR for later. Since we get in in reverse endian on LE
+	 * systems, save it byte reversed and fetch it back in host endian.
+	 */
+	li	r3, HSTATE_SAVED_XIRR
+	STWX_BE	r0, r3, r13
+#ifdef __LITTLE_ENDIAN__
+	lwz	r3, HSTATE_SAVED_XIRR(r13)
+#else
+	mr	r3, r0
+#endif
+	rlwinm.	r3, r3, 0, 0xffffff
+	sync
+	beq	1f			/* if nothing pending in the ICP */
+
+	/* We found something in the ICP...
+	 *
+	 * If it's not an IPI, stash it in the PACA and return to
+	 * the host, we don't (yet) handle directing real external
+	 * interrupts directly to the guest
+	 */
+	cmpwi	r3, XICS_IPI		/* if there is, is it an IPI? */
+	bne	42f
+
+	/* It's an IPI, clear the MFRR and EOI it */
+	li	r3, 0xff
+	li	r8, XICS_MFRR
+	stbcix	r3, r6, r8		/* clear the IPI */
+	stwcix	r0, r6, r7		/* EOI it */
+	sync
+
+	/* We need to re-check host IPI now in case it got set in the
+	 * meantime. If it's clear, we bounce the interrupt to the
+	 * guest
+	 */
+	lbz	r0, HSTATE_HOST_IPI(r13)
+	cmpwi	r0, 0
+	bne-	43f
+
+	/* OK, it's an IPI for us */
+	li	r12, 0
+	li	r3, -1
+1:	blr
+
+42:	/* It's not an IPI and it's for the host. We saved a copy of XIRR in
+	 * the PACA earlier, it will be picked up by the host ICP driver
+	 */
+	li	r3, 1
+	b	1b
+
+43:	/* We raced with the host, we need to resend that IPI, bummer */
+	li	r0, IPI_PRIORITY
+	stbcix	r0, r6, r8		/* set the IPI */
+	sync
+	li	r3, 1
+	b	1b
+
+/*
+ * Save away FP, VMX and VSX registers.
+ * r3 = vcpu pointer
+ * N.B. r30 and r31 are volatile across this function,
+ * thus it is not callable from C.
+ */
+kvmppc_save_fp:
+	mflr	r30
+	mr	r31,r3
+	mfmsr	r5
+	ori	r8,r5,MSR_FP
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	oris	r8,r8,MSR_VEC@h
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	oris	r8,r8,MSR_VSX@h
+END_FTR_SECTION_IFSET(CPU_FTR_VSX)
+#endif
+	mtmsrd	r8
+	addi	r3,r3,VCPU_FPRS
+	bl	store_fp_state
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	addi	r3,r31,VCPU_VRS
+	bl	store_vr_state
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+	mfspr	r6,SPRN_VRSAVE
+	stw	r6,VCPU_VRSAVE(r31)
+	mtlr	r30
+	blr
+
+/*
+ * Load up FP, VMX and VSX registers
+ * r4 = vcpu pointer
+ * N.B. r30 and r31 are volatile across this function,
+ * thus it is not callable from C.
+ */
+kvmppc_load_fp:
+	mflr	r30
+	mr	r31,r4
+	mfmsr	r9
+	ori	r8,r9,MSR_FP
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	oris	r8,r8,MSR_VEC@h
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	oris	r8,r8,MSR_VSX@h
+END_FTR_SECTION_IFSET(CPU_FTR_VSX)
+#endif
+	mtmsrd	r8
+	addi	r3,r4,VCPU_FPRS
+	bl	load_fp_state
+#ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
+	addi	r3,r31,VCPU_VRS
+	bl	load_vr_state
+END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
+#endif
+	lwz	r7,VCPU_VRSAVE(r31)
+	mtspr	SPRN_VRSAVE,r7
+	mtlr	r30
+	mr	r4,r31
+	blr
+
+/*
+ * We come here if we get any exception or interrupt while we are
+ * executing host real mode code while in guest MMU context.
+ * For now just spin, but we should do something better.
+ */
+kvmppc_bad_host_intr:
+	b	.
+
+/*
+ * This mimics the MSR transition on IRQ delivery.  The new guest MSR is taken
+ * from VCPU_INTR_MSR and is modified based on the required TM state changes.
+ *   r11 has the guest MSR value (in/out)
+ *   r9 has a vcpu pointer (in)
+ *   r0 is used as a scratch register
+ */
+kvmppc_msr_interrupt:
+	rldicl	r0, r11, 64 - MSR_TS_S_LG, 62
+	cmpwi	r0, 2 /* Check if we are in transactional state..  */
+	ld	r11, VCPU_INTR_MSR(r9)
+	bne	1f
+	/* ... if transactional, change to suspended */
+	li	r0, 1
+1:	rldimi	r11, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+	blr
+
+/*
+ * This works around a hardware bug on POWER8E processors, where
+ * writing a 1 to the MMCR0[PMAO] bit doesn't generate a
+ * performance monitor interrupt.  Instead, when we need to have
+ * an interrupt pending, we have to arrange for a counter to overflow.
+ */
+kvmppc_fix_pmao:
+	li	r3, 0
+	mtspr	SPRN_MMCR2, r3
+	lis	r3, (MMCR0_PMXE | MMCR0_FCECE)@h
+	ori	r3, r3, MMCR0_PMCjCE | MMCR0_C56RUN
+	mtspr	SPRN_MMCR0, r3
+	lis	r3, 0x7fff
+	ori	r3, r3, 0xffff
+	mtspr	SPRN_PMC6, r3
+	isync
+	blr
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+/*
+ * Start timing an activity
+ * r3 = pointer to time accumulation struct, r4 = vcpu
+ */
+kvmhv_start_timing:
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	lbz	r6, VCORE_IN_GUEST(r5)
+	cmpwi	r6, 0
+	beq	5f				/* if in guest, need to */
+	ld	r6, VCORE_TB_OFFSET(r5)		/* subtract timebase offset */
+5:	mftb	r5
+	subf	r5, r6, r5
+	std	r3, VCPU_CUR_ACTIVITY(r4)
+	std	r5, VCPU_ACTIVITY_START(r4)
+	blr
+
+/*
+ * Accumulate time to one activity and start another.
+ * r3 = pointer to new time accumulation struct, r4 = vcpu
+ */
+kvmhv_accumulate_time:
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	lbz	r8, VCORE_IN_GUEST(r5)
+	cmpwi	r8, 0
+	beq	4f				/* if in guest, need to */
+	ld	r8, VCORE_TB_OFFSET(r5)		/* subtract timebase offset */
+4:	ld	r5, VCPU_CUR_ACTIVITY(r4)
+	ld	r6, VCPU_ACTIVITY_START(r4)
+	std	r3, VCPU_CUR_ACTIVITY(r4)
+	mftb	r7
+	subf	r7, r8, r7
+	std	r7, VCPU_ACTIVITY_START(r4)
+	cmpdi	r5, 0
+	beqlr
+	subf	r3, r6, r7
+	ld	r8, TAS_SEQCOUNT(r5)
+	cmpdi	r8, 0
+	addi	r8, r8, 1
+	std	r8, TAS_SEQCOUNT(r5)
+	lwsync
+	ld	r7, TAS_TOTAL(r5)
+	add	r7, r7, r3
+	std	r7, TAS_TOTAL(r5)
+	ld	r6, TAS_MIN(r5)
+	ld	r7, TAS_MAX(r5)
+	beq	3f
+	cmpd	r3, r6
+	bge	1f
+3:	std	r3, TAS_MIN(r5)
+1:	cmpd	r3, r7
+	ble	2f
+	std	r3, TAS_MAX(r5)
+2:	lwsync
+	addi	r8, r8, 1
+	std	r8, TAS_SEQCOUNT(r5)
+	blr
+#endif
diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S
new file mode 100644
index 000000000..d044b8b7c
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_interrupts.S
@@ -0,0 +1,253 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include <asm/exception-64s.h>
+
+#if defined(CONFIG_PPC_BOOK3S_64)
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+#define FUNC(name) 		name
+#else
+#define FUNC(name) 		GLUE(.,name)
+#endif
+#define GET_SHADOW_VCPU(reg)    addi	reg, r13, PACA_SVCPU
+
+#elif defined(CONFIG_PPC_BOOK3S_32)
+#define FUNC(name)		name
+#define GET_SHADOW_VCPU(reg)	lwz     reg, (THREAD + THREAD_KVM_SVCPU)(r2)
+
+#endif /* CONFIG_PPC_BOOK3S_XX */
+
+#define VCPU_LOAD_NVGPRS(vcpu) \
+	PPC_LL	r14, VCPU_GPR(R14)(vcpu); \
+	PPC_LL	r15, VCPU_GPR(R15)(vcpu); \
+	PPC_LL	r16, VCPU_GPR(R16)(vcpu); \
+	PPC_LL	r17, VCPU_GPR(R17)(vcpu); \
+	PPC_LL	r18, VCPU_GPR(R18)(vcpu); \
+	PPC_LL	r19, VCPU_GPR(R19)(vcpu); \
+	PPC_LL	r20, VCPU_GPR(R20)(vcpu); \
+	PPC_LL	r21, VCPU_GPR(R21)(vcpu); \
+	PPC_LL	r22, VCPU_GPR(R22)(vcpu); \
+	PPC_LL	r23, VCPU_GPR(R23)(vcpu); \
+	PPC_LL	r24, VCPU_GPR(R24)(vcpu); \
+	PPC_LL	r25, VCPU_GPR(R25)(vcpu); \
+	PPC_LL	r26, VCPU_GPR(R26)(vcpu); \
+	PPC_LL	r27, VCPU_GPR(R27)(vcpu); \
+	PPC_LL	r28, VCPU_GPR(R28)(vcpu); \
+	PPC_LL	r29, VCPU_GPR(R29)(vcpu); \
+	PPC_LL	r30, VCPU_GPR(R30)(vcpu); \
+	PPC_LL	r31, VCPU_GPR(R31)(vcpu); \
+
+/*****************************************************************************
+ *                                                                           *
+ *     Guest entry / exit code that is in kernel module memory (highmem)     *
+ *                                                                           *
+ ****************************************************************************/
+
+/* Registers:
+ *  r3: kvm_run pointer
+ *  r4: vcpu pointer
+ */
+_GLOBAL(__kvmppc_vcpu_run)
+
+kvm_start_entry:
+	/* Write correct stack frame */
+	mflr	r0
+	PPC_STL	r0,PPC_LR_STKOFF(r1)
+
+	/* Save host state to the stack */
+	PPC_STLU r1, -SWITCH_FRAME_SIZE(r1)
+
+	/* Save r3 (kvm_run) and r4 (vcpu) */
+	SAVE_2GPRS(3, r1)
+
+	/* Save non-volatile registers (r14 - r31) */
+	SAVE_NVGPRS(r1)
+
+	/* Save CR */
+	mfcr	r14
+	stw	r14, _CCR(r1)
+
+	/* Save LR */
+	PPC_STL	r0, _LINK(r1)
+
+	/* Load non-volatile guest state from the vcpu */
+	VCPU_LOAD_NVGPRS(r4)
+
+kvm_start_lightweight:
+	/* Copy registers into shadow vcpu so we can access them in real mode */
+	GET_SHADOW_VCPU(r3)
+	bl	FUNC(kvmppc_copy_to_svcpu)
+	nop
+	REST_GPR(4, r1)
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* Get the dcbz32 flag */
+	PPC_LL	r3, VCPU_HFLAGS(r4)
+	rldicl	r3, r3, 0, 63		/* r3 &= 1 */
+	stb	r3, HSTATE_RESTORE_HID5(r13)
+
+	/* Load up guest SPRG3 value, since it's user readable */
+	lwz	r3, VCPU_SHAREDBE(r4)
+	cmpwi	r3, 0
+	ld	r5, VCPU_SHARED(r4)
+	beq	sprg3_little_endian
+sprg3_big_endian:
+#ifdef __BIG_ENDIAN__
+	ld	r3, VCPU_SHARED_SPRG3(r5)
+#else
+	addi	r5, r5, VCPU_SHARED_SPRG3
+	ldbrx	r3, 0, r5
+#endif
+	b	after_sprg3_load
+sprg3_little_endian:
+#ifdef __LITTLE_ENDIAN__
+	ld	r3, VCPU_SHARED_SPRG3(r5)
+#else
+	addi	r5, r5, VCPU_SHARED_SPRG3
+	ldbrx	r3, 0, r5
+#endif
+
+after_sprg3_load:
+	mtspr	SPRN_SPRG3, r3
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+	PPC_LL	r4, VCPU_SHADOW_MSR(r4)	/* get shadow_msr */
+
+	/* Jump to segment patching handler and into our guest */
+	bl	FUNC(kvmppc_entry_trampoline)
+	nop
+
+/*
+ * This is the handler in module memory. It gets jumped at from the
+ * lowmem trampoline code, so it's basically the guest exit code.
+ *
+ */
+
+	/*
+	 * Register usage at this point:
+	 *
+	 * R1       = host R1
+	 * R2       = host R2
+	 * R12      = exit handler id
+	 * R13      = PACA
+	 * SVCPU.*  = guest *
+	 * MSR.EE   = 1
+	 *
+	 */
+
+	PPC_LL	r3, GPR4(r1)		/* vcpu pointer */
+
+	/*
+	 * kvmppc_copy_from_svcpu can clobber volatile registers, save
+	 * the exit handler id to the vcpu and restore it from there later.
+	 */
+	stw	r12, VCPU_TRAP(r3)
+
+	/* Transfer reg values from shadow vcpu back to vcpu struct */
+	/* On 64-bit, interrupts are still off at this point */
+
+	GET_SHADOW_VCPU(r4)
+	bl	FUNC(kvmppc_copy_from_svcpu)
+	nop
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	/*
+	 * Reload kernel SPRG3 value.
+	 * No need to save guest value as usermode can't modify SPRG3.
+	 */
+	ld	r3, PACA_SPRG_VDSO(r13)
+	mtspr	SPRN_SPRG_VDSO_WRITE, r3
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+	/* R7 = vcpu */
+	PPC_LL	r7, GPR4(r1)
+
+	PPC_STL	r14, VCPU_GPR(R14)(r7)
+	PPC_STL	r15, VCPU_GPR(R15)(r7)
+	PPC_STL	r16, VCPU_GPR(R16)(r7)
+	PPC_STL	r17, VCPU_GPR(R17)(r7)
+	PPC_STL	r18, VCPU_GPR(R18)(r7)
+	PPC_STL	r19, VCPU_GPR(R19)(r7)
+	PPC_STL	r20, VCPU_GPR(R20)(r7)
+	PPC_STL	r21, VCPU_GPR(R21)(r7)
+	PPC_STL	r22, VCPU_GPR(R22)(r7)
+	PPC_STL	r23, VCPU_GPR(R23)(r7)
+	PPC_STL	r24, VCPU_GPR(R24)(r7)
+	PPC_STL	r25, VCPU_GPR(R25)(r7)
+	PPC_STL	r26, VCPU_GPR(R26)(r7)
+	PPC_STL	r27, VCPU_GPR(R27)(r7)
+	PPC_STL	r28, VCPU_GPR(R28)(r7)
+	PPC_STL	r29, VCPU_GPR(R29)(r7)
+	PPC_STL	r30, VCPU_GPR(R30)(r7)
+	PPC_STL	r31, VCPU_GPR(R31)(r7)
+
+	/* Pass the exit number as 3rd argument to kvmppc_handle_exit */
+	lwz	r5, VCPU_TRAP(r7)
+
+	/* Restore r3 (kvm_run) and r4 (vcpu) */
+	REST_2GPRS(3, r1)
+	bl	FUNC(kvmppc_handle_exit_pr)
+
+	/* If RESUME_GUEST, get back in the loop */
+	cmpwi	r3, RESUME_GUEST
+	beq	kvm_loop_lightweight
+
+	cmpwi	r3, RESUME_GUEST_NV
+	beq	kvm_loop_heavyweight
+
+kvm_exit_loop:
+
+	PPC_LL	r4, _LINK(r1)
+	mtlr	r4
+
+	lwz	r14, _CCR(r1)
+	mtcr	r14
+
+	/* Restore non-volatile host registers (r14 - r31) */
+	REST_NVGPRS(r1)
+
+	addi    r1, r1, SWITCH_FRAME_SIZE
+	blr
+
+kvm_loop_heavyweight:
+
+	PPC_LL	r4, _LINK(r1)
+	PPC_STL r4, (PPC_LR_STKOFF + SWITCH_FRAME_SIZE)(r1)
+
+	/* Load vcpu and cpu_run */
+	REST_2GPRS(3, r1)
+
+	/* Load non-volatile guest state from the vcpu */
+	VCPU_LOAD_NVGPRS(r4)
+
+	/* Jump back into the beginning of this function */
+	b	kvm_start_lightweight
+
+kvm_loop_lightweight:
+
+	/* We'll need the vcpu pointer */
+	REST_GPR(4, r1)
+
+	/* Jump back into the beginning of this function */
+	b	kvm_start_lightweight
diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
new file mode 100644
index 000000000..5a1ab1250
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -0,0 +1,397 @@
+/*
+ * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *     Alexander Graf <agraf@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/hash.h>
+#include <linux/slab.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/machdep.h>
+#include <asm/mmu_context.h>
+#include <asm/hw_irq.h>
+
+#include "trace_pr.h"
+
+#define PTE_SIZE	12
+
+static struct kmem_cache *hpte_cache;
+
+static inline u64 kvmppc_mmu_hash_pte(u64 eaddr)
+{
+	return hash_64(eaddr >> PTE_SIZE, HPTEG_HASH_BITS_PTE);
+}
+
+static inline u64 kvmppc_mmu_hash_pte_long(u64 eaddr)
+{
+	return hash_64((eaddr & 0x0ffff000) >> PTE_SIZE,
+		       HPTEG_HASH_BITS_PTE_LONG);
+}
+
+static inline u64 kvmppc_mmu_hash_vpte(u64 vpage)
+{
+	return hash_64(vpage & 0xfffffffffULL, HPTEG_HASH_BITS_VPTE);
+}
+
+static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage)
+{
+	return hash_64((vpage & 0xffffff000ULL) >> 12,
+		       HPTEG_HASH_BITS_VPTE_LONG);
+}
+
+#ifdef CONFIG_PPC_BOOK3S_64
+static inline u64 kvmppc_mmu_hash_vpte_64k(u64 vpage)
+{
+	return hash_64((vpage & 0xffffffff0ULL) >> 4,
+		       HPTEG_HASH_BITS_VPTE_64K);
+}
+#endif
+
+void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
+{
+	u64 index;
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+
+	trace_kvm_book3s_mmu_map(pte);
+
+	spin_lock(&vcpu3s->mmu_lock);
+
+	/* Add to ePTE list */
+	index = kvmppc_mmu_hash_pte(pte->pte.eaddr);
+	hlist_add_head_rcu(&pte->list_pte, &vcpu3s->hpte_hash_pte[index]);
+
+	/* Add to ePTE_long list */
+	index = kvmppc_mmu_hash_pte_long(pte->pte.eaddr);
+	hlist_add_head_rcu(&pte->list_pte_long,
+			   &vcpu3s->hpte_hash_pte_long[index]);
+
+	/* Add to vPTE list */
+	index = kvmppc_mmu_hash_vpte(pte->pte.vpage);
+	hlist_add_head_rcu(&pte->list_vpte, &vcpu3s->hpte_hash_vpte[index]);
+
+	/* Add to vPTE_long list */
+	index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage);
+	hlist_add_head_rcu(&pte->list_vpte_long,
+			   &vcpu3s->hpte_hash_vpte_long[index]);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* Add to vPTE_64k list */
+	index = kvmppc_mmu_hash_vpte_64k(pte->pte.vpage);
+	hlist_add_head_rcu(&pte->list_vpte_64k,
+			   &vcpu3s->hpte_hash_vpte_64k[index]);
+#endif
+
+	vcpu3s->hpte_cache_count++;
+
+	spin_unlock(&vcpu3s->mmu_lock);
+}
+
+static void free_pte_rcu(struct rcu_head *head)
+{
+	struct hpte_cache *pte = container_of(head, struct hpte_cache, rcu_head);
+	kmem_cache_free(hpte_cache, pte);
+}
+
+static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+
+	trace_kvm_book3s_mmu_invalidate(pte);
+
+	/* Different for 32 and 64 bit */
+	kvmppc_mmu_invalidate_pte(vcpu, pte);
+
+	spin_lock(&vcpu3s->mmu_lock);
+
+	/* pte already invalidated in between? */
+	if (hlist_unhashed(&pte->list_pte)) {
+		spin_unlock(&vcpu3s->mmu_lock);
+		return;
+	}
+
+	hlist_del_init_rcu(&pte->list_pte);
+	hlist_del_init_rcu(&pte->list_pte_long);
+	hlist_del_init_rcu(&pte->list_vpte);
+	hlist_del_init_rcu(&pte->list_vpte_long);
+#ifdef CONFIG_PPC_BOOK3S_64
+	hlist_del_init_rcu(&pte->list_vpte_64k);
+#endif
+	vcpu3s->hpte_cache_count--;
+
+	spin_unlock(&vcpu3s->mmu_lock);
+
+	call_rcu(&pte->rcu_head, free_pte_rcu);
+}
+
+static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	struct hpte_cache *pte;
+	int i;
+
+	rcu_read_lock();
+
+	for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
+		struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i];
+
+		hlist_for_each_entry_rcu(pte, list, list_vpte_long)
+			invalidate_pte(vcpu, pte);
+	}
+
+	rcu_read_unlock();
+}
+
+static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	struct hlist_head *list;
+	struct hpte_cache *pte;
+
+	/* Find the list of entries in the map */
+	list = &vcpu3s->hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)];
+
+	rcu_read_lock();
+
+	/* Check the list for matching entries and invalidate */
+	hlist_for_each_entry_rcu(pte, list, list_pte)
+		if ((pte->pte.eaddr & ~0xfffUL) == guest_ea)
+			invalidate_pte(vcpu, pte);
+
+	rcu_read_unlock();
+}
+
+static void kvmppc_mmu_pte_flush_long(struct kvm_vcpu *vcpu, ulong guest_ea)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	struct hlist_head *list;
+	struct hpte_cache *pte;
+
+	/* Find the list of entries in the map */
+	list = &vcpu3s->hpte_hash_pte_long[
+			kvmppc_mmu_hash_pte_long(guest_ea)];
+
+	rcu_read_lock();
+
+	/* Check the list for matching entries and invalidate */
+	hlist_for_each_entry_rcu(pte, list, list_pte_long)
+		if ((pte->pte.eaddr & 0x0ffff000UL) == guest_ea)
+			invalidate_pte(vcpu, pte);
+
+	rcu_read_unlock();
+}
+
+void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
+{
+	trace_kvm_book3s_mmu_flush("", vcpu, guest_ea, ea_mask);
+	guest_ea &= ea_mask;
+
+	switch (ea_mask) {
+	case ~0xfffUL:
+		kvmppc_mmu_pte_flush_page(vcpu, guest_ea);
+		break;
+	case 0x0ffff000:
+		kvmppc_mmu_pte_flush_long(vcpu, guest_ea);
+		break;
+	case 0:
+		/* Doing a complete flush -> start from scratch */
+		kvmppc_mmu_pte_flush_all(vcpu);
+		break;
+	default:
+		WARN_ON(1);
+		break;
+	}
+}
+
+/* Flush with mask 0xfffffffff */
+static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	struct hlist_head *list;
+	struct hpte_cache *pte;
+	u64 vp_mask = 0xfffffffffULL;
+
+	list = &vcpu3s->hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)];
+
+	rcu_read_lock();
+
+	/* Check the list for matching entries and invalidate */
+	hlist_for_each_entry_rcu(pte, list, list_vpte)
+		if ((pte->pte.vpage & vp_mask) == guest_vp)
+			invalidate_pte(vcpu, pte);
+
+	rcu_read_unlock();
+}
+
+#ifdef CONFIG_PPC_BOOK3S_64
+/* Flush with mask 0xffffffff0 */
+static void kvmppc_mmu_pte_vflush_64k(struct kvm_vcpu *vcpu, u64 guest_vp)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	struct hlist_head *list;
+	struct hpte_cache *pte;
+	u64 vp_mask = 0xffffffff0ULL;
+
+	list = &vcpu3s->hpte_hash_vpte_64k[
+		kvmppc_mmu_hash_vpte_64k(guest_vp)];
+
+	rcu_read_lock();
+
+	/* Check the list for matching entries and invalidate */
+	hlist_for_each_entry_rcu(pte, list, list_vpte_64k)
+		if ((pte->pte.vpage & vp_mask) == guest_vp)
+			invalidate_pte(vcpu, pte);
+
+	rcu_read_unlock();
+}
+#endif
+
+/* Flush with mask 0xffffff000 */
+static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	struct hlist_head *list;
+	struct hpte_cache *pte;
+	u64 vp_mask = 0xffffff000ULL;
+
+	list = &vcpu3s->hpte_hash_vpte_long[
+		kvmppc_mmu_hash_vpte_long(guest_vp)];
+
+	rcu_read_lock();
+
+	/* Check the list for matching entries and invalidate */
+	hlist_for_each_entry_rcu(pte, list, list_vpte_long)
+		if ((pte->pte.vpage & vp_mask) == guest_vp)
+			invalidate_pte(vcpu, pte);
+
+	rcu_read_unlock();
+}
+
+void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
+{
+	trace_kvm_book3s_mmu_flush("v", vcpu, guest_vp, vp_mask);
+	guest_vp &= vp_mask;
+
+	switch(vp_mask) {
+	case 0xfffffffffULL:
+		kvmppc_mmu_pte_vflush_short(vcpu, guest_vp);
+		break;
+#ifdef CONFIG_PPC_BOOK3S_64
+	case 0xffffffff0ULL:
+		kvmppc_mmu_pte_vflush_64k(vcpu, guest_vp);
+		break;
+#endif
+	case 0xffffff000ULL:
+		kvmppc_mmu_pte_vflush_long(vcpu, guest_vp);
+		break;
+	default:
+		WARN_ON(1);
+		return;
+	}
+}
+
+void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	struct hpte_cache *pte;
+	int i;
+
+	trace_kvm_book3s_mmu_flush("p", vcpu, pa_start, pa_end);
+
+	rcu_read_lock();
+
+	for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
+		struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i];
+
+		hlist_for_each_entry_rcu(pte, list, list_vpte_long)
+			if ((pte->pte.raddr >= pa_start) &&
+			    (pte->pte.raddr < pa_end))
+				invalidate_pte(vcpu, pte);
+	}
+
+	rcu_read_unlock();
+}
+
+struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	struct hpte_cache *pte;
+
+	if (vcpu3s->hpte_cache_count == HPTEG_CACHE_NUM)
+		kvmppc_mmu_pte_flush_all(vcpu);
+
+	pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL);
+
+	return pte;
+}
+
+void kvmppc_mmu_hpte_cache_free(struct hpte_cache *pte)
+{
+	kmem_cache_free(hpte_cache, pte);
+}
+
+void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu)
+{
+	kvmppc_mmu_pte_flush(vcpu, 0, 0);
+}
+
+static void kvmppc_mmu_hpte_init_hash(struct hlist_head *hash_list, int len)
+{
+	int i;
+
+	for (i = 0; i < len; i++)
+		INIT_HLIST_HEAD(&hash_list[i]);
+}
+
+int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+
+	/* init hpte lookup hashes */
+	kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_pte,
+				  ARRAY_SIZE(vcpu3s->hpte_hash_pte));
+	kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_pte_long,
+				  ARRAY_SIZE(vcpu3s->hpte_hash_pte_long));
+	kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte,
+				  ARRAY_SIZE(vcpu3s->hpte_hash_vpte));
+	kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_long,
+				  ARRAY_SIZE(vcpu3s->hpte_hash_vpte_long));
+#ifdef CONFIG_PPC_BOOK3S_64
+	kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_64k,
+				  ARRAY_SIZE(vcpu3s->hpte_hash_vpte_64k));
+#endif
+
+	spin_lock_init(&vcpu3s->mmu_lock);
+
+	return 0;
+}
+
+int kvmppc_mmu_hpte_sysinit(void)
+{
+	/* init hpte slab cache */
+	hpte_cache = kmem_cache_create("kvm-spt", sizeof(struct hpte_cache),
+				       sizeof(struct hpte_cache), 0, NULL);
+
+	return 0;
+}
+
+void kvmppc_mmu_hpte_sysexit(void)
+{
+	kmem_cache_destroy(hpte_cache);
+}
diff --git a/arch/powerpc/kvm/book3s_paired_singles.c b/arch/powerpc/kvm/book3s_paired_singles.c
new file mode 100644
index 000000000..bd6ab1672
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_paired_singles.c
@@ -0,0 +1,1271 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright Novell Inc 2010
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/kvm.h>
+#include <asm/kvm_ppc.h>
+#include <asm/disassemble.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_fpu.h>
+#include <asm/reg.h>
+#include <asm/cacheflush.h>
+#include <asm/switch_to.h>
+#include <linux/vmalloc.h>
+
+/* #define DEBUG */
+
+#ifdef DEBUG
+#define dprintk printk
+#else
+#define dprintk(...) do { } while(0);
+#endif
+
+#define OP_LFS			48
+#define OP_LFSU			49
+#define OP_LFD			50
+#define OP_LFDU			51
+#define OP_STFS			52
+#define OP_STFSU		53
+#define OP_STFD			54
+#define OP_STFDU		55
+#define OP_PSQ_L		56
+#define OP_PSQ_LU		57
+#define OP_PSQ_ST		60
+#define OP_PSQ_STU		61
+
+#define OP_31_LFSX		535
+#define OP_31_LFSUX		567
+#define OP_31_LFDX		599
+#define OP_31_LFDUX		631
+#define OP_31_STFSX		663
+#define OP_31_STFSUX		695
+#define OP_31_STFX		727
+#define OP_31_STFUX		759
+#define OP_31_LWIZX		887
+#define OP_31_STFIWX		983
+
+#define OP_59_FADDS		21
+#define OP_59_FSUBS		20
+#define OP_59_FSQRTS		22
+#define OP_59_FDIVS		18
+#define OP_59_FRES		24
+#define OP_59_FMULS		25
+#define OP_59_FRSQRTES		26
+#define OP_59_FMSUBS		28
+#define OP_59_FMADDS		29
+#define OP_59_FNMSUBS		30
+#define OP_59_FNMADDS		31
+
+#define OP_63_FCMPU		0
+#define OP_63_FCPSGN		8
+#define OP_63_FRSP		12
+#define OP_63_FCTIW		14
+#define OP_63_FCTIWZ		15
+#define OP_63_FDIV		18
+#define OP_63_FADD		21
+#define OP_63_FSQRT		22
+#define OP_63_FSEL		23
+#define OP_63_FRE		24
+#define OP_63_FMUL		25
+#define OP_63_FRSQRTE		26
+#define OP_63_FMSUB		28
+#define OP_63_FMADD		29
+#define OP_63_FNMSUB		30
+#define OP_63_FNMADD		31
+#define OP_63_FCMPO		32
+#define OP_63_MTFSB1		38 // XXX
+#define OP_63_FSUB		20
+#define OP_63_FNEG		40
+#define OP_63_MCRFS		64
+#define OP_63_MTFSB0		70
+#define OP_63_FMR		72
+#define OP_63_MTFSFI		134
+#define OP_63_FABS		264
+#define OP_63_MFFS		583
+#define OP_63_MTFSF		711
+
+#define OP_4X_PS_CMPU0		0
+#define OP_4X_PSQ_LX		6
+#define OP_4XW_PSQ_STX		7
+#define OP_4A_PS_SUM0		10
+#define OP_4A_PS_SUM1		11
+#define OP_4A_PS_MULS0		12
+#define OP_4A_PS_MULS1		13
+#define OP_4A_PS_MADDS0		14
+#define OP_4A_PS_MADDS1		15
+#define OP_4A_PS_DIV		18
+#define OP_4A_PS_SUB		20
+#define OP_4A_PS_ADD		21
+#define OP_4A_PS_SEL		23
+#define OP_4A_PS_RES		24
+#define OP_4A_PS_MUL		25
+#define OP_4A_PS_RSQRTE		26
+#define OP_4A_PS_MSUB		28
+#define OP_4A_PS_MADD		29
+#define OP_4A_PS_NMSUB		30
+#define OP_4A_PS_NMADD		31
+#define OP_4X_PS_CMPO0		32
+#define OP_4X_PSQ_LUX		38
+#define OP_4XW_PSQ_STUX		39
+#define OP_4X_PS_NEG		40
+#define OP_4X_PS_CMPU1		64
+#define OP_4X_PS_MR		72
+#define OP_4X_PS_CMPO1		96
+#define OP_4X_PS_NABS		136
+#define OP_4X_PS_ABS		264
+#define OP_4X_PS_MERGE00	528
+#define OP_4X_PS_MERGE01	560
+#define OP_4X_PS_MERGE10	592
+#define OP_4X_PS_MERGE11	624
+
+#define SCALAR_NONE		0
+#define SCALAR_HIGH		(1 << 0)
+#define SCALAR_LOW		(1 << 1)
+#define SCALAR_NO_PS0		(1 << 2)
+#define SCALAR_NO_PS1		(1 << 3)
+
+#define GQR_ST_TYPE_MASK	0x00000007
+#define GQR_ST_TYPE_SHIFT	0
+#define GQR_ST_SCALE_MASK	0x00003f00
+#define GQR_ST_SCALE_SHIFT	8
+#define GQR_LD_TYPE_MASK	0x00070000
+#define GQR_LD_TYPE_SHIFT	16
+#define GQR_LD_SCALE_MASK	0x3f000000
+#define GQR_LD_SCALE_SHIFT	24
+
+#define GQR_QUANTIZE_FLOAT	0
+#define GQR_QUANTIZE_U8		4
+#define GQR_QUANTIZE_U16	5
+#define GQR_QUANTIZE_S8		6
+#define GQR_QUANTIZE_S16	7
+
+#define FPU_LS_SINGLE		0
+#define FPU_LS_DOUBLE		1
+#define FPU_LS_SINGLE_LOW	2
+
+static inline void kvmppc_sync_qpr(struct kvm_vcpu *vcpu, int rt)
+{
+	kvm_cvt_df(&VCPU_FPR(vcpu, rt), &vcpu->arch.qpr[rt]);
+}
+
+static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store)
+{
+	u32 dsisr;
+	u64 msr = kvmppc_get_msr(vcpu);
+
+	msr = kvmppc_set_field(msr, 33, 36, 0);
+	msr = kvmppc_set_field(msr, 42, 47, 0);
+	kvmppc_set_msr(vcpu, msr);
+	kvmppc_set_dar(vcpu, eaddr);
+	/* Page Fault */
+	dsisr = kvmppc_set_field(0, 33, 33, 1);
+	if (is_store)
+		dsisr = kvmppc_set_field(dsisr, 38, 38, 1);
+	kvmppc_set_dsisr(vcpu, dsisr);
+	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE);
+}
+
+static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
+				   int rs, ulong addr, int ls_type)
+{
+	int emulated = EMULATE_FAIL;
+	int r;
+	char tmp[8];
+	int len = sizeof(u32);
+
+	if (ls_type == FPU_LS_DOUBLE)
+		len = sizeof(u64);
+
+	/* read from memory */
+	r = kvmppc_ld(vcpu, &addr, len, tmp, true);
+	vcpu->arch.paddr_accessed = addr;
+
+	if (r < 0) {
+		kvmppc_inject_pf(vcpu, addr, false);
+		goto done_load;
+	} else if (r == EMULATE_DO_MMIO) {
+		emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FPR | rs,
+					      len, 1);
+		goto done_load;
+	}
+
+	emulated = EMULATE_DONE;
+
+	/* put in registers */
+	switch (ls_type) {
+	case FPU_LS_SINGLE:
+		kvm_cvt_fd((u32*)tmp, &VCPU_FPR(vcpu, rs));
+		vcpu->arch.qpr[rs] = *((u32*)tmp);
+		break;
+	case FPU_LS_DOUBLE:
+		VCPU_FPR(vcpu, rs) = *((u64*)tmp);
+		break;
+	}
+
+	dprintk(KERN_INFO "KVM: FPR_LD [0x%llx] at 0x%lx (%d)\n", *(u64*)tmp,
+			  addr, len);
+
+done_load:
+	return emulated;
+}
+
+static int kvmppc_emulate_fpr_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
+				    int rs, ulong addr, int ls_type)
+{
+	int emulated = EMULATE_FAIL;
+	int r;
+	char tmp[8];
+	u64 val;
+	int len;
+
+	switch (ls_type) {
+	case FPU_LS_SINGLE:
+		kvm_cvt_df(&VCPU_FPR(vcpu, rs), (u32*)tmp);
+		val = *((u32*)tmp);
+		len = sizeof(u32);
+		break;
+	case FPU_LS_SINGLE_LOW:
+		*((u32*)tmp) = VCPU_FPR(vcpu, rs);
+		val = VCPU_FPR(vcpu, rs) & 0xffffffff;
+		len = sizeof(u32);
+		break;
+	case FPU_LS_DOUBLE:
+		*((u64*)tmp) = VCPU_FPR(vcpu, rs);
+		val = VCPU_FPR(vcpu, rs);
+		len = sizeof(u64);
+		break;
+	default:
+		val = 0;
+		len = 0;
+	}
+
+	r = kvmppc_st(vcpu, &addr, len, tmp, true);
+	vcpu->arch.paddr_accessed = addr;
+	if (r < 0) {
+		kvmppc_inject_pf(vcpu, addr, true);
+	} else if (r == EMULATE_DO_MMIO) {
+		emulated = kvmppc_handle_store(run, vcpu, val, len, 1);
+	} else {
+		emulated = EMULATE_DONE;
+	}
+
+	dprintk(KERN_INFO "KVM: FPR_ST [0x%llx] at 0x%lx (%d)\n",
+			  val, addr, len);
+
+	return emulated;
+}
+
+static int kvmppc_emulate_psq_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
+				   int rs, ulong addr, bool w, int i)
+{
+	int emulated = EMULATE_FAIL;
+	int r;
+	float one = 1.0;
+	u32 tmp[2];
+
+	/* read from memory */
+	if (w) {
+		r = kvmppc_ld(vcpu, &addr, sizeof(u32), tmp, true);
+		memcpy(&tmp[1], &one, sizeof(u32));
+	} else {
+		r = kvmppc_ld(vcpu, &addr, sizeof(u32) * 2, tmp, true);
+	}
+	vcpu->arch.paddr_accessed = addr;
+	if (r < 0) {
+		kvmppc_inject_pf(vcpu, addr, false);
+		goto done_load;
+	} else if ((r == EMULATE_DO_MMIO) && w) {
+		emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FPR | rs,
+					      4, 1);
+		vcpu->arch.qpr[rs] = tmp[1];
+		goto done_load;
+	} else if (r == EMULATE_DO_MMIO) {
+		emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FQPR | rs,
+					      8, 1);
+		goto done_load;
+	}
+
+	emulated = EMULATE_DONE;
+
+	/* put in registers */
+	kvm_cvt_fd(&tmp[0], &VCPU_FPR(vcpu, rs));
+	vcpu->arch.qpr[rs] = tmp[1];
+
+	dprintk(KERN_INFO "KVM: PSQ_LD [0x%x, 0x%x] at 0x%lx (%d)\n", tmp[0],
+			  tmp[1], addr, w ? 4 : 8);
+
+done_load:
+	return emulated;
+}
+
+static int kvmppc_emulate_psq_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
+				    int rs, ulong addr, bool w, int i)
+{
+	int emulated = EMULATE_FAIL;
+	int r;
+	u32 tmp[2];
+	int len = w ? sizeof(u32) : sizeof(u64);
+
+	kvm_cvt_df(&VCPU_FPR(vcpu, rs), &tmp[0]);
+	tmp[1] = vcpu->arch.qpr[rs];
+
+	r = kvmppc_st(vcpu, &addr, len, tmp, true);
+	vcpu->arch.paddr_accessed = addr;
+	if (r < 0) {
+		kvmppc_inject_pf(vcpu, addr, true);
+	} else if ((r == EMULATE_DO_MMIO) && w) {
+		emulated = kvmppc_handle_store(run, vcpu, tmp[0], 4, 1);
+	} else if (r == EMULATE_DO_MMIO) {
+		u64 val = ((u64)tmp[0] << 32) | tmp[1];
+		emulated = kvmppc_handle_store(run, vcpu, val, 8, 1);
+	} else {
+		emulated = EMULATE_DONE;
+	}
+
+	dprintk(KERN_INFO "KVM: PSQ_ST [0x%x, 0x%x] at 0x%lx (%d)\n",
+			  tmp[0], tmp[1], addr, len);
+
+	return emulated;
+}
+
+/*
+ * Cuts out inst bits with ordering according to spec.
+ * That means the leftmost bit is zero. All given bits are included.
+ */
+static inline u32 inst_get_field(u32 inst, int msb, int lsb)
+{
+	return kvmppc_get_field(inst, msb + 32, lsb + 32);
+}
+
+bool kvmppc_inst_is_paired_single(struct kvm_vcpu *vcpu, u32 inst)
+{
+	if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE))
+		return false;
+
+	switch (get_op(inst)) {
+	case OP_PSQ_L:
+	case OP_PSQ_LU:
+	case OP_PSQ_ST:
+	case OP_PSQ_STU:
+	case OP_LFS:
+	case OP_LFSU:
+	case OP_LFD:
+	case OP_LFDU:
+	case OP_STFS:
+	case OP_STFSU:
+	case OP_STFD:
+	case OP_STFDU:
+		return true;
+	case 4:
+		/* X form */
+		switch (inst_get_field(inst, 21, 30)) {
+		case OP_4X_PS_CMPU0:
+		case OP_4X_PSQ_LX:
+		case OP_4X_PS_CMPO0:
+		case OP_4X_PSQ_LUX:
+		case OP_4X_PS_NEG:
+		case OP_4X_PS_CMPU1:
+		case OP_4X_PS_MR:
+		case OP_4X_PS_CMPO1:
+		case OP_4X_PS_NABS:
+		case OP_4X_PS_ABS:
+		case OP_4X_PS_MERGE00:
+		case OP_4X_PS_MERGE01:
+		case OP_4X_PS_MERGE10:
+		case OP_4X_PS_MERGE11:
+			return true;
+		}
+		/* XW form */
+		switch (inst_get_field(inst, 25, 30)) {
+		case OP_4XW_PSQ_STX:
+		case OP_4XW_PSQ_STUX:
+			return true;
+		}
+		/* A form */
+		switch (inst_get_field(inst, 26, 30)) {
+		case OP_4A_PS_SUM1:
+		case OP_4A_PS_SUM0:
+		case OP_4A_PS_MULS0:
+		case OP_4A_PS_MULS1:
+		case OP_4A_PS_MADDS0:
+		case OP_4A_PS_MADDS1:
+		case OP_4A_PS_DIV:
+		case OP_4A_PS_SUB:
+		case OP_4A_PS_ADD:
+		case OP_4A_PS_SEL:
+		case OP_4A_PS_RES:
+		case OP_4A_PS_MUL:
+		case OP_4A_PS_RSQRTE:
+		case OP_4A_PS_MSUB:
+		case OP_4A_PS_MADD:
+		case OP_4A_PS_NMSUB:
+		case OP_4A_PS_NMADD:
+			return true;
+		}
+		break;
+	case 59:
+		switch (inst_get_field(inst, 21, 30)) {
+		case OP_59_FADDS:
+		case OP_59_FSUBS:
+		case OP_59_FDIVS:
+		case OP_59_FRES:
+		case OP_59_FRSQRTES:
+			return true;
+		}
+		switch (inst_get_field(inst, 26, 30)) {
+		case OP_59_FMULS:
+		case OP_59_FMSUBS:
+		case OP_59_FMADDS:
+		case OP_59_FNMSUBS:
+		case OP_59_FNMADDS:
+			return true;
+		}
+		break;
+	case 63:
+		switch (inst_get_field(inst, 21, 30)) {
+		case OP_63_MTFSB0:
+		case OP_63_MTFSB1:
+		case OP_63_MTFSF:
+		case OP_63_MTFSFI:
+		case OP_63_MCRFS:
+		case OP_63_MFFS:
+		case OP_63_FCMPU:
+		case OP_63_FCMPO:
+		case OP_63_FNEG:
+		case OP_63_FMR:
+		case OP_63_FABS:
+		case OP_63_FRSP:
+		case OP_63_FDIV:
+		case OP_63_FADD:
+		case OP_63_FSUB:
+		case OP_63_FCTIW:
+		case OP_63_FCTIWZ:
+		case OP_63_FRSQRTE:
+		case OP_63_FCPSGN:
+			return true;
+		}
+		switch (inst_get_field(inst, 26, 30)) {
+		case OP_63_FMUL:
+		case OP_63_FSEL:
+		case OP_63_FMSUB:
+		case OP_63_FMADD:
+		case OP_63_FNMSUB:
+		case OP_63_FNMADD:
+			return true;
+		}
+		break;
+	case 31:
+		switch (inst_get_field(inst, 21, 30)) {
+		case OP_31_LFSX:
+		case OP_31_LFSUX:
+		case OP_31_LFDX:
+		case OP_31_LFDUX:
+		case OP_31_STFSX:
+		case OP_31_STFSUX:
+		case OP_31_STFX:
+		case OP_31_STFUX:
+		case OP_31_STFIWX:
+			return true;
+		}
+		break;
+	}
+
+	return false;
+}
+
+static int get_d_signext(u32 inst)
+{
+	int d = inst & 0x8ff;
+
+	if (d & 0x800)
+		return -(d & 0x7ff);
+
+	return (d & 0x7ff);
+}
+
+static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
+				      int reg_out, int reg_in1, int reg_in2,
+				      int reg_in3, int scalar,
+				      void (*func)(u64 *fpscr,
+						 u32 *dst, u32 *src1,
+						 u32 *src2, u32 *src3))
+{
+	u32 *qpr = vcpu->arch.qpr;
+	u32 ps0_out;
+	u32 ps0_in1, ps0_in2, ps0_in3;
+	u32 ps1_in1, ps1_in2, ps1_in3;
+
+	/* RC */
+	WARN_ON(rc);
+
+	/* PS0 */
+	kvm_cvt_df(&VCPU_FPR(vcpu, reg_in1), &ps0_in1);
+	kvm_cvt_df(&VCPU_FPR(vcpu, reg_in2), &ps0_in2);
+	kvm_cvt_df(&VCPU_FPR(vcpu, reg_in3), &ps0_in3);
+
+	if (scalar & SCALAR_LOW)
+		ps0_in2 = qpr[reg_in2];
+
+	func(&vcpu->arch.fp.fpscr, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3);
+
+	dprintk(KERN_INFO "PS3 ps0 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n",
+			  ps0_in1, ps0_in2, ps0_in3, ps0_out);
+
+	if (!(scalar & SCALAR_NO_PS0))
+		kvm_cvt_fd(&ps0_out, &VCPU_FPR(vcpu, reg_out));
+
+	/* PS1 */
+	ps1_in1 = qpr[reg_in1];
+	ps1_in2 = qpr[reg_in2];
+	ps1_in3 = qpr[reg_in3];
+
+	if (scalar & SCALAR_HIGH)
+		ps1_in2 = ps0_in2;
+
+	if (!(scalar & SCALAR_NO_PS1))
+		func(&vcpu->arch.fp.fpscr, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3);
+
+	dprintk(KERN_INFO "PS3 ps1 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n",
+			  ps1_in1, ps1_in2, ps1_in3, qpr[reg_out]);
+
+	return EMULATE_DONE;
+}
+
+static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
+				    int reg_out, int reg_in1, int reg_in2,
+				    int scalar,
+				    void (*func)(u64 *fpscr,
+						 u32 *dst, u32 *src1,
+						 u32 *src2))
+{
+	u32 *qpr = vcpu->arch.qpr;
+	u32 ps0_out;
+	u32 ps0_in1, ps0_in2;
+	u32 ps1_out;
+	u32 ps1_in1, ps1_in2;
+
+	/* RC */
+	WARN_ON(rc);
+
+	/* PS0 */
+	kvm_cvt_df(&VCPU_FPR(vcpu, reg_in1), &ps0_in1);
+
+	if (scalar & SCALAR_LOW)
+		ps0_in2 = qpr[reg_in2];
+	else
+		kvm_cvt_df(&VCPU_FPR(vcpu, reg_in2), &ps0_in2);
+
+	func(&vcpu->arch.fp.fpscr, &ps0_out, &ps0_in1, &ps0_in2);
+
+	if (!(scalar & SCALAR_NO_PS0)) {
+		dprintk(KERN_INFO "PS2 ps0 -> f(0x%x, 0x%x) = 0x%x\n",
+				  ps0_in1, ps0_in2, ps0_out);
+
+		kvm_cvt_fd(&ps0_out, &VCPU_FPR(vcpu, reg_out));
+	}
+
+	/* PS1 */
+	ps1_in1 = qpr[reg_in1];
+	ps1_in2 = qpr[reg_in2];
+
+	if (scalar & SCALAR_HIGH)
+		ps1_in2 = ps0_in2;
+
+	func(&vcpu->arch.fp.fpscr, &ps1_out, &ps1_in1, &ps1_in2);
+
+	if (!(scalar & SCALAR_NO_PS1)) {
+		qpr[reg_out] = ps1_out;
+
+		dprintk(KERN_INFO "PS2 ps1 -> f(0x%x, 0x%x) = 0x%x\n",
+				  ps1_in1, ps1_in2, qpr[reg_out]);
+	}
+
+	return EMULATE_DONE;
+}
+
+static int kvmppc_ps_one_in(struct kvm_vcpu *vcpu, bool rc,
+				    int reg_out, int reg_in,
+				    void (*func)(u64 *t,
+						 u32 *dst, u32 *src1))
+{
+	u32 *qpr = vcpu->arch.qpr;
+	u32 ps0_out, ps0_in;
+	u32 ps1_in;
+
+	/* RC */
+	WARN_ON(rc);
+
+	/* PS0 */
+	kvm_cvt_df(&VCPU_FPR(vcpu, reg_in), &ps0_in);
+	func(&vcpu->arch.fp.fpscr, &ps0_out, &ps0_in);
+
+	dprintk(KERN_INFO "PS1 ps0 -> f(0x%x) = 0x%x\n",
+			  ps0_in, ps0_out);
+
+	kvm_cvt_fd(&ps0_out, &VCPU_FPR(vcpu, reg_out));
+
+	/* PS1 */
+	ps1_in = qpr[reg_in];
+	func(&vcpu->arch.fp.fpscr, &qpr[reg_out], &ps1_in);
+
+	dprintk(KERN_INFO "PS1 ps1 -> f(0x%x) = 0x%x\n",
+			  ps1_in, qpr[reg_out]);
+
+	return EMULATE_DONE;
+}
+
+int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+	u32 inst;
+	enum emulation_result emulated = EMULATE_DONE;
+	int ax_rd, ax_ra, ax_rb, ax_rc;
+	short full_d;
+	u64 *fpr_d, *fpr_a, *fpr_b, *fpr_c;
+
+	bool rcomp;
+	u32 cr;
+#ifdef DEBUG
+	int i;
+#endif
+
+	emulated = kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst);
+	if (emulated != EMULATE_DONE)
+		return emulated;
+
+	ax_rd = inst_get_field(inst, 6, 10);
+	ax_ra = inst_get_field(inst, 11, 15);
+	ax_rb = inst_get_field(inst, 16, 20);
+	ax_rc = inst_get_field(inst, 21, 25);
+	full_d = inst_get_field(inst, 16, 31);
+
+	fpr_d = &VCPU_FPR(vcpu, ax_rd);
+	fpr_a = &VCPU_FPR(vcpu, ax_ra);
+	fpr_b = &VCPU_FPR(vcpu, ax_rb);
+	fpr_c = &VCPU_FPR(vcpu, ax_rc);
+
+	rcomp = (inst & 1) ? true : false;
+	cr = kvmppc_get_cr(vcpu);
+
+	if (!kvmppc_inst_is_paired_single(vcpu, inst))
+		return EMULATE_FAIL;
+
+	if (!(kvmppc_get_msr(vcpu) & MSR_FP)) {
+		kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL);
+		return EMULATE_AGAIN;
+	}
+
+	kvmppc_giveup_ext(vcpu, MSR_FP);
+	preempt_disable();
+	enable_kernel_fp();
+	/* Do we need to clear FE0 / FE1 here? Don't think so. */
+
+#ifdef DEBUG
+	for (i = 0; i < ARRAY_SIZE(vcpu->arch.fp.fpr); i++) {
+		u32 f;
+		kvm_cvt_df(&VCPU_FPR(vcpu, i), &f);
+		dprintk(KERN_INFO "FPR[%d] = 0x%x / 0x%llx    QPR[%d] = 0x%x\n",
+			i, f, VCPU_FPR(vcpu, i), i, vcpu->arch.qpr[i]);
+	}
+#endif
+
+	switch (get_op(inst)) {
+	case OP_PSQ_L:
+	{
+		ulong addr = ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0;
+		bool w = inst_get_field(inst, 16, 16) ? true : false;
+		int i = inst_get_field(inst, 17, 19);
+
+		addr += get_d_signext(inst);
+		emulated = kvmppc_emulate_psq_load(run, vcpu, ax_rd, addr, w, i);
+		break;
+	}
+	case OP_PSQ_LU:
+	{
+		ulong addr = kvmppc_get_gpr(vcpu, ax_ra);
+		bool w = inst_get_field(inst, 16, 16) ? true : false;
+		int i = inst_get_field(inst, 17, 19);
+
+		addr += get_d_signext(inst);
+		emulated = kvmppc_emulate_psq_load(run, vcpu, ax_rd, addr, w, i);
+
+		if (emulated == EMULATE_DONE)
+			kvmppc_set_gpr(vcpu, ax_ra, addr);
+		break;
+	}
+	case OP_PSQ_ST:
+	{
+		ulong addr = ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0;
+		bool w = inst_get_field(inst, 16, 16) ? true : false;
+		int i = inst_get_field(inst, 17, 19);
+
+		addr += get_d_signext(inst);
+		emulated = kvmppc_emulate_psq_store(run, vcpu, ax_rd, addr, w, i);
+		break;
+	}
+	case OP_PSQ_STU:
+	{
+		ulong addr = kvmppc_get_gpr(vcpu, ax_ra);
+		bool w = inst_get_field(inst, 16, 16) ? true : false;
+		int i = inst_get_field(inst, 17, 19);
+
+		addr += get_d_signext(inst);
+		emulated = kvmppc_emulate_psq_store(run, vcpu, ax_rd, addr, w, i);
+
+		if (emulated == EMULATE_DONE)
+			kvmppc_set_gpr(vcpu, ax_ra, addr);
+		break;
+	}
+	case 4:
+		/* X form */
+		switch (inst_get_field(inst, 21, 30)) {
+		case OP_4X_PS_CMPU0:
+			/* XXX */
+			emulated = EMULATE_FAIL;
+			break;
+		case OP_4X_PSQ_LX:
+		{
+			ulong addr = ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0;
+			bool w = inst_get_field(inst, 21, 21) ? true : false;
+			int i = inst_get_field(inst, 22, 24);
+
+			addr += kvmppc_get_gpr(vcpu, ax_rb);
+			emulated = kvmppc_emulate_psq_load(run, vcpu, ax_rd, addr, w, i);
+			break;
+		}
+		case OP_4X_PS_CMPO0:
+			/* XXX */
+			emulated = EMULATE_FAIL;
+			break;
+		case OP_4X_PSQ_LUX:
+		{
+			ulong addr = kvmppc_get_gpr(vcpu, ax_ra);
+			bool w = inst_get_field(inst, 21, 21) ? true : false;
+			int i = inst_get_field(inst, 22, 24);
+
+			addr += kvmppc_get_gpr(vcpu, ax_rb);
+			emulated = kvmppc_emulate_psq_load(run, vcpu, ax_rd, addr, w, i);
+
+			if (emulated == EMULATE_DONE)
+				kvmppc_set_gpr(vcpu, ax_ra, addr);
+			break;
+		}
+		case OP_4X_PS_NEG:
+			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb);
+			VCPU_FPR(vcpu, ax_rd) ^= 0x8000000000000000ULL;
+			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
+			vcpu->arch.qpr[ax_rd] ^= 0x80000000;
+			break;
+		case OP_4X_PS_CMPU1:
+			/* XXX */
+			emulated = EMULATE_FAIL;
+			break;
+		case OP_4X_PS_MR:
+			WARN_ON(rcomp);
+			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb);
+			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
+			break;
+		case OP_4X_PS_CMPO1:
+			/* XXX */
+			emulated = EMULATE_FAIL;
+			break;
+		case OP_4X_PS_NABS:
+			WARN_ON(rcomp);
+			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb);
+			VCPU_FPR(vcpu, ax_rd) |= 0x8000000000000000ULL;
+			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
+			vcpu->arch.qpr[ax_rd] |= 0x80000000;
+			break;
+		case OP_4X_PS_ABS:
+			WARN_ON(rcomp);
+			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb);
+			VCPU_FPR(vcpu, ax_rd) &= ~0x8000000000000000ULL;
+			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
+			vcpu->arch.qpr[ax_rd] &= ~0x80000000;
+			break;
+		case OP_4X_PS_MERGE00:
+			WARN_ON(rcomp);
+			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_ra);
+			/* vcpu->arch.qpr[ax_rd] = VCPU_FPR(vcpu, ax_rb); */
+			kvm_cvt_df(&VCPU_FPR(vcpu, ax_rb),
+				   &vcpu->arch.qpr[ax_rd]);
+			break;
+		case OP_4X_PS_MERGE01:
+			WARN_ON(rcomp);
+			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_ra);
+			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
+			break;
+		case OP_4X_PS_MERGE10:
+			WARN_ON(rcomp);
+			/* VCPU_FPR(vcpu, ax_rd) = vcpu->arch.qpr[ax_ra]; */
+			kvm_cvt_fd(&vcpu->arch.qpr[ax_ra],
+				   &VCPU_FPR(vcpu, ax_rd));
+			/* vcpu->arch.qpr[ax_rd] = VCPU_FPR(vcpu, ax_rb); */
+			kvm_cvt_df(&VCPU_FPR(vcpu, ax_rb),
+				   &vcpu->arch.qpr[ax_rd]);
+			break;
+		case OP_4X_PS_MERGE11:
+			WARN_ON(rcomp);
+			/* VCPU_FPR(vcpu, ax_rd) = vcpu->arch.qpr[ax_ra]; */
+			kvm_cvt_fd(&vcpu->arch.qpr[ax_ra],
+				   &VCPU_FPR(vcpu, ax_rd));
+			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
+			break;
+		}
+		/* XW form */
+		switch (inst_get_field(inst, 25, 30)) {
+		case OP_4XW_PSQ_STX:
+		{
+			ulong addr = ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0;
+			bool w = inst_get_field(inst, 21, 21) ? true : false;
+			int i = inst_get_field(inst, 22, 24);
+
+			addr += kvmppc_get_gpr(vcpu, ax_rb);
+			emulated = kvmppc_emulate_psq_store(run, vcpu, ax_rd, addr, w, i);
+			break;
+		}
+		case OP_4XW_PSQ_STUX:
+		{
+			ulong addr = kvmppc_get_gpr(vcpu, ax_ra);
+			bool w = inst_get_field(inst, 21, 21) ? true : false;
+			int i = inst_get_field(inst, 22, 24);
+
+			addr += kvmppc_get_gpr(vcpu, ax_rb);
+			emulated = kvmppc_emulate_psq_store(run, vcpu, ax_rd, addr, w, i);
+
+			if (emulated == EMULATE_DONE)
+				kvmppc_set_gpr(vcpu, ax_ra, addr);
+			break;
+		}
+		}
+		/* A form */
+		switch (inst_get_field(inst, 26, 30)) {
+		case OP_4A_PS_SUM1:
+			emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+					ax_rb, ax_ra, SCALAR_NO_PS0 | SCALAR_HIGH, fps_fadds);
+			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rc);
+			break;
+		case OP_4A_PS_SUM0:
+			emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rb, SCALAR_NO_PS1 | SCALAR_LOW, fps_fadds);
+			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rc];
+			break;
+		case OP_4A_PS_MULS0:
+			emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rc, SCALAR_HIGH, fps_fmuls);
+			break;
+		case OP_4A_PS_MULS1:
+			emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rc, SCALAR_LOW, fps_fmuls);
+			break;
+		case OP_4A_PS_MADDS0:
+			emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rc, ax_rb, SCALAR_HIGH, fps_fmadds);
+			break;
+		case OP_4A_PS_MADDS1:
+			emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rc, ax_rb, SCALAR_LOW, fps_fmadds);
+			break;
+		case OP_4A_PS_DIV:
+			emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rb, SCALAR_NONE, fps_fdivs);
+			break;
+		case OP_4A_PS_SUB:
+			emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rb, SCALAR_NONE, fps_fsubs);
+			break;
+		case OP_4A_PS_ADD:
+			emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rb, SCALAR_NONE, fps_fadds);
+			break;
+		case OP_4A_PS_SEL:
+			emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rc, ax_rb, SCALAR_NONE, fps_fsel);
+			break;
+		case OP_4A_PS_RES:
+			emulated = kvmppc_ps_one_in(vcpu, rcomp, ax_rd,
+					ax_rb, fps_fres);
+			break;
+		case OP_4A_PS_MUL:
+			emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rc, SCALAR_NONE, fps_fmuls);
+			break;
+		case OP_4A_PS_RSQRTE:
+			emulated = kvmppc_ps_one_in(vcpu, rcomp, ax_rd,
+					ax_rb, fps_frsqrte);
+			break;
+		case OP_4A_PS_MSUB:
+			emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rc, ax_rb, SCALAR_NONE, fps_fmsubs);
+			break;
+		case OP_4A_PS_MADD:
+			emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rc, ax_rb, SCALAR_NONE, fps_fmadds);
+			break;
+		case OP_4A_PS_NMSUB:
+			emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rc, ax_rb, SCALAR_NONE, fps_fnmsubs);
+			break;
+		case OP_4A_PS_NMADD:
+			emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd,
+					ax_ra, ax_rc, ax_rb, SCALAR_NONE, fps_fnmadds);
+			break;
+		}
+		break;
+
+	/* Real FPU operations */
+
+	case OP_LFS:
+	{
+		ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) + full_d;
+
+		emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd, addr,
+						   FPU_LS_SINGLE);
+		break;
+	}
+	case OP_LFSU:
+	{
+		ulong addr = kvmppc_get_gpr(vcpu, ax_ra) + full_d;
+
+		emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd, addr,
+						   FPU_LS_SINGLE);
+
+		if (emulated == EMULATE_DONE)
+			kvmppc_set_gpr(vcpu, ax_ra, addr);
+		break;
+	}
+	case OP_LFD:
+	{
+		ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) + full_d;
+
+		emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd, addr,
+						   FPU_LS_DOUBLE);
+		break;
+	}
+	case OP_LFDU:
+	{
+		ulong addr = kvmppc_get_gpr(vcpu, ax_ra) + full_d;
+
+		emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd, addr,
+						   FPU_LS_DOUBLE);
+
+		if (emulated == EMULATE_DONE)
+			kvmppc_set_gpr(vcpu, ax_ra, addr);
+		break;
+	}
+	case OP_STFS:
+	{
+		ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) + full_d;
+
+		emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd, addr,
+						    FPU_LS_SINGLE);
+		break;
+	}
+	case OP_STFSU:
+	{
+		ulong addr = kvmppc_get_gpr(vcpu, ax_ra) + full_d;
+
+		emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd, addr,
+						    FPU_LS_SINGLE);
+
+		if (emulated == EMULATE_DONE)
+			kvmppc_set_gpr(vcpu, ax_ra, addr);
+		break;
+	}
+	case OP_STFD:
+	{
+		ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) + full_d;
+
+		emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd, addr,
+						    FPU_LS_DOUBLE);
+		break;
+	}
+	case OP_STFDU:
+	{
+		ulong addr = kvmppc_get_gpr(vcpu, ax_ra) + full_d;
+
+		emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd, addr,
+						    FPU_LS_DOUBLE);
+
+		if (emulated == EMULATE_DONE)
+			kvmppc_set_gpr(vcpu, ax_ra, addr);
+		break;
+	}
+	case 31:
+		switch (inst_get_field(inst, 21, 30)) {
+		case OP_31_LFSX:
+		{
+			ulong addr = ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0;
+
+			addr += kvmppc_get_gpr(vcpu, ax_rb);
+			emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd,
+							   addr, FPU_LS_SINGLE);
+			break;
+		}
+		case OP_31_LFSUX:
+		{
+			ulong addr = kvmppc_get_gpr(vcpu, ax_ra) +
+				     kvmppc_get_gpr(vcpu, ax_rb);
+
+			emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd,
+							   addr, FPU_LS_SINGLE);
+
+			if (emulated == EMULATE_DONE)
+				kvmppc_set_gpr(vcpu, ax_ra, addr);
+			break;
+		}
+		case OP_31_LFDX:
+		{
+			ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) +
+				     kvmppc_get_gpr(vcpu, ax_rb);
+
+			emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd,
+							   addr, FPU_LS_DOUBLE);
+			break;
+		}
+		case OP_31_LFDUX:
+		{
+			ulong addr = kvmppc_get_gpr(vcpu, ax_ra) +
+				     kvmppc_get_gpr(vcpu, ax_rb);
+
+			emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd,
+							   addr, FPU_LS_DOUBLE);
+
+			if (emulated == EMULATE_DONE)
+				kvmppc_set_gpr(vcpu, ax_ra, addr);
+			break;
+		}
+		case OP_31_STFSX:
+		{
+			ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) +
+				     kvmppc_get_gpr(vcpu, ax_rb);
+
+			emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd,
+							    addr, FPU_LS_SINGLE);
+			break;
+		}
+		case OP_31_STFSUX:
+		{
+			ulong addr = kvmppc_get_gpr(vcpu, ax_ra) +
+				     kvmppc_get_gpr(vcpu, ax_rb);
+
+			emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd,
+							    addr, FPU_LS_SINGLE);
+
+			if (emulated == EMULATE_DONE)
+				kvmppc_set_gpr(vcpu, ax_ra, addr);
+			break;
+		}
+		case OP_31_STFX:
+		{
+			ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) +
+				     kvmppc_get_gpr(vcpu, ax_rb);
+
+			emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd,
+							    addr, FPU_LS_DOUBLE);
+			break;
+		}
+		case OP_31_STFUX:
+		{
+			ulong addr = kvmppc_get_gpr(vcpu, ax_ra) +
+				     kvmppc_get_gpr(vcpu, ax_rb);
+
+			emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd,
+							    addr, FPU_LS_DOUBLE);
+
+			if (emulated == EMULATE_DONE)
+				kvmppc_set_gpr(vcpu, ax_ra, addr);
+			break;
+		}
+		case OP_31_STFIWX:
+		{
+			ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) +
+				     kvmppc_get_gpr(vcpu, ax_rb);
+
+			emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd,
+							    addr,
+							    FPU_LS_SINGLE_LOW);
+			break;
+		}
+			break;
+		}
+		break;
+	case 59:
+		switch (inst_get_field(inst, 21, 30)) {
+		case OP_59_FADDS:
+			fpd_fadds(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+			kvmppc_sync_qpr(vcpu, ax_rd);
+			break;
+		case OP_59_FSUBS:
+			fpd_fsubs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+			kvmppc_sync_qpr(vcpu, ax_rd);
+			break;
+		case OP_59_FDIVS:
+			fpd_fdivs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+			kvmppc_sync_qpr(vcpu, ax_rd);
+			break;
+		case OP_59_FRES:
+			fpd_fres(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
+			kvmppc_sync_qpr(vcpu, ax_rd);
+			break;
+		case OP_59_FRSQRTES:
+			fpd_frsqrtes(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
+			kvmppc_sync_qpr(vcpu, ax_rd);
+			break;
+		}
+		switch (inst_get_field(inst, 26, 30)) {
+		case OP_59_FMULS:
+			fpd_fmuls(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c);
+			kvmppc_sync_qpr(vcpu, ax_rd);
+			break;
+		case OP_59_FMSUBS:
+			fpd_fmsubs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			kvmppc_sync_qpr(vcpu, ax_rd);
+			break;
+		case OP_59_FMADDS:
+			fpd_fmadds(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			kvmppc_sync_qpr(vcpu, ax_rd);
+			break;
+		case OP_59_FNMSUBS:
+			fpd_fnmsubs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			kvmppc_sync_qpr(vcpu, ax_rd);
+			break;
+		case OP_59_FNMADDS:
+			fpd_fnmadds(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			kvmppc_sync_qpr(vcpu, ax_rd);
+			break;
+		}
+		break;
+	case 63:
+		switch (inst_get_field(inst, 21, 30)) {
+		case OP_63_MTFSB0:
+		case OP_63_MTFSB1:
+		case OP_63_MCRFS:
+		case OP_63_MTFSFI:
+			/* XXX need to implement */
+			break;
+		case OP_63_MFFS:
+			/* XXX missing CR */
+			*fpr_d = vcpu->arch.fp.fpscr;
+			break;
+		case OP_63_MTFSF:
+			/* XXX missing fm bits */
+			/* XXX missing CR */
+			vcpu->arch.fp.fpscr = *fpr_b;
+			break;
+		case OP_63_FCMPU:
+		{
+			u32 tmp_cr;
+			u32 cr0_mask = 0xf0000000;
+			u32 cr_shift = inst_get_field(inst, 6, 8) * 4;
+
+			fpd_fcmpu(&vcpu->arch.fp.fpscr, &tmp_cr, fpr_a, fpr_b);
+			cr &= ~(cr0_mask >> cr_shift);
+			cr |= (cr & cr0_mask) >> cr_shift;
+			break;
+		}
+		case OP_63_FCMPO:
+		{
+			u32 tmp_cr;
+			u32 cr0_mask = 0xf0000000;
+			u32 cr_shift = inst_get_field(inst, 6, 8) * 4;
+
+			fpd_fcmpo(&vcpu->arch.fp.fpscr, &tmp_cr, fpr_a, fpr_b);
+			cr &= ~(cr0_mask >> cr_shift);
+			cr |= (cr & cr0_mask) >> cr_shift;
+			break;
+		}
+		case OP_63_FNEG:
+			fpd_fneg(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
+			break;
+		case OP_63_FMR:
+			*fpr_d = *fpr_b;
+			break;
+		case OP_63_FABS:
+			fpd_fabs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
+			break;
+		case OP_63_FCPSGN:
+			fpd_fcpsgn(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+			break;
+		case OP_63_FDIV:
+			fpd_fdiv(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+			break;
+		case OP_63_FADD:
+			fpd_fadd(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+			break;
+		case OP_63_FSUB:
+			fpd_fsub(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+			break;
+		case OP_63_FCTIW:
+			fpd_fctiw(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
+			break;
+		case OP_63_FCTIWZ:
+			fpd_fctiwz(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
+			break;
+		case OP_63_FRSP:
+			fpd_frsp(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
+			kvmppc_sync_qpr(vcpu, ax_rd);
+			break;
+		case OP_63_FRSQRTE:
+		{
+			double one = 1.0f;
+
+			/* fD = sqrt(fB) */
+			fpd_fsqrt(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
+			/* fD = 1.0f / fD */
+			fpd_fdiv(&vcpu->arch.fp.fpscr, &cr, fpr_d, (u64*)&one, fpr_d);
+			break;
+		}
+		}
+		switch (inst_get_field(inst, 26, 30)) {
+		case OP_63_FMUL:
+			fpd_fmul(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c);
+			break;
+		case OP_63_FSEL:
+			fpd_fsel(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			break;
+		case OP_63_FMSUB:
+			fpd_fmsub(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			break;
+		case OP_63_FMADD:
+			fpd_fmadd(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			break;
+		case OP_63_FNMSUB:
+			fpd_fnmsub(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			break;
+		case OP_63_FNMADD:
+			fpd_fnmadd(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			break;
+		}
+		break;
+	}
+
+#ifdef DEBUG
+	for (i = 0; i < ARRAY_SIZE(vcpu->arch.fp.fpr); i++) {
+		u32 f;
+		kvm_cvt_df(&VCPU_FPR(vcpu, i), &f);
+		dprintk(KERN_INFO "FPR[%d] = 0x%x\n", i, f);
+	}
+#endif
+
+	if (rcomp)
+		kvmppc_set_cr(vcpu, cr);
+
+	preempt_enable();
+
+	return emulated;
+}
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
new file mode 100644
index 000000000..f57383941
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -0,0 +1,1772 @@
+/*
+ * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *    Alexander Graf <agraf@suse.de>
+ *    Kevin Wolf <mail@kevin-wolf.de>
+ *    Paul Mackerras <paulus@samba.org>
+ *
+ * Description:
+ * Functions relating to running KVM on Book 3S processors where
+ * we don't have access to hypervisor mode, and we run the guest
+ * in problem state (user mode).
+ *
+ * This file is derived from arch/powerpc/kvm/44x.c,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/export.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu_context.h>
+#include <asm/switch_to.h>
+#include <asm/firmware.h>
+#include <asm/hvcall.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+
+#include "book3s.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace_pr.h"
+
+/* #define EXIT_DEBUG */
+/* #define DEBUG_EXT */
+
+static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
+			     ulong msr);
+static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
+
+/* Some compatibility defines */
+#ifdef CONFIG_PPC_BOOK3S_32
+#define MSR_USER32 MSR_USER
+#define MSR_USER64 MSR_USER
+#define HW_PAGE_SIZE PAGE_SIZE
+#endif
+
+static bool kvmppc_is_split_real(struct kvm_vcpu *vcpu)
+{
+	ulong msr = kvmppc_get_msr(vcpu);
+	return (msr & (MSR_IR|MSR_DR)) == MSR_DR;
+}
+
+static void kvmppc_fixup_split_real(struct kvm_vcpu *vcpu)
+{
+	ulong msr = kvmppc_get_msr(vcpu);
+	ulong pc = kvmppc_get_pc(vcpu);
+
+	/* We are in DR only split real mode */
+	if ((msr & (MSR_IR|MSR_DR)) != MSR_DR)
+		return;
+
+	/* We have not fixed up the guest already */
+	if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK)
+		return;
+
+	/* The code is in fixupable address space */
+	if (pc & SPLIT_HACK_MASK)
+		return;
+
+	vcpu->arch.hflags |= BOOK3S_HFLAG_SPLIT_HACK;
+	kvmppc_set_pc(vcpu, pc | SPLIT_HACK_OFFS);
+}
+
+void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu);
+
+static void kvmppc_core_vcpu_load_pr(struct kvm_vcpu *vcpu, int cpu)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	memcpy(svcpu->slb, to_book3s(vcpu)->slb_shadow, sizeof(svcpu->slb));
+	svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max;
+	svcpu->in_use = 0;
+	svcpu_put(svcpu);
+#endif
+
+	/* Disable AIL if supported */
+	if (cpu_has_feature(CPU_FTR_HVMODE) &&
+	    cpu_has_feature(CPU_FTR_ARCH_207S))
+		mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_AIL);
+
+	vcpu->cpu = smp_processor_id();
+#ifdef CONFIG_PPC_BOOK3S_32
+	current->thread.kvm_shadow_vcpu = vcpu->arch.shadow_vcpu;
+#endif
+
+	if (kvmppc_is_split_real(vcpu))
+		kvmppc_fixup_split_real(vcpu);
+}
+
+static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+	struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu);
+	if (svcpu->in_use) {
+		kvmppc_copy_from_svcpu(vcpu, svcpu);
+	}
+	memcpy(to_book3s(vcpu)->slb_shadow, svcpu->slb, sizeof(svcpu->slb));
+	to_book3s(vcpu)->slb_shadow_max = svcpu->slb_max;
+	svcpu_put(svcpu);
+#endif
+
+	if (kvmppc_is_split_real(vcpu))
+		kvmppc_unfixup_split_real(vcpu);
+
+	kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX);
+	kvmppc_giveup_fac(vcpu, FSCR_TAR_LG);
+
+	/* Enable AIL if supported */
+	if (cpu_has_feature(CPU_FTR_HVMODE) &&
+	    cpu_has_feature(CPU_FTR_ARCH_207S))
+		mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_AIL_3);
+
+	vcpu->cpu = -1;
+}
+
+/* Copy data needed by real-mode code from vcpu to shadow vcpu */
+void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu,
+			  struct kvm_vcpu *vcpu)
+{
+	svcpu->gpr[0] = vcpu->arch.gpr[0];
+	svcpu->gpr[1] = vcpu->arch.gpr[1];
+	svcpu->gpr[2] = vcpu->arch.gpr[2];
+	svcpu->gpr[3] = vcpu->arch.gpr[3];
+	svcpu->gpr[4] = vcpu->arch.gpr[4];
+	svcpu->gpr[5] = vcpu->arch.gpr[5];
+	svcpu->gpr[6] = vcpu->arch.gpr[6];
+	svcpu->gpr[7] = vcpu->arch.gpr[7];
+	svcpu->gpr[8] = vcpu->arch.gpr[8];
+	svcpu->gpr[9] = vcpu->arch.gpr[9];
+	svcpu->gpr[10] = vcpu->arch.gpr[10];
+	svcpu->gpr[11] = vcpu->arch.gpr[11];
+	svcpu->gpr[12] = vcpu->arch.gpr[12];
+	svcpu->gpr[13] = vcpu->arch.gpr[13];
+	svcpu->cr  = vcpu->arch.cr;
+	svcpu->xer = vcpu->arch.xer;
+	svcpu->ctr = vcpu->arch.ctr;
+	svcpu->lr  = vcpu->arch.lr;
+	svcpu->pc  = vcpu->arch.pc;
+#ifdef CONFIG_PPC_BOOK3S_64
+	svcpu->shadow_fscr = vcpu->arch.shadow_fscr;
+#endif
+	/*
+	 * Now also save the current time base value. We use this
+	 * to find the guest purr and spurr value.
+	 */
+	vcpu->arch.entry_tb = get_tb();
+	vcpu->arch.entry_vtb = get_vtb();
+	if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		vcpu->arch.entry_ic = mfspr(SPRN_IC);
+	svcpu->in_use = true;
+}
+
+/* Copy data touched by real-mode code from shadow vcpu back to vcpu */
+void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu,
+			    struct kvmppc_book3s_shadow_vcpu *svcpu)
+{
+	/*
+	 * vcpu_put would just call us again because in_use hasn't
+	 * been updated yet.
+	 */
+	preempt_disable();
+
+	/*
+	 * Maybe we were already preempted and synced the svcpu from
+	 * our preempt notifiers. Don't bother touching this svcpu then.
+	 */
+	if (!svcpu->in_use)
+		goto out;
+
+	vcpu->arch.gpr[0] = svcpu->gpr[0];
+	vcpu->arch.gpr[1] = svcpu->gpr[1];
+	vcpu->arch.gpr[2] = svcpu->gpr[2];
+	vcpu->arch.gpr[3] = svcpu->gpr[3];
+	vcpu->arch.gpr[4] = svcpu->gpr[4];
+	vcpu->arch.gpr[5] = svcpu->gpr[5];
+	vcpu->arch.gpr[6] = svcpu->gpr[6];
+	vcpu->arch.gpr[7] = svcpu->gpr[7];
+	vcpu->arch.gpr[8] = svcpu->gpr[8];
+	vcpu->arch.gpr[9] = svcpu->gpr[9];
+	vcpu->arch.gpr[10] = svcpu->gpr[10];
+	vcpu->arch.gpr[11] = svcpu->gpr[11];
+	vcpu->arch.gpr[12] = svcpu->gpr[12];
+	vcpu->arch.gpr[13] = svcpu->gpr[13];
+	vcpu->arch.cr  = svcpu->cr;
+	vcpu->arch.xer = svcpu->xer;
+	vcpu->arch.ctr = svcpu->ctr;
+	vcpu->arch.lr  = svcpu->lr;
+	vcpu->arch.pc  = svcpu->pc;
+	vcpu->arch.shadow_srr1 = svcpu->shadow_srr1;
+	vcpu->arch.fault_dar   = svcpu->fault_dar;
+	vcpu->arch.fault_dsisr = svcpu->fault_dsisr;
+	vcpu->arch.last_inst   = svcpu->last_inst;
+#ifdef CONFIG_PPC_BOOK3S_64
+	vcpu->arch.shadow_fscr = svcpu->shadow_fscr;
+#endif
+	/*
+	 * Update purr and spurr using time base on exit.
+	 */
+	vcpu->arch.purr += get_tb() - vcpu->arch.entry_tb;
+	vcpu->arch.spurr += get_tb() - vcpu->arch.entry_tb;
+	vcpu->arch.vtb += get_vtb() - vcpu->arch.entry_vtb;
+	if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		vcpu->arch.ic += mfspr(SPRN_IC) - vcpu->arch.entry_ic;
+	svcpu->in_use = false;
+
+out:
+	preempt_enable();
+}
+
+static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu)
+{
+	int r = 1; /* Indicate we want to get back into the guest */
+
+	/* We misuse TLB_FLUSH to indicate that we want to clear
+	   all shadow cache entries */
+	if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
+		kvmppc_mmu_pte_flush(vcpu, 0, 0);
+
+	return r;
+}
+
+/************* MMU Notifiers *************/
+static void do_kvm_unmap_hva(struct kvm *kvm, unsigned long start,
+			     unsigned long end)
+{
+	long i;
+	struct kvm_vcpu *vcpu;
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+
+	slots = kvm_memslots(kvm);
+	kvm_for_each_memslot(memslot, slots) {
+		unsigned long hva_start, hva_end;
+		gfn_t gfn, gfn_end;
+
+		hva_start = max(start, memslot->userspace_addr);
+		hva_end = min(end, memslot->userspace_addr +
+					(memslot->npages << PAGE_SHIFT));
+		if (hva_start >= hva_end)
+			continue;
+		/*
+		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
+		 * {gfn, gfn+1, ..., gfn_end-1}.
+		 */
+		gfn = hva_to_gfn_memslot(hva_start, memslot);
+		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			kvmppc_mmu_pte_pflush(vcpu, gfn << PAGE_SHIFT,
+					      gfn_end << PAGE_SHIFT);
+	}
+}
+
+static int kvm_unmap_hva_pr(struct kvm *kvm, unsigned long hva)
+{
+	trace_kvm_unmap_hva(hva);
+
+	do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE);
+
+	return 0;
+}
+
+static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start,
+				  unsigned long end)
+{
+	do_kvm_unmap_hva(kvm, start, end);
+
+	return 0;
+}
+
+static int kvm_age_hva_pr(struct kvm *kvm, unsigned long start,
+			  unsigned long end)
+{
+	/* XXX could be more clever ;) */
+	return 0;
+}
+
+static int kvm_test_age_hva_pr(struct kvm *kvm, unsigned long hva)
+{
+	/* XXX could be more clever ;) */
+	return 0;
+}
+
+static void kvm_set_spte_hva_pr(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+	/* The page will get remapped properly on its next fault */
+	do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE);
+}
+
+/*****************************************/
+
+static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
+{
+	ulong guest_msr = kvmppc_get_msr(vcpu);
+	ulong smsr = guest_msr;
+
+	/* Guest MSR values */
+	smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE;
+	/* Process MSR values */
+	smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
+	/* External providers the guest reserved */
+	smsr |= (guest_msr & vcpu->arch.guest_owned_ext);
+	/* 64-bit Process MSR values */
+#ifdef CONFIG_PPC_BOOK3S_64
+	smsr |= MSR_ISF | MSR_HV;
+#endif
+	vcpu->arch.shadow_msr = smsr;
+}
+
+static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr)
+{
+	ulong old_msr = kvmppc_get_msr(vcpu);
+
+#ifdef EXIT_DEBUG
+	printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr);
+#endif
+
+	msr &= to_book3s(vcpu)->msr_mask;
+	kvmppc_set_msr_fast(vcpu, msr);
+	kvmppc_recalc_shadow_msr(vcpu);
+
+	if (msr & MSR_POW) {
+		if (!vcpu->arch.pending_exceptions) {
+			kvm_vcpu_block(vcpu);
+			clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+			vcpu->stat.halt_wakeup++;
+
+			/* Unset POW bit after we woke up */
+			msr &= ~MSR_POW;
+			kvmppc_set_msr_fast(vcpu, msr);
+		}
+	}
+
+	if (kvmppc_is_split_real(vcpu))
+		kvmppc_fixup_split_real(vcpu);
+	else
+		kvmppc_unfixup_split_real(vcpu);
+
+	if ((kvmppc_get_msr(vcpu) & (MSR_PR|MSR_IR|MSR_DR)) !=
+		   (old_msr & (MSR_PR|MSR_IR|MSR_DR))) {
+		kvmppc_mmu_flush_segments(vcpu);
+		kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
+
+		/* Preload magic page segment when in kernel mode */
+		if (!(msr & MSR_PR) && vcpu->arch.magic_page_pa) {
+			struct kvm_vcpu_arch *a = &vcpu->arch;
+
+			if (msr & MSR_DR)
+				kvmppc_mmu_map_segment(vcpu, a->magic_page_ea);
+			else
+				kvmppc_mmu_map_segment(vcpu, a->magic_page_pa);
+		}
+	}
+
+	/*
+	 * When switching from 32 to 64-bit, we may have a stale 32-bit
+	 * magic page around, we need to flush it. Typically 32-bit magic
+	 * page will be instanciated when calling into RTAS. Note: We
+	 * assume that such transition only happens while in kernel mode,
+	 * ie, we never transition from user 32-bit to kernel 64-bit with
+	 * a 32-bit magic page around.
+	 */
+	if (vcpu->arch.magic_page_pa &&
+	    !(old_msr & MSR_PR) && !(old_msr & MSR_SF) && (msr & MSR_SF)) {
+		/* going from RTAS to normal kernel code */
+		kvmppc_mmu_pte_flush(vcpu, (uint32_t)vcpu->arch.magic_page_pa,
+				     ~0xFFFUL);
+	}
+
+	/* Preload FPU if it's enabled */
+	if (kvmppc_get_msr(vcpu) & MSR_FP)
+		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
+}
+
+void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr)
+{
+	u32 host_pvr;
+
+	vcpu->arch.hflags &= ~BOOK3S_HFLAG_SLB;
+	vcpu->arch.pvr = pvr;
+#ifdef CONFIG_PPC_BOOK3S_64
+	if ((pvr >= 0x330000) && (pvr < 0x70330000)) {
+		kvmppc_mmu_book3s_64_init(vcpu);
+		if (!to_book3s(vcpu)->hior_explicit)
+			to_book3s(vcpu)->hior = 0xfff00000;
+		to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL;
+		vcpu->arch.cpu_type = KVM_CPU_3S_64;
+	} else
+#endif
+	{
+		kvmppc_mmu_book3s_32_init(vcpu);
+		if (!to_book3s(vcpu)->hior_explicit)
+			to_book3s(vcpu)->hior = 0;
+		to_book3s(vcpu)->msr_mask = 0xffffffffULL;
+		vcpu->arch.cpu_type = KVM_CPU_3S_32;
+	}
+
+	kvmppc_sanity_check(vcpu);
+
+	/* If we are in hypervisor level on 970, we can tell the CPU to
+	 * treat DCBZ as 32 bytes store */
+	vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32;
+	if (vcpu->arch.mmu.is_dcbz32(vcpu) && (mfmsr() & MSR_HV) &&
+	    !strcmp(cur_cpu_spec->platform, "ppc970"))
+		vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
+
+	/* Cell performs badly if MSR_FEx are set. So let's hope nobody
+	   really needs them in a VM on Cell and force disable them. */
+	if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be"))
+		to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1);
+
+	/*
+	 * If they're asking for POWER6 or later, set the flag
+	 * indicating that we can do multiple large page sizes
+	 * and 1TB segments.
+	 * Also set the flag that indicates that tlbie has the large
+	 * page bit in the RB operand instead of the instruction.
+	 */
+	switch (PVR_VER(pvr)) {
+	case PVR_POWER6:
+	case PVR_POWER7:
+	case PVR_POWER7p:
+	case PVR_POWER8:
+		vcpu->arch.hflags |= BOOK3S_HFLAG_MULTI_PGSIZE |
+			BOOK3S_HFLAG_NEW_TLBIE;
+		break;
+	}
+
+#ifdef CONFIG_PPC_BOOK3S_32
+	/* 32 bit Book3S always has 32 byte dcbz */
+	vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
+#endif
+
+	/* On some CPUs we can execute paired single operations natively */
+	asm ( "mfpvr %0" : "=r"(host_pvr));
+	switch (host_pvr) {
+	case 0x00080200:	/* lonestar 2.0 */
+	case 0x00088202:	/* lonestar 2.2 */
+	case 0x70000100:	/* gekko 1.0 */
+	case 0x00080100:	/* gekko 2.0 */
+	case 0x00083203:	/* gekko 2.3a */
+	case 0x00083213:	/* gekko 2.3b */
+	case 0x00083204:	/* gekko 2.4 */
+	case 0x00083214:	/* gekko 2.4e (8SE) - retail HW2 */
+	case 0x00087200:	/* broadway */
+		vcpu->arch.hflags |= BOOK3S_HFLAG_NATIVE_PS;
+		/* Enable HID2.PSE - in case we need it later */
+		mtspr(SPRN_HID2_GEKKO, mfspr(SPRN_HID2_GEKKO) | (1 << 29));
+	}
+}
+
+/* Book3s_32 CPUs always have 32 bytes cache line size, which Linux assumes. To
+ * make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz to
+ * emulate 32 bytes dcbz length.
+ *
+ * The Book3s_64 inventors also realized this case and implemented a special bit
+ * in the HID5 register, which is a hypervisor ressource. Thus we can't use it.
+ *
+ * My approach here is to patch the dcbz instruction on executing pages.
+ */
+static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
+{
+	struct page *hpage;
+	u64 hpage_offset;
+	u32 *page;
+	int i;
+
+	hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT);
+	if (is_error_page(hpage))
+		return;
+
+	hpage_offset = pte->raddr & ~PAGE_MASK;
+	hpage_offset &= ~0xFFFULL;
+	hpage_offset /= 4;
+
+	get_page(hpage);
+	page = kmap_atomic(hpage);
+
+	/* patch dcbz into reserved instruction, so we trap */
+	for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++)
+		if ((be32_to_cpu(page[i]) & 0xff0007ff) == INS_DCBZ)
+			page[i] &= cpu_to_be32(0xfffffff7);
+
+	kunmap_atomic(page);
+	put_page(hpage);
+}
+
+static int kvmppc_visible_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+	ulong mp_pa = vcpu->arch.magic_page_pa;
+
+	if (!(kvmppc_get_msr(vcpu) & MSR_SF))
+		mp_pa = (uint32_t)mp_pa;
+
+	gpa &= ~0xFFFULL;
+	if (unlikely(mp_pa) && unlikely((mp_pa & KVM_PAM) == (gpa & KVM_PAM))) {
+		return 1;
+	}
+
+	return kvm_is_visible_gfn(vcpu->kvm, gpa >> PAGE_SHIFT);
+}
+
+int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
+			    ulong eaddr, int vec)
+{
+	bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE);
+	bool iswrite = false;
+	int r = RESUME_GUEST;
+	int relocated;
+	int page_found = 0;
+	struct kvmppc_pte pte;
+	bool is_mmio = false;
+	bool dr = (kvmppc_get_msr(vcpu) & MSR_DR) ? true : false;
+	bool ir = (kvmppc_get_msr(vcpu) & MSR_IR) ? true : false;
+	u64 vsid;
+
+	relocated = data ? dr : ir;
+	if (data && (vcpu->arch.fault_dsisr & DSISR_ISSTORE))
+		iswrite = true;
+
+	/* Resolve real address if translation turned on */
+	if (relocated) {
+		page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data, iswrite);
+	} else {
+		pte.may_execute = true;
+		pte.may_read = true;
+		pte.may_write = true;
+		pte.raddr = eaddr & KVM_PAM;
+		pte.eaddr = eaddr;
+		pte.vpage = eaddr >> 12;
+		pte.page_size = MMU_PAGE_64K;
+	}
+
+	switch (kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) {
+	case 0:
+		pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12));
+		break;
+	case MSR_DR:
+		if (!data &&
+		    (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) &&
+		    ((pte.raddr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS))
+			pte.raddr &= ~SPLIT_HACK_MASK;
+		/* fall through */
+	case MSR_IR:
+		vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid);
+
+		if ((kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) == MSR_DR)
+			pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12));
+		else
+			pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12));
+		pte.vpage |= vsid;
+
+		if (vsid == -1)
+			page_found = -EINVAL;
+		break;
+	}
+
+	if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
+	   (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
+		/*
+		 * If we do the dcbz hack, we have to NX on every execution,
+		 * so we can patch the executing code. This renders our guest
+		 * NX-less.
+		 */
+		pte.may_execute = !data;
+	}
+
+	if (page_found == -ENOENT) {
+		/* Page not found in guest PTE entries */
+		u64 ssrr1 = vcpu->arch.shadow_srr1;
+		u64 msr = kvmppc_get_msr(vcpu);
+		kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu));
+		kvmppc_set_dsisr(vcpu, vcpu->arch.fault_dsisr);
+		kvmppc_set_msr_fast(vcpu, msr | (ssrr1 & 0xf8000000ULL));
+		kvmppc_book3s_queue_irqprio(vcpu, vec);
+	} else if (page_found == -EPERM) {
+		/* Storage protection */
+		u32 dsisr = vcpu->arch.fault_dsisr;
+		u64 ssrr1 = vcpu->arch.shadow_srr1;
+		u64 msr = kvmppc_get_msr(vcpu);
+		kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu));
+		dsisr = (dsisr & ~DSISR_NOHPTE) | DSISR_PROTFAULT;
+		kvmppc_set_dsisr(vcpu, dsisr);
+		kvmppc_set_msr_fast(vcpu, msr | (ssrr1 & 0xf8000000ULL));
+		kvmppc_book3s_queue_irqprio(vcpu, vec);
+	} else if (page_found == -EINVAL) {
+		/* Page not found in guest SLB */
+		kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu));
+		kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80);
+	} else if (!is_mmio &&
+		   kvmppc_visible_gpa(vcpu, pte.raddr)) {
+		if (data && !(vcpu->arch.fault_dsisr & DSISR_NOHPTE)) {
+			/*
+			 * There is already a host HPTE there, presumably
+			 * a read-only one for a page the guest thinks
+			 * is writable, so get rid of it first.
+			 */
+			kvmppc_mmu_unmap_page(vcpu, &pte);
+		}
+		/* The guest's PTE is not mapped yet. Map on the host */
+		kvmppc_mmu_map_page(vcpu, &pte, iswrite);
+		if (data)
+			vcpu->stat.sp_storage++;
+		else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
+			 (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32)))
+			kvmppc_patch_dcbz(vcpu, &pte);
+	} else {
+		/* MMIO */
+		vcpu->stat.mmio_exits++;
+		vcpu->arch.paddr_accessed = pte.raddr;
+		vcpu->arch.vaddr_accessed = pte.eaddr;
+		r = kvmppc_emulate_mmio(run, vcpu);
+		if ( r == RESUME_HOST_NV )
+			r = RESUME_HOST;
+	}
+
+	return r;
+}
+
+/* Give up external provider (FPU, Altivec, VSX) */
+void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
+{
+	struct thread_struct *t = &current->thread;
+
+	/*
+	 * VSX instructions can access FP and vector registers, so if
+	 * we are giving up VSX, make sure we give up FP and VMX as well.
+	 */
+	if (msr & MSR_VSX)
+		msr |= MSR_FP | MSR_VEC;
+
+	msr &= vcpu->arch.guest_owned_ext;
+	if (!msr)
+		return;
+
+#ifdef DEBUG_EXT
+	printk(KERN_INFO "Giving up ext 0x%lx\n", msr);
+#endif
+
+	if (msr & MSR_FP) {
+		/*
+		 * Note that on CPUs with VSX, giveup_fpu stores
+		 * both the traditional FP registers and the added VSX
+		 * registers into thread.fp_state.fpr[].
+		 */
+		if (t->regs->msr & MSR_FP)
+			giveup_fpu(current);
+		t->fp_save_area = NULL;
+	}
+
+#ifdef CONFIG_ALTIVEC
+	if (msr & MSR_VEC) {
+		if (current->thread.regs->msr & MSR_VEC)
+			giveup_altivec(current);
+		t->vr_save_area = NULL;
+	}
+#endif
+
+	vcpu->arch.guest_owned_ext &= ~(msr | MSR_VSX);
+	kvmppc_recalc_shadow_msr(vcpu);
+}
+
+/* Give up facility (TAR / EBB / DSCR) */
+static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+	if (!(vcpu->arch.shadow_fscr & (1ULL << fac))) {
+		/* Facility not available to the guest, ignore giveup request*/
+		return;
+	}
+
+	switch (fac) {
+	case FSCR_TAR_LG:
+		vcpu->arch.tar = mfspr(SPRN_TAR);
+		mtspr(SPRN_TAR, current->thread.tar);
+		vcpu->arch.shadow_fscr &= ~FSCR_TAR;
+		break;
+	}
+#endif
+}
+
+/* Handle external providers (FPU, Altivec, VSX) */
+static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
+			     ulong msr)
+{
+	struct thread_struct *t = &current->thread;
+
+	/* When we have paired singles, we emulate in software */
+	if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)
+		return RESUME_GUEST;
+
+	if (!(kvmppc_get_msr(vcpu) & msr)) {
+		kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+		return RESUME_GUEST;
+	}
+
+	if (msr == MSR_VSX) {
+		/* No VSX?  Give an illegal instruction interrupt */
+#ifdef CONFIG_VSX
+		if (!cpu_has_feature(CPU_FTR_VSX))
+#endif
+		{
+			kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
+			return RESUME_GUEST;
+		}
+
+		/*
+		 * We have to load up all the FP and VMX registers before
+		 * we can let the guest use VSX instructions.
+		 */
+		msr = MSR_FP | MSR_VEC | MSR_VSX;
+	}
+
+	/* See if we already own all the ext(s) needed */
+	msr &= ~vcpu->arch.guest_owned_ext;
+	if (!msr)
+		return RESUME_GUEST;
+
+#ifdef DEBUG_EXT
+	printk(KERN_INFO "Loading up ext 0x%lx\n", msr);
+#endif
+
+	if (msr & MSR_FP) {
+		preempt_disable();
+		enable_kernel_fp();
+		load_fp_state(&vcpu->arch.fp);
+		t->fp_save_area = &vcpu->arch.fp;
+		preempt_enable();
+	}
+
+	if (msr & MSR_VEC) {
+#ifdef CONFIG_ALTIVEC
+		preempt_disable();
+		enable_kernel_altivec();
+		load_vr_state(&vcpu->arch.vr);
+		t->vr_save_area = &vcpu->arch.vr;
+		preempt_enable();
+#endif
+	}
+
+	t->regs->msr |= msr;
+	vcpu->arch.guest_owned_ext |= msr;
+	kvmppc_recalc_shadow_msr(vcpu);
+
+	return RESUME_GUEST;
+}
+
+/*
+ * Kernel code using FP or VMX could have flushed guest state to
+ * the thread_struct; if so, get it back now.
+ */
+static void kvmppc_handle_lost_ext(struct kvm_vcpu *vcpu)
+{
+	unsigned long lost_ext;
+
+	lost_ext = vcpu->arch.guest_owned_ext & ~current->thread.regs->msr;
+	if (!lost_ext)
+		return;
+
+	if (lost_ext & MSR_FP) {
+		preempt_disable();
+		enable_kernel_fp();
+		load_fp_state(&vcpu->arch.fp);
+		preempt_enable();
+	}
+#ifdef CONFIG_ALTIVEC
+	if (lost_ext & MSR_VEC) {
+		preempt_disable();
+		enable_kernel_altivec();
+		load_vr_state(&vcpu->arch.vr);
+		preempt_enable();
+	}
+#endif
+	current->thread.regs->msr |= lost_ext;
+}
+
+#ifdef CONFIG_PPC_BOOK3S_64
+
+static void kvmppc_trigger_fac_interrupt(struct kvm_vcpu *vcpu, ulong fac)
+{
+	/* Inject the Interrupt Cause field and trigger a guest interrupt */
+	vcpu->arch.fscr &= ~(0xffULL << 56);
+	vcpu->arch.fscr |= (fac << 56);
+	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FAC_UNAVAIL);
+}
+
+static void kvmppc_emulate_fac(struct kvm_vcpu *vcpu, ulong fac)
+{
+	enum emulation_result er = EMULATE_FAIL;
+
+	if (!(kvmppc_get_msr(vcpu) & MSR_PR))
+		er = kvmppc_emulate_instruction(vcpu->run, vcpu);
+
+	if ((er != EMULATE_DONE) && (er != EMULATE_AGAIN)) {
+		/* Couldn't emulate, trigger interrupt in guest */
+		kvmppc_trigger_fac_interrupt(vcpu, fac);
+	}
+}
+
+/* Enable facilities (TAR, EBB, DSCR) for the guest */
+static int kvmppc_handle_fac(struct kvm_vcpu *vcpu, ulong fac)
+{
+	bool guest_fac_enabled;
+	BUG_ON(!cpu_has_feature(CPU_FTR_ARCH_207S));
+
+	/*
+	 * Not every facility is enabled by FSCR bits, check whether the
+	 * guest has this facility enabled at all.
+	 */
+	switch (fac) {
+	case FSCR_TAR_LG:
+	case FSCR_EBB_LG:
+		guest_fac_enabled = (vcpu->arch.fscr & (1ULL << fac));
+		break;
+	case FSCR_TM_LG:
+		guest_fac_enabled = kvmppc_get_msr(vcpu) & MSR_TM;
+		break;
+	default:
+		guest_fac_enabled = false;
+		break;
+	}
+
+	if (!guest_fac_enabled) {
+		/* Facility not enabled by the guest */
+		kvmppc_trigger_fac_interrupt(vcpu, fac);
+		return RESUME_GUEST;
+	}
+
+	switch (fac) {
+	case FSCR_TAR_LG:
+		/* TAR switching isn't lazy in Linux yet */
+		current->thread.tar = mfspr(SPRN_TAR);
+		mtspr(SPRN_TAR, vcpu->arch.tar);
+		vcpu->arch.shadow_fscr |= FSCR_TAR;
+		break;
+	default:
+		kvmppc_emulate_fac(vcpu, fac);
+		break;
+	}
+
+	return RESUME_GUEST;
+}
+
+void kvmppc_set_fscr(struct kvm_vcpu *vcpu, u64 fscr)
+{
+	if ((vcpu->arch.fscr & FSCR_TAR) && !(fscr & FSCR_TAR)) {
+		/* TAR got dropped, drop it in shadow too */
+		kvmppc_giveup_fac(vcpu, FSCR_TAR_LG);
+	}
+	vcpu->arch.fscr = fscr;
+}
+#endif
+
+int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
+			  unsigned int exit_nr)
+{
+	int r = RESUME_HOST;
+	int s;
+
+	vcpu->stat.sum_exits++;
+
+	run->exit_reason = KVM_EXIT_UNKNOWN;
+	run->ready_for_interrupt_injection = 1;
+
+	/* We get here with MSR.EE=1 */
+
+	trace_kvm_exit(exit_nr, vcpu);
+	kvm_guest_exit();
+
+	switch (exit_nr) {
+	case BOOK3S_INTERRUPT_INST_STORAGE:
+	{
+		ulong shadow_srr1 = vcpu->arch.shadow_srr1;
+		vcpu->stat.pf_instruc++;
+
+		if (kvmppc_is_split_real(vcpu))
+			kvmppc_fixup_split_real(vcpu);
+
+#ifdef CONFIG_PPC_BOOK3S_32
+		/* We set segments as unused segments when invalidating them. So
+		 * treat the respective fault as segment fault. */
+		{
+			struct kvmppc_book3s_shadow_vcpu *svcpu;
+			u32 sr;
+
+			svcpu = svcpu_get(vcpu);
+			sr = svcpu->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT];
+			svcpu_put(svcpu);
+			if (sr == SR_INVALID) {
+				kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu));
+				r = RESUME_GUEST;
+				break;
+			}
+		}
+#endif
+
+		/* only care about PTEG not found errors, but leave NX alone */
+		if (shadow_srr1 & 0x40000000) {
+			int idx = srcu_read_lock(&vcpu->kvm->srcu);
+			r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr);
+			srcu_read_unlock(&vcpu->kvm->srcu, idx);
+			vcpu->stat.sp_instruc++;
+		} else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
+			  (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
+			/*
+			 * XXX If we do the dcbz hack we use the NX bit to flush&patch the page,
+			 *     so we can't use the NX bit inside the guest. Let's cross our fingers,
+			 *     that no guest that needs the dcbz hack does NX.
+			 */
+			kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL);
+			r = RESUME_GUEST;
+		} else {
+			u64 msr = kvmppc_get_msr(vcpu);
+			msr |= shadow_srr1 & 0x58000000;
+			kvmppc_set_msr_fast(vcpu, msr);
+			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+			r = RESUME_GUEST;
+		}
+		break;
+	}
+	case BOOK3S_INTERRUPT_DATA_STORAGE:
+	{
+		ulong dar = kvmppc_get_fault_dar(vcpu);
+		u32 fault_dsisr = vcpu->arch.fault_dsisr;
+		vcpu->stat.pf_storage++;
+
+#ifdef CONFIG_PPC_BOOK3S_32
+		/* We set segments as unused segments when invalidating them. So
+		 * treat the respective fault as segment fault. */
+		{
+			struct kvmppc_book3s_shadow_vcpu *svcpu;
+			u32 sr;
+
+			svcpu = svcpu_get(vcpu);
+			sr = svcpu->sr[dar >> SID_SHIFT];
+			svcpu_put(svcpu);
+			if (sr == SR_INVALID) {
+				kvmppc_mmu_map_segment(vcpu, dar);
+				r = RESUME_GUEST;
+				break;
+			}
+		}
+#endif
+
+		/*
+		 * We need to handle missing shadow PTEs, and
+		 * protection faults due to us mapping a page read-only
+		 * when the guest thinks it is writable.
+		 */
+		if (fault_dsisr & (DSISR_NOHPTE | DSISR_PROTFAULT)) {
+			int idx = srcu_read_lock(&vcpu->kvm->srcu);
+			r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr);
+			srcu_read_unlock(&vcpu->kvm->srcu, idx);
+		} else {
+			kvmppc_set_dar(vcpu, dar);
+			kvmppc_set_dsisr(vcpu, fault_dsisr);
+			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+			r = RESUME_GUEST;
+		}
+		break;
+	}
+	case BOOK3S_INTERRUPT_DATA_SEGMENT:
+		if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) {
+			kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu));
+			kvmppc_book3s_queue_irqprio(vcpu,
+				BOOK3S_INTERRUPT_DATA_SEGMENT);
+		}
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_INST_SEGMENT:
+		if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)) < 0) {
+			kvmppc_book3s_queue_irqprio(vcpu,
+				BOOK3S_INTERRUPT_INST_SEGMENT);
+		}
+		r = RESUME_GUEST;
+		break;
+	/* We're good on these - the host merely wanted to get our attention */
+	case BOOK3S_INTERRUPT_DECREMENTER:
+	case BOOK3S_INTERRUPT_HV_DECREMENTER:
+	case BOOK3S_INTERRUPT_DOORBELL:
+	case BOOK3S_INTERRUPT_H_DOORBELL:
+		vcpu->stat.dec_exits++;
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_EXTERNAL:
+	case BOOK3S_INTERRUPT_EXTERNAL_LEVEL:
+	case BOOK3S_INTERRUPT_EXTERNAL_HV:
+		vcpu->stat.ext_intr_exits++;
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_PERFMON:
+		r = RESUME_GUEST;
+		break;
+	case BOOK3S_INTERRUPT_PROGRAM:
+	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
+	{
+		enum emulation_result er;
+		ulong flags;
+		u32 last_inst;
+		int emul;
+
+program_interrupt:
+		flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
+
+		emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
+		if (emul != EMULATE_DONE) {
+			r = RESUME_GUEST;
+			break;
+		}
+
+		if (kvmppc_get_msr(vcpu) & MSR_PR) {
+#ifdef EXIT_DEBUG
+			pr_info("Userspace triggered 0x700 exception at\n 0x%lx (0x%x)\n",
+				kvmppc_get_pc(vcpu), last_inst);
+#endif
+			if ((last_inst & 0xff0007ff) !=
+			    (INS_DCBZ & 0xfffffff7)) {
+				kvmppc_core_queue_program(vcpu, flags);
+				r = RESUME_GUEST;
+				break;
+			}
+		}
+
+		vcpu->stat.emulated_inst_exits++;
+		er = kvmppc_emulate_instruction(run, vcpu);
+		switch (er) {
+		case EMULATE_DONE:
+			r = RESUME_GUEST_NV;
+			break;
+		case EMULATE_AGAIN:
+			r = RESUME_GUEST;
+			break;
+		case EMULATE_FAIL:
+			printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
+			       __func__, kvmppc_get_pc(vcpu), last_inst);
+			kvmppc_core_queue_program(vcpu, flags);
+			r = RESUME_GUEST;
+			break;
+		case EMULATE_DO_MMIO:
+			run->exit_reason = KVM_EXIT_MMIO;
+			r = RESUME_HOST_NV;
+			break;
+		case EMULATE_EXIT_USER:
+			r = RESUME_HOST_NV;
+			break;
+		default:
+			BUG();
+		}
+		break;
+	}
+	case BOOK3S_INTERRUPT_SYSCALL:
+	{
+		u32 last_sc;
+		int emul;
+
+		/* Get last sc for papr */
+		if (vcpu->arch.papr_enabled) {
+			/* The sc instuction points SRR0 to the next inst */
+			emul = kvmppc_get_last_inst(vcpu, INST_SC, &last_sc);
+			if (emul != EMULATE_DONE) {
+				kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) - 4);
+				r = RESUME_GUEST;
+				break;
+			}
+		}
+
+		if (vcpu->arch.papr_enabled &&
+		    (last_sc == 0x44000022) &&
+		    !(kvmppc_get_msr(vcpu) & MSR_PR)) {
+			/* SC 1 papr hypercalls */
+			ulong cmd = kvmppc_get_gpr(vcpu, 3);
+			int i;
+
+#ifdef CONFIG_PPC_BOOK3S_64
+			if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) {
+				r = RESUME_GUEST;
+				break;
+			}
+#endif
+
+			run->papr_hcall.nr = cmd;
+			for (i = 0; i < 9; ++i) {
+				ulong gpr = kvmppc_get_gpr(vcpu, 4 + i);
+				run->papr_hcall.args[i] = gpr;
+			}
+			run->exit_reason = KVM_EXIT_PAPR_HCALL;
+			vcpu->arch.hcall_needed = 1;
+			r = RESUME_HOST;
+		} else if (vcpu->arch.osi_enabled &&
+		    (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) &&
+		    (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) {
+			/* MOL hypercalls */
+			u64 *gprs = run->osi.gprs;
+			int i;
+
+			run->exit_reason = KVM_EXIT_OSI;
+			for (i = 0; i < 32; i++)
+				gprs[i] = kvmppc_get_gpr(vcpu, i);
+			vcpu->arch.osi_needed = 1;
+			r = RESUME_HOST_NV;
+		} else if (!(kvmppc_get_msr(vcpu) & MSR_PR) &&
+		    (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) {
+			/* KVM PV hypercalls */
+			kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
+			r = RESUME_GUEST;
+		} else {
+			/* Guest syscalls */
+			vcpu->stat.syscall_exits++;
+			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+			r = RESUME_GUEST;
+		}
+		break;
+	}
+	case BOOK3S_INTERRUPT_FP_UNAVAIL:
+	case BOOK3S_INTERRUPT_ALTIVEC:
+	case BOOK3S_INTERRUPT_VSX:
+	{
+		int ext_msr = 0;
+		int emul;
+		u32 last_inst;
+
+		if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE) {
+			/* Do paired single instruction emulation */
+			emul = kvmppc_get_last_inst(vcpu, INST_GENERIC,
+						    &last_inst);
+			if (emul == EMULATE_DONE)
+				goto program_interrupt;
+			else
+				r = RESUME_GUEST;
+
+			break;
+		}
+
+		/* Enable external provider */
+		switch (exit_nr) {
+		case BOOK3S_INTERRUPT_FP_UNAVAIL:
+			ext_msr = MSR_FP;
+			break;
+
+		case BOOK3S_INTERRUPT_ALTIVEC:
+			ext_msr = MSR_VEC;
+			break;
+
+		case BOOK3S_INTERRUPT_VSX:
+			ext_msr = MSR_VSX;
+			break;
+		}
+
+		r = kvmppc_handle_ext(vcpu, exit_nr, ext_msr);
+		break;
+	}
+	case BOOK3S_INTERRUPT_ALIGNMENT:
+	{
+		u32 last_inst;
+		int emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
+
+		if (emul == EMULATE_DONE) {
+			u32 dsisr;
+			u64 dar;
+
+			dsisr = kvmppc_alignment_dsisr(vcpu, last_inst);
+			dar = kvmppc_alignment_dar(vcpu, last_inst);
+
+			kvmppc_set_dsisr(vcpu, dsisr);
+			kvmppc_set_dar(vcpu, dar);
+
+			kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+		}
+		r = RESUME_GUEST;
+		break;
+	}
+#ifdef CONFIG_PPC_BOOK3S_64
+	case BOOK3S_INTERRUPT_FAC_UNAVAIL:
+		kvmppc_handle_fac(vcpu, vcpu->arch.shadow_fscr >> 56);
+		r = RESUME_GUEST;
+		break;
+#endif
+	case BOOK3S_INTERRUPT_MACHINE_CHECK:
+	case BOOK3S_INTERRUPT_TRACE:
+		kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+		r = RESUME_GUEST;
+		break;
+	default:
+	{
+		ulong shadow_srr1 = vcpu->arch.shadow_srr1;
+		/* Ugh - bork here! What did we get? */
+		printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n",
+			exit_nr, kvmppc_get_pc(vcpu), shadow_srr1);
+		r = RESUME_HOST;
+		BUG();
+		break;
+	}
+	}
+
+	if (!(r & RESUME_HOST)) {
+		/* To avoid clobbering exit_reason, only check for signals if
+		 * we aren't already exiting to userspace for some other
+		 * reason. */
+
+		/*
+		 * Interrupts could be timers for the guest which we have to
+		 * inject again, so let's postpone them until we're in the guest
+		 * and if we really did time things so badly, then we just exit
+		 * again due to a host external interrupt.
+		 */
+		s = kvmppc_prepare_to_enter(vcpu);
+		if (s <= 0)
+			r = s;
+		else {
+			/* interrupts now hard-disabled */
+			kvmppc_fix_ee_before_entry();
+		}
+
+		kvmppc_handle_lost_ext(vcpu);
+	}
+
+	trace_kvm_book3s_reenter(r, vcpu);
+
+	return r;
+}
+
+static int kvm_arch_vcpu_ioctl_get_sregs_pr(struct kvm_vcpu *vcpu,
+					    struct kvm_sregs *sregs)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	int i;
+
+	sregs->pvr = vcpu->arch.pvr;
+
+	sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1;
+	if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
+		for (i = 0; i < 64; i++) {
+			sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige | i;
+			sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
+		}
+	} else {
+		for (i = 0; i < 16; i++)
+			sregs->u.s.ppc32.sr[i] = kvmppc_get_sr(vcpu, i);
+
+		for (i = 0; i < 8; i++) {
+			sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw;
+			sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw;
+		}
+	}
+
+	return 0;
+}
+
+static int kvm_arch_vcpu_ioctl_set_sregs_pr(struct kvm_vcpu *vcpu,
+					    struct kvm_sregs *sregs)
+{
+	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
+	int i;
+
+	kvmppc_set_pvr_pr(vcpu, sregs->pvr);
+
+	vcpu3s->sdr1 = sregs->u.s.sdr1;
+	if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
+		for (i = 0; i < 64; i++) {
+			vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv,
+						    sregs->u.s.ppc64.slb[i].slbe);
+		}
+	} else {
+		for (i = 0; i < 16; i++) {
+			vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]);
+		}
+		for (i = 0; i < 8; i++) {
+			kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), false,
+				       (u32)sregs->u.s.ppc32.ibat[i]);
+			kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), true,
+				       (u32)(sregs->u.s.ppc32.ibat[i] >> 32));
+			kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), false,
+				       (u32)sregs->u.s.ppc32.dbat[i]);
+			kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), true,
+				       (u32)(sregs->u.s.ppc32.dbat[i] >> 32));
+		}
+	}
+
+	/* Flush the MMU after messing with the segments */
+	kvmppc_mmu_pte_flush(vcpu, 0, 0);
+
+	return 0;
+}
+
+static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
+				 union kvmppc_one_reg *val)
+{
+	int r = 0;
+
+	switch (id) {
+	case KVM_REG_PPC_DEBUG_INST:
+		*val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT);
+		break;
+	case KVM_REG_PPC_HIOR:
+		*val = get_reg_val(id, to_book3s(vcpu)->hior);
+		break;
+	case KVM_REG_PPC_LPCR:
+	case KVM_REG_PPC_LPCR_64:
+		/*
+		 * We are only interested in the LPCR_ILE bit
+		 */
+		if (vcpu->arch.intr_msr & MSR_LE)
+			*val = get_reg_val(id, LPCR_ILE);
+		else
+			*val = get_reg_val(id, 0);
+		break;
+	default:
+		r = -EINVAL;
+		break;
+	}
+
+	return r;
+}
+
+static void kvmppc_set_lpcr_pr(struct kvm_vcpu *vcpu, u64 new_lpcr)
+{
+	if (new_lpcr & LPCR_ILE)
+		vcpu->arch.intr_msr |= MSR_LE;
+	else
+		vcpu->arch.intr_msr &= ~MSR_LE;
+}
+
+static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
+				 union kvmppc_one_reg *val)
+{
+	int r = 0;
+
+	switch (id) {
+	case KVM_REG_PPC_HIOR:
+		to_book3s(vcpu)->hior = set_reg_val(id, *val);
+		to_book3s(vcpu)->hior_explicit = true;
+		break;
+	case KVM_REG_PPC_LPCR:
+	case KVM_REG_PPC_LPCR_64:
+		kvmppc_set_lpcr_pr(vcpu, set_reg_val(id, *val));
+		break;
+	default:
+		r = -EINVAL;
+		break;
+	}
+
+	return r;
+}
+
+static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm,
+						   unsigned int id)
+{
+	struct kvmppc_vcpu_book3s *vcpu_book3s;
+	struct kvm_vcpu *vcpu;
+	int err = -ENOMEM;
+	unsigned long p;
+
+	vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+	if (!vcpu)
+		goto out;
+
+	vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s));
+	if (!vcpu_book3s)
+		goto free_vcpu;
+	vcpu->arch.book3s = vcpu_book3s;
+
+#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
+	vcpu->arch.shadow_vcpu =
+		kzalloc(sizeof(*vcpu->arch.shadow_vcpu), GFP_KERNEL);
+	if (!vcpu->arch.shadow_vcpu)
+		goto free_vcpu3s;
+#endif
+
+	err = kvm_vcpu_init(vcpu, kvm, id);
+	if (err)
+		goto free_shadow_vcpu;
+
+	err = -ENOMEM;
+	p = __get_free_page(GFP_KERNEL|__GFP_ZERO);
+	if (!p)
+		goto uninit_vcpu;
+	vcpu->arch.shared = (void *)p;
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* Always start the shared struct in native endian mode */
+#ifdef __BIG_ENDIAN__
+        vcpu->arch.shared_big_endian = true;
+#else
+        vcpu->arch.shared_big_endian = false;
+#endif
+
+	/*
+	 * Default to the same as the host if we're on sufficiently
+	 * recent machine that we have 1TB segments;
+	 * otherwise default to PPC970FX.
+	 */
+	vcpu->arch.pvr = 0x3C0301;
+	if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
+		vcpu->arch.pvr = mfspr(SPRN_PVR);
+	vcpu->arch.intr_msr = MSR_SF;
+#else
+	/* default to book3s_32 (750) */
+	vcpu->arch.pvr = 0x84202;
+#endif
+	kvmppc_set_pvr_pr(vcpu, vcpu->arch.pvr);
+	vcpu->arch.slb_nr = 64;
+
+	vcpu->arch.shadow_msr = MSR_USER64 & ~MSR_LE;
+
+	err = kvmppc_mmu_init(vcpu);
+	if (err < 0)
+		goto uninit_vcpu;
+
+	return vcpu;
+
+uninit_vcpu:
+	kvm_vcpu_uninit(vcpu);
+free_shadow_vcpu:
+#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
+	kfree(vcpu->arch.shadow_vcpu);
+free_vcpu3s:
+#endif
+	vfree(vcpu_book3s);
+free_vcpu:
+	kmem_cache_free(kvm_vcpu_cache, vcpu);
+out:
+	return ERR_PTR(err);
+}
+
+static void kvmppc_core_vcpu_free_pr(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+
+	free_page((unsigned long)vcpu->arch.shared & PAGE_MASK);
+	kvm_vcpu_uninit(vcpu);
+#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
+	kfree(vcpu->arch.shadow_vcpu);
+#endif
+	vfree(vcpu_book3s);
+	kmem_cache_free(kvm_vcpu_cache, vcpu);
+}
+
+static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+	int ret;
+#ifdef CONFIG_ALTIVEC
+	unsigned long uninitialized_var(vrsave);
+#endif
+
+	/* Check if we can run the vcpu at all */
+	if (!vcpu->arch.sane) {
+		kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Interrupts could be timers for the guest which we have to inject
+	 * again, so let's postpone them until we're in the guest and if we
+	 * really did time things so badly, then we just exit again due to
+	 * a host external interrupt.
+	 */
+	ret = kvmppc_prepare_to_enter(vcpu);
+	if (ret <= 0)
+		goto out;
+	/* interrupts now hard-disabled */
+
+	/* Save FPU state in thread_struct */
+	if (current->thread.regs->msr & MSR_FP)
+		giveup_fpu(current);
+
+#ifdef CONFIG_ALTIVEC
+	/* Save Altivec state in thread_struct */
+	if (current->thread.regs->msr & MSR_VEC)
+		giveup_altivec(current);
+#endif
+
+#ifdef CONFIG_VSX
+	/* Save VSX state in thread_struct */
+	if (current->thread.regs->msr & MSR_VSX)
+		__giveup_vsx(current);
+#endif
+
+	/* Preload FPU if it's enabled */
+	if (kvmppc_get_msr(vcpu) & MSR_FP)
+		kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
+
+	kvmppc_fix_ee_before_entry();
+
+	ret = __kvmppc_vcpu_run(kvm_run, vcpu);
+
+	/* No need for kvm_guest_exit. It's done in handle_exit.
+	   We also get here with interrupts enabled. */
+
+	/* Make sure we save the guest FPU/Altivec/VSX state */
+	kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX);
+
+	/* Make sure we save the guest TAR/EBB/DSCR state */
+	kvmppc_giveup_fac(vcpu, FSCR_TAR_LG);
+
+out:
+	vcpu->mode = OUTSIDE_GUEST_MODE;
+	return ret;
+}
+
+/*
+ * Get (and clear) the dirty memory log for a memory slot.
+ */
+static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm,
+					 struct kvm_dirty_log *log)
+{
+	struct kvm_memory_slot *memslot;
+	struct kvm_vcpu *vcpu;
+	ulong ga, ga_end;
+	int is_dirty = 0;
+	int r;
+	unsigned long n;
+
+	mutex_lock(&kvm->slots_lock);
+
+	r = kvm_get_dirty_log(kvm, log, &is_dirty);
+	if (r)
+		goto out;
+
+	/* If nothing is dirty, don't bother messing with page tables. */
+	if (is_dirty) {
+		memslot = id_to_memslot(kvm->memslots, log->slot);
+
+		ga = memslot->base_gfn << PAGE_SHIFT;
+		ga_end = ga + (memslot->npages << PAGE_SHIFT);
+
+		kvm_for_each_vcpu(n, vcpu, kvm)
+			kvmppc_mmu_pte_pflush(vcpu, ga, ga_end);
+
+		n = kvm_dirty_bitmap_bytes(memslot);
+		memset(memslot->dirty_bitmap, 0, n);
+	}
+
+	r = 0;
+out:
+	mutex_unlock(&kvm->slots_lock);
+	return r;
+}
+
+static void kvmppc_core_flush_memslot_pr(struct kvm *kvm,
+					 struct kvm_memory_slot *memslot)
+{
+	return;
+}
+
+static int kvmppc_core_prepare_memory_region_pr(struct kvm *kvm,
+					struct kvm_memory_slot *memslot,
+					struct kvm_userspace_memory_region *mem)
+{
+	return 0;
+}
+
+static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem,
+				const struct kvm_memory_slot *old)
+{
+	return;
+}
+
+static void kvmppc_core_free_memslot_pr(struct kvm_memory_slot *free,
+					struct kvm_memory_slot *dont)
+{
+	return;
+}
+
+static int kvmppc_core_create_memslot_pr(struct kvm_memory_slot *slot,
+					 unsigned long npages)
+{
+	return 0;
+}
+
+
+#ifdef CONFIG_PPC64
+static int kvm_vm_ioctl_get_smmu_info_pr(struct kvm *kvm,
+					 struct kvm_ppc_smmu_info *info)
+{
+	long int i;
+	struct kvm_vcpu *vcpu;
+
+	info->flags = 0;
+
+	/* SLB is always 64 entries */
+	info->slb_size = 64;
+
+	/* Standard 4k base page size segment */
+	info->sps[0].page_shift = 12;
+	info->sps[0].slb_enc = 0;
+	info->sps[0].enc[0].page_shift = 12;
+	info->sps[0].enc[0].pte_enc = 0;
+
+	/*
+	 * 64k large page size.
+	 * We only want to put this in if the CPUs we're emulating
+	 * support it, but unfortunately we don't have a vcpu easily
+	 * to hand here to test.  Just pick the first vcpu, and if
+	 * that doesn't exist yet, report the minimum capability,
+	 * i.e., no 64k pages.
+	 * 1T segment support goes along with 64k pages.
+	 */
+	i = 1;
+	vcpu = kvm_get_vcpu(kvm, 0);
+	if (vcpu && (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE)) {
+		info->flags = KVM_PPC_1T_SEGMENTS;
+		info->sps[i].page_shift = 16;
+		info->sps[i].slb_enc = SLB_VSID_L | SLB_VSID_LP_01;
+		info->sps[i].enc[0].page_shift = 16;
+		info->sps[i].enc[0].pte_enc = 1;
+		++i;
+	}
+
+	/* Standard 16M large page size segment */
+	info->sps[i].page_shift = 24;
+	info->sps[i].slb_enc = SLB_VSID_L;
+	info->sps[i].enc[0].page_shift = 24;
+	info->sps[i].enc[0].pte_enc = 0;
+
+	return 0;
+}
+#else
+static int kvm_vm_ioctl_get_smmu_info_pr(struct kvm *kvm,
+					 struct kvm_ppc_smmu_info *info)
+{
+	/* We should not get called */
+	BUG();
+}
+#endif /* CONFIG_PPC64 */
+
+static unsigned int kvm_global_user_count = 0;
+static DEFINE_SPINLOCK(kvm_global_user_count_lock);
+
+static int kvmppc_core_init_vm_pr(struct kvm *kvm)
+{
+	mutex_init(&kvm->arch.hpt_mutex);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* Start out with the default set of hcalls enabled */
+	kvmppc_pr_init_default_hcalls(kvm);
+#endif
+
+	if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
+		spin_lock(&kvm_global_user_count_lock);
+		if (++kvm_global_user_count == 1)
+			pSeries_disable_reloc_on_exc();
+		spin_unlock(&kvm_global_user_count_lock);
+	}
+	return 0;
+}
+
+static void kvmppc_core_destroy_vm_pr(struct kvm *kvm)
+{
+#ifdef CONFIG_PPC64
+	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
+#endif
+
+	if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
+		spin_lock(&kvm_global_user_count_lock);
+		BUG_ON(kvm_global_user_count == 0);
+		if (--kvm_global_user_count == 0)
+			pSeries_enable_reloc_on_exc();
+		spin_unlock(&kvm_global_user_count_lock);
+	}
+}
+
+static int kvmppc_core_check_processor_compat_pr(void)
+{
+	/* we are always compatible */
+	return 0;
+}
+
+static long kvm_arch_vm_ioctl_pr(struct file *filp,
+				 unsigned int ioctl, unsigned long arg)
+{
+	return -ENOTTY;
+}
+
+static struct kvmppc_ops kvm_ops_pr = {
+	.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_pr,
+	.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_pr,
+	.get_one_reg = kvmppc_get_one_reg_pr,
+	.set_one_reg = kvmppc_set_one_reg_pr,
+	.vcpu_load   = kvmppc_core_vcpu_load_pr,
+	.vcpu_put    = kvmppc_core_vcpu_put_pr,
+	.set_msr     = kvmppc_set_msr_pr,
+	.vcpu_run    = kvmppc_vcpu_run_pr,
+	.vcpu_create = kvmppc_core_vcpu_create_pr,
+	.vcpu_free   = kvmppc_core_vcpu_free_pr,
+	.check_requests = kvmppc_core_check_requests_pr,
+	.get_dirty_log = kvm_vm_ioctl_get_dirty_log_pr,
+	.flush_memslot = kvmppc_core_flush_memslot_pr,
+	.prepare_memory_region = kvmppc_core_prepare_memory_region_pr,
+	.commit_memory_region = kvmppc_core_commit_memory_region_pr,
+	.unmap_hva = kvm_unmap_hva_pr,
+	.unmap_hva_range = kvm_unmap_hva_range_pr,
+	.age_hva  = kvm_age_hva_pr,
+	.test_age_hva = kvm_test_age_hva_pr,
+	.set_spte_hva = kvm_set_spte_hva_pr,
+	.mmu_destroy  = kvmppc_mmu_destroy_pr,
+	.free_memslot = kvmppc_core_free_memslot_pr,
+	.create_memslot = kvmppc_core_create_memslot_pr,
+	.init_vm = kvmppc_core_init_vm_pr,
+	.destroy_vm = kvmppc_core_destroy_vm_pr,
+	.get_smmu_info = kvm_vm_ioctl_get_smmu_info_pr,
+	.emulate_op = kvmppc_core_emulate_op_pr,
+	.emulate_mtspr = kvmppc_core_emulate_mtspr_pr,
+	.emulate_mfspr = kvmppc_core_emulate_mfspr_pr,
+	.fast_vcpu_kick = kvm_vcpu_kick,
+	.arch_vm_ioctl  = kvm_arch_vm_ioctl_pr,
+#ifdef CONFIG_PPC_BOOK3S_64
+	.hcall_implemented = kvmppc_hcall_impl_pr,
+#endif
+};
+
+
+int kvmppc_book3s_init_pr(void)
+{
+	int r;
+
+	r = kvmppc_core_check_processor_compat_pr();
+	if (r < 0)
+		return r;
+
+	kvm_ops_pr.owner = THIS_MODULE;
+	kvmppc_pr_ops = &kvm_ops_pr;
+
+	r = kvmppc_mmu_hpte_sysinit();
+	return r;
+}
+
+void kvmppc_book3s_exit_pr(void)
+{
+	kvmppc_pr_ops = NULL;
+	kvmppc_mmu_hpte_sysexit();
+}
+
+/*
+ * We only support separate modules for book3s 64
+ */
+#ifdef CONFIG_PPC_BOOK3S_64
+
+module_init(kvmppc_book3s_init_pr);
+module_exit(kvmppc_book3s_exit_pr);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_MISCDEV(KVM_MINOR);
+MODULE_ALIAS("devname:kvm");
+#endif
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
new file mode 100644
index 000000000..f2c75a1e0
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -0,0 +1,401 @@
+/*
+ * Copyright (C) 2011. Freescale Inc. All rights reserved.
+ *
+ * Authors:
+ *    Alexander Graf <agraf@suse.de>
+ *    Paul Mackerras <paulus@samba.org>
+ *
+ * Description:
+ *
+ * Hypercall handling for running PAPR guests in PR KVM on Book 3S
+ * processors.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/anon_inodes.h>
+
+#include <asm/uaccess.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+
+#define HPTE_SIZE	16		/* bytes per HPT entry */
+
+static unsigned long get_pteg_addr(struct kvm_vcpu *vcpu, long pte_index)
+{
+	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+	unsigned long pteg_addr;
+
+	pte_index <<= 4;
+	pte_index &= ((1 << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1) << 7 | 0x70;
+	pteg_addr = vcpu_book3s->sdr1 & 0xfffffffffffc0000ULL;
+	pteg_addr |= pte_index;
+
+	return pteg_addr;
+}
+
+static int kvmppc_h_pr_enter(struct kvm_vcpu *vcpu)
+{
+	long flags = kvmppc_get_gpr(vcpu, 4);
+	long pte_index = kvmppc_get_gpr(vcpu, 5);
+	__be64 pteg[2 * 8];
+	__be64 *hpte;
+	unsigned long pteg_addr, i;
+	long int ret;
+
+	i = pte_index & 7;
+	pte_index &= ~7UL;
+	pteg_addr = get_pteg_addr(vcpu, pte_index);
+
+	mutex_lock(&vcpu->kvm->arch.hpt_mutex);
+	copy_from_user(pteg, (void __user *)pteg_addr, sizeof(pteg));
+	hpte = pteg;
+
+	ret = H_PTEG_FULL;
+	if (likely((flags & H_EXACT) == 0)) {
+		for (i = 0; ; ++i) {
+			if (i == 8)
+				goto done;
+			if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0)
+				break;
+			hpte += 2;
+		}
+	} else {
+		hpte += i * 2;
+		if (*hpte & HPTE_V_VALID)
+			goto done;
+	}
+
+	hpte[0] = cpu_to_be64(kvmppc_get_gpr(vcpu, 6));
+	hpte[1] = cpu_to_be64(kvmppc_get_gpr(vcpu, 7));
+	pteg_addr += i * HPTE_SIZE;
+	copy_to_user((void __user *)pteg_addr, hpte, HPTE_SIZE);
+	kvmppc_set_gpr(vcpu, 4, pte_index | i);
+	ret = H_SUCCESS;
+
+ done:
+	mutex_unlock(&vcpu->kvm->arch.hpt_mutex);
+	kvmppc_set_gpr(vcpu, 3, ret);
+
+	return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_remove(struct kvm_vcpu *vcpu)
+{
+	unsigned long flags= kvmppc_get_gpr(vcpu, 4);
+	unsigned long pte_index = kvmppc_get_gpr(vcpu, 5);
+	unsigned long avpn = kvmppc_get_gpr(vcpu, 6);
+	unsigned long v = 0, pteg, rb;
+	unsigned long pte[2];
+	long int ret;
+
+	pteg = get_pteg_addr(vcpu, pte_index);
+	mutex_lock(&vcpu->kvm->arch.hpt_mutex);
+	copy_from_user(pte, (void __user *)pteg, sizeof(pte));
+	pte[0] = be64_to_cpu((__force __be64)pte[0]);
+	pte[1] = be64_to_cpu((__force __be64)pte[1]);
+
+	ret = H_NOT_FOUND;
+	if ((pte[0] & HPTE_V_VALID) == 0 ||
+	    ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn) ||
+	    ((flags & H_ANDCOND) && (pte[0] & avpn) != 0))
+		goto done;
+
+	copy_to_user((void __user *)pteg, &v, sizeof(v));
+
+	rb = compute_tlbie_rb(pte[0], pte[1], pte_index);
+	vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false);
+
+	ret = H_SUCCESS;
+	kvmppc_set_gpr(vcpu, 4, pte[0]);
+	kvmppc_set_gpr(vcpu, 5, pte[1]);
+
+ done:
+	mutex_unlock(&vcpu->kvm->arch.hpt_mutex);
+	kvmppc_set_gpr(vcpu, 3, ret);
+
+	return EMULATE_DONE;
+}
+
+/* Request defs for kvmppc_h_pr_bulk_remove() */
+#define H_BULK_REMOVE_TYPE             0xc000000000000000ULL
+#define   H_BULK_REMOVE_REQUEST        0x4000000000000000ULL
+#define   H_BULK_REMOVE_RESPONSE       0x8000000000000000ULL
+#define   H_BULK_REMOVE_END            0xc000000000000000ULL
+#define H_BULK_REMOVE_CODE             0x3000000000000000ULL
+#define   H_BULK_REMOVE_SUCCESS        0x0000000000000000ULL
+#define   H_BULK_REMOVE_NOT_FOUND      0x1000000000000000ULL
+#define   H_BULK_REMOVE_PARM           0x2000000000000000ULL
+#define   H_BULK_REMOVE_HW             0x3000000000000000ULL
+#define H_BULK_REMOVE_RC               0x0c00000000000000ULL
+#define H_BULK_REMOVE_FLAGS            0x0300000000000000ULL
+#define   H_BULK_REMOVE_ABSOLUTE       0x0000000000000000ULL
+#define   H_BULK_REMOVE_ANDCOND        0x0100000000000000ULL
+#define   H_BULK_REMOVE_AVPN           0x0200000000000000ULL
+#define H_BULK_REMOVE_PTEX             0x00ffffffffffffffULL
+#define H_BULK_REMOVE_MAX_BATCH        4
+
+static int kvmppc_h_pr_bulk_remove(struct kvm_vcpu *vcpu)
+{
+	int i;
+	int paramnr = 4;
+	int ret = H_SUCCESS;
+
+	mutex_lock(&vcpu->kvm->arch.hpt_mutex);
+	for (i = 0; i < H_BULK_REMOVE_MAX_BATCH; i++) {
+		unsigned long tsh = kvmppc_get_gpr(vcpu, paramnr+(2*i));
+		unsigned long tsl = kvmppc_get_gpr(vcpu, paramnr+(2*i)+1);
+		unsigned long pteg, rb, flags;
+		unsigned long pte[2];
+		unsigned long v = 0;
+
+		if ((tsh & H_BULK_REMOVE_TYPE) == H_BULK_REMOVE_END) {
+			break; /* Exit success */
+		} else if ((tsh & H_BULK_REMOVE_TYPE) !=
+			   H_BULK_REMOVE_REQUEST) {
+			ret = H_PARAMETER;
+			break; /* Exit fail */
+		}
+
+		tsh &= H_BULK_REMOVE_PTEX | H_BULK_REMOVE_FLAGS;
+		tsh |= H_BULK_REMOVE_RESPONSE;
+
+		if ((tsh & H_BULK_REMOVE_ANDCOND) &&
+		    (tsh & H_BULK_REMOVE_AVPN)) {
+			tsh |= H_BULK_REMOVE_PARM;
+			kvmppc_set_gpr(vcpu, paramnr+(2*i), tsh);
+			ret = H_PARAMETER;
+			break; /* Exit fail */
+		}
+
+		pteg = get_pteg_addr(vcpu, tsh & H_BULK_REMOVE_PTEX);
+		copy_from_user(pte, (void __user *)pteg, sizeof(pte));
+		pte[0] = be64_to_cpu((__force __be64)pte[0]);
+		pte[1] = be64_to_cpu((__force __be64)pte[1]);
+
+		/* tsl = AVPN */
+		flags = (tsh & H_BULK_REMOVE_FLAGS) >> 26;
+
+		if ((pte[0] & HPTE_V_VALID) == 0 ||
+		    ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != tsl) ||
+		    ((flags & H_ANDCOND) && (pte[0] & tsl) != 0)) {
+			tsh |= H_BULK_REMOVE_NOT_FOUND;
+		} else {
+			/* Splat the pteg in (userland) hpt */
+			copy_to_user((void __user *)pteg, &v, sizeof(v));
+
+			rb = compute_tlbie_rb(pte[0], pte[1],
+					      tsh & H_BULK_REMOVE_PTEX);
+			vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false);
+			tsh |= H_BULK_REMOVE_SUCCESS;
+			tsh |= (pte[1] & (HPTE_R_C | HPTE_R_R)) << 43;
+		}
+		kvmppc_set_gpr(vcpu, paramnr+(2*i), tsh);
+	}
+	mutex_unlock(&vcpu->kvm->arch.hpt_mutex);
+	kvmppc_set_gpr(vcpu, 3, ret);
+
+	return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu)
+{
+	unsigned long flags = kvmppc_get_gpr(vcpu, 4);
+	unsigned long pte_index = kvmppc_get_gpr(vcpu, 5);
+	unsigned long avpn = kvmppc_get_gpr(vcpu, 6);
+	unsigned long rb, pteg, r, v;
+	unsigned long pte[2];
+	long int ret;
+
+	pteg = get_pteg_addr(vcpu, pte_index);
+	mutex_lock(&vcpu->kvm->arch.hpt_mutex);
+	copy_from_user(pte, (void __user *)pteg, sizeof(pte));
+	pte[0] = be64_to_cpu((__force __be64)pte[0]);
+	pte[1] = be64_to_cpu((__force __be64)pte[1]);
+
+	ret = H_NOT_FOUND;
+	if ((pte[0] & HPTE_V_VALID) == 0 ||
+	    ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn))
+		goto done;
+
+	v = pte[0];
+	r = pte[1];
+	r &= ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_HI |
+	       HPTE_R_KEY_LO);
+	r |= (flags << 55) & HPTE_R_PP0;
+	r |= (flags << 48) & HPTE_R_KEY_HI;
+	r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
+
+	pte[1] = r;
+
+	rb = compute_tlbie_rb(v, r, pte_index);
+	vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false);
+	pte[0] = (__force u64)cpu_to_be64(pte[0]);
+	pte[1] = (__force u64)cpu_to_be64(pte[1]);
+	copy_to_user((void __user *)pteg, pte, sizeof(pte));
+	ret = H_SUCCESS;
+
+ done:
+	mutex_unlock(&vcpu->kvm->arch.hpt_mutex);
+	kvmppc_set_gpr(vcpu, 3, ret);
+
+	return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
+{
+	unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
+	unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
+	unsigned long tce = kvmppc_get_gpr(vcpu, 6);
+	long rc;
+
+	rc = kvmppc_h_put_tce(vcpu, liobn, ioba, tce);
+	if (rc == H_TOO_HARD)
+		return EMULATE_FAIL;
+	kvmppc_set_gpr(vcpu, 3, rc);
+	return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_logical_ci_load(struct kvm_vcpu *vcpu)
+{
+	long rc;
+
+	rc = kvmppc_h_logical_ci_load(vcpu);
+	if (rc == H_TOO_HARD)
+		return EMULATE_FAIL;
+	kvmppc_set_gpr(vcpu, 3, rc);
+	return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_logical_ci_store(struct kvm_vcpu *vcpu)
+{
+	long rc;
+
+	rc = kvmppc_h_logical_ci_store(vcpu);
+	if (rc == H_TOO_HARD)
+		return EMULATE_FAIL;
+	kvmppc_set_gpr(vcpu, 3, rc);
+	return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
+{
+	long rc = kvmppc_xics_hcall(vcpu, cmd);
+	kvmppc_set_gpr(vcpu, 3, rc);
+	return EMULATE_DONE;
+}
+
+int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
+{
+	int rc, idx;
+
+	if (cmd <= MAX_HCALL_OPCODE &&
+	    !test_bit(cmd/4, vcpu->kvm->arch.enabled_hcalls))
+		return EMULATE_FAIL;
+
+	switch (cmd) {
+	case H_ENTER:
+		return kvmppc_h_pr_enter(vcpu);
+	case H_REMOVE:
+		return kvmppc_h_pr_remove(vcpu);
+	case H_PROTECT:
+		return kvmppc_h_pr_protect(vcpu);
+	case H_BULK_REMOVE:
+		return kvmppc_h_pr_bulk_remove(vcpu);
+	case H_PUT_TCE:
+		return kvmppc_h_pr_put_tce(vcpu);
+	case H_CEDE:
+		kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE);
+		kvm_vcpu_block(vcpu);
+		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+		vcpu->stat.halt_wakeup++;
+		return EMULATE_DONE;
+	case H_LOGICAL_CI_LOAD:
+		return kvmppc_h_pr_logical_ci_load(vcpu);
+	case H_LOGICAL_CI_STORE:
+		return kvmppc_h_pr_logical_ci_store(vcpu);
+	case H_XIRR:
+	case H_CPPR:
+	case H_EOI:
+	case H_IPI:
+	case H_IPOLL:
+	case H_XIRR_X:
+		if (kvmppc_xics_enabled(vcpu))
+			return kvmppc_h_pr_xics_hcall(vcpu, cmd);
+		break;
+	case H_RTAS:
+		if (list_empty(&vcpu->kvm->arch.rtas_tokens))
+			break;
+		idx = srcu_read_lock(&vcpu->kvm->srcu);
+		rc = kvmppc_rtas_hcall(vcpu);
+		srcu_read_unlock(&vcpu->kvm->srcu, idx);
+		if (rc)
+			break;
+		kvmppc_set_gpr(vcpu, 3, 0);
+		return EMULATE_DONE;
+	}
+
+	return EMULATE_FAIL;
+}
+
+int kvmppc_hcall_impl_pr(unsigned long cmd)
+{
+	switch (cmd) {
+	case H_ENTER:
+	case H_REMOVE:
+	case H_PROTECT:
+	case H_BULK_REMOVE:
+	case H_PUT_TCE:
+	case H_CEDE:
+	case H_LOGICAL_CI_LOAD:
+	case H_LOGICAL_CI_STORE:
+#ifdef CONFIG_KVM_XICS
+	case H_XIRR:
+	case H_CPPR:
+	case H_EOI:
+	case H_IPI:
+	case H_IPOLL:
+	case H_XIRR_X:
+#endif
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * List of hcall numbers to enable by default.
+ * For compatibility with old userspace, we enable by default
+ * all hcalls that were implemented before the hcall-enabling
+ * facility was added.  Note this list should not include H_RTAS.
+ */
+static unsigned int default_hcall_list[] = {
+	H_ENTER,
+	H_REMOVE,
+	H_PROTECT,
+	H_BULK_REMOVE,
+	H_PUT_TCE,
+	H_CEDE,
+#ifdef CONFIG_KVM_XICS
+	H_XIRR,
+	H_CPPR,
+	H_EOI,
+	H_IPI,
+	H_IPOLL,
+	H_XIRR_X,
+#endif
+	0
+};
+
+void kvmppc_pr_init_default_hcalls(struct kvm *kvm)
+{
+	int i;
+	unsigned int hcall;
+
+	for (i = 0; default_hcall_list[i]; ++i) {
+		hcall = default_hcall_list[i];
+		WARN_ON(!kvmppc_hcall_impl_pr(hcall));
+		__set_bit(hcall / 4, kvm->arch.enabled_hcalls);
+	}
+}
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
new file mode 100644
index 000000000..16c4d88ba
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -0,0 +1,169 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/mmu.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+
+#ifdef CONFIG_PPC_BOOK3S_64
+#include <asm/exception-64s.h>
+#endif
+
+/*****************************************************************************
+ *                                                                           *
+ *        Real Mode handlers that need to be in low physical memory          *
+ *                                                                           *
+ ****************************************************************************/
+
+#if defined(CONFIG_PPC_BOOK3S_64)
+
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+#define FUNC(name) 		name
+#else
+#define FUNC(name) 		GLUE(.,name)
+#endif
+
+#elif defined(CONFIG_PPC_BOOK3S_32)
+
+#define FUNC(name)		name
+
+.macro INTERRUPT_TRAMPOLINE intno
+
+.global kvmppc_trampoline_\intno
+kvmppc_trampoline_\intno:
+
+	mtspr	SPRN_SPRG_SCRATCH0, r13		/* Save r13 */
+
+	/*
+	 * First thing to do is to find out if we're coming
+	 * from a KVM guest or a Linux process.
+	 *
+	 * To distinguish, we check a magic byte in the PACA/current
+	 */
+	mfspr	r13, SPRN_SPRG_THREAD
+	lwz	r13, THREAD_KVM_SVCPU(r13)
+	/* PPC32 can have a NULL pointer - let's check for that */
+	mtspr   SPRN_SPRG_SCRATCH1, r12		/* Save r12 */
+	mfcr	r12
+	cmpwi	r13, 0
+	bne	1f
+2:	mtcr	r12
+	mfspr	r12, SPRN_SPRG_SCRATCH1
+	mfspr	r13, SPRN_SPRG_SCRATCH0		/* r13 = original r13 */
+	b	kvmppc_resume_\intno		/* Get back original handler */
+
+1:	tophys(r13, r13)
+	stw	r12, HSTATE_SCRATCH1(r13)
+	mfspr	r12, SPRN_SPRG_SCRATCH1
+	stw	r12, HSTATE_SCRATCH0(r13)
+	lbz	r12, HSTATE_IN_GUEST(r13)
+	cmpwi	r12, KVM_GUEST_MODE_NONE
+	bne	..kvmppc_handler_hasmagic_\intno
+	/* No KVM guest? Then jump back to the Linux handler! */
+	lwz	r12, HSTATE_SCRATCH1(r13)
+	b	2b
+
+	/* Now we know we're handling a KVM guest */
+..kvmppc_handler_hasmagic_\intno:
+
+	/* Should we just skip the faulting instruction? */
+	cmpwi	r12, KVM_GUEST_MODE_SKIP
+	beq	kvmppc_handler_skip_ins
+
+	/* Let's store which interrupt we're handling */
+	li	r12, \intno
+
+	/* Jump into the SLB exit code that goes to the highmem handler */
+	b	kvmppc_handler_trampoline_exit
+
+.endm
+
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_SYSTEM_RESET
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_MACHINE_CHECK
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_DATA_STORAGE
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_INST_STORAGE
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_EXTERNAL
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_ALIGNMENT
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_PROGRAM
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_FP_UNAVAIL
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_DECREMENTER
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_SYSCALL
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_TRACE
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_PERFMON
+INTERRUPT_TRAMPOLINE	BOOK3S_INTERRUPT_ALTIVEC
+
+/*
+ * Bring us back to the faulting code, but skip the
+ * faulting instruction.
+ *
+ * This is a generic exit path from the interrupt
+ * trampolines above.
+ *
+ * Input Registers:
+ *
+ * R12            = free
+ * R13            = Shadow VCPU (PACA)
+ * HSTATE.SCRATCH0 = guest R12
+ * HSTATE.SCRATCH1 = guest CR
+ * SPRG_SCRATCH0  = guest R13
+ *
+ */
+kvmppc_handler_skip_ins:
+
+	/* Patch the IP to the next instruction */
+	mfsrr0	r12
+	addi	r12, r12, 4
+	mtsrr0	r12
+
+	/* Clean up all state */
+	lwz	r12, HSTATE_SCRATCH1(r13)
+	mtcr	r12
+	PPC_LL	r12, HSTATE_SCRATCH0(r13)
+	GET_SCRATCH0(r13)
+
+	/* And get back into the code */
+	RFI
+#endif
+
+/*
+ * Call kvmppc_handler_trampoline_enter in real mode
+ *
+ * On entry, r4 contains the guest shadow MSR
+ * MSR.EE has to be 0 when calling this function
+ */
+_GLOBAL_TOC(kvmppc_entry_trampoline)
+	mfmsr	r5
+	LOAD_REG_ADDR(r7, kvmppc_handler_trampoline_enter)
+	toreal(r7)
+
+	li	r6, MSR_IR | MSR_DR
+	andc	r6, r5, r6	/* Clear DR and IR in MSR value */
+	/*
+	 * Set EE in HOST_MSR so that it's enabled when we get into our
+	 * C exit handler function.
+	 */
+	ori	r5, r5, MSR_EE
+	mtsrr0	r7
+	mtsrr1	r6
+	RFI
+
+#include "book3s_segment.S"
diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c
new file mode 100644
index 000000000..ef27fbd5d
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_rtas.c
@@ -0,0 +1,278 @@
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/err.h>
+
+#include <asm/uaccess.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/rtas.h>
+
+#ifdef CONFIG_KVM_XICS
+static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+	u32 irq, server, priority;
+	int rc;
+
+	if (be32_to_cpu(args->nargs) != 3 || be32_to_cpu(args->nret) != 1) {
+		rc = -3;
+		goto out;
+	}
+
+	irq = be32_to_cpu(args->args[0]);
+	server = be32_to_cpu(args->args[1]);
+	priority = be32_to_cpu(args->args[2]);
+
+	rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority);
+	if (rc)
+		rc = -3;
+out:
+	args->rets[0] = cpu_to_be32(rc);
+}
+
+static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+	u32 irq, server, priority;
+	int rc;
+
+	if (be32_to_cpu(args->nargs) != 1 || be32_to_cpu(args->nret) != 3) {
+		rc = -3;
+		goto out;
+	}
+
+	irq = be32_to_cpu(args->args[0]);
+
+	server = priority = 0;
+	rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority);
+	if (rc) {
+		rc = -3;
+		goto out;
+	}
+
+	args->rets[1] = cpu_to_be32(server);
+	args->rets[2] = cpu_to_be32(priority);
+out:
+	args->rets[0] = cpu_to_be32(rc);
+}
+
+static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+	u32 irq;
+	int rc;
+
+	if (be32_to_cpu(args->nargs) != 1 || be32_to_cpu(args->nret) != 1) {
+		rc = -3;
+		goto out;
+	}
+
+	irq = be32_to_cpu(args->args[0]);
+
+	rc = kvmppc_xics_int_off(vcpu->kvm, irq);
+	if (rc)
+		rc = -3;
+out:
+	args->rets[0] = cpu_to_be32(rc);
+}
+
+static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+	u32 irq;
+	int rc;
+
+	if (be32_to_cpu(args->nargs) != 1 || be32_to_cpu(args->nret) != 1) {
+		rc = -3;
+		goto out;
+	}
+
+	irq = be32_to_cpu(args->args[0]);
+
+	rc = kvmppc_xics_int_on(vcpu->kvm, irq);
+	if (rc)
+		rc = -3;
+out:
+	args->rets[0] = cpu_to_be32(rc);
+}
+#endif /* CONFIG_KVM_XICS */
+
+struct rtas_handler {
+	void (*handler)(struct kvm_vcpu *vcpu, struct rtas_args *args);
+	char *name;
+};
+
+static struct rtas_handler rtas_handlers[] = {
+#ifdef CONFIG_KVM_XICS
+	{ .name = "ibm,set-xive", .handler = kvm_rtas_set_xive },
+	{ .name = "ibm,get-xive", .handler = kvm_rtas_get_xive },
+	{ .name = "ibm,int-off",  .handler = kvm_rtas_int_off },
+	{ .name = "ibm,int-on",   .handler = kvm_rtas_int_on },
+#endif
+};
+
+struct rtas_token_definition {
+	struct list_head list;
+	struct rtas_handler *handler;
+	u64 token;
+};
+
+static int rtas_name_matches(char *s1, char *s2)
+{
+	struct kvm_rtas_token_args args;
+	return !strncmp(s1, s2, sizeof(args.name));
+}
+
+static int rtas_token_undefine(struct kvm *kvm, char *name)
+{
+	struct rtas_token_definition *d, *tmp;
+
+	lockdep_assert_held(&kvm->lock);
+
+	list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) {
+		if (rtas_name_matches(d->handler->name, name)) {
+			list_del(&d->list);
+			kfree(d);
+			return 0;
+		}
+	}
+
+	/* It's not an error to undefine an undefined token */
+	return 0;
+}
+
+static int rtas_token_define(struct kvm *kvm, char *name, u64 token)
+{
+	struct rtas_token_definition *d;
+	struct rtas_handler *h = NULL;
+	bool found;
+	int i;
+
+	lockdep_assert_held(&kvm->lock);
+
+	list_for_each_entry(d, &kvm->arch.rtas_tokens, list) {
+		if (d->token == token)
+			return -EEXIST;
+	}
+
+	found = false;
+	for (i = 0; i < ARRAY_SIZE(rtas_handlers); i++) {
+		h = &rtas_handlers[i];
+		if (rtas_name_matches(h->name, name)) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found)
+		return -ENOENT;
+
+	d = kzalloc(sizeof(*d), GFP_KERNEL);
+	if (!d)
+		return -ENOMEM;
+
+	d->handler = h;
+	d->token = token;
+
+	list_add_tail(&d->list, &kvm->arch.rtas_tokens);
+
+	return 0;
+}
+
+int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp)
+{
+	struct kvm_rtas_token_args args;
+	int rc;
+
+	if (copy_from_user(&args, argp, sizeof(args)))
+		return -EFAULT;
+
+	mutex_lock(&kvm->lock);
+
+	if (args.token)
+		rc = rtas_token_define(kvm, args.name, args.token);
+	else
+		rc = rtas_token_undefine(kvm, args.name);
+
+	mutex_unlock(&kvm->lock);
+
+	return rc;
+}
+
+int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu)
+{
+	struct rtas_token_definition *d;
+	struct rtas_args args;
+	rtas_arg_t *orig_rets;
+	gpa_t args_phys;
+	int rc;
+
+	/*
+	 * r4 contains the guest physical address of the RTAS args
+	 * Mask off the top 4 bits since this is a guest real address
+	 */
+	args_phys = kvmppc_get_gpr(vcpu, 4) & KVM_PAM;
+
+	rc = kvm_read_guest(vcpu->kvm, args_phys, &args, sizeof(args));
+	if (rc)
+		goto fail;
+
+	/*
+	 * args->rets is a pointer into args->args. Now that we've
+	 * copied args we need to fix it up to point into our copy,
+	 * not the guest args. We also need to save the original
+	 * value so we can restore it on the way out.
+	 */
+	orig_rets = args.rets;
+	args.rets = &args.args[be32_to_cpu(args.nargs)];
+
+	mutex_lock(&vcpu->kvm->lock);
+
+	rc = -ENOENT;
+	list_for_each_entry(d, &vcpu->kvm->arch.rtas_tokens, list) {
+		if (d->token == be32_to_cpu(args.token)) {
+			d->handler->handler(vcpu, &args);
+			rc = 0;
+			break;
+		}
+	}
+
+	mutex_unlock(&vcpu->kvm->lock);
+
+	if (rc == 0) {
+		args.rets = orig_rets;
+		rc = kvm_write_guest(vcpu->kvm, args_phys, &args, sizeof(args));
+		if (rc)
+			goto fail;
+	}
+
+	return rc;
+
+fail:
+	/*
+	 * We only get here if the guest has called RTAS with a bogus
+	 * args pointer. That means we can't get to the args, and so we
+	 * can't fail the RTAS call. So fail right out to userspace,
+	 * which should kill the guest.
+	 */
+	return rc;
+}
+EXPORT_SYMBOL_GPL(kvmppc_rtas_hcall);
+
+void kvmppc_rtas_tokens_free(struct kvm *kvm)
+{
+	struct rtas_token_definition *d, *tmp;
+
+	lockdep_assert_held(&kvm->lock);
+
+	list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) {
+		list_del(&d->list);
+		kfree(d);
+	}
+}
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
new file mode 100644
index 000000000..acee37cde
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -0,0 +1,393 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2010
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+/* Real mode helpers */
+
+#if defined(CONFIG_PPC_BOOK3S_64)
+
+#define GET_SHADOW_VCPU(reg)    \
+	mr	reg, r13
+
+#elif defined(CONFIG_PPC_BOOK3S_32)
+
+#define GET_SHADOW_VCPU(reg)    			\
+	tophys(reg, r2);       			\
+	lwz     reg, (THREAD + THREAD_KVM_SVCPU)(reg);	\
+	tophys(reg, reg)
+
+#endif
+
+/* Disable for nested KVM */
+#define USE_QUICK_LAST_INST
+
+
+/* Get helper functions for subarch specific functionality */
+
+#if defined(CONFIG_PPC_BOOK3S_64)
+#include "book3s_64_slb.S"
+#elif defined(CONFIG_PPC_BOOK3S_32)
+#include "book3s_32_sr.S"
+#endif
+
+/******************************************************************************
+ *                                                                            *
+ *                               Entry code                                   *
+ *                                                                            *
+ *****************************************************************************/
+
+.global kvmppc_handler_trampoline_enter
+kvmppc_handler_trampoline_enter:
+
+	/* Required state:
+	 *
+	 * MSR = ~IR|DR
+	 * R1 = host R1
+	 * R2 = host R2
+	 * R4 = guest shadow MSR
+	 * R5 = normal host MSR
+	 * R6 = current host MSR (EE, IR, DR off)
+	 * LR = highmem guest exit code
+	 * all other volatile GPRS = free
+	 * SVCPU[CR] = guest CR
+	 * SVCPU[XER] = guest XER
+	 * SVCPU[CTR] = guest CTR
+	 * SVCPU[LR] = guest LR
+	 */
+
+	/* r3 = shadow vcpu */
+	GET_SHADOW_VCPU(r3)
+
+	/* Save guest exit handler address and MSR */
+	mflr	r0
+	PPC_STL	r0, HSTATE_VMHANDLER(r3)
+	PPC_STL	r5, HSTATE_HOST_MSR(r3)
+
+	/* Save R1/R2 in the PACA (64-bit) or shadow_vcpu (32-bit) */
+	PPC_STL	r1, HSTATE_HOST_R1(r3)
+	PPC_STL	r2, HSTATE_HOST_R2(r3)
+
+	/* Activate guest mode, so faults get handled by KVM */
+	li	r11, KVM_GUEST_MODE_GUEST
+	stb	r11, HSTATE_IN_GUEST(r3)
+
+	/* Switch to guest segment. This is subarch specific. */
+	LOAD_GUEST_SEGMENTS
+
+#ifdef CONFIG_PPC_BOOK3S_64
+BEGIN_FTR_SECTION
+	/* Save host FSCR */
+	mfspr	r8, SPRN_FSCR
+	std	r8, HSTATE_HOST_FSCR(r13)
+	/* Set FSCR during guest execution */
+	ld	r9, SVCPU_SHADOW_FSCR(r13)
+	mtspr	SPRN_FSCR, r9
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+	/* Some guests may need to have dcbz set to 32 byte length.
+	 *
+	 * Usually we ensure that by patching the guest's instructions
+	 * to trap on dcbz and emulate it in the hypervisor.
+	 *
+	 * If we can, we should tell the CPU to use 32 byte dcbz though,
+	 * because that's a lot faster.
+	 */
+	lbz	r0, HSTATE_RESTORE_HID5(r3)
+	cmpwi	r0, 0
+	beq	no_dcbz32_on
+
+	mfspr   r0,SPRN_HID5
+	ori     r0, r0, 0x80		/* XXX HID5_dcbz32 = 0x80 */
+	mtspr   SPRN_HID5,r0
+no_dcbz32_on:
+
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+	/* Enter guest */
+
+	PPC_LL	r8, SVCPU_CTR(r3)
+	PPC_LL	r9, SVCPU_LR(r3)
+	lwz	r10, SVCPU_CR(r3)
+	lwz	r11, SVCPU_XER(r3)
+
+	mtctr	r8
+	mtlr	r9
+	mtcr	r10
+	mtxer	r11
+
+	/* Move SRR0 and SRR1 into the respective regs */
+	PPC_LL  r9, SVCPU_PC(r3)
+	/* First clear RI in our current MSR value */
+	li	r0, MSR_RI
+	andc	r6, r6, r0
+
+	PPC_LL	r0, SVCPU_R0(r3)
+	PPC_LL	r1, SVCPU_R1(r3)
+	PPC_LL	r2, SVCPU_R2(r3)
+	PPC_LL	r5, SVCPU_R5(r3)
+	PPC_LL	r7, SVCPU_R7(r3)
+	PPC_LL	r8, SVCPU_R8(r3)
+	PPC_LL	r10, SVCPU_R10(r3)
+	PPC_LL	r11, SVCPU_R11(r3)
+	PPC_LL	r12, SVCPU_R12(r3)
+	PPC_LL	r13, SVCPU_R13(r3)
+
+	MTMSR_EERI(r6)
+	mtsrr0	r9
+	mtsrr1	r4
+
+	PPC_LL	r4, SVCPU_R4(r3)
+	PPC_LL	r6, SVCPU_R6(r3)
+	PPC_LL	r9, SVCPU_R9(r3)
+	PPC_LL	r3, (SVCPU_R3)(r3)
+
+	RFI
+kvmppc_handler_trampoline_enter_end:
+
+
+
+/******************************************************************************
+ *                                                                            *
+ *                               Exit code                                    *
+ *                                                                            *
+ *****************************************************************************/
+
+.global kvmppc_handler_trampoline_exit
+kvmppc_handler_trampoline_exit:
+
+.global kvmppc_interrupt_pr
+kvmppc_interrupt_pr:
+
+	/* Register usage at this point:
+	 *
+	 * SPRG_SCRATCH0  = guest R13
+	 * R12            = exit handler id
+	 * R13            = shadow vcpu (32-bit) or PACA (64-bit)
+	 * HSTATE.SCRATCH0 = guest R12
+	 * HSTATE.SCRATCH1 = guest CR
+	 *
+	 */
+
+	/* Save registers */
+
+	PPC_STL	r0, SVCPU_R0(r13)
+	PPC_STL	r1, SVCPU_R1(r13)
+	PPC_STL	r2, SVCPU_R2(r13)
+	PPC_STL	r3, SVCPU_R3(r13)
+	PPC_STL	r4, SVCPU_R4(r13)
+	PPC_STL	r5, SVCPU_R5(r13)
+	PPC_STL	r6, SVCPU_R6(r13)
+	PPC_STL	r7, SVCPU_R7(r13)
+	PPC_STL	r8, SVCPU_R8(r13)
+	PPC_STL	r9, SVCPU_R9(r13)
+	PPC_STL	r10, SVCPU_R10(r13)
+	PPC_STL	r11, SVCPU_R11(r13)
+
+	/* Restore R1/R2 so we can handle faults */
+	PPC_LL	r1, HSTATE_HOST_R1(r13)
+	PPC_LL	r2, HSTATE_HOST_R2(r13)
+
+	/* Save guest PC and MSR */
+#ifdef CONFIG_PPC64
+BEGIN_FTR_SECTION
+	andi.	r0, r12, 0x2
+	cmpwi	cr1, r0, 0
+	beq	1f
+	mfspr	r3,SPRN_HSRR0
+	mfspr	r4,SPRN_HSRR1
+	andi.	r12,r12,0x3ffd
+	b	2f
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
+#endif
+1:	mfsrr0	r3
+	mfsrr1	r4
+2:
+	PPC_STL	r3, SVCPU_PC(r13)
+	PPC_STL	r4, SVCPU_SHADOW_SRR1(r13)
+
+	/* Get scratch'ed off registers */
+	GET_SCRATCH0(r9)
+	PPC_LL	r8, HSTATE_SCRATCH0(r13)
+	lwz	r7, HSTATE_SCRATCH1(r13)
+
+	PPC_STL	r9, SVCPU_R13(r13)
+	PPC_STL	r8, SVCPU_R12(r13)
+	stw	r7, SVCPU_CR(r13)
+
+	/* Save more register state  */
+
+	mfxer	r5
+	mfdar	r6
+	mfdsisr	r7
+	mfctr	r8
+	mflr	r9
+
+	stw	r5, SVCPU_XER(r13)
+	PPC_STL	r6, SVCPU_FAULT_DAR(r13)
+	stw	r7, SVCPU_FAULT_DSISR(r13)
+	PPC_STL	r8, SVCPU_CTR(r13)
+	PPC_STL	r9, SVCPU_LR(r13)
+
+	/*
+	 * In order for us to easily get the last instruction,
+	 * we got the #vmexit at, we exploit the fact that the
+	 * virtual layout is still the same here, so we can just
+	 * ld from the guest's PC address
+	 */
+
+	/* We only load the last instruction when it's safe */
+	cmpwi	r12, BOOK3S_INTERRUPT_DATA_STORAGE
+	beq	ld_last_inst
+	cmpwi	r12, BOOK3S_INTERRUPT_PROGRAM
+	beq	ld_last_inst
+	cmpwi	r12, BOOK3S_INTERRUPT_SYSCALL
+	beq	ld_last_prev_inst
+	cmpwi	r12, BOOK3S_INTERRUPT_ALIGNMENT
+	beq-	ld_last_inst
+#ifdef CONFIG_PPC64
+BEGIN_FTR_SECTION
+	cmpwi	r12, BOOK3S_INTERRUPT_H_EMUL_ASSIST
+	beq-	ld_last_inst
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
+BEGIN_FTR_SECTION
+	cmpwi	r12, BOOK3S_INTERRUPT_FAC_UNAVAIL
+	beq-	ld_last_inst
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+#endif
+
+	b	no_ld_last_inst
+
+ld_last_prev_inst:
+	addi	r3, r3, -4
+
+ld_last_inst:
+	/* Save off the guest instruction we're at */
+
+	/* In case lwz faults */
+	li	r0, KVM_INST_FETCH_FAILED
+
+#ifdef USE_QUICK_LAST_INST
+
+	/* Set guest mode to 'jump over instruction' so if lwz faults
+	 * we'll just continue at the next IP. */
+	li	r9, KVM_GUEST_MODE_SKIP
+	stb	r9, HSTATE_IN_GUEST(r13)
+
+	/*    1) enable paging for data */
+	mfmsr	r9
+	ori	r11, r9, MSR_DR			/* Enable paging for data */
+	mtmsr	r11
+	sync
+	/*    2) fetch the instruction */
+	lwz	r0, 0(r3)
+	/*    3) disable paging again */
+	mtmsr	r9
+	sync
+
+#endif
+	stw	r0, SVCPU_LAST_INST(r13)
+
+no_ld_last_inst:
+
+	/* Unset guest mode */
+	li	r9, KVM_GUEST_MODE_NONE
+	stb	r9, HSTATE_IN_GUEST(r13)
+
+	/* Switch back to host MMU */
+	LOAD_HOST_SEGMENTS
+
+#ifdef CONFIG_PPC_BOOK3S_64
+
+	lbz	r5, HSTATE_RESTORE_HID5(r13)
+	cmpwi	r5, 0
+	beq	no_dcbz32_off
+
+	li	r4, 0
+	mfspr   r5,SPRN_HID5
+	rldimi  r5,r4,6,56
+	mtspr   SPRN_HID5,r5
+
+no_dcbz32_off:
+
+BEGIN_FTR_SECTION
+	/* Save guest FSCR on a FAC_UNAVAIL interrupt */
+	cmpwi	r12, BOOK3S_INTERRUPT_FAC_UNAVAIL
+	bne+	no_fscr_save
+	mfspr	r7, SPRN_FSCR
+	std	r7, SVCPU_SHADOW_FSCR(r13)
+no_fscr_save:
+	/* Restore host FSCR */
+	ld	r8, HSTATE_HOST_FSCR(r13)
+	mtspr	SPRN_FSCR, r8
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+	/*
+	 * For some interrupts, we need to call the real Linux
+	 * handler, so it can do work for us. This has to happen
+	 * as if the interrupt arrived from the kernel though,
+	 * so let's fake it here where most state is restored.
+	 *
+	 * Having set up SRR0/1 with the address where we want
+	 * to continue with relocation on (potentially in module
+	 * space), we either just go straight there with rfi[d],
+	 * or we jump to an interrupt handler if there is an
+	 * interrupt to be handled first.  In the latter case,
+	 * the rfi[d] at the end of the interrupt handler will
+	 * get us back to where we want to continue.
+	 */
+
+	/* Register usage at this point:
+	 *
+	 * R1       = host R1
+	 * R2       = host R2
+	 * R10      = raw exit handler id
+	 * R12      = exit handler id
+	 * R13      = shadow vcpu (32-bit) or PACA (64-bit)
+	 * SVCPU.*  = guest *
+	 *
+	 */
+
+	PPC_LL	r6, HSTATE_HOST_MSR(r13)
+	PPC_LL	r8, HSTATE_VMHANDLER(r13)
+
+#ifdef CONFIG_PPC64
+BEGIN_FTR_SECTION
+	beq	cr1, 1f
+	mtspr	SPRN_HSRR1, r6
+	mtspr	SPRN_HSRR0, r8
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
+#endif
+1:	/* Restore host msr -> SRR1 */
+	mtsrr1	r6
+	/* Load highmem handler address */
+	mtsrr0	r8
+
+	/* RFI into the highmem handler, or jump to interrupt handler */
+	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
+	beqa	BOOK3S_INTERRUPT_EXTERNAL
+	cmpwi	r12, BOOK3S_INTERRUPT_DECREMENTER
+	beqa	BOOK3S_INTERRUPT_DECREMENTER
+	cmpwi	r12, BOOK3S_INTERRUPT_PERFMON
+	beqa	BOOK3S_INTERRUPT_PERFMON
+	cmpwi	r12, BOOK3S_INTERRUPT_DOORBELL
+	beqa	BOOK3S_INTERRUPT_DOORBELL
+
+	RFI
+kvmppc_handler_trampoline_exit_end:
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
new file mode 100644
index 000000000..c6ca7db64
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -0,0 +1,1411 @@
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/gfp.h>
+#include <linux/anon_inodes.h>
+#include <linux/spinlock.h>
+
+#include <asm/uaccess.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/xics.h>
+#include <asm/debug.h>
+#include <asm/time.h>
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include "book3s_xics.h"
+
+#if 1
+#define XICS_DBG(fmt...) do { } while (0)
+#else
+#define XICS_DBG(fmt...) trace_printk(fmt)
+#endif
+
+#define ENABLE_REALMODE	true
+#define DEBUG_REALMODE	false
+
+/*
+ * LOCKING
+ * =======
+ *
+ * Each ICS has a spin lock protecting the information about the IRQ
+ * sources and avoiding simultaneous deliveries if the same interrupt.
+ *
+ * ICP operations are done via a single compare & swap transaction
+ * (most ICP state fits in the union kvmppc_icp_state)
+ */
+
+/*
+ * TODO
+ * ====
+ *
+ * - To speed up resends, keep a bitmap of "resend" set bits in the
+ *   ICS
+ *
+ * - Speed up server# -> ICP lookup (array ? hash table ?)
+ *
+ * - Make ICS lockless as well, or at least a per-interrupt lock or hashed
+ *   locks array to improve scalability
+ */
+
+/* -- ICS routines -- */
+
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq);
+
+/*
+ * Return value ideally indicates how the interrupt was handled, but no
+ * callers look at it (given that we don't implement KVM_IRQ_LINE_STATUS),
+ * so just return 0.
+ */
+static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
+{
+	struct ics_irq_state *state;
+	struct kvmppc_ics *ics;
+	u16 src;
+
+	XICS_DBG("ics deliver %#x (level: %d)\n", irq, level);
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics) {
+		XICS_DBG("ics_deliver_irq: IRQ 0x%06x not found !\n", irq);
+		return -EINVAL;
+	}
+	state = &ics->irq_state[src];
+	if (!state->exists)
+		return -EINVAL;
+
+	/*
+	 * We set state->asserted locklessly. This should be fine as
+	 * we are the only setter, thus concurrent access is undefined
+	 * to begin with.
+	 */
+	if (level == 1 || level == KVM_INTERRUPT_SET_LEVEL)
+		state->asserted = 1;
+	else if (level == 0 || level == KVM_INTERRUPT_UNSET) {
+		state->asserted = 0;
+		return 0;
+	}
+
+	/* Attempt delivery */
+	icp_deliver_irq(xics, NULL, irq);
+
+	return 0;
+}
+
+static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
+			     struct kvmppc_icp *icp)
+{
+	int i;
+
+	unsigned long flags;
+
+	local_irq_save(flags);
+	arch_spin_lock(&ics->lock);
+
+	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+		struct ics_irq_state *state = &ics->irq_state[i];
+
+		if (!state->resend)
+			continue;
+
+		XICS_DBG("resend %#x prio %#x\n", state->number,
+			      state->priority);
+
+		arch_spin_unlock(&ics->lock);
+		local_irq_restore(flags);
+		icp_deliver_irq(xics, icp, state->number);
+		local_irq_save(flags);
+		arch_spin_lock(&ics->lock);
+	}
+
+	arch_spin_unlock(&ics->lock);
+	local_irq_restore(flags);
+}
+
+static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
+		       struct ics_irq_state *state,
+		       u32 server, u32 priority, u32 saved_priority)
+{
+	bool deliver;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	arch_spin_lock(&ics->lock);
+
+	state->server = server;
+	state->priority = priority;
+	state->saved_priority = saved_priority;
+	deliver = false;
+	if ((state->masked_pending || state->resend) && priority != MASKED) {
+		state->masked_pending = 0;
+		deliver = true;
+	}
+
+	arch_spin_unlock(&ics->lock);
+	local_irq_restore(flags);
+
+	return deliver;
+}
+
+int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_icp *icp;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u16 src;
+
+	if (!xics)
+		return -ENODEV;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		return -EINVAL;
+	state = &ics->irq_state[src];
+
+	icp = kvmppc_xics_find_server(kvm, server);
+	if (!icp)
+		return -EINVAL;
+
+	XICS_DBG("set_xive %#x server %#x prio %#x MP:%d RS:%d\n",
+		 irq, server, priority,
+		 state->masked_pending, state->resend);
+
+	if (write_xive(xics, ics, state, server, priority, priority))
+		icp_deliver_irq(xics, icp, irq);
+
+	return 0;
+}
+
+int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u16 src;
+	unsigned long flags;
+
+	if (!xics)
+		return -ENODEV;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		return -EINVAL;
+	state = &ics->irq_state[src];
+
+	local_irq_save(flags);
+	arch_spin_lock(&ics->lock);
+	*server = state->server;
+	*priority = state->priority;
+	arch_spin_unlock(&ics->lock);
+	local_irq_restore(flags);
+
+	return 0;
+}
+
+int kvmppc_xics_int_on(struct kvm *kvm, u32 irq)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_icp *icp;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u16 src;
+
+	if (!xics)
+		return -ENODEV;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		return -EINVAL;
+	state = &ics->irq_state[src];
+
+	icp = kvmppc_xics_find_server(kvm, state->server);
+	if (!icp)
+		return -EINVAL;
+
+	if (write_xive(xics, ics, state, state->server, state->saved_priority,
+		       state->saved_priority))
+		icp_deliver_irq(xics, icp, irq);
+
+	return 0;
+}
+
+int kvmppc_xics_int_off(struct kvm *kvm, u32 irq)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u16 src;
+
+	if (!xics)
+		return -ENODEV;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics)
+		return -EINVAL;
+	state = &ics->irq_state[src];
+
+	write_xive(xics, ics, state, state->server, MASKED, state->priority);
+
+	return 0;
+}
+
+/* -- ICP routines, including hcalls -- */
+
+static inline bool icp_try_update(struct kvmppc_icp *icp,
+				  union kvmppc_icp_state old,
+				  union kvmppc_icp_state new,
+				  bool change_self)
+{
+	bool success;
+
+	/* Calculate new output value */
+	new.out_ee = (new.xisr && (new.pending_pri < new.cppr));
+
+	/* Attempt atomic update */
+	success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw;
+	if (!success)
+		goto bail;
+
+	XICS_DBG("UPD [%04x] - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+		 icp->server_num,
+		 old.cppr, old.mfrr, old.pending_pri, old.xisr,
+		 old.need_resend, old.out_ee);
+	XICS_DBG("UPD        - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+		 new.cppr, new.mfrr, new.pending_pri, new.xisr,
+		 new.need_resend, new.out_ee);
+	/*
+	 * Check for output state update
+	 *
+	 * Note that this is racy since another processor could be updating
+	 * the state already. This is why we never clear the interrupt output
+	 * here, we only ever set it. The clear only happens prior to doing
+	 * an update and only by the processor itself. Currently we do it
+	 * in Accept (H_XIRR) and Up_Cppr (H_XPPR).
+	 *
+	 * We also do not try to figure out whether the EE state has changed,
+	 * we unconditionally set it if the new state calls for it. The reason
+	 * for that is that we opportunistically remove the pending interrupt
+	 * flag when raising CPPR, so we need to set it back here if an
+	 * interrupt is still pending.
+	 */
+	if (new.out_ee) {
+		kvmppc_book3s_queue_irqprio(icp->vcpu,
+					    BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+		if (!change_self)
+			kvmppc_fast_vcpu_kick(icp->vcpu);
+	}
+ bail:
+	return success;
+}
+
+static void icp_check_resend(struct kvmppc_xics *xics,
+			     struct kvmppc_icp *icp)
+{
+	u32 icsid;
+
+	/* Order this load with the test for need_resend in the caller */
+	smp_rmb();
+	for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) {
+		struct kvmppc_ics *ics = xics->ics[icsid];
+
+		if (!test_and_clear_bit(icsid, icp->resend_map))
+			continue;
+		if (!ics)
+			continue;
+		ics_check_resend(xics, ics, icp);
+	}
+}
+
+static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
+			       u32 *reject)
+{
+	union kvmppc_icp_state old_state, new_state;
+	bool success;
+
+	XICS_DBG("try deliver %#x(P:%#x) to server %#x\n", irq, priority,
+		 icp->server_num);
+
+	do {
+		old_state = new_state = READ_ONCE(icp->state);
+
+		*reject = 0;
+
+		/* See if we can deliver */
+		success = new_state.cppr > priority &&
+			new_state.mfrr > priority &&
+			new_state.pending_pri > priority;
+
+		/*
+		 * If we can, check for a rejection and perform the
+		 * delivery
+		 */
+		if (success) {
+			*reject = new_state.xisr;
+			new_state.xisr = irq;
+			new_state.pending_pri = priority;
+		} else {
+			/*
+			 * If we failed to deliver we set need_resend
+			 * so a subsequent CPPR state change causes us
+			 * to try a new delivery.
+			 */
+			new_state.need_resend = true;
+		}
+
+	} while (!icp_try_update(icp, old_state, new_state, false));
+
+	return success;
+}
+
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq)
+{
+	struct ics_irq_state *state;
+	struct kvmppc_ics *ics;
+	u32 reject;
+	u16 src;
+	unsigned long flags;
+
+	/*
+	 * This is used both for initial delivery of an interrupt and
+	 * for subsequent rejection.
+	 *
+	 * Rejection can be racy vs. resends. We have evaluated the
+	 * rejection in an atomic ICP transaction which is now complete,
+	 * so potentially the ICP can already accept the interrupt again.
+	 *
+	 * So we need to retry the delivery. Essentially the reject path
+	 * boils down to a failed delivery. Always.
+	 *
+	 * Now the interrupt could also have moved to a different target,
+	 * thus we may need to re-do the ICP lookup as well
+	 */
+
+ again:
+	/* Get the ICS state and lock it */
+	ics = kvmppc_xics_find_ics(xics, new_irq, &src);
+	if (!ics) {
+		XICS_DBG("icp_deliver_irq: IRQ 0x%06x not found !\n", new_irq);
+		return;
+	}
+	state = &ics->irq_state[src];
+
+	/* Get a lock on the ICS */
+	local_irq_save(flags);
+	arch_spin_lock(&ics->lock);
+
+	/* Get our server */
+	if (!icp || state->server != icp->server_num) {
+		icp = kvmppc_xics_find_server(xics->kvm, state->server);
+		if (!icp) {
+			pr_warn("icp_deliver_irq: IRQ 0x%06x server 0x%x not found !\n",
+				new_irq, state->server);
+			goto out;
+		}
+	}
+
+	/* Clear the resend bit of that interrupt */
+	state->resend = 0;
+
+	/*
+	 * If masked, bail out
+	 *
+	 * Note: PAPR doesn't mention anything about masked pending
+	 * when doing a resend, only when doing a delivery.
+	 *
+	 * However that would have the effect of losing a masked
+	 * interrupt that was rejected and isn't consistent with
+	 * the whole masked_pending business which is about not
+	 * losing interrupts that occur while masked.
+	 *
+	 * I don't differenciate normal deliveries and resends, this
+	 * implementation will differ from PAPR and not lose such
+	 * interrupts.
+	 */
+	if (state->priority == MASKED) {
+		XICS_DBG("irq %#x masked pending\n", new_irq);
+		state->masked_pending = 1;
+		goto out;
+	}
+
+	/*
+	 * Try the delivery, this will set the need_resend flag
+	 * in the ICP as part of the atomic transaction if the
+	 * delivery is not possible.
+	 *
+	 * Note that if successful, the new delivery might have itself
+	 * rejected an interrupt that was "delivered" before we took the
+	 * ics spin lock.
+	 *
+	 * In this case we do the whole sequence all over again for the
+	 * new guy. We cannot assume that the rejected interrupt is less
+	 * favored than the new one, and thus doesn't need to be delivered,
+	 * because by the time we exit icp_try_to_deliver() the target
+	 * processor may well have alrady consumed & completed it, and thus
+	 * the rejected interrupt might actually be already acceptable.
+	 */
+	if (icp_try_to_deliver(icp, new_irq, state->priority, &reject)) {
+		/*
+		 * Delivery was successful, did we reject somebody else ?
+		 */
+		if (reject && reject != XICS_IPI) {
+			arch_spin_unlock(&ics->lock);
+			local_irq_restore(flags);
+			new_irq = reject;
+			goto again;
+		}
+	} else {
+		/*
+		 * We failed to deliver the interrupt we need to set the
+		 * resend map bit and mark the ICS state as needing a resend
+		 */
+		set_bit(ics->icsid, icp->resend_map);
+		state->resend = 1;
+
+		/*
+		 * If the need_resend flag got cleared in the ICP some time
+		 * between icp_try_to_deliver() atomic update and now, then
+		 * we know it might have missed the resend_map bit. So we
+		 * retry
+		 */
+		smp_mb();
+		if (!icp->state.need_resend) {
+			arch_spin_unlock(&ics->lock);
+			local_irq_restore(flags);
+			goto again;
+		}
+	}
+ out:
+	arch_spin_unlock(&ics->lock);
+	local_irq_restore(flags);
+}
+
+static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			  u8 new_cppr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	bool resend;
+
+	/*
+	 * This handles several related states in one operation:
+	 *
+	 * ICP State: Down_CPPR
+	 *
+	 * Load CPPR with new value and if the XISR is 0
+	 * then check for resends:
+	 *
+	 * ICP State: Resend
+	 *
+	 * If MFRR is more favored than CPPR, check for IPIs
+	 * and notify ICS of a potential resend. This is done
+	 * asynchronously (when used in real mode, we will have
+	 * to exit here).
+	 *
+	 * We do not handle the complete Check_IPI as documented
+	 * here. In the PAPR, this state will be used for both
+	 * Set_MFRR and Down_CPPR. However, we know that we aren't
+	 * changing the MFRR state here so we don't need to handle
+	 * the case of an MFRR causing a reject of a pending irq,
+	 * this will have been handled when the MFRR was set in the
+	 * first place.
+	 *
+	 * Thus we don't have to handle rejects, only resends.
+	 *
+	 * When implementing real mode for HV KVM, resend will lead to
+	 * a H_TOO_HARD return and the whole transaction will be handled
+	 * in virtual mode.
+	 */
+	do {
+		old_state = new_state = READ_ONCE(icp->state);
+
+		/* Down_CPPR */
+		new_state.cppr = new_cppr;
+
+		/*
+		 * Cut down Resend / Check_IPI / IPI
+		 *
+		 * The logic is that we cannot have a pending interrupt
+		 * trumped by an IPI at this point (see above), so we
+		 * know that either the pending interrupt is already an
+		 * IPI (in which case we don't care to override it) or
+		 * it's either more favored than us or non existent
+		 */
+		if (new_state.mfrr < new_cppr &&
+		    new_state.mfrr <= new_state.pending_pri) {
+			WARN_ON(new_state.xisr != XICS_IPI &&
+				new_state.xisr != 0);
+			new_state.pending_pri = new_state.mfrr;
+			new_state.xisr = XICS_IPI;
+		}
+
+		/* Latch/clear resend bit */
+		resend = new_state.need_resend;
+		new_state.need_resend = 0;
+
+	} while (!icp_try_update(icp, old_state, new_state, true));
+
+	/*
+	 * Now handle resend checks. Those are asynchronous to the ICP
+	 * state update in HW (ie bus transactions) so we can handle them
+	 * separately here too
+	 */
+	if (resend)
+		icp_check_resend(xics, icp);
+}
+
+static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 xirr;
+
+	/* First, remove EE from the processor */
+	kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+	/*
+	 * ICP State: Accept_Interrupt
+	 *
+	 * Return the pending interrupt (if any) along with the
+	 * current CPPR, then clear the XISR & set CPPR to the
+	 * pending priority
+	 */
+	do {
+		old_state = new_state = READ_ONCE(icp->state);
+
+		xirr = old_state.xisr | (((u32)old_state.cppr) << 24);
+		if (!old_state.xisr)
+			break;
+		new_state.cppr = new_state.pending_pri;
+		new_state.pending_pri = 0xff;
+		new_state.xisr = 0;
+
+	} while (!icp_try_update(icp, old_state, new_state, true));
+
+	XICS_DBG("h_xirr vcpu %d xirr %#x\n", vcpu->vcpu_id, xirr);
+
+	return xirr;
+}
+
+static noinline int kvmppc_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+				 unsigned long mfrr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp;
+	u32 reject;
+	bool resend;
+	bool local;
+
+	XICS_DBG("h_ipi vcpu %d to server %lu mfrr %#lx\n",
+		 vcpu->vcpu_id, server, mfrr);
+
+	icp = vcpu->arch.icp;
+	local = icp->server_num == server;
+	if (!local) {
+		icp = kvmppc_xics_find_server(vcpu->kvm, server);
+		if (!icp)
+			return H_PARAMETER;
+	}
+
+	/*
+	 * ICP state: Set_MFRR
+	 *
+	 * If the CPPR is more favored than the new MFRR, then
+	 * nothing needs to be rejected as there can be no XISR to
+	 * reject.  If the MFRR is being made less favored then
+	 * there might be a previously-rejected interrupt needing
+	 * to be resent.
+	 *
+	 * ICP state: Check_IPI
+	 *
+	 * If the CPPR is less favored, then we might be replacing
+	 * an interrupt, and thus need to possibly reject it.
+	 *
+	 * ICP State: IPI
+	 *
+	 * Besides rejecting any pending interrupts, we also
+	 * update XISR and pending_pri to mark IPI as pending.
+	 *
+	 * PAPR does not describe this state, but if the MFRR is being
+	 * made less favored than its earlier value, there might be
+	 * a previously-rejected interrupt needing to be resent.
+	 * Ideally, we would want to resend only if
+	 *	prio(pending_interrupt) < mfrr &&
+	 *	prio(pending_interrupt) < cppr
+	 * where pending interrupt is the one that was rejected. But
+	 * we don't have that state, so we simply trigger a resend
+	 * whenever the MFRR is made less favored.
+	 */
+	do {
+		old_state = new_state = READ_ONCE(icp->state);
+
+		/* Set_MFRR */
+		new_state.mfrr = mfrr;
+
+		/* Check_IPI */
+		reject = 0;
+		resend = false;
+		if (mfrr < new_state.cppr) {
+			/* Reject a pending interrupt if not an IPI */
+			if (mfrr <= new_state.pending_pri) {
+				reject = new_state.xisr;
+				new_state.pending_pri = mfrr;
+				new_state.xisr = XICS_IPI;
+			}
+		}
+
+		if (mfrr > old_state.mfrr) {
+			resend = new_state.need_resend;
+			new_state.need_resend = 0;
+		}
+	} while (!icp_try_update(icp, old_state, new_state, local));
+
+	/* Handle reject */
+	if (reject && reject != XICS_IPI)
+		icp_deliver_irq(xics, icp, reject);
+
+	/* Handle resend */
+	if (resend)
+		icp_check_resend(xics, icp);
+
+	return H_SUCCESS;
+}
+
+static int kvmppc_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
+{
+	union kvmppc_icp_state state;
+	struct kvmppc_icp *icp;
+
+	icp = vcpu->arch.icp;
+	if (icp->server_num != server) {
+		icp = kvmppc_xics_find_server(vcpu->kvm, server);
+		if (!icp)
+			return H_PARAMETER;
+	}
+	state = READ_ONCE(icp->state);
+	kvmppc_set_gpr(vcpu, 4, ((u32)state.cppr << 24) | state.xisr);
+	kvmppc_set_gpr(vcpu, 5, state.mfrr);
+	return H_SUCCESS;
+}
+
+static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
+{
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	u32 reject;
+
+	XICS_DBG("h_cppr vcpu %d cppr %#lx\n", vcpu->vcpu_id, cppr);
+
+	/*
+	 * ICP State: Set_CPPR
+	 *
+	 * We can safely compare the new value with the current
+	 * value outside of the transaction as the CPPR is only
+	 * ever changed by the processor on itself
+	 */
+	if (cppr > icp->state.cppr)
+		icp_down_cppr(xics, icp, cppr);
+	else if (cppr == icp->state.cppr)
+		return;
+
+	/*
+	 * ICP State: Up_CPPR
+	 *
+	 * The processor is raising its priority, this can result
+	 * in a rejection of a pending interrupt:
+	 *
+	 * ICP State: Reject_Current
+	 *
+	 * We can remove EE from the current processor, the update
+	 * transaction will set it again if needed
+	 */
+	kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+	do {
+		old_state = new_state = READ_ONCE(icp->state);
+
+		reject = 0;
+		new_state.cppr = cppr;
+
+		if (cppr <= new_state.pending_pri) {
+			reject = new_state.xisr;
+			new_state.xisr = 0;
+			new_state.pending_pri = 0xff;
+		}
+
+	} while (!icp_try_update(icp, old_state, new_state, true));
+
+	/*
+	 * Check for rejects. They are handled by doing a new delivery
+	 * attempt (see comments in icp_deliver_irq).
+	 */
+	if (reject && reject != XICS_IPI)
+		icp_deliver_irq(xics, icp, reject);
+}
+
+static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *state;
+	u32 irq = xirr & 0x00ffffff;
+	u16 src;
+
+	XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr);
+
+	/*
+	 * ICP State: EOI
+	 *
+	 * Note: If EOI is incorrectly used by SW to lower the CPPR
+	 * value (ie more favored), we do not check for rejection of
+	 * a pending interrupt, this is a SW error and PAPR sepcifies
+	 * that we don't have to deal with it.
+	 *
+	 * The sending of an EOI to the ICS is handled after the
+	 * CPPR update
+	 *
+	 * ICP State: Down_CPPR which we handle
+	 * in a separate function as it's shared with H_CPPR.
+	 */
+	icp_down_cppr(xics, icp, xirr >> 24);
+
+	/* IPIs have no EOI */
+	if (irq == XICS_IPI)
+		return H_SUCCESS;
+	/*
+	 * EOI handling: If the interrupt is still asserted, we need to
+	 * resend it. We can take a lockless "peek" at the ICS state here.
+	 *
+	 * "Message" interrupts will never have "asserted" set
+	 */
+	ics = kvmppc_xics_find_ics(xics, irq, &src);
+	if (!ics) {
+		XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq);
+		return H_PARAMETER;
+	}
+	state = &ics->irq_state[src];
+
+	/* Still asserted, resend it */
+	if (state->asserted)
+		icp_deliver_irq(xics, icp, irq);
+
+	kvm_notify_acked_irq(vcpu->kvm, 0, irq);
+
+	return H_SUCCESS;
+}
+
+static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
+{
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+
+	XICS_DBG("XICS_RM: H_%x completing, act: %x state: %lx tgt: %p\n",
+		 hcall, icp->rm_action, icp->rm_dbgstate.raw, icp->rm_dbgtgt);
+
+	if (icp->rm_action & XICS_RM_KICK_VCPU) {
+		icp->n_rm_kick_vcpu++;
+		kvmppc_fast_vcpu_kick(icp->rm_kick_target);
+	}
+	if (icp->rm_action & XICS_RM_CHECK_RESEND) {
+		icp->n_rm_check_resend++;
+		icp_check_resend(xics, icp->rm_resend_icp);
+	}
+	if (icp->rm_action & XICS_RM_REJECT) {
+		icp->n_rm_reject++;
+		icp_deliver_irq(xics, icp, icp->rm_reject);
+	}
+	if (icp->rm_action & XICS_RM_NOTIFY_EOI) {
+		icp->n_rm_notify_eoi++;
+		kvm_notify_acked_irq(vcpu->kvm, 0, icp->rm_eoied_irq);
+	}
+
+	icp->rm_action = 0;
+
+	return H_SUCCESS;
+}
+
+int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
+{
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	unsigned long res;
+	int rc = H_SUCCESS;
+
+	/* Check if we have an ICP */
+	if (!xics || !vcpu->arch.icp)
+		return H_HARDWARE;
+
+	/* These requests don't have real-mode implementations at present */
+	switch (req) {
+	case H_XIRR_X:
+		res = kvmppc_h_xirr(vcpu);
+		kvmppc_set_gpr(vcpu, 4, res);
+		kvmppc_set_gpr(vcpu, 5, get_tb());
+		return rc;
+	case H_IPOLL:
+		rc = kvmppc_h_ipoll(vcpu, kvmppc_get_gpr(vcpu, 4));
+		return rc;
+	}
+
+	/* Check for real mode returning too hard */
+	if (xics->real_mode && is_kvmppc_hv_enabled(vcpu->kvm))
+		return kvmppc_xics_rm_complete(vcpu, req);
+
+	switch (req) {
+	case H_XIRR:
+		res = kvmppc_h_xirr(vcpu);
+		kvmppc_set_gpr(vcpu, 4, res);
+		break;
+	case H_CPPR:
+		kvmppc_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
+		break;
+	case H_EOI:
+		rc = kvmppc_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
+		break;
+	case H_IPI:
+		rc = kvmppc_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
+				  kvmppc_get_gpr(vcpu, 5));
+		break;
+	}
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xics_hcall);
+
+
+/* -- Initialisation code etc. -- */
+
+static int xics_debug_show(struct seq_file *m, void *private)
+{
+	struct kvmppc_xics *xics = m->private;
+	struct kvm *kvm = xics->kvm;
+	struct kvm_vcpu *vcpu;
+	int icsid, i;
+	unsigned long flags;
+	unsigned long t_rm_kick_vcpu, t_rm_check_resend;
+	unsigned long t_rm_reject, t_rm_notify_eoi;
+	unsigned long t_reject, t_check_resend;
+
+	if (!kvm)
+		return 0;
+
+	t_rm_kick_vcpu = 0;
+	t_rm_notify_eoi = 0;
+	t_rm_check_resend = 0;
+	t_rm_reject = 0;
+	t_check_resend = 0;
+	t_reject = 0;
+
+	seq_printf(m, "=========\nICP state\n=========\n");
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		struct kvmppc_icp *icp = vcpu->arch.icp;
+		union kvmppc_icp_state state;
+
+		if (!icp)
+			continue;
+
+		state.raw = READ_ONCE(icp->state.raw);
+		seq_printf(m, "cpu server %#lx XIRR:%#x PPRI:%#x CPPR:%#x MFRR:%#x OUT:%d NR:%d\n",
+			   icp->server_num, state.xisr,
+			   state.pending_pri, state.cppr, state.mfrr,
+			   state.out_ee, state.need_resend);
+		t_rm_kick_vcpu += icp->n_rm_kick_vcpu;
+		t_rm_notify_eoi += icp->n_rm_notify_eoi;
+		t_rm_check_resend += icp->n_rm_check_resend;
+		t_rm_reject += icp->n_rm_reject;
+		t_check_resend += icp->n_check_resend;
+		t_reject += icp->n_reject;
+	}
+
+	seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu reject=%lu notify_eoi=%lu\n",
+			t_rm_kick_vcpu, t_rm_check_resend,
+			t_rm_reject, t_rm_notify_eoi);
+	seq_printf(m, "ICP Real Mode totals: check_resend=%lu resend=%lu\n",
+			t_check_resend, t_reject);
+	for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) {
+		struct kvmppc_ics *ics = xics->ics[icsid];
+
+		if (!ics)
+			continue;
+
+		seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n",
+			   icsid);
+
+		local_irq_save(flags);
+		arch_spin_lock(&ics->lock);
+
+		for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+			struct ics_irq_state *irq = &ics->irq_state[i];
+
+			seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x asserted %d resend %d masked pending %d\n",
+				   irq->number, irq->server, irq->priority,
+				   irq->saved_priority, irq->asserted,
+				   irq->resend, irq->masked_pending);
+
+		}
+		arch_spin_unlock(&ics->lock);
+		local_irq_restore(flags);
+	}
+	return 0;
+}
+
+static int xics_debug_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, xics_debug_show, inode->i_private);
+}
+
+static const struct file_operations xics_debug_fops = {
+	.open = xics_debug_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static void xics_debugfs_init(struct kvmppc_xics *xics)
+{
+	char *name;
+
+	name = kasprintf(GFP_KERNEL, "kvm-xics-%p", xics);
+	if (!name) {
+		pr_err("%s: no memory for name\n", __func__);
+		return;
+	}
+
+	xics->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root,
+					   xics, &xics_debug_fops);
+
+	pr_debug("%s: created %s\n", __func__, name);
+	kfree(name);
+}
+
+static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm,
+					struct kvmppc_xics *xics, int irq)
+{
+	struct kvmppc_ics *ics;
+	int i, icsid;
+
+	icsid = irq >> KVMPPC_XICS_ICS_SHIFT;
+
+	mutex_lock(&kvm->lock);
+
+	/* ICS already exists - somebody else got here first */
+	if (xics->ics[icsid])
+		goto out;
+
+	/* Create the ICS */
+	ics = kzalloc(sizeof(struct kvmppc_ics), GFP_KERNEL);
+	if (!ics)
+		goto out;
+
+	ics->icsid = icsid;
+
+	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+		ics->irq_state[i].number = (icsid << KVMPPC_XICS_ICS_SHIFT) | i;
+		ics->irq_state[i].priority = MASKED;
+		ics->irq_state[i].saved_priority = MASKED;
+	}
+	smp_wmb();
+	xics->ics[icsid] = ics;
+
+	if (icsid > xics->max_icsid)
+		xics->max_icsid = icsid;
+
+ out:
+	mutex_unlock(&kvm->lock);
+	return xics->ics[icsid];
+}
+
+int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num)
+{
+	struct kvmppc_icp *icp;
+
+	if (!vcpu->kvm->arch.xics)
+		return -ENODEV;
+
+	if (kvmppc_xics_find_server(vcpu->kvm, server_num))
+		return -EEXIST;
+
+	icp = kzalloc(sizeof(struct kvmppc_icp), GFP_KERNEL);
+	if (!icp)
+		return -ENOMEM;
+
+	icp->vcpu = vcpu;
+	icp->server_num = server_num;
+	icp->state.mfrr = MASKED;
+	icp->state.pending_pri = MASKED;
+	vcpu->arch.icp = icp;
+
+	XICS_DBG("created server for vcpu %d\n", vcpu->vcpu_id);
+
+	return 0;
+}
+
+u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	union kvmppc_icp_state state;
+
+	if (!icp)
+		return 0;
+	state = icp->state;
+	return ((u64)state.cppr << KVM_REG_PPC_ICP_CPPR_SHIFT) |
+		((u64)state.xisr << KVM_REG_PPC_ICP_XISR_SHIFT) |
+		((u64)state.mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT) |
+		((u64)state.pending_pri << KVM_REG_PPC_ICP_PPRI_SHIFT);
+}
+
+int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
+{
+	struct kvmppc_icp *icp = vcpu->arch.icp;
+	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+	union kvmppc_icp_state old_state, new_state;
+	struct kvmppc_ics *ics;
+	u8 cppr, mfrr, pending_pri;
+	u32 xisr;
+	u16 src;
+	bool resend;
+
+	if (!icp || !xics)
+		return -ENOENT;
+
+	cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT;
+	xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) &
+		KVM_REG_PPC_ICP_XISR_MASK;
+	mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT;
+	pending_pri = icpval >> KVM_REG_PPC_ICP_PPRI_SHIFT;
+
+	/* Require the new state to be internally consistent */
+	if (xisr == 0) {
+		if (pending_pri != 0xff)
+			return -EINVAL;
+	} else if (xisr == XICS_IPI) {
+		if (pending_pri != mfrr || pending_pri >= cppr)
+			return -EINVAL;
+	} else {
+		if (pending_pri >= mfrr || pending_pri >= cppr)
+			return -EINVAL;
+		ics = kvmppc_xics_find_ics(xics, xisr, &src);
+		if (!ics)
+			return -EINVAL;
+	}
+
+	new_state.raw = 0;
+	new_state.cppr = cppr;
+	new_state.xisr = xisr;
+	new_state.mfrr = mfrr;
+	new_state.pending_pri = pending_pri;
+
+	/*
+	 * Deassert the CPU interrupt request.
+	 * icp_try_update will reassert it if necessary.
+	 */
+	kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+	/*
+	 * Note that if we displace an interrupt from old_state.xisr,
+	 * we don't mark it as rejected.  We expect userspace to set
+	 * the state of the interrupt sources to be consistent with
+	 * the ICP states (either before or afterwards, which doesn't
+	 * matter).  We do handle resends due to CPPR becoming less
+	 * favoured because that is necessary to end up with a
+	 * consistent state in the situation where userspace restores
+	 * the ICS states before the ICP states.
+	 */
+	do {
+		old_state = READ_ONCE(icp->state);
+
+		if (new_state.mfrr <= old_state.mfrr) {
+			resend = false;
+			new_state.need_resend = old_state.need_resend;
+		} else {
+			resend = old_state.need_resend;
+			new_state.need_resend = 0;
+		}
+	} while (!icp_try_update(icp, old_state, new_state, false));
+
+	if (resend)
+		icp_check_resend(xics, icp);
+
+	return 0;
+}
+
+static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr)
+{
+	int ret;
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *irqp;
+	u64 __user *ubufp = (u64 __user *) addr;
+	u16 idx;
+	u64 val, prio;
+	unsigned long flags;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &idx);
+	if (!ics)
+		return -ENOENT;
+
+	irqp = &ics->irq_state[idx];
+	local_irq_save(flags);
+	arch_spin_lock(&ics->lock);
+	ret = -ENOENT;
+	if (irqp->exists) {
+		val = irqp->server;
+		prio = irqp->priority;
+		if (prio == MASKED) {
+			val |= KVM_XICS_MASKED;
+			prio = irqp->saved_priority;
+		}
+		val |= prio << KVM_XICS_PRIORITY_SHIFT;
+		if (irqp->asserted)
+			val |= KVM_XICS_LEVEL_SENSITIVE | KVM_XICS_PENDING;
+		else if (irqp->masked_pending || irqp->resend)
+			val |= KVM_XICS_PENDING;
+		ret = 0;
+	}
+	arch_spin_unlock(&ics->lock);
+	local_irq_restore(flags);
+
+	if (!ret && put_user(val, ubufp))
+		ret = -EFAULT;
+
+	return ret;
+}
+
+static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
+{
+	struct kvmppc_ics *ics;
+	struct ics_irq_state *irqp;
+	u64 __user *ubufp = (u64 __user *) addr;
+	u16 idx;
+	u64 val;
+	u8 prio;
+	u32 server;
+	unsigned long flags;
+
+	if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS)
+		return -ENOENT;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &idx);
+	if (!ics) {
+		ics = kvmppc_xics_create_ics(xics->kvm, xics, irq);
+		if (!ics)
+			return -ENOMEM;
+	}
+	irqp = &ics->irq_state[idx];
+	if (get_user(val, ubufp))
+		return -EFAULT;
+
+	server = val & KVM_XICS_DESTINATION_MASK;
+	prio = val >> KVM_XICS_PRIORITY_SHIFT;
+	if (prio != MASKED &&
+	    kvmppc_xics_find_server(xics->kvm, server) == NULL)
+		return -EINVAL;
+
+	local_irq_save(flags);
+	arch_spin_lock(&ics->lock);
+	irqp->server = server;
+	irqp->saved_priority = prio;
+	if (val & KVM_XICS_MASKED)
+		prio = MASKED;
+	irqp->priority = prio;
+	irqp->resend = 0;
+	irqp->masked_pending = 0;
+	irqp->asserted = 0;
+	if ((val & KVM_XICS_PENDING) && (val & KVM_XICS_LEVEL_SENSITIVE))
+		irqp->asserted = 1;
+	irqp->exists = 1;
+	arch_spin_unlock(&ics->lock);
+	local_irq_restore(flags);
+
+	if (val & KVM_XICS_PENDING)
+		icp_deliver_irq(xics, NULL, irqp->number);
+
+	return 0;
+}
+
+int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
+		bool line_status)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+
+	return ics_deliver_irq(xics, irq, level);
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
+		int irq_source_id, int level, bool line_status)
+{
+	if (!level)
+		return -1;
+	return kvm_set_irq(kvm, irq_source_id, irq_entry->gsi,
+			   level, line_status);
+}
+
+static int xics_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	struct kvmppc_xics *xics = dev->private;
+
+	switch (attr->group) {
+	case KVM_DEV_XICS_GRP_SOURCES:
+		return xics_set_source(xics, attr->attr, attr->addr);
+	}
+	return -ENXIO;
+}
+
+static int xics_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	struct kvmppc_xics *xics = dev->private;
+
+	switch (attr->group) {
+	case KVM_DEV_XICS_GRP_SOURCES:
+		return xics_get_source(xics, attr->attr, attr->addr);
+	}
+	return -ENXIO;
+}
+
+static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_DEV_XICS_GRP_SOURCES:
+		if (attr->attr >= KVMPPC_XICS_FIRST_IRQ &&
+		    attr->attr < KVMPPC_XICS_NR_IRQS)
+			return 0;
+		break;
+	}
+	return -ENXIO;
+}
+
+static void kvmppc_xics_free(struct kvm_device *dev)
+{
+	struct kvmppc_xics *xics = dev->private;
+	int i;
+	struct kvm *kvm = xics->kvm;
+
+	debugfs_remove(xics->dentry);
+
+	if (kvm)
+		kvm->arch.xics = NULL;
+
+	for (i = 0; i <= xics->max_icsid; i++)
+		kfree(xics->ics[i]);
+	kfree(xics);
+	kfree(dev);
+}
+
+static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
+{
+	struct kvmppc_xics *xics;
+	struct kvm *kvm = dev->kvm;
+	int ret = 0;
+
+	xics = kzalloc(sizeof(*xics), GFP_KERNEL);
+	if (!xics)
+		return -ENOMEM;
+
+	dev->private = xics;
+	xics->dev = dev;
+	xics->kvm = kvm;
+
+	/* Already there ? */
+	mutex_lock(&kvm->lock);
+	if (kvm->arch.xics)
+		ret = -EEXIST;
+	else
+		kvm->arch.xics = xics;
+	mutex_unlock(&kvm->lock);
+
+	if (ret) {
+		kfree(xics);
+		return ret;
+	}
+
+	xics_debugfs_init(xics);
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	if (cpu_has_feature(CPU_FTR_ARCH_206)) {
+		/* Enable real mode support */
+		xics->real_mode = ENABLE_REALMODE;
+		xics->real_mode_dbg = DEBUG_REALMODE;
+	}
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
+
+	return 0;
+}
+
+struct kvm_device_ops kvm_xics_ops = {
+	.name = "kvm-xics",
+	.create = kvmppc_xics_create,
+	.destroy = kvmppc_xics_free,
+	.set_attr = xics_set_attr,
+	.get_attr = xics_get_attr,
+	.has_attr = xics_has_attr,
+};
+
+int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
+			     u32 xcpu)
+{
+	struct kvmppc_xics *xics = dev->private;
+	int r = -EBUSY;
+
+	if (dev->ops != &kvm_xics_ops)
+		return -EPERM;
+	if (xics->kvm != vcpu->kvm)
+		return -EPERM;
+	if (vcpu->arch.irq_type)
+		return -EBUSY;
+
+	r = kvmppc_xics_create_icp(vcpu, xcpu);
+	if (!r)
+		vcpu->arch.irq_type = KVMPPC_IRQ_XICS;
+
+	return r;
+}
+
+void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu)
+{
+	if (!vcpu->arch.icp)
+		return;
+	kfree(vcpu->arch.icp);
+	vcpu->arch.icp = NULL;
+	vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
+}
+
+static int xics_set_irq(struct kvm_kernel_irq_routing_entry *e,
+			struct kvm *kvm, int irq_source_id, int level,
+			bool line_status)
+{
+	return kvm_set_irq(kvm, irq_source_id, e->gsi, level, line_status);
+}
+
+int kvm_irq_map_gsi(struct kvm *kvm,
+		    struct kvm_kernel_irq_routing_entry *entries, int gsi)
+{
+	entries->gsi = gsi;
+	entries->type = KVM_IRQ_ROUTING_IRQCHIP;
+	entries->set = xics_set_irq;
+	entries->irqchip.irqchip = 0;
+	entries->irqchip.pin = gsi;
+	return 1;
+}
+
+int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
+{
+	return pin;
+}
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
new file mode 100644
index 000000000..56ea44f98
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _KVM_PPC_BOOK3S_XICS_H
+#define _KVM_PPC_BOOK3S_XICS_H
+
+/*
+ * We use a two-level tree to store interrupt source information.
+ * There are up to 1024 ICS nodes, each of which can represent
+ * 1024 sources.
+ */
+#define KVMPPC_XICS_MAX_ICS_ID	1023
+#define KVMPPC_XICS_ICS_SHIFT	10
+#define KVMPPC_XICS_IRQ_PER_ICS	(1 << KVMPPC_XICS_ICS_SHIFT)
+#define KVMPPC_XICS_SRC_MASK	(KVMPPC_XICS_IRQ_PER_ICS - 1)
+
+/*
+ * Interrupt source numbers below this are reserved, for example
+ * 0 is "no interrupt", and 2 is used for IPIs.
+ */
+#define KVMPPC_XICS_FIRST_IRQ	16
+#define KVMPPC_XICS_NR_IRQS	((KVMPPC_XICS_MAX_ICS_ID + 1) * \
+				 KVMPPC_XICS_IRQ_PER_ICS)
+
+/* Priority value to use for disabling an interrupt */
+#define MASKED	0xff
+
+/* State for one irq source */
+struct ics_irq_state {
+	u32 number;
+	u32 server;
+	u8  priority;
+	u8  saved_priority;
+	u8  resend;
+	u8  masked_pending;
+	u8  asserted; /* Only for LSI */
+	u8  exists;
+};
+
+/* Atomic ICP state, updated with a single compare & swap */
+union kvmppc_icp_state {
+	unsigned long raw;
+	struct {
+		u8 out_ee:1;
+		u8 need_resend:1;
+		u8 cppr;
+		u8 mfrr;
+		u8 pending_pri;
+		u32 xisr;
+	};
+};
+
+/* One bit per ICS */
+#define ICP_RESEND_MAP_SIZE	(KVMPPC_XICS_MAX_ICS_ID / BITS_PER_LONG + 1)
+
+struct kvmppc_icp {
+	struct kvm_vcpu *vcpu;
+	unsigned long server_num;
+	union kvmppc_icp_state state;
+	unsigned long resend_map[ICP_RESEND_MAP_SIZE];
+
+	/* Real mode might find something too hard, here's the action
+	 * it might request from virtual mode
+	 */
+#define XICS_RM_KICK_VCPU	0x1
+#define XICS_RM_CHECK_RESEND	0x2
+#define XICS_RM_REJECT		0x4
+#define XICS_RM_NOTIFY_EOI	0x8
+	u32 rm_action;
+	struct kvm_vcpu *rm_kick_target;
+	struct kvmppc_icp *rm_resend_icp;
+	u32  rm_reject;
+	u32  rm_eoied_irq;
+
+	/* Counters for each reason we exited real mode */
+	unsigned long n_rm_kick_vcpu;
+	unsigned long n_rm_check_resend;
+	unsigned long n_rm_reject;
+	unsigned long n_rm_notify_eoi;
+	/* Counters for handling ICP processing in real mode */
+	unsigned long n_check_resend;
+	unsigned long n_reject;
+
+	/* Debug stuff for real mode */
+	union kvmppc_icp_state rm_dbgstate;
+	struct kvm_vcpu *rm_dbgtgt;
+};
+
+struct kvmppc_ics {
+	arch_spinlock_t lock;
+	u16 icsid;
+	struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS];
+};
+
+struct kvmppc_xics {
+	struct kvm *kvm;
+	struct kvm_device *dev;
+	struct dentry *dentry;
+	u32 max_icsid;
+	bool real_mode;
+	bool real_mode_dbg;
+	u32 err_noics;
+	u32 err_noicp;
+	struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1];
+};
+
+static inline struct kvmppc_icp *kvmppc_xics_find_server(struct kvm *kvm,
+							 u32 nr)
+{
+	struct kvm_vcpu *vcpu = NULL;
+	int i;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (vcpu->arch.icp && nr == vcpu->arch.icp->server_num)
+			return vcpu->arch.icp;
+	}
+	return NULL;
+}
+
+static inline struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics,
+						      u32 irq, u16 *source)
+{
+	u32 icsid = irq >> KVMPPC_XICS_ICS_SHIFT;
+	u16 src = irq & KVMPPC_XICS_SRC_MASK;
+	struct kvmppc_ics *ics;
+
+	if (source)
+		*source = src;
+	if (icsid > KVMPPC_XICS_MAX_ICS_ID)
+		return NULL;
+	ics = xics->ics[icsid];
+	if (!ics)
+		return NULL;
+	return ics;
+}
+
+
+#endif /* _KVM_PPC_BOOK3S_XICS_H */
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
new file mode 100644
index 000000000..6c1316a15
--- /dev/null
+++ b/arch/powerpc/kvm/booke.c
@@ -0,0 +1,2160 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2007
+ * Copyright 2010-2011 Freescale Semiconductor, Inc.
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ *          Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+ *          Scott Wood <scottwood@freescale.com>
+ *          Varun Sethi <varun.sethi@freescale.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kvm_host.h>
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/fs.h>
+
+#include <asm/cputable.h>
+#include <asm/uaccess.h>
+#include <asm/kvm_ppc.h>
+#include <asm/cacheflush.h>
+#include <asm/dbell.h>
+#include <asm/hw_irq.h>
+#include <asm/irq.h>
+#include <asm/time.h>
+
+#include "timing.h"
+#include "booke.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace_booke.h"
+
+unsigned long kvmppc_booke_handlers;
+
+#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
+#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+
+struct kvm_stats_debugfs_item debugfs_entries[] = {
+	{ "mmio",       VCPU_STAT(mmio_exits) },
+	{ "sig",        VCPU_STAT(signal_exits) },
+	{ "itlb_r",     VCPU_STAT(itlb_real_miss_exits) },
+	{ "itlb_v",     VCPU_STAT(itlb_virt_miss_exits) },
+	{ "dtlb_r",     VCPU_STAT(dtlb_real_miss_exits) },
+	{ "dtlb_v",     VCPU_STAT(dtlb_virt_miss_exits) },
+	{ "sysc",       VCPU_STAT(syscall_exits) },
+	{ "isi",        VCPU_STAT(isi_exits) },
+	{ "dsi",        VCPU_STAT(dsi_exits) },
+	{ "inst_emu",   VCPU_STAT(emulated_inst_exits) },
+	{ "dec",        VCPU_STAT(dec_exits) },
+	{ "ext_intr",   VCPU_STAT(ext_intr_exits) },
+	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
+	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
+	{ "doorbell", VCPU_STAT(dbell_exits) },
+	{ "guest doorbell", VCPU_STAT(gdbell_exits) },
+	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
+	{ NULL }
+};
+
+/* TODO: use vcpu_printf() */
+void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu)
+{
+	int i;
+
+	printk("pc:   %08lx msr:  %08llx\n", vcpu->arch.pc, vcpu->arch.shared->msr);
+	printk("lr:   %08lx ctr:  %08lx\n", vcpu->arch.lr, vcpu->arch.ctr);
+	printk("srr0: %08llx srr1: %08llx\n", vcpu->arch.shared->srr0,
+					    vcpu->arch.shared->srr1);
+
+	printk("exceptions: %08lx\n", vcpu->arch.pending_exceptions);
+
+	for (i = 0; i < 32; i += 4) {
+		printk("gpr%02d: %08lx %08lx %08lx %08lx\n", i,
+		       kvmppc_get_gpr(vcpu, i),
+		       kvmppc_get_gpr(vcpu, i+1),
+		       kvmppc_get_gpr(vcpu, i+2),
+		       kvmppc_get_gpr(vcpu, i+3));
+	}
+}
+
+#ifdef CONFIG_SPE
+void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu)
+{
+	preempt_disable();
+	enable_kernel_spe();
+	kvmppc_save_guest_spe(vcpu);
+	vcpu->arch.shadow_msr &= ~MSR_SPE;
+	preempt_enable();
+}
+
+static void kvmppc_vcpu_enable_spe(struct kvm_vcpu *vcpu)
+{
+	preempt_disable();
+	enable_kernel_spe();
+	kvmppc_load_guest_spe(vcpu);
+	vcpu->arch.shadow_msr |= MSR_SPE;
+	preempt_enable();
+}
+
+static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.shared->msr & MSR_SPE) {
+		if (!(vcpu->arch.shadow_msr & MSR_SPE))
+			kvmppc_vcpu_enable_spe(vcpu);
+	} else if (vcpu->arch.shadow_msr & MSR_SPE) {
+		kvmppc_vcpu_disable_spe(vcpu);
+	}
+}
+#else
+static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu)
+{
+}
+#endif
+
+/*
+ * Load up guest vcpu FP state if it's needed.
+ * It also set the MSR_FP in thread so that host know
+ * we're holding FPU, and then host can help to save
+ * guest vcpu FP state if other threads require to use FPU.
+ * This simulates an FP unavailable fault.
+ *
+ * It requires to be called with preemption disabled.
+ */
+static inline void kvmppc_load_guest_fp(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_PPC_FPU
+	if (!(current->thread.regs->msr & MSR_FP)) {
+		enable_kernel_fp();
+		load_fp_state(&vcpu->arch.fp);
+		current->thread.fp_save_area = &vcpu->arch.fp;
+		current->thread.regs->msr |= MSR_FP;
+	}
+#endif
+}
+
+/*
+ * Save guest vcpu FP state into thread.
+ * It requires to be called with preemption disabled.
+ */
+static inline void kvmppc_save_guest_fp(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_PPC_FPU
+	if (current->thread.regs->msr & MSR_FP)
+		giveup_fpu(current);
+	current->thread.fp_save_area = NULL;
+#endif
+}
+
+static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu)
+{
+#if defined(CONFIG_PPC_FPU) && !defined(CONFIG_KVM_BOOKE_HV)
+	/* We always treat the FP bit as enabled from the host
+	   perspective, so only need to adjust the shadow MSR */
+	vcpu->arch.shadow_msr &= ~MSR_FP;
+	vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_FP;
+#endif
+}
+
+/*
+ * Simulate AltiVec unavailable fault to load guest state
+ * from thread to AltiVec unit.
+ * It requires to be called with preemption disabled.
+ */
+static inline void kvmppc_load_guest_altivec(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_ALTIVEC
+	if (cpu_has_feature(CPU_FTR_ALTIVEC)) {
+		if (!(current->thread.regs->msr & MSR_VEC)) {
+			enable_kernel_altivec();
+			load_vr_state(&vcpu->arch.vr);
+			current->thread.vr_save_area = &vcpu->arch.vr;
+			current->thread.regs->msr |= MSR_VEC;
+		}
+	}
+#endif
+}
+
+/*
+ * Save guest vcpu AltiVec state into thread.
+ * It requires to be called with preemption disabled.
+ */
+static inline void kvmppc_save_guest_altivec(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_ALTIVEC
+	if (cpu_has_feature(CPU_FTR_ALTIVEC)) {
+		if (current->thread.regs->msr & MSR_VEC)
+			giveup_altivec(current);
+		current->thread.vr_save_area = NULL;
+	}
+#endif
+}
+
+static void kvmppc_vcpu_sync_debug(struct kvm_vcpu *vcpu)
+{
+	/* Synchronize guest's desire to get debug interrupts into shadow MSR */
+#ifndef CONFIG_KVM_BOOKE_HV
+	vcpu->arch.shadow_msr &= ~MSR_DE;
+	vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_DE;
+#endif
+
+	/* Force enable debug interrupts when user space wants to debug */
+	if (vcpu->guest_debug) {
+#ifdef CONFIG_KVM_BOOKE_HV
+		/*
+		 * Since there is no shadow MSR, sync MSR_DE into the guest
+		 * visible MSR.
+		 */
+		vcpu->arch.shared->msr |= MSR_DE;
+#else
+		vcpu->arch.shadow_msr |= MSR_DE;
+		vcpu->arch.shared->msr &= ~MSR_DE;
+#endif
+	}
+}
+
+/*
+ * Helper function for "full" MSR writes.  No need to call this if only
+ * EE/CE/ME/DE/RI are changing.
+ */
+void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
+{
+	u32 old_msr = vcpu->arch.shared->msr;
+
+#ifdef CONFIG_KVM_BOOKE_HV
+	new_msr |= MSR_GS;
+#endif
+
+	vcpu->arch.shared->msr = new_msr;
+
+	kvmppc_mmu_msr_notify(vcpu, old_msr);
+	kvmppc_vcpu_sync_spe(vcpu);
+	kvmppc_vcpu_sync_fpu(vcpu);
+	kvmppc_vcpu_sync_debug(vcpu);
+}
+
+static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu,
+                                       unsigned int priority)
+{
+	trace_kvm_booke_queue_irqprio(vcpu, priority);
+	set_bit(priority, &vcpu->arch.pending_exceptions);
+}
+
+void kvmppc_core_queue_dtlb_miss(struct kvm_vcpu *vcpu,
+				 ulong dear_flags, ulong esr_flags)
+{
+	vcpu->arch.queued_dear = dear_flags;
+	vcpu->arch.queued_esr = esr_flags;
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DTLB_MISS);
+}
+
+void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu,
+				    ulong dear_flags, ulong esr_flags)
+{
+	vcpu->arch.queued_dear = dear_flags;
+	vcpu->arch.queued_esr = esr_flags;
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DATA_STORAGE);
+}
+
+void kvmppc_core_queue_itlb_miss(struct kvm_vcpu *vcpu)
+{
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ITLB_MISS);
+}
+
+void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, ulong esr_flags)
+{
+	vcpu->arch.queued_esr = esr_flags;
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE);
+}
+
+static void kvmppc_core_queue_alignment(struct kvm_vcpu *vcpu, ulong dear_flags,
+					ulong esr_flags)
+{
+	vcpu->arch.queued_dear = dear_flags;
+	vcpu->arch.queued_esr = esr_flags;
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ALIGNMENT);
+}
+
+void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong esr_flags)
+{
+	vcpu->arch.queued_esr = esr_flags;
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM);
+}
+
+void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu)
+{
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DECREMENTER);
+}
+
+int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu)
+{
+	return test_bit(BOOKE_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions);
+}
+
+void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu)
+{
+	clear_bit(BOOKE_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions);
+}
+
+void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
+                                struct kvm_interrupt *irq)
+{
+	unsigned int prio = BOOKE_IRQPRIO_EXTERNAL;
+
+	if (irq->irq == KVM_INTERRUPT_SET_LEVEL)
+		prio = BOOKE_IRQPRIO_EXTERNAL_LEVEL;
+
+	kvmppc_booke_queue_irqprio(vcpu, prio);
+}
+
+void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu)
+{
+	clear_bit(BOOKE_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
+	clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
+}
+
+static void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu)
+{
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG);
+}
+
+static void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu)
+{
+	clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions);
+}
+
+void kvmppc_core_queue_debug(struct kvm_vcpu *vcpu)
+{
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DEBUG);
+}
+
+void kvmppc_core_dequeue_debug(struct kvm_vcpu *vcpu)
+{
+	clear_bit(BOOKE_IRQPRIO_DEBUG, &vcpu->arch.pending_exceptions);
+}
+
+static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
+{
+	kvmppc_set_srr0(vcpu, srr0);
+	kvmppc_set_srr1(vcpu, srr1);
+}
+
+static void set_guest_csrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
+{
+	vcpu->arch.csrr0 = srr0;
+	vcpu->arch.csrr1 = srr1;
+}
+
+static void set_guest_dsrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
+{
+	if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC)) {
+		vcpu->arch.dsrr0 = srr0;
+		vcpu->arch.dsrr1 = srr1;
+	} else {
+		set_guest_csrr(vcpu, srr0, srr1);
+	}
+}
+
+static void set_guest_mcsrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
+{
+	vcpu->arch.mcsrr0 = srr0;
+	vcpu->arch.mcsrr1 = srr1;
+}
+
+/* Deliver the interrupt of the corresponding priority, if possible. */
+static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
+                                        unsigned int priority)
+{
+	int allowed = 0;
+	ulong msr_mask = 0;
+	bool update_esr = false, update_dear = false, update_epr = false;
+	ulong crit_raw = vcpu->arch.shared->critical;
+	ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
+	bool crit;
+	bool keep_irq = false;
+	enum int_class int_class;
+	ulong new_msr = vcpu->arch.shared->msr;
+
+	/* Truncate crit indicators in 32 bit mode */
+	if (!(vcpu->arch.shared->msr & MSR_SF)) {
+		crit_raw &= 0xffffffff;
+		crit_r1 &= 0xffffffff;
+	}
+
+	/* Critical section when crit == r1 */
+	crit = (crit_raw == crit_r1);
+	/* ... and we're in supervisor mode */
+	crit = crit && !(vcpu->arch.shared->msr & MSR_PR);
+
+	if (priority == BOOKE_IRQPRIO_EXTERNAL_LEVEL) {
+		priority = BOOKE_IRQPRIO_EXTERNAL;
+		keep_irq = true;
+	}
+
+	if ((priority == BOOKE_IRQPRIO_EXTERNAL) && vcpu->arch.epr_flags)
+		update_epr = true;
+
+	switch (priority) {
+	case BOOKE_IRQPRIO_DTLB_MISS:
+	case BOOKE_IRQPRIO_DATA_STORAGE:
+	case BOOKE_IRQPRIO_ALIGNMENT:
+		update_dear = true;
+		/* fall through */
+	case BOOKE_IRQPRIO_INST_STORAGE:
+	case BOOKE_IRQPRIO_PROGRAM:
+		update_esr = true;
+		/* fall through */
+	case BOOKE_IRQPRIO_ITLB_MISS:
+	case BOOKE_IRQPRIO_SYSCALL:
+	case BOOKE_IRQPRIO_FP_UNAVAIL:
+#ifdef CONFIG_SPE_POSSIBLE
+	case BOOKE_IRQPRIO_SPE_UNAVAIL:
+	case BOOKE_IRQPRIO_SPE_FP_DATA:
+	case BOOKE_IRQPRIO_SPE_FP_ROUND:
+#endif
+#ifdef CONFIG_ALTIVEC
+	case BOOKE_IRQPRIO_ALTIVEC_UNAVAIL:
+	case BOOKE_IRQPRIO_ALTIVEC_ASSIST:
+#endif
+	case BOOKE_IRQPRIO_AP_UNAVAIL:
+		allowed = 1;
+		msr_mask = MSR_CE | MSR_ME | MSR_DE;
+		int_class = INT_CLASS_NONCRIT;
+		break;
+	case BOOKE_IRQPRIO_WATCHDOG:
+	case BOOKE_IRQPRIO_CRITICAL:
+	case BOOKE_IRQPRIO_DBELL_CRIT:
+		allowed = vcpu->arch.shared->msr & MSR_CE;
+		allowed = allowed && !crit;
+		msr_mask = MSR_ME;
+		int_class = INT_CLASS_CRIT;
+		break;
+	case BOOKE_IRQPRIO_MACHINE_CHECK:
+		allowed = vcpu->arch.shared->msr & MSR_ME;
+		allowed = allowed && !crit;
+		int_class = INT_CLASS_MC;
+		break;
+	case BOOKE_IRQPRIO_DECREMENTER:
+	case BOOKE_IRQPRIO_FIT:
+		keep_irq = true;
+		/* fall through */
+	case BOOKE_IRQPRIO_EXTERNAL:
+	case BOOKE_IRQPRIO_DBELL:
+		allowed = vcpu->arch.shared->msr & MSR_EE;
+		allowed = allowed && !crit;
+		msr_mask = MSR_CE | MSR_ME | MSR_DE;
+		int_class = INT_CLASS_NONCRIT;
+		break;
+	case BOOKE_IRQPRIO_DEBUG:
+		allowed = vcpu->arch.shared->msr & MSR_DE;
+		allowed = allowed && !crit;
+		msr_mask = MSR_ME;
+		if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC))
+			int_class = INT_CLASS_DBG;
+		else
+			int_class = INT_CLASS_CRIT;
+
+		break;
+	}
+
+	if (allowed) {
+		switch (int_class) {
+		case INT_CLASS_NONCRIT:
+			set_guest_srr(vcpu, vcpu->arch.pc,
+				      vcpu->arch.shared->msr);
+			break;
+		case INT_CLASS_CRIT:
+			set_guest_csrr(vcpu, vcpu->arch.pc,
+				       vcpu->arch.shared->msr);
+			break;
+		case INT_CLASS_DBG:
+			set_guest_dsrr(vcpu, vcpu->arch.pc,
+				       vcpu->arch.shared->msr);
+			break;
+		case INT_CLASS_MC:
+			set_guest_mcsrr(vcpu, vcpu->arch.pc,
+					vcpu->arch.shared->msr);
+			break;
+		}
+
+		vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority];
+		if (update_esr == true)
+			kvmppc_set_esr(vcpu, vcpu->arch.queued_esr);
+		if (update_dear == true)
+			kvmppc_set_dar(vcpu, vcpu->arch.queued_dear);
+		if (update_epr == true) {
+			if (vcpu->arch.epr_flags & KVMPPC_EPR_USER)
+				kvm_make_request(KVM_REQ_EPR_EXIT, vcpu);
+			else if (vcpu->arch.epr_flags & KVMPPC_EPR_KERNEL) {
+				BUG_ON(vcpu->arch.irq_type != KVMPPC_IRQ_MPIC);
+				kvmppc_mpic_set_epr(vcpu);
+			}
+		}
+
+		new_msr &= msr_mask;
+#if defined(CONFIG_64BIT)
+		if (vcpu->arch.epcr & SPRN_EPCR_ICM)
+			new_msr |= MSR_CM;
+#endif
+		kvmppc_set_msr(vcpu, new_msr);
+
+		if (!keep_irq)
+			clear_bit(priority, &vcpu->arch.pending_exceptions);
+	}
+
+#ifdef CONFIG_KVM_BOOKE_HV
+	/*
+	 * If an interrupt is pending but masked, raise a guest doorbell
+	 * so that we are notified when the guest enables the relevant
+	 * MSR bit.
+	 */
+	if (vcpu->arch.pending_exceptions & BOOKE_IRQMASK_EE)
+		kvmppc_set_pending_interrupt(vcpu, INT_CLASS_NONCRIT);
+	if (vcpu->arch.pending_exceptions & BOOKE_IRQMASK_CE)
+		kvmppc_set_pending_interrupt(vcpu, INT_CLASS_CRIT);
+	if (vcpu->arch.pending_exceptions & BOOKE_IRQPRIO_MACHINE_CHECK)
+		kvmppc_set_pending_interrupt(vcpu, INT_CLASS_MC);
+#endif
+
+	return allowed;
+}
+
+/*
+ * Return the number of jiffies until the next timeout.  If the timeout is
+ * longer than the NEXT_TIMER_MAX_DELTA, then return NEXT_TIMER_MAX_DELTA
+ * because the larger value can break the timer APIs.
+ */
+static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu)
+{
+	u64 tb, wdt_tb, wdt_ticks = 0;
+	u64 nr_jiffies = 0;
+	u32 period = TCR_GET_WP(vcpu->arch.tcr);
+
+	wdt_tb = 1ULL << (63 - period);
+	tb = get_tb();
+	/*
+	 * The watchdog timeout will hapeen when TB bit corresponding
+	 * to watchdog will toggle from 0 to 1.
+	 */
+	if (tb & wdt_tb)
+		wdt_ticks = wdt_tb;
+
+	wdt_ticks += wdt_tb - (tb & (wdt_tb - 1));
+
+	/* Convert timebase ticks to jiffies */
+	nr_jiffies = wdt_ticks;
+
+	if (do_div(nr_jiffies, tb_ticks_per_jiffy))
+		nr_jiffies++;
+
+	return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA);
+}
+
+static void arm_next_watchdog(struct kvm_vcpu *vcpu)
+{
+	unsigned long nr_jiffies;
+	unsigned long flags;
+
+	/*
+	 * If TSR_ENW and TSR_WIS are not set then no need to exit to
+	 * userspace, so clear the KVM_REQ_WATCHDOG request.
+	 */
+	if ((vcpu->arch.tsr & (TSR_ENW | TSR_WIS)) != (TSR_ENW | TSR_WIS))
+		clear_bit(KVM_REQ_WATCHDOG, &vcpu->requests);
+
+	spin_lock_irqsave(&vcpu->arch.wdt_lock, flags);
+	nr_jiffies = watchdog_next_timeout(vcpu);
+	/*
+	 * If the number of jiffies of watchdog timer >= NEXT_TIMER_MAX_DELTA
+	 * then do not run the watchdog timer as this can break timer APIs.
+	 */
+	if (nr_jiffies < NEXT_TIMER_MAX_DELTA)
+		mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
+	else
+		del_timer(&vcpu->arch.wdt_timer);
+	spin_unlock_irqrestore(&vcpu->arch.wdt_lock, flags);
+}
+
+void kvmppc_watchdog_func(unsigned long data)
+{
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
+	u32 tsr, new_tsr;
+	int final;
+
+	do {
+		new_tsr = tsr = vcpu->arch.tsr;
+		final = 0;
+
+		/* Time out event */
+		if (tsr & TSR_ENW) {
+			if (tsr & TSR_WIS)
+				final = 1;
+			else
+				new_tsr = tsr | TSR_WIS;
+		} else {
+			new_tsr = tsr | TSR_ENW;
+		}
+	} while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
+
+	if (new_tsr & TSR_WIS) {
+		smp_wmb();
+		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+		kvm_vcpu_kick(vcpu);
+	}
+
+	/*
+	 * If this is final watchdog expiry and some action is required
+	 * then exit to userspace.
+	 */
+	if (final && (vcpu->arch.tcr & TCR_WRC_MASK) &&
+	    vcpu->arch.watchdog_enabled) {
+		smp_wmb();
+		kvm_make_request(KVM_REQ_WATCHDOG, vcpu);
+		kvm_vcpu_kick(vcpu);
+	}
+
+	/*
+	 * Stop running the watchdog timer after final expiration to
+	 * prevent the host from being flooded with timers if the
+	 * guest sets a short period.
+	 * Timers will resume when TSR/TCR is updated next time.
+	 */
+	if (!final)
+		arm_next_watchdog(vcpu);
+}
+
+static void update_timer_ints(struct kvm_vcpu *vcpu)
+{
+	if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS))
+		kvmppc_core_queue_dec(vcpu);
+	else
+		kvmppc_core_dequeue_dec(vcpu);
+
+	if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS))
+		kvmppc_core_queue_watchdog(vcpu);
+	else
+		kvmppc_core_dequeue_watchdog(vcpu);
+}
+
+static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
+{
+	unsigned long *pending = &vcpu->arch.pending_exceptions;
+	unsigned int priority;
+
+	priority = __ffs(*pending);
+	while (priority < BOOKE_IRQPRIO_MAX) {
+		if (kvmppc_booke_irqprio_deliver(vcpu, priority))
+			break;
+
+		priority = find_next_bit(pending,
+		                         BITS_PER_BYTE * sizeof(*pending),
+		                         priority + 1);
+	}
+
+	/* Tell the guest about our interrupt status */
+	vcpu->arch.shared->int_pending = !!*pending;
+}
+
+/* Check pending exceptions and deliver one, if possible. */
+int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
+{
+	int r = 0;
+	WARN_ON_ONCE(!irqs_disabled());
+
+	kvmppc_core_check_exceptions(vcpu);
+
+	if (vcpu->requests) {
+		/* Exception delivery raised request; start over */
+		return 1;
+	}
+
+	if (vcpu->arch.shared->msr & MSR_WE) {
+		local_irq_enable();
+		kvm_vcpu_block(vcpu);
+		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+		hard_irq_disable();
+
+		kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
+		r = 1;
+	};
+
+	return r;
+}
+
+int kvmppc_core_check_requests(struct kvm_vcpu *vcpu)
+{
+	int r = 1; /* Indicate we want to get back into the guest */
+
+	if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu))
+		update_timer_ints(vcpu);
+#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
+	if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
+		kvmppc_core_flush_tlb(vcpu);
+#endif
+
+	if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu)) {
+		vcpu->run->exit_reason = KVM_EXIT_WATCHDOG;
+		r = 0;
+	}
+
+	if (kvm_check_request(KVM_REQ_EPR_EXIT, vcpu)) {
+		vcpu->run->epr.epr = 0;
+		vcpu->arch.epr_needed = true;
+		vcpu->run->exit_reason = KVM_EXIT_EPR;
+		r = 0;
+	}
+
+	return r;
+}
+
+int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+	int ret, s;
+	struct debug_reg debug;
+
+	if (!vcpu->arch.sane) {
+		kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		return -EINVAL;
+	}
+
+	s = kvmppc_prepare_to_enter(vcpu);
+	if (s <= 0) {
+		ret = s;
+		goto out;
+	}
+	/* interrupts now hard-disabled */
+
+#ifdef CONFIG_PPC_FPU
+	/* Save userspace FPU state in stack */
+	enable_kernel_fp();
+
+	/*
+	 * Since we can't trap on MSR_FP in GS-mode, we consider the guest
+	 * as always using the FPU.
+	 */
+	kvmppc_load_guest_fp(vcpu);
+#endif
+
+#ifdef CONFIG_ALTIVEC
+	/* Save userspace AltiVec state in stack */
+	if (cpu_has_feature(CPU_FTR_ALTIVEC))
+		enable_kernel_altivec();
+	/*
+	 * Since we can't trap on MSR_VEC in GS-mode, we consider the guest
+	 * as always using the AltiVec.
+	 */
+	kvmppc_load_guest_altivec(vcpu);
+#endif
+
+	/* Switch to guest debug context */
+	debug = vcpu->arch.dbg_reg;
+	switch_booke_debug_regs(&debug);
+	debug = current->thread.debug;
+	current->thread.debug = vcpu->arch.dbg_reg;
+
+	vcpu->arch.pgdir = current->mm->pgd;
+	kvmppc_fix_ee_before_entry();
+
+	ret = __kvmppc_vcpu_run(kvm_run, vcpu);
+
+	/* No need for kvm_guest_exit. It's done in handle_exit.
+	   We also get here with interrupts enabled. */
+
+	/* Switch back to user space debug context */
+	switch_booke_debug_regs(&debug);
+	current->thread.debug = debug;
+
+#ifdef CONFIG_PPC_FPU
+	kvmppc_save_guest_fp(vcpu);
+#endif
+
+#ifdef CONFIG_ALTIVEC
+	kvmppc_save_guest_altivec(vcpu);
+#endif
+
+out:
+	vcpu->mode = OUTSIDE_GUEST_MODE;
+	return ret;
+}
+
+static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+	enum emulation_result er;
+
+	er = kvmppc_emulate_instruction(run, vcpu);
+	switch (er) {
+	case EMULATE_DONE:
+		/* don't overwrite subtypes, just account kvm_stats */
+		kvmppc_account_exit_stat(vcpu, EMULATED_INST_EXITS);
+		/* Future optimization: only reload non-volatiles if
+		 * they were actually modified by emulation. */
+		return RESUME_GUEST_NV;
+
+	case EMULATE_AGAIN:
+		return RESUME_GUEST;
+
+	case EMULATE_FAIL:
+		printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
+		       __func__, vcpu->arch.pc, vcpu->arch.last_inst);
+		/* For debugging, encode the failing instruction and
+		 * report it to userspace. */
+		run->hw.hardware_exit_reason = ~0ULL << 32;
+		run->hw.hardware_exit_reason |= vcpu->arch.last_inst;
+		kvmppc_core_queue_program(vcpu, ESR_PIL);
+		return RESUME_HOST;
+
+	case EMULATE_EXIT_USER:
+		return RESUME_HOST;
+
+	default:
+		BUG();
+	}
+}
+
+static int kvmppc_handle_debug(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+	struct debug_reg *dbg_reg = &(vcpu->arch.dbg_reg);
+	u32 dbsr = vcpu->arch.dbsr;
+
+	if (vcpu->guest_debug == 0) {
+		/*
+		 * Debug resources belong to Guest.
+		 * Imprecise debug event is not injected
+		 */
+		if (dbsr & DBSR_IDE) {
+			dbsr &= ~DBSR_IDE;
+			if (!dbsr)
+				return RESUME_GUEST;
+		}
+
+		if (dbsr && (vcpu->arch.shared->msr & MSR_DE) &&
+			    (vcpu->arch.dbg_reg.dbcr0 & DBCR0_IDM))
+			kvmppc_core_queue_debug(vcpu);
+
+		/* Inject a program interrupt if trap debug is not allowed */
+		if ((dbsr & DBSR_TIE) && !(vcpu->arch.shared->msr & MSR_DE))
+			kvmppc_core_queue_program(vcpu, ESR_PTR);
+
+		return RESUME_GUEST;
+	}
+
+	/*
+	 * Debug resource owned by userspace.
+	 * Clear guest dbsr (vcpu->arch.dbsr)
+	 */
+	vcpu->arch.dbsr = 0;
+	run->debug.arch.status = 0;
+	run->debug.arch.address = vcpu->arch.pc;
+
+	if (dbsr & (DBSR_IAC1 | DBSR_IAC2 | DBSR_IAC3 | DBSR_IAC4)) {
+		run->debug.arch.status |= KVMPPC_DEBUG_BREAKPOINT;
+	} else {
+		if (dbsr & (DBSR_DAC1W | DBSR_DAC2W))
+			run->debug.arch.status |= KVMPPC_DEBUG_WATCH_WRITE;
+		else if (dbsr & (DBSR_DAC1R | DBSR_DAC2R))
+			run->debug.arch.status |= KVMPPC_DEBUG_WATCH_READ;
+		if (dbsr & (DBSR_DAC1R | DBSR_DAC1W))
+			run->debug.arch.address = dbg_reg->dac1;
+		else if (dbsr & (DBSR_DAC2R | DBSR_DAC2W))
+			run->debug.arch.address = dbg_reg->dac2;
+	}
+
+	return RESUME_HOST;
+}
+
+static void kvmppc_fill_pt_regs(struct pt_regs *regs)
+{
+	ulong r1, ip, msr, lr;
+
+	asm("mr %0, 1" : "=r"(r1));
+	asm("mflr %0" : "=r"(lr));
+	asm("mfmsr %0" : "=r"(msr));
+	asm("bl 1f; 1: mflr %0" : "=r"(ip));
+
+	memset(regs, 0, sizeof(*regs));
+	regs->gpr[1] = r1;
+	regs->nip = ip;
+	regs->msr = msr;
+	regs->link = lr;
+}
+
+/*
+ * For interrupts needed to be handled by host interrupt handlers,
+ * corresponding host handler are called from here in similar way
+ * (but not exact) as they are called from low level handler
+ * (such as from arch/powerpc/kernel/head_fsl_booke.S).
+ */
+static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu,
+				     unsigned int exit_nr)
+{
+	struct pt_regs regs;
+
+	switch (exit_nr) {
+	case BOOKE_INTERRUPT_EXTERNAL:
+		kvmppc_fill_pt_regs(&regs);
+		do_IRQ(&regs);
+		break;
+	case BOOKE_INTERRUPT_DECREMENTER:
+		kvmppc_fill_pt_regs(&regs);
+		timer_interrupt(&regs);
+		break;
+#if defined(CONFIG_PPC_DOORBELL)
+	case BOOKE_INTERRUPT_DOORBELL:
+		kvmppc_fill_pt_regs(&regs);
+		doorbell_exception(&regs);
+		break;
+#endif
+	case BOOKE_INTERRUPT_MACHINE_CHECK:
+		/* FIXME */
+		break;
+	case BOOKE_INTERRUPT_PERFORMANCE_MONITOR:
+		kvmppc_fill_pt_regs(&regs);
+		performance_monitor_exception(&regs);
+		break;
+	case BOOKE_INTERRUPT_WATCHDOG:
+		kvmppc_fill_pt_regs(&regs);
+#ifdef CONFIG_BOOKE_WDT
+		WatchdogException(&regs);
+#else
+		unknown_exception(&regs);
+#endif
+		break;
+	case BOOKE_INTERRUPT_CRITICAL:
+		unknown_exception(&regs);
+		break;
+	case BOOKE_INTERRUPT_DEBUG:
+		/* Save DBSR before preemption is enabled */
+		vcpu->arch.dbsr = mfspr(SPRN_DBSR);
+		kvmppc_clear_dbsr();
+		break;
+	}
+}
+
+static int kvmppc_resume_inst_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
+				  enum emulation_result emulated, u32 last_inst)
+{
+	switch (emulated) {
+	case EMULATE_AGAIN:
+		return RESUME_GUEST;
+
+	case EMULATE_FAIL:
+		pr_debug("%s: load instruction from guest address %lx failed\n",
+		       __func__, vcpu->arch.pc);
+		/* For debugging, encode the failing instruction and
+		 * report it to userspace. */
+		run->hw.hardware_exit_reason = ~0ULL << 32;
+		run->hw.hardware_exit_reason |= last_inst;
+		kvmppc_core_queue_program(vcpu, ESR_PIL);
+		return RESUME_HOST;
+
+	default:
+		BUG();
+	}
+}
+
+/**
+ * kvmppc_handle_exit
+ *
+ * Return value is in the form (errcode<<2 | RESUME_FLAG_HOST | RESUME_FLAG_NV)
+ */
+int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                       unsigned int exit_nr)
+{
+	int r = RESUME_HOST;
+	int s;
+	int idx;
+	u32 last_inst = KVM_INST_FETCH_FAILED;
+	enum emulation_result emulated = EMULATE_DONE;
+
+	/* update before a new last_exit_type is rewritten */
+	kvmppc_update_timing_stats(vcpu);
+
+	/* restart interrupts if they were meant for the host */
+	kvmppc_restart_interrupt(vcpu, exit_nr);
+
+	/*
+	 * get last instruction before beeing preempted
+	 * TODO: for e6500 check also BOOKE_INTERRUPT_LRAT_ERROR & ESR_DATA
+	 */
+	switch (exit_nr) {
+	case BOOKE_INTERRUPT_DATA_STORAGE:
+	case BOOKE_INTERRUPT_DTLB_MISS:
+	case BOOKE_INTERRUPT_HV_PRIV:
+		emulated = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
+		break;
+	case BOOKE_INTERRUPT_PROGRAM:
+		/* SW breakpoints arrive as illegal instructions on HV */
+		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+			emulated = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
+		break;
+	default:
+		break;
+	}
+
+	local_irq_enable();
+
+	trace_kvm_exit(exit_nr, vcpu);
+	kvm_guest_exit();
+
+	run->exit_reason = KVM_EXIT_UNKNOWN;
+	run->ready_for_interrupt_injection = 1;
+
+	if (emulated != EMULATE_DONE) {
+		r = kvmppc_resume_inst_load(run, vcpu, emulated, last_inst);
+		goto out;
+	}
+
+	switch (exit_nr) {
+	case BOOKE_INTERRUPT_MACHINE_CHECK:
+		printk("MACHINE CHECK: %lx\n", mfspr(SPRN_MCSR));
+		kvmppc_dump_vcpu(vcpu);
+		/* For debugging, send invalid exit reason to user space */
+		run->hw.hardware_exit_reason = ~1ULL << 32;
+		run->hw.hardware_exit_reason |= mfspr(SPRN_MCSR);
+		r = RESUME_HOST;
+		break;
+
+	case BOOKE_INTERRUPT_EXTERNAL:
+		kvmppc_account_exit(vcpu, EXT_INTR_EXITS);
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_DECREMENTER:
+		kvmppc_account_exit(vcpu, DEC_EXITS);
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_WATCHDOG:
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_DOORBELL:
+		kvmppc_account_exit(vcpu, DBELL_EXITS);
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_GUEST_DBELL_CRIT:
+		kvmppc_account_exit(vcpu, GDBELL_EXITS);
+
+		/*
+		 * We are here because there is a pending guest interrupt
+		 * which could not be delivered as MSR_CE or MSR_ME was not
+		 * set.  Once we break from here we will retry delivery.
+		 */
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_GUEST_DBELL:
+		kvmppc_account_exit(vcpu, GDBELL_EXITS);
+
+		/*
+		 * We are here because there is a pending guest interrupt
+		 * which could not be delivered as MSR_EE was not set.  Once
+		 * we break from here we will retry delivery.
+		 */
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_PERFORMANCE_MONITOR:
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_HV_PRIV:
+		r = emulation_exit(run, vcpu);
+		break;
+
+	case BOOKE_INTERRUPT_PROGRAM:
+		if ((vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) &&
+			(last_inst == KVMPPC_INST_SW_BREAKPOINT)) {
+			/*
+			 * We are here because of an SW breakpoint instr,
+			 * so lets return to host to handle.
+			 */
+			r = kvmppc_handle_debug(run, vcpu);
+			run->exit_reason = KVM_EXIT_DEBUG;
+			kvmppc_account_exit(vcpu, DEBUG_EXITS);
+			break;
+		}
+
+		if (vcpu->arch.shared->msr & (MSR_PR | MSR_GS)) {
+			/*
+			 * Program traps generated by user-level software must
+			 * be handled by the guest kernel.
+			 *
+			 * In GS mode, hypervisor privileged instructions trap
+			 * on BOOKE_INTERRUPT_HV_PRIV, not here, so these are
+			 * actual program interrupts, handled by the guest.
+			 */
+			kvmppc_core_queue_program(vcpu, vcpu->arch.fault_esr);
+			r = RESUME_GUEST;
+			kvmppc_account_exit(vcpu, USR_PR_INST);
+			break;
+		}
+
+		r = emulation_exit(run, vcpu);
+		break;
+
+	case BOOKE_INTERRUPT_FP_UNAVAIL:
+		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_FP_UNAVAIL);
+		kvmppc_account_exit(vcpu, FP_UNAVAIL);
+		r = RESUME_GUEST;
+		break;
+
+#ifdef CONFIG_SPE
+	case BOOKE_INTERRUPT_SPE_UNAVAIL: {
+		if (vcpu->arch.shared->msr & MSR_SPE)
+			kvmppc_vcpu_enable_spe(vcpu);
+		else
+			kvmppc_booke_queue_irqprio(vcpu,
+						   BOOKE_IRQPRIO_SPE_UNAVAIL);
+		r = RESUME_GUEST;
+		break;
+	}
+
+	case BOOKE_INTERRUPT_SPE_FP_DATA:
+		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_DATA);
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_SPE_FP_ROUND:
+		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_ROUND);
+		r = RESUME_GUEST;
+		break;
+#elif defined(CONFIG_SPE_POSSIBLE)
+	case BOOKE_INTERRUPT_SPE_UNAVAIL:
+		/*
+		 * Guest wants SPE, but host kernel doesn't support it.  Send
+		 * an "unimplemented operation" program check to the guest.
+		 */
+		kvmppc_core_queue_program(vcpu, ESR_PUO | ESR_SPV);
+		r = RESUME_GUEST;
+		break;
+
+	/*
+	 * These really should never happen without CONFIG_SPE,
+	 * as we should never enable the real MSR[SPE] in the guest.
+	 */
+	case BOOKE_INTERRUPT_SPE_FP_DATA:
+	case BOOKE_INTERRUPT_SPE_FP_ROUND:
+		printk(KERN_CRIT "%s: unexpected SPE interrupt %u at %08lx\n",
+		       __func__, exit_nr, vcpu->arch.pc);
+		run->hw.hardware_exit_reason = exit_nr;
+		r = RESUME_HOST;
+		break;
+#endif /* CONFIG_SPE_POSSIBLE */
+
+/*
+ * On cores with Vector category, KVM is loaded only if CONFIG_ALTIVEC,
+ * see kvmppc_core_check_processor_compat().
+ */
+#ifdef CONFIG_ALTIVEC
+	case BOOKE_INTERRUPT_ALTIVEC_UNAVAIL:
+		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ALTIVEC_UNAVAIL);
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_ALTIVEC_ASSIST:
+		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ALTIVEC_ASSIST);
+		r = RESUME_GUEST;
+		break;
+#endif
+
+	case BOOKE_INTERRUPT_DATA_STORAGE:
+		kvmppc_core_queue_data_storage(vcpu, vcpu->arch.fault_dear,
+		                               vcpu->arch.fault_esr);
+		kvmppc_account_exit(vcpu, DSI_EXITS);
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_INST_STORAGE:
+		kvmppc_core_queue_inst_storage(vcpu, vcpu->arch.fault_esr);
+		kvmppc_account_exit(vcpu, ISI_EXITS);
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_ALIGNMENT:
+		kvmppc_core_queue_alignment(vcpu, vcpu->arch.fault_dear,
+		                            vcpu->arch.fault_esr);
+		r = RESUME_GUEST;
+		break;
+
+#ifdef CONFIG_KVM_BOOKE_HV
+	case BOOKE_INTERRUPT_HV_SYSCALL:
+		if (!(vcpu->arch.shared->msr & MSR_PR)) {
+			kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
+		} else {
+			/*
+			 * hcall from guest userspace -- send privileged
+			 * instruction program check.
+			 */
+			kvmppc_core_queue_program(vcpu, ESR_PPR);
+		}
+
+		r = RESUME_GUEST;
+		break;
+#else
+	case BOOKE_INTERRUPT_SYSCALL:
+		if (!(vcpu->arch.shared->msr & MSR_PR) &&
+		    (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) {
+			/* KVM PV hypercalls */
+			kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
+			r = RESUME_GUEST;
+		} else {
+			/* Guest syscalls */
+			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SYSCALL);
+		}
+		kvmppc_account_exit(vcpu, SYSCALL_EXITS);
+		r = RESUME_GUEST;
+		break;
+#endif
+
+	case BOOKE_INTERRUPT_DTLB_MISS: {
+		unsigned long eaddr = vcpu->arch.fault_dear;
+		int gtlb_index;
+		gpa_t gpaddr;
+		gfn_t gfn;
+
+#ifdef CONFIG_KVM_E500V2
+		if (!(vcpu->arch.shared->msr & MSR_PR) &&
+		    (eaddr & PAGE_MASK) == vcpu->arch.magic_page_ea) {
+			kvmppc_map_magic(vcpu);
+			kvmppc_account_exit(vcpu, DTLB_VIRT_MISS_EXITS);
+			r = RESUME_GUEST;
+
+			break;
+		}
+#endif
+
+		/* Check the guest TLB. */
+		gtlb_index = kvmppc_mmu_dtlb_index(vcpu, eaddr);
+		if (gtlb_index < 0) {
+			/* The guest didn't have a mapping for it. */
+			kvmppc_core_queue_dtlb_miss(vcpu,
+			                            vcpu->arch.fault_dear,
+			                            vcpu->arch.fault_esr);
+			kvmppc_mmu_dtlb_miss(vcpu);
+			kvmppc_account_exit(vcpu, DTLB_REAL_MISS_EXITS);
+			r = RESUME_GUEST;
+			break;
+		}
+
+		idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+		gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr);
+		gfn = gpaddr >> PAGE_SHIFT;
+
+		if (kvm_is_visible_gfn(vcpu->kvm, gfn)) {
+			/* The guest TLB had a mapping, but the shadow TLB
+			 * didn't, and it is RAM. This could be because:
+			 * a) the entry is mapping the host kernel, or
+			 * b) the guest used a large mapping which we're faking
+			 * Either way, we need to satisfy the fault without
+			 * invoking the guest. */
+			kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index);
+			kvmppc_account_exit(vcpu, DTLB_VIRT_MISS_EXITS);
+			r = RESUME_GUEST;
+		} else {
+			/* Guest has mapped and accessed a page which is not
+			 * actually RAM. */
+			vcpu->arch.paddr_accessed = gpaddr;
+			vcpu->arch.vaddr_accessed = eaddr;
+			r = kvmppc_emulate_mmio(run, vcpu);
+			kvmppc_account_exit(vcpu, MMIO_EXITS);
+		}
+
+		srcu_read_unlock(&vcpu->kvm->srcu, idx);
+		break;
+	}
+
+	case BOOKE_INTERRUPT_ITLB_MISS: {
+		unsigned long eaddr = vcpu->arch.pc;
+		gpa_t gpaddr;
+		gfn_t gfn;
+		int gtlb_index;
+
+		r = RESUME_GUEST;
+
+		/* Check the guest TLB. */
+		gtlb_index = kvmppc_mmu_itlb_index(vcpu, eaddr);
+		if (gtlb_index < 0) {
+			/* The guest didn't have a mapping for it. */
+			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ITLB_MISS);
+			kvmppc_mmu_itlb_miss(vcpu);
+			kvmppc_account_exit(vcpu, ITLB_REAL_MISS_EXITS);
+			break;
+		}
+
+		kvmppc_account_exit(vcpu, ITLB_VIRT_MISS_EXITS);
+
+		idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+		gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr);
+		gfn = gpaddr >> PAGE_SHIFT;
+
+		if (kvm_is_visible_gfn(vcpu->kvm, gfn)) {
+			/* The guest TLB had a mapping, but the shadow TLB
+			 * didn't. This could be because:
+			 * a) the entry is mapping the host kernel, or
+			 * b) the guest used a large mapping which we're faking
+			 * Either way, we need to satisfy the fault without
+			 * invoking the guest. */
+			kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index);
+		} else {
+			/* Guest mapped and leaped at non-RAM! */
+			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_MACHINE_CHECK);
+		}
+
+		srcu_read_unlock(&vcpu->kvm->srcu, idx);
+		break;
+	}
+
+	case BOOKE_INTERRUPT_DEBUG: {
+		r = kvmppc_handle_debug(run, vcpu);
+		if (r == RESUME_HOST)
+			run->exit_reason = KVM_EXIT_DEBUG;
+		kvmppc_account_exit(vcpu, DEBUG_EXITS);
+		break;
+	}
+
+	default:
+		printk(KERN_EMERG "exit_nr %d\n", exit_nr);
+		BUG();
+	}
+
+out:
+	/*
+	 * To avoid clobbering exit_reason, only check for signals if we
+	 * aren't already exiting to userspace for some other reason.
+	 */
+	if (!(r & RESUME_HOST)) {
+		s = kvmppc_prepare_to_enter(vcpu);
+		if (s <= 0)
+			r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
+		else {
+			/* interrupts now hard-disabled */
+			kvmppc_fix_ee_before_entry();
+			kvmppc_load_guest_fp(vcpu);
+			kvmppc_load_guest_altivec(vcpu);
+		}
+	}
+
+	return r;
+}
+
+static void kvmppc_set_tsr(struct kvm_vcpu *vcpu, u32 new_tsr)
+{
+	u32 old_tsr = vcpu->arch.tsr;
+
+	vcpu->arch.tsr = new_tsr;
+
+	if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS))
+		arm_next_watchdog(vcpu);
+
+	update_timer_ints(vcpu);
+}
+
+/* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+	int i;
+	int r;
+
+	vcpu->arch.pc = 0;
+	vcpu->arch.shared->pir = vcpu->vcpu_id;
+	kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */
+	kvmppc_set_msr(vcpu, 0);
+
+#ifndef CONFIG_KVM_BOOKE_HV
+	vcpu->arch.shadow_msr = MSR_USER | MSR_IS | MSR_DS;
+	vcpu->arch.shadow_pid = 1;
+	vcpu->arch.shared->msr = 0;
+#endif
+
+	/* Eye-catching numbers so we know if the guest takes an interrupt
+	 * before it's programmed its own IVPR/IVORs. */
+	vcpu->arch.ivpr = 0x55550000;
+	for (i = 0; i < BOOKE_IRQPRIO_MAX; i++)
+		vcpu->arch.ivor[i] = 0x7700 | i * 4;
+
+	kvmppc_init_timing_stats(vcpu);
+
+	r = kvmppc_core_vcpu_setup(vcpu);
+	kvmppc_sanity_check(vcpu);
+	return r;
+}
+
+int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	/* setup watchdog timer once */
+	spin_lock_init(&vcpu->arch.wdt_lock);
+	setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func,
+		    (unsigned long)vcpu);
+
+	/*
+	 * Clear DBSR.MRR to avoid guest debug interrupt as
+	 * this is of host interest
+	 */
+	mtspr(SPRN_DBSR, DBSR_MRR);
+	return 0;
+}
+
+void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+	del_timer_sync(&vcpu->arch.wdt_timer);
+}
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+	int i;
+
+	regs->pc = vcpu->arch.pc;
+	regs->cr = kvmppc_get_cr(vcpu);
+	regs->ctr = vcpu->arch.ctr;
+	regs->lr = vcpu->arch.lr;
+	regs->xer = kvmppc_get_xer(vcpu);
+	regs->msr = vcpu->arch.shared->msr;
+	regs->srr0 = kvmppc_get_srr0(vcpu);
+	regs->srr1 = kvmppc_get_srr1(vcpu);
+	regs->pid = vcpu->arch.pid;
+	regs->sprg0 = kvmppc_get_sprg0(vcpu);
+	regs->sprg1 = kvmppc_get_sprg1(vcpu);
+	regs->sprg2 = kvmppc_get_sprg2(vcpu);
+	regs->sprg3 = kvmppc_get_sprg3(vcpu);
+	regs->sprg4 = kvmppc_get_sprg4(vcpu);
+	regs->sprg5 = kvmppc_get_sprg5(vcpu);
+	regs->sprg6 = kvmppc_get_sprg6(vcpu);
+	regs->sprg7 = kvmppc_get_sprg7(vcpu);
+
+	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
+		regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
+
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+	int i;
+
+	vcpu->arch.pc = regs->pc;
+	kvmppc_set_cr(vcpu, regs->cr);
+	vcpu->arch.ctr = regs->ctr;
+	vcpu->arch.lr = regs->lr;
+	kvmppc_set_xer(vcpu, regs->xer);
+	kvmppc_set_msr(vcpu, regs->msr);
+	kvmppc_set_srr0(vcpu, regs->srr0);
+	kvmppc_set_srr1(vcpu, regs->srr1);
+	kvmppc_set_pid(vcpu, regs->pid);
+	kvmppc_set_sprg0(vcpu, regs->sprg0);
+	kvmppc_set_sprg1(vcpu, regs->sprg1);
+	kvmppc_set_sprg2(vcpu, regs->sprg2);
+	kvmppc_set_sprg3(vcpu, regs->sprg3);
+	kvmppc_set_sprg4(vcpu, regs->sprg4);
+	kvmppc_set_sprg5(vcpu, regs->sprg5);
+	kvmppc_set_sprg6(vcpu, regs->sprg6);
+	kvmppc_set_sprg7(vcpu, regs->sprg7);
+
+	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
+		kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
+
+	return 0;
+}
+
+static void get_sregs_base(struct kvm_vcpu *vcpu,
+                           struct kvm_sregs *sregs)
+{
+	u64 tb = get_tb();
+
+	sregs->u.e.features |= KVM_SREGS_E_BASE;
+
+	sregs->u.e.csrr0 = vcpu->arch.csrr0;
+	sregs->u.e.csrr1 = vcpu->arch.csrr1;
+	sregs->u.e.mcsr = vcpu->arch.mcsr;
+	sregs->u.e.esr = kvmppc_get_esr(vcpu);
+	sregs->u.e.dear = kvmppc_get_dar(vcpu);
+	sregs->u.e.tsr = vcpu->arch.tsr;
+	sregs->u.e.tcr = vcpu->arch.tcr;
+	sregs->u.e.dec = kvmppc_get_dec(vcpu, tb);
+	sregs->u.e.tb = tb;
+	sregs->u.e.vrsave = vcpu->arch.vrsave;
+}
+
+static int set_sregs_base(struct kvm_vcpu *vcpu,
+                          struct kvm_sregs *sregs)
+{
+	if (!(sregs->u.e.features & KVM_SREGS_E_BASE))
+		return 0;
+
+	vcpu->arch.csrr0 = sregs->u.e.csrr0;
+	vcpu->arch.csrr1 = sregs->u.e.csrr1;
+	vcpu->arch.mcsr = sregs->u.e.mcsr;
+	kvmppc_set_esr(vcpu, sregs->u.e.esr);
+	kvmppc_set_dar(vcpu, sregs->u.e.dear);
+	vcpu->arch.vrsave = sregs->u.e.vrsave;
+	kvmppc_set_tcr(vcpu, sregs->u.e.tcr);
+
+	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_DEC) {
+		vcpu->arch.dec = sregs->u.e.dec;
+		kvmppc_emulate_dec(vcpu);
+	}
+
+	if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR)
+		kvmppc_set_tsr(vcpu, sregs->u.e.tsr);
+
+	return 0;
+}
+
+static void get_sregs_arch206(struct kvm_vcpu *vcpu,
+                              struct kvm_sregs *sregs)
+{
+	sregs->u.e.features |= KVM_SREGS_E_ARCH206;
+
+	sregs->u.e.pir = vcpu->vcpu_id;
+	sregs->u.e.mcsrr0 = vcpu->arch.mcsrr0;
+	sregs->u.e.mcsrr1 = vcpu->arch.mcsrr1;
+	sregs->u.e.decar = vcpu->arch.decar;
+	sregs->u.e.ivpr = vcpu->arch.ivpr;
+}
+
+static int set_sregs_arch206(struct kvm_vcpu *vcpu,
+                             struct kvm_sregs *sregs)
+{
+	if (!(sregs->u.e.features & KVM_SREGS_E_ARCH206))
+		return 0;
+
+	if (sregs->u.e.pir != vcpu->vcpu_id)
+		return -EINVAL;
+
+	vcpu->arch.mcsrr0 = sregs->u.e.mcsrr0;
+	vcpu->arch.mcsrr1 = sregs->u.e.mcsrr1;
+	vcpu->arch.decar = sregs->u.e.decar;
+	vcpu->arch.ivpr = sregs->u.e.ivpr;
+
+	return 0;
+}
+
+int kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	sregs->u.e.features |= KVM_SREGS_E_IVOR;
+
+	sregs->u.e.ivor_low[0] = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL];
+	sregs->u.e.ivor_low[1] = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK];
+	sregs->u.e.ivor_low[2] = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE];
+	sregs->u.e.ivor_low[3] = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE];
+	sregs->u.e.ivor_low[4] = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL];
+	sregs->u.e.ivor_low[5] = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT];
+	sregs->u.e.ivor_low[6] = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM];
+	sregs->u.e.ivor_low[7] = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL];
+	sregs->u.e.ivor_low[8] = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL];
+	sregs->u.e.ivor_low[9] = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL];
+	sregs->u.e.ivor_low[10] = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER];
+	sregs->u.e.ivor_low[11] = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT];
+	sregs->u.e.ivor_low[12] = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG];
+	sregs->u.e.ivor_low[13] = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS];
+	sregs->u.e.ivor_low[14] = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS];
+	sregs->u.e.ivor_low[15] = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG];
+	return 0;
+}
+
+int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	if (!(sregs->u.e.features & KVM_SREGS_E_IVOR))
+		return 0;
+
+	vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = sregs->u.e.ivor_low[0];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = sregs->u.e.ivor_low[1];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = sregs->u.e.ivor_low[2];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = sregs->u.e.ivor_low[3];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = sregs->u.e.ivor_low[4];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = sregs->u.e.ivor_low[5];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = sregs->u.e.ivor_low[6];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = sregs->u.e.ivor_low[7];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = sregs->u.e.ivor_low[8];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = sregs->u.e.ivor_low[9];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = sregs->u.e.ivor_low[10];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = sregs->u.e.ivor_low[11];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = sregs->u.e.ivor_low[12];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = sregs->u.e.ivor_low[13];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = sregs->u.e.ivor_low[14];
+	vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = sregs->u.e.ivor_low[15];
+
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+	sregs->pvr = vcpu->arch.pvr;
+
+	get_sregs_base(vcpu, sregs);
+	get_sregs_arch206(vcpu, sregs);
+	return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs);
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+	int ret;
+
+	if (vcpu->arch.pvr != sregs->pvr)
+		return -EINVAL;
+
+	ret = set_sregs_base(vcpu, sregs);
+	if (ret < 0)
+		return ret;
+
+	ret = set_sregs_arch206(vcpu, sregs);
+	if (ret < 0)
+		return ret;
+
+	return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs);
+}
+
+int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
+			union kvmppc_one_reg *val)
+{
+	int r = 0;
+
+	switch (id) {
+	case KVM_REG_PPC_IAC1:
+		*val = get_reg_val(id, vcpu->arch.dbg_reg.iac1);
+		break;
+	case KVM_REG_PPC_IAC2:
+		*val = get_reg_val(id, vcpu->arch.dbg_reg.iac2);
+		break;
+#if CONFIG_PPC_ADV_DEBUG_IACS > 2
+	case KVM_REG_PPC_IAC3:
+		*val = get_reg_val(id, vcpu->arch.dbg_reg.iac3);
+		break;
+	case KVM_REG_PPC_IAC4:
+		*val = get_reg_val(id, vcpu->arch.dbg_reg.iac4);
+		break;
+#endif
+	case KVM_REG_PPC_DAC1:
+		*val = get_reg_val(id, vcpu->arch.dbg_reg.dac1);
+		break;
+	case KVM_REG_PPC_DAC2:
+		*val = get_reg_val(id, vcpu->arch.dbg_reg.dac2);
+		break;
+	case KVM_REG_PPC_EPR: {
+		u32 epr = kvmppc_get_epr(vcpu);
+		*val = get_reg_val(id, epr);
+		break;
+	}
+#if defined(CONFIG_64BIT)
+	case KVM_REG_PPC_EPCR:
+		*val = get_reg_val(id, vcpu->arch.epcr);
+		break;
+#endif
+	case KVM_REG_PPC_TCR:
+		*val = get_reg_val(id, vcpu->arch.tcr);
+		break;
+	case KVM_REG_PPC_TSR:
+		*val = get_reg_val(id, vcpu->arch.tsr);
+		break;
+	case KVM_REG_PPC_DEBUG_INST:
+		*val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT);
+		break;
+	case KVM_REG_PPC_VRSAVE:
+		*val = get_reg_val(id, vcpu->arch.vrsave);
+		break;
+	default:
+		r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, id, val);
+		break;
+	}
+
+	return r;
+}
+
+int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
+			union kvmppc_one_reg *val)
+{
+	int r = 0;
+
+	switch (id) {
+	case KVM_REG_PPC_IAC1:
+		vcpu->arch.dbg_reg.iac1 = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_IAC2:
+		vcpu->arch.dbg_reg.iac2 = set_reg_val(id, *val);
+		break;
+#if CONFIG_PPC_ADV_DEBUG_IACS > 2
+	case KVM_REG_PPC_IAC3:
+		vcpu->arch.dbg_reg.iac3 = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_IAC4:
+		vcpu->arch.dbg_reg.iac4 = set_reg_val(id, *val);
+		break;
+#endif
+	case KVM_REG_PPC_DAC1:
+		vcpu->arch.dbg_reg.dac1 = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_DAC2:
+		vcpu->arch.dbg_reg.dac2 = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_EPR: {
+		u32 new_epr = set_reg_val(id, *val);
+		kvmppc_set_epr(vcpu, new_epr);
+		break;
+	}
+#if defined(CONFIG_64BIT)
+	case KVM_REG_PPC_EPCR: {
+		u32 new_epcr = set_reg_val(id, *val);
+		kvmppc_set_epcr(vcpu, new_epcr);
+		break;
+	}
+#endif
+	case KVM_REG_PPC_OR_TSR: {
+		u32 tsr_bits = set_reg_val(id, *val);
+		kvmppc_set_tsr_bits(vcpu, tsr_bits);
+		break;
+	}
+	case KVM_REG_PPC_CLEAR_TSR: {
+		u32 tsr_bits = set_reg_val(id, *val);
+		kvmppc_clr_tsr_bits(vcpu, tsr_bits);
+		break;
+	}
+	case KVM_REG_PPC_TSR: {
+		u32 tsr = set_reg_val(id, *val);
+		kvmppc_set_tsr(vcpu, tsr);
+		break;
+	}
+	case KVM_REG_PPC_TCR: {
+		u32 tcr = set_reg_val(id, *val);
+		kvmppc_set_tcr(vcpu, tcr);
+		break;
+	}
+	case KVM_REG_PPC_VRSAVE:
+		vcpu->arch.vrsave = set_reg_val(id, *val);
+		break;
+	default:
+		r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, id, val);
+		break;
+	}
+
+	return r;
+}
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+	return -ENOTSUPP;
+}
+
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+	return -ENOTSUPP;
+}
+
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+                                  struct kvm_translation *tr)
+{
+	int r;
+
+	r = kvmppc_core_vcpu_translate(vcpu, tr);
+	return r;
+}
+
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
+{
+	return -ENOTSUPP;
+}
+
+void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
+			      struct kvm_memory_slot *dont)
+{
+}
+
+int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
+			       unsigned long npages)
+{
+	return 0;
+}
+
+int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+				      struct kvm_memory_slot *memslot,
+				      struct kvm_userspace_memory_region *mem)
+{
+	return 0;
+}
+
+void kvmppc_core_commit_memory_region(struct kvm *kvm,
+				struct kvm_userspace_memory_region *mem,
+				const struct kvm_memory_slot *old)
+{
+}
+
+void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
+{
+}
+
+void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr)
+{
+#if defined(CONFIG_64BIT)
+	vcpu->arch.epcr = new_epcr;
+#ifdef CONFIG_KVM_BOOKE_HV
+	vcpu->arch.shadow_epcr &= ~SPRN_EPCR_GICM;
+	if (vcpu->arch.epcr  & SPRN_EPCR_ICM)
+		vcpu->arch.shadow_epcr |= SPRN_EPCR_GICM;
+#endif
+#endif
+}
+
+void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr)
+{
+	vcpu->arch.tcr = new_tcr;
+	arm_next_watchdog(vcpu);
+	update_timer_ints(vcpu);
+}
+
+void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
+{
+	set_bits(tsr_bits, &vcpu->arch.tsr);
+	smp_wmb();
+	kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+	kvm_vcpu_kick(vcpu);
+}
+
+void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
+{
+	clear_bits(tsr_bits, &vcpu->arch.tsr);
+
+	/*
+	 * We may have stopped the watchdog due to
+	 * being stuck on final expiration.
+	 */
+	if (tsr_bits & (TSR_ENW | TSR_WIS))
+		arm_next_watchdog(vcpu);
+
+	update_timer_ints(vcpu);
+}
+
+void kvmppc_decrementer_func(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.tcr & TCR_ARE) {
+		vcpu->arch.dec = vcpu->arch.decar;
+		kvmppc_emulate_dec(vcpu);
+	}
+
+	kvmppc_set_tsr_bits(vcpu, TSR_DIS);
+}
+
+static int kvmppc_booke_add_breakpoint(struct debug_reg *dbg_reg,
+				       uint64_t addr, int index)
+{
+	switch (index) {
+	case 0:
+		dbg_reg->dbcr0 |= DBCR0_IAC1;
+		dbg_reg->iac1 = addr;
+		break;
+	case 1:
+		dbg_reg->dbcr0 |= DBCR0_IAC2;
+		dbg_reg->iac2 = addr;
+		break;
+#if CONFIG_PPC_ADV_DEBUG_IACS > 2
+	case 2:
+		dbg_reg->dbcr0 |= DBCR0_IAC3;
+		dbg_reg->iac3 = addr;
+		break;
+	case 3:
+		dbg_reg->dbcr0 |= DBCR0_IAC4;
+		dbg_reg->iac4 = addr;
+		break;
+#endif
+	default:
+		return -EINVAL;
+	}
+
+	dbg_reg->dbcr0 |= DBCR0_IDM;
+	return 0;
+}
+
+static int kvmppc_booke_add_watchpoint(struct debug_reg *dbg_reg, uint64_t addr,
+				       int type, int index)
+{
+	switch (index) {
+	case 0:
+		if (type & KVMPPC_DEBUG_WATCH_READ)
+			dbg_reg->dbcr0 |= DBCR0_DAC1R;
+		if (type & KVMPPC_DEBUG_WATCH_WRITE)
+			dbg_reg->dbcr0 |= DBCR0_DAC1W;
+		dbg_reg->dac1 = addr;
+		break;
+	case 1:
+		if (type & KVMPPC_DEBUG_WATCH_READ)
+			dbg_reg->dbcr0 |= DBCR0_DAC2R;
+		if (type & KVMPPC_DEBUG_WATCH_WRITE)
+			dbg_reg->dbcr0 |= DBCR0_DAC2W;
+		dbg_reg->dac2 = addr;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	dbg_reg->dbcr0 |= DBCR0_IDM;
+	return 0;
+}
+void kvm_guest_protect_msr(struct kvm_vcpu *vcpu, ulong prot_bitmap, bool set)
+{
+	/* XXX: Add similar MSR protection for BookE-PR */
+#ifdef CONFIG_KVM_BOOKE_HV
+	BUG_ON(prot_bitmap & ~(MSRP_UCLEP | MSRP_DEP | MSRP_PMMP));
+	if (set) {
+		if (prot_bitmap & MSR_UCLE)
+			vcpu->arch.shadow_msrp |= MSRP_UCLEP;
+		if (prot_bitmap & MSR_DE)
+			vcpu->arch.shadow_msrp |= MSRP_DEP;
+		if (prot_bitmap & MSR_PMM)
+			vcpu->arch.shadow_msrp |= MSRP_PMMP;
+	} else {
+		if (prot_bitmap & MSR_UCLE)
+			vcpu->arch.shadow_msrp &= ~MSRP_UCLEP;
+		if (prot_bitmap & MSR_DE)
+			vcpu->arch.shadow_msrp &= ~MSRP_DEP;
+		if (prot_bitmap & MSR_PMM)
+			vcpu->arch.shadow_msrp &= ~MSRP_PMMP;
+	}
+#endif
+}
+
+int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, enum xlate_instdata xlid,
+		 enum xlate_readwrite xlrw, struct kvmppc_pte *pte)
+{
+	int gtlb_index;
+	gpa_t gpaddr;
+
+#ifdef CONFIG_KVM_E500V2
+	if (!(vcpu->arch.shared->msr & MSR_PR) &&
+	    (eaddr & PAGE_MASK) == vcpu->arch.magic_page_ea) {
+		pte->eaddr = eaddr;
+		pte->raddr = (vcpu->arch.magic_page_pa & PAGE_MASK) |
+			     (eaddr & ~PAGE_MASK);
+		pte->vpage = eaddr >> PAGE_SHIFT;
+		pte->may_read = true;
+		pte->may_write = true;
+		pte->may_execute = true;
+
+		return 0;
+	}
+#endif
+
+	/* Check the guest TLB. */
+	switch (xlid) {
+	case XLATE_INST:
+		gtlb_index = kvmppc_mmu_itlb_index(vcpu, eaddr);
+		break;
+	case XLATE_DATA:
+		gtlb_index = kvmppc_mmu_dtlb_index(vcpu, eaddr);
+		break;
+	default:
+		BUG();
+	}
+
+	/* Do we have a TLB entry at all? */
+	if (gtlb_index < 0)
+		return -ENOENT;
+
+	gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr);
+
+	pte->eaddr = eaddr;
+	pte->raddr = (gpaddr & PAGE_MASK) | (eaddr & ~PAGE_MASK);
+	pte->vpage = eaddr >> PAGE_SHIFT;
+
+	/* XXX read permissions from the guest TLB */
+	pte->may_read = true;
+	pte->may_write = true;
+	pte->may_execute = true;
+
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+					 struct kvm_guest_debug *dbg)
+{
+	struct debug_reg *dbg_reg;
+	int n, b = 0, w = 0;
+
+	if (!(dbg->control & KVM_GUESTDBG_ENABLE)) {
+		vcpu->arch.dbg_reg.dbcr0 = 0;
+		vcpu->guest_debug = 0;
+		kvm_guest_protect_msr(vcpu, MSR_DE, false);
+		return 0;
+	}
+
+	kvm_guest_protect_msr(vcpu, MSR_DE, true);
+	vcpu->guest_debug = dbg->control;
+	vcpu->arch.dbg_reg.dbcr0 = 0;
+
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+		vcpu->arch.dbg_reg.dbcr0 |= DBCR0_IDM | DBCR0_IC;
+
+	/* Code below handles only HW breakpoints */
+	dbg_reg = &(vcpu->arch.dbg_reg);
+
+#ifdef CONFIG_KVM_BOOKE_HV
+	/*
+	 * On BookE-HV (e500mc) the guest is always executed with MSR.GS=1
+	 * DBCR1 and DBCR2 are set to trigger debug events when MSR.PR is 0
+	 */
+	dbg_reg->dbcr1 = 0;
+	dbg_reg->dbcr2 = 0;
+#else
+	/*
+	 * On BookE-PR (e500v2) the guest is always executed with MSR.PR=1
+	 * We set DBCR1 and DBCR2 to only trigger debug events when MSR.PR
+	 * is set.
+	 */
+	dbg_reg->dbcr1 = DBCR1_IAC1US | DBCR1_IAC2US | DBCR1_IAC3US |
+			  DBCR1_IAC4US;
+	dbg_reg->dbcr2 = DBCR2_DAC1US | DBCR2_DAC2US;
+#endif
+
+	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+		return 0;
+
+	for (n = 0; n < (KVMPPC_BOOKE_IAC_NUM + KVMPPC_BOOKE_DAC_NUM); n++) {
+		uint64_t addr = dbg->arch.bp[n].addr;
+		uint32_t type = dbg->arch.bp[n].type;
+
+		if (type == KVMPPC_DEBUG_NONE)
+			continue;
+
+		if (type & !(KVMPPC_DEBUG_WATCH_READ |
+			     KVMPPC_DEBUG_WATCH_WRITE |
+			     KVMPPC_DEBUG_BREAKPOINT))
+			return -EINVAL;
+
+		if (type & KVMPPC_DEBUG_BREAKPOINT) {
+			/* Setting H/W breakpoint */
+			if (kvmppc_booke_add_breakpoint(dbg_reg, addr, b++))
+				return -EINVAL;
+		} else {
+			/* Setting H/W watchpoint */
+			if (kvmppc_booke_add_watchpoint(dbg_reg, addr,
+							type, w++))
+				return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	vcpu->cpu = smp_processor_id();
+	current->thread.kvm_vcpu = vcpu;
+}
+
+void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu)
+{
+	current->thread.kvm_vcpu = NULL;
+	vcpu->cpu = -1;
+
+	/* Clear pending debug event in DBSR */
+	kvmppc_clear_dbsr();
+}
+
+void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+	vcpu->kvm->arch.kvm_ops->mmu_destroy(vcpu);
+}
+
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+	return kvm->arch.kvm_ops->init_vm(kvm);
+}
+
+struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+	return kvm->arch.kvm_ops->vcpu_create(kvm, id);
+}
+
+void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+{
+	vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu);
+}
+
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+	kvm->arch.kvm_ops->destroy_vm(kvm);
+}
+
+void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	vcpu->kvm->arch.kvm_ops->vcpu_load(vcpu, cpu);
+}
+
+void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+{
+	vcpu->kvm->arch.kvm_ops->vcpu_put(vcpu);
+}
+
+int __init kvmppc_booke_init(void)
+{
+#ifndef CONFIG_KVM_BOOKE_HV
+	unsigned long ivor[16];
+	unsigned long *handler = kvmppc_booke_handler_addr;
+	unsigned long max_ivor = 0;
+	unsigned long handler_len;
+	int i;
+
+	/* We install our own exception handlers by hijacking IVPR. IVPR must
+	 * be 16-bit aligned, so we need a 64KB allocation. */
+	kvmppc_booke_handlers = __get_free_pages(GFP_KERNEL | __GFP_ZERO,
+	                                         VCPU_SIZE_ORDER);
+	if (!kvmppc_booke_handlers)
+		return -ENOMEM;
+
+	/* XXX make sure our handlers are smaller than Linux's */
+
+	/* Copy our interrupt handlers to match host IVORs. That way we don't
+	 * have to swap the IVORs on every guest/host transition. */
+	ivor[0] = mfspr(SPRN_IVOR0);
+	ivor[1] = mfspr(SPRN_IVOR1);
+	ivor[2] = mfspr(SPRN_IVOR2);
+	ivor[3] = mfspr(SPRN_IVOR3);
+	ivor[4] = mfspr(SPRN_IVOR4);
+	ivor[5] = mfspr(SPRN_IVOR5);
+	ivor[6] = mfspr(SPRN_IVOR6);
+	ivor[7] = mfspr(SPRN_IVOR7);
+	ivor[8] = mfspr(SPRN_IVOR8);
+	ivor[9] = mfspr(SPRN_IVOR9);
+	ivor[10] = mfspr(SPRN_IVOR10);
+	ivor[11] = mfspr(SPRN_IVOR11);
+	ivor[12] = mfspr(SPRN_IVOR12);
+	ivor[13] = mfspr(SPRN_IVOR13);
+	ivor[14] = mfspr(SPRN_IVOR14);
+	ivor[15] = mfspr(SPRN_IVOR15);
+
+	for (i = 0; i < 16; i++) {
+		if (ivor[i] > max_ivor)
+			max_ivor = i;
+
+		handler_len = handler[i + 1] - handler[i];
+		memcpy((void *)kvmppc_booke_handlers + ivor[i],
+		       (void *)handler[i], handler_len);
+	}
+
+	handler_len = handler[max_ivor + 1] - handler[max_ivor];
+	flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers +
+			   ivor[max_ivor] + handler_len);
+#endif /* !BOOKE_HV */
+	return 0;
+}
+
+void __exit kvmppc_booke_exit(void)
+{
+	free_pages(kvmppc_booke_handlers, VCPU_SIZE_ORDER);
+	kvm_exit();
+}
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
new file mode 100644
index 000000000..22ba08ea6
--- /dev/null
+++ b/arch/powerpc/kvm/booke.h
@@ -0,0 +1,129 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#ifndef __KVM_BOOKE_H__
+#define __KVM_BOOKE_H__
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <asm/kvm_ppc.h>
+#include <asm/switch_to.h>
+#include "timing.h"
+
+/* interrupt priortity ordering */
+#define BOOKE_IRQPRIO_DATA_STORAGE 0
+#define BOOKE_IRQPRIO_INST_STORAGE 1
+#define BOOKE_IRQPRIO_ALIGNMENT 2
+#define BOOKE_IRQPRIO_PROGRAM 3
+#define BOOKE_IRQPRIO_FP_UNAVAIL 4
+#ifdef CONFIG_SPE_POSSIBLE
+#define BOOKE_IRQPRIO_SPE_UNAVAIL 5
+#define BOOKE_IRQPRIO_SPE_FP_DATA 6
+#define BOOKE_IRQPRIO_SPE_FP_ROUND 7
+#endif
+#ifdef CONFIG_PPC_E500MC
+#define BOOKE_IRQPRIO_ALTIVEC_UNAVAIL 5
+#define BOOKE_IRQPRIO_ALTIVEC_ASSIST 6
+#endif
+#define BOOKE_IRQPRIO_SYSCALL 8
+#define BOOKE_IRQPRIO_AP_UNAVAIL 9
+#define BOOKE_IRQPRIO_DTLB_MISS 10
+#define BOOKE_IRQPRIO_ITLB_MISS 11
+#define BOOKE_IRQPRIO_MACHINE_CHECK 12
+#define BOOKE_IRQPRIO_DEBUG 13
+#define BOOKE_IRQPRIO_CRITICAL 14
+#define BOOKE_IRQPRIO_WATCHDOG 15
+#define BOOKE_IRQPRIO_EXTERNAL 16
+#define BOOKE_IRQPRIO_FIT 17
+#define BOOKE_IRQPRIO_DECREMENTER 18
+#define BOOKE_IRQPRIO_PERFORMANCE_MONITOR 19
+/* Internal pseudo-irqprio for level triggered externals */
+#define BOOKE_IRQPRIO_EXTERNAL_LEVEL 20
+#define BOOKE_IRQPRIO_DBELL 21
+#define BOOKE_IRQPRIO_DBELL_CRIT 22
+#define BOOKE_IRQPRIO_MAX 23
+
+#define BOOKE_IRQMASK_EE ((1 << BOOKE_IRQPRIO_EXTERNAL_LEVEL) | \
+			  (1 << BOOKE_IRQPRIO_PERFORMANCE_MONITOR) | \
+			  (1 << BOOKE_IRQPRIO_DBELL) | \
+			  (1 << BOOKE_IRQPRIO_DECREMENTER) | \
+			  (1 << BOOKE_IRQPRIO_FIT) | \
+			  (1 << BOOKE_IRQPRIO_EXTERNAL))
+
+#define BOOKE_IRQMASK_CE ((1 << BOOKE_IRQPRIO_DBELL_CRIT) | \
+			  (1 << BOOKE_IRQPRIO_WATCHDOG) | \
+			  (1 << BOOKE_IRQPRIO_CRITICAL))
+
+extern unsigned long kvmppc_booke_handlers;
+extern unsigned long kvmppc_booke_handler_addr[];
+
+void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr);
+void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr);
+
+void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr);
+void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr);
+void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits);
+void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits);
+
+int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                            unsigned int inst, int *advance);
+int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val);
+int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val);
+
+/* low-level asm code to transfer guest state */
+void kvmppc_load_guest_spe(struct kvm_vcpu *vcpu);
+void kvmppc_save_guest_spe(struct kvm_vcpu *vcpu);
+
+/* high-level function, manages flags, host state */
+void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu);
+
+void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu);
+
+enum int_class {
+	INT_CLASS_NONCRIT,
+	INT_CLASS_CRIT,
+	INT_CLASS_MC,
+	INT_CLASS_DBG,
+};
+
+void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type);
+
+extern void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu);
+extern int kvmppc_core_emulate_op_e500(struct kvm_run *run,
+				       struct kvm_vcpu *vcpu,
+				       unsigned int inst, int *advance);
+extern int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn,
+					  ulong spr_val);
+extern int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn,
+					  ulong *spr_val);
+extern void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu);
+extern int kvmppc_core_emulate_op_e500(struct kvm_run *run,
+				       struct kvm_vcpu *vcpu,
+				       unsigned int inst, int *advance);
+extern int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn,
+					  ulong spr_val);
+extern int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn,
+					  ulong *spr_val);
+
+static inline void kvmppc_clear_dbsr(void)
+{
+	mtspr(SPRN_DBSR, mfspr(SPRN_DBSR));
+}
+#endif /* __KVM_BOOKE_H__ */
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
new file mode 100644
index 000000000..a82f64502
--- /dev/null
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -0,0 +1,522 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ * Copyright 2011 Freescale Semiconductor, Inc.
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/disassemble.h>
+
+#include "booke.h"
+
+#define OP_19_XOP_RFI     50
+#define OP_19_XOP_RFCI    51
+#define OP_19_XOP_RFDI    39
+
+#define OP_31_XOP_MFMSR   83
+#define OP_31_XOP_WRTEE   131
+#define OP_31_XOP_MTMSR   146
+#define OP_31_XOP_WRTEEI  163
+
+static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.pc = vcpu->arch.shared->srr0;
+	kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1);
+}
+
+static void kvmppc_emul_rfdi(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.pc = vcpu->arch.dsrr0;
+	kvmppc_set_msr(vcpu, vcpu->arch.dsrr1);
+}
+
+static void kvmppc_emul_rfci(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.pc = vcpu->arch.csrr0;
+	kvmppc_set_msr(vcpu, vcpu->arch.csrr1);
+}
+
+int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                            unsigned int inst, int *advance)
+{
+	int emulated = EMULATE_DONE;
+	int rs = get_rs(inst);
+	int rt = get_rt(inst);
+
+	switch (get_op(inst)) {
+	case 19:
+		switch (get_xop(inst)) {
+		case OP_19_XOP_RFI:
+			kvmppc_emul_rfi(vcpu);
+			kvmppc_set_exit_type(vcpu, EMULATED_RFI_EXITS);
+			*advance = 0;
+			break;
+
+		case OP_19_XOP_RFCI:
+			kvmppc_emul_rfci(vcpu);
+			kvmppc_set_exit_type(vcpu, EMULATED_RFCI_EXITS);
+			*advance = 0;
+			break;
+
+		case OP_19_XOP_RFDI:
+			kvmppc_emul_rfdi(vcpu);
+			kvmppc_set_exit_type(vcpu, EMULATED_RFDI_EXITS);
+			*advance = 0;
+			break;
+
+		default:
+			emulated = EMULATE_FAIL;
+			break;
+		}
+		break;
+
+	case 31:
+		switch (get_xop(inst)) {
+
+		case OP_31_XOP_MFMSR:
+			kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->msr);
+			kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS);
+			break;
+
+		case OP_31_XOP_MTMSR:
+			kvmppc_set_exit_type(vcpu, EMULATED_MTMSR_EXITS);
+			kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, rs));
+			break;
+
+		case OP_31_XOP_WRTEE:
+			vcpu->arch.shared->msr = (vcpu->arch.shared->msr & ~MSR_EE)
+					| (kvmppc_get_gpr(vcpu, rs) & MSR_EE);
+			kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS);
+			break;
+
+		case OP_31_XOP_WRTEEI:
+			vcpu->arch.shared->msr = (vcpu->arch.shared->msr & ~MSR_EE)
+							 | (inst & MSR_EE);
+			kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS);
+			break;
+
+		default:
+			emulated = EMULATE_FAIL;
+		}
+
+		break;
+
+	default:
+		emulated = EMULATE_FAIL;
+	}
+
+	return emulated;
+}
+
+/*
+ * NOTE: some of these registers are not emulated on BOOKE_HV (GS-mode).
+ * Their backing store is in real registers, and these functions
+ * will return the wrong result if called for them in another context
+ * (such as debugging).
+ */
+int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
+{
+	int emulated = EMULATE_DONE;
+	bool debug_inst = false;
+
+	switch (sprn) {
+	case SPRN_DEAR:
+		vcpu->arch.shared->dar = spr_val;
+		break;
+	case SPRN_ESR:
+		vcpu->arch.shared->esr = spr_val;
+		break;
+	case SPRN_CSRR0:
+		vcpu->arch.csrr0 = spr_val;
+		break;
+	case SPRN_CSRR1:
+		vcpu->arch.csrr1 = spr_val;
+		break;
+	case SPRN_DSRR0:
+		vcpu->arch.dsrr0 = spr_val;
+		break;
+	case SPRN_DSRR1:
+		vcpu->arch.dsrr1 = spr_val;
+		break;
+	case SPRN_IAC1:
+		/*
+		 * If userspace is debugging guest then guest
+		 * can not access debug registers.
+		 */
+		if (vcpu->guest_debug)
+			break;
+
+		debug_inst = true;
+		vcpu->arch.dbg_reg.iac1 = spr_val;
+		break;
+	case SPRN_IAC2:
+		/*
+		 * If userspace is debugging guest then guest
+		 * can not access debug registers.
+		 */
+		if (vcpu->guest_debug)
+			break;
+
+		debug_inst = true;
+		vcpu->arch.dbg_reg.iac2 = spr_val;
+		break;
+#if CONFIG_PPC_ADV_DEBUG_IACS > 2
+	case SPRN_IAC3:
+		/*
+		 * If userspace is debugging guest then guest
+		 * can not access debug registers.
+		 */
+		if (vcpu->guest_debug)
+			break;
+
+		debug_inst = true;
+		vcpu->arch.dbg_reg.iac3 = spr_val;
+		break;
+	case SPRN_IAC4:
+		/*
+		 * If userspace is debugging guest then guest
+		 * can not access debug registers.
+		 */
+		if (vcpu->guest_debug)
+			break;
+
+		debug_inst = true;
+		vcpu->arch.dbg_reg.iac4 = spr_val;
+		break;
+#endif
+	case SPRN_DAC1:
+		/*
+		 * If userspace is debugging guest then guest
+		 * can not access debug registers.
+		 */
+		if (vcpu->guest_debug)
+			break;
+
+		debug_inst = true;
+		vcpu->arch.dbg_reg.dac1 = spr_val;
+		break;
+	case SPRN_DAC2:
+		/*
+		 * If userspace is debugging guest then guest
+		 * can not access debug registers.
+		 */
+		if (vcpu->guest_debug)
+			break;
+
+		debug_inst = true;
+		vcpu->arch.dbg_reg.dac2 = spr_val;
+		break;
+	case SPRN_DBCR0:
+		/*
+		 * If userspace is debugging guest then guest
+		 * can not access debug registers.
+		 */
+		if (vcpu->guest_debug)
+			break;
+
+		debug_inst = true;
+		spr_val &= (DBCR0_IDM | DBCR0_IC | DBCR0_BT | DBCR0_TIE |
+			DBCR0_IAC1 | DBCR0_IAC2 | DBCR0_IAC3 | DBCR0_IAC4  |
+			DBCR0_DAC1R | DBCR0_DAC1W | DBCR0_DAC2R | DBCR0_DAC2W);
+
+		vcpu->arch.dbg_reg.dbcr0 = spr_val;
+		break;
+	case SPRN_DBCR1:
+		/*
+		 * If userspace is debugging guest then guest
+		 * can not access debug registers.
+		 */
+		if (vcpu->guest_debug)
+			break;
+
+		debug_inst = true;
+		vcpu->arch.dbg_reg.dbcr1 = spr_val;
+		break;
+	case SPRN_DBCR2:
+		/*
+		 * If userspace is debugging guest then guest
+		 * can not access debug registers.
+		 */
+		if (vcpu->guest_debug)
+			break;
+
+		debug_inst = true;
+		vcpu->arch.dbg_reg.dbcr2 = spr_val;
+		break;
+	case SPRN_DBSR:
+		/*
+		 * If userspace is debugging guest then guest
+		 * can not access debug registers.
+		 */
+		if (vcpu->guest_debug)
+			break;
+
+		vcpu->arch.dbsr &= ~spr_val;
+		if (!(vcpu->arch.dbsr & ~DBSR_IDE))
+			kvmppc_core_dequeue_debug(vcpu);
+		break;
+	case SPRN_TSR:
+		kvmppc_clr_tsr_bits(vcpu, spr_val);
+		break;
+	case SPRN_TCR:
+		/*
+		 * WRC is a 2-bit field that is supposed to preserve its
+		 * value once written to non-zero.
+		 */
+		if (vcpu->arch.tcr & TCR_WRC_MASK) {
+			spr_val &= ~TCR_WRC_MASK;
+			spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
+		}
+		kvmppc_set_tcr(vcpu, spr_val);
+		break;
+
+	case SPRN_DECAR:
+		vcpu->arch.decar = spr_val;
+		break;
+	/*
+	 * Note: SPRG4-7 are user-readable.
+	 * These values are loaded into the real SPRGs when resuming the
+	 * guest (PR-mode only).
+	 */
+	case SPRN_SPRG4:
+		kvmppc_set_sprg4(vcpu, spr_val);
+		break;
+	case SPRN_SPRG5:
+		kvmppc_set_sprg5(vcpu, spr_val);
+		break;
+	case SPRN_SPRG6:
+		kvmppc_set_sprg6(vcpu, spr_val);
+		break;
+	case SPRN_SPRG7:
+		kvmppc_set_sprg7(vcpu, spr_val);
+		break;
+
+	case SPRN_IVPR:
+		vcpu->arch.ivpr = spr_val;
+#ifdef CONFIG_KVM_BOOKE_HV
+		mtspr(SPRN_GIVPR, spr_val);
+#endif
+		break;
+	case SPRN_IVOR0:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = spr_val;
+		break;
+	case SPRN_IVOR1:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = spr_val;
+		break;
+	case SPRN_IVOR2:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = spr_val;
+#ifdef CONFIG_KVM_BOOKE_HV
+		mtspr(SPRN_GIVOR2, spr_val);
+#endif
+		break;
+	case SPRN_IVOR3:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = spr_val;
+		break;
+	case SPRN_IVOR4:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = spr_val;
+		break;
+	case SPRN_IVOR5:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = spr_val;
+		break;
+	case SPRN_IVOR6:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = spr_val;
+		break;
+	case SPRN_IVOR7:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = spr_val;
+		break;
+	case SPRN_IVOR8:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = spr_val;
+#ifdef CONFIG_KVM_BOOKE_HV
+		mtspr(SPRN_GIVOR8, spr_val);
+#endif
+		break;
+	case SPRN_IVOR9:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = spr_val;
+		break;
+	case SPRN_IVOR10:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = spr_val;
+		break;
+	case SPRN_IVOR11:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = spr_val;
+		break;
+	case SPRN_IVOR12:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = spr_val;
+		break;
+	case SPRN_IVOR13:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = spr_val;
+		break;
+	case SPRN_IVOR14:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = spr_val;
+		break;
+	case SPRN_IVOR15:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = spr_val;
+		break;
+	case SPRN_MCSR:
+		vcpu->arch.mcsr &= ~spr_val;
+		break;
+#if defined(CONFIG_64BIT)
+	case SPRN_EPCR:
+		kvmppc_set_epcr(vcpu, spr_val);
+#ifdef CONFIG_KVM_BOOKE_HV
+		mtspr(SPRN_EPCR, vcpu->arch.shadow_epcr);
+#endif
+		break;
+#endif
+	default:
+		emulated = EMULATE_FAIL;
+	}
+
+	if (debug_inst) {
+		current->thread.debug = vcpu->arch.dbg_reg;
+		switch_booke_debug_regs(&vcpu->arch.dbg_reg);
+	}
+	return emulated;
+}
+
+int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
+{
+	int emulated = EMULATE_DONE;
+
+	switch (sprn) {
+	case SPRN_IVPR:
+		*spr_val = vcpu->arch.ivpr;
+		break;
+	case SPRN_DEAR:
+		*spr_val = vcpu->arch.shared->dar;
+		break;
+	case SPRN_ESR:
+		*spr_val = vcpu->arch.shared->esr;
+		break;
+	case SPRN_EPR:
+		*spr_val = vcpu->arch.epr;
+		break;
+	case SPRN_CSRR0:
+		*spr_val = vcpu->arch.csrr0;
+		break;
+	case SPRN_CSRR1:
+		*spr_val = vcpu->arch.csrr1;
+		break;
+	case SPRN_DSRR0:
+		*spr_val = vcpu->arch.dsrr0;
+		break;
+	case SPRN_DSRR1:
+		*spr_val = vcpu->arch.dsrr1;
+		break;
+	case SPRN_IAC1:
+		*spr_val = vcpu->arch.dbg_reg.iac1;
+		break;
+	case SPRN_IAC2:
+		*spr_val = vcpu->arch.dbg_reg.iac2;
+		break;
+#if CONFIG_PPC_ADV_DEBUG_IACS > 2
+	case SPRN_IAC3:
+		*spr_val = vcpu->arch.dbg_reg.iac3;
+		break;
+	case SPRN_IAC4:
+		*spr_val = vcpu->arch.dbg_reg.iac4;
+		break;
+#endif
+	case SPRN_DAC1:
+		*spr_val = vcpu->arch.dbg_reg.dac1;
+		break;
+	case SPRN_DAC2:
+		*spr_val = vcpu->arch.dbg_reg.dac2;
+		break;
+	case SPRN_DBCR0:
+		*spr_val = vcpu->arch.dbg_reg.dbcr0;
+		if (vcpu->guest_debug)
+			*spr_val = *spr_val | DBCR0_EDM;
+		break;
+	case SPRN_DBCR1:
+		*spr_val = vcpu->arch.dbg_reg.dbcr1;
+		break;
+	case SPRN_DBCR2:
+		*spr_val = vcpu->arch.dbg_reg.dbcr2;
+		break;
+	case SPRN_DBSR:
+		*spr_val = vcpu->arch.dbsr;
+		break;
+	case SPRN_TSR:
+		*spr_val = vcpu->arch.tsr;
+		break;
+	case SPRN_TCR:
+		*spr_val = vcpu->arch.tcr;
+		break;
+
+	case SPRN_IVOR0:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL];
+		break;
+	case SPRN_IVOR1:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK];
+		break;
+	case SPRN_IVOR2:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE];
+		break;
+	case SPRN_IVOR3:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE];
+		break;
+	case SPRN_IVOR4:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL];
+		break;
+	case SPRN_IVOR5:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT];
+		break;
+	case SPRN_IVOR6:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM];
+		break;
+	case SPRN_IVOR7:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL];
+		break;
+	case SPRN_IVOR8:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL];
+		break;
+	case SPRN_IVOR9:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL];
+		break;
+	case SPRN_IVOR10:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER];
+		break;
+	case SPRN_IVOR11:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT];
+		break;
+	case SPRN_IVOR12:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG];
+		break;
+	case SPRN_IVOR13:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS];
+		break;
+	case SPRN_IVOR14:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS];
+		break;
+	case SPRN_IVOR15:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG];
+		break;
+	case SPRN_MCSR:
+		*spr_val = vcpu->arch.mcsr;
+		break;
+#if defined(CONFIG_64BIT)
+	case SPRN_EPCR:
+		*spr_val = vcpu->arch.epcr;
+		break;
+#endif
+
+	default:
+		emulated = EMULATE_FAIL;
+	}
+
+	return emulated;
+}
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
new file mode 100644
index 000000000..84c308a9a
--- /dev/null
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -0,0 +1,547 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2007
+ * Copyright 2011 Freescale Semiconductor, Inc.
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+
+/* The host stack layout: */
+#define HOST_R1         0 /* Implied by stwu. */
+#define HOST_CALLEE_LR  4
+#define HOST_RUN        8
+/* r2 is special: it holds 'current', and it made nonvolatile in the
+ * kernel with the -ffixed-r2 gcc option. */
+#define HOST_R2         12
+#define HOST_CR         16
+#define HOST_NV_GPRS    20
+#define __HOST_NV_GPR(n)  (HOST_NV_GPRS + ((n - 14) * 4))
+#define HOST_NV_GPR(n)  __HOST_NV_GPR(__REG_##n)
+#define HOST_MIN_STACK_SIZE (HOST_NV_GPR(R31) + 4)
+#define HOST_STACK_SIZE (((HOST_MIN_STACK_SIZE + 15) / 16) * 16) /* Align. */
+#define HOST_STACK_LR   (HOST_STACK_SIZE + 4) /* In caller stack frame. */
+
+#define NEED_INST_MASK ((1<<BOOKE_INTERRUPT_PROGRAM) | \
+                        (1<<BOOKE_INTERRUPT_DTLB_MISS) | \
+                        (1<<BOOKE_INTERRUPT_DEBUG))
+
+#define NEED_DEAR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \
+                        (1<<BOOKE_INTERRUPT_DTLB_MISS) | \
+                        (1<<BOOKE_INTERRUPT_ALIGNMENT))
+
+#define NEED_ESR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \
+                       (1<<BOOKE_INTERRUPT_INST_STORAGE) | \
+                       (1<<BOOKE_INTERRUPT_PROGRAM) | \
+                       (1<<BOOKE_INTERRUPT_DTLB_MISS) | \
+                       (1<<BOOKE_INTERRUPT_ALIGNMENT))
+
+.macro __KVM_HANDLER ivor_nr scratch srr0
+	/* Get pointer to vcpu and record exit number. */
+	mtspr	\scratch , r4
+	mfspr   r4, SPRN_SPRG_THREAD
+	lwz     r4, THREAD_KVM_VCPU(r4)
+	stw	r3, VCPU_GPR(R3)(r4)
+	stw	r5, VCPU_GPR(R5)(r4)
+	stw	r6, VCPU_GPR(R6)(r4)
+	mfspr	r3, \scratch
+	mfctr	r5
+	stw	r3, VCPU_GPR(R4)(r4)
+	stw	r5, VCPU_CTR(r4)
+	mfspr	r3, \srr0
+	lis	r6, kvmppc_resume_host@h
+	stw	r3, VCPU_PC(r4)
+	li	r5, \ivor_nr
+	ori	r6, r6, kvmppc_resume_host@l
+	mtctr	r6
+	bctr
+.endm
+
+.macro KVM_HANDLER ivor_nr scratch srr0
+_GLOBAL(kvmppc_handler_\ivor_nr)
+	__KVM_HANDLER \ivor_nr \scratch \srr0
+.endm
+
+.macro KVM_DBG_HANDLER ivor_nr scratch srr0
+_GLOBAL(kvmppc_handler_\ivor_nr)
+	mtspr   \scratch, r4
+	mfspr	r4, SPRN_SPRG_THREAD
+	lwz	r4, THREAD_KVM_VCPU(r4)
+	stw	r3, VCPU_CRIT_SAVE(r4)
+	mfcr	r3
+	mfspr	r4, SPRN_CSRR1
+	andi.	r4, r4, MSR_PR
+	bne	1f
+	/* debug interrupt happened in enter/exit path */
+	mfspr   r4, SPRN_CSRR1
+	rlwinm  r4, r4, 0, ~MSR_DE
+	mtspr   SPRN_CSRR1, r4
+	lis	r4, 0xffff
+	ori	r4, r4, 0xffff
+	mtspr	SPRN_DBSR, r4
+	mfspr	r4, SPRN_SPRG_THREAD
+	lwz	r4, THREAD_KVM_VCPU(r4)
+	mtcr	r3
+	lwz     r3, VCPU_CRIT_SAVE(r4)
+	mfspr   r4, \scratch
+	rfci
+1:	/* debug interrupt happened in guest */
+	mtcr	r3
+	mfspr	r4, SPRN_SPRG_THREAD
+	lwz	r4, THREAD_KVM_VCPU(r4)
+	lwz     r3, VCPU_CRIT_SAVE(r4)
+	mfspr   r4, \scratch
+	__KVM_HANDLER \ivor_nr \scratch \srr0
+.endm
+
+.macro KVM_HANDLER_ADDR ivor_nr
+	.long	kvmppc_handler_\ivor_nr
+.endm
+
+.macro KVM_HANDLER_END
+	.long	kvmppc_handlers_end
+.endm
+
+_GLOBAL(kvmppc_handlers_start)
+KVM_HANDLER BOOKE_INTERRUPT_CRITICAL SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
+KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK  SPRN_SPRG_RSCRATCH_MC SPRN_MCSRR0
+KVM_HANDLER BOOKE_INTERRUPT_DATA_STORAGE SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_INST_STORAGE SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_EXTERNAL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_ALIGNMENT SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_PROGRAM SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_FP_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_SYSCALL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_AP_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_DECREMENTER SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_FIT SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
+KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_DBG_HANDLER BOOKE_INTERRUPT_DEBUG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
+KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND SPRN_SPRG_RSCRATCH0 SPRN_SRR0
+_GLOBAL(kvmppc_handlers_end)
+
+/* Registers:
+ *  SPRG_SCRATCH0: guest r4
+ *  r4: vcpu pointer
+ *  r5: KVM exit number
+ */
+_GLOBAL(kvmppc_resume_host)
+	mfcr	r3
+	stw	r3, VCPU_CR(r4)
+	stw	r7, VCPU_GPR(R7)(r4)
+	stw	r8, VCPU_GPR(R8)(r4)
+	stw	r9, VCPU_GPR(R9)(r4)
+
+	li	r6, 1
+	slw	r6, r6, r5
+
+#ifdef CONFIG_KVM_EXIT_TIMING
+	/* save exit time */
+1:
+	mfspr	r7, SPRN_TBRU
+	mfspr	r8, SPRN_TBRL
+	mfspr	r9, SPRN_TBRU
+	cmpw	r9, r7
+	bne	1b
+	stw	r8, VCPU_TIMING_EXIT_TBL(r4)
+	stw	r9, VCPU_TIMING_EXIT_TBU(r4)
+#endif
+
+	/* Save the faulting instruction and all GPRs for emulation. */
+	andi.	r7, r6, NEED_INST_MASK
+	beq	..skip_inst_copy
+	mfspr	r9, SPRN_SRR0
+	mfmsr	r8
+	ori	r7, r8, MSR_DS
+	mtmsr	r7
+	isync
+	lwz	r9, 0(r9)
+	mtmsr	r8
+	isync
+	stw	r9, VCPU_LAST_INST(r4)
+
+	stw	r15, VCPU_GPR(R15)(r4)
+	stw	r16, VCPU_GPR(R16)(r4)
+	stw	r17, VCPU_GPR(R17)(r4)
+	stw	r18, VCPU_GPR(R18)(r4)
+	stw	r19, VCPU_GPR(R19)(r4)
+	stw	r20, VCPU_GPR(R20)(r4)
+	stw	r21, VCPU_GPR(R21)(r4)
+	stw	r22, VCPU_GPR(R22)(r4)
+	stw	r23, VCPU_GPR(R23)(r4)
+	stw	r24, VCPU_GPR(R24)(r4)
+	stw	r25, VCPU_GPR(R25)(r4)
+	stw	r26, VCPU_GPR(R26)(r4)
+	stw	r27, VCPU_GPR(R27)(r4)
+	stw	r28, VCPU_GPR(R28)(r4)
+	stw	r29, VCPU_GPR(R29)(r4)
+	stw	r30, VCPU_GPR(R30)(r4)
+	stw	r31, VCPU_GPR(R31)(r4)
+..skip_inst_copy:
+
+	/* Also grab DEAR and ESR before the host can clobber them. */
+
+	andi.	r7, r6, NEED_DEAR_MASK
+	beq	..skip_dear
+	mfspr	r9, SPRN_DEAR
+	stw	r9, VCPU_FAULT_DEAR(r4)
+..skip_dear:
+
+	andi.	r7, r6, NEED_ESR_MASK
+	beq	..skip_esr
+	mfspr	r9, SPRN_ESR
+	stw	r9, VCPU_FAULT_ESR(r4)
+..skip_esr:
+
+	/* Save remaining volatile guest register state to vcpu. */
+	stw	r0, VCPU_GPR(R0)(r4)
+	stw	r1, VCPU_GPR(R1)(r4)
+	stw	r2, VCPU_GPR(R2)(r4)
+	stw	r10, VCPU_GPR(R10)(r4)
+	stw	r11, VCPU_GPR(R11)(r4)
+	stw	r12, VCPU_GPR(R12)(r4)
+	stw	r13, VCPU_GPR(R13)(r4)
+	stw	r14, VCPU_GPR(R14)(r4) /* We need a NV GPR below. */
+	mflr	r3
+	stw	r3, VCPU_LR(r4)
+	mfxer	r3
+	stw	r3, VCPU_XER(r4)
+
+	/* Restore host stack pointer and PID before IVPR, since the host
+	 * exception handlers use them. */
+	lwz	r1, VCPU_HOST_STACK(r4)
+	lwz	r3, VCPU_HOST_PID(r4)
+	mtspr	SPRN_PID, r3
+
+#ifdef CONFIG_FSL_BOOKE
+	/* we cheat and know that Linux doesn't use PID1 which is always 0 */
+	lis	r3, 0
+	mtspr	SPRN_PID1, r3
+#endif
+
+	/* Restore host IVPR before re-enabling interrupts. We cheat and know
+	 * that Linux IVPR is always 0xc0000000. */
+	lis	r3, 0xc000
+	mtspr	SPRN_IVPR, r3
+
+	/* Switch to kernel stack and jump to handler. */
+	LOAD_REG_ADDR(r3, kvmppc_handle_exit)
+	mtctr	r3
+	lwz	r3, HOST_RUN(r1)
+	lwz	r2, HOST_R2(r1)
+	mr	r14, r4 /* Save vcpu pointer. */
+
+	bctrl	/* kvmppc_handle_exit() */
+
+	/* Restore vcpu pointer and the nonvolatiles we used. */
+	mr	r4, r14
+	lwz	r14, VCPU_GPR(R14)(r4)
+
+	/* Sometimes instruction emulation must restore complete GPR state. */
+	andi.	r5, r3, RESUME_FLAG_NV
+	beq	..skip_nv_load
+	lwz	r15, VCPU_GPR(R15)(r4)
+	lwz	r16, VCPU_GPR(R16)(r4)
+	lwz	r17, VCPU_GPR(R17)(r4)
+	lwz	r18, VCPU_GPR(R18)(r4)
+	lwz	r19, VCPU_GPR(R19)(r4)
+	lwz	r20, VCPU_GPR(R20)(r4)
+	lwz	r21, VCPU_GPR(R21)(r4)
+	lwz	r22, VCPU_GPR(R22)(r4)
+	lwz	r23, VCPU_GPR(R23)(r4)
+	lwz	r24, VCPU_GPR(R24)(r4)
+	lwz	r25, VCPU_GPR(R25)(r4)
+	lwz	r26, VCPU_GPR(R26)(r4)
+	lwz	r27, VCPU_GPR(R27)(r4)
+	lwz	r28, VCPU_GPR(R28)(r4)
+	lwz	r29, VCPU_GPR(R29)(r4)
+	lwz	r30, VCPU_GPR(R30)(r4)
+	lwz	r31, VCPU_GPR(R31)(r4)
+..skip_nv_load:
+
+	/* Should we return to the guest? */
+	andi.	r5, r3, RESUME_FLAG_HOST
+	beq	lightweight_exit
+
+	srawi	r3, r3, 2 /* Shift -ERR back down. */
+
+heavyweight_exit:
+	/* Not returning to guest. */
+
+#ifdef CONFIG_SPE
+	/* save guest SPEFSCR and load host SPEFSCR */
+	mfspr	r9, SPRN_SPEFSCR
+	stw	r9, VCPU_SPEFSCR(r4)
+	lwz	r9, VCPU_HOST_SPEFSCR(r4)
+	mtspr	SPRN_SPEFSCR, r9
+#endif
+
+	/* We already saved guest volatile register state; now save the
+	 * non-volatiles. */
+	stw	r15, VCPU_GPR(R15)(r4)
+	stw	r16, VCPU_GPR(R16)(r4)
+	stw	r17, VCPU_GPR(R17)(r4)
+	stw	r18, VCPU_GPR(R18)(r4)
+	stw	r19, VCPU_GPR(R19)(r4)
+	stw	r20, VCPU_GPR(R20)(r4)
+	stw	r21, VCPU_GPR(R21)(r4)
+	stw	r22, VCPU_GPR(R22)(r4)
+	stw	r23, VCPU_GPR(R23)(r4)
+	stw	r24, VCPU_GPR(R24)(r4)
+	stw	r25, VCPU_GPR(R25)(r4)
+	stw	r26, VCPU_GPR(R26)(r4)
+	stw	r27, VCPU_GPR(R27)(r4)
+	stw	r28, VCPU_GPR(R28)(r4)
+	stw	r29, VCPU_GPR(R29)(r4)
+	stw	r30, VCPU_GPR(R30)(r4)
+	stw	r31, VCPU_GPR(R31)(r4)
+
+	/* Load host non-volatile register state from host stack. */
+	lwz	r14, HOST_NV_GPR(R14)(r1)
+	lwz	r15, HOST_NV_GPR(R15)(r1)
+	lwz	r16, HOST_NV_GPR(R16)(r1)
+	lwz	r17, HOST_NV_GPR(R17)(r1)
+	lwz	r18, HOST_NV_GPR(R18)(r1)
+	lwz	r19, HOST_NV_GPR(R19)(r1)
+	lwz	r20, HOST_NV_GPR(R20)(r1)
+	lwz	r21, HOST_NV_GPR(R21)(r1)
+	lwz	r22, HOST_NV_GPR(R22)(r1)
+	lwz	r23, HOST_NV_GPR(R23)(r1)
+	lwz	r24, HOST_NV_GPR(R24)(r1)
+	lwz	r25, HOST_NV_GPR(R25)(r1)
+	lwz	r26, HOST_NV_GPR(R26)(r1)
+	lwz	r27, HOST_NV_GPR(R27)(r1)
+	lwz	r28, HOST_NV_GPR(R28)(r1)
+	lwz	r29, HOST_NV_GPR(R29)(r1)
+	lwz	r30, HOST_NV_GPR(R30)(r1)
+	lwz	r31, HOST_NV_GPR(R31)(r1)
+
+	/* Return to kvm_vcpu_run(). */
+	lwz	r4, HOST_STACK_LR(r1)
+	lwz	r5, HOST_CR(r1)
+	addi	r1, r1, HOST_STACK_SIZE
+	mtlr	r4
+	mtcr	r5
+	/* r3 still contains the return code from kvmppc_handle_exit(). */
+	blr
+
+
+/* Registers:
+ *  r3: kvm_run pointer
+ *  r4: vcpu pointer
+ */
+_GLOBAL(__kvmppc_vcpu_run)
+	stwu	r1, -HOST_STACK_SIZE(r1)
+	stw	r1, VCPU_HOST_STACK(r4)	/* Save stack pointer to vcpu. */
+
+	/* Save host state to stack. */
+	stw	r3, HOST_RUN(r1)
+	mflr	r3
+	stw	r3, HOST_STACK_LR(r1)
+	mfcr	r5
+	stw	r5, HOST_CR(r1)
+
+	/* Save host non-volatile register state to stack. */
+	stw	r14, HOST_NV_GPR(R14)(r1)
+	stw	r15, HOST_NV_GPR(R15)(r1)
+	stw	r16, HOST_NV_GPR(R16)(r1)
+	stw	r17, HOST_NV_GPR(R17)(r1)
+	stw	r18, HOST_NV_GPR(R18)(r1)
+	stw	r19, HOST_NV_GPR(R19)(r1)
+	stw	r20, HOST_NV_GPR(R20)(r1)
+	stw	r21, HOST_NV_GPR(R21)(r1)
+	stw	r22, HOST_NV_GPR(R22)(r1)
+	stw	r23, HOST_NV_GPR(R23)(r1)
+	stw	r24, HOST_NV_GPR(R24)(r1)
+	stw	r25, HOST_NV_GPR(R25)(r1)
+	stw	r26, HOST_NV_GPR(R26)(r1)
+	stw	r27, HOST_NV_GPR(R27)(r1)
+	stw	r28, HOST_NV_GPR(R28)(r1)
+	stw	r29, HOST_NV_GPR(R29)(r1)
+	stw	r30, HOST_NV_GPR(R30)(r1)
+	stw	r31, HOST_NV_GPR(R31)(r1)
+
+	/* Load guest non-volatiles. */
+	lwz	r14, VCPU_GPR(R14)(r4)
+	lwz	r15, VCPU_GPR(R15)(r4)
+	lwz	r16, VCPU_GPR(R16)(r4)
+	lwz	r17, VCPU_GPR(R17)(r4)
+	lwz	r18, VCPU_GPR(R18)(r4)
+	lwz	r19, VCPU_GPR(R19)(r4)
+	lwz	r20, VCPU_GPR(R20)(r4)
+	lwz	r21, VCPU_GPR(R21)(r4)
+	lwz	r22, VCPU_GPR(R22)(r4)
+	lwz	r23, VCPU_GPR(R23)(r4)
+	lwz	r24, VCPU_GPR(R24)(r4)
+	lwz	r25, VCPU_GPR(R25)(r4)
+	lwz	r26, VCPU_GPR(R26)(r4)
+	lwz	r27, VCPU_GPR(R27)(r4)
+	lwz	r28, VCPU_GPR(R28)(r4)
+	lwz	r29, VCPU_GPR(R29)(r4)
+	lwz	r30, VCPU_GPR(R30)(r4)
+	lwz	r31, VCPU_GPR(R31)(r4)
+
+#ifdef CONFIG_SPE
+	/* save host SPEFSCR and load guest SPEFSCR */
+	mfspr	r3, SPRN_SPEFSCR
+	stw	r3, VCPU_HOST_SPEFSCR(r4)
+	lwz	r3, VCPU_SPEFSCR(r4)
+	mtspr	SPRN_SPEFSCR, r3
+#endif
+
+lightweight_exit:
+	stw	r2, HOST_R2(r1)
+
+	mfspr	r3, SPRN_PID
+	stw	r3, VCPU_HOST_PID(r4)
+	lwz	r3, VCPU_SHADOW_PID(r4)
+	mtspr	SPRN_PID, r3
+
+#ifdef CONFIG_FSL_BOOKE
+	lwz	r3, VCPU_SHADOW_PID1(r4)
+	mtspr	SPRN_PID1, r3
+#endif
+
+	/* Load some guest volatiles. */
+	lwz	r0, VCPU_GPR(R0)(r4)
+	lwz	r2, VCPU_GPR(R2)(r4)
+	lwz	r9, VCPU_GPR(R9)(r4)
+	lwz	r10, VCPU_GPR(R10)(r4)
+	lwz	r11, VCPU_GPR(R11)(r4)
+	lwz	r12, VCPU_GPR(R12)(r4)
+	lwz	r13, VCPU_GPR(R13)(r4)
+	lwz	r3, VCPU_LR(r4)
+	mtlr	r3
+	lwz	r3, VCPU_XER(r4)
+	mtxer	r3
+
+	/* Switch the IVPR. XXX If we take a TLB miss after this we're screwed,
+	 * so how do we make sure vcpu won't fault? */
+	lis	r8, kvmppc_booke_handlers@ha
+	lwz	r8, kvmppc_booke_handlers@l(r8)
+	mtspr	SPRN_IVPR, r8
+
+	lwz	r5, VCPU_SHARED(r4)
+
+	/* Can't switch the stack pointer until after IVPR is switched,
+	 * because host interrupt handlers would get confused. */
+	lwz	r1, VCPU_GPR(R1)(r4)
+
+	/*
+	 * Host interrupt handlers may have clobbered these
+	 * guest-readable SPRGs, or the guest kernel may have
+	 * written directly to the shared area, so we
+	 * need to reload them here with the guest's values.
+	 */
+	PPC_LD(r3, VCPU_SHARED_SPRG4, r5)
+	mtspr	SPRN_SPRG4W, r3
+	PPC_LD(r3, VCPU_SHARED_SPRG5, r5)
+	mtspr	SPRN_SPRG5W, r3
+	PPC_LD(r3, VCPU_SHARED_SPRG6, r5)
+	mtspr	SPRN_SPRG6W, r3
+	PPC_LD(r3, VCPU_SHARED_SPRG7, r5)
+	mtspr	SPRN_SPRG7W, r3
+
+#ifdef CONFIG_KVM_EXIT_TIMING
+	/* save enter time */
+1:
+	mfspr	r6, SPRN_TBRU
+	mfspr	r7, SPRN_TBRL
+	mfspr	r8, SPRN_TBRU
+	cmpw	r8, r6
+	bne	1b
+	stw	r7, VCPU_TIMING_LAST_ENTER_TBL(r4)
+	stw	r8, VCPU_TIMING_LAST_ENTER_TBU(r4)
+#endif
+
+	/* Finish loading guest volatiles and jump to guest. */
+	lwz	r3, VCPU_CTR(r4)
+	lwz	r5, VCPU_CR(r4)
+	lwz	r6, VCPU_PC(r4)
+	lwz	r7, VCPU_SHADOW_MSR(r4)
+	mtctr	r3
+	mtcr	r5
+	mtsrr0	r6
+	mtsrr1	r7
+	lwz	r5, VCPU_GPR(R5)(r4)
+	lwz	r6, VCPU_GPR(R6)(r4)
+	lwz	r7, VCPU_GPR(R7)(r4)
+	lwz	r8, VCPU_GPR(R8)(r4)
+
+	/* Clear any debug events which occurred since we disabled MSR[DE].
+	 * XXX This gives us a 3-instruction window in which a breakpoint
+	 * intended for guest context could fire in the host instead. */
+	lis	r3, 0xffff
+	ori	r3, r3, 0xffff
+	mtspr	SPRN_DBSR, r3
+
+	lwz	r3, VCPU_GPR(R3)(r4)
+	lwz	r4, VCPU_GPR(R4)(r4)
+	rfi
+
+	.data
+	.align	4
+	.globl	kvmppc_booke_handler_addr
+kvmppc_booke_handler_addr:
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_CRITICAL
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_MACHINE_CHECK
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_DATA_STORAGE
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_INST_STORAGE
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_EXTERNAL
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_ALIGNMENT
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_PROGRAM
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_FP_UNAVAIL
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_SYSCALL
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_AP_UNAVAIL
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_DECREMENTER
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_FIT
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_WATCHDOG
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_DTLB_MISS
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_ITLB_MISS
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_DEBUG
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_UNAVAIL
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_FP_DATA
+KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_FP_ROUND
+KVM_HANDLER_END /*Always keep this in end*/
+
+#ifdef CONFIG_SPE
+_GLOBAL(kvmppc_save_guest_spe)
+	cmpi	0,r3,0
+	beqlr-
+	SAVE_32EVRS(0, r4, r3, VCPU_EVR)
+	evxor   evr6, evr6, evr6
+	evmwumiaa evr6, evr6, evr6
+	li	r4,VCPU_ACC
+	evstddx evr6, r4, r3		/* save acc */
+	blr
+
+_GLOBAL(kvmppc_load_guest_spe)
+	cmpi	0,r3,0
+	beqlr-
+	li      r4,VCPU_ACC
+	evlddx  evr6,r4,r3
+	evmra   evr6,evr6		/* load acc */
+	REST_32EVRS(0, r4, r3, VCPU_EVR)
+	blr
+#endif
diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
new file mode 100644
index 000000000..81bd8a07a
--- /dev/null
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -0,0 +1,689 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (C) 2010-2011 Freescale Semiconductor, Inc.
+ *
+ * Author: Varun Sethi <varun.sethi@freescale.com>
+ * Author: Scott Wood <scotwood@freescale.com>
+ * Author: Mihai Caraman <mihai.caraman@freescale.com>
+ *
+ * This file is derived from arch/powerpc/kvm/booke_interrupts.S
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-compat.h>
+#include <asm/asm-offsets.h>
+#include <asm/bitsperlong.h>
+
+#ifdef CONFIG_64BIT
+#include <asm/exception-64e.h>
+#include <asm/hw_irq.h>
+#include <asm/irqflags.h>
+#else
+#include "../kernel/head_booke.h" /* for THREAD_NORMSAVE() */
+#endif
+
+#define LONGBYTES		(BITS_PER_LONG / 8)
+
+#define VCPU_GUEST_SPRG(n)	(VCPU_GUEST_SPRGS + (n * LONGBYTES))
+
+/* The host stack layout: */
+#define HOST_R1         0 /* Implied by stwu. */
+#define HOST_CALLEE_LR  PPC_LR_STKOFF
+#define HOST_RUN        (HOST_CALLEE_LR + LONGBYTES)
+/*
+ * r2 is special: it holds 'current', and it made nonvolatile in the
+ * kernel with the -ffixed-r2 gcc option.
+ */
+#define HOST_R2         (HOST_RUN + LONGBYTES)
+#define HOST_CR         (HOST_R2 + LONGBYTES)
+#define HOST_NV_GPRS    (HOST_CR + LONGBYTES)
+#define __HOST_NV_GPR(n)  (HOST_NV_GPRS + ((n - 14) * LONGBYTES))
+#define HOST_NV_GPR(n)  __HOST_NV_GPR(__REG_##n)
+#define HOST_MIN_STACK_SIZE (HOST_NV_GPR(R31) + LONGBYTES)
+#define HOST_STACK_SIZE ((HOST_MIN_STACK_SIZE + 15) & ~15) /* Align. */
+/* LR in caller stack frame. */
+#define HOST_STACK_LR	(HOST_STACK_SIZE + PPC_LR_STKOFF)
+
+#define NEED_EMU		0x00000001 /* emulation -- save nv regs */
+#define NEED_DEAR		0x00000002 /* save faulting DEAR */
+#define NEED_ESR		0x00000004 /* save faulting ESR */
+
+/*
+ * On entry:
+ * r4 = vcpu, r5 = srr0, r6 = srr1
+ * saved in vcpu: cr, ctr, r3-r13
+ */
+.macro kvm_handler_common intno, srr0, flags
+	/* Restore host stack pointer */
+	PPC_STL	r1, VCPU_GPR(R1)(r4)
+	PPC_STL	r2, VCPU_GPR(R2)(r4)
+	PPC_LL	r1, VCPU_HOST_STACK(r4)
+	PPC_LL	r2, HOST_R2(r1)
+
+	mfspr	r10, SPRN_PID
+	lwz	r8, VCPU_HOST_PID(r4)
+	PPC_LL	r11, VCPU_SHARED(r4)
+	PPC_STL	r14, VCPU_GPR(R14)(r4) /* We need a non-volatile GPR. */
+	li	r14, \intno
+
+	stw	r10, VCPU_GUEST_PID(r4)
+	mtspr	SPRN_PID, r8
+
+#ifdef CONFIG_KVM_EXIT_TIMING
+	/* save exit time */
+1:	mfspr	r7, SPRN_TBRU
+	mfspr	r8, SPRN_TBRL
+	mfspr	r9, SPRN_TBRU
+	cmpw	r9, r7
+	stw	r8, VCPU_TIMING_EXIT_TBL(r4)
+	bne-	1b
+	stw	r9, VCPU_TIMING_EXIT_TBU(r4)
+#endif
+
+	oris	r8, r6, MSR_CE@h
+	PPC_STD(r6, VCPU_SHARED_MSR, r11)
+	ori	r8, r8, MSR_ME | MSR_RI
+	PPC_STL	r5, VCPU_PC(r4)
+
+	/*
+	 * Make sure CE/ME/RI are set (if appropriate for exception type)
+	 * whether or not the guest had it set.  Since mfmsr/mtmsr are
+	 * somewhat expensive, skip in the common case where the guest
+	 * had all these bits set (and thus they're still set if
+	 * appropriate for the exception type).
+	 */
+	cmpw	r6, r8
+	beq	1f
+	mfmsr	r7
+	.if	\srr0 != SPRN_MCSRR0 && \srr0 != SPRN_CSRR0
+	oris	r7, r7, MSR_CE@h
+	.endif
+	.if	\srr0 != SPRN_MCSRR0
+	ori	r7, r7, MSR_ME | MSR_RI
+	.endif
+	mtmsr	r7
+1:
+
+	.if	\flags & NEED_EMU
+	PPC_STL	r15, VCPU_GPR(R15)(r4)
+	PPC_STL	r16, VCPU_GPR(R16)(r4)
+	PPC_STL	r17, VCPU_GPR(R17)(r4)
+	PPC_STL	r18, VCPU_GPR(R18)(r4)
+	PPC_STL	r19, VCPU_GPR(R19)(r4)
+	PPC_STL	r20, VCPU_GPR(R20)(r4)
+	PPC_STL	r21, VCPU_GPR(R21)(r4)
+	PPC_STL	r22, VCPU_GPR(R22)(r4)
+	PPC_STL	r23, VCPU_GPR(R23)(r4)
+	PPC_STL	r24, VCPU_GPR(R24)(r4)
+	PPC_STL	r25, VCPU_GPR(R25)(r4)
+	PPC_STL	r26, VCPU_GPR(R26)(r4)
+	PPC_STL	r27, VCPU_GPR(R27)(r4)
+	PPC_STL	r28, VCPU_GPR(R28)(r4)
+	PPC_STL	r29, VCPU_GPR(R29)(r4)
+	PPC_STL	r30, VCPU_GPR(R30)(r4)
+	PPC_STL	r31, VCPU_GPR(R31)(r4)
+
+	/*
+	 * We don't use external PID support. lwepx faults would need to be
+	 * handled by KVM and this implies aditional code in DO_KVM (for
+	 * DTB_MISS, DSI and LRAT) to check ESR[EPID] and EPLC[EGS] which
+	 * is too intrusive for the host. Get last instuction in
+	 * kvmppc_get_last_inst().
+	 */
+	li	r9, KVM_INST_FETCH_FAILED
+	stw	r9, VCPU_LAST_INST(r4)
+	.endif
+
+	.if	\flags & NEED_ESR
+	mfspr	r8, SPRN_ESR
+	PPC_STL	r8, VCPU_FAULT_ESR(r4)
+	.endif
+
+	.if	\flags & NEED_DEAR
+	mfspr	r9, SPRN_DEAR
+	PPC_STL	r9, VCPU_FAULT_DEAR(r4)
+	.endif
+
+	b	kvmppc_resume_host
+.endm
+
+#ifdef CONFIG_64BIT
+/* Exception types */
+#define EX_GEN			1
+#define EX_GDBELL		2
+#define EX_DBG			3
+#define EX_MC			4
+#define EX_CRIT			5
+#define EX_TLB			6
+
+/*
+ * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h
+ */
+.macro kvm_handler intno type scratch, paca_ex, ex_r10, ex_r11, srr0, srr1, flags
+ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
+	mr	r11, r4
+	/*
+	 * Get vcpu from Paca: paca->__current.thread->kvm_vcpu
+	 */
+	PPC_LL	r4, PACACURRENT(r13)
+	PPC_LL	r4, (THREAD + THREAD_KVM_VCPU)(r4)
+	stw	r10, VCPU_CR(r4)
+	PPC_STL r11, VCPU_GPR(R4)(r4)
+	PPC_STL	r5, VCPU_GPR(R5)(r4)
+	PPC_STL	r6, VCPU_GPR(R6)(r4)
+	PPC_STL	r8, VCPU_GPR(R8)(r4)
+	PPC_STL	r9, VCPU_GPR(R9)(r4)
+	.if \type == EX_TLB
+	PPC_LL	r5, EX_TLB_R13(r12)
+	PPC_LL	r6, EX_TLB_R10(r12)
+	PPC_LL	r8, EX_TLB_R11(r12)
+	mfspr	r12, \scratch
+	.else
+	mfspr	r5, \scratch
+	PPC_LL	r6, (\paca_ex + \ex_r10)(r13)
+	PPC_LL	r8, (\paca_ex + \ex_r11)(r13)
+	.endif
+	PPC_STL r5, VCPU_GPR(R13)(r4)
+	PPC_STL r3, VCPU_GPR(R3)(r4)
+	PPC_STL r7, VCPU_GPR(R7)(r4)
+	PPC_STL r12, VCPU_GPR(R12)(r4)
+	PPC_STL r6, VCPU_GPR(R10)(r4)
+	PPC_STL r8, VCPU_GPR(R11)(r4)
+	mfctr	r5
+	PPC_STL	r5, VCPU_CTR(r4)
+	mfspr	r5, \srr0
+	mfspr	r6, \srr1
+	kvm_handler_common \intno, \srr0, \flags
+.endm
+
+#define EX_PARAMS(type) \
+	EX_##type, \
+	SPRN_SPRG_##type##_SCRATCH, \
+	PACA_EX##type, \
+	EX_R10, \
+	EX_R11
+
+#define EX_PARAMS_TLB \
+	EX_TLB, \
+	SPRN_SPRG_GEN_SCRATCH, \
+	PACA_EXTLB, \
+	EX_TLB_R10, \
+	EX_TLB_R11
+
+kvm_handler BOOKE_INTERRUPT_CRITICAL, EX_PARAMS(CRIT), \
+	SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_MACHINE_CHECK, EX_PARAMS(MC), \
+	SPRN_MCSRR0, SPRN_MCSRR1, 0
+kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1,(NEED_EMU | NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_INST_STORAGE, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, NEED_ESR
+kvm_handler BOOKE_INTERRUPT_EXTERNAL, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_ALIGNMENT, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1,(NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_PROGRAM, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, (NEED_ESR | NEED_EMU)
+kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_DECREMENTER, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_FIT, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_WATCHDOG, EX_PARAMS(CRIT),\
+	SPRN_CSRR0, SPRN_CSRR1, 0
+/*
+ * Only bolted TLB miss exception handlers are supported for now
+ */
+kvm_handler BOOKE_INTERRUPT_DTLB_MISS, EX_PARAMS_TLB, \
+	SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_ITLB_MISS, EX_PARAMS_TLB, \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_ALTIVEC_UNAVAIL, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_ALTIVEC_ASSIST, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_DOORBELL, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_DOORBELL_CRITICAL, EX_PARAMS(CRIT), \
+	SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_HV_PRIV, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, NEED_EMU
+kvm_handler BOOKE_INTERRUPT_HV_SYSCALL, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_GUEST_DBELL, EX_PARAMS(GDBELL), \
+	SPRN_GSRR0, SPRN_GSRR1, 0
+kvm_handler BOOKE_INTERRUPT_GUEST_DBELL_CRIT, EX_PARAMS(CRIT), \
+	SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(DBG), \
+	SPRN_DSRR0, SPRN_DSRR1, 0
+kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(CRIT), \
+	SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_LRAT_ERROR, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR)
+#else
+/*
+ * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h
+ */
+.macro kvm_handler intno srr0, srr1, flags
+_GLOBAL(kvmppc_handler_\intno\()_\srr1)
+	PPC_LL	r11, THREAD_KVM_VCPU(r10)
+	PPC_STL r3, VCPU_GPR(R3)(r11)
+	mfspr	r3, SPRN_SPRG_RSCRATCH0
+	PPC_STL	r4, VCPU_GPR(R4)(r11)
+	PPC_LL	r4, THREAD_NORMSAVE(0)(r10)
+	PPC_STL	r5, VCPU_GPR(R5)(r11)
+	stw	r13, VCPU_CR(r11)
+	mfspr	r5, \srr0
+	PPC_STL	r3, VCPU_GPR(R10)(r11)
+	PPC_LL	r3, THREAD_NORMSAVE(2)(r10)
+	PPC_STL	r6, VCPU_GPR(R6)(r11)
+	PPC_STL	r4, VCPU_GPR(R11)(r11)
+	mfspr	r6, \srr1
+	PPC_STL	r7, VCPU_GPR(R7)(r11)
+	PPC_STL	r8, VCPU_GPR(R8)(r11)
+	PPC_STL	r9, VCPU_GPR(R9)(r11)
+	PPC_STL r3, VCPU_GPR(R13)(r11)
+	mfctr	r7
+	PPC_STL	r12, VCPU_GPR(R12)(r11)
+	PPC_STL	r7, VCPU_CTR(r11)
+	mr	r4, r11
+	kvm_handler_common \intno, \srr0, \flags
+.endm
+
+.macro kvm_lvl_handler intno scratch srr0, srr1, flags
+_GLOBAL(kvmppc_handler_\intno\()_\srr1)
+	mfspr	r10, SPRN_SPRG_THREAD
+	PPC_LL	r11, THREAD_KVM_VCPU(r10)
+	PPC_STL r3, VCPU_GPR(R3)(r11)
+	mfspr	r3, \scratch
+	PPC_STL	r4, VCPU_GPR(R4)(r11)
+	PPC_LL	r4, GPR9(r8)
+	PPC_STL	r5, VCPU_GPR(R5)(r11)
+	stw	r9, VCPU_CR(r11)
+	mfspr	r5, \srr0
+	PPC_STL	r3, VCPU_GPR(R8)(r11)
+	PPC_LL	r3, GPR10(r8)
+	PPC_STL	r6, VCPU_GPR(R6)(r11)
+	PPC_STL	r4, VCPU_GPR(R9)(r11)
+	mfspr	r6, \srr1
+	PPC_LL	r4, GPR11(r8)
+	PPC_STL	r7, VCPU_GPR(R7)(r11)
+	PPC_STL r3, VCPU_GPR(R10)(r11)
+	mfctr	r7
+	PPC_STL	r12, VCPU_GPR(R12)(r11)
+	PPC_STL r13, VCPU_GPR(R13)(r11)
+	PPC_STL	r4, VCPU_GPR(R11)(r11)
+	PPC_STL	r7, VCPU_CTR(r11)
+	mr	r4, r11
+	kvm_handler_common \intno, \srr0, \flags
+.endm
+
+kvm_lvl_handler BOOKE_INTERRUPT_CRITICAL, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_MACHINE_CHECK, \
+	SPRN_SPRG_RSCRATCH_MC, SPRN_MCSRR0, SPRN_MCSRR1, 0
+kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, \
+	SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_INST_STORAGE, SPRN_SRR0, SPRN_SRR1, NEED_ESR
+kvm_handler BOOKE_INTERRUPT_EXTERNAL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_ALIGNMENT, \
+	SPRN_SRR0, SPRN_SRR1, (NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_PROGRAM, SPRN_SRR0, SPRN_SRR1, (NEED_ESR | NEED_EMU)
+kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_SYSCALL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_DECREMENTER, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_FIT, SPRN_SRR0, SPRN_SRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_WATCHDOG, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_DTLB_MISS, \
+	SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_ITLB_MISS, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_DOORBELL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_DOORBELL_CRITICAL, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_HV_PRIV, SPRN_SRR0, SPRN_SRR1, NEED_EMU
+kvm_handler BOOKE_INTERRUPT_HV_SYSCALL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_GUEST_DBELL, SPRN_GSRR0, SPRN_GSRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_GUEST_DBELL_CRIT, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \
+	SPRN_SPRG_RSCRATCH_DBG, SPRN_DSRR0, SPRN_DSRR1, 0
+#endif
+
+/* Registers:
+ *  SPRG_SCRATCH0: guest r10
+ *  r4: vcpu pointer
+ *  r11: vcpu->arch.shared
+ *  r14: KVM exit number
+ */
+_GLOBAL(kvmppc_resume_host)
+	/* Save remaining volatile guest register state to vcpu. */
+	mfspr	r3, SPRN_VRSAVE
+	PPC_STL	r0, VCPU_GPR(R0)(r4)
+	mflr	r5
+	mfspr	r6, SPRN_SPRG4
+	PPC_STL	r5, VCPU_LR(r4)
+	mfspr	r7, SPRN_SPRG5
+	stw	r3, VCPU_VRSAVE(r4)
+#ifdef CONFIG_64BIT
+	PPC_LL	r3, PACA_SPRG_VDSO(r13)
+#endif
+	mfspr	r5, SPRN_SPRG9
+	PPC_STD(r6, VCPU_SHARED_SPRG4, r11)
+	mfspr	r8, SPRN_SPRG6
+	PPC_STD(r7, VCPU_SHARED_SPRG5, r11)
+	mfspr	r9, SPRN_SPRG7
+#ifdef CONFIG_64BIT
+	mtspr	SPRN_SPRG_VDSO_WRITE, r3
+#endif
+	PPC_STD(r5, VCPU_SPRG9, r4)
+	PPC_STD(r8, VCPU_SHARED_SPRG6, r11)
+	mfxer	r3
+	PPC_STD(r9, VCPU_SHARED_SPRG7, r11)
+
+	/* save guest MAS registers and restore host mas4 & mas6 */
+	mfspr	r5, SPRN_MAS0
+	PPC_STL	r3, VCPU_XER(r4)
+	mfspr	r6, SPRN_MAS1
+	stw	r5, VCPU_SHARED_MAS0(r11)
+	mfspr	r7, SPRN_MAS2
+	stw	r6, VCPU_SHARED_MAS1(r11)
+	PPC_STD(r7, VCPU_SHARED_MAS2, r11)
+	mfspr	r5, SPRN_MAS3
+	mfspr	r6, SPRN_MAS4
+	stw	r5, VCPU_SHARED_MAS7_3+4(r11)
+	mfspr	r7, SPRN_MAS6
+	stw	r6, VCPU_SHARED_MAS4(r11)
+	mfspr	r5, SPRN_MAS7
+	lwz	r6, VCPU_HOST_MAS4(r4)
+	stw	r7, VCPU_SHARED_MAS6(r11)
+	lwz	r8, VCPU_HOST_MAS6(r4)
+	mtspr	SPRN_MAS4, r6
+	stw	r5, VCPU_SHARED_MAS7_3+0(r11)
+	mtspr	SPRN_MAS6, r8
+	/* Enable MAS register updates via exception */
+	mfspr	r3, SPRN_EPCR
+	rlwinm	r3, r3, 0, ~SPRN_EPCR_DMIUH
+	mtspr	SPRN_EPCR, r3
+	isync
+
+#ifdef CONFIG_64BIT
+	/*
+	 * We enter with interrupts disabled in hardware, but
+	 * we need to call RECONCILE_IRQ_STATE to ensure
+	 * that the software state is kept in sync.
+	 */
+	RECONCILE_IRQ_STATE(r3,r5)
+#endif
+
+	/* Switch to kernel stack and jump to handler. */
+	PPC_LL	r3, HOST_RUN(r1)
+	mr	r5, r14 /* intno */
+	mr	r14, r4 /* Save vcpu pointer. */
+	bl	kvmppc_handle_exit
+
+	/* Restore vcpu pointer and the nonvolatiles we used. */
+	mr	r4, r14
+	PPC_LL	r14, VCPU_GPR(R14)(r4)
+
+	andi.	r5, r3, RESUME_FLAG_NV
+	beq	skip_nv_load
+	PPC_LL	r15, VCPU_GPR(R15)(r4)
+	PPC_LL	r16, VCPU_GPR(R16)(r4)
+	PPC_LL	r17, VCPU_GPR(R17)(r4)
+	PPC_LL	r18, VCPU_GPR(R18)(r4)
+	PPC_LL	r19, VCPU_GPR(R19)(r4)
+	PPC_LL	r20, VCPU_GPR(R20)(r4)
+	PPC_LL	r21, VCPU_GPR(R21)(r4)
+	PPC_LL	r22, VCPU_GPR(R22)(r4)
+	PPC_LL	r23, VCPU_GPR(R23)(r4)
+	PPC_LL	r24, VCPU_GPR(R24)(r4)
+	PPC_LL	r25, VCPU_GPR(R25)(r4)
+	PPC_LL	r26, VCPU_GPR(R26)(r4)
+	PPC_LL	r27, VCPU_GPR(R27)(r4)
+	PPC_LL	r28, VCPU_GPR(R28)(r4)
+	PPC_LL	r29, VCPU_GPR(R29)(r4)
+	PPC_LL	r30, VCPU_GPR(R30)(r4)
+	PPC_LL	r31, VCPU_GPR(R31)(r4)
+skip_nv_load:
+	/* Should we return to the guest? */
+	andi.	r5, r3, RESUME_FLAG_HOST
+	beq	lightweight_exit
+
+	srawi	r3, r3, 2 /* Shift -ERR back down. */
+
+heavyweight_exit:
+	/* Not returning to guest. */
+	PPC_LL	r5, HOST_STACK_LR(r1)
+	lwz	r6, HOST_CR(r1)
+
+	/*
+	 * We already saved guest volatile register state; now save the
+	 * non-volatiles.
+	 */
+
+	PPC_STL	r15, VCPU_GPR(R15)(r4)
+	PPC_STL	r16, VCPU_GPR(R16)(r4)
+	PPC_STL	r17, VCPU_GPR(R17)(r4)
+	PPC_STL	r18, VCPU_GPR(R18)(r4)
+	PPC_STL	r19, VCPU_GPR(R19)(r4)
+	PPC_STL	r20, VCPU_GPR(R20)(r4)
+	PPC_STL	r21, VCPU_GPR(R21)(r4)
+	PPC_STL	r22, VCPU_GPR(R22)(r4)
+	PPC_STL	r23, VCPU_GPR(R23)(r4)
+	PPC_STL	r24, VCPU_GPR(R24)(r4)
+	PPC_STL	r25, VCPU_GPR(R25)(r4)
+	PPC_STL	r26, VCPU_GPR(R26)(r4)
+	PPC_STL	r27, VCPU_GPR(R27)(r4)
+	PPC_STL	r28, VCPU_GPR(R28)(r4)
+	PPC_STL	r29, VCPU_GPR(R29)(r4)
+	PPC_STL	r30, VCPU_GPR(R30)(r4)
+	PPC_STL	r31, VCPU_GPR(R31)(r4)
+
+	/* Load host non-volatile register state from host stack. */
+	PPC_LL	r14, HOST_NV_GPR(R14)(r1)
+	PPC_LL	r15, HOST_NV_GPR(R15)(r1)
+	PPC_LL	r16, HOST_NV_GPR(R16)(r1)
+	PPC_LL	r17, HOST_NV_GPR(R17)(r1)
+	PPC_LL	r18, HOST_NV_GPR(R18)(r1)
+	PPC_LL	r19, HOST_NV_GPR(R19)(r1)
+	PPC_LL	r20, HOST_NV_GPR(R20)(r1)
+	PPC_LL	r21, HOST_NV_GPR(R21)(r1)
+	PPC_LL	r22, HOST_NV_GPR(R22)(r1)
+	PPC_LL	r23, HOST_NV_GPR(R23)(r1)
+	PPC_LL	r24, HOST_NV_GPR(R24)(r1)
+	PPC_LL	r25, HOST_NV_GPR(R25)(r1)
+	PPC_LL	r26, HOST_NV_GPR(R26)(r1)
+	PPC_LL	r27, HOST_NV_GPR(R27)(r1)
+	PPC_LL	r28, HOST_NV_GPR(R28)(r1)
+	PPC_LL	r29, HOST_NV_GPR(R29)(r1)
+	PPC_LL	r30, HOST_NV_GPR(R30)(r1)
+	PPC_LL	r31, HOST_NV_GPR(R31)(r1)
+
+	/* Return to kvm_vcpu_run(). */
+	mtlr	r5
+	mtcr	r6
+	addi	r1, r1, HOST_STACK_SIZE
+	/* r3 still contains the return code from kvmppc_handle_exit(). */
+	blr
+
+/* Registers:
+ *  r3: kvm_run pointer
+ *  r4: vcpu pointer
+ */
+_GLOBAL(__kvmppc_vcpu_run)
+	stwu	r1, -HOST_STACK_SIZE(r1)
+	PPC_STL	r1, VCPU_HOST_STACK(r4)	/* Save stack pointer to vcpu. */
+
+	/* Save host state to stack. */
+	PPC_STL	r3, HOST_RUN(r1)
+	mflr	r3
+	mfcr	r5
+	PPC_STL	r3, HOST_STACK_LR(r1)
+
+	stw	r5, HOST_CR(r1)
+
+	/* Save host non-volatile register state to stack. */
+	PPC_STL	r14, HOST_NV_GPR(R14)(r1)
+	PPC_STL	r15, HOST_NV_GPR(R15)(r1)
+	PPC_STL	r16, HOST_NV_GPR(R16)(r1)
+	PPC_STL	r17, HOST_NV_GPR(R17)(r1)
+	PPC_STL	r18, HOST_NV_GPR(R18)(r1)
+	PPC_STL	r19, HOST_NV_GPR(R19)(r1)
+	PPC_STL	r20, HOST_NV_GPR(R20)(r1)
+	PPC_STL	r21, HOST_NV_GPR(R21)(r1)
+	PPC_STL	r22, HOST_NV_GPR(R22)(r1)
+	PPC_STL	r23, HOST_NV_GPR(R23)(r1)
+	PPC_STL	r24, HOST_NV_GPR(R24)(r1)
+	PPC_STL	r25, HOST_NV_GPR(R25)(r1)
+	PPC_STL	r26, HOST_NV_GPR(R26)(r1)
+	PPC_STL	r27, HOST_NV_GPR(R27)(r1)
+	PPC_STL	r28, HOST_NV_GPR(R28)(r1)
+	PPC_STL	r29, HOST_NV_GPR(R29)(r1)
+	PPC_STL	r30, HOST_NV_GPR(R30)(r1)
+	PPC_STL	r31, HOST_NV_GPR(R31)(r1)
+
+	/* Load guest non-volatiles. */
+	PPC_LL	r14, VCPU_GPR(R14)(r4)
+	PPC_LL	r15, VCPU_GPR(R15)(r4)
+	PPC_LL	r16, VCPU_GPR(R16)(r4)
+	PPC_LL	r17, VCPU_GPR(R17)(r4)
+	PPC_LL	r18, VCPU_GPR(R18)(r4)
+	PPC_LL	r19, VCPU_GPR(R19)(r4)
+	PPC_LL	r20, VCPU_GPR(R20)(r4)
+	PPC_LL	r21, VCPU_GPR(R21)(r4)
+	PPC_LL	r22, VCPU_GPR(R22)(r4)
+	PPC_LL	r23, VCPU_GPR(R23)(r4)
+	PPC_LL	r24, VCPU_GPR(R24)(r4)
+	PPC_LL	r25, VCPU_GPR(R25)(r4)
+	PPC_LL	r26, VCPU_GPR(R26)(r4)
+	PPC_LL	r27, VCPU_GPR(R27)(r4)
+	PPC_LL	r28, VCPU_GPR(R28)(r4)
+	PPC_LL	r29, VCPU_GPR(R29)(r4)
+	PPC_LL	r30, VCPU_GPR(R30)(r4)
+	PPC_LL	r31, VCPU_GPR(R31)(r4)
+
+
+lightweight_exit:
+	PPC_STL	r2, HOST_R2(r1)
+
+	mfspr	r3, SPRN_PID
+	stw	r3, VCPU_HOST_PID(r4)
+	lwz	r3, VCPU_GUEST_PID(r4)
+	mtspr	SPRN_PID, r3
+
+	PPC_LL	r11, VCPU_SHARED(r4)
+	/* Disable MAS register updates via exception */
+	mfspr	r3, SPRN_EPCR
+	oris	r3, r3, SPRN_EPCR_DMIUH@h
+	mtspr	SPRN_EPCR, r3
+	isync
+	/* Save host mas4 and mas6 and load guest MAS registers */
+	mfspr	r3, SPRN_MAS4
+	stw	r3, VCPU_HOST_MAS4(r4)
+	mfspr	r3, SPRN_MAS6
+	stw	r3, VCPU_HOST_MAS6(r4)
+	lwz	r3, VCPU_SHARED_MAS0(r11)
+	lwz	r5, VCPU_SHARED_MAS1(r11)
+	PPC_LD(r6, VCPU_SHARED_MAS2, r11)
+	lwz	r7, VCPU_SHARED_MAS7_3+4(r11)
+	lwz	r8, VCPU_SHARED_MAS4(r11)
+	mtspr	SPRN_MAS0, r3
+	mtspr	SPRN_MAS1, r5
+	mtspr	SPRN_MAS2, r6
+	mtspr	SPRN_MAS3, r7
+	mtspr	SPRN_MAS4, r8
+	lwz	r3, VCPU_SHARED_MAS6(r11)
+	lwz	r5, VCPU_SHARED_MAS7_3+0(r11)
+	mtspr	SPRN_MAS6, r3
+	mtspr	SPRN_MAS7, r5
+
+	/*
+	 * Host interrupt handlers may have clobbered these guest-readable
+	 * SPRGs, so we need to reload them here with the guest's values.
+	 */
+	lwz	r3, VCPU_VRSAVE(r4)
+	PPC_LD(r5, VCPU_SHARED_SPRG4, r11)
+	mtspr	SPRN_VRSAVE, r3
+	PPC_LD(r6, VCPU_SHARED_SPRG5, r11)
+	mtspr	SPRN_SPRG4W, r5
+	PPC_LD(r7, VCPU_SHARED_SPRG6, r11)
+	mtspr	SPRN_SPRG5W, r6
+	PPC_LD(r8, VCPU_SHARED_SPRG7, r11)
+	mtspr	SPRN_SPRG6W, r7
+	PPC_LD(r5, VCPU_SPRG9, r4)
+	mtspr	SPRN_SPRG7W, r8
+	mtspr	SPRN_SPRG9, r5
+
+	/* Load some guest volatiles. */
+	PPC_LL	r3, VCPU_LR(r4)
+	PPC_LL	r5, VCPU_XER(r4)
+	PPC_LL	r6, VCPU_CTR(r4)
+	lwz	r7, VCPU_CR(r4)
+	PPC_LL	r8, VCPU_PC(r4)
+	PPC_LD(r9, VCPU_SHARED_MSR, r11)
+	PPC_LL	r0, VCPU_GPR(R0)(r4)
+	PPC_LL	r1, VCPU_GPR(R1)(r4)
+	PPC_LL	r2, VCPU_GPR(R2)(r4)
+	PPC_LL	r10, VCPU_GPR(R10)(r4)
+	PPC_LL	r11, VCPU_GPR(R11)(r4)
+	PPC_LL	r12, VCPU_GPR(R12)(r4)
+	PPC_LL	r13, VCPU_GPR(R13)(r4)
+	mtlr	r3
+	mtxer	r5
+	mtctr	r6
+	mtsrr0	r8
+	mtsrr1	r9
+
+#ifdef CONFIG_KVM_EXIT_TIMING
+	/* save enter time */
+1:
+	mfspr	r6, SPRN_TBRU
+	mfspr	r9, SPRN_TBRL
+	mfspr	r8, SPRN_TBRU
+	cmpw	r8, r6
+	stw	r9, VCPU_TIMING_LAST_ENTER_TBL(r4)
+	bne	1b
+	stw	r8, VCPU_TIMING_LAST_ENTER_TBU(r4)
+#endif
+
+	/*
+	 * Don't execute any instruction which can change CR after
+	 * below instruction.
+	 */
+	mtcr	r7
+
+	/* Finish loading guest volatiles and jump to guest. */
+	PPC_LL	r5, VCPU_GPR(R5)(r4)
+	PPC_LL	r6, VCPU_GPR(R6)(r4)
+	PPC_LL	r7, VCPU_GPR(R7)(r4)
+	PPC_LL	r8, VCPU_GPR(R8)(r4)
+	PPC_LL	r9, VCPU_GPR(R9)(r4)
+
+	PPC_LL	r3, VCPU_GPR(R3)(r4)
+	PPC_LL	r4, VCPU_GPR(R4)(r4)
+	rfi
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
new file mode 100644
index 000000000..b29ce752c
--- /dev/null
+++ b/arch/powerpc/kvm/e500.c
@@ -0,0 +1,571 @@
+/*
+ * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Yu Liu, <yu.liu@freescale.com>
+ *
+ * Description:
+ * This file is derived from arch/powerpc/kvm/44x.c,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+
+#include "../mm/mmu_decl.h"
+#include "booke.h"
+#include "e500.h"
+
+struct id {
+	unsigned long val;
+	struct id **pentry;
+};
+
+#define NUM_TIDS 256
+
+/*
+ * This table provide mappings from:
+ * (guestAS,guestTID,guestPR) --> ID of physical cpu
+ * guestAS	[0..1]
+ * guestTID	[0..255]
+ * guestPR	[0..1]
+ * ID		[1..255]
+ * Each vcpu keeps one vcpu_id_table.
+ */
+struct vcpu_id_table {
+	struct id id[2][NUM_TIDS][2];
+};
+
+/*
+ * This table provide reversed mappings of vcpu_id_table:
+ * ID --> address of vcpu_id_table item.
+ * Each physical core has one pcpu_id_table.
+ */
+struct pcpu_id_table {
+	struct id *entry[NUM_TIDS];
+};
+
+static DEFINE_PER_CPU(struct pcpu_id_table, pcpu_sids);
+
+/* This variable keeps last used shadow ID on local core.
+ * The valid range of shadow ID is [1..255] */
+static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid);
+
+/*
+ * Allocate a free shadow id and setup a valid sid mapping in given entry.
+ * A mapping is only valid when vcpu_id_table and pcpu_id_table are match.
+ *
+ * The caller must have preemption disabled, and keep it that way until
+ * it has finished with the returned shadow id (either written into the
+ * TLB or arch.shadow_pid, or discarded).
+ */
+static inline int local_sid_setup_one(struct id *entry)
+{
+	unsigned long sid;
+	int ret = -1;
+
+	sid = __this_cpu_inc_return(pcpu_last_used_sid);
+	if (sid < NUM_TIDS) {
+		__this_cpu_write(pcpu_sids.entry[sid], entry);
+		entry->val = sid;
+		entry->pentry = this_cpu_ptr(&pcpu_sids.entry[sid]);
+		ret = sid;
+	}
+
+	/*
+	 * If sid == NUM_TIDS, we've run out of sids.  We return -1, and
+	 * the caller will invalidate everything and start over.
+	 *
+	 * sid > NUM_TIDS indicates a race, which we disable preemption to
+	 * avoid.
+	 */
+	WARN_ON(sid > NUM_TIDS);
+
+	return ret;
+}
+
+/*
+ * Check if given entry contain a valid shadow id mapping.
+ * An ID mapping is considered valid only if
+ * both vcpu and pcpu know this mapping.
+ *
+ * The caller must have preemption disabled, and keep it that way until
+ * it has finished with the returned shadow id (either written into the
+ * TLB or arch.shadow_pid, or discarded).
+ */
+static inline int local_sid_lookup(struct id *entry)
+{
+	if (entry && entry->val != 0 &&
+	    __this_cpu_read(pcpu_sids.entry[entry->val]) == entry &&
+	    entry->pentry == this_cpu_ptr(&pcpu_sids.entry[entry->val]))
+		return entry->val;
+	return -1;
+}
+
+/* Invalidate all id mappings on local core -- call with preempt disabled */
+static inline void local_sid_destroy_all(void)
+{
+	__this_cpu_write(pcpu_last_used_sid, 0);
+	memset(this_cpu_ptr(&pcpu_sids), 0, sizeof(pcpu_sids));
+}
+
+static void *kvmppc_e500_id_table_alloc(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	vcpu_e500->idt = kzalloc(sizeof(struct vcpu_id_table), GFP_KERNEL);
+	return vcpu_e500->idt;
+}
+
+static void kvmppc_e500_id_table_free(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	kfree(vcpu_e500->idt);
+	vcpu_e500->idt = NULL;
+}
+
+/* Map guest pid to shadow.
+ * We use PID to keep shadow of current guest non-zero PID,
+ * and use PID1 to keep shadow of guest zero PID.
+ * So that guest tlbe with TID=0 can be accessed at any time */
+static void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	preempt_disable();
+	vcpu_e500->vcpu.arch.shadow_pid = kvmppc_e500_get_sid(vcpu_e500,
+			get_cur_as(&vcpu_e500->vcpu),
+			get_cur_pid(&vcpu_e500->vcpu),
+			get_cur_pr(&vcpu_e500->vcpu), 1);
+	vcpu_e500->vcpu.arch.shadow_pid1 = kvmppc_e500_get_sid(vcpu_e500,
+			get_cur_as(&vcpu_e500->vcpu), 0,
+			get_cur_pr(&vcpu_e500->vcpu), 1);
+	preempt_enable();
+}
+
+/* Invalidate all mappings on vcpu */
+static void kvmppc_e500_id_table_reset_all(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	memset(vcpu_e500->idt, 0, sizeof(struct vcpu_id_table));
+
+	/* Update shadow pid when mappings are changed */
+	kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+}
+
+/* Invalidate one ID mapping on vcpu */
+static inline void kvmppc_e500_id_table_reset_one(
+			       struct kvmppc_vcpu_e500 *vcpu_e500,
+			       int as, int pid, int pr)
+{
+	struct vcpu_id_table *idt = vcpu_e500->idt;
+
+	BUG_ON(as >= 2);
+	BUG_ON(pid >= NUM_TIDS);
+	BUG_ON(pr >= 2);
+
+	idt->id[as][pid][pr].val = 0;
+	idt->id[as][pid][pr].pentry = NULL;
+
+	/* Update shadow pid when mappings are changed */
+	kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+}
+
+/*
+ * Map guest (vcpu,AS,ID,PR) to physical core shadow id.
+ * This function first lookup if a valid mapping exists,
+ * if not, then creates a new one.
+ *
+ * The caller must have preemption disabled, and keep it that way until
+ * it has finished with the returned shadow id (either written into the
+ * TLB or arch.shadow_pid, or discarded).
+ */
+unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500,
+				 unsigned int as, unsigned int gid,
+				 unsigned int pr, int avoid_recursion)
+{
+	struct vcpu_id_table *idt = vcpu_e500->idt;
+	int sid;
+
+	BUG_ON(as >= 2);
+	BUG_ON(gid >= NUM_TIDS);
+	BUG_ON(pr >= 2);
+
+	sid = local_sid_lookup(&idt->id[as][gid][pr]);
+
+	while (sid <= 0) {
+		/* No mapping yet */
+		sid = local_sid_setup_one(&idt->id[as][gid][pr]);
+		if (sid <= 0) {
+			_tlbil_all();
+			local_sid_destroy_all();
+		}
+
+		/* Update shadow pid when mappings are changed */
+		if (!avoid_recursion)
+			kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+	}
+
+	return sid;
+}
+
+unsigned int kvmppc_e500_get_tlb_stid(struct kvm_vcpu *vcpu,
+				      struct kvm_book3e_206_tlb_entry *gtlbe)
+{
+	return kvmppc_e500_get_sid(to_e500(vcpu), get_tlb_ts(gtlbe),
+				   get_tlb_tid(gtlbe), get_cur_pr(vcpu), 0);
+}
+
+void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	if (vcpu->arch.pid != pid) {
+		vcpu_e500->pid[0] = vcpu->arch.pid = pid;
+		kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+	}
+}
+
+/* gtlbe must not be mapped by more than one host tlbe */
+void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500,
+                           struct kvm_book3e_206_tlb_entry *gtlbe)
+{
+	struct vcpu_id_table *idt = vcpu_e500->idt;
+	unsigned int pr, tid, ts, pid;
+	u32 val, eaddr;
+	unsigned long flags;
+
+	ts = get_tlb_ts(gtlbe);
+	tid = get_tlb_tid(gtlbe);
+
+	preempt_disable();
+
+	/* One guest ID may be mapped to two shadow IDs */
+	for (pr = 0; pr < 2; pr++) {
+		/*
+		 * The shadow PID can have a valid mapping on at most one
+		 * host CPU.  In the common case, it will be valid on this
+		 * CPU, in which case we do a local invalidation of the
+		 * specific address.
+		 *
+		 * If the shadow PID is not valid on the current host CPU,
+		 * we invalidate the entire shadow PID.
+		 */
+		pid = local_sid_lookup(&idt->id[ts][tid][pr]);
+		if (pid <= 0) {
+			kvmppc_e500_id_table_reset_one(vcpu_e500, ts, tid, pr);
+			continue;
+		}
+
+		/*
+		 * The guest is invalidating a 4K entry which is in a PID
+		 * that has a valid shadow mapping on this host CPU.  We
+		 * search host TLB to invalidate it's shadow TLB entry,
+		 * similar to __tlbil_va except that we need to look in AS1.
+		 */
+		val = (pid << MAS6_SPID_SHIFT) | MAS6_SAS;
+		eaddr = get_tlb_eaddr(gtlbe);
+
+		local_irq_save(flags);
+
+		mtspr(SPRN_MAS6, val);
+		asm volatile("tlbsx 0, %[eaddr]" : : [eaddr] "r" (eaddr));
+		val = mfspr(SPRN_MAS1);
+		if (val & MAS1_VALID) {
+			mtspr(SPRN_MAS1, val & ~MAS1_VALID);
+			asm volatile("tlbwe");
+		}
+
+		local_irq_restore(flags);
+	}
+
+	preempt_enable();
+}
+
+void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	kvmppc_e500_id_table_reset_all(vcpu_e500);
+}
+
+void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr)
+{
+	/* Recalc shadow pid since MSR changes */
+	kvmppc_e500_recalc_shadow_pid(to_e500(vcpu));
+}
+
+static void kvmppc_core_vcpu_load_e500(struct kvm_vcpu *vcpu, int cpu)
+{
+	kvmppc_booke_vcpu_load(vcpu, cpu);
+
+	/* Shadow PID may be expired on local core */
+	kvmppc_e500_recalc_shadow_pid(to_e500(vcpu));
+}
+
+static void kvmppc_core_vcpu_put_e500(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_SPE
+	if (vcpu->arch.shadow_msr & MSR_SPE)
+		kvmppc_vcpu_disable_spe(vcpu);
+#endif
+
+	kvmppc_booke_vcpu_put(vcpu);
+}
+
+int kvmppc_core_check_processor_compat(void)
+{
+	int r;
+
+	if (strcmp(cur_cpu_spec->cpu_name, "e500v2") == 0)
+		r = 0;
+	else
+		r = -ENOTSUPP;
+
+	return r;
+}
+
+static void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	struct kvm_book3e_206_tlb_entry *tlbe;
+
+	/* Insert large initial mapping for guest. */
+	tlbe = get_entry(vcpu_e500, 1, 0);
+	tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M);
+	tlbe->mas2 = 0;
+	tlbe->mas7_3 = E500_TLB_SUPER_PERM_MASK;
+
+	/* 4K map for serial output. Used by kernel wrapper. */
+	tlbe = get_entry(vcpu_e500, 1, 1);
+	tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K);
+	tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G;
+	tlbe->mas7_3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK;
+}
+
+int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	kvmppc_e500_tlb_setup(vcpu_e500);
+
+	/* Registers init */
+	vcpu->arch.pvr = mfspr(SPRN_PVR);
+	vcpu_e500->svr = mfspr(SPRN_SVR);
+
+	vcpu->arch.cpu_type = KVM_CPU_E500V2;
+
+	return 0;
+}
+
+static int kvmppc_core_get_sregs_e500(struct kvm_vcpu *vcpu,
+				      struct kvm_sregs *sregs)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	sregs->u.e.features |= KVM_SREGS_E_ARCH206_MMU | KVM_SREGS_E_SPE |
+	                       KVM_SREGS_E_PM;
+	sregs->u.e.impl_id = KVM_SREGS_E_IMPL_FSL;
+
+	sregs->u.e.impl.fsl.features = 0;
+	sregs->u.e.impl.fsl.svr = vcpu_e500->svr;
+	sregs->u.e.impl.fsl.hid0 = vcpu_e500->hid0;
+	sregs->u.e.impl.fsl.mcar = vcpu_e500->mcar;
+
+	sregs->u.e.ivor_high[0] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL];
+	sregs->u.e.ivor_high[1] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA];
+	sregs->u.e.ivor_high[2] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND];
+	sregs->u.e.ivor_high[3] =
+		vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR];
+
+	kvmppc_get_sregs_ivor(vcpu, sregs);
+	kvmppc_get_sregs_e500_tlb(vcpu, sregs);
+	return 0;
+}
+
+static int kvmppc_core_set_sregs_e500(struct kvm_vcpu *vcpu,
+				      struct kvm_sregs *sregs)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int ret;
+
+	if (sregs->u.e.impl_id == KVM_SREGS_E_IMPL_FSL) {
+		vcpu_e500->svr = sregs->u.e.impl.fsl.svr;
+		vcpu_e500->hid0 = sregs->u.e.impl.fsl.hid0;
+		vcpu_e500->mcar = sregs->u.e.impl.fsl.mcar;
+	}
+
+	ret = kvmppc_set_sregs_e500_tlb(vcpu, sregs);
+	if (ret < 0)
+		return ret;
+
+	if (!(sregs->u.e.features & KVM_SREGS_E_IVOR))
+		return 0;
+
+	if (sregs->u.e.features & KVM_SREGS_E_SPE) {
+		vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL] =
+			sregs->u.e.ivor_high[0];
+		vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA] =
+			sregs->u.e.ivor_high[1];
+		vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND] =
+			sregs->u.e.ivor_high[2];
+	}
+
+	if (sregs->u.e.features & KVM_SREGS_E_PM) {
+		vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] =
+			sregs->u.e.ivor_high[3];
+	}
+
+	return kvmppc_set_sregs_ivor(vcpu, sregs);
+}
+
+static int kvmppc_get_one_reg_e500(struct kvm_vcpu *vcpu, u64 id,
+				   union kvmppc_one_reg *val)
+{
+	int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val);
+	return r;
+}
+
+static int kvmppc_set_one_reg_e500(struct kvm_vcpu *vcpu, u64 id,
+				   union kvmppc_one_reg *val)
+{
+	int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val);
+	return r;
+}
+
+static struct kvm_vcpu *kvmppc_core_vcpu_create_e500(struct kvm *kvm,
+						     unsigned int id)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500;
+	struct kvm_vcpu *vcpu;
+	int err;
+
+	vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+	if (!vcpu_e500) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	vcpu = &vcpu_e500->vcpu;
+	err = kvm_vcpu_init(vcpu, kvm, id);
+	if (err)
+		goto free_vcpu;
+
+	if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL)
+		goto uninit_vcpu;
+
+	err = kvmppc_e500_tlb_init(vcpu_e500);
+	if (err)
+		goto uninit_id;
+
+	vcpu->arch.shared = (void*)__get_free_page(GFP_KERNEL|__GFP_ZERO);
+	if (!vcpu->arch.shared)
+		goto uninit_tlb;
+
+	return vcpu;
+
+uninit_tlb:
+	kvmppc_e500_tlb_uninit(vcpu_e500);
+uninit_id:
+	kvmppc_e500_id_table_free(vcpu_e500);
+uninit_vcpu:
+	kvm_vcpu_uninit(vcpu);
+free_vcpu:
+	kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
+out:
+	return ERR_PTR(err);
+}
+
+static void kvmppc_core_vcpu_free_e500(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	free_page((unsigned long)vcpu->arch.shared);
+	kvmppc_e500_tlb_uninit(vcpu_e500);
+	kvmppc_e500_id_table_free(vcpu_e500);
+	kvm_vcpu_uninit(vcpu);
+	kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
+}
+
+static int kvmppc_core_init_vm_e500(struct kvm *kvm)
+{
+	return 0;
+}
+
+static void kvmppc_core_destroy_vm_e500(struct kvm *kvm)
+{
+}
+
+static struct kvmppc_ops kvm_ops_e500 = {
+	.get_sregs = kvmppc_core_get_sregs_e500,
+	.set_sregs = kvmppc_core_set_sregs_e500,
+	.get_one_reg = kvmppc_get_one_reg_e500,
+	.set_one_reg = kvmppc_set_one_reg_e500,
+	.vcpu_load   = kvmppc_core_vcpu_load_e500,
+	.vcpu_put    = kvmppc_core_vcpu_put_e500,
+	.vcpu_create = kvmppc_core_vcpu_create_e500,
+	.vcpu_free   = kvmppc_core_vcpu_free_e500,
+	.mmu_destroy  = kvmppc_mmu_destroy_e500,
+	.init_vm = kvmppc_core_init_vm_e500,
+	.destroy_vm = kvmppc_core_destroy_vm_e500,
+	.emulate_op = kvmppc_core_emulate_op_e500,
+	.emulate_mtspr = kvmppc_core_emulate_mtspr_e500,
+	.emulate_mfspr = kvmppc_core_emulate_mfspr_e500,
+};
+
+static int __init kvmppc_e500_init(void)
+{
+	int r, i;
+	unsigned long ivor[3];
+	/* Process remaining handlers above the generic first 16 */
+	unsigned long *handler = &kvmppc_booke_handler_addr[16];
+	unsigned long handler_len;
+	unsigned long max_ivor = 0;
+
+	r = kvmppc_core_check_processor_compat();
+	if (r)
+		goto err_out;
+
+	r = kvmppc_booke_init();
+	if (r)
+		goto err_out;
+
+	/* copy extra E500 exception handlers */
+	ivor[0] = mfspr(SPRN_IVOR32);
+	ivor[1] = mfspr(SPRN_IVOR33);
+	ivor[2] = mfspr(SPRN_IVOR34);
+	for (i = 0; i < 3; i++) {
+		if (ivor[i] > ivor[max_ivor])
+			max_ivor = i;
+
+		handler_len = handler[i + 1] - handler[i];
+		memcpy((void *)kvmppc_booke_handlers + ivor[i],
+		       (void *)handler[i], handler_len);
+	}
+	handler_len = handler[max_ivor + 1] - handler[max_ivor];
+	flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers +
+			   ivor[max_ivor] + handler_len);
+
+	r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
+	if (r)
+		goto err_out;
+	kvm_ops_e500.owner = THIS_MODULE;
+	kvmppc_pr_ops = &kvm_ops_e500;
+
+err_out:
+	return r;
+}
+
+static void __exit kvmppc_e500_exit(void)
+{
+	kvmppc_pr_ops = NULL;
+	kvmppc_booke_exit();
+}
+
+module_init(kvmppc_e500_init);
+module_exit(kvmppc_e500_exit);
+MODULE_ALIAS_MISCDEV(KVM_MINOR);
+MODULE_ALIAS("devname:kvm");
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
new file mode 100644
index 000000000..72920bed3
--- /dev/null
+++ b/arch/powerpc/kvm/e500.h
@@ -0,0 +1,342 @@
+/*
+ * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Yu Liu <yu.liu@freescale.com>
+ *         Scott Wood <scottwood@freescale.com>
+ *         Ashish Kalra <ashish.kalra@freescale.com>
+ *         Varun Sethi <varun.sethi@freescale.com>
+ *
+ * Description:
+ * This file is based on arch/powerpc/kvm/44x_tlb.h and
+ * arch/powerpc/include/asm/kvm_44x.h by Hollis Blanchard <hollisb@us.ibm.com>,
+ * Copyright IBM Corp. 2007-2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef KVM_E500_H
+#define KVM_E500_H
+
+#include <linux/kvm_host.h>
+#include <asm/mmu-book3e.h>
+#include <asm/tlb.h>
+#include <asm/cputhreads.h>
+
+enum vcpu_ftr {
+	VCPU_FTR_MMU_V2
+};
+
+#define E500_PID_NUM   3
+#define E500_TLB_NUM   2
+
+/* entry is mapped somewhere in host TLB */
+#define E500_TLB_VALID		(1 << 31)
+/* TLB1 entry is mapped by host TLB1, tracked by bitmaps */
+#define E500_TLB_BITMAP		(1 << 30)
+/* TLB1 entry is mapped by host TLB0 */
+#define E500_TLB_TLB0		(1 << 29)
+/* bits [6-5] MAS2_X1 and MAS2_X0 and [4-0] bits for WIMGE */
+#define E500_TLB_MAS2_ATTR	(0x7f)
+
+struct tlbe_ref {
+	pfn_t pfn;		/* valid only for TLB0, except briefly */
+	unsigned int flags;	/* E500_TLB_* */
+};
+
+struct tlbe_priv {
+	struct tlbe_ref ref;
+};
+
+#ifdef CONFIG_KVM_E500V2
+struct vcpu_id_table;
+#endif
+
+struct kvmppc_e500_tlb_params {
+	int entries, ways, sets;
+};
+
+struct kvmppc_vcpu_e500 {
+	struct kvm_vcpu vcpu;
+
+	/* Unmodified copy of the guest's TLB -- shared with host userspace. */
+	struct kvm_book3e_206_tlb_entry *gtlb_arch;
+
+	/* Starting entry number in gtlb_arch[] */
+	int gtlb_offset[E500_TLB_NUM];
+
+	/* KVM internal information associated with each guest TLB entry */
+	struct tlbe_priv *gtlb_priv[E500_TLB_NUM];
+
+	struct kvmppc_e500_tlb_params gtlb_params[E500_TLB_NUM];
+
+	unsigned int gtlb_nv[E500_TLB_NUM];
+
+	unsigned int host_tlb1_nv;
+
+	u32 svr;
+	u32 l1csr0;
+	u32 l1csr1;
+	u32 hid0;
+	u32 hid1;
+	u64 mcar;
+
+	struct page **shared_tlb_pages;
+	int num_shared_tlb_pages;
+
+	u64 *g2h_tlb1_map;
+	unsigned int *h2g_tlb1_rmap;
+
+	/* Minimum and maximum address mapped my TLB1 */
+	unsigned long tlb1_min_eaddr;
+	unsigned long tlb1_max_eaddr;
+
+#ifdef CONFIG_KVM_E500V2
+	u32 pid[E500_PID_NUM];
+
+	/* vcpu id table */
+	struct vcpu_id_table *idt;
+#endif
+};
+
+static inline struct kvmppc_vcpu_e500 *to_e500(struct kvm_vcpu *vcpu)
+{
+	return container_of(vcpu, struct kvmppc_vcpu_e500, vcpu);
+}
+
+
+/* This geometry is the legacy default -- can be overridden by userspace */
+#define KVM_E500_TLB0_WAY_SIZE		128
+#define KVM_E500_TLB0_WAY_NUM		2
+
+#define KVM_E500_TLB0_SIZE  (KVM_E500_TLB0_WAY_SIZE * KVM_E500_TLB0_WAY_NUM)
+#define KVM_E500_TLB1_SIZE  16
+
+#define index_of(tlbsel, esel)	(((tlbsel) << 16) | ((esel) & 0xFFFF))
+#define tlbsel_of(index)	((index) >> 16)
+#define esel_of(index)		((index) & 0xFFFF)
+
+#define E500_TLB_USER_PERM_MASK (MAS3_UX|MAS3_UR|MAS3_UW)
+#define E500_TLB_SUPER_PERM_MASK (MAS3_SX|MAS3_SR|MAS3_SW)
+#define MAS2_ATTRIB_MASK \
+	  (MAS2_X0 | MAS2_X1 | MAS2_E | MAS2_G)
+#define MAS3_ATTRIB_MASK \
+	  (MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3 \
+	   | E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK)
+
+int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500,
+				ulong value);
+int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu);
+int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu);
+int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea);
+int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, gva_t ea);
+int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea);
+int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500);
+void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500);
+
+void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
+int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
+
+int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
+				union kvmppc_one_reg *val);
+int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
+			       union kvmppc_one_reg *val);
+
+#ifdef CONFIG_KVM_E500V2
+unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500,
+				 unsigned int as, unsigned int gid,
+				 unsigned int pr, int avoid_recursion);
+#endif
+
+/* TLB helper functions */
+static inline unsigned int
+get_tlb_size(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return (tlbe->mas1 >> 7) & 0x1f;
+}
+
+static inline gva_t get_tlb_eaddr(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return tlbe->mas2 & MAS2_EPN;
+}
+
+static inline u64 get_tlb_bytes(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	unsigned int pgsize = get_tlb_size(tlbe);
+	return 1ULL << 10 << pgsize;
+}
+
+static inline gva_t get_tlb_end(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	u64 bytes = get_tlb_bytes(tlbe);
+	return get_tlb_eaddr(tlbe) + bytes - 1;
+}
+
+static inline u64 get_tlb_raddr(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return tlbe->mas7_3 & ~0xfffULL;
+}
+
+static inline unsigned int
+get_tlb_tid(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return (tlbe->mas1 >> 16) & 0xff;
+}
+
+static inline unsigned int
+get_tlb_ts(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return (tlbe->mas1 >> 12) & 0x1;
+}
+
+static inline unsigned int
+get_tlb_v(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return (tlbe->mas1 >> 31) & 0x1;
+}
+
+static inline unsigned int
+get_tlb_iprot(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return (tlbe->mas1 >> 30) & 0x1;
+}
+
+static inline unsigned int
+get_tlb_tsize(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return (tlbe->mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT;
+}
+
+static inline unsigned int get_cur_pid(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.pid & 0xff;
+}
+
+static inline unsigned int get_cur_as(struct kvm_vcpu *vcpu)
+{
+	return !!(vcpu->arch.shared->msr & (MSR_IS | MSR_DS));
+}
+
+static inline unsigned int get_cur_pr(struct kvm_vcpu *vcpu)
+{
+	return !!(vcpu->arch.shared->msr & MSR_PR);
+}
+
+static inline unsigned int get_cur_spid(const struct kvm_vcpu *vcpu)
+{
+	return (vcpu->arch.shared->mas6 >> 16) & 0xff;
+}
+
+static inline unsigned int get_cur_sas(const struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.shared->mas6 & 0x1;
+}
+
+static inline unsigned int get_tlb_tlbsel(const struct kvm_vcpu *vcpu)
+{
+	/*
+	 * Manual says that tlbsel has 2 bits wide.
+	 * Since we only have two TLBs, only lower bit is used.
+	 */
+	return (vcpu->arch.shared->mas0 >> 28) & 0x1;
+}
+
+static inline unsigned int get_tlb_nv_bit(const struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.shared->mas0 & 0xfff;
+}
+
+static inline unsigned int get_tlb_esel_bit(const struct kvm_vcpu *vcpu)
+{
+	return (vcpu->arch.shared->mas0 >> 16) & 0xfff;
+}
+
+static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
+			const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	gpa_t gpa;
+
+	if (!get_tlb_v(tlbe))
+		return 0;
+
+#ifndef CONFIG_KVM_BOOKE_HV
+	/* Does it match current guest AS? */
+	/* XXX what about IS != DS? */
+	if (get_tlb_ts(tlbe) != !!(vcpu->arch.shared->msr & MSR_IS))
+		return 0;
+#endif
+
+	gpa = get_tlb_raddr(tlbe);
+	if (!gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT))
+		/* Mapping is not for RAM. */
+		return 0;
+
+	return 1;
+}
+
+static inline struct kvm_book3e_206_tlb_entry *get_entry(
+	struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, int entry)
+{
+	int offset = vcpu_e500->gtlb_offset[tlbsel];
+	return &vcpu_e500->gtlb_arch[offset + entry];
+}
+
+void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500,
+			   struct kvm_book3e_206_tlb_entry *gtlbe);
+void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500);
+
+#ifdef CONFIG_KVM_BOOKE_HV
+#define kvmppc_e500_get_tlb_stid(vcpu, gtlbe)       get_tlb_tid(gtlbe)
+#define get_tlbmiss_tid(vcpu)           get_cur_pid(vcpu)
+#define get_tlb_sts(gtlbe)              (gtlbe->mas1 & MAS1_TS)
+
+/*
+ * These functions should be called with preemption disabled
+ * and the returned value is valid only in that context
+ */
+static inline int get_thread_specific_lpid(int vm_lpid)
+{
+	int vcpu_lpid = vm_lpid;
+
+	if (threads_per_core == 2)
+		vcpu_lpid |= smp_processor_id() & 1;
+
+	return vcpu_lpid;
+}
+
+static inline int get_lpid(struct kvm_vcpu *vcpu)
+{
+	return get_thread_specific_lpid(vcpu->kvm->arch.lpid);
+}
+#else
+unsigned int kvmppc_e500_get_tlb_stid(struct kvm_vcpu *vcpu,
+				      struct kvm_book3e_206_tlb_entry *gtlbe);
+
+static inline unsigned int get_tlbmiss_tid(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	unsigned int tidseld = (vcpu->arch.shared->mas4 >> 16) & 0xf;
+
+	return vcpu_e500->pid[tidseld];
+}
+
+/* Force TS=1 for all guest mappings. */
+#define get_tlb_sts(gtlbe)              (MAS1_TS)
+#endif /* !BOOKE_HV */
+
+static inline bool has_feature(const struct kvm_vcpu *vcpu,
+			       enum vcpu_ftr ftr)
+{
+	bool has_ftr;
+	switch (ftr) {
+	case VCPU_FTR_MMU_V2:
+		has_ftr = ((vcpu->arch.mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2);
+		break;
+	default:
+		return false;
+	}
+	return has_ftr;
+}
+
+#endif /* KVM_E500_H */
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
new file mode 100644
index 000000000..ce7291c79
--- /dev/null
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -0,0 +1,430 @@
+/*
+ * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Yu Liu, <yu.liu@freescale.com>
+ *
+ * Description:
+ * This file is derived from arch/powerpc/kvm/44x_emulate.c,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/kvm_ppc.h>
+#include <asm/disassemble.h>
+#include <asm/dbell.h>
+
+#include "booke.h"
+#include "e500.h"
+
+#define XOP_DCBTLS  166
+#define XOP_MSGSND  206
+#define XOP_MSGCLR  238
+#define XOP_TLBIVAX 786
+#define XOP_TLBSX   914
+#define XOP_TLBRE   946
+#define XOP_TLBWE   978
+#define XOP_TLBILX  18
+#define XOP_EHPRIV  270
+
+#ifdef CONFIG_KVM_E500MC
+static int dbell2prio(ulong param)
+{
+	int msg = param & PPC_DBELL_TYPE_MASK;
+	int prio = -1;
+
+	switch (msg) {
+	case PPC_DBELL_TYPE(PPC_DBELL):
+		prio = BOOKE_IRQPRIO_DBELL;
+		break;
+	case PPC_DBELL_TYPE(PPC_DBELL_CRIT):
+		prio = BOOKE_IRQPRIO_DBELL_CRIT;
+		break;
+	default:
+		break;
+	}
+
+	return prio;
+}
+
+static int kvmppc_e500_emul_msgclr(struct kvm_vcpu *vcpu, int rb)
+{
+	ulong param = vcpu->arch.gpr[rb];
+	int prio = dbell2prio(param);
+
+	if (prio < 0)
+		return EMULATE_FAIL;
+
+	clear_bit(prio, &vcpu->arch.pending_exceptions);
+	return EMULATE_DONE;
+}
+
+static int kvmppc_e500_emul_msgsnd(struct kvm_vcpu *vcpu, int rb)
+{
+	ulong param = vcpu->arch.gpr[rb];
+	int prio = dbell2prio(rb);
+	int pir = param & PPC_DBELL_PIR_MASK;
+	int i;
+	struct kvm_vcpu *cvcpu;
+
+	if (prio < 0)
+		return EMULATE_FAIL;
+
+	kvm_for_each_vcpu(i, cvcpu, vcpu->kvm) {
+		int cpir = cvcpu->arch.shared->pir;
+		if ((param & PPC_DBELL_MSG_BRDCAST) || (cpir == pir)) {
+			set_bit(prio, &cvcpu->arch.pending_exceptions);
+			kvm_vcpu_kick(cvcpu);
+		}
+	}
+
+	return EMULATE_DONE;
+}
+#endif
+
+static int kvmppc_e500_emul_ehpriv(struct kvm_run *run, struct kvm_vcpu *vcpu,
+				   unsigned int inst, int *advance)
+{
+	int emulated = EMULATE_DONE;
+
+	switch (get_oc(inst)) {
+	case EHPRIV_OC_DEBUG:
+		run->exit_reason = KVM_EXIT_DEBUG;
+		run->debug.arch.address = vcpu->arch.pc;
+		run->debug.arch.status = 0;
+		kvmppc_account_exit(vcpu, DEBUG_EXITS);
+		emulated = EMULATE_EXIT_USER;
+		*advance = 0;
+		break;
+	default:
+		emulated = EMULATE_FAIL;
+	}
+	return emulated;
+}
+
+static int kvmppc_e500_emul_dcbtls(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	/* Always fail to lock the cache */
+	vcpu_e500->l1csr0 |= L1CSR0_CUL;
+	return EMULATE_DONE;
+}
+
+int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu,
+				unsigned int inst, int *advance)
+{
+	int emulated = EMULATE_DONE;
+	int ra = get_ra(inst);
+	int rb = get_rb(inst);
+	int rt = get_rt(inst);
+	gva_t ea;
+
+	switch (get_op(inst)) {
+	case 31:
+		switch (get_xop(inst)) {
+
+		case XOP_DCBTLS:
+			emulated = kvmppc_e500_emul_dcbtls(vcpu);
+			break;
+
+#ifdef CONFIG_KVM_E500MC
+		case XOP_MSGSND:
+			emulated = kvmppc_e500_emul_msgsnd(vcpu, rb);
+			break;
+
+		case XOP_MSGCLR:
+			emulated = kvmppc_e500_emul_msgclr(vcpu, rb);
+			break;
+#endif
+
+		case XOP_TLBRE:
+			emulated = kvmppc_e500_emul_tlbre(vcpu);
+			break;
+
+		case XOP_TLBWE:
+			emulated = kvmppc_e500_emul_tlbwe(vcpu);
+			break;
+
+		case XOP_TLBSX:
+			ea = kvmppc_get_ea_indexed(vcpu, ra, rb);
+			emulated = kvmppc_e500_emul_tlbsx(vcpu, ea);
+			break;
+
+		case XOP_TLBILX: {
+			int type = rt & 0x3;
+			ea = kvmppc_get_ea_indexed(vcpu, ra, rb);
+			emulated = kvmppc_e500_emul_tlbilx(vcpu, type, ea);
+			break;
+		}
+
+		case XOP_TLBIVAX:
+			ea = kvmppc_get_ea_indexed(vcpu, ra, rb);
+			emulated = kvmppc_e500_emul_tlbivax(vcpu, ea);
+			break;
+
+		case XOP_EHPRIV:
+			emulated = kvmppc_e500_emul_ehpriv(run, vcpu, inst,
+							   advance);
+			break;
+
+		default:
+			emulated = EMULATE_FAIL;
+		}
+
+		break;
+
+	default:
+		emulated = EMULATE_FAIL;
+	}
+
+	if (emulated == EMULATE_FAIL)
+		emulated = kvmppc_booke_emulate_op(run, vcpu, inst, advance);
+
+	return emulated;
+}
+
+int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int emulated = EMULATE_DONE;
+
+	switch (sprn) {
+#ifndef CONFIG_KVM_BOOKE_HV
+	case SPRN_PID:
+		kvmppc_set_pid(vcpu, spr_val);
+		break;
+	case SPRN_PID1:
+		if (spr_val != 0)
+			return EMULATE_FAIL;
+		vcpu_e500->pid[1] = spr_val;
+		break;
+	case SPRN_PID2:
+		if (spr_val != 0)
+			return EMULATE_FAIL;
+		vcpu_e500->pid[2] = spr_val;
+		break;
+	case SPRN_MAS0:
+		vcpu->arch.shared->mas0 = spr_val;
+		break;
+	case SPRN_MAS1:
+		vcpu->arch.shared->mas1 = spr_val;
+		break;
+	case SPRN_MAS2:
+		vcpu->arch.shared->mas2 = spr_val;
+		break;
+	case SPRN_MAS3:
+		vcpu->arch.shared->mas7_3 &= ~(u64)0xffffffff;
+		vcpu->arch.shared->mas7_3 |= spr_val;
+		break;
+	case SPRN_MAS4:
+		vcpu->arch.shared->mas4 = spr_val;
+		break;
+	case SPRN_MAS6:
+		vcpu->arch.shared->mas6 = spr_val;
+		break;
+	case SPRN_MAS7:
+		vcpu->arch.shared->mas7_3 &= (u64)0xffffffff;
+		vcpu->arch.shared->mas7_3 |= (u64)spr_val << 32;
+		break;
+#endif
+	case SPRN_L1CSR0:
+		vcpu_e500->l1csr0 = spr_val;
+		vcpu_e500->l1csr0 &= ~(L1CSR0_DCFI | L1CSR0_CLFC);
+		break;
+	case SPRN_L1CSR1:
+		vcpu_e500->l1csr1 = spr_val;
+		vcpu_e500->l1csr1 &= ~(L1CSR1_ICFI | L1CSR1_ICLFR);
+		break;
+	case SPRN_HID0:
+		vcpu_e500->hid0 = spr_val;
+		break;
+	case SPRN_HID1:
+		vcpu_e500->hid1 = spr_val;
+		break;
+
+	case SPRN_MMUCSR0:
+		emulated = kvmppc_e500_emul_mt_mmucsr0(vcpu_e500,
+				spr_val);
+		break;
+
+	case SPRN_PWRMGTCR0:
+		/*
+		 * Guest relies on host power management configurations
+		 * Treat the request as a general store
+		 */
+		vcpu->arch.pwrmgtcr0 = spr_val;
+		break;
+
+	/* extra exceptions */
+#ifdef CONFIG_SPE_POSSIBLE
+	case SPRN_IVOR32:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL] = spr_val;
+		break;
+	case SPRN_IVOR33:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA] = spr_val;
+		break;
+	case SPRN_IVOR34:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND] = spr_val;
+		break;
+#endif
+#ifdef CONFIG_ALTIVEC
+	case SPRN_IVOR32:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_ALTIVEC_UNAVAIL] = spr_val;
+		break;
+	case SPRN_IVOR33:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_ALTIVEC_ASSIST] = spr_val;
+		break;
+#endif
+	case SPRN_IVOR35:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = spr_val;
+		break;
+#ifdef CONFIG_KVM_BOOKE_HV
+	case SPRN_IVOR36:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL] = spr_val;
+		break;
+	case SPRN_IVOR37:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT] = spr_val;
+		break;
+#endif
+	default:
+		emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, spr_val);
+	}
+
+	return emulated;
+}
+
+int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int emulated = EMULATE_DONE;
+
+	switch (sprn) {
+#ifndef CONFIG_KVM_BOOKE_HV
+	case SPRN_PID:
+		*spr_val = vcpu_e500->pid[0];
+		break;
+	case SPRN_PID1:
+		*spr_val = vcpu_e500->pid[1];
+		break;
+	case SPRN_PID2:
+		*spr_val = vcpu_e500->pid[2];
+		break;
+	case SPRN_MAS0:
+		*spr_val = vcpu->arch.shared->mas0;
+		break;
+	case SPRN_MAS1:
+		*spr_val = vcpu->arch.shared->mas1;
+		break;
+	case SPRN_MAS2:
+		*spr_val = vcpu->arch.shared->mas2;
+		break;
+	case SPRN_MAS3:
+		*spr_val = (u32)vcpu->arch.shared->mas7_3;
+		break;
+	case SPRN_MAS4:
+		*spr_val = vcpu->arch.shared->mas4;
+		break;
+	case SPRN_MAS6:
+		*spr_val = vcpu->arch.shared->mas6;
+		break;
+	case SPRN_MAS7:
+		*spr_val = vcpu->arch.shared->mas7_3 >> 32;
+		break;
+#endif
+	case SPRN_DECAR:
+		*spr_val = vcpu->arch.decar;
+		break;
+	case SPRN_TLB0CFG:
+		*spr_val = vcpu->arch.tlbcfg[0];
+		break;
+	case SPRN_TLB1CFG:
+		*spr_val = vcpu->arch.tlbcfg[1];
+		break;
+	case SPRN_TLB0PS:
+		if (!has_feature(vcpu, VCPU_FTR_MMU_V2))
+			return EMULATE_FAIL;
+		*spr_val = vcpu->arch.tlbps[0];
+		break;
+	case SPRN_TLB1PS:
+		if (!has_feature(vcpu, VCPU_FTR_MMU_V2))
+			return EMULATE_FAIL;
+		*spr_val = vcpu->arch.tlbps[1];
+		break;
+	case SPRN_L1CSR0:
+		*spr_val = vcpu_e500->l1csr0;
+		break;
+	case SPRN_L1CSR1:
+		*spr_val = vcpu_e500->l1csr1;
+		break;
+	case SPRN_HID0:
+		*spr_val = vcpu_e500->hid0;
+		break;
+	case SPRN_HID1:
+		*spr_val = vcpu_e500->hid1;
+		break;
+	case SPRN_SVR:
+		*spr_val = vcpu_e500->svr;
+		break;
+
+	case SPRN_MMUCSR0:
+		*spr_val = 0;
+		break;
+
+	case SPRN_MMUCFG:
+		*spr_val = vcpu->arch.mmucfg;
+		break;
+	case SPRN_EPTCFG:
+		if (!has_feature(vcpu, VCPU_FTR_MMU_V2))
+			return EMULATE_FAIL;
+		/*
+		 * Legacy Linux guests access EPTCFG register even if the E.PT
+		 * category is disabled in the VM. Give them a chance to live.
+		 */
+		*spr_val = vcpu->arch.eptcfg;
+		break;
+
+	case SPRN_PWRMGTCR0:
+		*spr_val = vcpu->arch.pwrmgtcr0;
+		break;
+
+	/* extra exceptions */
+#ifdef CONFIG_SPE_POSSIBLE
+	case SPRN_IVOR32:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL];
+		break;
+	case SPRN_IVOR33:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA];
+		break;
+	case SPRN_IVOR34:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND];
+		break;
+#endif
+#ifdef CONFIG_ALTIVEC
+	case SPRN_IVOR32:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_ALTIVEC_UNAVAIL];
+		break;
+	case SPRN_IVOR33:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_ALTIVEC_ASSIST];
+		break;
+#endif
+	case SPRN_IVOR35:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR];
+		break;
+#ifdef CONFIG_KVM_BOOKE_HV
+	case SPRN_IVOR36:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL];
+		break;
+	case SPRN_IVOR37:
+		*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT];
+		break;
+#endif
+	default:
+		emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, spr_val);
+	}
+
+	return emulated;
+}
+
diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c
new file mode 100644
index 000000000..50860e919
--- /dev/null
+++ b/arch/powerpc/kvm/e500_mmu.c
@@ -0,0 +1,962 @@
+/*
+ * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Yu Liu, yu.liu@freescale.com
+ *         Scott Wood, scottwood@freescale.com
+ *         Ashish Kalra, ashish.kalra@freescale.com
+ *         Varun Sethi, varun.sethi@freescale.com
+ *         Alexander Graf, agraf@suse.de
+ *
+ * Description:
+ * This file is based on arch/powerpc/kvm/44x_tlb.c,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/log2.h>
+#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <linux/rwsem.h>
+#include <linux/vmalloc.h>
+#include <linux/hugetlb.h>
+#include <asm/kvm_ppc.h>
+
+#include "e500.h"
+#include "trace_booke.h"
+#include "timing.h"
+#include "e500_mmu_host.h"
+
+static inline unsigned int gtlb0_get_next_victim(
+		struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	unsigned int victim;
+
+	victim = vcpu_e500->gtlb_nv[0]++;
+	if (unlikely(vcpu_e500->gtlb_nv[0] >= vcpu_e500->gtlb_params[0].ways))
+		vcpu_e500->gtlb_nv[0] = 0;
+
+	return victim;
+}
+
+static int tlb0_set_base(gva_t addr, int sets, int ways)
+{
+	int set_base;
+
+	set_base = (addr >> PAGE_SHIFT) & (sets - 1);
+	set_base *= ways;
+
+	return set_base;
+}
+
+static int gtlb0_set_base(struct kvmppc_vcpu_e500 *vcpu_e500, gva_t addr)
+{
+	return tlb0_set_base(addr, vcpu_e500->gtlb_params[0].sets,
+			     vcpu_e500->gtlb_params[0].ways);
+}
+
+static unsigned int get_tlb_esel(struct kvm_vcpu *vcpu, int tlbsel)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int esel = get_tlb_esel_bit(vcpu);
+
+	if (tlbsel == 0) {
+		esel &= vcpu_e500->gtlb_params[0].ways - 1;
+		esel += gtlb0_set_base(vcpu_e500, vcpu->arch.shared->mas2);
+	} else {
+		esel &= vcpu_e500->gtlb_params[tlbsel].entries - 1;
+	}
+
+	return esel;
+}
+
+/* Search the guest TLB for a matching entry. */
+static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500,
+		gva_t eaddr, int tlbsel, unsigned int pid, int as)
+{
+	int size = vcpu_e500->gtlb_params[tlbsel].entries;
+	unsigned int set_base, offset;
+	int i;
+
+	if (tlbsel == 0) {
+		set_base = gtlb0_set_base(vcpu_e500, eaddr);
+		size = vcpu_e500->gtlb_params[0].ways;
+	} else {
+		if (eaddr < vcpu_e500->tlb1_min_eaddr ||
+				eaddr > vcpu_e500->tlb1_max_eaddr)
+			return -1;
+		set_base = 0;
+	}
+
+	offset = vcpu_e500->gtlb_offset[tlbsel];
+
+	for (i = 0; i < size; i++) {
+		struct kvm_book3e_206_tlb_entry *tlbe =
+			&vcpu_e500->gtlb_arch[offset + set_base + i];
+		unsigned int tid;
+
+		if (eaddr < get_tlb_eaddr(tlbe))
+			continue;
+
+		if (eaddr > get_tlb_end(tlbe))
+			continue;
+
+		tid = get_tlb_tid(tlbe);
+		if (tid && (tid != pid))
+			continue;
+
+		if (!get_tlb_v(tlbe))
+			continue;
+
+		if (get_tlb_ts(tlbe) != as && as != -1)
+			continue;
+
+		return set_base + i;
+	}
+
+	return -1;
+}
+
+static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
+		gva_t eaddr, int as)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	unsigned int victim, tsized;
+	int tlbsel;
+
+	/* since we only have two TLBs, only lower bit is used. */
+	tlbsel = (vcpu->arch.shared->mas4 >> 28) & 0x1;
+	victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0;
+	tsized = (vcpu->arch.shared->mas4 >> 7) & 0x1f;
+
+	vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim)
+		| MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
+	vcpu->arch.shared->mas1 = MAS1_VALID | (as ? MAS1_TS : 0)
+		| MAS1_TID(get_tlbmiss_tid(vcpu))
+		| MAS1_TSIZE(tsized);
+	vcpu->arch.shared->mas2 = (eaddr & MAS2_EPN)
+		| (vcpu->arch.shared->mas4 & MAS2_ATTRIB_MASK);
+	vcpu->arch.shared->mas7_3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3;
+	vcpu->arch.shared->mas6 = (vcpu->arch.shared->mas6 & MAS6_SPID1)
+		| (get_cur_pid(vcpu) << 16)
+		| (as ? MAS6_SAS : 0);
+}
+
+static void kvmppc_recalc_tlb1map_range(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	int size = vcpu_e500->gtlb_params[1].entries;
+	unsigned int offset;
+	gva_t eaddr;
+	int i;
+
+	vcpu_e500->tlb1_min_eaddr = ~0UL;
+	vcpu_e500->tlb1_max_eaddr = 0;
+	offset = vcpu_e500->gtlb_offset[1];
+
+	for (i = 0; i < size; i++) {
+		struct kvm_book3e_206_tlb_entry *tlbe =
+			&vcpu_e500->gtlb_arch[offset + i];
+
+		if (!get_tlb_v(tlbe))
+			continue;
+
+		eaddr = get_tlb_eaddr(tlbe);
+		vcpu_e500->tlb1_min_eaddr =
+				min(vcpu_e500->tlb1_min_eaddr, eaddr);
+
+		eaddr = get_tlb_end(tlbe);
+		vcpu_e500->tlb1_max_eaddr =
+				max(vcpu_e500->tlb1_max_eaddr, eaddr);
+	}
+}
+
+static int kvmppc_need_recalc_tlb1map_range(struct kvmppc_vcpu_e500 *vcpu_e500,
+				struct kvm_book3e_206_tlb_entry *gtlbe)
+{
+	unsigned long start, end, size;
+
+	size = get_tlb_bytes(gtlbe);
+	start = get_tlb_eaddr(gtlbe) & ~(size - 1);
+	end = start + size - 1;
+
+	return vcpu_e500->tlb1_min_eaddr == start ||
+			vcpu_e500->tlb1_max_eaddr == end;
+}
+
+/* This function is supposed to be called for a adding a new valid tlb entry */
+static void kvmppc_set_tlb1map_range(struct kvm_vcpu *vcpu,
+				struct kvm_book3e_206_tlb_entry *gtlbe)
+{
+	unsigned long start, end, size;
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	if (!get_tlb_v(gtlbe))
+		return;
+
+	size = get_tlb_bytes(gtlbe);
+	start = get_tlb_eaddr(gtlbe) & ~(size - 1);
+	end = start + size - 1;
+
+	vcpu_e500->tlb1_min_eaddr = min(vcpu_e500->tlb1_min_eaddr, start);
+	vcpu_e500->tlb1_max_eaddr = max(vcpu_e500->tlb1_max_eaddr, end);
+}
+
+static inline int kvmppc_e500_gtlbe_invalidate(
+				struct kvmppc_vcpu_e500 *vcpu_e500,
+				int tlbsel, int esel)
+{
+	struct kvm_book3e_206_tlb_entry *gtlbe =
+		get_entry(vcpu_e500, tlbsel, esel);
+
+	if (unlikely(get_tlb_iprot(gtlbe)))
+		return -1;
+
+	if (tlbsel == 1 && kvmppc_need_recalc_tlb1map_range(vcpu_e500, gtlbe))
+		kvmppc_recalc_tlb1map_range(vcpu_e500);
+
+	gtlbe->mas1 = 0;
+
+	return 0;
+}
+
+int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value)
+{
+	int esel;
+
+	if (value & MMUCSR0_TLB0FI)
+		for (esel = 0; esel < vcpu_e500->gtlb_params[0].entries; esel++)
+			kvmppc_e500_gtlbe_invalidate(vcpu_e500, 0, esel);
+	if (value & MMUCSR0_TLB1FI)
+		for (esel = 0; esel < vcpu_e500->gtlb_params[1].entries; esel++)
+			kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel);
+
+	/* Invalidate all host shadow mappings */
+	kvmppc_core_flush_tlb(&vcpu_e500->vcpu);
+
+	return EMULATE_DONE;
+}
+
+int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	unsigned int ia;
+	int esel, tlbsel;
+
+	ia = (ea >> 2) & 0x1;
+
+	/* since we only have two TLBs, only lower bit is used. */
+	tlbsel = (ea >> 3) & 0x1;
+
+	if (ia) {
+		/* invalidate all entries */
+		for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries;
+		     esel++)
+			kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
+	} else {
+		ea &= 0xfffff000;
+		esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel,
+				get_cur_pid(vcpu), -1);
+		if (esel >= 0)
+			kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
+	}
+
+	/* Invalidate all host shadow mappings */
+	kvmppc_core_flush_tlb(&vcpu_e500->vcpu);
+
+	return EMULATE_DONE;
+}
+
+static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
+		       int pid, int type)
+{
+	struct kvm_book3e_206_tlb_entry *tlbe;
+	int tid, esel;
+
+	/* invalidate all entries */
+	for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries; esel++) {
+		tlbe = get_entry(vcpu_e500, tlbsel, esel);
+		tid = get_tlb_tid(tlbe);
+		if (type == 0 || tid == pid) {
+			inval_gtlbe_on_host(vcpu_e500, tlbsel, esel);
+			kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
+		}
+	}
+}
+
+static void tlbilx_one(struct kvmppc_vcpu_e500 *vcpu_e500, int pid,
+		       gva_t ea)
+{
+	int tlbsel, esel;
+
+	for (tlbsel = 0; tlbsel < 2; tlbsel++) {
+		esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, -1);
+		if (esel >= 0) {
+			inval_gtlbe_on_host(vcpu_e500, tlbsel, esel);
+			kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
+			break;
+		}
+	}
+}
+
+int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, gva_t ea)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int pid = get_cur_spid(vcpu);
+
+	if (type == 0 || type == 1) {
+		tlbilx_all(vcpu_e500, 0, pid, type);
+		tlbilx_all(vcpu_e500, 1, pid, type);
+	} else if (type == 3) {
+		tlbilx_one(vcpu_e500, pid, ea);
+	}
+
+	return EMULATE_DONE;
+}
+
+int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int tlbsel, esel;
+	struct kvm_book3e_206_tlb_entry *gtlbe;
+
+	tlbsel = get_tlb_tlbsel(vcpu);
+	esel = get_tlb_esel(vcpu, tlbsel);
+
+	gtlbe = get_entry(vcpu_e500, tlbsel, esel);
+	vcpu->arch.shared->mas0 &= ~MAS0_NV(~0);
+	vcpu->arch.shared->mas0 |= MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
+	vcpu->arch.shared->mas1 = gtlbe->mas1;
+	vcpu->arch.shared->mas2 = gtlbe->mas2;
+	vcpu->arch.shared->mas7_3 = gtlbe->mas7_3;
+
+	return EMULATE_DONE;
+}
+
+int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int as = !!get_cur_sas(vcpu);
+	unsigned int pid = get_cur_spid(vcpu);
+	int esel, tlbsel;
+	struct kvm_book3e_206_tlb_entry *gtlbe = NULL;
+
+	for (tlbsel = 0; tlbsel < 2; tlbsel++) {
+		esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as);
+		if (esel >= 0) {
+			gtlbe = get_entry(vcpu_e500, tlbsel, esel);
+			break;
+		}
+	}
+
+	if (gtlbe) {
+		esel &= vcpu_e500->gtlb_params[tlbsel].ways - 1;
+
+		vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel)
+			| MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
+		vcpu->arch.shared->mas1 = gtlbe->mas1;
+		vcpu->arch.shared->mas2 = gtlbe->mas2;
+		vcpu->arch.shared->mas7_3 = gtlbe->mas7_3;
+	} else {
+		int victim;
+
+		/* since we only have two TLBs, only lower bit is used. */
+		tlbsel = vcpu->arch.shared->mas4 >> 28 & 0x1;
+		victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0;
+
+		vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel)
+			| MAS0_ESEL(victim)
+			| MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
+		vcpu->arch.shared->mas1 =
+			  (vcpu->arch.shared->mas6 & MAS6_SPID0)
+			| (vcpu->arch.shared->mas6 & (MAS6_SAS ? MAS1_TS : 0))
+			| (vcpu->arch.shared->mas4 & MAS4_TSIZED(~0));
+		vcpu->arch.shared->mas2 &= MAS2_EPN;
+		vcpu->arch.shared->mas2 |= vcpu->arch.shared->mas4 &
+					   MAS2_ATTRIB_MASK;
+		vcpu->arch.shared->mas7_3 &= MAS3_U0 | MAS3_U1 |
+					     MAS3_U2 | MAS3_U3;
+	}
+
+	kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS);
+	return EMULATE_DONE;
+}
+
+int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	struct kvm_book3e_206_tlb_entry *gtlbe;
+	int tlbsel, esel;
+	int recal = 0;
+	int idx;
+
+	tlbsel = get_tlb_tlbsel(vcpu);
+	esel = get_tlb_esel(vcpu, tlbsel);
+
+	gtlbe = get_entry(vcpu_e500, tlbsel, esel);
+
+	if (get_tlb_v(gtlbe)) {
+		inval_gtlbe_on_host(vcpu_e500, tlbsel, esel);
+		if ((tlbsel == 1) &&
+			kvmppc_need_recalc_tlb1map_range(vcpu_e500, gtlbe))
+			recal = 1;
+	}
+
+	gtlbe->mas1 = vcpu->arch.shared->mas1;
+	gtlbe->mas2 = vcpu->arch.shared->mas2;
+	if (!(vcpu->arch.shared->msr & MSR_CM))
+		gtlbe->mas2 &= 0xffffffffUL;
+	gtlbe->mas7_3 = vcpu->arch.shared->mas7_3;
+
+	trace_kvm_booke206_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1,
+	                              gtlbe->mas2, gtlbe->mas7_3);
+
+	if (tlbsel == 1) {
+		/*
+		 * If a valid tlb1 entry is overwritten then recalculate the
+		 * min/max TLB1 map address range otherwise no need to look
+		 * in tlb1 array.
+		 */
+		if (recal)
+			kvmppc_recalc_tlb1map_range(vcpu_e500);
+		else
+			kvmppc_set_tlb1map_range(vcpu, gtlbe);
+	}
+
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+	/* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
+	if (tlbe_is_host_safe(vcpu, gtlbe)) {
+		u64 eaddr = get_tlb_eaddr(gtlbe);
+		u64 raddr = get_tlb_raddr(gtlbe);
+
+		if (tlbsel == 0) {
+			gtlbe->mas1 &= ~MAS1_TSIZE(~0);
+			gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K);
+		}
+
+		/* Premap the faulting page */
+		kvmppc_mmu_map(vcpu, eaddr, raddr, index_of(tlbsel, esel));
+	}
+
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+	kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS);
+	return EMULATE_DONE;
+}
+
+static int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu,
+				  gva_t eaddr, unsigned int pid, int as)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int esel, tlbsel;
+
+	for (tlbsel = 0; tlbsel < 2; tlbsel++) {
+		esel = kvmppc_e500_tlb_index(vcpu_e500, eaddr, tlbsel, pid, as);
+		if (esel >= 0)
+			return index_of(tlbsel, esel);
+	}
+
+	return -1;
+}
+
+/* 'linear_address' is actually an encoding of AS|PID|EADDR . */
+int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
+                               struct kvm_translation *tr)
+{
+	int index;
+	gva_t eaddr;
+	u8 pid;
+	u8 as;
+
+	eaddr = tr->linear_address;
+	pid = (tr->linear_address >> 32) & 0xff;
+	as = (tr->linear_address >> 40) & 0x1;
+
+	index = kvmppc_e500_tlb_search(vcpu, eaddr, pid, as);
+	if (index < 0) {
+		tr->valid = 0;
+		return 0;
+	}
+
+	tr->physical_address = kvmppc_mmu_xlate(vcpu, index, eaddr);
+	/* XXX what does "writeable" and "usermode" even mean? */
+	tr->valid = 1;
+
+	return 0;
+}
+
+
+int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
+{
+	unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS);
+
+	return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as);
+}
+
+int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
+{
+	unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS);
+
+	return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as);
+}
+
+void kvmppc_mmu_itlb_miss(struct kvm_vcpu *vcpu)
+{
+	unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS);
+
+	kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.pc, as);
+}
+
+void kvmppc_mmu_dtlb_miss(struct kvm_vcpu *vcpu)
+{
+	unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS);
+
+	kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.fault_dear, as);
+}
+
+gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index,
+			gva_t eaddr)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	struct kvm_book3e_206_tlb_entry *gtlbe;
+	u64 pgmask;
+
+	gtlbe = get_entry(vcpu_e500, tlbsel_of(index), esel_of(index));
+	pgmask = get_tlb_bytes(gtlbe) - 1;
+
+	return get_tlb_raddr(gtlbe) | (eaddr & pgmask);
+}
+
+void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu)
+{
+}
+
+/*****************************************/
+
+static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	int i;
+
+	kvmppc_core_flush_tlb(&vcpu_e500->vcpu);
+	kfree(vcpu_e500->g2h_tlb1_map);
+	kfree(vcpu_e500->gtlb_priv[0]);
+	kfree(vcpu_e500->gtlb_priv[1]);
+
+	if (vcpu_e500->shared_tlb_pages) {
+		vfree((void *)(round_down((uintptr_t)vcpu_e500->gtlb_arch,
+					  PAGE_SIZE)));
+
+		for (i = 0; i < vcpu_e500->num_shared_tlb_pages; i++) {
+			set_page_dirty_lock(vcpu_e500->shared_tlb_pages[i]);
+			put_page(vcpu_e500->shared_tlb_pages[i]);
+		}
+
+		vcpu_e500->num_shared_tlb_pages = 0;
+
+		kfree(vcpu_e500->shared_tlb_pages);
+		vcpu_e500->shared_tlb_pages = NULL;
+	} else {
+		kfree(vcpu_e500->gtlb_arch);
+	}
+
+	vcpu_e500->gtlb_arch = NULL;
+}
+
+void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	sregs->u.e.mas0 = vcpu->arch.shared->mas0;
+	sregs->u.e.mas1 = vcpu->arch.shared->mas1;
+	sregs->u.e.mas2 = vcpu->arch.shared->mas2;
+	sregs->u.e.mas7_3 = vcpu->arch.shared->mas7_3;
+	sregs->u.e.mas4 = vcpu->arch.shared->mas4;
+	sregs->u.e.mas6 = vcpu->arch.shared->mas6;
+
+	sregs->u.e.mmucfg = vcpu->arch.mmucfg;
+	sregs->u.e.tlbcfg[0] = vcpu->arch.tlbcfg[0];
+	sregs->u.e.tlbcfg[1] = vcpu->arch.tlbcfg[1];
+	sregs->u.e.tlbcfg[2] = 0;
+	sregs->u.e.tlbcfg[3] = 0;
+}
+
+int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	if (sregs->u.e.features & KVM_SREGS_E_ARCH206_MMU) {
+		vcpu->arch.shared->mas0 = sregs->u.e.mas0;
+		vcpu->arch.shared->mas1 = sregs->u.e.mas1;
+		vcpu->arch.shared->mas2 = sregs->u.e.mas2;
+		vcpu->arch.shared->mas7_3 = sregs->u.e.mas7_3;
+		vcpu->arch.shared->mas4 = sregs->u.e.mas4;
+		vcpu->arch.shared->mas6 = sregs->u.e.mas6;
+	}
+
+	return 0;
+}
+
+int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
+				union kvmppc_one_reg *val)
+{
+	int r = 0;
+	long int i;
+
+	switch (id) {
+	case KVM_REG_PPC_MAS0:
+		*val = get_reg_val(id, vcpu->arch.shared->mas0);
+		break;
+	case KVM_REG_PPC_MAS1:
+		*val = get_reg_val(id, vcpu->arch.shared->mas1);
+		break;
+	case KVM_REG_PPC_MAS2:
+		*val = get_reg_val(id, vcpu->arch.shared->mas2);
+		break;
+	case KVM_REG_PPC_MAS7_3:
+		*val = get_reg_val(id, vcpu->arch.shared->mas7_3);
+		break;
+	case KVM_REG_PPC_MAS4:
+		*val = get_reg_val(id, vcpu->arch.shared->mas4);
+		break;
+	case KVM_REG_PPC_MAS6:
+		*val = get_reg_val(id, vcpu->arch.shared->mas6);
+		break;
+	case KVM_REG_PPC_MMUCFG:
+		*val = get_reg_val(id, vcpu->arch.mmucfg);
+		break;
+	case KVM_REG_PPC_EPTCFG:
+		*val = get_reg_val(id, vcpu->arch.eptcfg);
+		break;
+	case KVM_REG_PPC_TLB0CFG:
+	case KVM_REG_PPC_TLB1CFG:
+	case KVM_REG_PPC_TLB2CFG:
+	case KVM_REG_PPC_TLB3CFG:
+		i = id - KVM_REG_PPC_TLB0CFG;
+		*val = get_reg_val(id, vcpu->arch.tlbcfg[i]);
+		break;
+	case KVM_REG_PPC_TLB0PS:
+	case KVM_REG_PPC_TLB1PS:
+	case KVM_REG_PPC_TLB2PS:
+	case KVM_REG_PPC_TLB3PS:
+		i = id - KVM_REG_PPC_TLB0PS;
+		*val = get_reg_val(id, vcpu->arch.tlbps[i]);
+		break;
+	default:
+		r = -EINVAL;
+		break;
+	}
+
+	return r;
+}
+
+int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
+			       union kvmppc_one_reg *val)
+{
+	int r = 0;
+	long int i;
+
+	switch (id) {
+	case KVM_REG_PPC_MAS0:
+		vcpu->arch.shared->mas0 = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_MAS1:
+		vcpu->arch.shared->mas1 = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_MAS2:
+		vcpu->arch.shared->mas2 = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_MAS7_3:
+		vcpu->arch.shared->mas7_3 = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_MAS4:
+		vcpu->arch.shared->mas4 = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_MAS6:
+		vcpu->arch.shared->mas6 = set_reg_val(id, *val);
+		break;
+	/* Only allow MMU registers to be set to the config supported by KVM */
+	case KVM_REG_PPC_MMUCFG: {
+		u32 reg = set_reg_val(id, *val);
+		if (reg != vcpu->arch.mmucfg)
+			r = -EINVAL;
+		break;
+	}
+	case KVM_REG_PPC_EPTCFG: {
+		u32 reg = set_reg_val(id, *val);
+		if (reg != vcpu->arch.eptcfg)
+			r = -EINVAL;
+		break;
+	}
+	case KVM_REG_PPC_TLB0CFG:
+	case KVM_REG_PPC_TLB1CFG:
+	case KVM_REG_PPC_TLB2CFG:
+	case KVM_REG_PPC_TLB3CFG: {
+		/* MMU geometry (N_ENTRY/ASSOC) can be set only using SW_TLB */
+		u32 reg = set_reg_val(id, *val);
+		i = id - KVM_REG_PPC_TLB0CFG;
+		if (reg != vcpu->arch.tlbcfg[i])
+			r = -EINVAL;
+		break;
+	}
+	case KVM_REG_PPC_TLB0PS:
+	case KVM_REG_PPC_TLB1PS:
+	case KVM_REG_PPC_TLB2PS:
+	case KVM_REG_PPC_TLB3PS: {
+		u32 reg = set_reg_val(id, *val);
+		i = id - KVM_REG_PPC_TLB0PS;
+		if (reg != vcpu->arch.tlbps[i])
+			r = -EINVAL;
+		break;
+	}
+	default:
+		r = -EINVAL;
+		break;
+	}
+
+	return r;
+}
+
+static int vcpu_mmu_geometry_update(struct kvm_vcpu *vcpu,
+		struct kvm_book3e_206_tlb_params *params)
+{
+	vcpu->arch.tlbcfg[0] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+	if (params->tlb_sizes[0] <= 2048)
+		vcpu->arch.tlbcfg[0] |= params->tlb_sizes[0];
+	vcpu->arch.tlbcfg[0] |= params->tlb_ways[0] << TLBnCFG_ASSOC_SHIFT;
+
+	vcpu->arch.tlbcfg[1] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+	vcpu->arch.tlbcfg[1] |= params->tlb_sizes[1];
+	vcpu->arch.tlbcfg[1] |= params->tlb_ways[1] << TLBnCFG_ASSOC_SHIFT;
+	return 0;
+}
+
+int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
+			      struct kvm_config_tlb *cfg)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	struct kvm_book3e_206_tlb_params params;
+	char *virt;
+	struct page **pages;
+	struct tlbe_priv *privs[2] = {};
+	u64 *g2h_bitmap = NULL;
+	size_t array_len;
+	u32 sets;
+	int num_pages, ret, i;
+
+	if (cfg->mmu_type != KVM_MMU_FSL_BOOKE_NOHV)
+		return -EINVAL;
+
+	if (copy_from_user(&params, (void __user *)(uintptr_t)cfg->params,
+			   sizeof(params)))
+		return -EFAULT;
+
+	if (params.tlb_sizes[1] > 64)
+		return -EINVAL;
+	if (params.tlb_ways[1] != params.tlb_sizes[1])
+		return -EINVAL;
+	if (params.tlb_sizes[2] != 0 || params.tlb_sizes[3] != 0)
+		return -EINVAL;
+	if (params.tlb_ways[2] != 0 || params.tlb_ways[3] != 0)
+		return -EINVAL;
+
+	if (!is_power_of_2(params.tlb_ways[0]))
+		return -EINVAL;
+
+	sets = params.tlb_sizes[0] >> ilog2(params.tlb_ways[0]);
+	if (!is_power_of_2(sets))
+		return -EINVAL;
+
+	array_len = params.tlb_sizes[0] + params.tlb_sizes[1];
+	array_len *= sizeof(struct kvm_book3e_206_tlb_entry);
+
+	if (cfg->array_len < array_len)
+		return -EINVAL;
+
+	num_pages = DIV_ROUND_UP(cfg->array + array_len - 1, PAGE_SIZE) -
+		    cfg->array / PAGE_SIZE;
+	pages = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	ret = get_user_pages_fast(cfg->array, num_pages, 1, pages);
+	if (ret < 0)
+		goto err_pages;
+
+	if (ret != num_pages) {
+		num_pages = ret;
+		ret = -EFAULT;
+		goto err_put_page;
+	}
+
+	virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL);
+	if (!virt) {
+		ret = -ENOMEM;
+		goto err_put_page;
+	}
+
+	privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0],
+			   GFP_KERNEL);
+	privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1],
+			   GFP_KERNEL);
+
+	if (!privs[0] || !privs[1]) {
+		ret = -ENOMEM;
+		goto err_privs;
+	}
+
+	g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1],
+	                     GFP_KERNEL);
+	if (!g2h_bitmap) {
+		ret = -ENOMEM;
+		goto err_privs;
+	}
+
+	free_gtlb(vcpu_e500);
+
+	vcpu_e500->gtlb_priv[0] = privs[0];
+	vcpu_e500->gtlb_priv[1] = privs[1];
+	vcpu_e500->g2h_tlb1_map = g2h_bitmap;
+
+	vcpu_e500->gtlb_arch = (struct kvm_book3e_206_tlb_entry *)
+		(virt + (cfg->array & (PAGE_SIZE - 1)));
+
+	vcpu_e500->gtlb_params[0].entries = params.tlb_sizes[0];
+	vcpu_e500->gtlb_params[1].entries = params.tlb_sizes[1];
+
+	vcpu_e500->gtlb_offset[0] = 0;
+	vcpu_e500->gtlb_offset[1] = params.tlb_sizes[0];
+
+	/* Update vcpu's MMU geometry based on SW_TLB input */
+	vcpu_mmu_geometry_update(vcpu, &params);
+
+	vcpu_e500->shared_tlb_pages = pages;
+	vcpu_e500->num_shared_tlb_pages = num_pages;
+
+	vcpu_e500->gtlb_params[0].ways = params.tlb_ways[0];
+	vcpu_e500->gtlb_params[0].sets = sets;
+
+	vcpu_e500->gtlb_params[1].ways = params.tlb_sizes[1];
+	vcpu_e500->gtlb_params[1].sets = 1;
+
+	kvmppc_recalc_tlb1map_range(vcpu_e500);
+	return 0;
+
+err_privs:
+	kfree(privs[0]);
+	kfree(privs[1]);
+
+err_put_page:
+	for (i = 0; i < num_pages; i++)
+		put_page(pages[i]);
+
+err_pages:
+	kfree(pages);
+	return ret;
+}
+
+int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
+			     struct kvm_dirty_tlb *dirty)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	kvmppc_recalc_tlb1map_range(vcpu_e500);
+	kvmppc_core_flush_tlb(vcpu);
+	return 0;
+}
+
+/* Vcpu's MMU default configuration */
+static int vcpu_mmu_init(struct kvm_vcpu *vcpu,
+		       struct kvmppc_e500_tlb_params *params)
+{
+	/* Initialize RASIZE, PIDSIZE, NTLBS and MAVN fields with host values*/
+	vcpu->arch.mmucfg = mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE;
+
+	/* Initialize TLBnCFG fields with host values and SW_TLB geometry*/
+	vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) &
+			     ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+	vcpu->arch.tlbcfg[0] |= params[0].entries;
+	vcpu->arch.tlbcfg[0] |= params[0].ways << TLBnCFG_ASSOC_SHIFT;
+
+	vcpu->arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) &
+			     ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+	vcpu->arch.tlbcfg[1] |= params[1].entries;
+	vcpu->arch.tlbcfg[1] |= params[1].ways << TLBnCFG_ASSOC_SHIFT;
+
+	if (has_feature(vcpu, VCPU_FTR_MMU_V2)) {
+		vcpu->arch.tlbps[0] = mfspr(SPRN_TLB0PS);
+		vcpu->arch.tlbps[1] = mfspr(SPRN_TLB1PS);
+
+		vcpu->arch.mmucfg &= ~MMUCFG_LRAT;
+
+		/* Guest mmu emulation currently doesn't handle E.PT */
+		vcpu->arch.eptcfg = 0;
+		vcpu->arch.tlbcfg[0] &= ~TLBnCFG_PT;
+		vcpu->arch.tlbcfg[1] &= ~TLBnCFG_IND;
+	}
+
+	return 0;
+}
+
+int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	struct kvm_vcpu *vcpu = &vcpu_e500->vcpu;
+	int entry_size = sizeof(struct kvm_book3e_206_tlb_entry);
+	int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE;
+
+	if (e500_mmu_host_init(vcpu_e500))
+		goto err;
+
+	vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE;
+	vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE;
+
+	vcpu_e500->gtlb_params[0].ways = KVM_E500_TLB0_WAY_NUM;
+	vcpu_e500->gtlb_params[0].sets =
+		KVM_E500_TLB0_SIZE / KVM_E500_TLB0_WAY_NUM;
+
+	vcpu_e500->gtlb_params[1].ways = KVM_E500_TLB1_SIZE;
+	vcpu_e500->gtlb_params[1].sets = 1;
+
+	vcpu_e500->gtlb_arch = kmalloc(entries * entry_size, GFP_KERNEL);
+	if (!vcpu_e500->gtlb_arch)
+		return -ENOMEM;
+
+	vcpu_e500->gtlb_offset[0] = 0;
+	vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE;
+
+	vcpu_e500->gtlb_priv[0] = kzalloc(sizeof(struct tlbe_ref) *
+					  vcpu_e500->gtlb_params[0].entries,
+					  GFP_KERNEL);
+	if (!vcpu_e500->gtlb_priv[0])
+		goto err;
+
+	vcpu_e500->gtlb_priv[1] = kzalloc(sizeof(struct tlbe_ref) *
+					  vcpu_e500->gtlb_params[1].entries,
+					  GFP_KERNEL);
+	if (!vcpu_e500->gtlb_priv[1])
+		goto err;
+
+	vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(u64) *
+					  vcpu_e500->gtlb_params[1].entries,
+					  GFP_KERNEL);
+	if (!vcpu_e500->g2h_tlb1_map)
+		goto err;
+
+	vcpu_mmu_init(vcpu, vcpu_e500->gtlb_params);
+
+	kvmppc_recalc_tlb1map_range(vcpu_e500);
+	return 0;
+
+err:
+	free_gtlb(vcpu_e500);
+	return -1;
+}
+
+void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	free_gtlb(vcpu_e500);
+	e500_mmu_host_uninit(vcpu_e500);
+}
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
new file mode 100644
index 000000000..4d33e199e
--- /dev/null
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -0,0 +1,813 @@
+/*
+ * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Yu Liu, yu.liu@freescale.com
+ *         Scott Wood, scottwood@freescale.com
+ *         Ashish Kalra, ashish.kalra@freescale.com
+ *         Varun Sethi, varun.sethi@freescale.com
+ *         Alexander Graf, agraf@suse.de
+ *
+ * Description:
+ * This file is based on arch/powerpc/kvm/44x_tlb.c,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/log2.h>
+#include <linux/uaccess.h>
+#include <linux/sched.h>
+#include <linux/rwsem.h>
+#include <linux/vmalloc.h>
+#include <linux/hugetlb.h>
+#include <asm/kvm_ppc.h>
+
+#include "e500.h"
+#include "timing.h"
+#include "e500_mmu_host.h"
+
+#include "trace_booke.h"
+
+#define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1)
+
+static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM];
+
+static inline unsigned int tlb1_max_shadow_size(void)
+{
+	/* reserve one entry for magic page */
+	return host_tlb_params[1].entries - tlbcam_index - 1;
+}
+
+static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode)
+{
+	/* Mask off reserved bits. */
+	mas3 &= MAS3_ATTRIB_MASK;
+
+#ifndef CONFIG_KVM_BOOKE_HV
+	if (!usermode) {
+		/* Guest is in supervisor mode,
+		 * so we need to translate guest
+		 * supervisor permissions into user permissions. */
+		mas3 &= ~E500_TLB_USER_PERM_MASK;
+		mas3 |= (mas3 & E500_TLB_SUPER_PERM_MASK) << 1;
+	}
+	mas3 |= E500_TLB_SUPER_PERM_MASK;
+#endif
+	return mas3;
+}
+
+/*
+ * writing shadow tlb entry to host TLB
+ */
+static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe,
+				     uint32_t mas0,
+				     uint32_t lpid)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	mtspr(SPRN_MAS0, mas0);
+	mtspr(SPRN_MAS1, stlbe->mas1);
+	mtspr(SPRN_MAS2, (unsigned long)stlbe->mas2);
+	mtspr(SPRN_MAS3, (u32)stlbe->mas7_3);
+	mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32));
+#ifdef CONFIG_KVM_BOOKE_HV
+	mtspr(SPRN_MAS8, MAS8_TGS | get_thread_specific_lpid(lpid));
+#endif
+	asm volatile("isync; tlbwe" : : : "memory");
+
+#ifdef CONFIG_KVM_BOOKE_HV
+	/* Must clear mas8 for other host tlbwe's */
+	mtspr(SPRN_MAS8, 0);
+	isync();
+#endif
+	local_irq_restore(flags);
+
+	trace_kvm_booke206_stlb_write(mas0, stlbe->mas8, stlbe->mas1,
+	                              stlbe->mas2, stlbe->mas7_3);
+}
+
+/*
+ * Acquire a mas0 with victim hint, as if we just took a TLB miss.
+ *
+ * We don't care about the address we're searching for, other than that it's
+ * in the right set and is not present in the TLB.  Using a zero PID and a
+ * userspace address means we don't have to set and then restore MAS5, or
+ * calculate a proper MAS6 value.
+ */
+static u32 get_host_mas0(unsigned long eaddr)
+{
+	unsigned long flags;
+	u32 mas0;
+	u32 mas4;
+
+	local_irq_save(flags);
+	mtspr(SPRN_MAS6, 0);
+	mas4 = mfspr(SPRN_MAS4);
+	mtspr(SPRN_MAS4, mas4 & ~MAS4_TLBSEL_MASK);
+	asm volatile("tlbsx 0, %0" : : "b" (eaddr & ~CONFIG_PAGE_OFFSET));
+	mas0 = mfspr(SPRN_MAS0);
+	mtspr(SPRN_MAS4, mas4);
+	local_irq_restore(flags);
+
+	return mas0;
+}
+
+/* sesel is for tlb1 only */
+static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
+		int tlbsel, int sesel, struct kvm_book3e_206_tlb_entry *stlbe)
+{
+	u32 mas0;
+
+	if (tlbsel == 0) {
+		mas0 = get_host_mas0(stlbe->mas2);
+		__write_host_tlbe(stlbe, mas0, vcpu_e500->vcpu.kvm->arch.lpid);
+	} else {
+		__write_host_tlbe(stlbe,
+				  MAS0_TLBSEL(1) |
+				  MAS0_ESEL(to_htlb1_esel(sesel)),
+				  vcpu_e500->vcpu.kvm->arch.lpid);
+	}
+}
+
+/* sesel is for tlb1 only */
+static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
+			struct kvm_book3e_206_tlb_entry *gtlbe,
+			struct kvm_book3e_206_tlb_entry *stlbe,
+			int stlbsel, int sesel)
+{
+	int stid;
+
+	preempt_disable();
+	stid = kvmppc_e500_get_tlb_stid(&vcpu_e500->vcpu, gtlbe);
+
+	stlbe->mas1 |= MAS1_TID(stid);
+	write_host_tlbe(vcpu_e500, stlbsel, sesel, stlbe);
+	preempt_enable();
+}
+
+#ifdef CONFIG_KVM_E500V2
+/* XXX should be a hook in the gva2hpa translation */
+void kvmppc_map_magic(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	struct kvm_book3e_206_tlb_entry magic;
+	ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK;
+	unsigned int stid;
+	pfn_t pfn;
+
+	pfn = (pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT;
+	get_page(pfn_to_page(pfn));
+
+	preempt_disable();
+	stid = kvmppc_e500_get_sid(vcpu_e500, 0, 0, 0, 0);
+
+	magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) |
+		     MAS1_TSIZE(BOOK3E_PAGESZ_4K);
+	magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M;
+	magic.mas7_3 = ((u64)pfn << PAGE_SHIFT) |
+		       MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR;
+	magic.mas8 = 0;
+
+	__write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index), 0);
+	preempt_enable();
+}
+#endif
+
+void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
+			 int esel)
+{
+	struct kvm_book3e_206_tlb_entry *gtlbe =
+		get_entry(vcpu_e500, tlbsel, esel);
+	struct tlbe_ref *ref = &vcpu_e500->gtlb_priv[tlbsel][esel].ref;
+
+	/* Don't bother with unmapped entries */
+	if (!(ref->flags & E500_TLB_VALID)) {
+		WARN(ref->flags & (E500_TLB_BITMAP | E500_TLB_TLB0),
+		     "%s: flags %x\n", __func__, ref->flags);
+		WARN_ON(tlbsel == 1 && vcpu_e500->g2h_tlb1_map[esel]);
+	}
+
+	if (tlbsel == 1 && ref->flags & E500_TLB_BITMAP) {
+		u64 tmp = vcpu_e500->g2h_tlb1_map[esel];
+		int hw_tlb_indx;
+		unsigned long flags;
+
+		local_irq_save(flags);
+		while (tmp) {
+			hw_tlb_indx = __ilog2_u64(tmp & -tmp);
+			mtspr(SPRN_MAS0,
+			      MAS0_TLBSEL(1) |
+			      MAS0_ESEL(to_htlb1_esel(hw_tlb_indx)));
+			mtspr(SPRN_MAS1, 0);
+			asm volatile("tlbwe");
+			vcpu_e500->h2g_tlb1_rmap[hw_tlb_indx] = 0;
+			tmp &= tmp - 1;
+		}
+		mb();
+		vcpu_e500->g2h_tlb1_map[esel] = 0;
+		ref->flags &= ~(E500_TLB_BITMAP | E500_TLB_VALID);
+		local_irq_restore(flags);
+	}
+
+	if (tlbsel == 1 && ref->flags & E500_TLB_TLB0) {
+		/*
+		 * TLB1 entry is backed by 4k pages. This should happen
+		 * rarely and is not worth optimizing. Invalidate everything.
+		 */
+		kvmppc_e500_tlbil_all(vcpu_e500);
+		ref->flags &= ~(E500_TLB_TLB0 | E500_TLB_VALID);
+	}
+
+	/*
+	 * If TLB entry is still valid then it's a TLB0 entry, and thus
+	 * backed by at most one host tlbe per shadow pid
+	 */
+	if (ref->flags & E500_TLB_VALID)
+		kvmppc_e500_tlbil_one(vcpu_e500, gtlbe);
+
+	/* Mark the TLB as not backed by the host anymore */
+	ref->flags = 0;
+}
+
+static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return tlbe->mas7_3 & (MAS3_SW|MAS3_UW);
+}
+
+static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
+					 struct kvm_book3e_206_tlb_entry *gtlbe,
+					 pfn_t pfn, unsigned int wimg)
+{
+	ref->pfn = pfn;
+	ref->flags = E500_TLB_VALID;
+
+	/* Use guest supplied MAS2_G and MAS2_E */
+	ref->flags |= (gtlbe->mas2 & MAS2_ATTRIB_MASK) | wimg;
+
+	/* Mark the page accessed */
+	kvm_set_pfn_accessed(pfn);
+
+	if (tlbe_is_writable(gtlbe))
+		kvm_set_pfn_dirty(pfn);
+}
+
+static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref)
+{
+	if (ref->flags & E500_TLB_VALID) {
+		/* FIXME: don't log bogus pfn for TLB1 */
+		trace_kvm_booke206_ref_release(ref->pfn, ref->flags);
+		ref->flags = 0;
+	}
+}
+
+static void clear_tlb1_bitmap(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	if (vcpu_e500->g2h_tlb1_map)
+		memset(vcpu_e500->g2h_tlb1_map, 0,
+		       sizeof(u64) * vcpu_e500->gtlb_params[1].entries);
+	if (vcpu_e500->h2g_tlb1_rmap)
+		memset(vcpu_e500->h2g_tlb1_rmap, 0,
+		       sizeof(unsigned int) * host_tlb_params[1].entries);
+}
+
+static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	int tlbsel;
+	int i;
+
+	for (tlbsel = 0; tlbsel <= 1; tlbsel++) {
+		for (i = 0; i < vcpu_e500->gtlb_params[tlbsel].entries; i++) {
+			struct tlbe_ref *ref =
+				&vcpu_e500->gtlb_priv[tlbsel][i].ref;
+			kvmppc_e500_ref_release(ref);
+		}
+	}
+}
+
+void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	kvmppc_e500_tlbil_all(vcpu_e500);
+	clear_tlb_privs(vcpu_e500);
+	clear_tlb1_bitmap(vcpu_e500);
+}
+
+/* TID must be supplied by the caller */
+static void kvmppc_e500_setup_stlbe(
+	struct kvm_vcpu *vcpu,
+	struct kvm_book3e_206_tlb_entry *gtlbe,
+	int tsize, struct tlbe_ref *ref, u64 gvaddr,
+	struct kvm_book3e_206_tlb_entry *stlbe)
+{
+	pfn_t pfn = ref->pfn;
+	u32 pr = vcpu->arch.shared->msr & MSR_PR;
+
+	BUG_ON(!(ref->flags & E500_TLB_VALID));
+
+	/* Force IPROT=0 for all guest mappings. */
+	stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID;
+	stlbe->mas2 = (gvaddr & MAS2_EPN) | (ref->flags & E500_TLB_MAS2_ATTR);
+	stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) |
+			e500_shadow_mas3_attrib(gtlbe->mas7_3, pr);
+}
+
+static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
+	u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe,
+	int tlbsel, struct kvm_book3e_206_tlb_entry *stlbe,
+	struct tlbe_ref *ref)
+{
+	struct kvm_memory_slot *slot;
+	unsigned long pfn = 0; /* silence GCC warning */
+	unsigned long hva;
+	int pfnmap = 0;
+	int tsize = BOOK3E_PAGESZ_4K;
+	int ret = 0;
+	unsigned long mmu_seq;
+	struct kvm *kvm = vcpu_e500->vcpu.kvm;
+	unsigned long tsize_pages = 0;
+	pte_t *ptep;
+	unsigned int wimg = 0;
+	pgd_t *pgdir;
+	unsigned long flags;
+
+	/* used to check for invalidations in progress */
+	mmu_seq = kvm->mmu_notifier_seq;
+	smp_rmb();
+
+	/*
+	 * Translate guest physical to true physical, acquiring
+	 * a page reference if it is normal, non-reserved memory.
+	 *
+	 * gfn_to_memslot() must succeed because otherwise we wouldn't
+	 * have gotten this far.  Eventually we should just pass the slot
+	 * pointer through from the first lookup.
+	 */
+	slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn);
+	hva = gfn_to_hva_memslot(slot, gfn);
+
+	if (tlbsel == 1) {
+		struct vm_area_struct *vma;
+		down_read(&current->mm->mmap_sem);
+
+		vma = find_vma(current->mm, hva);
+		if (vma && hva >= vma->vm_start &&
+		    (vma->vm_flags & VM_PFNMAP)) {
+			/*
+			 * This VMA is a physically contiguous region (e.g.
+			 * /dev/mem) that bypasses normal Linux page
+			 * management.  Find the overlap between the
+			 * vma and the memslot.
+			 */
+
+			unsigned long start, end;
+			unsigned long slot_start, slot_end;
+
+			pfnmap = 1;
+
+			start = vma->vm_pgoff;
+			end = start +
+			      ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT);
+
+			pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT);
+
+			slot_start = pfn - (gfn - slot->base_gfn);
+			slot_end = slot_start + slot->npages;
+
+			if (start < slot_start)
+				start = slot_start;
+			if (end > slot_end)
+				end = slot_end;
+
+			tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
+				MAS1_TSIZE_SHIFT;
+
+			/*
+			 * e500 doesn't implement the lowest tsize bit,
+			 * or 1K pages.
+			 */
+			tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
+
+			/*
+			 * Now find the largest tsize (up to what the guest
+			 * requested) that will cover gfn, stay within the
+			 * range, and for which gfn and pfn are mutually
+			 * aligned.
+			 */
+
+			for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) {
+				unsigned long gfn_start, gfn_end;
+				tsize_pages = 1 << (tsize - 2);
+
+				gfn_start = gfn & ~(tsize_pages - 1);
+				gfn_end = gfn_start + tsize_pages;
+
+				if (gfn_start + pfn - gfn < start)
+					continue;
+				if (gfn_end + pfn - gfn > end)
+					continue;
+				if ((gfn & (tsize_pages - 1)) !=
+				    (pfn & (tsize_pages - 1)))
+					continue;
+
+				gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
+				pfn &= ~(tsize_pages - 1);
+				break;
+			}
+		} else if (vma && hva >= vma->vm_start &&
+			   (vma->vm_flags & VM_HUGETLB)) {
+			unsigned long psize = vma_kernel_pagesize(vma);
+
+			tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >>
+				MAS1_TSIZE_SHIFT;
+
+			/*
+			 * Take the largest page size that satisfies both host
+			 * and guest mapping
+			 */
+			tsize = min(__ilog2(psize) - 10, tsize);
+
+			/*
+			 * e500 doesn't implement the lowest tsize bit,
+			 * or 1K pages.
+			 */
+			tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1);
+		}
+
+		up_read(&current->mm->mmap_sem);
+	}
+
+	if (likely(!pfnmap)) {
+		tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
+		pfn = gfn_to_pfn_memslot(slot, gfn);
+		if (is_error_noslot_pfn(pfn)) {
+			if (printk_ratelimit())
+				pr_err("%s: real page not found for gfn %lx\n",
+				       __func__, (long)gfn);
+			return -EINVAL;
+		}
+
+		/* Align guest and physical address to page map boundaries */
+		pfn &= ~(tsize_pages - 1);
+		gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1);
+	}
+
+	spin_lock(&kvm->mmu_lock);
+	if (mmu_notifier_retry(kvm, mmu_seq)) {
+		ret = -EAGAIN;
+		goto out;
+	}
+
+
+	pgdir = vcpu_e500->vcpu.arch.pgdir;
+	/*
+	 * We are just looking at the wimg bits, so we don't
+	 * care much about the trans splitting bit.
+	 * We are holding kvm->mmu_lock so a notifier invalidate
+	 * can't run hence pfn won't change.
+	 */
+	local_irq_save(flags);
+	ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL);
+	if (ptep) {
+		pte_t pte = READ_ONCE(*ptep);
+
+		if (pte_present(pte)) {
+			wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) &
+				MAS2_WIMGE_MASK;
+			local_irq_restore(flags);
+		} else {
+			local_irq_restore(flags);
+			pr_err_ratelimited("%s: pte not present: gfn %lx,pfn %lx\n",
+					   __func__, (long)gfn, pfn);
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+	kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg);
+
+	kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,
+				ref, gvaddr, stlbe);
+
+	/* Clear i-cache for new pages */
+	kvmppc_mmu_flush_icache(pfn);
+
+out:
+	spin_unlock(&kvm->mmu_lock);
+
+	/* Drop refcount on page, so that mmu notifiers can clear it */
+	kvm_release_pfn_clean(pfn);
+
+	return ret;
+}
+
+/* XXX only map the one-one case, for now use TLB0 */
+static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, int esel,
+				struct kvm_book3e_206_tlb_entry *stlbe)
+{
+	struct kvm_book3e_206_tlb_entry *gtlbe;
+	struct tlbe_ref *ref;
+	int stlbsel = 0;
+	int sesel = 0;
+	int r;
+
+	gtlbe = get_entry(vcpu_e500, 0, esel);
+	ref = &vcpu_e500->gtlb_priv[0][esel].ref;
+
+	r = kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe),
+			get_tlb_raddr(gtlbe) >> PAGE_SHIFT,
+			gtlbe, 0, stlbe, ref);
+	if (r)
+		return r;
+
+	write_stlbe(vcpu_e500, gtlbe, stlbe, stlbsel, sesel);
+
+	return 0;
+}
+
+static int kvmppc_e500_tlb1_map_tlb1(struct kvmppc_vcpu_e500 *vcpu_e500,
+				     struct tlbe_ref *ref,
+				     int esel)
+{
+	unsigned int sesel = vcpu_e500->host_tlb1_nv++;
+
+	if (unlikely(vcpu_e500->host_tlb1_nv >= tlb1_max_shadow_size()))
+		vcpu_e500->host_tlb1_nv = 0;
+
+	if (vcpu_e500->h2g_tlb1_rmap[sesel]) {
+		unsigned int idx = vcpu_e500->h2g_tlb1_rmap[sesel] - 1;
+		vcpu_e500->g2h_tlb1_map[idx] &= ~(1ULL << sesel);
+	}
+
+	vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_BITMAP;
+	vcpu_e500->g2h_tlb1_map[esel] |= (u64)1 << sesel;
+	vcpu_e500->h2g_tlb1_rmap[sesel] = esel + 1;
+	WARN_ON(!(ref->flags & E500_TLB_VALID));
+
+	return sesel;
+}
+
+/* Caller must ensure that the specified guest TLB entry is safe to insert into
+ * the shadow TLB. */
+/* For both one-one and one-to-many */
+static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
+		u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe,
+		struct kvm_book3e_206_tlb_entry *stlbe, int esel)
+{
+	struct tlbe_ref *ref = &vcpu_e500->gtlb_priv[1][esel].ref;
+	int sesel;
+	int r;
+
+	r = kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, stlbe,
+				   ref);
+	if (r)
+		return r;
+
+	/* Use TLB0 when we can only map a page with 4k */
+	if (get_tlb_tsize(stlbe) == BOOK3E_PAGESZ_4K) {
+		vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_TLB0;
+		write_stlbe(vcpu_e500, gtlbe, stlbe, 0, 0);
+		return 0;
+	}
+
+	/* Otherwise map into TLB1 */
+	sesel = kvmppc_e500_tlb1_map_tlb1(vcpu_e500, ref, esel);
+	write_stlbe(vcpu_e500, gtlbe, stlbe, 1, sesel);
+
+	return 0;
+}
+
+void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
+		    unsigned int index)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	struct tlbe_priv *priv;
+	struct kvm_book3e_206_tlb_entry *gtlbe, stlbe;
+	int tlbsel = tlbsel_of(index);
+	int esel = esel_of(index);
+
+	gtlbe = get_entry(vcpu_e500, tlbsel, esel);
+
+	switch (tlbsel) {
+	case 0:
+		priv = &vcpu_e500->gtlb_priv[tlbsel][esel];
+
+		/* Triggers after clear_tlb_privs or on initial mapping */
+		if (!(priv->ref.flags & E500_TLB_VALID)) {
+			kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe);
+		} else {
+			kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K,
+						&priv->ref, eaddr, &stlbe);
+			write_stlbe(vcpu_e500, gtlbe, &stlbe, 0, 0);
+		}
+		break;
+
+	case 1: {
+		gfn_t gfn = gpaddr >> PAGE_SHIFT;
+		kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn, gtlbe, &stlbe,
+				     esel);
+		break;
+	}
+
+	default:
+		BUG();
+		break;
+	}
+}
+
+#ifdef CONFIG_KVM_BOOKE_HV
+int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, enum instruction_type type,
+			  u32 *instr)
+{
+	gva_t geaddr;
+	hpa_t addr;
+	hfn_t pfn;
+	hva_t eaddr;
+	u32 mas1, mas2, mas3;
+	u64 mas7_mas3;
+	struct page *page;
+	unsigned int addr_space, psize_shift;
+	bool pr;
+	unsigned long flags;
+
+	/* Search TLB for guest pc to get the real address */
+	geaddr = kvmppc_get_pc(vcpu);
+
+	addr_space = (vcpu->arch.shared->msr & MSR_IS) >> MSR_IR_LG;
+
+	local_irq_save(flags);
+	mtspr(SPRN_MAS6, (vcpu->arch.pid << MAS6_SPID_SHIFT) | addr_space);
+	mtspr(SPRN_MAS5, MAS5_SGS | get_lpid(vcpu));
+	asm volatile("tlbsx 0, %[geaddr]\n" : :
+		     [geaddr] "r" (geaddr));
+	mtspr(SPRN_MAS5, 0);
+	mtspr(SPRN_MAS8, 0);
+	mas1 = mfspr(SPRN_MAS1);
+	mas2 = mfspr(SPRN_MAS2);
+	mas3 = mfspr(SPRN_MAS3);
+#ifdef CONFIG_64BIT
+	mas7_mas3 = mfspr(SPRN_MAS7_MAS3);
+#else
+	mas7_mas3 = ((u64)mfspr(SPRN_MAS7) << 32) | mas3;
+#endif
+	local_irq_restore(flags);
+
+	/*
+	 * If the TLB entry for guest pc was evicted, return to the guest.
+	 * There are high chances to find a valid TLB entry next time.
+	 */
+	if (!(mas1 & MAS1_VALID))
+		return EMULATE_AGAIN;
+
+	/*
+	 * Another thread may rewrite the TLB entry in parallel, don't
+	 * execute from the address if the execute permission is not set
+	 */
+	pr = vcpu->arch.shared->msr & MSR_PR;
+	if (unlikely((pr && !(mas3 & MAS3_UX)) ||
+		     (!pr && !(mas3 & MAS3_SX)))) {
+		pr_err_ratelimited(
+			"%s: Instruction emulation from guest address %08lx without execute permission\n",
+			__func__, geaddr);
+		return EMULATE_AGAIN;
+	}
+
+	/*
+	 * The real address will be mapped by a cacheable, memory coherent,
+	 * write-back page. Check for mismatches when LRAT is used.
+	 */
+	if (has_feature(vcpu, VCPU_FTR_MMU_V2) &&
+	    unlikely((mas2 & MAS2_I) || (mas2 & MAS2_W) || !(mas2 & MAS2_M))) {
+		pr_err_ratelimited(
+			"%s: Instruction emulation from guest address %08lx mismatches storage attributes\n",
+			__func__, geaddr);
+		return EMULATE_AGAIN;
+	}
+
+	/* Get pfn */
+	psize_shift = MAS1_GET_TSIZE(mas1) + 10;
+	addr = (mas7_mas3 & (~0ULL << psize_shift)) |
+	       (geaddr & ((1ULL << psize_shift) - 1ULL));
+	pfn = addr >> PAGE_SHIFT;
+
+	/* Guard against emulation from devices area */
+	if (unlikely(!page_is_ram(pfn))) {
+		pr_err_ratelimited("%s: Instruction emulation from non-RAM host address %08llx is not supported\n",
+			 __func__, addr);
+		return EMULATE_AGAIN;
+	}
+
+	/* Map a page and get guest's instruction */
+	page = pfn_to_page(pfn);
+	eaddr = (unsigned long)kmap_atomic(page);
+	*instr = *(u32 *)(eaddr | (unsigned long)(addr & ~PAGE_MASK));
+	kunmap_atomic((u32 *)eaddr);
+
+	return EMULATE_DONE;
+}
+#else
+int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, enum instruction_type type,
+			  u32 *instr)
+{
+	return EMULATE_AGAIN;
+}
+#endif
+
+/************* MMU Notifiers *************/
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+	trace_kvm_unmap_hva(hva);
+
+	/*
+	 * Flush all shadow tlb entries everywhere. This is slow, but
+	 * we are 100% sure that we catch the to be unmapped page
+	 */
+	kvm_flush_remote_tlbs(kvm);
+
+	return 0;
+}
+
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+	/* kvm_unmap_hva flushes everything anyways */
+	kvm_unmap_hva(kvm, start);
+
+	return 0;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+	/* XXX could be more clever ;) */
+	return 0;
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+	/* XXX could be more clever ;) */
+	return 0;
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+	/* The page will get remapped properly on its next fault */
+	kvm_unmap_hva(kvm, hva);
+}
+
+/*****************************************/
+
+int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	host_tlb_params[0].entries = mfspr(SPRN_TLB0CFG) & TLBnCFG_N_ENTRY;
+	host_tlb_params[1].entries = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
+
+	/*
+	 * This should never happen on real e500 hardware, but is
+	 * architecturally possible -- e.g. in some weird nested
+	 * virtualization case.
+	 */
+	if (host_tlb_params[0].entries == 0 ||
+	    host_tlb_params[1].entries == 0) {
+		pr_err("%s: need to know host tlb size\n", __func__);
+		return -ENODEV;
+	}
+
+	host_tlb_params[0].ways = (mfspr(SPRN_TLB0CFG) & TLBnCFG_ASSOC) >>
+				  TLBnCFG_ASSOC_SHIFT;
+	host_tlb_params[1].ways = host_tlb_params[1].entries;
+
+	if (!is_power_of_2(host_tlb_params[0].entries) ||
+	    !is_power_of_2(host_tlb_params[0].ways) ||
+	    host_tlb_params[0].entries < host_tlb_params[0].ways ||
+	    host_tlb_params[0].ways == 0) {
+		pr_err("%s: bad tlb0 host config: %u entries %u ways\n",
+		       __func__, host_tlb_params[0].entries,
+		       host_tlb_params[0].ways);
+		return -ENODEV;
+	}
+
+	host_tlb_params[0].sets =
+		host_tlb_params[0].entries / host_tlb_params[0].ways;
+	host_tlb_params[1].sets = 1;
+
+	vcpu_e500->h2g_tlb1_rmap = kzalloc(sizeof(unsigned int) *
+					   host_tlb_params[1].entries,
+					   GFP_KERNEL);
+	if (!vcpu_e500->h2g_tlb1_rmap)
+		return -EINVAL;
+
+	return 0;
+}
+
+void e500_mmu_host_uninit(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	kfree(vcpu_e500->h2g_tlb1_rmap);
+}
diff --git a/arch/powerpc/kvm/e500_mmu_host.h b/arch/powerpc/kvm/e500_mmu_host.h
new file mode 100644
index 000000000..7624835b7
--- /dev/null
+++ b/arch/powerpc/kvm/e500_mmu_host.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef KVM_E500_MMU_HOST_H
+#define KVM_E500_MMU_HOST_H
+
+void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
+			 int esel);
+
+int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500);
+void e500_mmu_host_uninit(struct kvmppc_vcpu_e500 *vcpu_e500);
+
+#endif /* KVM_E500_MMU_HOST_H */
diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c
new file mode 100644
index 000000000..cda695de8
--- /dev/null
+++ b/arch/powerpc/kvm/e500mc.c
@@ -0,0 +1,443 @@
+/*
+ * Copyright (C) 2010,2012 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Varun Sethi, <varun.sethi@freescale.com>
+ *
+ * Description:
+ * This file is derived from arch/powerpc/kvm/e500.c,
+ * by Yu Liu <yu.liu@freescale.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/dbell.h>
+
+#include "booke.h"
+#include "e500.h"
+
+void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type)
+{
+	enum ppc_dbell dbell_type;
+	unsigned long tag;
+
+	switch (type) {
+	case INT_CLASS_NONCRIT:
+		dbell_type = PPC_G_DBELL;
+		break;
+	case INT_CLASS_CRIT:
+		dbell_type = PPC_G_DBELL_CRIT;
+		break;
+	case INT_CLASS_MC:
+		dbell_type = PPC_G_DBELL_MC;
+		break;
+	default:
+		WARN_ONCE(1, "%s: unknown int type %d\n", __func__, type);
+		return;
+	}
+
+	preempt_disable();
+	tag = PPC_DBELL_LPID(get_lpid(vcpu)) | vcpu->vcpu_id;
+	mb();
+	ppc_msgsnd(dbell_type, 0, tag);
+	preempt_enable();
+}
+
+/* gtlbe must not be mapped by more than one host tlb entry */
+void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500,
+			   struct kvm_book3e_206_tlb_entry *gtlbe)
+{
+	unsigned int tid, ts;
+	gva_t eaddr;
+	u32 val;
+	unsigned long flags;
+
+	ts = get_tlb_ts(gtlbe);
+	tid = get_tlb_tid(gtlbe);
+
+	/* We search the host TLB to invalidate its shadow TLB entry */
+	val = (tid << 16) | ts;
+	eaddr = get_tlb_eaddr(gtlbe);
+
+	local_irq_save(flags);
+
+	mtspr(SPRN_MAS6, val);
+	mtspr(SPRN_MAS5, MAS5_SGS | get_lpid(&vcpu_e500->vcpu));
+
+	asm volatile("tlbsx 0, %[eaddr]\n" : : [eaddr] "r" (eaddr));
+	val = mfspr(SPRN_MAS1);
+	if (val & MAS1_VALID) {
+		mtspr(SPRN_MAS1, val & ~MAS1_VALID);
+		asm volatile("tlbwe");
+	}
+	mtspr(SPRN_MAS5, 0);
+	/* NOTE: tlbsx also updates mas8, so clear it for host tlbwe */
+	mtspr(SPRN_MAS8, 0);
+	isync();
+
+	local_irq_restore(flags);
+}
+
+void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	mtspr(SPRN_MAS5, MAS5_SGS | get_lpid(&vcpu_e500->vcpu));
+	asm volatile("tlbilxlpid");
+	mtspr(SPRN_MAS5, 0);
+	local_irq_restore(flags);
+}
+
+void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid)
+{
+	vcpu->arch.pid = pid;
+}
+
+void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr)
+{
+}
+
+/* We use two lpids per VM */
+static DEFINE_PER_CPU(struct kvm_vcpu *[KVMPPC_NR_LPIDS], last_vcpu_of_lpid);
+
+static void kvmppc_core_vcpu_load_e500mc(struct kvm_vcpu *vcpu, int cpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	kvmppc_booke_vcpu_load(vcpu, cpu);
+
+	mtspr(SPRN_LPID, get_lpid(vcpu));
+	mtspr(SPRN_EPCR, vcpu->arch.shadow_epcr);
+	mtspr(SPRN_GPIR, vcpu->vcpu_id);
+	mtspr(SPRN_MSRP, vcpu->arch.shadow_msrp);
+	vcpu->arch.eplc = EPC_EGS | (get_lpid(vcpu) << EPC_ELPID_SHIFT);
+	vcpu->arch.epsc = vcpu->arch.eplc;
+	mtspr(SPRN_EPLC, vcpu->arch.eplc);
+	mtspr(SPRN_EPSC, vcpu->arch.epsc);
+
+	mtspr(SPRN_GIVPR, vcpu->arch.ivpr);
+	mtspr(SPRN_GIVOR2, vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]);
+	mtspr(SPRN_GIVOR8, vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]);
+	mtspr(SPRN_GSPRG0, (unsigned long)vcpu->arch.shared->sprg0);
+	mtspr(SPRN_GSPRG1, (unsigned long)vcpu->arch.shared->sprg1);
+	mtspr(SPRN_GSPRG2, (unsigned long)vcpu->arch.shared->sprg2);
+	mtspr(SPRN_GSPRG3, (unsigned long)vcpu->arch.shared->sprg3);
+
+	mtspr(SPRN_GSRR0, vcpu->arch.shared->srr0);
+	mtspr(SPRN_GSRR1, vcpu->arch.shared->srr1);
+
+	mtspr(SPRN_GEPR, vcpu->arch.epr);
+	mtspr(SPRN_GDEAR, vcpu->arch.shared->dar);
+	mtspr(SPRN_GESR, vcpu->arch.shared->esr);
+
+	if (vcpu->arch.oldpir != mfspr(SPRN_PIR) ||
+	    __this_cpu_read(last_vcpu_of_lpid[get_lpid(vcpu)]) != vcpu) {
+		kvmppc_e500_tlbil_all(vcpu_e500);
+		__this_cpu_write(last_vcpu_of_lpid[get_lpid(vcpu)], vcpu);
+	}
+}
+
+static void kvmppc_core_vcpu_put_e500mc(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.eplc = mfspr(SPRN_EPLC);
+	vcpu->arch.epsc = mfspr(SPRN_EPSC);
+
+	vcpu->arch.shared->sprg0 = mfspr(SPRN_GSPRG0);
+	vcpu->arch.shared->sprg1 = mfspr(SPRN_GSPRG1);
+	vcpu->arch.shared->sprg2 = mfspr(SPRN_GSPRG2);
+	vcpu->arch.shared->sprg3 = mfspr(SPRN_GSPRG3);
+
+	vcpu->arch.shared->srr0 = mfspr(SPRN_GSRR0);
+	vcpu->arch.shared->srr1 = mfspr(SPRN_GSRR1);
+
+	vcpu->arch.epr = mfspr(SPRN_GEPR);
+	vcpu->arch.shared->dar = mfspr(SPRN_GDEAR);
+	vcpu->arch.shared->esr = mfspr(SPRN_GESR);
+
+	vcpu->arch.oldpir = mfspr(SPRN_PIR);
+
+	kvmppc_booke_vcpu_put(vcpu);
+}
+
+int kvmppc_core_check_processor_compat(void)
+{
+	int r;
+
+	if (strcmp(cur_cpu_spec->cpu_name, "e500mc") == 0)
+		r = 0;
+	else if (strcmp(cur_cpu_spec->cpu_name, "e5500") == 0)
+		r = 0;
+#ifdef CONFIG_ALTIVEC
+	/*
+	 * Since guests have the priviledge to enable AltiVec, we need AltiVec
+	 * support in the host to save/restore their context.
+	 * Don't use CPU_FTR_ALTIVEC to identify cores with AltiVec unit
+	 * because it's cleared in the absence of CONFIG_ALTIVEC!
+	 */
+	else if (strcmp(cur_cpu_spec->cpu_name, "e6500") == 0)
+		r = 0;
+#endif
+	else
+		r = -ENOTSUPP;
+
+	return r;
+}
+
+int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	vcpu->arch.shadow_epcr = SPRN_EPCR_DSIGS | SPRN_EPCR_DGTMI | \
+				 SPRN_EPCR_DUVD;
+#ifdef CONFIG_64BIT
+	vcpu->arch.shadow_epcr |= SPRN_EPCR_ICM;
+#endif
+	vcpu->arch.shadow_msrp = MSRP_UCLEP | MSRP_PMMP;
+
+	vcpu->arch.pvr = mfspr(SPRN_PVR);
+	vcpu_e500->svr = mfspr(SPRN_SVR);
+
+	vcpu->arch.cpu_type = KVM_CPU_E500MC;
+
+	return 0;
+}
+
+static int kvmppc_core_get_sregs_e500mc(struct kvm_vcpu *vcpu,
+					struct kvm_sregs *sregs)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	sregs->u.e.features |= KVM_SREGS_E_ARCH206_MMU | KVM_SREGS_E_PM |
+			       KVM_SREGS_E_PC;
+	sregs->u.e.impl_id = KVM_SREGS_E_IMPL_FSL;
+
+	sregs->u.e.impl.fsl.features = 0;
+	sregs->u.e.impl.fsl.svr = vcpu_e500->svr;
+	sregs->u.e.impl.fsl.hid0 = vcpu_e500->hid0;
+	sregs->u.e.impl.fsl.mcar = vcpu_e500->mcar;
+
+	kvmppc_get_sregs_e500_tlb(vcpu, sregs);
+
+	sregs->u.e.ivor_high[3] =
+		vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR];
+	sregs->u.e.ivor_high[4] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL];
+	sregs->u.e.ivor_high[5] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT];
+
+	return kvmppc_get_sregs_ivor(vcpu, sregs);
+}
+
+static int kvmppc_core_set_sregs_e500mc(struct kvm_vcpu *vcpu,
+					struct kvm_sregs *sregs)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int ret;
+
+	if (sregs->u.e.impl_id == KVM_SREGS_E_IMPL_FSL) {
+		vcpu_e500->svr = sregs->u.e.impl.fsl.svr;
+		vcpu_e500->hid0 = sregs->u.e.impl.fsl.hid0;
+		vcpu_e500->mcar = sregs->u.e.impl.fsl.mcar;
+	}
+
+	ret = kvmppc_set_sregs_e500_tlb(vcpu, sregs);
+	if (ret < 0)
+		return ret;
+
+	if (!(sregs->u.e.features & KVM_SREGS_E_IVOR))
+		return 0;
+
+	if (sregs->u.e.features & KVM_SREGS_E_PM) {
+		vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] =
+			sregs->u.e.ivor_high[3];
+	}
+
+	if (sregs->u.e.features & KVM_SREGS_E_PC) {
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL] =
+			sregs->u.e.ivor_high[4];
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT] =
+			sregs->u.e.ivor_high[5];
+	}
+
+	return kvmppc_set_sregs_ivor(vcpu, sregs);
+}
+
+static int kvmppc_get_one_reg_e500mc(struct kvm_vcpu *vcpu, u64 id,
+			      union kvmppc_one_reg *val)
+{
+	int r = 0;
+
+	switch (id) {
+	case KVM_REG_PPC_SPRG9:
+		*val = get_reg_val(id, vcpu->arch.sprg9);
+		break;
+	default:
+		r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val);
+	}
+
+	return r;
+}
+
+static int kvmppc_set_one_reg_e500mc(struct kvm_vcpu *vcpu, u64 id,
+			      union kvmppc_one_reg *val)
+{
+	int r = 0;
+
+	switch (id) {
+	case KVM_REG_PPC_SPRG9:
+		vcpu->arch.sprg9 = set_reg_val(id, *val);
+		break;
+	default:
+		r = kvmppc_set_one_reg_e500_tlb(vcpu, id, val);
+	}
+
+	return r;
+}
+
+static struct kvm_vcpu *kvmppc_core_vcpu_create_e500mc(struct kvm *kvm,
+						       unsigned int id)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500;
+	struct kvm_vcpu *vcpu;
+	int err;
+
+	vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+	if (!vcpu_e500) {
+		err = -ENOMEM;
+		goto out;
+	}
+	vcpu = &vcpu_e500->vcpu;
+
+	/* Invalid PIR value -- this LPID dosn't have valid state on any cpu */
+	vcpu->arch.oldpir = 0xffffffff;
+
+	err = kvm_vcpu_init(vcpu, kvm, id);
+	if (err)
+		goto free_vcpu;
+
+	err = kvmppc_e500_tlb_init(vcpu_e500);
+	if (err)
+		goto uninit_vcpu;
+
+	vcpu->arch.shared = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+	if (!vcpu->arch.shared)
+		goto uninit_tlb;
+
+	return vcpu;
+
+uninit_tlb:
+	kvmppc_e500_tlb_uninit(vcpu_e500);
+uninit_vcpu:
+	kvm_vcpu_uninit(vcpu);
+
+free_vcpu:
+	kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
+out:
+	return ERR_PTR(err);
+}
+
+static void kvmppc_core_vcpu_free_e500mc(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	free_page((unsigned long)vcpu->arch.shared);
+	kvmppc_e500_tlb_uninit(vcpu_e500);
+	kvm_vcpu_uninit(vcpu);
+	kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
+}
+
+static int kvmppc_core_init_vm_e500mc(struct kvm *kvm)
+{
+	int lpid;
+
+	lpid = kvmppc_alloc_lpid();
+	if (lpid < 0)
+		return lpid;
+
+	/*
+	 * Use two lpids per VM on cores with two threads like e6500. Use
+	 * even numbers to speedup vcpu lpid computation with consecutive lpids
+	 * per VM. vm1 will use lpids 2 and 3, vm2 lpids 4 and 5, and so on.
+	 */
+	if (threads_per_core == 2)
+		lpid <<= 1;
+
+	kvm->arch.lpid = lpid;
+	return 0;
+}
+
+static void kvmppc_core_destroy_vm_e500mc(struct kvm *kvm)
+{
+	int lpid = kvm->arch.lpid;
+
+	if (threads_per_core == 2)
+		lpid >>= 1;
+
+	kvmppc_free_lpid(lpid);
+}
+
+static struct kvmppc_ops kvm_ops_e500mc = {
+	.get_sregs = kvmppc_core_get_sregs_e500mc,
+	.set_sregs = kvmppc_core_set_sregs_e500mc,
+	.get_one_reg = kvmppc_get_one_reg_e500mc,
+	.set_one_reg = kvmppc_set_one_reg_e500mc,
+	.vcpu_load   = kvmppc_core_vcpu_load_e500mc,
+	.vcpu_put    = kvmppc_core_vcpu_put_e500mc,
+	.vcpu_create = kvmppc_core_vcpu_create_e500mc,
+	.vcpu_free   = kvmppc_core_vcpu_free_e500mc,
+	.mmu_destroy  = kvmppc_mmu_destroy_e500,
+	.init_vm = kvmppc_core_init_vm_e500mc,
+	.destroy_vm = kvmppc_core_destroy_vm_e500mc,
+	.emulate_op = kvmppc_core_emulate_op_e500,
+	.emulate_mtspr = kvmppc_core_emulate_mtspr_e500,
+	.emulate_mfspr = kvmppc_core_emulate_mfspr_e500,
+};
+
+static int __init kvmppc_e500mc_init(void)
+{
+	int r;
+
+	r = kvmppc_booke_init();
+	if (r)
+		goto err_out;
+
+	/*
+	 * Use two lpids per VM on dual threaded processors like e6500
+	 * to workarround the lack of tlb write conditional instruction.
+	 * Expose half the number of available hardware lpids to the lpid
+	 * allocator.
+	 */
+	kvmppc_init_lpid(KVMPPC_NR_LPIDS/threads_per_core);
+	kvmppc_claim_lpid(0); /* host */
+
+	r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
+	if (r)
+		goto err_out;
+	kvm_ops_e500mc.owner = THIS_MODULE;
+	kvmppc_pr_ops = &kvm_ops_e500mc;
+
+err_out:
+	return r;
+}
+
+static void __exit kvmppc_e500mc_exit(void)
+{
+	kvmppc_pr_ops = NULL;
+	kvmppc_booke_exit();
+}
+
+module_init(kvmppc_e500mc_init);
+module_exit(kvmppc_e500mc_exit);
+MODULE_ALIAS_MISCDEV(KVM_MINOR);
+MODULE_ALIAS("devname:kvm");
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
new file mode 100644
index 000000000..5cc2e7af3
--- /dev/null
+++ b/arch/powerpc/kvm/emulate.c
@@ -0,0 +1,317 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2007
+ * Copyright 2011 Freescale Semiconductor, Inc.
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#include <linux/jiffies.h>
+#include <linux/hrtimer.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm_host.h>
+#include <linux/clockchips.h>
+
+#include <asm/reg.h>
+#include <asm/time.h>
+#include <asm/byteorder.h>
+#include <asm/kvm_ppc.h>
+#include <asm/disassemble.h>
+#include <asm/ppc-opcode.h>
+#include "timing.h"
+#include "trace.h"
+
+void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
+{
+	unsigned long dec_nsec;
+	unsigned long long dec_time;
+
+	pr_debug("mtDEC: %x\n", vcpu->arch.dec);
+	hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+
+#ifdef CONFIG_PPC_BOOK3S
+	/* mtdec lowers the interrupt line when positive. */
+	kvmppc_core_dequeue_dec(vcpu);
+
+	/* POWER4+ triggers a dec interrupt if the value is < 0 */
+	if (vcpu->arch.dec & 0x80000000) {
+		kvmppc_core_queue_dec(vcpu);
+		return;
+	}
+#endif
+
+#ifdef CONFIG_BOOKE
+	/* On BOOKE, DEC = 0 is as good as decrementer not enabled */
+	if (vcpu->arch.dec == 0)
+		return;
+#endif
+
+	/*
+	 * The decrementer ticks at the same rate as the timebase, so
+	 * that's how we convert the guest DEC value to the number of
+	 * host ticks.
+	 */
+
+	dec_time = vcpu->arch.dec;
+	/*
+	 * Guest timebase ticks at the same frequency as host decrementer.
+	 * So use the host decrementer calculations for decrementer emulation.
+	 */
+	dec_time = dec_time << decrementer_clockevent.shift;
+	do_div(dec_time, decrementer_clockevent.mult);
+	dec_nsec = do_div(dec_time, NSEC_PER_SEC);
+	hrtimer_start(&vcpu->arch.dec_timer,
+		ktime_set(dec_time, dec_nsec), HRTIMER_MODE_REL);
+	vcpu->arch.dec_jiffies = get_tb();
+}
+
+u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb)
+{
+	u64 jd = tb - vcpu->arch.dec_jiffies;
+
+#ifdef CONFIG_BOOKE
+	if (vcpu->arch.dec < jd)
+		return 0;
+#endif
+
+	return vcpu->arch.dec - jd;
+}
+
+static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
+{
+	enum emulation_result emulated = EMULATE_DONE;
+	ulong spr_val = kvmppc_get_gpr(vcpu, rs);
+
+	switch (sprn) {
+	case SPRN_SRR0:
+		kvmppc_set_srr0(vcpu, spr_val);
+		break;
+	case SPRN_SRR1:
+		kvmppc_set_srr1(vcpu, spr_val);
+		break;
+
+	/* XXX We need to context-switch the timebase for
+	 * watchdog and FIT. */
+	case SPRN_TBWL: break;
+	case SPRN_TBWU: break;
+
+	case SPRN_DEC:
+		vcpu->arch.dec = spr_val;
+		kvmppc_emulate_dec(vcpu);
+		break;
+
+	case SPRN_SPRG0:
+		kvmppc_set_sprg0(vcpu, spr_val);
+		break;
+	case SPRN_SPRG1:
+		kvmppc_set_sprg1(vcpu, spr_val);
+		break;
+	case SPRN_SPRG2:
+		kvmppc_set_sprg2(vcpu, spr_val);
+		break;
+	case SPRN_SPRG3:
+		kvmppc_set_sprg3(vcpu, spr_val);
+		break;
+
+	/* PIR can legally be written, but we ignore it */
+	case SPRN_PIR: break;
+
+	default:
+		emulated = vcpu->kvm->arch.kvm_ops->emulate_mtspr(vcpu, sprn,
+								  spr_val);
+		if (emulated == EMULATE_FAIL)
+			printk(KERN_INFO "mtspr: unknown spr "
+				"0x%x\n", sprn);
+		break;
+	}
+
+	kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS);
+
+	return emulated;
+}
+
+static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
+{
+	enum emulation_result emulated = EMULATE_DONE;
+	ulong spr_val = 0;
+
+	switch (sprn) {
+	case SPRN_SRR0:
+		spr_val = kvmppc_get_srr0(vcpu);
+		break;
+	case SPRN_SRR1:
+		spr_val = kvmppc_get_srr1(vcpu);
+		break;
+	case SPRN_PVR:
+		spr_val = vcpu->arch.pvr;
+		break;
+	case SPRN_PIR:
+		spr_val = vcpu->vcpu_id;
+		break;
+
+	/* Note: mftb and TBRL/TBWL are user-accessible, so
+	 * the guest can always access the real TB anyways.
+	 * In fact, we probably will never see these traps. */
+	case SPRN_TBWL:
+		spr_val = get_tb() >> 32;
+		break;
+	case SPRN_TBWU:
+		spr_val = get_tb();
+		break;
+
+	case SPRN_SPRG0:
+		spr_val = kvmppc_get_sprg0(vcpu);
+		break;
+	case SPRN_SPRG1:
+		spr_val = kvmppc_get_sprg1(vcpu);
+		break;
+	case SPRN_SPRG2:
+		spr_val = kvmppc_get_sprg2(vcpu);
+		break;
+	case SPRN_SPRG3:
+		spr_val = kvmppc_get_sprg3(vcpu);
+		break;
+	/* Note: SPRG4-7 are user-readable, so we don't get
+	 * a trap. */
+
+	case SPRN_DEC:
+		spr_val = kvmppc_get_dec(vcpu, get_tb());
+		break;
+	default:
+		emulated = vcpu->kvm->arch.kvm_ops->emulate_mfspr(vcpu, sprn,
+								  &spr_val);
+		if (unlikely(emulated == EMULATE_FAIL)) {
+			printk(KERN_INFO "mfspr: unknown spr "
+				"0x%x\n", sprn);
+		}
+		break;
+	}
+
+	if (emulated == EMULATE_DONE)
+		kvmppc_set_gpr(vcpu, rt, spr_val);
+	kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS);
+
+	return emulated;
+}
+
+/* XXX Should probably auto-generate instruction decoding for a particular core
+ * from opcode tables in the future. */
+int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+	u32 inst;
+	int rs, rt, sprn;
+	enum emulation_result emulated;
+	int advance = 1;
+
+	/* this default type might be overwritten by subcategories */
+	kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS);
+
+	emulated = kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst);
+	if (emulated != EMULATE_DONE)
+		return emulated;
+
+	pr_debug("Emulating opcode %d / %d\n", get_op(inst), get_xop(inst));
+
+	rs = get_rs(inst);
+	rt = get_rt(inst);
+	sprn = get_sprn(inst);
+
+	switch (get_op(inst)) {
+	case OP_TRAP:
+#ifdef CONFIG_PPC_BOOK3S
+	case OP_TRAP_64:
+		kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP);
+#else
+		kvmppc_core_queue_program(vcpu,
+					  vcpu->arch.shared->esr | ESR_PTR);
+#endif
+		advance = 0;
+		break;
+
+	case 31:
+		switch (get_xop(inst)) {
+
+		case OP_31_XOP_TRAP:
+#ifdef CONFIG_64BIT
+		case OP_31_XOP_TRAP_64:
+#endif
+#ifdef CONFIG_PPC_BOOK3S
+			kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP);
+#else
+			kvmppc_core_queue_program(vcpu,
+					vcpu->arch.shared->esr | ESR_PTR);
+#endif
+			advance = 0;
+			break;
+
+		case OP_31_XOP_MFSPR:
+			emulated = kvmppc_emulate_mfspr(vcpu, sprn, rt);
+			break;
+
+		case OP_31_XOP_MTSPR:
+			emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs);
+			break;
+
+		case OP_31_XOP_TLBSYNC:
+			break;
+
+		default:
+			/* Attempt core-specific emulation below. */
+			emulated = EMULATE_FAIL;
+		}
+		break;
+
+	case 0:
+		/*
+		 * Instruction with primary opcode 0. Based on PowerISA
+		 * these are illegal instructions.
+		 */
+		if (inst == KVMPPC_INST_SW_BREAKPOINT) {
+			run->exit_reason = KVM_EXIT_DEBUG;
+			run->debug.arch.address = kvmppc_get_pc(vcpu);
+			emulated = EMULATE_EXIT_USER;
+			advance = 0;
+		} else
+			emulated = EMULATE_FAIL;
+
+		break;
+
+	default:
+		emulated = EMULATE_FAIL;
+	}
+
+	if (emulated == EMULATE_FAIL) {
+		emulated = vcpu->kvm->arch.kvm_ops->emulate_op(run, vcpu, inst,
+							       &advance);
+		if (emulated == EMULATE_AGAIN) {
+			advance = 0;
+		} else if (emulated == EMULATE_FAIL) {
+			advance = 0;
+			printk(KERN_ERR "Couldn't emulate instruction 0x%08x "
+			       "(op %d xop %d)\n", inst, get_op(inst), get_xop(inst));
+			kvmppc_core_queue_program(vcpu, 0);
+		}
+	}
+
+	trace_kvm_ppc_instr(inst, kvmppc_get_pc(vcpu), emulated);
+
+	/* Advance past emulated instruction. */
+	if (advance)
+		kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4);
+
+	return emulated;
+}
+EXPORT_SYMBOL_GPL(kvmppc_emulate_instruction);
diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c
new file mode 100644
index 000000000..6d3c0ee1d
--- /dev/null
+++ b/arch/powerpc/kvm/emulate_loadstore.c
@@ -0,0 +1,272 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2007
+ * Copyright 2011 Freescale Semiconductor, Inc.
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#include <linux/jiffies.h>
+#include <linux/hrtimer.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm_host.h>
+#include <linux/clockchips.h>
+
+#include <asm/reg.h>
+#include <asm/time.h>
+#include <asm/byteorder.h>
+#include <asm/kvm_ppc.h>
+#include <asm/disassemble.h>
+#include <asm/ppc-opcode.h>
+#include "timing.h"
+#include "trace.h"
+
+/* XXX to do:
+ * lhax
+ * lhaux
+ * lswx
+ * lswi
+ * stswx
+ * stswi
+ * lha
+ * lhau
+ * lmw
+ * stmw
+ *
+ */
+int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
+{
+	struct kvm_run *run = vcpu->run;
+	u32 inst;
+	int ra, rs, rt;
+	enum emulation_result emulated;
+	int advance = 1;
+
+	/* this default type might be overwritten by subcategories */
+	kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS);
+
+	emulated = kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst);
+	if (emulated != EMULATE_DONE)
+		return emulated;
+
+	ra = get_ra(inst);
+	rs = get_rs(inst);
+	rt = get_rt(inst);
+
+	switch (get_op(inst)) {
+	case 31:
+		switch (get_xop(inst)) {
+		case OP_31_XOP_LWZX:
+			emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
+			break;
+
+		case OP_31_XOP_LBZX:
+			emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
+			break;
+
+		case OP_31_XOP_LBZUX:
+			emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
+			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+			break;
+
+		case OP_31_XOP_STWX:
+			emulated = kvmppc_handle_store(run, vcpu,
+						       kvmppc_get_gpr(vcpu, rs),
+			                               4, 1);
+			break;
+
+		case OP_31_XOP_STBX:
+			emulated = kvmppc_handle_store(run, vcpu,
+						       kvmppc_get_gpr(vcpu, rs),
+			                               1, 1);
+			break;
+
+		case OP_31_XOP_STBUX:
+			emulated = kvmppc_handle_store(run, vcpu,
+						       kvmppc_get_gpr(vcpu, rs),
+			                               1, 1);
+			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+			break;
+
+		case OP_31_XOP_LHAX:
+			emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1);
+			break;
+
+		case OP_31_XOP_LHZX:
+			emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);
+			break;
+
+		case OP_31_XOP_LHZUX:
+			emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);
+			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+			break;
+
+		case OP_31_XOP_STHX:
+			emulated = kvmppc_handle_store(run, vcpu,
+						       kvmppc_get_gpr(vcpu, rs),
+			                               2, 1);
+			break;
+
+		case OP_31_XOP_STHUX:
+			emulated = kvmppc_handle_store(run, vcpu,
+						       kvmppc_get_gpr(vcpu, rs),
+			                               2, 1);
+			kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+			break;
+
+		case OP_31_XOP_DCBST:
+		case OP_31_XOP_DCBF:
+		case OP_31_XOP_DCBI:
+			/* Do nothing. The guest is performing dcbi because
+			 * hardware DMA is not snooped by the dcache, but
+			 * emulated DMA either goes through the dcache as
+			 * normal writes, or the host kernel has handled dcache
+			 * coherence. */
+			break;
+
+		case OP_31_XOP_LWBRX:
+			emulated = kvmppc_handle_load(run, vcpu, rt, 4, 0);
+			break;
+
+		case OP_31_XOP_STWBRX:
+			emulated = kvmppc_handle_store(run, vcpu,
+						       kvmppc_get_gpr(vcpu, rs),
+			                               4, 0);
+			break;
+
+		case OP_31_XOP_LHBRX:
+			emulated = kvmppc_handle_load(run, vcpu, rt, 2, 0);
+			break;
+
+		case OP_31_XOP_STHBRX:
+			emulated = kvmppc_handle_store(run, vcpu,
+						       kvmppc_get_gpr(vcpu, rs),
+			                               2, 0);
+			break;
+
+		default:
+			emulated = EMULATE_FAIL;
+			break;
+		}
+		break;
+
+	case OP_LWZ:
+		emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
+		break;
+
+	/* TBD: Add support for other 64 bit load variants like ldu, ldux, ldx etc. */
+	case OP_LD:
+		rt = get_rt(inst);
+		emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1);
+		break;
+
+	case OP_LWZU:
+		emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
+		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+		break;
+
+	case OP_LBZ:
+		emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
+		break;
+
+	case OP_LBZU:
+		emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
+		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+		break;
+
+	case OP_STW:
+		emulated = kvmppc_handle_store(run, vcpu,
+					       kvmppc_get_gpr(vcpu, rs),
+		                               4, 1);
+		break;
+
+	/* TBD: Add support for other 64 bit store variants like stdu, stdux, stdx etc. */
+	case OP_STD:
+		rs = get_rs(inst);
+		emulated = kvmppc_handle_store(run, vcpu,
+					       kvmppc_get_gpr(vcpu, rs),
+		                               8, 1);
+		break;
+
+	case OP_STWU:
+		emulated = kvmppc_handle_store(run, vcpu,
+					       kvmppc_get_gpr(vcpu, rs),
+		                               4, 1);
+		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+		break;
+
+	case OP_STB:
+		emulated = kvmppc_handle_store(run, vcpu,
+					       kvmppc_get_gpr(vcpu, rs),
+		                               1, 1);
+		break;
+
+	case OP_STBU:
+		emulated = kvmppc_handle_store(run, vcpu,
+					       kvmppc_get_gpr(vcpu, rs),
+		                               1, 1);
+		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+		break;
+
+	case OP_LHZ:
+		emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);
+		break;
+
+	case OP_LHZU:
+		emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1);
+		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+		break;
+
+	case OP_LHA:
+		emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1);
+		break;
+
+	case OP_LHAU:
+		emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1);
+		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+		break;
+
+	case OP_STH:
+		emulated = kvmppc_handle_store(run, vcpu,
+					       kvmppc_get_gpr(vcpu, rs),
+		                               2, 1);
+		break;
+
+	case OP_STHU:
+		emulated = kvmppc_handle_store(run, vcpu,
+					       kvmppc_get_gpr(vcpu, rs),
+		                               2, 1);
+		kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed);
+		break;
+
+	default:
+		emulated = EMULATE_FAIL;
+		break;
+	}
+
+	if (emulated == EMULATE_FAIL) {
+		advance = 0;
+		kvmppc_core_queue_program(vcpu, 0);
+	}
+
+	trace_kvm_ppc_instr(inst, kvmppc_get_pc(vcpu), emulated);
+
+	/* Advance past emulated instruction. */
+	if (advance)
+		kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4);
+
+	return emulated;
+}
diff --git a/arch/powerpc/kvm/fpu.S b/arch/powerpc/kvm/fpu.S
new file mode 100644
index 000000000..bf68d5975
--- /dev/null
+++ b/arch/powerpc/kvm/fpu.S
@@ -0,0 +1,283 @@
+/*
+ *  FPU helper code to use FPU operations from inside the kernel
+ *
+ *    Copyright (C) 2010 Alexander Graf (agraf@suse.de)
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/cputable.h>
+#include <asm/cache.h>
+#include <asm/thread_info.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+
+/* Instructions operating on single parameters */
+
+/*
+ * Single operation with one input operand
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (short*)&result
+ * R5 = (short*)&param1
+ */
+#define FPS_ONE_IN(name) 					\
+_GLOBAL(fps_ ## name);							\
+	lfd	0,0(r3);		/* load up fpscr value */	\
+	MTFSF_L(0);							\
+	lfs	0,0(r5);						\
+									\
+	name	0,0;							\
+									\
+	stfs	0,0(r4);						\
+	mffs	0;							\
+	stfd	0,0(r3);	/* save new fpscr value */	\
+	blr
+
+/*
+ * Single operation with two input operands
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (short*)&result
+ * R5 = (short*)&param1
+ * R6 = (short*)&param2
+ */
+#define FPS_TWO_IN(name) 					\
+_GLOBAL(fps_ ## name);							\
+	lfd	0,0(r3);		/* load up fpscr value */	\
+	MTFSF_L(0);							\
+	lfs	0,0(r5);						\
+	lfs	1,0(r6);						\
+									\
+	name	0,0,1;							\
+									\
+	stfs	0,0(r4);						\
+	mffs	0;							\
+	stfd	0,0(r3);		/* save new fpscr value */	\
+	blr
+
+/*
+ * Single operation with three input operands
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (short*)&result
+ * R5 = (short*)&param1
+ * R6 = (short*)&param2
+ * R7 = (short*)&param3
+ */
+#define FPS_THREE_IN(name) 					\
+_GLOBAL(fps_ ## name);							\
+	lfd	0,0(r3);		/* load up fpscr value */	\
+	MTFSF_L(0);							\
+	lfs	0,0(r5);						\
+	lfs	1,0(r6);						\
+	lfs	2,0(r7);						\
+									\
+	name	0,0,1,2;						\
+									\
+	stfs	0,0(r4);						\
+	mffs	0;							\
+	stfd	0,0(r3);		/* save new fpscr value */	\
+	blr
+
+FPS_ONE_IN(fres)
+FPS_ONE_IN(frsqrte)
+FPS_ONE_IN(fsqrts)
+FPS_TWO_IN(fadds)
+FPS_TWO_IN(fdivs)
+FPS_TWO_IN(fmuls)
+FPS_TWO_IN(fsubs)
+FPS_THREE_IN(fmadds)
+FPS_THREE_IN(fmsubs)
+FPS_THREE_IN(fnmadds)
+FPS_THREE_IN(fnmsubs)
+FPS_THREE_IN(fsel)
+
+
+/* Instructions operating on double parameters */
+
+/*
+ * Beginning of double instruction processing
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (u32*)&cr
+ * R5 = (double*)&result
+ * R6 = (double*)&param1
+ * R7 = (double*)&param2 [load_two]
+ * R8 = (double*)&param3 [load_three]
+ * LR = instruction call function
+ */
+fpd_load_three:
+	lfd	2,0(r8)			/* load param3 */
+fpd_load_two:
+	lfd	1,0(r7)			/* load param2 */
+fpd_load_one:
+	lfd	0,0(r6)			/* load param1 */
+fpd_load_none:
+	lfd	3,0(r3)			/* load up fpscr value */
+	MTFSF_L(3)
+	lwz	r6, 0(r4)		/* load cr */
+	mtcr	r6
+	blr
+
+/*
+ * End of double instruction processing
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (u32*)&cr
+ * R5 = (double*)&result
+ * LR = caller of instruction call function
+ */
+fpd_return:
+	mfcr	r6
+	stfd	0,0(r5)			/* save result */
+	mffs	0
+	stfd	0,0(r3)			/* save new fpscr value */
+	stw	r6,0(r4)		/* save new cr value */
+	blr
+
+/*
+ * Double operation with no input operand
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (u32*)&cr
+ * R5 = (double*)&result
+ */
+#define FPD_NONE_IN(name) 						\
+_GLOBAL(fpd_ ## name);							\
+	mflr	r12;							\
+	bl	fpd_load_none;						\
+	mtlr	r12;							\
+									\
+	name.	0;			/* call instruction */		\
+	b	fpd_return
+
+/*
+ * Double operation with one input operand
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (u32*)&cr
+ * R5 = (double*)&result
+ * R6 = (double*)&param1
+ */
+#define FPD_ONE_IN(name) 						\
+_GLOBAL(fpd_ ## name);							\
+	mflr	r12;							\
+	bl	fpd_load_one;						\
+	mtlr	r12;							\
+									\
+	name.	0,0;			/* call instruction */		\
+	b	fpd_return
+
+/*
+ * Double operation with two input operands
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (u32*)&cr
+ * R5 = (double*)&result
+ * R6 = (double*)&param1
+ * R7 = (double*)&param2
+ * R8 = (double*)&param3
+ */
+#define FPD_TWO_IN(name) 						\
+_GLOBAL(fpd_ ## name);							\
+	mflr	r12;							\
+	bl	fpd_load_two;						\
+	mtlr	r12;							\
+									\
+	name.	0,0,1;			/* call instruction */		\
+	b	fpd_return
+
+/*
+ * CR Double operation with two input operands
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (u32*)&cr
+ * R5 = (double*)&param1
+ * R6 = (double*)&param2
+ * R7 = (double*)&param3
+ */
+#define FPD_TWO_IN_CR(name)						\
+_GLOBAL(fpd_ ## name);							\
+	lfd	1,0(r6);		/* load param2 */		\
+	lfd	0,0(r5);		/* load param1 */		\
+	lfd	3,0(r3);		/* load up fpscr value */	\
+	MTFSF_L(3);							\
+	lwz	r6, 0(r4);		/* load cr */			\
+	mtcr	r6;							\
+									\
+	name	0,0,1;			/* call instruction */		\
+	mfcr	r6;							\
+	mffs	0;							\
+	stfd	0,0(r3);		/* save new fpscr value */	\
+	stw	r6,0(r4);		/* save new cr value */		\
+	blr
+
+/*
+ * Double operation with three input operands
+ *
+ * R3 = (double*)&fpscr
+ * R4 = (u32*)&cr
+ * R5 = (double*)&result
+ * R6 = (double*)&param1
+ * R7 = (double*)&param2
+ * R8 = (double*)&param3
+ */
+#define FPD_THREE_IN(name) 						\
+_GLOBAL(fpd_ ## name);							\
+	mflr	r12;							\
+	bl	fpd_load_three;						\
+	mtlr	r12;							\
+									\
+	name.	0,0,1,2;		/* call instruction */		\
+	b	fpd_return
+
+FPD_ONE_IN(fsqrts)
+FPD_ONE_IN(frsqrtes)
+FPD_ONE_IN(fres)
+FPD_ONE_IN(frsp)
+FPD_ONE_IN(fctiw)
+FPD_ONE_IN(fctiwz)
+FPD_ONE_IN(fsqrt)
+FPD_ONE_IN(fre)
+FPD_ONE_IN(frsqrte)
+FPD_ONE_IN(fneg)
+FPD_ONE_IN(fabs)
+FPD_TWO_IN(fadds)
+FPD_TWO_IN(fsubs)
+FPD_TWO_IN(fdivs)
+FPD_TWO_IN(fmuls)
+FPD_TWO_IN_CR(fcmpu)
+FPD_TWO_IN(fcpsgn)
+FPD_TWO_IN(fdiv)
+FPD_TWO_IN(fadd)
+FPD_TWO_IN(fmul)
+FPD_TWO_IN_CR(fcmpo)
+FPD_TWO_IN(fsub)
+FPD_THREE_IN(fmsubs)
+FPD_THREE_IN(fmadds)
+FPD_THREE_IN(fnmsubs)
+FPD_THREE_IN(fnmadds)
+FPD_THREE_IN(fsel)
+FPD_THREE_IN(fmsub)
+FPD_THREE_IN(fmadd)
+FPD_THREE_IN(fnmsub)
+FPD_THREE_IN(fnmadd)
+
+_GLOBAL(kvm_cvt_fd)
+	lfs	0,0(r3)
+	stfd	0,0(r4)
+	blr
+
+_GLOBAL(kvm_cvt_df)
+	lfd	0,0(r3)
+	stfs	0,0(r4)
+	blr
diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h
new file mode 100644
index 000000000..5a9a10b90
--- /dev/null
+++ b/arch/powerpc/kvm/irq.h
@@ -0,0 +1,20 @@
+#ifndef __IRQ_H
+#define __IRQ_H
+
+#include <linux/kvm_host.h>
+
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+	int ret = 0;
+
+#ifdef CONFIG_KVM_MPIC
+	ret = ret || (kvm->arch.mpic != NULL);
+#endif
+#ifdef CONFIG_KVM_XICS
+	ret = ret || (kvm->arch.xics != NULL);
+#endif
+	smp_rmb();
+	return ret;
+}
+
+#endif
diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
new file mode 100644
index 000000000..6249cdc83
--- /dev/null
+++ b/arch/powerpc/kvm/mpic.c
@@ -0,0 +1,1852 @@
+/*
+ * OpenPIC emulation
+ *
+ * Copyright (c) 2004 Jocelyn Mayer
+ *               2011 Alexander Graf
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/kvm_host.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <asm/uaccess.h>
+#include <asm/mpic.h>
+#include <asm/kvm_para.h>
+#include <asm/kvm_host.h>
+#include <asm/kvm_ppc.h>
+#include <kvm/iodev.h>
+
+#define MAX_CPU     32
+#define MAX_SRC     256
+#define MAX_TMR     4
+#define MAX_IPI     4
+#define MAX_MSI     8
+#define MAX_IRQ     (MAX_SRC + MAX_IPI + MAX_TMR)
+#define VID         0x03	/* MPIC version ID */
+
+/* OpenPIC capability flags */
+#define OPENPIC_FLAG_IDR_CRIT     (1 << 0)
+#define OPENPIC_FLAG_ILR          (2 << 0)
+
+/* OpenPIC address map */
+#define OPENPIC_REG_SIZE             0x40000
+#define OPENPIC_GLB_REG_START        0x0
+#define OPENPIC_GLB_REG_SIZE         0x10F0
+#define OPENPIC_TMR_REG_START        0x10F0
+#define OPENPIC_TMR_REG_SIZE         0x220
+#define OPENPIC_MSI_REG_START        0x1600
+#define OPENPIC_MSI_REG_SIZE         0x200
+#define OPENPIC_SUMMARY_REG_START    0x3800
+#define OPENPIC_SUMMARY_REG_SIZE     0x800
+#define OPENPIC_SRC_REG_START        0x10000
+#define OPENPIC_SRC_REG_SIZE         (MAX_SRC * 0x20)
+#define OPENPIC_CPU_REG_START        0x20000
+#define OPENPIC_CPU_REG_SIZE         (0x100 + ((MAX_CPU - 1) * 0x1000))
+
+struct fsl_mpic_info {
+	int max_ext;
+};
+
+static struct fsl_mpic_info fsl_mpic_20 = {
+	.max_ext = 12,
+};
+
+static struct fsl_mpic_info fsl_mpic_42 = {
+	.max_ext = 12,
+};
+
+#define FRR_NIRQ_SHIFT    16
+#define FRR_NCPU_SHIFT     8
+#define FRR_VID_SHIFT      0
+
+#define VID_REVISION_1_2   2
+#define VID_REVISION_1_3   3
+
+#define VIR_GENERIC      0x00000000	/* Generic Vendor ID */
+
+#define GCR_RESET        0x80000000
+#define GCR_MODE_PASS    0x00000000
+#define GCR_MODE_MIXED   0x20000000
+#define GCR_MODE_PROXY   0x60000000
+
+#define TBCR_CI           0x80000000	/* count inhibit */
+#define TCCR_TOG          0x80000000	/* toggles when decrement to zero */
+
+#define IDR_EP_SHIFT      31
+#define IDR_EP_MASK       (1 << IDR_EP_SHIFT)
+#define IDR_CI0_SHIFT     30
+#define IDR_CI1_SHIFT     29
+#define IDR_P1_SHIFT      1
+#define IDR_P0_SHIFT      0
+
+#define ILR_INTTGT_MASK   0x000000ff
+#define ILR_INTTGT_INT    0x00
+#define ILR_INTTGT_CINT   0x01	/* critical */
+#define ILR_INTTGT_MCP    0x02	/* machine check */
+#define NUM_OUTPUTS       3
+
+#define MSIIR_OFFSET       0x140
+#define MSIIR_SRS_SHIFT    29
+#define MSIIR_SRS_MASK     (0x7 << MSIIR_SRS_SHIFT)
+#define MSIIR_IBS_SHIFT    24
+#define MSIIR_IBS_MASK     (0x1f << MSIIR_IBS_SHIFT)
+
+static int get_current_cpu(void)
+{
+#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE)
+	struct kvm_vcpu *vcpu = current->thread.kvm_vcpu;
+	return vcpu ? vcpu->arch.irq_cpu_id : -1;
+#else
+	/* XXX */
+	return -1;
+#endif
+}
+
+static int openpic_cpu_write_internal(void *opaque, gpa_t addr,
+				      u32 val, int idx);
+static int openpic_cpu_read_internal(void *opaque, gpa_t addr,
+				     u32 *ptr, int idx);
+static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ,
+				    uint32_t val);
+
+enum irq_type {
+	IRQ_TYPE_NORMAL = 0,
+	IRQ_TYPE_FSLINT,	/* FSL internal interrupt -- level only */
+	IRQ_TYPE_FSLSPECIAL,	/* FSL timer/IPI interrupt, edge, no polarity */
+};
+
+struct irq_queue {
+	/* Round up to the nearest 64 IRQs so that the queue length
+	 * won't change when moving between 32 and 64 bit hosts.
+	 */
+	unsigned long queue[BITS_TO_LONGS((MAX_IRQ + 63) & ~63)];
+	int next;
+	int priority;
+};
+
+struct irq_source {
+	uint32_t ivpr;		/* IRQ vector/priority register */
+	uint32_t idr;		/* IRQ destination register */
+	uint32_t destmask;	/* bitmap of CPU destinations */
+	int last_cpu;
+	int output;		/* IRQ level, e.g. ILR_INTTGT_INT */
+	int pending;		/* TRUE if IRQ is pending */
+	enum irq_type type;
+	bool level:1;		/* level-triggered */
+	bool nomask:1;	/* critical interrupts ignore mask on some FSL MPICs */
+};
+
+#define IVPR_MASK_SHIFT       31
+#define IVPR_MASK_MASK        (1 << IVPR_MASK_SHIFT)
+#define IVPR_ACTIVITY_SHIFT   30
+#define IVPR_ACTIVITY_MASK    (1 << IVPR_ACTIVITY_SHIFT)
+#define IVPR_MODE_SHIFT       29
+#define IVPR_MODE_MASK        (1 << IVPR_MODE_SHIFT)
+#define IVPR_POLARITY_SHIFT   23
+#define IVPR_POLARITY_MASK    (1 << IVPR_POLARITY_SHIFT)
+#define IVPR_SENSE_SHIFT      22
+#define IVPR_SENSE_MASK       (1 << IVPR_SENSE_SHIFT)
+
+#define IVPR_PRIORITY_MASK     (0xF << 16)
+#define IVPR_PRIORITY(_ivprr_) ((int)(((_ivprr_) & IVPR_PRIORITY_MASK) >> 16))
+#define IVPR_VECTOR(opp, _ivprr_) ((_ivprr_) & (opp)->vector_mask)
+
+/* IDR[EP/CI] are only for FSL MPIC prior to v4.0 */
+#define IDR_EP      0x80000000	/* external pin */
+#define IDR_CI      0x40000000	/* critical interrupt */
+
+struct irq_dest {
+	struct kvm_vcpu *vcpu;
+
+	int32_t ctpr;		/* CPU current task priority */
+	struct irq_queue raised;
+	struct irq_queue servicing;
+
+	/* Count of IRQ sources asserting on non-INT outputs */
+	uint32_t outputs_active[NUM_OUTPUTS];
+};
+
+#define MAX_MMIO_REGIONS 10
+
+struct openpic {
+	struct kvm *kvm;
+	struct kvm_device *dev;
+	struct kvm_io_device mmio;
+	const struct mem_reg *mmio_regions[MAX_MMIO_REGIONS];
+	int num_mmio_regions;
+
+	gpa_t reg_base;
+	spinlock_t lock;
+
+	/* Behavior control */
+	struct fsl_mpic_info *fsl;
+	uint32_t model;
+	uint32_t flags;
+	uint32_t nb_irqs;
+	uint32_t vid;
+	uint32_t vir;		/* Vendor identification register */
+	uint32_t vector_mask;
+	uint32_t tfrr_reset;
+	uint32_t ivpr_reset;
+	uint32_t idr_reset;
+	uint32_t brr1;
+	uint32_t mpic_mode_mask;
+
+	/* Global registers */
+	uint32_t frr;		/* Feature reporting register */
+	uint32_t gcr;		/* Global configuration register  */
+	uint32_t pir;		/* Processor initialization register */
+	uint32_t spve;		/* Spurious vector register */
+	uint32_t tfrr;		/* Timer frequency reporting register */
+	/* Source registers */
+	struct irq_source src[MAX_IRQ];
+	/* Local registers per output pin */
+	struct irq_dest dst[MAX_CPU];
+	uint32_t nb_cpus;
+	/* Timer registers */
+	struct {
+		uint32_t tccr;	/* Global timer current count register */
+		uint32_t tbcr;	/* Global timer base count register */
+	} timers[MAX_TMR];
+	/* Shared MSI registers */
+	struct {
+		uint32_t msir;	/* Shared Message Signaled Interrupt Register */
+	} msi[MAX_MSI];
+	uint32_t max_irq;
+	uint32_t irq_ipi0;
+	uint32_t irq_tim0;
+	uint32_t irq_msi;
+};
+
+
+static void mpic_irq_raise(struct openpic *opp, struct irq_dest *dst,
+			   int output)
+{
+	struct kvm_interrupt irq = {
+		.irq = KVM_INTERRUPT_SET_LEVEL,
+	};
+
+	if (!dst->vcpu) {
+		pr_debug("%s: destination cpu %d does not exist\n",
+			 __func__, (int)(dst - &opp->dst[0]));
+		return;
+	}
+
+	pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id,
+		output);
+
+	if (output != ILR_INTTGT_INT)	/* TODO */
+		return;
+
+	kvm_vcpu_ioctl_interrupt(dst->vcpu, &irq);
+}
+
+static void mpic_irq_lower(struct openpic *opp, struct irq_dest *dst,
+			   int output)
+{
+	if (!dst->vcpu) {
+		pr_debug("%s: destination cpu %d does not exist\n",
+			 __func__, (int)(dst - &opp->dst[0]));
+		return;
+	}
+
+	pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id,
+		output);
+
+	if (output != ILR_INTTGT_INT)	/* TODO */
+		return;
+
+	kvmppc_core_dequeue_external(dst->vcpu);
+}
+
+static inline void IRQ_setbit(struct irq_queue *q, int n_IRQ)
+{
+	set_bit(n_IRQ, q->queue);
+}
+
+static inline void IRQ_resetbit(struct irq_queue *q, int n_IRQ)
+{
+	clear_bit(n_IRQ, q->queue);
+}
+
+static void IRQ_check(struct openpic *opp, struct irq_queue *q)
+{
+	int irq = -1;
+	int next = -1;
+	int priority = -1;
+
+	for (;;) {
+		irq = find_next_bit(q->queue, opp->max_irq, irq + 1);
+		if (irq == opp->max_irq)
+			break;
+
+		pr_debug("IRQ_check: irq %d set ivpr_pr=%d pr=%d\n",
+			irq, IVPR_PRIORITY(opp->src[irq].ivpr), priority);
+
+		if (IVPR_PRIORITY(opp->src[irq].ivpr) > priority) {
+			next = irq;
+			priority = IVPR_PRIORITY(opp->src[irq].ivpr);
+		}
+	}
+
+	q->next = next;
+	q->priority = priority;
+}
+
+static int IRQ_get_next(struct openpic *opp, struct irq_queue *q)
+{
+	/* XXX: optimize */
+	IRQ_check(opp, q);
+
+	return q->next;
+}
+
+static void IRQ_local_pipe(struct openpic *opp, int n_CPU, int n_IRQ,
+			   bool active, bool was_active)
+{
+	struct irq_dest *dst;
+	struct irq_source *src;
+	int priority;
+
+	dst = &opp->dst[n_CPU];
+	src = &opp->src[n_IRQ];
+
+	pr_debug("%s: IRQ %d active %d was %d\n",
+		__func__, n_IRQ, active, was_active);
+
+	if (src->output != ILR_INTTGT_INT) {
+		pr_debug("%s: output %d irq %d active %d was %d count %d\n",
+			__func__, src->output, n_IRQ, active, was_active,
+			dst->outputs_active[src->output]);
+
+		/* On Freescale MPIC, critical interrupts ignore priority,
+		 * IACK, EOI, etc.  Before MPIC v4.1 they also ignore
+		 * masking.
+		 */
+		if (active) {
+			if (!was_active &&
+			    dst->outputs_active[src->output]++ == 0) {
+				pr_debug("%s: Raise OpenPIC output %d cpu %d irq %d\n",
+					__func__, src->output, n_CPU, n_IRQ);
+				mpic_irq_raise(opp, dst, src->output);
+			}
+		} else {
+			if (was_active &&
+			    --dst->outputs_active[src->output] == 0) {
+				pr_debug("%s: Lower OpenPIC output %d cpu %d irq %d\n",
+					__func__, src->output, n_CPU, n_IRQ);
+				mpic_irq_lower(opp, dst, src->output);
+			}
+		}
+
+		return;
+	}
+
+	priority = IVPR_PRIORITY(src->ivpr);
+
+	/* Even if the interrupt doesn't have enough priority,
+	 * it is still raised, in case ctpr is lowered later.
+	 */
+	if (active)
+		IRQ_setbit(&dst->raised, n_IRQ);
+	else
+		IRQ_resetbit(&dst->raised, n_IRQ);
+
+	IRQ_check(opp, &dst->raised);
+
+	if (active && priority <= dst->ctpr) {
+		pr_debug("%s: IRQ %d priority %d too low for ctpr %d on CPU %d\n",
+			__func__, n_IRQ, priority, dst->ctpr, n_CPU);
+		active = 0;
+	}
+
+	if (active) {
+		if (IRQ_get_next(opp, &dst->servicing) >= 0 &&
+		    priority <= dst->servicing.priority) {
+			pr_debug("%s: IRQ %d is hidden by servicing IRQ %d on CPU %d\n",
+				__func__, n_IRQ, dst->servicing.next, n_CPU);
+		} else {
+			pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d/%d\n",
+				__func__, n_CPU, n_IRQ, dst->raised.next);
+			mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
+		}
+	} else {
+		IRQ_get_next(opp, &dst->servicing);
+		if (dst->raised.priority > dst->ctpr &&
+		    dst->raised.priority > dst->servicing.priority) {
+			pr_debug("%s: IRQ %d inactive, IRQ %d prio %d above %d/%d, CPU %d\n",
+				__func__, n_IRQ, dst->raised.next,
+				dst->raised.priority, dst->ctpr,
+				dst->servicing.priority, n_CPU);
+			/* IRQ line stays asserted */
+		} else {
+			pr_debug("%s: IRQ %d inactive, current prio %d/%d, CPU %d\n",
+				__func__, n_IRQ, dst->ctpr,
+				dst->servicing.priority, n_CPU);
+			mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
+		}
+	}
+}
+
+/* update pic state because registers for n_IRQ have changed value */
+static void openpic_update_irq(struct openpic *opp, int n_IRQ)
+{
+	struct irq_source *src;
+	bool active, was_active;
+	int i;
+
+	src = &opp->src[n_IRQ];
+	active = src->pending;
+
+	if ((src->ivpr & IVPR_MASK_MASK) && !src->nomask) {
+		/* Interrupt source is disabled */
+		pr_debug("%s: IRQ %d is disabled\n", __func__, n_IRQ);
+		active = false;
+	}
+
+	was_active = !!(src->ivpr & IVPR_ACTIVITY_MASK);
+
+	/*
+	 * We don't have a similar check for already-active because
+	 * ctpr may have changed and we need to withdraw the interrupt.
+	 */
+	if (!active && !was_active) {
+		pr_debug("%s: IRQ %d is already inactive\n", __func__, n_IRQ);
+		return;
+	}
+
+	if (active)
+		src->ivpr |= IVPR_ACTIVITY_MASK;
+	else
+		src->ivpr &= ~IVPR_ACTIVITY_MASK;
+
+	if (src->destmask == 0) {
+		/* No target */
+		pr_debug("%s: IRQ %d has no target\n", __func__, n_IRQ);
+		return;
+	}
+
+	if (src->destmask == (1 << src->last_cpu)) {
+		/* Only one CPU is allowed to receive this IRQ */
+		IRQ_local_pipe(opp, src->last_cpu, n_IRQ, active, was_active);
+	} else if (!(src->ivpr & IVPR_MODE_MASK)) {
+		/* Directed delivery mode */
+		for (i = 0; i < opp->nb_cpus; i++) {
+			if (src->destmask & (1 << i)) {
+				IRQ_local_pipe(opp, i, n_IRQ, active,
+					       was_active);
+			}
+		}
+	} else {
+		/* Distributed delivery mode */
+		for (i = src->last_cpu + 1; i != src->last_cpu; i++) {
+			if (i == opp->nb_cpus)
+				i = 0;
+
+			if (src->destmask & (1 << i)) {
+				IRQ_local_pipe(opp, i, n_IRQ, active,
+					       was_active);
+				src->last_cpu = i;
+				break;
+			}
+		}
+	}
+}
+
+static void openpic_set_irq(void *opaque, int n_IRQ, int level)
+{
+	struct openpic *opp = opaque;
+	struct irq_source *src;
+
+	if (n_IRQ >= MAX_IRQ) {
+		WARN_ONCE(1, "%s: IRQ %d out of range\n", __func__, n_IRQ);
+		return;
+	}
+
+	src = &opp->src[n_IRQ];
+	pr_debug("openpic: set irq %d = %d ivpr=0x%08x\n",
+		n_IRQ, level, src->ivpr);
+	if (src->level) {
+		/* level-sensitive irq */
+		src->pending = level;
+		openpic_update_irq(opp, n_IRQ);
+	} else {
+		/* edge-sensitive irq */
+		if (level) {
+			src->pending = 1;
+			openpic_update_irq(opp, n_IRQ);
+		}
+
+		if (src->output != ILR_INTTGT_INT) {
+			/* Edge-triggered interrupts shouldn't be used
+			 * with non-INT delivery, but just in case,
+			 * try to make it do something sane rather than
+			 * cause an interrupt storm.  This is close to
+			 * what you'd probably see happen in real hardware.
+			 */
+			src->pending = 0;
+			openpic_update_irq(opp, n_IRQ);
+		}
+	}
+}
+
+static void openpic_reset(struct openpic *opp)
+{
+	int i;
+
+	opp->gcr = GCR_RESET;
+	/* Initialise controller registers */
+	opp->frr = ((opp->nb_irqs - 1) << FRR_NIRQ_SHIFT) |
+	    (opp->vid << FRR_VID_SHIFT);
+
+	opp->pir = 0;
+	opp->spve = -1 & opp->vector_mask;
+	opp->tfrr = opp->tfrr_reset;
+	/* Initialise IRQ sources */
+	for (i = 0; i < opp->max_irq; i++) {
+		opp->src[i].ivpr = opp->ivpr_reset;
+
+		switch (opp->src[i].type) {
+		case IRQ_TYPE_NORMAL:
+			opp->src[i].level =
+			    !!(opp->ivpr_reset & IVPR_SENSE_MASK);
+			break;
+
+		case IRQ_TYPE_FSLINT:
+			opp->src[i].ivpr |= IVPR_POLARITY_MASK;
+			break;
+
+		case IRQ_TYPE_FSLSPECIAL:
+			break;
+		}
+
+		write_IRQreg_idr(opp, i, opp->idr_reset);
+	}
+	/* Initialise IRQ destinations */
+	for (i = 0; i < MAX_CPU; i++) {
+		opp->dst[i].ctpr = 15;
+		memset(&opp->dst[i].raised, 0, sizeof(struct irq_queue));
+		opp->dst[i].raised.next = -1;
+		memset(&opp->dst[i].servicing, 0, sizeof(struct irq_queue));
+		opp->dst[i].servicing.next = -1;
+	}
+	/* Initialise timers */
+	for (i = 0; i < MAX_TMR; i++) {
+		opp->timers[i].tccr = 0;
+		opp->timers[i].tbcr = TBCR_CI;
+	}
+	/* Go out of RESET state */
+	opp->gcr = 0;
+}
+
+static inline uint32_t read_IRQreg_idr(struct openpic *opp, int n_IRQ)
+{
+	return opp->src[n_IRQ].idr;
+}
+
+static inline uint32_t read_IRQreg_ilr(struct openpic *opp, int n_IRQ)
+{
+	if (opp->flags & OPENPIC_FLAG_ILR)
+		return opp->src[n_IRQ].output;
+
+	return 0xffffffff;
+}
+
+static inline uint32_t read_IRQreg_ivpr(struct openpic *opp, int n_IRQ)
+{
+	return opp->src[n_IRQ].ivpr;
+}
+
+static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ,
+				    uint32_t val)
+{
+	struct irq_source *src = &opp->src[n_IRQ];
+	uint32_t normal_mask = (1UL << opp->nb_cpus) - 1;
+	uint32_t crit_mask = 0;
+	uint32_t mask = normal_mask;
+	int crit_shift = IDR_EP_SHIFT - opp->nb_cpus;
+	int i;
+
+	if (opp->flags & OPENPIC_FLAG_IDR_CRIT) {
+		crit_mask = mask << crit_shift;
+		mask |= crit_mask | IDR_EP;
+	}
+
+	src->idr = val & mask;
+	pr_debug("Set IDR %d to 0x%08x\n", n_IRQ, src->idr);
+
+	if (opp->flags & OPENPIC_FLAG_IDR_CRIT) {
+		if (src->idr & crit_mask) {
+			if (src->idr & normal_mask) {
+				pr_debug("%s: IRQ configured for multiple output types, using critical\n",
+					__func__);
+			}
+
+			src->output = ILR_INTTGT_CINT;
+			src->nomask = true;
+			src->destmask = 0;
+
+			for (i = 0; i < opp->nb_cpus; i++) {
+				int n_ci = IDR_CI0_SHIFT - i;
+
+				if (src->idr & (1UL << n_ci))
+					src->destmask |= 1UL << i;
+			}
+		} else {
+			src->output = ILR_INTTGT_INT;
+			src->nomask = false;
+			src->destmask = src->idr & normal_mask;
+		}
+	} else {
+		src->destmask = src->idr;
+	}
+}
+
+static inline void write_IRQreg_ilr(struct openpic *opp, int n_IRQ,
+				    uint32_t val)
+{
+	if (opp->flags & OPENPIC_FLAG_ILR) {
+		struct irq_source *src = &opp->src[n_IRQ];
+
+		src->output = val & ILR_INTTGT_MASK;
+		pr_debug("Set ILR %d to 0x%08x, output %d\n", n_IRQ, src->idr,
+			src->output);
+
+		/* TODO: on MPIC v4.0 only, set nomask for non-INT */
+	}
+}
+
+static inline void write_IRQreg_ivpr(struct openpic *opp, int n_IRQ,
+				     uint32_t val)
+{
+	uint32_t mask;
+
+	/* NOTE when implementing newer FSL MPIC models: starting with v4.0,
+	 * the polarity bit is read-only on internal interrupts.
+	 */
+	mask = IVPR_MASK_MASK | IVPR_PRIORITY_MASK | IVPR_SENSE_MASK |
+	    IVPR_POLARITY_MASK | opp->vector_mask;
+
+	/* ACTIVITY bit is read-only */
+	opp->src[n_IRQ].ivpr =
+	    (opp->src[n_IRQ].ivpr & IVPR_ACTIVITY_MASK) | (val & mask);
+
+	/* For FSL internal interrupts, The sense bit is reserved and zero,
+	 * and the interrupt is always level-triggered.  Timers and IPIs
+	 * have no sense or polarity bits, and are edge-triggered.
+	 */
+	switch (opp->src[n_IRQ].type) {
+	case IRQ_TYPE_NORMAL:
+		opp->src[n_IRQ].level =
+		    !!(opp->src[n_IRQ].ivpr & IVPR_SENSE_MASK);
+		break;
+
+	case IRQ_TYPE_FSLINT:
+		opp->src[n_IRQ].ivpr &= ~IVPR_SENSE_MASK;
+		break;
+
+	case IRQ_TYPE_FSLSPECIAL:
+		opp->src[n_IRQ].ivpr &= ~(IVPR_POLARITY_MASK | IVPR_SENSE_MASK);
+		break;
+	}
+
+	openpic_update_irq(opp, n_IRQ);
+	pr_debug("Set IVPR %d to 0x%08x -> 0x%08x\n", n_IRQ, val,
+		opp->src[n_IRQ].ivpr);
+}
+
+static void openpic_gcr_write(struct openpic *opp, uint64_t val)
+{
+	if (val & GCR_RESET) {
+		openpic_reset(opp);
+		return;
+	}
+
+	opp->gcr &= ~opp->mpic_mode_mask;
+	opp->gcr |= val & opp->mpic_mode_mask;
+}
+
+static int openpic_gbl_write(void *opaque, gpa_t addr, u32 val)
+{
+	struct openpic *opp = opaque;
+	int err = 0;
+
+	pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
+	if (addr & 0xF)
+		return 0;
+
+	switch (addr) {
+	case 0x00:	/* Block Revision Register1 (BRR1) is Readonly */
+		break;
+	case 0x40:
+	case 0x50:
+	case 0x60:
+	case 0x70:
+	case 0x80:
+	case 0x90:
+	case 0xA0:
+	case 0xB0:
+		err = openpic_cpu_write_internal(opp, addr, val,
+						 get_current_cpu());
+		break;
+	case 0x1000:		/* FRR */
+		break;
+	case 0x1020:		/* GCR */
+		openpic_gcr_write(opp, val);
+		break;
+	case 0x1080:		/* VIR */
+		break;
+	case 0x1090:		/* PIR */
+		/*
+		 * This register is used to reset a CPU core --
+		 * let userspace handle it.
+		 */
+		err = -ENXIO;
+		break;
+	case 0x10A0:		/* IPI_IVPR */
+	case 0x10B0:
+	case 0x10C0:
+	case 0x10D0: {
+		int idx;
+		idx = (addr - 0x10A0) >> 4;
+		write_IRQreg_ivpr(opp, opp->irq_ipi0 + idx, val);
+		break;
+	}
+	case 0x10E0:		/* SPVE */
+		opp->spve = val & opp->vector_mask;
+		break;
+	default:
+		break;
+	}
+
+	return err;
+}
+
+static int openpic_gbl_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	struct openpic *opp = opaque;
+	u32 retval;
+	int err = 0;
+
+	pr_debug("%s: addr %#llx\n", __func__, addr);
+	retval = 0xFFFFFFFF;
+	if (addr & 0xF)
+		goto out;
+
+	switch (addr) {
+	case 0x1000:		/* FRR */
+		retval = opp->frr;
+		retval |= (opp->nb_cpus - 1) << FRR_NCPU_SHIFT;
+		break;
+	case 0x1020:		/* GCR */
+		retval = opp->gcr;
+		break;
+	case 0x1080:		/* VIR */
+		retval = opp->vir;
+		break;
+	case 0x1090:		/* PIR */
+		retval = 0x00000000;
+		break;
+	case 0x00:		/* Block Revision Register1 (BRR1) */
+		retval = opp->brr1;
+		break;
+	case 0x40:
+	case 0x50:
+	case 0x60:
+	case 0x70:
+	case 0x80:
+	case 0x90:
+	case 0xA0:
+	case 0xB0:
+		err = openpic_cpu_read_internal(opp, addr,
+			&retval, get_current_cpu());
+		break;
+	case 0x10A0:		/* IPI_IVPR */
+	case 0x10B0:
+	case 0x10C0:
+	case 0x10D0:
+		{
+			int idx;
+			idx = (addr - 0x10A0) >> 4;
+			retval = read_IRQreg_ivpr(opp, opp->irq_ipi0 + idx);
+		}
+		break;
+	case 0x10E0:		/* SPVE */
+		retval = opp->spve;
+		break;
+	default:
+		break;
+	}
+
+out:
+	pr_debug("%s: => 0x%08x\n", __func__, retval);
+	*ptr = retval;
+	return err;
+}
+
+static int openpic_tmr_write(void *opaque, gpa_t addr, u32 val)
+{
+	struct openpic *opp = opaque;
+	int idx;
+
+	addr += 0x10f0;
+
+	pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
+	if (addr & 0xF)
+		return 0;
+
+	if (addr == 0x10f0) {
+		/* TFRR */
+		opp->tfrr = val;
+		return 0;
+	}
+
+	idx = (addr >> 6) & 0x3;
+	addr = addr & 0x30;
+
+	switch (addr & 0x30) {
+	case 0x00:		/* TCCR */
+		break;
+	case 0x10:		/* TBCR */
+		if ((opp->timers[idx].tccr & TCCR_TOG) != 0 &&
+		    (val & TBCR_CI) == 0 &&
+		    (opp->timers[idx].tbcr & TBCR_CI) != 0)
+			opp->timers[idx].tccr &= ~TCCR_TOG;
+
+		opp->timers[idx].tbcr = val;
+		break;
+	case 0x20:		/* TVPR */
+		write_IRQreg_ivpr(opp, opp->irq_tim0 + idx, val);
+		break;
+	case 0x30:		/* TDR */
+		write_IRQreg_idr(opp, opp->irq_tim0 + idx, val);
+		break;
+	}
+
+	return 0;
+}
+
+static int openpic_tmr_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	struct openpic *opp = opaque;
+	uint32_t retval = -1;
+	int idx;
+
+	pr_debug("%s: addr %#llx\n", __func__, addr);
+	if (addr & 0xF)
+		goto out;
+
+	idx = (addr >> 6) & 0x3;
+	if (addr == 0x0) {
+		/* TFRR */
+		retval = opp->tfrr;
+		goto out;
+	}
+
+	switch (addr & 0x30) {
+	case 0x00:		/* TCCR */
+		retval = opp->timers[idx].tccr;
+		break;
+	case 0x10:		/* TBCR */
+		retval = opp->timers[idx].tbcr;
+		break;
+	case 0x20:		/* TIPV */
+		retval = read_IRQreg_ivpr(opp, opp->irq_tim0 + idx);
+		break;
+	case 0x30:		/* TIDE (TIDR) */
+		retval = read_IRQreg_idr(opp, opp->irq_tim0 + idx);
+		break;
+	}
+
+out:
+	pr_debug("%s: => 0x%08x\n", __func__, retval);
+	*ptr = retval;
+	return 0;
+}
+
+static int openpic_src_write(void *opaque, gpa_t addr, u32 val)
+{
+	struct openpic *opp = opaque;
+	int idx;
+
+	pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
+
+	addr = addr & 0xffff;
+	idx = addr >> 5;
+
+	switch (addr & 0x1f) {
+	case 0x00:
+		write_IRQreg_ivpr(opp, idx, val);
+		break;
+	case 0x10:
+		write_IRQreg_idr(opp, idx, val);
+		break;
+	case 0x18:
+		write_IRQreg_ilr(opp, idx, val);
+		break;
+	}
+
+	return 0;
+}
+
+static int openpic_src_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	struct openpic *opp = opaque;
+	uint32_t retval;
+	int idx;
+
+	pr_debug("%s: addr %#llx\n", __func__, addr);
+	retval = 0xFFFFFFFF;
+
+	addr = addr & 0xffff;
+	idx = addr >> 5;
+
+	switch (addr & 0x1f) {
+	case 0x00:
+		retval = read_IRQreg_ivpr(opp, idx);
+		break;
+	case 0x10:
+		retval = read_IRQreg_idr(opp, idx);
+		break;
+	case 0x18:
+		retval = read_IRQreg_ilr(opp, idx);
+		break;
+	}
+
+	pr_debug("%s: => 0x%08x\n", __func__, retval);
+	*ptr = retval;
+	return 0;
+}
+
+static int openpic_msi_write(void *opaque, gpa_t addr, u32 val)
+{
+	struct openpic *opp = opaque;
+	int idx = opp->irq_msi;
+	int srs, ibs;
+
+	pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val);
+	if (addr & 0xF)
+		return 0;
+
+	switch (addr) {
+	case MSIIR_OFFSET:
+		srs = val >> MSIIR_SRS_SHIFT;
+		idx += srs;
+		ibs = (val & MSIIR_IBS_MASK) >> MSIIR_IBS_SHIFT;
+		opp->msi[srs].msir |= 1 << ibs;
+		openpic_set_irq(opp, idx, 1);
+		break;
+	default:
+		/* most registers are read-only, thus ignored */
+		break;
+	}
+
+	return 0;
+}
+
+static int openpic_msi_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	struct openpic *opp = opaque;
+	uint32_t r = 0;
+	int i, srs;
+
+	pr_debug("%s: addr %#llx\n", __func__, addr);
+	if (addr & 0xF)
+		return -ENXIO;
+
+	srs = addr >> 4;
+
+	switch (addr) {
+	case 0x00:
+	case 0x10:
+	case 0x20:
+	case 0x30:
+	case 0x40:
+	case 0x50:
+	case 0x60:
+	case 0x70:		/* MSIRs */
+		r = opp->msi[srs].msir;
+		/* Clear on read */
+		opp->msi[srs].msir = 0;
+		openpic_set_irq(opp, opp->irq_msi + srs, 0);
+		break;
+	case 0x120:		/* MSISR */
+		for (i = 0; i < MAX_MSI; i++)
+			r |= (opp->msi[i].msir ? 1 : 0) << i;
+		break;
+	}
+
+	pr_debug("%s: => 0x%08x\n", __func__, r);
+	*ptr = r;
+	return 0;
+}
+
+static int openpic_summary_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	uint32_t r = 0;
+
+	pr_debug("%s: addr %#llx\n", __func__, addr);
+
+	/* TODO: EISR/EIMR */
+
+	*ptr = r;
+	return 0;
+}
+
+static int openpic_summary_write(void *opaque, gpa_t addr, u32 val)
+{
+	pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val);
+
+	/* TODO: EISR/EIMR */
+	return 0;
+}
+
+static int openpic_cpu_write_internal(void *opaque, gpa_t addr,
+				      u32 val, int idx)
+{
+	struct openpic *opp = opaque;
+	struct irq_source *src;
+	struct irq_dest *dst;
+	int s_IRQ, n_IRQ;
+
+	pr_debug("%s: cpu %d addr %#llx <= 0x%08x\n", __func__, idx,
+		addr, val);
+
+	if (idx < 0)
+		return 0;
+
+	if (addr & 0xF)
+		return 0;
+
+	dst = &opp->dst[idx];
+	addr &= 0xFF0;
+	switch (addr) {
+	case 0x40:		/* IPIDR */
+	case 0x50:
+	case 0x60:
+	case 0x70:
+		idx = (addr - 0x40) >> 4;
+		/* we use IDE as mask which CPUs to deliver the IPI to still. */
+		opp->src[opp->irq_ipi0 + idx].destmask |= val;
+		openpic_set_irq(opp, opp->irq_ipi0 + idx, 1);
+		openpic_set_irq(opp, opp->irq_ipi0 + idx, 0);
+		break;
+	case 0x80:		/* CTPR */
+		dst->ctpr = val & 0x0000000F;
+
+		pr_debug("%s: set CPU %d ctpr to %d, raised %d servicing %d\n",
+			__func__, idx, dst->ctpr, dst->raised.priority,
+			dst->servicing.priority);
+
+		if (dst->raised.priority <= dst->ctpr) {
+			pr_debug("%s: Lower OpenPIC INT output cpu %d due to ctpr\n",
+				__func__, idx);
+			mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
+		} else if (dst->raised.priority > dst->servicing.priority) {
+			pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d\n",
+				__func__, idx, dst->raised.next);
+			mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
+		}
+
+		break;
+	case 0x90:		/* WHOAMI */
+		/* Read-only register */
+		break;
+	case 0xA0:		/* IACK */
+		/* Read-only register */
+		break;
+	case 0xB0: {		/* EOI */
+		int notify_eoi;
+
+		pr_debug("EOI\n");
+		s_IRQ = IRQ_get_next(opp, &dst->servicing);
+
+		if (s_IRQ < 0) {
+			pr_debug("%s: EOI with no interrupt in service\n",
+				__func__);
+			break;
+		}
+
+		IRQ_resetbit(&dst->servicing, s_IRQ);
+		/* Notify listeners that the IRQ is over */
+		notify_eoi = s_IRQ;
+		/* Set up next servicing IRQ */
+		s_IRQ = IRQ_get_next(opp, &dst->servicing);
+		/* Check queued interrupts. */
+		n_IRQ = IRQ_get_next(opp, &dst->raised);
+		src = &opp->src[n_IRQ];
+		if (n_IRQ != -1 &&
+		    (s_IRQ == -1 ||
+		     IVPR_PRIORITY(src->ivpr) > dst->servicing.priority)) {
+			pr_debug("Raise OpenPIC INT output cpu %d irq %d\n",
+				idx, n_IRQ);
+			mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
+		}
+
+		spin_unlock(&opp->lock);
+		kvm_notify_acked_irq(opp->kvm, 0, notify_eoi);
+		spin_lock(&opp->lock);
+
+		break;
+	}
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static int openpic_cpu_write(void *opaque, gpa_t addr, u32 val)
+{
+	struct openpic *opp = opaque;
+
+	return openpic_cpu_write_internal(opp, addr, val,
+					 (addr & 0x1f000) >> 12);
+}
+
+static uint32_t openpic_iack(struct openpic *opp, struct irq_dest *dst,
+			     int cpu)
+{
+	struct irq_source *src;
+	int retval, irq;
+
+	pr_debug("Lower OpenPIC INT output\n");
+	mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
+
+	irq = IRQ_get_next(opp, &dst->raised);
+	pr_debug("IACK: irq=%d\n", irq);
+
+	if (irq == -1)
+		/* No more interrupt pending */
+		return opp->spve;
+
+	src = &opp->src[irq];
+	if (!(src->ivpr & IVPR_ACTIVITY_MASK) ||
+	    !(IVPR_PRIORITY(src->ivpr) > dst->ctpr)) {
+		pr_err("%s: bad raised IRQ %d ctpr %d ivpr 0x%08x\n",
+			__func__, irq, dst->ctpr, src->ivpr);
+		openpic_update_irq(opp, irq);
+		retval = opp->spve;
+	} else {
+		/* IRQ enter servicing state */
+		IRQ_setbit(&dst->servicing, irq);
+		retval = IVPR_VECTOR(opp, src->ivpr);
+	}
+
+	if (!src->level) {
+		/* edge-sensitive IRQ */
+		src->ivpr &= ~IVPR_ACTIVITY_MASK;
+		src->pending = 0;
+		IRQ_resetbit(&dst->raised, irq);
+	}
+
+	if ((irq >= opp->irq_ipi0) && (irq < (opp->irq_ipi0 + MAX_IPI))) {
+		src->destmask &= ~(1 << cpu);
+		if (src->destmask && !src->level) {
+			/* trigger on CPUs that didn't know about it yet */
+			openpic_set_irq(opp, irq, 1);
+			openpic_set_irq(opp, irq, 0);
+			/* if all CPUs knew about it, set active bit again */
+			src->ivpr |= IVPR_ACTIVITY_MASK;
+		}
+	}
+
+	return retval;
+}
+
+void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu)
+{
+	struct openpic *opp = vcpu->arch.mpic;
+	int cpu = vcpu->arch.irq_cpu_id;
+	unsigned long flags;
+
+	spin_lock_irqsave(&opp->lock, flags);
+
+	if ((opp->gcr & opp->mpic_mode_mask) == GCR_MODE_PROXY)
+		kvmppc_set_epr(vcpu, openpic_iack(opp, &opp->dst[cpu], cpu));
+
+	spin_unlock_irqrestore(&opp->lock, flags);
+}
+
+static int openpic_cpu_read_internal(void *opaque, gpa_t addr,
+				     u32 *ptr, int idx)
+{
+	struct openpic *opp = opaque;
+	struct irq_dest *dst;
+	uint32_t retval;
+
+	pr_debug("%s: cpu %d addr %#llx\n", __func__, idx, addr);
+	retval = 0xFFFFFFFF;
+
+	if (idx < 0)
+		goto out;
+
+	if (addr & 0xF)
+		goto out;
+
+	dst = &opp->dst[idx];
+	addr &= 0xFF0;
+	switch (addr) {
+	case 0x80:		/* CTPR */
+		retval = dst->ctpr;
+		break;
+	case 0x90:		/* WHOAMI */
+		retval = idx;
+		break;
+	case 0xA0:		/* IACK */
+		retval = openpic_iack(opp, dst, idx);
+		break;
+	case 0xB0:		/* EOI */
+		retval = 0;
+		break;
+	default:
+		break;
+	}
+	pr_debug("%s: => 0x%08x\n", __func__, retval);
+
+out:
+	*ptr = retval;
+	return 0;
+}
+
+static int openpic_cpu_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	struct openpic *opp = opaque;
+
+	return openpic_cpu_read_internal(opp, addr, ptr,
+					 (addr & 0x1f000) >> 12);
+}
+
+struct mem_reg {
+	int (*read)(void *opaque, gpa_t addr, u32 *ptr);
+	int (*write)(void *opaque, gpa_t addr, u32 val);
+	gpa_t start_addr;
+	int size;
+};
+
+static const struct mem_reg openpic_gbl_mmio = {
+	.write = openpic_gbl_write,
+	.read = openpic_gbl_read,
+	.start_addr = OPENPIC_GLB_REG_START,
+	.size = OPENPIC_GLB_REG_SIZE,
+};
+
+static const struct mem_reg openpic_tmr_mmio = {
+	.write = openpic_tmr_write,
+	.read = openpic_tmr_read,
+	.start_addr = OPENPIC_TMR_REG_START,
+	.size = OPENPIC_TMR_REG_SIZE,
+};
+
+static const struct mem_reg openpic_cpu_mmio = {
+	.write = openpic_cpu_write,
+	.read = openpic_cpu_read,
+	.start_addr = OPENPIC_CPU_REG_START,
+	.size = OPENPIC_CPU_REG_SIZE,
+};
+
+static const struct mem_reg openpic_src_mmio = {
+	.write = openpic_src_write,
+	.read = openpic_src_read,
+	.start_addr = OPENPIC_SRC_REG_START,
+	.size = OPENPIC_SRC_REG_SIZE,
+};
+
+static const struct mem_reg openpic_msi_mmio = {
+	.read = openpic_msi_read,
+	.write = openpic_msi_write,
+	.start_addr = OPENPIC_MSI_REG_START,
+	.size = OPENPIC_MSI_REG_SIZE,
+};
+
+static const struct mem_reg openpic_summary_mmio = {
+	.read = openpic_summary_read,
+	.write = openpic_summary_write,
+	.start_addr = OPENPIC_SUMMARY_REG_START,
+	.size = OPENPIC_SUMMARY_REG_SIZE,
+};
+
+static void add_mmio_region(struct openpic *opp, const struct mem_reg *mr)
+{
+	if (opp->num_mmio_regions >= MAX_MMIO_REGIONS) {
+		WARN(1, "kvm mpic: too many mmio regions\n");
+		return;
+	}
+
+	opp->mmio_regions[opp->num_mmio_regions++] = mr;
+}
+
+static void fsl_common_init(struct openpic *opp)
+{
+	int i;
+	int virq = MAX_SRC;
+
+	add_mmio_region(opp, &openpic_msi_mmio);
+	add_mmio_region(opp, &openpic_summary_mmio);
+
+	opp->vid = VID_REVISION_1_2;
+	opp->vir = VIR_GENERIC;
+	opp->vector_mask = 0xFFFF;
+	opp->tfrr_reset = 0;
+	opp->ivpr_reset = IVPR_MASK_MASK;
+	opp->idr_reset = 1 << 0;
+	opp->max_irq = MAX_IRQ;
+
+	opp->irq_ipi0 = virq;
+	virq += MAX_IPI;
+	opp->irq_tim0 = virq;
+	virq += MAX_TMR;
+
+	BUG_ON(virq > MAX_IRQ);
+
+	opp->irq_msi = 224;
+
+	for (i = 0; i < opp->fsl->max_ext; i++)
+		opp->src[i].level = false;
+
+	/* Internal interrupts, including message and MSI */
+	for (i = 16; i < MAX_SRC; i++) {
+		opp->src[i].type = IRQ_TYPE_FSLINT;
+		opp->src[i].level = true;
+	}
+
+	/* timers and IPIs */
+	for (i = MAX_SRC; i < virq; i++) {
+		opp->src[i].type = IRQ_TYPE_FSLSPECIAL;
+		opp->src[i].level = false;
+	}
+}
+
+static int kvm_mpic_read_internal(struct openpic *opp, gpa_t addr, u32 *ptr)
+{
+	int i;
+
+	for (i = 0; i < opp->num_mmio_regions; i++) {
+		const struct mem_reg *mr = opp->mmio_regions[i];
+
+		if (mr->start_addr > addr || addr >= mr->start_addr + mr->size)
+			continue;
+
+		return mr->read(opp, addr - mr->start_addr, ptr);
+	}
+
+	return -ENXIO;
+}
+
+static int kvm_mpic_write_internal(struct openpic *opp, gpa_t addr, u32 val)
+{
+	int i;
+
+	for (i = 0; i < opp->num_mmio_regions; i++) {
+		const struct mem_reg *mr = opp->mmio_regions[i];
+
+		if (mr->start_addr > addr || addr >= mr->start_addr + mr->size)
+			continue;
+
+		return mr->write(opp, addr - mr->start_addr, val);
+	}
+
+	return -ENXIO;
+}
+
+static int kvm_mpic_read(struct kvm_vcpu *vcpu,
+			 struct kvm_io_device *this,
+			 gpa_t addr, int len, void *ptr)
+{
+	struct openpic *opp = container_of(this, struct openpic, mmio);
+	int ret;
+	union {
+		u32 val;
+		u8 bytes[4];
+	} u;
+
+	if (addr & (len - 1)) {
+		pr_debug("%s: bad alignment %llx/%d\n",
+			 __func__, addr, len);
+		return -EINVAL;
+	}
+
+	spin_lock_irq(&opp->lock);
+	ret = kvm_mpic_read_internal(opp, addr - opp->reg_base, &u.val);
+	spin_unlock_irq(&opp->lock);
+
+	/*
+	 * Technically only 32-bit accesses are allowed, but be nice to
+	 * people dumping registers a byte at a time -- it works in real
+	 * hardware (reads only, not writes).
+	 */
+	if (len == 4) {
+		*(u32 *)ptr = u.val;
+		pr_debug("%s: addr %llx ret %d len 4 val %x\n",
+			 __func__, addr, ret, u.val);
+	} else if (len == 1) {
+		*(u8 *)ptr = u.bytes[addr & 3];
+		pr_debug("%s: addr %llx ret %d len 1 val %x\n",
+			 __func__, addr, ret, u.bytes[addr & 3]);
+	} else {
+		pr_debug("%s: bad length %d\n", __func__, len);
+		return -EINVAL;
+	}
+
+	return ret;
+}
+
+static int kvm_mpic_write(struct kvm_vcpu *vcpu,
+			  struct kvm_io_device *this,
+			  gpa_t addr, int len, const void *ptr)
+{
+	struct openpic *opp = container_of(this, struct openpic, mmio);
+	int ret;
+
+	if (len != 4) {
+		pr_debug("%s: bad length %d\n", __func__, len);
+		return -EOPNOTSUPP;
+	}
+	if (addr & 3) {
+		pr_debug("%s: bad alignment %llx/%d\n", __func__, addr, len);
+		return -EOPNOTSUPP;
+	}
+
+	spin_lock_irq(&opp->lock);
+	ret = kvm_mpic_write_internal(opp, addr - opp->reg_base,
+				      *(const u32 *)ptr);
+	spin_unlock_irq(&opp->lock);
+
+	pr_debug("%s: addr %llx ret %d val %x\n",
+		 __func__, addr, ret, *(const u32 *)ptr);
+
+	return ret;
+}
+
+static const struct kvm_io_device_ops mpic_mmio_ops = {
+	.read = kvm_mpic_read,
+	.write = kvm_mpic_write,
+};
+
+static void map_mmio(struct openpic *opp)
+{
+	kvm_iodevice_init(&opp->mmio, &mpic_mmio_ops);
+
+	kvm_io_bus_register_dev(opp->kvm, KVM_MMIO_BUS,
+				opp->reg_base, OPENPIC_REG_SIZE,
+				&opp->mmio);
+}
+
+static void unmap_mmio(struct openpic *opp)
+{
+	kvm_io_bus_unregister_dev(opp->kvm, KVM_MMIO_BUS, &opp->mmio);
+}
+
+static int set_base_addr(struct openpic *opp, struct kvm_device_attr *attr)
+{
+	u64 base;
+
+	if (copy_from_user(&base, (u64 __user *)(long)attr->addr, sizeof(u64)))
+		return -EFAULT;
+
+	if (base & 0x3ffff) {
+		pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx not aligned\n",
+			 __func__, base);
+		return -EINVAL;
+	}
+
+	if (base == opp->reg_base)
+		return 0;
+
+	mutex_lock(&opp->kvm->slots_lock);
+
+	unmap_mmio(opp);
+	opp->reg_base = base;
+
+	pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx\n",
+		 __func__, base);
+
+	if (base == 0)
+		goto out;
+
+	map_mmio(opp);
+
+out:
+	mutex_unlock(&opp->kvm->slots_lock);
+	return 0;
+}
+
+#define ATTR_SET		0
+#define ATTR_GET		1
+
+static int access_reg(struct openpic *opp, gpa_t addr, u32 *val, int type)
+{
+	int ret;
+
+	if (addr & 3)
+		return -ENXIO;
+
+	spin_lock_irq(&opp->lock);
+
+	if (type == ATTR_SET)
+		ret = kvm_mpic_write_internal(opp, addr, *val);
+	else
+		ret = kvm_mpic_read_internal(opp, addr, val);
+
+	spin_unlock_irq(&opp->lock);
+
+	pr_debug("%s: type %d addr %llx val %x\n", __func__, type, addr, *val);
+
+	return ret;
+}
+
+static int mpic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	struct openpic *opp = dev->private;
+	u32 attr32;
+
+	switch (attr->group) {
+	case KVM_DEV_MPIC_GRP_MISC:
+		switch (attr->attr) {
+		case KVM_DEV_MPIC_BASE_ADDR:
+			return set_base_addr(opp, attr);
+		}
+
+		break;
+
+	case KVM_DEV_MPIC_GRP_REGISTER:
+		if (get_user(attr32, (u32 __user *)(long)attr->addr))
+			return -EFAULT;
+
+		return access_reg(opp, attr->attr, &attr32, ATTR_SET);
+
+	case KVM_DEV_MPIC_GRP_IRQ_ACTIVE:
+		if (attr->attr > MAX_SRC)
+			return -EINVAL;
+
+		if (get_user(attr32, (u32 __user *)(long)attr->addr))
+			return -EFAULT;
+
+		if (attr32 != 0 && attr32 != 1)
+			return -EINVAL;
+
+		spin_lock_irq(&opp->lock);
+		openpic_set_irq(opp, attr->attr, attr32);
+		spin_unlock_irq(&opp->lock);
+		return 0;
+	}
+
+	return -ENXIO;
+}
+
+static int mpic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	struct openpic *opp = dev->private;
+	u64 attr64;
+	u32 attr32;
+	int ret;
+
+	switch (attr->group) {
+	case KVM_DEV_MPIC_GRP_MISC:
+		switch (attr->attr) {
+		case KVM_DEV_MPIC_BASE_ADDR:
+			mutex_lock(&opp->kvm->slots_lock);
+			attr64 = opp->reg_base;
+			mutex_unlock(&opp->kvm->slots_lock);
+
+			if (copy_to_user((u64 __user *)(long)attr->addr,
+					 &attr64, sizeof(u64)))
+				return -EFAULT;
+
+			return 0;
+		}
+
+		break;
+
+	case KVM_DEV_MPIC_GRP_REGISTER:
+		ret = access_reg(opp, attr->attr, &attr32, ATTR_GET);
+		if (ret)
+			return ret;
+
+		if (put_user(attr32, (u32 __user *)(long)attr->addr))
+			return -EFAULT;
+
+		return 0;
+
+	case KVM_DEV_MPIC_GRP_IRQ_ACTIVE:
+		if (attr->attr > MAX_SRC)
+			return -EINVAL;
+
+		spin_lock_irq(&opp->lock);
+		attr32 = opp->src[attr->attr].pending;
+		spin_unlock_irq(&opp->lock);
+
+		if (put_user(attr32, (u32 __user *)(long)attr->addr))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	return -ENXIO;
+}
+
+static int mpic_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	switch (attr->group) {
+	case KVM_DEV_MPIC_GRP_MISC:
+		switch (attr->attr) {
+		case KVM_DEV_MPIC_BASE_ADDR:
+			return 0;
+		}
+
+		break;
+
+	case KVM_DEV_MPIC_GRP_REGISTER:
+		return 0;
+
+	case KVM_DEV_MPIC_GRP_IRQ_ACTIVE:
+		if (attr->attr > MAX_SRC)
+			break;
+
+		return 0;
+	}
+
+	return -ENXIO;
+}
+
+static void mpic_destroy(struct kvm_device *dev)
+{
+	struct openpic *opp = dev->private;
+
+	dev->kvm->arch.mpic = NULL;
+	kfree(opp);
+	kfree(dev);
+}
+
+static int mpic_set_default_irq_routing(struct openpic *opp)
+{
+	struct kvm_irq_routing_entry *routing;
+
+	/* Create a nop default map, so that dereferencing it still works */
+	routing = kzalloc((sizeof(*routing)), GFP_KERNEL);
+	if (!routing)
+		return -ENOMEM;
+
+	kvm_set_irq_routing(opp->kvm, routing, 0, 0);
+
+	kfree(routing);
+	return 0;
+}
+
+static int mpic_create(struct kvm_device *dev, u32 type)
+{
+	struct openpic *opp;
+	int ret;
+
+	/* We only support one MPIC at a time for now */
+	if (dev->kvm->arch.mpic)
+		return -EINVAL;
+
+	opp = kzalloc(sizeof(struct openpic), GFP_KERNEL);
+	if (!opp)
+		return -ENOMEM;
+
+	dev->private = opp;
+	opp->kvm = dev->kvm;
+	opp->dev = dev;
+	opp->model = type;
+	spin_lock_init(&opp->lock);
+
+	add_mmio_region(opp, &openpic_gbl_mmio);
+	add_mmio_region(opp, &openpic_tmr_mmio);
+	add_mmio_region(opp, &openpic_src_mmio);
+	add_mmio_region(opp, &openpic_cpu_mmio);
+
+	switch (opp->model) {
+	case KVM_DEV_TYPE_FSL_MPIC_20:
+		opp->fsl = &fsl_mpic_20;
+		opp->brr1 = 0x00400200;
+		opp->flags |= OPENPIC_FLAG_IDR_CRIT;
+		opp->nb_irqs = 80;
+		opp->mpic_mode_mask = GCR_MODE_MIXED;
+
+		fsl_common_init(opp);
+
+		break;
+
+	case KVM_DEV_TYPE_FSL_MPIC_42:
+		opp->fsl = &fsl_mpic_42;
+		opp->brr1 = 0x00400402;
+		opp->flags |= OPENPIC_FLAG_ILR;
+		opp->nb_irqs = 196;
+		opp->mpic_mode_mask = GCR_MODE_PROXY;
+
+		fsl_common_init(opp);
+
+		break;
+
+	default:
+		ret = -ENODEV;
+		goto err;
+	}
+
+	ret = mpic_set_default_irq_routing(opp);
+	if (ret)
+		goto err;
+
+	openpic_reset(opp);
+
+	smp_wmb();
+	dev->kvm->arch.mpic = opp;
+
+	return 0;
+
+err:
+	kfree(opp);
+	return ret;
+}
+
+struct kvm_device_ops kvm_mpic_ops = {
+	.name = "kvm-mpic",
+	.create = mpic_create,
+	.destroy = mpic_destroy,
+	.set_attr = mpic_set_attr,
+	.get_attr = mpic_get_attr,
+	.has_attr = mpic_has_attr,
+};
+
+int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
+			     u32 cpu)
+{
+	struct openpic *opp = dev->private;
+	int ret = 0;
+
+	if (dev->ops != &kvm_mpic_ops)
+		return -EPERM;
+	if (opp->kvm != vcpu->kvm)
+		return -EPERM;
+	if (cpu < 0 || cpu >= MAX_CPU)
+		return -EPERM;
+
+	spin_lock_irq(&opp->lock);
+
+	if (opp->dst[cpu].vcpu) {
+		ret = -EEXIST;
+		goto out;
+	}
+	if (vcpu->arch.irq_type) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	opp->dst[cpu].vcpu = vcpu;
+	opp->nb_cpus = max(opp->nb_cpus, cpu + 1);
+
+	vcpu->arch.mpic = opp;
+	vcpu->arch.irq_cpu_id = cpu;
+	vcpu->arch.irq_type = KVMPPC_IRQ_MPIC;
+
+	/* This might need to be changed if GCR gets extended */
+	if (opp->mpic_mode_mask == GCR_MODE_PROXY)
+		vcpu->arch.epr_flags |= KVMPPC_EPR_KERNEL;
+
+out:
+	spin_unlock_irq(&opp->lock);
+	return ret;
+}
+
+/*
+ * This should only happen immediately before the mpic is destroyed,
+ * so we shouldn't need to worry about anything still trying to
+ * access the vcpu pointer.
+ */
+void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, struct kvm_vcpu *vcpu)
+{
+	BUG_ON(!opp->dst[vcpu->arch.irq_cpu_id].vcpu);
+
+	opp->dst[vcpu->arch.irq_cpu_id].vcpu = NULL;
+}
+
+/*
+ * Return value:
+ *  < 0   Interrupt was ignored (masked or not delivered for other reasons)
+ *  = 0   Interrupt was coalesced (previous irq is still pending)
+ *  > 0   Number of CPUs interrupt was delivered to
+ */
+static int mpic_set_irq(struct kvm_kernel_irq_routing_entry *e,
+			struct kvm *kvm, int irq_source_id, int level,
+			bool line_status)
+{
+	u32 irq = e->irqchip.pin;
+	struct openpic *opp = kvm->arch.mpic;
+	unsigned long flags;
+
+	spin_lock_irqsave(&opp->lock, flags);
+	openpic_set_irq(opp, irq, level);
+	spin_unlock_irqrestore(&opp->lock, flags);
+
+	/* All code paths we care about don't check for the return value */
+	return 0;
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+		struct kvm *kvm, int irq_source_id, int level, bool line_status)
+{
+	struct openpic *opp = kvm->arch.mpic;
+	unsigned long flags;
+
+	spin_lock_irqsave(&opp->lock, flags);
+
+	/*
+	 * XXX We ignore the target address for now, as we only support
+	 *     a single MSI bank.
+	 */
+	openpic_msi_write(kvm->arch.mpic, MSIIR_OFFSET, e->msi.data);
+	spin_unlock_irqrestore(&opp->lock, flags);
+
+	/* All code paths we care about don't check for the return value */
+	return 0;
+}
+
+int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+			  const struct kvm_irq_routing_entry *ue)
+{
+	int r = -EINVAL;
+
+	switch (ue->type) {
+	case KVM_IRQ_ROUTING_IRQCHIP:
+		e->set = mpic_set_irq;
+		e->irqchip.irqchip = ue->u.irqchip.irqchip;
+		e->irqchip.pin = ue->u.irqchip.pin;
+		if (e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS)
+			goto out;
+		break;
+	case KVM_IRQ_ROUTING_MSI:
+		e->set = kvm_set_msi;
+		e->msi.address_lo = ue->u.msi.address_lo;
+		e->msi.address_hi = ue->u.msi.address_hi;
+		e->msi.data = ue->u.msi.data;
+		break;
+	default:
+		goto out;
+	}
+
+	r = 0;
+out:
+	return r;
+}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
new file mode 100644
index 000000000..ac3ddf115
--- /dev/null
+++ b/arch/powerpc/kvm/powerpc.c
@@ -0,0 +1,1411 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2007
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ *          Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kvm_host.h>
+#include <linux/vmalloc.h>
+#include <linux/hrtimer.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/module.h>
+#include <asm/cputable.h>
+#include <asm/uaccess.h>
+#include <asm/kvm_ppc.h>
+#include <asm/tlbflush.h>
+#include <asm/cputhreads.h>
+#include <asm/irqflags.h>
+#include "timing.h"
+#include "irq.h"
+#include "../mm/mmu_decl.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+struct kvmppc_ops *kvmppc_hv_ops;
+EXPORT_SYMBOL_GPL(kvmppc_hv_ops);
+struct kvmppc_ops *kvmppc_pr_ops;
+EXPORT_SYMBOL_GPL(kvmppc_pr_ops);
+
+
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
+{
+	return !!(v->arch.pending_exceptions) ||
+	       v->requests;
+}
+
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
+{
+	return 1;
+}
+
+/*
+ * Common checks before entering the guest world.  Call with interrupts
+ * disabled.
+ *
+ * returns:
+ *
+ * == 1 if we're ready to go into guest state
+ * <= 0 if we need to go back to the host with return value
+ */
+int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
+{
+	int r;
+
+	WARN_ON(irqs_disabled());
+	hard_irq_disable();
+
+	while (true) {
+		if (need_resched()) {
+			local_irq_enable();
+			cond_resched();
+			hard_irq_disable();
+			continue;
+		}
+
+		if (signal_pending(current)) {
+			kvmppc_account_exit(vcpu, SIGNAL_EXITS);
+			vcpu->run->exit_reason = KVM_EXIT_INTR;
+			r = -EINTR;
+			break;
+		}
+
+		vcpu->mode = IN_GUEST_MODE;
+
+		/*
+		 * Reading vcpu->requests must happen after setting vcpu->mode,
+		 * so we don't miss a request because the requester sees
+		 * OUTSIDE_GUEST_MODE and assumes we'll be checking requests
+		 * before next entering the guest (and thus doesn't IPI).
+		 */
+		smp_mb();
+
+		if (vcpu->requests) {
+			/* Make sure we process requests preemptable */
+			local_irq_enable();
+			trace_kvm_check_requests(vcpu);
+			r = kvmppc_core_check_requests(vcpu);
+			hard_irq_disable();
+			if (r > 0)
+				continue;
+			break;
+		}
+
+		if (kvmppc_core_prepare_to_enter(vcpu)) {
+			/* interrupts got enabled in between, so we
+			   are back at square 1 */
+			continue;
+		}
+
+		kvm_guest_enter();
+		return 1;
+	}
+
+	/* return to host */
+	local_irq_enable();
+	return r;
+}
+EXPORT_SYMBOL_GPL(kvmppc_prepare_to_enter);
+
+#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE)
+static void kvmppc_swab_shared(struct kvm_vcpu *vcpu)
+{
+	struct kvm_vcpu_arch_shared *shared = vcpu->arch.shared;
+	int i;
+
+	shared->sprg0 = swab64(shared->sprg0);
+	shared->sprg1 = swab64(shared->sprg1);
+	shared->sprg2 = swab64(shared->sprg2);
+	shared->sprg3 = swab64(shared->sprg3);
+	shared->srr0 = swab64(shared->srr0);
+	shared->srr1 = swab64(shared->srr1);
+	shared->dar = swab64(shared->dar);
+	shared->msr = swab64(shared->msr);
+	shared->dsisr = swab32(shared->dsisr);
+	shared->int_pending = swab32(shared->int_pending);
+	for (i = 0; i < ARRAY_SIZE(shared->sr); i++)
+		shared->sr[i] = swab32(shared->sr[i]);
+}
+#endif
+
+int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
+{
+	int nr = kvmppc_get_gpr(vcpu, 11);
+	int r;
+	unsigned long __maybe_unused param1 = kvmppc_get_gpr(vcpu, 3);
+	unsigned long __maybe_unused param2 = kvmppc_get_gpr(vcpu, 4);
+	unsigned long __maybe_unused param3 = kvmppc_get_gpr(vcpu, 5);
+	unsigned long __maybe_unused param4 = kvmppc_get_gpr(vcpu, 6);
+	unsigned long r2 = 0;
+
+	if (!(kvmppc_get_msr(vcpu) & MSR_SF)) {
+		/* 32 bit mode */
+		param1 &= 0xffffffff;
+		param2 &= 0xffffffff;
+		param3 &= 0xffffffff;
+		param4 &= 0xffffffff;
+	}
+
+	switch (nr) {
+	case KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE):
+	{
+#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE)
+		/* Book3S can be little endian, find it out here */
+		int shared_big_endian = true;
+		if (vcpu->arch.intr_msr & MSR_LE)
+			shared_big_endian = false;
+		if (shared_big_endian != vcpu->arch.shared_big_endian)
+			kvmppc_swab_shared(vcpu);
+		vcpu->arch.shared_big_endian = shared_big_endian;
+#endif
+
+		if (!(param2 & MAGIC_PAGE_FLAG_NOT_MAPPED_NX)) {
+			/*
+			 * Older versions of the Linux magic page code had
+			 * a bug where they would map their trampoline code
+			 * NX. If that's the case, remove !PR NX capability.
+			 */
+			vcpu->arch.disable_kernel_nx = true;
+			kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+		}
+
+		vcpu->arch.magic_page_pa = param1 & ~0xfffULL;
+		vcpu->arch.magic_page_ea = param2 & ~0xfffULL;
+
+#ifdef CONFIG_PPC_64K_PAGES
+		/*
+		 * Make sure our 4k magic page is in the same window of a 64k
+		 * page within the guest and within the host's page.
+		 */
+		if ((vcpu->arch.magic_page_pa & 0xf000) !=
+		    ((ulong)vcpu->arch.shared & 0xf000)) {
+			void *old_shared = vcpu->arch.shared;
+			ulong shared = (ulong)vcpu->arch.shared;
+			void *new_shared;
+
+			shared &= PAGE_MASK;
+			shared |= vcpu->arch.magic_page_pa & 0xf000;
+			new_shared = (void*)shared;
+			memcpy(new_shared, old_shared, 0x1000);
+			vcpu->arch.shared = new_shared;
+		}
+#endif
+
+		r2 = KVM_MAGIC_FEAT_SR | KVM_MAGIC_FEAT_MAS0_TO_SPRG7;
+
+		r = EV_SUCCESS;
+		break;
+	}
+	case KVM_HCALL_TOKEN(KVM_HC_FEATURES):
+		r = EV_SUCCESS;
+#if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500V2)
+		r2 |= (1 << KVM_FEATURE_MAGIC_PAGE);
+#endif
+
+		/* Second return value is in r4 */
+		break;
+	case EV_HCALL_TOKEN(EV_IDLE):
+		r = EV_SUCCESS;
+		kvm_vcpu_block(vcpu);
+		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+		break;
+	default:
+		r = EV_UNIMPLEMENTED;
+		break;
+	}
+
+	kvmppc_set_gpr(vcpu, 4, r2);
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(kvmppc_kvm_pv);
+
+int kvmppc_sanity_check(struct kvm_vcpu *vcpu)
+{
+	int r = false;
+
+	/* We have to know what CPU to virtualize */
+	if (!vcpu->arch.pvr)
+		goto out;
+
+	/* PAPR only works with book3s_64 */
+	if ((vcpu->arch.cpu_type != KVM_CPU_3S_64) && vcpu->arch.papr_enabled)
+		goto out;
+
+	/* HV KVM can only do PAPR mode for now */
+	if (!vcpu->arch.papr_enabled && is_kvmppc_hv_enabled(vcpu->kvm))
+		goto out;
+
+#ifdef CONFIG_KVM_BOOKE_HV
+	if (!cpu_has_feature(CPU_FTR_EMB_HV))
+		goto out;
+#endif
+
+	r = true;
+
+out:
+	vcpu->arch.sane = r;
+	return r ? 0 : -EINVAL;
+}
+EXPORT_SYMBOL_GPL(kvmppc_sanity_check);
+
+int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+	enum emulation_result er;
+	int r;
+
+	er = kvmppc_emulate_loadstore(vcpu);
+	switch (er) {
+	case EMULATE_DONE:
+		/* Future optimization: only reload non-volatiles if they were
+		 * actually modified. */
+		r = RESUME_GUEST_NV;
+		break;
+	case EMULATE_AGAIN:
+		r = RESUME_GUEST;
+		break;
+	case EMULATE_DO_MMIO:
+		run->exit_reason = KVM_EXIT_MMIO;
+		/* We must reload nonvolatiles because "update" load/store
+		 * instructions modify register state. */
+		/* Future optimization: only reload non-volatiles if they were
+		 * actually modified. */
+		r = RESUME_HOST_NV;
+		break;
+	case EMULATE_FAIL:
+	{
+		u32 last_inst;
+
+		kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
+		/* XXX Deliver Program interrupt to guest. */
+		pr_emerg("%s: emulation failed (%08x)\n", __func__, last_inst);
+		r = RESUME_HOST;
+		break;
+	}
+	default:
+		WARN_ON(1);
+		r = RESUME_GUEST;
+	}
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(kvmppc_emulate_mmio);
+
+int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr,
+	      bool data)
+{
+	ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM & PAGE_MASK;
+	struct kvmppc_pte pte;
+	int r;
+
+	vcpu->stat.st++;
+
+	r = kvmppc_xlate(vcpu, *eaddr, data ? XLATE_DATA : XLATE_INST,
+			 XLATE_WRITE, &pte);
+	if (r < 0)
+		return r;
+
+	*eaddr = pte.raddr;
+
+	if (!pte.may_write)
+		return -EPERM;
+
+	/* Magic page override */
+	if (kvmppc_supports_magic_page(vcpu) && mp_pa &&
+	    ((pte.raddr & KVM_PAM & PAGE_MASK) == mp_pa) &&
+	    !(kvmppc_get_msr(vcpu) & MSR_PR)) {
+		void *magic = vcpu->arch.shared;
+		magic += pte.eaddr & 0xfff;
+		memcpy(magic, ptr, size);
+		return EMULATE_DONE;
+	}
+
+	if (kvm_write_guest(vcpu->kvm, pte.raddr, ptr, size))
+		return EMULATE_DO_MMIO;
+
+	return EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(kvmppc_st);
+
+int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr,
+		      bool data)
+{
+	ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM & PAGE_MASK;
+	struct kvmppc_pte pte;
+	int rc;
+
+	vcpu->stat.ld++;
+
+	rc = kvmppc_xlate(vcpu, *eaddr, data ? XLATE_DATA : XLATE_INST,
+			  XLATE_READ, &pte);
+	if (rc)
+		return rc;
+
+	*eaddr = pte.raddr;
+
+	if (!pte.may_read)
+		return -EPERM;
+
+	if (!data && !pte.may_execute)
+		return -ENOEXEC;
+
+	/* Magic page override */
+	if (kvmppc_supports_magic_page(vcpu) && mp_pa &&
+	    ((pte.raddr & KVM_PAM & PAGE_MASK) == mp_pa) &&
+	    !(kvmppc_get_msr(vcpu) & MSR_PR)) {
+		void *magic = vcpu->arch.shared;
+		magic += pte.eaddr & 0xfff;
+		memcpy(ptr, magic, size);
+		return EMULATE_DONE;
+	}
+
+	if (kvm_read_guest(vcpu->kvm, pte.raddr, ptr, size))
+		return EMULATE_DO_MMIO;
+
+	return EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(kvmppc_ld);
+
+int kvm_arch_hardware_enable(void)
+{
+	return 0;
+}
+
+int kvm_arch_hardware_setup(void)
+{
+	return 0;
+}
+
+void kvm_arch_check_processor_compat(void *rtn)
+{
+	*(int *)rtn = kvmppc_core_check_processor_compat();
+}
+
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
+{
+	struct kvmppc_ops *kvm_ops = NULL;
+	/*
+	 * if we have both HV and PR enabled, default is HV
+	 */
+	if (type == 0) {
+		if (kvmppc_hv_ops)
+			kvm_ops = kvmppc_hv_ops;
+		else
+			kvm_ops = kvmppc_pr_ops;
+		if (!kvm_ops)
+			goto err_out;
+	} else	if (type == KVM_VM_PPC_HV) {
+		if (!kvmppc_hv_ops)
+			goto err_out;
+		kvm_ops = kvmppc_hv_ops;
+	} else if (type == KVM_VM_PPC_PR) {
+		if (!kvmppc_pr_ops)
+			goto err_out;
+		kvm_ops = kvmppc_pr_ops;
+	} else
+		goto err_out;
+
+	if (kvm_ops->owner && !try_module_get(kvm_ops->owner))
+		return -ENOENT;
+
+	kvm->arch.kvm_ops = kvm_ops;
+	return kvmppc_core_init_vm(kvm);
+err_out:
+	return -EINVAL;
+}
+
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+	unsigned int i;
+	struct kvm_vcpu *vcpu;
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		kvm_arch_vcpu_free(vcpu);
+
+	mutex_lock(&kvm->lock);
+	for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+		kvm->vcpus[i] = NULL;
+
+	atomic_set(&kvm->online_vcpus, 0);
+
+	kvmppc_core_destroy_vm(kvm);
+
+	mutex_unlock(&kvm->lock);
+
+	/* drop the module reference */
+	module_put(kvm->arch.kvm_ops->owner);
+}
+
+int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
+{
+	int r;
+	/* Assume we're using HV mode when the HV module is loaded */
+	int hv_enabled = kvmppc_hv_ops ? 1 : 0;
+
+	if (kvm) {
+		/*
+		 * Hooray - we know which VM type we're running on. Depend on
+		 * that rather than the guess above.
+		 */
+		hv_enabled = is_kvmppc_hv_enabled(kvm);
+	}
+
+	switch (ext) {
+#ifdef CONFIG_BOOKE
+	case KVM_CAP_PPC_BOOKE_SREGS:
+	case KVM_CAP_PPC_BOOKE_WATCHDOG:
+	case KVM_CAP_PPC_EPR:
+#else
+	case KVM_CAP_PPC_SEGSTATE:
+	case KVM_CAP_PPC_HIOR:
+	case KVM_CAP_PPC_PAPR:
+#endif
+	case KVM_CAP_PPC_UNSET_IRQ:
+	case KVM_CAP_PPC_IRQ_LEVEL:
+	case KVM_CAP_ENABLE_CAP:
+	case KVM_CAP_ENABLE_CAP_VM:
+	case KVM_CAP_ONE_REG:
+	case KVM_CAP_IOEVENTFD:
+	case KVM_CAP_DEVICE_CTRL:
+		r = 1;
+		break;
+	case KVM_CAP_PPC_PAIRED_SINGLES:
+	case KVM_CAP_PPC_OSI:
+	case KVM_CAP_PPC_GET_PVINFO:
+#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
+	case KVM_CAP_SW_TLB:
+#endif
+		/* We support this only for PR */
+		r = !hv_enabled;
+		break;
+#ifdef CONFIG_KVM_MMIO
+	case KVM_CAP_COALESCED_MMIO:
+		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
+		break;
+#endif
+#ifdef CONFIG_KVM_MPIC
+	case KVM_CAP_IRQ_MPIC:
+		r = 1;
+		break;
+#endif
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	case KVM_CAP_SPAPR_TCE:
+	case KVM_CAP_PPC_ALLOC_HTAB:
+	case KVM_CAP_PPC_RTAS:
+	case KVM_CAP_PPC_FIXUP_HCALL:
+	case KVM_CAP_PPC_ENABLE_HCALL:
+#ifdef CONFIG_KVM_XICS
+	case KVM_CAP_IRQ_XICS:
+#endif
+		r = 1;
+		break;
+#endif /* CONFIG_PPC_BOOK3S_64 */
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	case KVM_CAP_PPC_SMT:
+		if (hv_enabled)
+			r = threads_per_subcore;
+		else
+			r = 0;
+		break;
+	case KVM_CAP_PPC_RMA:
+		r = 0;
+		break;
+	case KVM_CAP_PPC_HWRNG:
+		r = kvmppc_hwrng_present();
+		break;
+#endif
+	case KVM_CAP_SYNC_MMU:
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+		r = hv_enabled;
+#elif defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+		r = 1;
+#else
+		r = 0;
+#endif
+		break;
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	case KVM_CAP_PPC_HTAB_FD:
+		r = hv_enabled;
+		break;
+#endif
+	case KVM_CAP_NR_VCPUS:
+		/*
+		 * Recommending a number of CPUs is somewhat arbitrary; we
+		 * return the number of present CPUs for -HV (since a host
+		 * will have secondary threads "offline"), and for other KVM
+		 * implementations just count online CPUs.
+		 */
+		if (hv_enabled)
+			r = num_present_cpus();
+		else
+			r = num_online_cpus();
+		break;
+	case KVM_CAP_MAX_VCPUS:
+		r = KVM_MAX_VCPUS;
+		break;
+#ifdef CONFIG_PPC_BOOK3S_64
+	case KVM_CAP_PPC_GET_SMMU_INFO:
+		r = 1;
+		break;
+#endif
+	default:
+		r = 0;
+		break;
+	}
+	return r;
+
+}
+
+long kvm_arch_dev_ioctl(struct file *filp,
+                        unsigned int ioctl, unsigned long arg)
+{
+	return -EINVAL;
+}
+
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
+			   struct kvm_memory_slot *dont)
+{
+	kvmppc_core_free_memslot(kvm, free, dont);
+}
+
+int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
+			    unsigned long npages)
+{
+	return kvmppc_core_create_memslot(kvm, slot, npages);
+}
+
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+				   struct kvm_memory_slot *memslot,
+				   struct kvm_userspace_memory_region *mem,
+				   enum kvm_mr_change change)
+{
+	return kvmppc_core_prepare_memory_region(kvm, memslot, mem);
+}
+
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+				   struct kvm_userspace_memory_region *mem,
+				   const struct kvm_memory_slot *old,
+				   enum kvm_mr_change change)
+{
+	kvmppc_core_commit_memory_region(kvm, mem, old);
+}
+
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+				   struct kvm_memory_slot *slot)
+{
+	kvmppc_core_flush_memslot(kvm, slot);
+}
+
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+	struct kvm_vcpu *vcpu;
+	vcpu = kvmppc_core_vcpu_create(kvm, id);
+	if (!IS_ERR(vcpu)) {
+		vcpu->arch.wqp = &vcpu->wq;
+		kvmppc_create_vcpu_debugfs(vcpu, id);
+	}
+	return vcpu;
+}
+
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+}
+
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+{
+	/* Make sure we're not using the vcpu anymore */
+	hrtimer_cancel(&vcpu->arch.dec_timer);
+
+	kvmppc_remove_vcpu_debugfs(vcpu);
+
+	switch (vcpu->arch.irq_type) {
+	case KVMPPC_IRQ_MPIC:
+		kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu);
+		break;
+	case KVMPPC_IRQ_XICS:
+		kvmppc_xics_free_icp(vcpu);
+		break;
+	}
+
+	kvmppc_core_vcpu_free(vcpu);
+}
+
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+	kvm_arch_vcpu_free(vcpu);
+}
+
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+	return kvmppc_core_pending_dec(vcpu);
+}
+
+enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer)
+{
+	struct kvm_vcpu *vcpu;
+
+	vcpu = container_of(timer, struct kvm_vcpu, arch.dec_timer);
+	kvmppc_decrementer_func(vcpu);
+
+	return HRTIMER_NORESTART;
+}
+
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	int ret;
+
+	hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+	vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup;
+	vcpu->arch.dec_expires = ~(u64)0;
+
+#ifdef CONFIG_KVM_EXIT_TIMING
+	mutex_init(&vcpu->arch.exit_timing_lock);
+#endif
+	ret = kvmppc_subarch_vcpu_init(vcpu);
+	return ret;
+}
+
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+	kvmppc_mmu_destroy(vcpu);
+	kvmppc_subarch_vcpu_uninit(vcpu);
+}
+
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+#ifdef CONFIG_BOOKE
+	/*
+	 * vrsave (formerly usprg0) isn't used by Linux, but may
+	 * be used by the guest.
+	 *
+	 * On non-booke this is associated with Altivec and
+	 * is handled by code in book3s.c.
+	 */
+	mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
+#endif
+	kvmppc_core_vcpu_load(vcpu, cpu);
+}
+
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+	kvmppc_core_vcpu_put(vcpu);
+#ifdef CONFIG_BOOKE
+	vcpu->arch.vrsave = mfspr(SPRN_VRSAVE);
+#endif
+}
+
+static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
+                                      struct kvm_run *run)
+{
+	u64 uninitialized_var(gpr);
+
+	if (run->mmio.len > sizeof(gpr)) {
+		printk(KERN_ERR "bad MMIO length: %d\n", run->mmio.len);
+		return;
+	}
+
+	if (!vcpu->arch.mmio_host_swabbed) {
+		switch (run->mmio.len) {
+		case 8: gpr = *(u64 *)run->mmio.data; break;
+		case 4: gpr = *(u32 *)run->mmio.data; break;
+		case 2: gpr = *(u16 *)run->mmio.data; break;
+		case 1: gpr = *(u8 *)run->mmio.data; break;
+		}
+	} else {
+		switch (run->mmio.len) {
+		case 8: gpr = swab64(*(u64 *)run->mmio.data); break;
+		case 4: gpr = swab32(*(u32 *)run->mmio.data); break;
+		case 2: gpr = swab16(*(u16 *)run->mmio.data); break;
+		case 1: gpr = *(u8 *)run->mmio.data; break;
+		}
+	}
+
+	if (vcpu->arch.mmio_sign_extend) {
+		switch (run->mmio.len) {
+#ifdef CONFIG_PPC64
+		case 4:
+			gpr = (s64)(s32)gpr;
+			break;
+#endif
+		case 2:
+			gpr = (s64)(s16)gpr;
+			break;
+		case 1:
+			gpr = (s64)(s8)gpr;
+			break;
+		}
+	}
+
+	kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr);
+
+	switch (vcpu->arch.io_gpr & KVM_MMIO_REG_EXT_MASK) {
+	case KVM_MMIO_REG_GPR:
+		kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr);
+		break;
+	case KVM_MMIO_REG_FPR:
+		VCPU_FPR(vcpu, vcpu->arch.io_gpr & KVM_MMIO_REG_MASK) = gpr;
+		break;
+#ifdef CONFIG_PPC_BOOK3S
+	case KVM_MMIO_REG_QPR:
+		vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr;
+		break;
+	case KVM_MMIO_REG_FQPR:
+		VCPU_FPR(vcpu, vcpu->arch.io_gpr & KVM_MMIO_REG_MASK) = gpr;
+		vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr;
+		break;
+#endif
+	default:
+		BUG();
+	}
+}
+
+int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
+		       unsigned int rt, unsigned int bytes,
+		       int is_default_endian)
+{
+	int idx, ret;
+	bool host_swabbed;
+
+	/* Pity C doesn't have a logical XOR operator */
+	if (kvmppc_need_byteswap(vcpu)) {
+		host_swabbed = is_default_endian;
+	} else {
+		host_swabbed = !is_default_endian;
+	}
+
+	if (bytes > sizeof(run->mmio.data)) {
+		printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__,
+		       run->mmio.len);
+	}
+
+	run->mmio.phys_addr = vcpu->arch.paddr_accessed;
+	run->mmio.len = bytes;
+	run->mmio.is_write = 0;
+
+	vcpu->arch.io_gpr = rt;
+	vcpu->arch.mmio_host_swabbed = host_swabbed;
+	vcpu->mmio_needed = 1;
+	vcpu->mmio_is_write = 0;
+	vcpu->arch.mmio_sign_extend = 0;
+
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+	ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, run->mmio.phys_addr,
+			      bytes, &run->mmio.data);
+
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+	if (!ret) {
+		kvmppc_complete_mmio_load(vcpu, run);
+		vcpu->mmio_needed = 0;
+		return EMULATE_DONE;
+	}
+
+	return EMULATE_DO_MMIO;
+}
+EXPORT_SYMBOL_GPL(kvmppc_handle_load);
+
+/* Same as above, but sign extends */
+int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu,
+			unsigned int rt, unsigned int bytes,
+			int is_default_endian)
+{
+	int r;
+
+	vcpu->arch.mmio_sign_extend = 1;
+	r = kvmppc_handle_load(run, vcpu, rt, bytes, is_default_endian);
+
+	return r;
+}
+
+int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
+			u64 val, unsigned int bytes, int is_default_endian)
+{
+	void *data = run->mmio.data;
+	int idx, ret;
+	bool host_swabbed;
+
+	/* Pity C doesn't have a logical XOR operator */
+	if (kvmppc_need_byteswap(vcpu)) {
+		host_swabbed = is_default_endian;
+	} else {
+		host_swabbed = !is_default_endian;
+	}
+
+	if (bytes > sizeof(run->mmio.data)) {
+		printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__,
+		       run->mmio.len);
+	}
+
+	run->mmio.phys_addr = vcpu->arch.paddr_accessed;
+	run->mmio.len = bytes;
+	run->mmio.is_write = 1;
+	vcpu->mmio_needed = 1;
+	vcpu->mmio_is_write = 1;
+
+	/* Store the value at the lowest bytes in 'data'. */
+	if (!host_swabbed) {
+		switch (bytes) {
+		case 8: *(u64 *)data = val; break;
+		case 4: *(u32 *)data = val; break;
+		case 2: *(u16 *)data = val; break;
+		case 1: *(u8  *)data = val; break;
+		}
+	} else {
+		switch (bytes) {
+		case 8: *(u64 *)data = swab64(val); break;
+		case 4: *(u32 *)data = swab32(val); break;
+		case 2: *(u16 *)data = swab16(val); break;
+		case 1: *(u8  *)data = val; break;
+		}
+	}
+
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+	ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, run->mmio.phys_addr,
+			       bytes, &run->mmio.data);
+
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+	if (!ret) {
+		vcpu->mmio_needed = 0;
+		return EMULATE_DONE;
+	}
+
+	return EMULATE_DO_MMIO;
+}
+EXPORT_SYMBOL_GPL(kvmppc_handle_store);
+
+int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+{
+	int r = 0;
+	union kvmppc_one_reg val;
+	int size;
+
+	size = one_reg_size(reg->id);
+	if (size > sizeof(val))
+		return -EINVAL;
+
+	r = kvmppc_get_one_reg(vcpu, reg->id, &val);
+	if (r == -EINVAL) {
+		r = 0;
+		switch (reg->id) {
+#ifdef CONFIG_ALTIVEC
+		case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31:
+			if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
+				r = -ENXIO;
+				break;
+			}
+			vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0] = val.vval;
+			break;
+		case KVM_REG_PPC_VSCR:
+			if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
+				r = -ENXIO;
+				break;
+			}
+			vcpu->arch.vr.vscr.u[3] = set_reg_val(reg->id, val);
+			break;
+		case KVM_REG_PPC_VRSAVE:
+			if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
+				r = -ENXIO;
+				break;
+			}
+			vcpu->arch.vrsave = set_reg_val(reg->id, val);
+			break;
+#endif /* CONFIG_ALTIVEC */
+		default:
+			r = -EINVAL;
+			break;
+		}
+	}
+
+	if (r)
+		return r;
+
+	if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size))
+		r = -EFAULT;
+
+	return r;
+}
+
+int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+{
+	int r;
+	union kvmppc_one_reg val;
+	int size;
+
+	size = one_reg_size(reg->id);
+	if (size > sizeof(val))
+		return -EINVAL;
+
+	if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size))
+		return -EFAULT;
+
+	r = kvmppc_set_one_reg(vcpu, reg->id, &val);
+	if (r == -EINVAL) {
+		r = 0;
+		switch (reg->id) {
+#ifdef CONFIG_ALTIVEC
+		case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31:
+			if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
+				r = -ENXIO;
+				break;
+			}
+			val.vval = vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0];
+			break;
+		case KVM_REG_PPC_VSCR:
+			if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
+				r = -ENXIO;
+				break;
+			}
+			val = get_reg_val(reg->id, vcpu->arch.vr.vscr.u[3]);
+			break;
+		case KVM_REG_PPC_VRSAVE:
+			val = get_reg_val(reg->id, vcpu->arch.vrsave);
+			break;
+#endif /* CONFIG_ALTIVEC */
+		default:
+			r = -EINVAL;
+			break;
+		}
+	}
+
+	return r;
+}
+
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+	int r;
+	sigset_t sigsaved;
+
+	if (vcpu->sigset_active)
+		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+
+	if (vcpu->mmio_needed) {
+		if (!vcpu->mmio_is_write)
+			kvmppc_complete_mmio_load(vcpu, run);
+		vcpu->mmio_needed = 0;
+	} else if (vcpu->arch.osi_needed) {
+		u64 *gprs = run->osi.gprs;
+		int i;
+
+		for (i = 0; i < 32; i++)
+			kvmppc_set_gpr(vcpu, i, gprs[i]);
+		vcpu->arch.osi_needed = 0;
+	} else if (vcpu->arch.hcall_needed) {
+		int i;
+
+		kvmppc_set_gpr(vcpu, 3, run->papr_hcall.ret);
+		for (i = 0; i < 9; ++i)
+			kvmppc_set_gpr(vcpu, 4 + i, run->papr_hcall.args[i]);
+		vcpu->arch.hcall_needed = 0;
+#ifdef CONFIG_BOOKE
+	} else if (vcpu->arch.epr_needed) {
+		kvmppc_set_epr(vcpu, run->epr.epr);
+		vcpu->arch.epr_needed = 0;
+#endif
+	}
+
+	r = kvmppc_vcpu_run(run, vcpu);
+
+	if (vcpu->sigset_active)
+		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+	return r;
+}
+
+int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
+{
+	if (irq->irq == KVM_INTERRUPT_UNSET) {
+		kvmppc_core_dequeue_external(vcpu);
+		return 0;
+	}
+
+	kvmppc_core_queue_external(vcpu, irq);
+
+	kvm_vcpu_kick(vcpu);
+
+	return 0;
+}
+
+static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
+				     struct kvm_enable_cap *cap)
+{
+	int r;
+
+	if (cap->flags)
+		return -EINVAL;
+
+	switch (cap->cap) {
+	case KVM_CAP_PPC_OSI:
+		r = 0;
+		vcpu->arch.osi_enabled = true;
+		break;
+	case KVM_CAP_PPC_PAPR:
+		r = 0;
+		vcpu->arch.papr_enabled = true;
+		break;
+	case KVM_CAP_PPC_EPR:
+		r = 0;
+		if (cap->args[0])
+			vcpu->arch.epr_flags |= KVMPPC_EPR_USER;
+		else
+			vcpu->arch.epr_flags &= ~KVMPPC_EPR_USER;
+		break;
+#ifdef CONFIG_BOOKE
+	case KVM_CAP_PPC_BOOKE_WATCHDOG:
+		r = 0;
+		vcpu->arch.watchdog_enabled = true;
+		break;
+#endif
+#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
+	case KVM_CAP_SW_TLB: {
+		struct kvm_config_tlb cfg;
+		void __user *user_ptr = (void __user *)(uintptr_t)cap->args[0];
+
+		r = -EFAULT;
+		if (copy_from_user(&cfg, user_ptr, sizeof(cfg)))
+			break;
+
+		r = kvm_vcpu_ioctl_config_tlb(vcpu, &cfg);
+		break;
+	}
+#endif
+#ifdef CONFIG_KVM_MPIC
+	case KVM_CAP_IRQ_MPIC: {
+		struct fd f;
+		struct kvm_device *dev;
+
+		r = -EBADF;
+		f = fdget(cap->args[0]);
+		if (!f.file)
+			break;
+
+		r = -EPERM;
+		dev = kvm_device_from_filp(f.file);
+		if (dev)
+			r = kvmppc_mpic_connect_vcpu(dev, vcpu, cap->args[1]);
+
+		fdput(f);
+		break;
+	}
+#endif
+#ifdef CONFIG_KVM_XICS
+	case KVM_CAP_IRQ_XICS: {
+		struct fd f;
+		struct kvm_device *dev;
+
+		r = -EBADF;
+		f = fdget(cap->args[0]);
+		if (!f.file)
+			break;
+
+		r = -EPERM;
+		dev = kvm_device_from_filp(f.file);
+		if (dev)
+			r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]);
+
+		fdput(f);
+		break;
+	}
+#endif /* CONFIG_KVM_XICS */
+	default:
+		r = -EINVAL;
+		break;
+	}
+
+	if (!r)
+		r = kvmppc_sanity_check(vcpu);
+
+	return r;
+}
+
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+                                    struct kvm_mp_state *mp_state)
+{
+	return -EINVAL;
+}
+
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+                                    struct kvm_mp_state *mp_state)
+{
+	return -EINVAL;
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp,
+                         unsigned int ioctl, unsigned long arg)
+{
+	struct kvm_vcpu *vcpu = filp->private_data;
+	void __user *argp = (void __user *)arg;
+	long r;
+
+	switch (ioctl) {
+	case KVM_INTERRUPT: {
+		struct kvm_interrupt irq;
+		r = -EFAULT;
+		if (copy_from_user(&irq, argp, sizeof(irq)))
+			goto out;
+		r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
+		goto out;
+	}
+
+	case KVM_ENABLE_CAP:
+	{
+		struct kvm_enable_cap cap;
+		r = -EFAULT;
+		if (copy_from_user(&cap, argp, sizeof(cap)))
+			goto out;
+		r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
+		break;
+	}
+
+	case KVM_SET_ONE_REG:
+	case KVM_GET_ONE_REG:
+	{
+		struct kvm_one_reg reg;
+		r = -EFAULT;
+		if (copy_from_user(&reg, argp, sizeof(reg)))
+			goto out;
+		if (ioctl == KVM_SET_ONE_REG)
+			r = kvm_vcpu_ioctl_set_one_reg(vcpu, &reg);
+		else
+			r = kvm_vcpu_ioctl_get_one_reg(vcpu, &reg);
+		break;
+	}
+
+#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
+	case KVM_DIRTY_TLB: {
+		struct kvm_dirty_tlb dirty;
+		r = -EFAULT;
+		if (copy_from_user(&dirty, argp, sizeof(dirty)))
+			goto out;
+		r = kvm_vcpu_ioctl_dirty_tlb(vcpu, &dirty);
+		break;
+	}
+#endif
+	default:
+		r = -EINVAL;
+	}
+
+out:
+	return r;
+}
+
+int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+	return VM_FAULT_SIGBUS;
+}
+
+static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
+{
+	u32 inst_nop = 0x60000000;
+#ifdef CONFIG_KVM_BOOKE_HV
+	u32 inst_sc1 = 0x44000022;
+	pvinfo->hcall[0] = cpu_to_be32(inst_sc1);
+	pvinfo->hcall[1] = cpu_to_be32(inst_nop);
+	pvinfo->hcall[2] = cpu_to_be32(inst_nop);
+	pvinfo->hcall[3] = cpu_to_be32(inst_nop);
+#else
+	u32 inst_lis = 0x3c000000;
+	u32 inst_ori = 0x60000000;
+	u32 inst_sc = 0x44000002;
+	u32 inst_imm_mask = 0xffff;
+
+	/*
+	 * The hypercall to get into KVM from within guest context is as
+	 * follows:
+	 *
+	 *    lis r0, r0, KVM_SC_MAGIC_R0@h
+	 *    ori r0, KVM_SC_MAGIC_R0@l
+	 *    sc
+	 *    nop
+	 */
+	pvinfo->hcall[0] = cpu_to_be32(inst_lis | ((KVM_SC_MAGIC_R0 >> 16) & inst_imm_mask));
+	pvinfo->hcall[1] = cpu_to_be32(inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask));
+	pvinfo->hcall[2] = cpu_to_be32(inst_sc);
+	pvinfo->hcall[3] = cpu_to_be32(inst_nop);
+#endif
+
+	pvinfo->flags = KVM_PPC_PVINFO_FLAGS_EV_IDLE;
+
+	return 0;
+}
+
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
+			  bool line_status)
+{
+	if (!irqchip_in_kernel(kvm))
+		return -ENXIO;
+
+	irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+					irq_event->irq, irq_event->level,
+					line_status);
+	return 0;
+}
+
+
+static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+				   struct kvm_enable_cap *cap)
+{
+	int r;
+
+	if (cap->flags)
+		return -EINVAL;
+
+	switch (cap->cap) {
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+	case KVM_CAP_PPC_ENABLE_HCALL: {
+		unsigned long hcall = cap->args[0];
+
+		r = -EINVAL;
+		if (hcall > MAX_HCALL_OPCODE || (hcall & 3) ||
+		    cap->args[1] > 1)
+			break;
+		if (!kvmppc_book3s_hcall_implemented(kvm, hcall))
+			break;
+		if (cap->args[1])
+			set_bit(hcall / 4, kvm->arch.enabled_hcalls);
+		else
+			clear_bit(hcall / 4, kvm->arch.enabled_hcalls);
+		r = 0;
+		break;
+	}
+#endif
+	default:
+		r = -EINVAL;
+		break;
+	}
+
+	return r;
+}
+
+long kvm_arch_vm_ioctl(struct file *filp,
+                       unsigned int ioctl, unsigned long arg)
+{
+	struct kvm *kvm __maybe_unused = filp->private_data;
+	void __user *argp = (void __user *)arg;
+	long r;
+
+	switch (ioctl) {
+	case KVM_PPC_GET_PVINFO: {
+		struct kvm_ppc_pvinfo pvinfo;
+		memset(&pvinfo, 0, sizeof(pvinfo));
+		r = kvm_vm_ioctl_get_pvinfo(&pvinfo);
+		if (copy_to_user(argp, &pvinfo, sizeof(pvinfo))) {
+			r = -EFAULT;
+			goto out;
+		}
+
+		break;
+	}
+	case KVM_ENABLE_CAP:
+	{
+		struct kvm_enable_cap cap;
+		r = -EFAULT;
+		if (copy_from_user(&cap, argp, sizeof(cap)))
+			goto out;
+		r = kvm_vm_ioctl_enable_cap(kvm, &cap);
+		break;
+	}
+#ifdef CONFIG_PPC_BOOK3S_64
+	case KVM_CREATE_SPAPR_TCE: {
+		struct kvm_create_spapr_tce create_tce;
+
+		r = -EFAULT;
+		if (copy_from_user(&create_tce, argp, sizeof(create_tce)))
+			goto out;
+		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce);
+		goto out;
+	}
+	case KVM_PPC_GET_SMMU_INFO: {
+		struct kvm_ppc_smmu_info info;
+		struct kvm *kvm = filp->private_data;
+
+		memset(&info, 0, sizeof(info));
+		r = kvm->arch.kvm_ops->get_smmu_info(kvm, &info);
+		if (r >= 0 && copy_to_user(argp, &info, sizeof(info)))
+			r = -EFAULT;
+		break;
+	}
+	case KVM_PPC_RTAS_DEFINE_TOKEN: {
+		struct kvm *kvm = filp->private_data;
+
+		r = kvm_vm_ioctl_rtas_define_token(kvm, argp);
+		break;
+	}
+	default: {
+		struct kvm *kvm = filp->private_data;
+		r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg);
+	}
+#else /* CONFIG_PPC_BOOK3S_64 */
+	default:
+		r = -ENOTTY;
+#endif
+	}
+out:
+	return r;
+}
+
+static unsigned long lpid_inuse[BITS_TO_LONGS(KVMPPC_NR_LPIDS)];
+static unsigned long nr_lpids;
+
+long kvmppc_alloc_lpid(void)
+{
+	long lpid;
+
+	do {
+		lpid = find_first_zero_bit(lpid_inuse, KVMPPC_NR_LPIDS);
+		if (lpid >= nr_lpids) {
+			pr_err("%s: No LPIDs free\n", __func__);
+			return -ENOMEM;
+		}
+	} while (test_and_set_bit(lpid, lpid_inuse));
+
+	return lpid;
+}
+EXPORT_SYMBOL_GPL(kvmppc_alloc_lpid);
+
+void kvmppc_claim_lpid(long lpid)
+{
+	set_bit(lpid, lpid_inuse);
+}
+EXPORT_SYMBOL_GPL(kvmppc_claim_lpid);
+
+void kvmppc_free_lpid(long lpid)
+{
+	clear_bit(lpid, lpid_inuse);
+}
+EXPORT_SYMBOL_GPL(kvmppc_free_lpid);
+
+void kvmppc_init_lpid(unsigned long nr_lpids_param)
+{
+	nr_lpids = min_t(unsigned long, KVMPPC_NR_LPIDS, nr_lpids_param);
+	memset(lpid_inuse, 0, sizeof(lpid_inuse));
+}
+EXPORT_SYMBOL_GPL(kvmppc_init_lpid);
+
+int kvm_arch_init(void *opaque)
+{
+	return 0;
+}
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ppc_instr);
diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c
new file mode 100644
index 000000000..e44d2b2ea
--- /dev/null
+++ b/arch/powerpc/kvm/timing.c
@@ -0,0 +1,245 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ *          Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+
+#include <asm/time.h>
+#include <asm-generic/div64.h>
+
+#include "timing.h"
+
+void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu)
+{
+	int i;
+
+	/* Take a lock to avoid concurrent updates */
+	mutex_lock(&vcpu->arch.exit_timing_lock);
+
+	vcpu->arch.last_exit_type = 0xDEAD;
+	for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) {
+		vcpu->arch.timing_count_type[i] = 0;
+		vcpu->arch.timing_max_duration[i] = 0;
+		vcpu->arch.timing_min_duration[i] = 0xFFFFFFFF;
+		vcpu->arch.timing_sum_duration[i] = 0;
+		vcpu->arch.timing_sum_quad_duration[i] = 0;
+	}
+	vcpu->arch.timing_last_exit = 0;
+	vcpu->arch.timing_exit.tv64 = 0;
+	vcpu->arch.timing_last_enter.tv64 = 0;
+
+	mutex_unlock(&vcpu->arch.exit_timing_lock);
+}
+
+static void add_exit_timing(struct kvm_vcpu *vcpu, u64 duration, int type)
+{
+	u64 old;
+
+	mutex_lock(&vcpu->arch.exit_timing_lock);
+
+	vcpu->arch.timing_count_type[type]++;
+
+	/* sum */
+	old = vcpu->arch.timing_sum_duration[type];
+	vcpu->arch.timing_sum_duration[type] += duration;
+	if (unlikely(old > vcpu->arch.timing_sum_duration[type])) {
+		printk(KERN_ERR"%s - wrap adding sum of durations"
+			" old %lld new %lld type %d exit # of type %d\n",
+			__func__, old, vcpu->arch.timing_sum_duration[type],
+			type, vcpu->arch.timing_count_type[type]);
+	}
+
+	/* square sum */
+	old = vcpu->arch.timing_sum_quad_duration[type];
+	vcpu->arch.timing_sum_quad_duration[type] += (duration*duration);
+	if (unlikely(old > vcpu->arch.timing_sum_quad_duration[type])) {
+		printk(KERN_ERR"%s - wrap adding sum of squared durations"
+			" old %lld new %lld type %d exit # of type %d\n",
+			__func__, old,
+			vcpu->arch.timing_sum_quad_duration[type],
+			type, vcpu->arch.timing_count_type[type]);
+	}
+
+	/* set min/max */
+	if (unlikely(duration < vcpu->arch.timing_min_duration[type]))
+		vcpu->arch.timing_min_duration[type] = duration;
+	if (unlikely(duration > vcpu->arch.timing_max_duration[type]))
+		vcpu->arch.timing_max_duration[type] = duration;
+
+	mutex_unlock(&vcpu->arch.exit_timing_lock);
+}
+
+void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu)
+{
+	u64 exit = vcpu->arch.timing_last_exit;
+	u64 enter = vcpu->arch.timing_last_enter.tv64;
+
+	/* save exit time, used next exit when the reenter time is known */
+	vcpu->arch.timing_last_exit = vcpu->arch.timing_exit.tv64;
+
+	if (unlikely(vcpu->arch.last_exit_type == 0xDEAD || exit == 0))
+		return; /* skip incomplete cycle (e.g. after reset) */
+
+	/* update statistics for average and standard deviation */
+	add_exit_timing(vcpu, (enter - exit), vcpu->arch.last_exit_type);
+	/* enter -> timing_last_exit is time spent in guest - log this too */
+	add_exit_timing(vcpu, (vcpu->arch.timing_last_exit - enter),
+			TIMEINGUEST);
+}
+
+static const char *kvm_exit_names[__NUMBER_OF_KVM_EXIT_TYPES] = {
+	[MMIO_EXITS] =              "MMIO",
+	[SIGNAL_EXITS] =            "SIGNAL",
+	[ITLB_REAL_MISS_EXITS] =    "ITLBREAL",
+	[ITLB_VIRT_MISS_EXITS] =    "ITLBVIRT",
+	[DTLB_REAL_MISS_EXITS] =    "DTLBREAL",
+	[DTLB_VIRT_MISS_EXITS] =    "DTLBVIRT",
+	[SYSCALL_EXITS] =           "SYSCALL",
+	[ISI_EXITS] =               "ISI",
+	[DSI_EXITS] =               "DSI",
+	[EMULATED_INST_EXITS] =     "EMULINST",
+	[EMULATED_MTMSRWE_EXITS] =  "EMUL_WAIT",
+	[EMULATED_WRTEE_EXITS] =    "EMUL_WRTEE",
+	[EMULATED_MTSPR_EXITS] =    "EMUL_MTSPR",
+	[EMULATED_MFSPR_EXITS] =    "EMUL_MFSPR",
+	[EMULATED_MTMSR_EXITS] =    "EMUL_MTMSR",
+	[EMULATED_MFMSR_EXITS] =    "EMUL_MFMSR",
+	[EMULATED_TLBSX_EXITS] =    "EMUL_TLBSX",
+	[EMULATED_TLBWE_EXITS] =    "EMUL_TLBWE",
+	[EMULATED_RFI_EXITS] =      "EMUL_RFI",
+	[DEC_EXITS] =               "DEC",
+	[EXT_INTR_EXITS] =          "EXTINT",
+	[HALT_WAKEUP] =             "HALT",
+	[USR_PR_INST] =             "USR_PR_INST",
+	[FP_UNAVAIL] =              "FP_UNAVAIL",
+	[DEBUG_EXITS] =             "DEBUG",
+	[TIMEINGUEST] =             "TIMEINGUEST"
+};
+
+static int kvmppc_exit_timing_show(struct seq_file *m, void *private)
+{
+	struct kvm_vcpu *vcpu = m->private;
+	int i;
+	u64 min, max, sum, sum_quad;
+
+	seq_printf(m, "%s", "type	count	min	max	sum	sum_squared\n");
+
+
+	for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) {
+
+		min = vcpu->arch.timing_min_duration[i];
+		do_div(min, tb_ticks_per_usec);
+		max = vcpu->arch.timing_max_duration[i];
+		do_div(max, tb_ticks_per_usec);
+		sum = vcpu->arch.timing_sum_duration[i];
+		do_div(sum, tb_ticks_per_usec);
+		sum_quad = vcpu->arch.timing_sum_quad_duration[i];
+		do_div(sum_quad, tb_ticks_per_usec);
+
+		seq_printf(m, "%12s	%10d	%10lld	%10lld	%20lld	%20lld\n",
+			kvm_exit_names[i],
+			vcpu->arch.timing_count_type[i],
+			min,
+			max,
+			sum,
+			sum_quad);
+
+	}
+	return 0;
+}
+
+/* Write 'c' to clear the timing statistics. */
+static ssize_t kvmppc_exit_timing_write(struct file *file,
+				       const char __user *user_buf,
+				       size_t count, loff_t *ppos)
+{
+	int err = -EINVAL;
+	char c;
+
+	if (count > 1) {
+		goto done;
+	}
+
+	if (get_user(c, user_buf)) {
+		err = -EFAULT;
+		goto done;
+	}
+
+	if (c == 'c') {
+		struct seq_file *seqf = file->private_data;
+		struct kvm_vcpu *vcpu = seqf->private;
+		/* Write does not affect our buffers previously generated with
+		 * show. seq_file is locked here to prevent races of init with
+		 * a show call */
+		mutex_lock(&seqf->lock);
+		kvmppc_init_timing_stats(vcpu);
+		mutex_unlock(&seqf->lock);
+		err = count;
+	}
+
+done:
+	return err;
+}
+
+static int kvmppc_exit_timing_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, kvmppc_exit_timing_show, inode->i_private);
+}
+
+static const struct file_operations kvmppc_exit_timing_fops = {
+	.owner   = THIS_MODULE,
+	.open    = kvmppc_exit_timing_open,
+	.read    = seq_read,
+	.write   = kvmppc_exit_timing_write,
+	.llseek  = seq_lseek,
+	.release = single_release,
+};
+
+void kvmppc_create_vcpu_debugfs(struct kvm_vcpu *vcpu, unsigned int id)
+{
+	static char dbg_fname[50];
+	struct dentry *debugfs_file;
+
+	snprintf(dbg_fname, sizeof(dbg_fname), "vm%u_vcpu%u_timing",
+		 current->pid, id);
+	debugfs_file = debugfs_create_file(dbg_fname, 0666,
+					kvm_debugfs_dir, vcpu,
+					&kvmppc_exit_timing_fops);
+
+	if (!debugfs_file) {
+		printk(KERN_ERR"%s: error creating debugfs file %s\n",
+			__func__, dbg_fname);
+		return;
+	}
+
+	vcpu->arch.debugfs_exit_timing = debugfs_file;
+}
+
+void kvmppc_remove_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.debugfs_exit_timing) {
+		debugfs_remove(vcpu->arch.debugfs_exit_timing);
+		vcpu->arch.debugfs_exit_timing = NULL;
+	}
+}
diff --git a/arch/powerpc/kvm/timing.h b/arch/powerpc/kvm/timing.h
new file mode 100644
index 000000000..3123690c8
--- /dev/null
+++ b/arch/powerpc/kvm/timing.h
@@ -0,0 +1,109 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+ */
+
+#ifndef __POWERPC_KVM_EXITTIMING_H__
+#define __POWERPC_KVM_EXITTIMING_H__
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_host.h>
+
+#ifdef CONFIG_KVM_EXIT_TIMING
+void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu);
+void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu);
+void kvmppc_create_vcpu_debugfs(struct kvm_vcpu *vcpu, unsigned int id);
+void kvmppc_remove_vcpu_debugfs(struct kvm_vcpu *vcpu);
+
+static inline void kvmppc_set_exit_type(struct kvm_vcpu *vcpu, int type)
+{
+	vcpu->arch.last_exit_type = type;
+}
+
+#else
+/* if exit timing is not configured there is no need to build the c file */
+static inline void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu) {}
+static inline void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu) {}
+static inline void kvmppc_create_vcpu_debugfs(struct kvm_vcpu *vcpu,
+						unsigned int id) {}
+static inline void kvmppc_remove_vcpu_debugfs(struct kvm_vcpu *vcpu) {}
+static inline void kvmppc_set_exit_type(struct kvm_vcpu *vcpu, int type) {}
+#endif /* CONFIG_KVM_EXIT_TIMING */
+
+/* account the exit in kvm_stats */
+static inline void kvmppc_account_exit_stat(struct kvm_vcpu *vcpu, int type)
+{
+	/* type has to be known at build time for optimization */
+
+	/* The BUILD_BUG_ON below breaks in funny ways, commented out
+	 * for now ... -BenH
+	BUILD_BUG_ON(!__builtin_constant_p(type));
+	*/
+	switch (type) {
+	case EXT_INTR_EXITS:
+		vcpu->stat.ext_intr_exits++;
+		break;
+	case DEC_EXITS:
+		vcpu->stat.dec_exits++;
+		break;
+	case EMULATED_INST_EXITS:
+		vcpu->stat.emulated_inst_exits++;
+		break;
+	case DSI_EXITS:
+		vcpu->stat.dsi_exits++;
+		break;
+	case ISI_EXITS:
+		vcpu->stat.isi_exits++;
+		break;
+	case SYSCALL_EXITS:
+		vcpu->stat.syscall_exits++;
+		break;
+	case DTLB_REAL_MISS_EXITS:
+		vcpu->stat.dtlb_real_miss_exits++;
+		break;
+	case DTLB_VIRT_MISS_EXITS:
+		vcpu->stat.dtlb_virt_miss_exits++;
+		break;
+	case MMIO_EXITS:
+		vcpu->stat.mmio_exits++;
+		break;
+	case ITLB_REAL_MISS_EXITS:
+		vcpu->stat.itlb_real_miss_exits++;
+		break;
+	case ITLB_VIRT_MISS_EXITS:
+		vcpu->stat.itlb_virt_miss_exits++;
+		break;
+	case SIGNAL_EXITS:
+		vcpu->stat.signal_exits++;
+		break;
+	case DBELL_EXITS:
+		vcpu->stat.dbell_exits++;
+		break;
+	case GDBELL_EXITS:
+		vcpu->stat.gdbell_exits++;
+		break;
+	}
+}
+
+/* wrapper to set exit time and account for it in kvm_stats */
+static inline void kvmppc_account_exit(struct kvm_vcpu *vcpu, int type)
+{
+	kvmppc_set_exit_type(vcpu, type);
+	kvmppc_account_exit_stat(vcpu, type);
+}
+
+#endif /* __POWERPC_KVM_EXITTIMING_H__ */
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
new file mode 100644
index 000000000..2e0e67ef3
--- /dev/null
+++ b/arch/powerpc/kvm/trace.h
@@ -0,0 +1,122 @@
+#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace
+
+/*
+ * Tracepoint for guest mode entry.
+ */
+TRACE_EVENT(kvm_ppc_instr,
+	TP_PROTO(unsigned int inst, unsigned long _pc, unsigned int emulate),
+	TP_ARGS(inst, _pc, emulate),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	inst		)
+		__field(	unsigned long,	pc		)
+		__field(	unsigned int,	emulate		)
+	),
+
+	TP_fast_assign(
+		__entry->inst		= inst;
+		__entry->pc		= _pc;
+		__entry->emulate	= emulate;
+	),
+
+	TP_printk("inst %u pc 0x%lx emulate %u\n",
+		  __entry->inst, __entry->pc, __entry->emulate)
+);
+
+TRACE_EVENT(kvm_stlb_inval,
+	TP_PROTO(unsigned int stlb_index),
+	TP_ARGS(stlb_index),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	stlb_index	)
+	),
+
+	TP_fast_assign(
+		__entry->stlb_index	= stlb_index;
+	),
+
+	TP_printk("stlb_index %u", __entry->stlb_index)
+);
+
+TRACE_EVENT(kvm_stlb_write,
+	TP_PROTO(unsigned int victim, unsigned int tid, unsigned int word0,
+		 unsigned int word1, unsigned int word2),
+	TP_ARGS(victim, tid, word0, word1, word2),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	victim		)
+		__field(	unsigned int,	tid		)
+		__field(	unsigned int,	word0		)
+		__field(	unsigned int,	word1		)
+		__field(	unsigned int,	word2		)
+	),
+
+	TP_fast_assign(
+		__entry->victim		= victim;
+		__entry->tid		= tid;
+		__entry->word0		= word0;
+		__entry->word1		= word1;
+		__entry->word2		= word2;
+	),
+
+	TP_printk("victim %u tid %u w0 %u w1 %u w2 %u",
+		__entry->victim, __entry->tid, __entry->word0,
+		__entry->word1, __entry->word2)
+);
+
+TRACE_EVENT(kvm_gtlb_write,
+	TP_PROTO(unsigned int gtlb_index, unsigned int tid, unsigned int word0,
+		 unsigned int word1, unsigned int word2),
+	TP_ARGS(gtlb_index, tid, word0, word1, word2),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	gtlb_index	)
+		__field(	unsigned int,	tid		)
+		__field(	unsigned int,	word0		)
+		__field(	unsigned int,	word1		)
+		__field(	unsigned int,	word2		)
+	),
+
+	TP_fast_assign(
+		__entry->gtlb_index	= gtlb_index;
+		__entry->tid		= tid;
+		__entry->word0		= word0;
+		__entry->word1		= word1;
+		__entry->word2		= word2;
+	),
+
+	TP_printk("gtlb_index %u tid %u w0 %u w1 %u w2 %u",
+		__entry->gtlb_index, __entry->tid, __entry->word0,
+		__entry->word1, __entry->word2)
+);
+
+TRACE_EVENT(kvm_check_requests,
+	TP_PROTO(struct kvm_vcpu *vcpu),
+	TP_ARGS(vcpu),
+
+	TP_STRUCT__entry(
+		__field(	__u32,	cpu_nr		)
+		__field(	__u32,	requests	)
+	),
+
+	TP_fast_assign(
+		__entry->cpu_nr		= vcpu->vcpu_id;
+		__entry->requests	= vcpu->requests;
+	),
+
+	TP_printk("vcpu=%x requests=%x",
+		__entry->cpu_nr, __entry->requests)
+);
+
+#endif /* _TRACE_KVM_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/powerpc/kvm/trace_book3s.h b/arch/powerpc/kvm/trace_book3s.h
new file mode 100644
index 000000000..f647ce0f4
--- /dev/null
+++ b/arch/powerpc/kvm/trace_book3s.h
@@ -0,0 +1,32 @@
+#if !defined(_TRACE_KVM_BOOK3S_H)
+#define _TRACE_KVM_BOOK3S_H
+
+/*
+ * Common defines used by the trace macros in trace_pr.h and trace_hv.h
+ */
+
+#define kvm_trace_symbol_exit \
+	{0x100, "SYSTEM_RESET"}, \
+	{0x200, "MACHINE_CHECK"}, \
+	{0x300, "DATA_STORAGE"}, \
+	{0x380, "DATA_SEGMENT"}, \
+	{0x400, "INST_STORAGE"}, \
+	{0x480, "INST_SEGMENT"}, \
+	{0x500, "EXTERNAL"}, \
+	{0x501, "EXTERNAL_LEVEL"}, \
+	{0x502, "EXTERNAL_HV"}, \
+	{0x600, "ALIGNMENT"}, \
+	{0x700, "PROGRAM"}, \
+	{0x800, "FP_UNAVAIL"}, \
+	{0x900, "DECREMENTER"}, \
+	{0x980, "HV_DECREMENTER"}, \
+	{0xc00, "SYSCALL"}, \
+	{0xd00, "TRACE"}, \
+	{0xe00, "H_DATA_STORAGE"}, \
+	{0xe20, "H_INST_STORAGE"}, \
+	{0xe40, "H_EMUL_ASSIST"}, \
+	{0xf00, "PERFMON"}, \
+	{0xf20, "ALTIVEC"}, \
+	{0xf40, "VSX"}
+
+#endif
diff --git a/arch/powerpc/kvm/trace_booke.h b/arch/powerpc/kvm/trace_booke.h
new file mode 100644
index 000000000..7ec534d1d
--- /dev/null
+++ b/arch/powerpc/kvm/trace_booke.h
@@ -0,0 +1,220 @@
+#if !defined(_TRACE_KVM_BOOKE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_BOOKE_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm_booke
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_booke
+
+#define kvm_trace_symbol_exit \
+	{0, "CRITICAL"}, \
+	{1, "MACHINE_CHECK"}, \
+	{2, "DATA_STORAGE"}, \
+	{3, "INST_STORAGE"}, \
+	{4, "EXTERNAL"}, \
+	{5, "ALIGNMENT"}, \
+	{6, "PROGRAM"}, \
+	{7, "FP_UNAVAIL"}, \
+	{8, "SYSCALL"}, \
+	{9, "AP_UNAVAIL"}, \
+	{10, "DECREMENTER"}, \
+	{11, "FIT"}, \
+	{12, "WATCHDOG"}, \
+	{13, "DTLB_MISS"}, \
+	{14, "ITLB_MISS"}, \
+	{15, "DEBUG"}, \
+	{32, "SPE_UNAVAIL"}, \
+	{33, "SPE_FP_DATA"}, \
+	{34, "SPE_FP_ROUND"}, \
+	{35, "PERFORMANCE_MONITOR"}, \
+	{36, "DOORBELL"}, \
+	{37, "DOORBELL_CRITICAL"}, \
+	{38, "GUEST_DBELL"}, \
+	{39, "GUEST_DBELL_CRIT"}, \
+	{40, "HV_SYSCALL"}, \
+	{41, "HV_PRIV"}
+
+TRACE_EVENT(kvm_exit,
+	TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
+	TP_ARGS(exit_nr, vcpu),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	exit_nr		)
+		__field(	unsigned long,	pc		)
+		__field(	unsigned long,	msr		)
+		__field(	unsigned long,	dar		)
+		__field(	unsigned long,	last_inst	)
+	),
+
+	TP_fast_assign(
+		__entry->exit_nr	= exit_nr;
+		__entry->pc		= kvmppc_get_pc(vcpu);
+		__entry->dar		= kvmppc_get_fault_dar(vcpu);
+		__entry->msr		= vcpu->arch.shared->msr;
+		__entry->last_inst	= vcpu->arch.last_inst;
+	),
+
+	TP_printk("exit=%s"
+		" | pc=0x%lx"
+		" | msr=0x%lx"
+		" | dar=0x%lx"
+		" | last_inst=0x%lx"
+		,
+		__print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit),
+		__entry->pc,
+		__entry->msr,
+		__entry->dar,
+		__entry->last_inst
+		)
+);
+
+TRACE_EVENT(kvm_unmap_hva,
+	TP_PROTO(unsigned long hva),
+	TP_ARGS(hva),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	hva		)
+	),
+
+	TP_fast_assign(
+		__entry->hva		= hva;
+	),
+
+	TP_printk("unmap hva 0x%lx\n", __entry->hva)
+);
+
+TRACE_EVENT(kvm_booke206_stlb_write,
+	TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3),
+	TP_ARGS(mas0, mas8, mas1, mas2, mas7_3),
+
+	TP_STRUCT__entry(
+		__field(	__u32,	mas0		)
+		__field(	__u32,	mas8		)
+		__field(	__u32,	mas1		)
+		__field(	__u64,	mas2		)
+		__field(	__u64,	mas7_3		)
+	),
+
+	TP_fast_assign(
+		__entry->mas0		= mas0;
+		__entry->mas8		= mas8;
+		__entry->mas1		= mas1;
+		__entry->mas2		= mas2;
+		__entry->mas7_3		= mas7_3;
+	),
+
+	TP_printk("mas0=%x mas8=%x mas1=%x mas2=%llx mas7_3=%llx",
+		__entry->mas0, __entry->mas8, __entry->mas1,
+		__entry->mas2, __entry->mas7_3)
+);
+
+TRACE_EVENT(kvm_booke206_gtlb_write,
+	TP_PROTO(__u32 mas0, __u32 mas1, __u64 mas2, __u64 mas7_3),
+	TP_ARGS(mas0, mas1, mas2, mas7_3),
+
+	TP_STRUCT__entry(
+		__field(	__u32,	mas0		)
+		__field(	__u32,	mas1		)
+		__field(	__u64,	mas2		)
+		__field(	__u64,	mas7_3		)
+	),
+
+	TP_fast_assign(
+		__entry->mas0		= mas0;
+		__entry->mas1		= mas1;
+		__entry->mas2		= mas2;
+		__entry->mas7_3		= mas7_3;
+	),
+
+	TP_printk("mas0=%x mas1=%x mas2=%llx mas7_3=%llx",
+		__entry->mas0, __entry->mas1,
+		__entry->mas2, __entry->mas7_3)
+);
+
+TRACE_EVENT(kvm_booke206_ref_release,
+	TP_PROTO(__u64 pfn, __u32 flags),
+	TP_ARGS(pfn, flags),
+
+	TP_STRUCT__entry(
+		__field(	__u64,	pfn		)
+		__field(	__u32,	flags		)
+	),
+
+	TP_fast_assign(
+		__entry->pfn		= pfn;
+		__entry->flags		= flags;
+	),
+
+	TP_printk("pfn=%llx flags=%x",
+		__entry->pfn, __entry->flags)
+);
+
+#ifdef CONFIG_SPE_POSSIBLE
+#define kvm_trace_symbol_irqprio_spe \
+	{BOOKE_IRQPRIO_SPE_UNAVAIL, "SPE_UNAVAIL"}, \
+	{BOOKE_IRQPRIO_SPE_FP_DATA, "SPE_FP_DATA"}, \
+	{BOOKE_IRQPRIO_SPE_FP_ROUND, "SPE_FP_ROUND"},
+#else
+#define kvm_trace_symbol_irqprio_spe
+#endif
+
+#ifdef CONFIG_PPC_E500MC
+#define kvm_trace_symbol_irqprio_e500mc \
+	{BOOKE_IRQPRIO_ALTIVEC_UNAVAIL, "ALTIVEC_UNAVAIL"}, \
+	{BOOKE_IRQPRIO_ALTIVEC_ASSIST, "ALTIVEC_ASSIST"},
+#else
+#define kvm_trace_symbol_irqprio_e500mc
+#endif
+
+#define kvm_trace_symbol_irqprio \
+	kvm_trace_symbol_irqprio_spe \
+	kvm_trace_symbol_irqprio_e500mc \
+	{BOOKE_IRQPRIO_DATA_STORAGE, "DATA_STORAGE"}, \
+	{BOOKE_IRQPRIO_INST_STORAGE, "INST_STORAGE"}, \
+	{BOOKE_IRQPRIO_ALIGNMENT, "ALIGNMENT"}, \
+	{BOOKE_IRQPRIO_PROGRAM, "PROGRAM"}, \
+	{BOOKE_IRQPRIO_FP_UNAVAIL, "FP_UNAVAIL"}, \
+	{BOOKE_IRQPRIO_SYSCALL, "SYSCALL"}, \
+	{BOOKE_IRQPRIO_AP_UNAVAIL, "AP_UNAVAIL"}, \
+	{BOOKE_IRQPRIO_DTLB_MISS, "DTLB_MISS"}, \
+	{BOOKE_IRQPRIO_ITLB_MISS, "ITLB_MISS"}, \
+	{BOOKE_IRQPRIO_MACHINE_CHECK, "MACHINE_CHECK"}, \
+	{BOOKE_IRQPRIO_DEBUG, "DEBUG"}, \
+	{BOOKE_IRQPRIO_CRITICAL, "CRITICAL"}, \
+	{BOOKE_IRQPRIO_WATCHDOG, "WATCHDOG"}, \
+	{BOOKE_IRQPRIO_EXTERNAL, "EXTERNAL"}, \
+	{BOOKE_IRQPRIO_FIT, "FIT"}, \
+	{BOOKE_IRQPRIO_DECREMENTER, "DECREMENTER"}, \
+	{BOOKE_IRQPRIO_PERFORMANCE_MONITOR, "PERFORMANCE_MONITOR"}, \
+	{BOOKE_IRQPRIO_EXTERNAL_LEVEL, "EXTERNAL_LEVEL"}, \
+	{BOOKE_IRQPRIO_DBELL, "DBELL"}, \
+	{BOOKE_IRQPRIO_DBELL_CRIT, "DBELL_CRIT"} \
+
+TRACE_EVENT(kvm_booke_queue_irqprio,
+	TP_PROTO(struct kvm_vcpu *vcpu, unsigned int priority),
+	TP_ARGS(vcpu, priority),
+
+	TP_STRUCT__entry(
+		__field(	__u32,	cpu_nr		)
+		__field(	__u32,	priority		)
+		__field(	unsigned long,	pending		)
+	),
+
+	TP_fast_assign(
+		__entry->cpu_nr		= vcpu->vcpu_id;
+		__entry->priority	= priority;
+		__entry->pending	= vcpu->arch.pending_exceptions;
+	),
+
+	TP_printk("vcpu=%x prio=%s pending=%lx",
+		__entry->cpu_nr,
+		__print_symbolic(__entry->priority, kvm_trace_symbol_irqprio),
+		__entry->pending)
+);
+
+#endif
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/powerpc/kvm/trace_hv.h b/arch/powerpc/kvm/trace_hv.h
new file mode 100644
index 000000000..33d9daff5
--- /dev/null
+++ b/arch/powerpc/kvm/trace_hv.h
@@ -0,0 +1,477 @@
+#if !defined(_TRACE_KVM_HV_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_HV_H
+
+#include <linux/tracepoint.h>
+#include "trace_book3s.h"
+#include <asm/hvcall.h>
+#include <asm/kvm_asm.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm_hv
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_hv
+
+#define kvm_trace_symbol_hcall \
+	{H_REMOVE,			"H_REMOVE"}, \
+	{H_ENTER,			"H_ENTER"}, \
+	{H_READ,			"H_READ"}, \
+	{H_CLEAR_MOD,			"H_CLEAR_MOD"}, \
+	{H_CLEAR_REF,			"H_CLEAR_REF"}, \
+	{H_PROTECT,			"H_PROTECT"}, \
+	{H_GET_TCE,			"H_GET_TCE"}, \
+	{H_PUT_TCE,			"H_PUT_TCE"}, \
+	{H_SET_SPRG0,			"H_SET_SPRG0"}, \
+	{H_SET_DABR,			"H_SET_DABR"}, \
+	{H_PAGE_INIT,			"H_PAGE_INIT"}, \
+	{H_SET_ASR,			"H_SET_ASR"}, \
+	{H_ASR_ON,			"H_ASR_ON"}, \
+	{H_ASR_OFF,			"H_ASR_OFF"}, \
+	{H_LOGICAL_CI_LOAD,		"H_LOGICAL_CI_LOAD"}, \
+	{H_LOGICAL_CI_STORE,		"H_LOGICAL_CI_STORE"}, \
+	{H_LOGICAL_CACHE_LOAD,		"H_LOGICAL_CACHE_LOAD"}, \
+	{H_LOGICAL_CACHE_STORE,		"H_LOGICAL_CACHE_STORE"}, \
+	{H_LOGICAL_ICBI,		"H_LOGICAL_ICBI"}, \
+	{H_LOGICAL_DCBF,		"H_LOGICAL_DCBF"}, \
+	{H_GET_TERM_CHAR,		"H_GET_TERM_CHAR"}, \
+	{H_PUT_TERM_CHAR,		"H_PUT_TERM_CHAR"}, \
+	{H_REAL_TO_LOGICAL,		"H_REAL_TO_LOGICAL"}, \
+	{H_HYPERVISOR_DATA,		"H_HYPERVISOR_DATA"}, \
+	{H_EOI,				"H_EOI"}, \
+	{H_CPPR,			"H_CPPR"}, \
+	{H_IPI,				"H_IPI"}, \
+	{H_IPOLL,			"H_IPOLL"}, \
+	{H_XIRR,			"H_XIRR"}, \
+	{H_PERFMON,			"H_PERFMON"}, \
+	{H_MIGRATE_DMA,			"H_MIGRATE_DMA"}, \
+	{H_REGISTER_VPA,		"H_REGISTER_VPA"}, \
+	{H_CEDE,			"H_CEDE"}, \
+	{H_CONFER,			"H_CONFER"}, \
+	{H_PROD,			"H_PROD"}, \
+	{H_GET_PPP,			"H_GET_PPP"}, \
+	{H_SET_PPP,			"H_SET_PPP"}, \
+	{H_PURR,			"H_PURR"}, \
+	{H_PIC,				"H_PIC"}, \
+	{H_REG_CRQ,			"H_REG_CRQ"}, \
+	{H_FREE_CRQ,			"H_FREE_CRQ"}, \
+	{H_VIO_SIGNAL,			"H_VIO_SIGNAL"}, \
+	{H_SEND_CRQ,			"H_SEND_CRQ"}, \
+	{H_COPY_RDMA,			"H_COPY_RDMA"}, \
+	{H_REGISTER_LOGICAL_LAN,	"H_REGISTER_LOGICAL_LAN"}, \
+	{H_FREE_LOGICAL_LAN,		"H_FREE_LOGICAL_LAN"}, \
+	{H_ADD_LOGICAL_LAN_BUFFER,	"H_ADD_LOGICAL_LAN_BUFFER"}, \
+	{H_SEND_LOGICAL_LAN,		"H_SEND_LOGICAL_LAN"}, \
+	{H_BULK_REMOVE,			"H_BULK_REMOVE"}, \
+	{H_MULTICAST_CTRL,		"H_MULTICAST_CTRL"}, \
+	{H_SET_XDABR,			"H_SET_XDABR"}, \
+	{H_STUFF_TCE,			"H_STUFF_TCE"}, \
+	{H_PUT_TCE_INDIRECT,		"H_PUT_TCE_INDIRECT"}, \
+	{H_CHANGE_LOGICAL_LAN_MAC,	"H_CHANGE_LOGICAL_LAN_MAC"}, \
+	{H_VTERM_PARTNER_INFO,		"H_VTERM_PARTNER_INFO"}, \
+	{H_REGISTER_VTERM,		"H_REGISTER_VTERM"}, \
+	{H_FREE_VTERM,			"H_FREE_VTERM"}, \
+	{H_RESET_EVENTS,		"H_RESET_EVENTS"}, \
+	{H_ALLOC_RESOURCE,		"H_ALLOC_RESOURCE"}, \
+	{H_FREE_RESOURCE,		"H_FREE_RESOURCE"}, \
+	{H_MODIFY_QP,			"H_MODIFY_QP"}, \
+	{H_QUERY_QP,			"H_QUERY_QP"}, \
+	{H_REREGISTER_PMR,		"H_REREGISTER_PMR"}, \
+	{H_REGISTER_SMR,		"H_REGISTER_SMR"}, \
+	{H_QUERY_MR,			"H_QUERY_MR"}, \
+	{H_QUERY_MW,			"H_QUERY_MW"}, \
+	{H_QUERY_HCA,			"H_QUERY_HCA"}, \
+	{H_QUERY_PORT,			"H_QUERY_PORT"}, \
+	{H_MODIFY_PORT,			"H_MODIFY_PORT"}, \
+	{H_DEFINE_AQP1,			"H_DEFINE_AQP1"}, \
+	{H_GET_TRACE_BUFFER,		"H_GET_TRACE_BUFFER"}, \
+	{H_DEFINE_AQP0,			"H_DEFINE_AQP0"}, \
+	{H_RESIZE_MR,			"H_RESIZE_MR"}, \
+	{H_ATTACH_MCQP,			"H_ATTACH_MCQP"}, \
+	{H_DETACH_MCQP,			"H_DETACH_MCQP"}, \
+	{H_CREATE_RPT,			"H_CREATE_RPT"}, \
+	{H_REMOVE_RPT,			"H_REMOVE_RPT"}, \
+	{H_REGISTER_RPAGES,		"H_REGISTER_RPAGES"}, \
+	{H_DISABLE_AND_GETC,		"H_DISABLE_AND_GETC"}, \
+	{H_ERROR_DATA,			"H_ERROR_DATA"}, \
+	{H_GET_HCA_INFO,		"H_GET_HCA_INFO"}, \
+	{H_GET_PERF_COUNT,		"H_GET_PERF_COUNT"}, \
+	{H_MANAGE_TRACE,		"H_MANAGE_TRACE"}, \
+	{H_FREE_LOGICAL_LAN_BUFFER,	"H_FREE_LOGICAL_LAN_BUFFER"}, \
+	{H_QUERY_INT_STATE,		"H_QUERY_INT_STATE"}, \
+	{H_POLL_PENDING,		"H_POLL_PENDING"}, \
+	{H_ILLAN_ATTRIBUTES,		"H_ILLAN_ATTRIBUTES"}, \
+	{H_MODIFY_HEA_QP,		"H_MODIFY_HEA_QP"}, \
+	{H_QUERY_HEA_QP,		"H_QUERY_HEA_QP"}, \
+	{H_QUERY_HEA,			"H_QUERY_HEA"}, \
+	{H_QUERY_HEA_PORT,		"H_QUERY_HEA_PORT"}, \
+	{H_MODIFY_HEA_PORT,		"H_MODIFY_HEA_PORT"}, \
+	{H_REG_BCMC,			"H_REG_BCMC"}, \
+	{H_DEREG_BCMC,			"H_DEREG_BCMC"}, \
+	{H_REGISTER_HEA_RPAGES,		"H_REGISTER_HEA_RPAGES"}, \
+	{H_DISABLE_AND_GET_HEA,		"H_DISABLE_AND_GET_HEA"}, \
+	{H_GET_HEA_INFO,		"H_GET_HEA_INFO"}, \
+	{H_ALLOC_HEA_RESOURCE,		"H_ALLOC_HEA_RESOURCE"}, \
+	{H_ADD_CONN,			"H_ADD_CONN"}, \
+	{H_DEL_CONN,			"H_DEL_CONN"}, \
+	{H_JOIN,			"H_JOIN"}, \
+	{H_VASI_STATE,			"H_VASI_STATE"}, \
+	{H_ENABLE_CRQ,			"H_ENABLE_CRQ"}, \
+	{H_GET_EM_PARMS,		"H_GET_EM_PARMS"}, \
+	{H_SET_MPP,			"H_SET_MPP"}, \
+	{H_GET_MPP,			"H_GET_MPP"}, \
+	{H_HOME_NODE_ASSOCIATIVITY,	"H_HOME_NODE_ASSOCIATIVITY"}, \
+	{H_BEST_ENERGY,			"H_BEST_ENERGY"}, \
+	{H_XIRR_X,			"H_XIRR_X"}, \
+	{H_RANDOM,			"H_RANDOM"}, \
+	{H_COP,				"H_COP"}, \
+	{H_GET_MPP_X,			"H_GET_MPP_X"}, \
+	{H_SET_MODE,			"H_SET_MODE"}, \
+	{H_RTAS,			"H_RTAS"}
+
+#define kvm_trace_symbol_kvmret \
+	{RESUME_GUEST,			"RESUME_GUEST"}, \
+	{RESUME_GUEST_NV,		"RESUME_GUEST_NV"}, \
+	{RESUME_HOST,			"RESUME_HOST"}, \
+	{RESUME_HOST_NV,		"RESUME_HOST_NV"}
+
+#define kvm_trace_symbol_hcall_rc \
+	{H_SUCCESS,			"H_SUCCESS"}, \
+	{H_BUSY,			"H_BUSY"}, \
+	{H_CLOSED,			"H_CLOSED"}, \
+	{H_NOT_AVAILABLE,		"H_NOT_AVAILABLE"}, \
+	{H_CONSTRAINED,			"H_CONSTRAINED"}, \
+	{H_PARTIAL,			"H_PARTIAL"}, \
+	{H_IN_PROGRESS,			"H_IN_PROGRESS"}, \
+	{H_PAGE_REGISTERED,		"H_PAGE_REGISTERED"}, \
+	{H_PARTIAL_STORE,		"H_PARTIAL_STORE"}, \
+	{H_PENDING,			"H_PENDING"}, \
+	{H_CONTINUE,			"H_CONTINUE"}, \
+	{H_LONG_BUSY_START_RANGE,	"H_LONG_BUSY_START_RANGE"}, \
+	{H_LONG_BUSY_ORDER_1_MSEC,	"H_LONG_BUSY_ORDER_1_MSEC"}, \
+	{H_LONG_BUSY_ORDER_10_MSEC,	"H_LONG_BUSY_ORDER_10_MSEC"}, \
+	{H_LONG_BUSY_ORDER_100_MSEC,	"H_LONG_BUSY_ORDER_100_MSEC"}, \
+	{H_LONG_BUSY_ORDER_1_SEC,	"H_LONG_BUSY_ORDER_1_SEC"}, \
+	{H_LONG_BUSY_ORDER_10_SEC,	"H_LONG_BUSY_ORDER_10_SEC"}, \
+	{H_LONG_BUSY_ORDER_100_SEC,	"H_LONG_BUSY_ORDER_100_SEC"}, \
+	{H_LONG_BUSY_END_RANGE,		"H_LONG_BUSY_END_RANGE"}, \
+	{H_TOO_HARD,			"H_TOO_HARD"}, \
+	{H_HARDWARE,			"H_HARDWARE"}, \
+	{H_FUNCTION,			"H_FUNCTION"}, \
+	{H_PRIVILEGE,			"H_PRIVILEGE"}, \
+	{H_PARAMETER,			"H_PARAMETER"}, \
+	{H_BAD_MODE,			"H_BAD_MODE"}, \
+	{H_PTEG_FULL,			"H_PTEG_FULL"}, \
+	{H_NOT_FOUND,			"H_NOT_FOUND"}, \
+	{H_RESERVED_DABR,		"H_RESERVED_DABR"}, \
+	{H_NO_MEM,			"H_NO_MEM"}, \
+	{H_AUTHORITY,			"H_AUTHORITY"}, \
+	{H_PERMISSION,			"H_PERMISSION"}, \
+	{H_DROPPED,			"H_DROPPED"}, \
+	{H_SOURCE_PARM,			"H_SOURCE_PARM"}, \
+	{H_DEST_PARM,			"H_DEST_PARM"}, \
+	{H_REMOTE_PARM,			"H_REMOTE_PARM"}, \
+	{H_RESOURCE,			"H_RESOURCE"}, \
+	{H_ADAPTER_PARM,		"H_ADAPTER_PARM"}, \
+	{H_RH_PARM,			"H_RH_PARM"}, \
+	{H_RCQ_PARM,			"H_RCQ_PARM"}, \
+	{H_SCQ_PARM,			"H_SCQ_PARM"}, \
+	{H_EQ_PARM,			"H_EQ_PARM"}, \
+	{H_RT_PARM,			"H_RT_PARM"}, \
+	{H_ST_PARM,			"H_ST_PARM"}, \
+	{H_SIGT_PARM,			"H_SIGT_PARM"}, \
+	{H_TOKEN_PARM,			"H_TOKEN_PARM"}, \
+	{H_MLENGTH_PARM,		"H_MLENGTH_PARM"}, \
+	{H_MEM_PARM,			"H_MEM_PARM"}, \
+	{H_MEM_ACCESS_PARM,		"H_MEM_ACCESS_PARM"}, \
+	{H_ATTR_PARM,			"H_ATTR_PARM"}, \
+	{H_PORT_PARM,			"H_PORT_PARM"}, \
+	{H_MCG_PARM,			"H_MCG_PARM"}, \
+	{H_VL_PARM,			"H_VL_PARM"}, \
+	{H_TSIZE_PARM,			"H_TSIZE_PARM"}, \
+	{H_TRACE_PARM,			"H_TRACE_PARM"}, \
+	{H_MASK_PARM,			"H_MASK_PARM"}, \
+	{H_MCG_FULL,			"H_MCG_FULL"}, \
+	{H_ALIAS_EXIST,			"H_ALIAS_EXIST"}, \
+	{H_P_COUNTER,			"H_P_COUNTER"}, \
+	{H_TABLE_FULL,			"H_TABLE_FULL"}, \
+	{H_ALT_TABLE,			"H_ALT_TABLE"}, \
+	{H_MR_CONDITION,		"H_MR_CONDITION"}, \
+	{H_NOT_ENOUGH_RESOURCES,	"H_NOT_ENOUGH_RESOURCES"}, \
+	{H_R_STATE,			"H_R_STATE"}, \
+	{H_RESCINDED,			"H_RESCINDED"}, \
+	{H_P2,				"H_P2"}, \
+	{H_P3,				"H_P3"}, \
+	{H_P4,				"H_P4"}, \
+	{H_P5,				"H_P5"}, \
+	{H_P6,				"H_P6"}, \
+	{H_P7,				"H_P7"}, \
+	{H_P8,				"H_P8"}, \
+	{H_P9,				"H_P9"}, \
+	{H_TOO_BIG,			"H_TOO_BIG"}, \
+	{H_OVERLAP,			"H_OVERLAP"}, \
+	{H_INTERRUPT,			"H_INTERRUPT"}, \
+	{H_BAD_DATA,			"H_BAD_DATA"}, \
+	{H_NOT_ACTIVE,			"H_NOT_ACTIVE"}, \
+	{H_SG_LIST,			"H_SG_LIST"}, \
+	{H_OP_MODE,			"H_OP_MODE"}, \
+	{H_COP_HW,			"H_COP_HW"}, \
+	{H_UNSUPPORTED_FLAG_START,	"H_UNSUPPORTED_FLAG_START"}, \
+	{H_UNSUPPORTED_FLAG_END,	"H_UNSUPPORTED_FLAG_END"}, \
+	{H_MULTI_THREADS_ACTIVE,	"H_MULTI_THREADS_ACTIVE"}, \
+	{H_OUTSTANDING_COP_OPS,		"H_OUTSTANDING_COP_OPS"}
+
+TRACE_EVENT(kvm_guest_enter,
+	TP_PROTO(struct kvm_vcpu *vcpu),
+	TP_ARGS(vcpu),
+
+	TP_STRUCT__entry(
+		__field(int,		vcpu_id)
+		__field(unsigned long,	pc)
+		__field(unsigned long,  pending_exceptions)
+		__field(u8,		ceded)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id	= vcpu->vcpu_id;
+		__entry->pc		= kvmppc_get_pc(vcpu);
+		__entry->ceded		= vcpu->arch.ceded;
+		__entry->pending_exceptions  = vcpu->arch.pending_exceptions;
+	),
+
+	TP_printk("VCPU %d: pc=0x%lx pexcp=0x%lx ceded=%d",
+			__entry->vcpu_id,
+			__entry->pc,
+			__entry->pending_exceptions, __entry->ceded)
+);
+
+TRACE_EVENT(kvm_guest_exit,
+	TP_PROTO(struct kvm_vcpu *vcpu),
+	TP_ARGS(vcpu),
+
+	TP_STRUCT__entry(
+		__field(int,		vcpu_id)
+		__field(int,		trap)
+		__field(unsigned long,	pc)
+		__field(unsigned long,	msr)
+		__field(u8,		ceded)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu->vcpu_id;
+		__entry->trap	 = vcpu->arch.trap;
+		__entry->ceded	 = vcpu->arch.ceded;
+		__entry->pc	 = kvmppc_get_pc(vcpu);
+		__entry->msr	 = vcpu->arch.shregs.msr;
+	),
+
+	TP_printk("VCPU %d: trap=%s pc=0x%lx msr=0x%lx, ceded=%d",
+		__entry->vcpu_id,
+		__print_symbolic(__entry->trap, kvm_trace_symbol_exit),
+		__entry->pc, __entry->msr, __entry->ceded
+	)
+);
+
+TRACE_EVENT(kvm_page_fault_enter,
+	TP_PROTO(struct kvm_vcpu *vcpu, unsigned long *hptep,
+		 struct kvm_memory_slot *memslot, unsigned long ea,
+		 unsigned long dsisr),
+
+	TP_ARGS(vcpu, hptep, memslot, ea, dsisr),
+
+	TP_STRUCT__entry(
+		__field(int,		vcpu_id)
+		__field(unsigned long,	hpte_v)
+		__field(unsigned long,	hpte_r)
+		__field(unsigned long,	gpte_r)
+		__field(unsigned long,	ea)
+		__field(u64,		base_gfn)
+		__field(u32,		slot_flags)
+		__field(u32,		dsisr)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id  = vcpu->vcpu_id;
+		__entry->hpte_v	  = hptep[0];
+		__entry->hpte_r	  = hptep[1];
+		__entry->gpte_r	  = hptep[2];
+		__entry->ea	  = ea;
+		__entry->dsisr	  = dsisr;
+		__entry->base_gfn = memslot ? memslot->base_gfn : -1UL;
+		__entry->slot_flags = memslot ? memslot->flags : 0;
+	),
+
+	TP_printk("VCPU %d: hpte=0x%lx:0x%lx guest=0x%lx ea=0x%lx,%x slot=0x%llx,0x%x",
+		   __entry->vcpu_id,
+		   __entry->hpte_v, __entry->hpte_r, __entry->gpte_r,
+		   __entry->ea, __entry->dsisr,
+		   __entry->base_gfn, __entry->slot_flags)
+);
+
+TRACE_EVENT(kvm_page_fault_exit,
+	TP_PROTO(struct kvm_vcpu *vcpu, unsigned long *hptep, long ret),
+
+	TP_ARGS(vcpu, hptep, ret),
+
+	TP_STRUCT__entry(
+		__field(int,		vcpu_id)
+		__field(unsigned long,	hpte_v)
+		__field(unsigned long,	hpte_r)
+		__field(long,		ret)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id  = vcpu->vcpu_id;
+		__entry->hpte_v	= hptep[0];
+		__entry->hpte_r	= hptep[1];
+		__entry->ret = ret;
+	),
+
+	TP_printk("VCPU %d: hpte=0x%lx:0x%lx ret=0x%lx",
+		   __entry->vcpu_id,
+		   __entry->hpte_v, __entry->hpte_r, __entry->ret)
+);
+
+TRACE_EVENT(kvm_hcall_enter,
+	TP_PROTO(struct kvm_vcpu *vcpu),
+
+	TP_ARGS(vcpu),
+
+	TP_STRUCT__entry(
+		__field(int,		vcpu_id)
+		__field(unsigned long,	req)
+		__field(unsigned long,	gpr4)
+		__field(unsigned long,	gpr5)
+		__field(unsigned long,	gpr6)
+		__field(unsigned long,	gpr7)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id  = vcpu->vcpu_id;
+		__entry->req   = kvmppc_get_gpr(vcpu, 3);
+		__entry->gpr4  = kvmppc_get_gpr(vcpu, 4);
+		__entry->gpr5  = kvmppc_get_gpr(vcpu, 5);
+		__entry->gpr6  = kvmppc_get_gpr(vcpu, 6);
+		__entry->gpr7  = kvmppc_get_gpr(vcpu, 7);
+	),
+
+	TP_printk("VCPU %d: hcall=%s GPR4-7=0x%lx,0x%lx,0x%lx,0x%lx",
+		   __entry->vcpu_id,
+		   __print_symbolic(__entry->req, kvm_trace_symbol_hcall),
+		   __entry->gpr4, __entry->gpr5, __entry->gpr6, __entry->gpr7)
+);
+
+TRACE_EVENT(kvm_hcall_exit,
+	TP_PROTO(struct kvm_vcpu *vcpu, int ret),
+
+	TP_ARGS(vcpu, ret),
+
+	TP_STRUCT__entry(
+		__field(int,		vcpu_id)
+		__field(unsigned long,	ret)
+		__field(unsigned long,	hcall_rc)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id  = vcpu->vcpu_id;
+		__entry->ret	  = ret;
+		__entry->hcall_rc = kvmppc_get_gpr(vcpu, 3);
+	),
+
+	TP_printk("VCPU %d: ret=%s hcall_rc=%s",
+		   __entry->vcpu_id,
+		   __print_symbolic(__entry->ret, kvm_trace_symbol_kvmret),
+		   __print_symbolic(__entry->ret & RESUME_FLAG_HOST ?
+					H_TOO_HARD : __entry->hcall_rc,
+					kvm_trace_symbol_hcall_rc))
+);
+
+TRACE_EVENT(kvmppc_run_core,
+	TP_PROTO(struct kvmppc_vcore *vc, int where),
+
+	TP_ARGS(vc, where),
+
+	TP_STRUCT__entry(
+		__field(int,	n_runnable)
+		__field(int,	runner_vcpu)
+		__field(int,	where)
+		__field(pid_t,	tgid)
+	),
+
+	TP_fast_assign(
+		__entry->runner_vcpu	= vc->runner->vcpu_id;
+		__entry->n_runnable	= vc->n_runnable;
+		__entry->where		= where;
+		__entry->tgid		= current->tgid;
+	),
+
+	TP_printk("%s runner_vcpu==%d runnable=%d tgid=%d",
+		    __entry->where ? "Exit" : "Enter",
+		    __entry->runner_vcpu, __entry->n_runnable, __entry->tgid)
+);
+
+TRACE_EVENT(kvmppc_vcore_blocked,
+	TP_PROTO(struct kvmppc_vcore *vc, int where),
+
+	TP_ARGS(vc, where),
+
+	TP_STRUCT__entry(
+		__field(int,	n_runnable)
+		__field(int,	runner_vcpu)
+		__field(int,	where)
+		__field(pid_t,	tgid)
+	),
+
+	TP_fast_assign(
+		__entry->runner_vcpu = vc->runner->vcpu_id;
+		__entry->n_runnable  = vc->n_runnable;
+		__entry->where       = where;
+		__entry->tgid	     = current->tgid;
+	),
+
+	TP_printk("%s runner_vcpu=%d runnable=%d tgid=%d",
+		   __entry->where ? "Exit" : "Enter",
+		   __entry->runner_vcpu, __entry->n_runnable, __entry->tgid)
+);
+
+TRACE_EVENT(kvmppc_run_vcpu_enter,
+	TP_PROTO(struct kvm_vcpu *vcpu),
+
+	TP_ARGS(vcpu),
+
+	TP_STRUCT__entry(
+		__field(int,		vcpu_id)
+		__field(pid_t,		tgid)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id  = vcpu->vcpu_id;
+		__entry->tgid	  = current->tgid;
+	),
+
+	TP_printk("VCPU %d: tgid=%d", __entry->vcpu_id, __entry->tgid)
+);
+
+TRACE_EVENT(kvmppc_run_vcpu_exit,
+	TP_PROTO(struct kvm_vcpu *vcpu, struct kvm_run *run),
+
+	TP_ARGS(vcpu, run),
+
+	TP_STRUCT__entry(
+		__field(int,		vcpu_id)
+		__field(int,		exit)
+		__field(int,		ret)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id  = vcpu->vcpu_id;
+		__entry->exit     = run->exit_reason;
+		__entry->ret      = vcpu->arch.ret;
+	),
+
+	TP_printk("VCPU %d: exit=%d, ret=%d",
+			__entry->vcpu_id, __entry->exit, __entry->ret)
+);
+
+#endif /* _TRACE_KVM_HV_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/powerpc/kvm/trace_pr.h b/arch/powerpc/kvm/trace_pr.h
new file mode 100644
index 000000000..810507cb6
--- /dev/null
+++ b/arch/powerpc/kvm/trace_pr.h
@@ -0,0 +1,274 @@
+
+#if !defined(_TRACE_KVM_PR_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_PR_H
+
+#include <linux/tracepoint.h>
+#include "trace_book3s.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm_pr
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_pr
+
+TRACE_EVENT(kvm_book3s_reenter,
+	TP_PROTO(int r, struct kvm_vcpu *vcpu),
+	TP_ARGS(r, vcpu),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	r		)
+		__field(	unsigned long,	pc		)
+	),
+
+	TP_fast_assign(
+		__entry->r		= r;
+		__entry->pc		= kvmppc_get_pc(vcpu);
+	),
+
+	TP_printk("reentry r=%d | pc=0x%lx", __entry->r, __entry->pc)
+);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+
+TRACE_EVENT(kvm_book3s_64_mmu_map,
+	TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr,
+		 struct kvmppc_pte *orig_pte),
+	TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte),
+
+	TP_STRUCT__entry(
+		__field(	unsigned char,		flag_w		)
+		__field(	unsigned char,		flag_x		)
+		__field(	unsigned long,		eaddr		)
+		__field(	unsigned long,		hpteg		)
+		__field(	unsigned long,		va		)
+		__field(	unsigned long long,	vpage		)
+		__field(	unsigned long,		hpaddr		)
+	),
+
+	TP_fast_assign(
+		__entry->flag_w	= ((rflags & HPTE_R_PP) == 3) ? '-' : 'w';
+		__entry->flag_x	= (rflags & HPTE_R_N) ? '-' : 'x';
+		__entry->eaddr	= orig_pte->eaddr;
+		__entry->hpteg	= hpteg;
+		__entry->va	= va;
+		__entry->vpage	= orig_pte->vpage;
+		__entry->hpaddr	= hpaddr;
+	),
+
+	TP_printk("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx",
+		  __entry->flag_w, __entry->flag_x, __entry->eaddr,
+		  __entry->hpteg, __entry->va, __entry->vpage, __entry->hpaddr)
+);
+
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
+TRACE_EVENT(kvm_book3s_mmu_map,
+	TP_PROTO(struct hpte_cache *pte),
+	TP_ARGS(pte),
+
+	TP_STRUCT__entry(
+		__field(	u64,		host_vpn	)
+		__field(	u64,		pfn		)
+		__field(	ulong,		eaddr		)
+		__field(	u64,		vpage		)
+		__field(	ulong,		raddr		)
+		__field(	int,		flags		)
+	),
+
+	TP_fast_assign(
+		__entry->host_vpn	= pte->host_vpn;
+		__entry->pfn		= pte->pfn;
+		__entry->eaddr		= pte->pte.eaddr;
+		__entry->vpage		= pte->pte.vpage;
+		__entry->raddr		= pte->pte.raddr;
+		__entry->flags		= (pte->pte.may_read ? 0x4 : 0) |
+					  (pte->pte.may_write ? 0x2 : 0) |
+					  (pte->pte.may_execute ? 0x1 : 0);
+	),
+
+	TP_printk("Map: hvpn=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]",
+		  __entry->host_vpn, __entry->pfn, __entry->eaddr,
+		  __entry->vpage, __entry->raddr, __entry->flags)
+);
+
+TRACE_EVENT(kvm_book3s_mmu_invalidate,
+	TP_PROTO(struct hpte_cache *pte),
+	TP_ARGS(pte),
+
+	TP_STRUCT__entry(
+		__field(	u64,		host_vpn	)
+		__field(	u64,		pfn		)
+		__field(	ulong,		eaddr		)
+		__field(	u64,		vpage		)
+		__field(	ulong,		raddr		)
+		__field(	int,		flags		)
+	),
+
+	TP_fast_assign(
+		__entry->host_vpn	= pte->host_vpn;
+		__entry->pfn		= pte->pfn;
+		__entry->eaddr		= pte->pte.eaddr;
+		__entry->vpage		= pte->pte.vpage;
+		__entry->raddr		= pte->pte.raddr;
+		__entry->flags		= (pte->pte.may_read ? 0x4 : 0) |
+					  (pte->pte.may_write ? 0x2 : 0) |
+					  (pte->pte.may_execute ? 0x1 : 0);
+	),
+
+	TP_printk("Flush: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]",
+		  __entry->host_vpn, __entry->pfn, __entry->eaddr,
+		  __entry->vpage, __entry->raddr, __entry->flags)
+);
+
+TRACE_EVENT(kvm_book3s_mmu_flush,
+	TP_PROTO(const char *type, struct kvm_vcpu *vcpu, unsigned long long p1,
+		 unsigned long long p2),
+	TP_ARGS(type, vcpu, p1, p2),
+
+	TP_STRUCT__entry(
+		__field(	int,			count		)
+		__field(	unsigned long long,	p1		)
+		__field(	unsigned long long,	p2		)
+		__field(	const char *,		type		)
+	),
+
+	TP_fast_assign(
+		__entry->count		= to_book3s(vcpu)->hpte_cache_count;
+		__entry->p1		= p1;
+		__entry->p2		= p2;
+		__entry->type		= type;
+	),
+
+	TP_printk("Flush %d %sPTEs: %llx - %llx",
+		  __entry->count, __entry->type, __entry->p1, __entry->p2)
+);
+
+TRACE_EVENT(kvm_book3s_slb_found,
+	TP_PROTO(unsigned long long gvsid, unsigned long long hvsid),
+	TP_ARGS(gvsid, hvsid),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long long,	gvsid		)
+		__field(	unsigned long long,	hvsid		)
+	),
+
+	TP_fast_assign(
+		__entry->gvsid		= gvsid;
+		__entry->hvsid		= hvsid;
+	),
+
+	TP_printk("%llx -> %llx", __entry->gvsid, __entry->hvsid)
+);
+
+TRACE_EVENT(kvm_book3s_slb_fail,
+	TP_PROTO(u16 sid_map_mask, unsigned long long gvsid),
+	TP_ARGS(sid_map_mask, gvsid),
+
+	TP_STRUCT__entry(
+		__field(	unsigned short,		sid_map_mask	)
+		__field(	unsigned long long,	gvsid		)
+	),
+
+	TP_fast_assign(
+		__entry->sid_map_mask	= sid_map_mask;
+		__entry->gvsid		= gvsid;
+	),
+
+	TP_printk("%x/%x: %llx", __entry->sid_map_mask,
+		  SID_MAP_MASK - __entry->sid_map_mask, __entry->gvsid)
+);
+
+TRACE_EVENT(kvm_book3s_slb_map,
+	TP_PROTO(u16 sid_map_mask, unsigned long long gvsid,
+		 unsigned long long hvsid),
+	TP_ARGS(sid_map_mask, gvsid, hvsid),
+
+	TP_STRUCT__entry(
+		__field(	unsigned short,		sid_map_mask	)
+		__field(	unsigned long long,	guest_vsid	)
+		__field(	unsigned long long,	host_vsid	)
+	),
+
+	TP_fast_assign(
+		__entry->sid_map_mask	= sid_map_mask;
+		__entry->guest_vsid	= gvsid;
+		__entry->host_vsid	= hvsid;
+	),
+
+	TP_printk("%x: %llx -> %llx", __entry->sid_map_mask,
+		  __entry->guest_vsid, __entry->host_vsid)
+);
+
+TRACE_EVENT(kvm_book3s_slbmte,
+	TP_PROTO(u64 slb_vsid, u64 slb_esid),
+	TP_ARGS(slb_vsid, slb_esid),
+
+	TP_STRUCT__entry(
+		__field(	u64,	slb_vsid	)
+		__field(	u64,	slb_esid	)
+	),
+
+	TP_fast_assign(
+		__entry->slb_vsid	= slb_vsid;
+		__entry->slb_esid	= slb_esid;
+	),
+
+	TP_printk("%llx, %llx", __entry->slb_vsid, __entry->slb_esid)
+);
+
+TRACE_EVENT(kvm_exit,
+	TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
+	TP_ARGS(exit_nr, vcpu),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	exit_nr		)
+		__field(	unsigned long,	pc		)
+		__field(	unsigned long,	msr		)
+		__field(	unsigned long,	dar		)
+		__field(	unsigned long,	srr1		)
+		__field(	unsigned long,	last_inst	)
+	),
+
+	TP_fast_assign(
+		__entry->exit_nr	= exit_nr;
+		__entry->pc		= kvmppc_get_pc(vcpu);
+		__entry->dar		= kvmppc_get_fault_dar(vcpu);
+		__entry->msr		= kvmppc_get_msr(vcpu);
+		__entry->srr1		= vcpu->arch.shadow_srr1;
+		__entry->last_inst	= vcpu->arch.last_inst;
+	),
+
+	TP_printk("exit=%s"
+		" | pc=0x%lx"
+		" | msr=0x%lx"
+		" | dar=0x%lx"
+		" | srr1=0x%lx"
+		" | last_inst=0x%lx"
+		,
+		__print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit),
+		__entry->pc,
+		__entry->msr,
+		__entry->dar,
+		__entry->srr1,
+		__entry->last_inst
+		)
+);
+
+TRACE_EVENT(kvm_unmap_hva,
+	TP_PROTO(unsigned long hva),
+	TP_ARGS(hva),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	hva		)
+	),
+
+	TP_fast_assign(
+		__entry->hva		= hva;
+	),
+
+	TP_printk("unmap hva 0x%lx\n", __entry->hva)
+);
+
+#endif /* _TRACE_KVM_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
author	André Fabian Silva Delgado <emulatorman@parabola.nu>	2015-08-05 17:04:01 -0300
committer	André Fabian Silva Delgado <emulatorman@parabola.nu>	2015-08-05 17:04:01 -0300
commit	57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch)
tree	5e910f0e82173f4ef4f51111366a3f1299037a7b /arch/powerpc/kvm