51 files changed, 17500 insertions, 0 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
new file mode 100644
index 000000000..a482d1051
--- /dev/null
+++ b/arch/x86/mm/Makefile
@@ -0,0 +1,35 @@
+obj-y	:=  init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
+	    pat.o pgtable.o physaddr.o gup.o setup_nx.o
+
+# Make sure __phys_addr has no stackprotector
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_physaddr.o		:= $(nostackp)
+CFLAGS_setup_nx.o		:= $(nostackp)
+
+CFLAGS_fault.o := -I$(src)/../include/asm/trace
+
+obj-$(CONFIG_X86_PAT)		+= pat_rbtree.o
+obj-$(CONFIG_SMP)		+= tlb.o
+
+obj-$(CONFIG_X86_32)		+= pgtable_32.o iomap_32.o
+
+obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
+obj-$(CONFIG_X86_PTDUMP)	+= dump_pagetables.o
+
+obj-$(CONFIG_HIGHMEM)		+= highmem_32.o
+
+obj-$(CONFIG_KMEMCHECK)		+= kmemcheck/
+
+KASAN_SANITIZE_kasan_init_$(BITS).o := n
+obj-$(CONFIG_KASAN)		+= kasan_init_$(BITS).o
+
+obj-$(CONFIG_MMIOTRACE)		+= mmiotrace.o
+mmiotrace-y			:= kmmio.o pf_in.o mmio-mod.o
+obj-$(CONFIG_MMIOTRACE_TEST)	+= testmmiotrace.o
+
+obj-$(CONFIG_NUMA)		+= numa.o numa_$(BITS).o
+obj-$(CONFIG_AMD_NUMA)		+= amdtopology.o
+obj-$(CONFIG_ACPI_NUMA)		+= srat.o
+obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o
+
+obj-$(CONFIG_X86_INTEL_MPX)	+= mpx.o
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
new file mode 100644
index 000000000..2ca15b59f
--- /dev/null
+++ b/arch/x86/mm/amdtopology.c
@@ -0,0 +1,196 @@
+/*
+ * AMD NUMA support.
+ * Discover the memory map and associated nodes.
+ *
+ * This version reads it directly from the AMD northbridge.
+ *
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/nodemask.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h>
+
+#include <asm/io.h>
+#include <linux/pci_ids.h>
+#include <linux/acpi.h>
+#include <asm/types.h>
+#include <asm/mmzone.h>
+#include <asm/proto.h>
+#include <asm/e820.h>
+#include <asm/pci-direct.h>
+#include <asm/numa.h>
+#include <asm/mpspec.h>
+#include <asm/apic.h>
+#include <asm/amd_nb.h>
+
+static unsigned char __initdata nodeids[8];
+
+static __init int find_northbridge(void)
+{
+	int num;
+
+	for (num = 0; num < 32; num++) {
+		u32 header;
+
+		header = read_pci_config(0, num, 0, 0x00);
+		if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) &&
+			header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) &&
+			header != (PCI_VENDOR_ID_AMD | (0x1300<<16)))
+			continue;
+
+		header = read_pci_config(0, num, 1, 0x00);
+		if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) &&
+			header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) &&
+			header != (PCI_VENDOR_ID_AMD | (0x1301<<16)))
+			continue;
+		return num;
+	}
+
+	return -ENOENT;
+}
+
+static __init void early_get_boot_cpu_id(void)
+{
+	/*
+	 * need to get the APIC ID of the BSP so can use that to
+	 * create apicid_to_node in amd_scan_nodes()
+	 */
+#ifdef CONFIG_X86_MPPARSE
+	/*
+	 * get boot-time SMP configuration:
+	 */
+	if (smp_found_config)
+		early_get_smp_config();
+#endif
+}
+
+int __init amd_numa_init(void)
+{
+	u64 start = PFN_PHYS(0);
+	u64 end = PFN_PHYS(max_pfn);
+	unsigned numnodes;
+	u64 prevbase;
+	int i, j, nb;
+	u32 nodeid, reg;
+	unsigned int bits, cores, apicid_base;
+
+	if (!early_pci_allowed())
+		return -EINVAL;
+
+	nb = find_northbridge();
+	if (nb < 0)
+		return nb;
+
+	pr_info("Scanning NUMA topology in Northbridge %d\n", nb);
+
+	reg = read_pci_config(0, nb, 0, 0x60);
+	numnodes = ((reg >> 4) & 0xF) + 1;
+	if (numnodes <= 1)
+		return -ENOENT;
+
+	pr_info("Number of physical nodes %d\n", numnodes);
+
+	prevbase = 0;
+	for (i = 0; i < 8; i++) {
+		u64 base, limit;
+
+		base = read_pci_config(0, nb, 1, 0x40 + i*8);
+		limit = read_pci_config(0, nb, 1, 0x44 + i*8);
+
+		nodeids[i] = nodeid = limit & 7;
+		if ((base & 3) == 0) {
+			if (i < numnodes)
+				pr_info("Skipping disabled node %d\n", i);
+			continue;
+		}
+		if (nodeid >= numnodes) {
+			pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid,
+				base, limit);
+			continue;
+		}
+
+		if (!limit) {
+			pr_info("Skipping node entry %d (base %Lx)\n",
+				i, base);
+			continue;
+		}
+		if ((base >> 8) & 3 || (limit >> 8) & 3) {
+			pr_err("Node %d using interleaving mode %Lx/%Lx\n",
+			       nodeid, (base >> 8) & 3, (limit >> 8) & 3);
+			return -EINVAL;
+		}
+		if (node_isset(nodeid, numa_nodes_parsed)) {
+			pr_info("Node %d already present, skipping\n",
+				nodeid);
+			continue;
+		}
+
+		limit >>= 16;
+		limit++;
+		limit <<= 24;
+
+		if (limit > end)
+			limit = end;
+		if (limit <= base)
+			continue;
+
+		base >>= 16;
+		base <<= 24;
+
+		if (base < start)
+			base = start;
+		if (limit > end)
+			limit = end;
+		if (limit == base) {
+			pr_err("Empty node %d\n", nodeid);
+			continue;
+		}
+		if (limit < base) {
+			pr_err("Node %d bogus settings %Lx-%Lx.\n",
+			       nodeid, base, limit);
+			continue;
+		}
+
+		/* Could sort here, but pun for now. Should not happen anyroads. */
+		if (prevbase > base) {
+			pr_err("Node map not sorted %Lx,%Lx\n",
+			       prevbase, base);
+			return -EINVAL;
+		}
+
+		pr_info("Node %d MemBase %016Lx Limit %016Lx\n",
+			nodeid, base, limit);
+
+		prevbase = base;
+		numa_add_memblk(nodeid, base, limit);
+		node_set(nodeid, numa_nodes_parsed);
+	}
+
+	if (!nodes_weight(numa_nodes_parsed))
+		return -ENOENT;
+
+	/*
+	 * We seem to have valid NUMA configuration.  Map apicids to nodes
+	 * using the coreid bits from early_identify_cpu.
+	 */
+	bits = boot_cpu_data.x86_coreid_bits;
+	cores = 1 << bits;
+	apicid_base = 0;
+
+	/* get the APIC ID of the BSP early for systems with apicid lifting */
+	early_get_boot_cpu_id();
+	if (boot_cpu_physical_apicid > 0) {
+		pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
+		apicid_base = boot_cpu_physical_apicid;
+	}
+
+	for_each_node_mask(i, numa_nodes_parsed)
+		for (j = apicid_base; j < cores + apicid_base; j++)
+			set_apicid_to_node((i << bits) + j, i);
+
+	return 0;
+}
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
new file mode 100644
index 000000000..f0cedf339
--- /dev/null
+++ b/arch/x86/mm/dump_pagetables.c
@@ -0,0 +1,426 @@
+/*
+ * Debug helper to dump the current kernel pagetables of the system
+ * so that we can see what the various memory ranges are set to.
+ *
+ * (C) Copyright 2008 Intel Corporation
+ *
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+
+#include <asm/pgtable.h>
+
+/*
+ * The dumper groups pagetable entries of the same type into one, and for
+ * that it needs to keep some state when walking, and flush this state
+ * when a "break" in the continuity is found.
+ */
+struct pg_state {
+	int level;
+	pgprot_t current_prot;
+	unsigned long start_address;
+	unsigned long current_address;
+	const struct addr_marker *marker;
+	unsigned long lines;
+	bool to_dmesg;
+};
+
+struct addr_marker {
+	unsigned long start_address;
+	const char *name;
+	unsigned long max_lines;
+};
+
+/* indices for address_markers; keep sync'd w/ address_markers below */
+enum address_markers_idx {
+	USER_SPACE_NR = 0,
+#ifdef CONFIG_X86_64
+	KERNEL_SPACE_NR,
+	LOW_KERNEL_NR,
+	VMALLOC_START_NR,
+	VMEMMAP_START_NR,
+# ifdef CONFIG_X86_ESPFIX64
+	ESPFIX_START_NR,
+# endif
+	HIGH_KERNEL_NR,
+	MODULES_VADDR_NR,
+	MODULES_END_NR,
+#else
+	KERNEL_SPACE_NR,
+	VMALLOC_START_NR,
+	VMALLOC_END_NR,
+# ifdef CONFIG_HIGHMEM
+	PKMAP_BASE_NR,
+# endif
+	FIXADDR_START_NR,
+#endif
+};
+
+/* Address space markers hints */
+static struct addr_marker address_markers[] = {
+	{ 0, "User Space" },
+#ifdef CONFIG_X86_64
+	{ 0x8000000000000000UL, "Kernel Space" },
+	{ PAGE_OFFSET,		"Low Kernel Mapping" },
+	{ VMALLOC_START,        "vmalloc() Area" },
+	{ VMEMMAP_START,        "Vmemmap" },
+# ifdef CONFIG_X86_ESPFIX64
+	{ ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },
+# endif
+# ifdef CONFIG_EFI
+	{ EFI_VA_END,		"EFI Runtime Services" },
+# endif
+	{ __START_KERNEL_map,   "High Kernel Mapping" },
+	{ MODULES_VADDR,        "Modules" },
+	{ MODULES_END,          "End Modules" },
+#else
+	{ PAGE_OFFSET,          "Kernel Mapping" },
+	{ 0/* VMALLOC_START */, "vmalloc() Area" },
+	{ 0/*VMALLOC_END*/,     "vmalloc() End" },
+# ifdef CONFIG_HIGHMEM
+	{ 0/*PKMAP_BASE*/,      "Persisent kmap() Area" },
+# endif
+	{ 0/*FIXADDR_START*/,   "Fixmap Area" },
+#endif
+	{ -1, NULL }		/* End of list */
+};
+
+/* Multipliers for offsets within the PTEs */
+#define PTE_LEVEL_MULT (PAGE_SIZE)
+#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
+#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
+#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+
+#define pt_dump_seq_printf(m, to_dmesg, fmt, args...)		\
+({								\
+	if (to_dmesg)					\
+		printk(KERN_INFO fmt, ##args);			\
+	else							\
+		if (m)						\
+			seq_printf(m, fmt, ##args);		\
+})
+
+#define pt_dump_cont_printf(m, to_dmesg, fmt, args...)		\
+({								\
+	if (to_dmesg)					\
+		printk(KERN_CONT fmt, ##args);			\
+	else							\
+		if (m)						\
+			seq_printf(m, fmt, ##args);		\
+})
+
+/*
+ * Print a readable form of a pgprot_t to the seq_file
+ */
+static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
+{
+	pgprotval_t pr = pgprot_val(prot);
+	static const char * const level_name[] =
+		{ "cr3", "pgd", "pud", "pmd", "pte" };
+
+	if (!pgprot_val(prot)) {
+		/* Not present */
+		pt_dump_cont_printf(m, dmsg, "                              ");
+	} else {
+		if (pr & _PAGE_USER)
+			pt_dump_cont_printf(m, dmsg, "USR ");
+		else
+			pt_dump_cont_printf(m, dmsg, "    ");
+		if (pr & _PAGE_RW)
+			pt_dump_cont_printf(m, dmsg, "RW ");
+		else
+			pt_dump_cont_printf(m, dmsg, "ro ");
+		if (pr & _PAGE_PWT)
+			pt_dump_cont_printf(m, dmsg, "PWT ");
+		else
+			pt_dump_cont_printf(m, dmsg, "    ");
+		if (pr & _PAGE_PCD)
+			pt_dump_cont_printf(m, dmsg, "PCD ");
+		else
+			pt_dump_cont_printf(m, dmsg, "    ");
+
+		/* Bit 7 has a different meaning on level 3 vs 4 */
+		if (level <= 3 && pr & _PAGE_PSE)
+			pt_dump_cont_printf(m, dmsg, "PSE ");
+		else
+			pt_dump_cont_printf(m, dmsg, "    ");
+		if ((level == 4 && pr & _PAGE_PAT) ||
+		    ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE))
+			pt_dump_cont_printf(m, dmsg, "pat ");
+		else
+			pt_dump_cont_printf(m, dmsg, "    ");
+		if (pr & _PAGE_GLOBAL)
+			pt_dump_cont_printf(m, dmsg, "GLB ");
+		else
+			pt_dump_cont_printf(m, dmsg, "    ");
+		if (pr & _PAGE_NX)
+			pt_dump_cont_printf(m, dmsg, "NX ");
+		else
+			pt_dump_cont_printf(m, dmsg, "x  ");
+	}
+	pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
+}
+
+/*
+ * On 64 bits, sign-extend the 48 bit address to 64 bit
+ */
+static unsigned long normalize_addr(unsigned long u)
+{
+#ifdef CONFIG_X86_64
+	return (signed long)(u << 16) >> 16;
+#else
+	return u;
+#endif
+}
+
+/*
+ * This function gets called on a break in a continuous series
+ * of PTE entries; the next one is different so we need to
+ * print what we collected so far.
+ */
+static void note_page(struct seq_file *m, struct pg_state *st,
+		      pgprot_t new_prot, int level)
+{
+	pgprotval_t prot, cur;
+	static const char units[] = "BKMGTPE";
+
+	/*
+	 * If we have a "break" in the series, we need to flush the state that
+	 * we have now. "break" is either changing perms, levels or
+	 * address space marker.
+	 */
+	prot = pgprot_val(new_prot) & PTE_FLAGS_MASK;
+	cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK;
+
+	if (!st->level) {
+		/* First entry */
+		st->current_prot = new_prot;
+		st->level = level;
+		st->marker = address_markers;
+		st->lines = 0;
+		pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
+				   st->marker->name);
+	} else if (prot != cur || level != st->level ||
+		   st->current_address >= st->marker[1].start_address) {
+		const char *unit = units;
+		unsigned long delta;
+		int width = sizeof(unsigned long) * 2;
+
+		/*
+		 * Now print the actual finished series
+		 */
+		if (!st->marker->max_lines ||
+		    st->lines < st->marker->max_lines) {
+			pt_dump_seq_printf(m, st->to_dmesg,
+					   "0x%0*lx-0x%0*lx   ",
+					   width, st->start_address,
+					   width, st->current_address);
+
+			delta = st->current_address - st->start_address;
+			while (!(delta & 1023) && unit[1]) {
+				delta >>= 10;
+				unit++;
+			}
+			pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
+					    delta, *unit);
+			printk_prot(m, st->current_prot, st->level,
+				    st->to_dmesg);
+		}
+		st->lines++;
+
+		/*
+		 * We print markers for special areas of address space,
+		 * such as the start of vmalloc space etc.
+		 * This helps in the interpretation.
+		 */
+		if (st->current_address >= st->marker[1].start_address) {
+			if (st->marker->max_lines &&
+			    st->lines > st->marker->max_lines) {
+				unsigned long nskip =
+					st->lines - st->marker->max_lines;
+				pt_dump_seq_printf(m, st->to_dmesg,
+						   "... %lu entr%s skipped ... \n",
+						   nskip,
+						   nskip == 1 ? "y" : "ies");
+			}
+			st->marker++;
+			st->lines = 0;
+			pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
+					   st->marker->name);
+		}
+
+		st->start_address = st->current_address;
+		st->current_prot = new_prot;
+		st->level = level;
+	}
+}
+
+static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
+							unsigned long P)
+{
+	int i;
+	pte_t *start;
+
+	start = (pte_t *) pmd_page_vaddr(addr);
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		pgprot_t prot = pte_pgprot(*start);
+
+		st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
+		note_page(m, st, prot, 4);
+		start++;
+	}
+}
+
+#if PTRS_PER_PMD > 1
+
+static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
+							unsigned long P)
+{
+	int i;
+	pmd_t *start;
+
+	start = (pmd_t *) pud_page_vaddr(addr);
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
+		if (!pmd_none(*start)) {
+			pgprotval_t prot = pmd_val(*start) & PTE_FLAGS_MASK;
+
+			if (pmd_large(*start) || !pmd_present(*start))
+				note_page(m, st, __pgprot(prot), 3);
+			else
+				walk_pte_level(m, st, *start,
+					       P + i * PMD_LEVEL_MULT);
+		} else
+			note_page(m, st, __pgprot(0), 3);
+		start++;
+	}
+}
+
+#else
+#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p)
+#define pud_large(a) pmd_large(__pmd(pud_val(a)))
+#define pud_none(a)  pmd_none(__pmd(pud_val(a)))
+#endif
+
+#if PTRS_PER_PUD > 1
+
+static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
+							unsigned long P)
+{
+	int i;
+	pud_t *start;
+
+	start = (pud_t *) pgd_page_vaddr(addr);
+
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
+		if (!pud_none(*start)) {
+			pgprotval_t prot = pud_val(*start) & PTE_FLAGS_MASK;
+
+			if (pud_large(*start) || !pud_present(*start))
+				note_page(m, st, __pgprot(prot), 2);
+			else
+				walk_pmd_level(m, st, *start,
+					       P + i * PUD_LEVEL_MULT);
+		} else
+			note_page(m, st, __pgprot(0), 2);
+
+		start++;
+	}
+}
+
+#else
+#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p)
+#define pgd_large(a) pud_large(__pud(pgd_val(a)))
+#define pgd_none(a)  pud_none(__pud(pgd_val(a)))
+#endif
+
+void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
+{
+#ifdef CONFIG_X86_64
+	pgd_t *start = (pgd_t *) &init_level4_pgt;
+#else
+	pgd_t *start = swapper_pg_dir;
+#endif
+	int i;
+	struct pg_state st = {};
+
+	if (pgd) {
+		start = pgd;
+		st.to_dmesg = true;
+	}
+
+	for (i = 0; i < PTRS_PER_PGD; i++) {
+		st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
+		if (!pgd_none(*start)) {
+			pgprotval_t prot = pgd_val(*start) & PTE_FLAGS_MASK;
+
+			if (pgd_large(*start) || !pgd_present(*start))
+				note_page(m, &st, __pgprot(prot), 1);
+			else
+				walk_pud_level(m, &st, *start,
+					       i * PGD_LEVEL_MULT);
+		} else
+			note_page(m, &st, __pgprot(0), 1);
+
+		start++;
+	}
+
+	/* Flush out the last page */
+	st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
+	note_page(m, &st, __pgprot(0), 0);
+}
+
+static int ptdump_show(struct seq_file *m, void *v)
+{
+	ptdump_walk_pgd_level(m, NULL);
+	return 0;
+}
+
+static int ptdump_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, ptdump_show, NULL);
+}
+
+static const struct file_operations ptdump_fops = {
+	.open		= ptdump_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int pt_dump_init(void)
+{
+	struct dentry *pe;
+
+#ifdef CONFIG_X86_32
+	/* Not a compile-time constant on x86-32 */
+	address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
+	address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
+# ifdef CONFIG_HIGHMEM
+	address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
+# endif
+	address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
+#endif
+
+	pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL,
+				 &ptdump_fops);
+	if (!pe)
+		return -ENOMEM;
+
+	return 0;
+}
+
+__initcall(pt_dump_init);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
+MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables");
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
new file mode 100644
index 000000000..903ec1e9c
--- /dev/null
+++ b/arch/x86/mm/extable.c
@@ -0,0 +1,169 @@
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/sort.h>
+#include <asm/uaccess.h>
+
+static inline unsigned long
+ex_insn_addr(const struct exception_table_entry *x)
+{
+	return (unsigned long)&x->insn + x->insn;
+}
+static inline unsigned long
+ex_fixup_addr(const struct exception_table_entry *x)
+{
+	return (unsigned long)&x->fixup + x->fixup;
+}
+
+int fixup_exception(struct pt_regs *regs)
+{
+	const struct exception_table_entry *fixup;
+	unsigned long new_ip;
+
+#ifdef CONFIG_PNPBIOS
+	if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
+		extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
+		extern u32 pnp_bios_is_utter_crap;
+		pnp_bios_is_utter_crap = 1;
+		printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
+		__asm__ volatile(
+			"movl %0, %%esp\n\t"
+			"jmp *%1\n\t"
+			: : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
+		panic("do_trap: can't hit this");
+	}
+#endif
+
+	fixup = search_exception_tables(regs->ip);
+	if (fixup) {
+		new_ip = ex_fixup_addr(fixup);
+
+		if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
+			/* Special hack for uaccess_err */
+			current_thread_info()->uaccess_err = 1;
+			new_ip -= 0x7ffffff0;
+		}
+		regs->ip = new_ip;
+		return 1;
+	}
+
+	return 0;
+}
+
+/* Restricted version used during very early boot */
+int __init early_fixup_exception(unsigned long *ip)
+{
+	const struct exception_table_entry *fixup;
+	unsigned long new_ip;
+
+	fixup = search_exception_tables(*ip);
+	if (fixup) {
+		new_ip = ex_fixup_addr(fixup);
+
+		if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
+			/* uaccess handling not supported during early boot */
+			return 0;
+		}
+
+		*ip = new_ip;
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Search one exception table for an entry corresponding to the
+ * given instruction address, and return the address of the entry,
+ * or NULL if none is found.
+ * We use a binary search, and thus we assume that the table is
+ * already sorted.
+ */
+const struct exception_table_entry *
+search_extable(const struct exception_table_entry *first,
+	       const struct exception_table_entry *last,
+	       unsigned long value)
+{
+	while (first <= last) {
+		const struct exception_table_entry *mid;
+		unsigned long addr;
+
+		mid = ((last - first) >> 1) + first;
+		addr = ex_insn_addr(mid);
+		if (addr < value)
+			first = mid + 1;
+		else if (addr > value)
+			last = mid - 1;
+		else
+			return mid;
+        }
+        return NULL;
+}
+
+/*
+ * The exception table needs to be sorted so that the binary
+ * search that we use to find entries in it works properly.
+ * This is used both for the kernel exception table and for
+ * the exception tables of modules that get loaded.
+ *
+ */
+static int cmp_ex(const void *a, const void *b)
+{
+	const struct exception_table_entry *x = a, *y = b;
+
+	/*
+	 * This value will always end up fittin in an int, because on
+	 * both i386 and x86-64 the kernel symbol-reachable address
+	 * space is < 2 GiB.
+	 *
+	 * This compare is only valid after normalization.
+	 */
+	return x->insn - y->insn;
+}
+
+void sort_extable(struct exception_table_entry *start,
+		  struct exception_table_entry *finish)
+{
+	struct exception_table_entry *p;
+	int i;
+
+	/* Convert all entries to being relative to the start of the section */
+	i = 0;
+	for (p = start; p < finish; p++) {
+		p->insn += i;
+		i += 4;
+		p->fixup += i;
+		i += 4;
+	}
+
+	sort(start, finish - start, sizeof(struct exception_table_entry),
+	     cmp_ex, NULL);
+
+	/* Denormalize all entries */
+	i = 0;
+	for (p = start; p < finish; p++) {
+		p->insn -= i;
+		i += 4;
+		p->fixup -= i;
+		i += 4;
+	}
+}
+
+#ifdef CONFIG_MODULES
+/*
+ * If the exception table is sorted, any referring to the module init
+ * will be at the beginning or the end.
+ */
+void trim_init_extable(struct module *m)
+{
+	/*trim the beginning*/
+	while (m->num_exentries &&
+	       within_module_init(ex_insn_addr(&m->extable[0]), m)) {
+		m->extable++;
+		m->num_exentries--;
+	}
+	/*trim the end*/
+	while (m->num_exentries &&
+	       within_module_init(ex_insn_addr(&m->extable[m->num_exentries-1]), m))
+		m->num_exentries--;
+}
+#endif /* CONFIG_MODULES */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
new file mode 100644
index 000000000..626899af4
--- /dev/null
+++ b/arch/x86/mm/fault.c
@@ -0,0 +1,1436 @@
+/*
+ *  Copyright (C) 1995  Linus Torvalds
+ *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
+ *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
+ */
+#include <linux/sched.h>		/* test_thread_flag(), ...	*/
+#include <linux/kdebug.h>		/* oops_begin/end, ...		*/
+#include <linux/module.h>		/* search_exception_table	*/
+#include <linux/bootmem.h>		/* max_low_pfn			*/
+#include <linux/kprobes.h>		/* NOKPROBE_SYMBOL, ...		*/
+#include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
+#include <linux/perf_event.h>		/* perf_sw_event		*/
+#include <linux/hugetlb.h>		/* hstate_index_to_shift	*/
+#include <linux/prefetch.h>		/* prefetchw			*/
+#include <linux/context_tracking.h>	/* exception_enter(), ...	*/
+#include <linux/tuxonice.h>             /* incremental image support    */
+
+#include <asm/traps.h>			/* dotraplinkage, ...		*/
+#include <asm/pgalloc.h>		/* pgd_*(), ...			*/
+#include <asm/kmemcheck.h>		/* kmemcheck_*(), ...		*/
+#include <asm/fixmap.h>			/* VSYSCALL_ADDR		*/
+#include <asm/vsyscall.h>		/* emulate_vsyscall		*/
+
+#define CREATE_TRACE_POINTS
+#include <asm/trace/exceptions.h>
+
+/*
+ * Page fault error code bits:
+ *
+ *   bit 0 ==	 0: no page found	1: protection fault
+ *   bit 1 ==	 0: read access		1: write access
+ *   bit 2 ==	 0: kernel-mode access	1: user-mode access
+ *   bit 3 ==				1: use of reserved bit detected
+ *   bit 4 ==				1: fault was an instruction fetch
+ */
+enum x86_pf_error_code {
+
+	PF_PROT		=		1 << 0,
+	PF_WRITE	=		1 << 1,
+	PF_USER		=		1 << 2,
+	PF_RSVD		=		1 << 3,
+	PF_INSTR	=		1 << 4,
+};
+
+/*
+ * Returns 0 if mmiotrace is disabled, or if the fault is not
+ * handled by mmiotrace:
+ */
+static nokprobe_inline int
+kmmio_fault(struct pt_regs *regs, unsigned long addr)
+{
+	if (unlikely(is_kmmio_active()))
+		if (kmmio_handler(regs, addr) == 1)
+			return -1;
+	return 0;
+}
+
+static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
+{
+	int ret = 0;
+
+	/* kprobe_running() needs smp_processor_id() */
+	if (kprobes_built_in() && !user_mode(regs)) {
+		preempt_disable();
+		if (kprobe_running() && kprobe_fault_handler(regs, 14))
+			ret = 1;
+		preempt_enable();
+	}
+
+	return ret;
+}
+
+/*
+ * Prefetch quirks:
+ *
+ * 32-bit mode:
+ *
+ *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
+ *   Check that here and ignore it.
+ *
+ * 64-bit mode:
+ *
+ *   Sometimes the CPU reports invalid exceptions on prefetch.
+ *   Check that here and ignore it.
+ *
+ * Opcode checker based on code by Richard Brunner.
+ */
+static inline int
+check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
+		      unsigned char opcode, int *prefetch)
+{
+	unsigned char instr_hi = opcode & 0xf0;
+	unsigned char instr_lo = opcode & 0x0f;
+
+	switch (instr_hi) {
+	case 0x20:
+	case 0x30:
+		/*
+		 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
+		 * In X86_64 long mode, the CPU will signal invalid
+		 * opcode if some of these prefixes are present so
+		 * X86_64 will never get here anyway
+		 */
+		return ((instr_lo & 7) == 0x6);
+#ifdef CONFIG_X86_64
+	case 0x40:
+		/*
+		 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
+		 * Need to figure out under what instruction mode the
+		 * instruction was issued. Could check the LDT for lm,
+		 * but for now it's good enough to assume that long
+		 * mode only uses well known segments or kernel.
+		 */
+		return (!user_mode(regs) || user_64bit_mode(regs));
+#endif
+	case 0x60:
+		/* 0x64 thru 0x67 are valid prefixes in all modes. */
+		return (instr_lo & 0xC) == 0x4;
+	case 0xF0:
+		/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
+		return !instr_lo || (instr_lo>>1) == 1;
+	case 0x00:
+		/* Prefetch instruction is 0x0F0D or 0x0F18 */
+		if (probe_kernel_address(instr, opcode))
+			return 0;
+
+		*prefetch = (instr_lo == 0xF) &&
+			(opcode == 0x0D || opcode == 0x18);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+static int
+is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
+{
+	unsigned char *max_instr;
+	unsigned char *instr;
+	int prefetch = 0;
+
+	/*
+	 * If it was a exec (instruction fetch) fault on NX page, then
+	 * do not ignore the fault:
+	 */
+	if (error_code & PF_INSTR)
+		return 0;
+
+	instr = (void *)convert_ip_to_linear(current, regs);
+	max_instr = instr + 15;
+
+	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
+		return 0;
+
+	while (instr < max_instr) {
+		unsigned char opcode;
+
+		if (probe_kernel_address(instr, opcode))
+			break;
+
+		instr++;
+
+		if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
+			break;
+	}
+	return prefetch;
+}
+
+static void
+force_sig_info_fault(int si_signo, int si_code, unsigned long address,
+		     struct task_struct *tsk, int fault)
+{
+	unsigned lsb = 0;
+	siginfo_t info;
+
+	info.si_signo	= si_signo;
+	info.si_errno	= 0;
+	info.si_code	= si_code;
+	info.si_addr	= (void __user *)address;
+	if (fault & VM_FAULT_HWPOISON_LARGE)
+		lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); 
+	if (fault & VM_FAULT_HWPOISON)
+		lsb = PAGE_SHIFT;
+	info.si_addr_lsb = lsb;
+
+	force_sig_info(si_signo, &info, tsk);
+}
+
+DEFINE_SPINLOCK(pgd_lock);
+LIST_HEAD(pgd_list);
+
+#ifdef CONFIG_X86_32
+static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
+{
+	unsigned index = pgd_index(address);
+	pgd_t *pgd_k;
+	pud_t *pud, *pud_k;
+	pmd_t *pmd, *pmd_k;
+
+	pgd += index;
+	pgd_k = init_mm.pgd + index;
+
+	if (!pgd_present(*pgd_k))
+		return NULL;
+
+	/*
+	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
+	 * and redundant with the set_pmd() on non-PAE. As would
+	 * set_pud.
+	 */
+	pud = pud_offset(pgd, address);
+	pud_k = pud_offset(pgd_k, address);
+	if (!pud_present(*pud_k))
+		return NULL;
+
+	pmd = pmd_offset(pud, address);
+	pmd_k = pmd_offset(pud_k, address);
+	if (!pmd_present(*pmd_k))
+		return NULL;
+
+	if (!pmd_present(*pmd))
+		set_pmd(pmd, *pmd_k);
+	else
+		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
+
+	return pmd_k;
+}
+
+void vmalloc_sync_all(void)
+{
+	unsigned long address;
+
+	if (SHARED_KERNEL_PMD)
+		return;
+
+	for (address = VMALLOC_START & PMD_MASK;
+	     address >= TASK_SIZE && address < FIXADDR_TOP;
+	     address += PMD_SIZE) {
+		struct page *page;
+
+		spin_lock(&pgd_lock);
+		list_for_each_entry(page, &pgd_list, lru) {
+			spinlock_t *pgt_lock;
+			pmd_t *ret;
+
+			/* the pgt_lock only for Xen */
+			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+
+			spin_lock(pgt_lock);
+			ret = vmalloc_sync_one(page_address(page), address);
+			spin_unlock(pgt_lock);
+
+			if (!ret)
+				break;
+		}
+		spin_unlock(&pgd_lock);
+	}
+}
+
+/*
+ * 32-bit:
+ *
+ *   Handle a fault on the vmalloc or module mapping area
+ */
+static noinline int vmalloc_fault(unsigned long address)
+{
+	unsigned long pgd_paddr;
+	pmd_t *pmd_k;
+	pte_t *pte_k;
+
+	/* Make sure we are in vmalloc area: */
+	if (!(address >= VMALLOC_START && address < VMALLOC_END))
+		return -1;
+
+	WARN_ON_ONCE(in_nmi());
+
+	/*
+	 * Synchronize this task's top level page-table
+	 * with the 'reference' page table.
+	 *
+	 * Do _not_ use "current" here. We might be inside
+	 * an interrupt in the middle of a task switch..
+	 */
+	pgd_paddr = read_cr3();
+	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
+	if (!pmd_k)
+		return -1;
+
+	pte_k = pte_offset_kernel(pmd_k, address);
+	if (!pte_present(*pte_k))
+		return -1;
+
+	return 0;
+}
+NOKPROBE_SYMBOL(vmalloc_fault);
+
+/*
+ * Did it hit the DOS screen memory VA from vm86 mode?
+ */
+static inline void
+check_v8086_mode(struct pt_regs *regs, unsigned long address,
+		 struct task_struct *tsk)
+{
+	unsigned long bit;
+
+	if (!v8086_mode(regs))
+		return;
+
+	bit = (address - 0xA0000) >> PAGE_SHIFT;
+	if (bit < 32)
+		tsk->thread.screen_bitmap |= 1 << bit;
+}
+
+static bool low_pfn(unsigned long pfn)
+{
+	return pfn < max_low_pfn;
+}
+
+static void dump_pagetable(unsigned long address)
+{
+	pgd_t *base = __va(read_cr3());
+	pgd_t *pgd = &base[pgd_index(address)];
+	pmd_t *pmd;
+	pte_t *pte;
+
+#ifdef CONFIG_X86_PAE
+	printk("*pdpt = %016Lx ", pgd_val(*pgd));
+	if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
+		goto out;
+#endif
+	pmd = pmd_offset(pud_offset(pgd, address), address);
+	printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
+
+	/*
+	 * We must not directly access the pte in the highpte
+	 * case if the page table is located in highmem.
+	 * And let's rather not kmap-atomic the pte, just in case
+	 * it's allocated already:
+	 */
+	if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
+		goto out;
+
+	pte = pte_offset_kernel(pmd, address);
+	printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
+out:
+	printk("\n");
+}
+
+#else /* CONFIG_X86_64: */
+
+void vmalloc_sync_all(void)
+{
+	sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0);
+}
+
+/*
+ * 64-bit:
+ *
+ *   Handle a fault on the vmalloc area
+ *
+ * This assumes no large pages in there.
+ */
+static noinline int vmalloc_fault(unsigned long address)
+{
+	pgd_t *pgd, *pgd_ref;
+	pud_t *pud, *pud_ref;
+	pmd_t *pmd, *pmd_ref;
+	pte_t *pte, *pte_ref;
+
+	/* Make sure we are in vmalloc area: */
+	if (!(address >= VMALLOC_START && address < VMALLOC_END))
+		return -1;
+
+	WARN_ON_ONCE(in_nmi());
+
+	/*
+	 * Copy kernel mappings over when needed. This can also
+	 * happen within a race in page table update. In the later
+	 * case just flush:
+	 */
+	pgd = pgd_offset(current->active_mm, address);
+	pgd_ref = pgd_offset_k(address);
+	if (pgd_none(*pgd_ref))
+		return -1;
+
+	if (pgd_none(*pgd)) {
+		set_pgd(pgd, *pgd_ref);
+		arch_flush_lazy_mmu_mode();
+	} else {
+		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+	}
+
+	/*
+	 * Below here mismatches are bugs because these lower tables
+	 * are shared:
+	 */
+
+	pud = pud_offset(pgd, address);
+	pud_ref = pud_offset(pgd_ref, address);
+	if (pud_none(*pud_ref))
+		return -1;
+
+	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
+		BUG();
+
+	pmd = pmd_offset(pud, address);
+	pmd_ref = pmd_offset(pud_ref, address);
+	if (pmd_none(*pmd_ref))
+		return -1;
+
+	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
+		BUG();
+
+	pte_ref = pte_offset_kernel(pmd_ref, address);
+	if (!pte_present(*pte_ref))
+		return -1;
+
+	pte = pte_offset_kernel(pmd, address);
+
+	/*
+	 * Don't use pte_page here, because the mappings can point
+	 * outside mem_map, and the NUMA hash lookup cannot handle
+	 * that:
+	 */
+	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
+		BUG();
+
+	return 0;
+}
+NOKPROBE_SYMBOL(vmalloc_fault);
+
+#ifdef CONFIG_CPU_SUP_AMD
+static const char errata93_warning[] =
+KERN_ERR 
+"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
+"******* Working around it, but it may cause SEGVs or burn power.\n"
+"******* Please consider a BIOS update.\n"
+"******* Disabling USB legacy in the BIOS may also help.\n";
+#endif
+
+/*
+ * No vm86 mode in 64-bit mode:
+ */
+static inline void
+check_v8086_mode(struct pt_regs *regs, unsigned long address,
+		 struct task_struct *tsk)
+{
+}
+
+static int bad_address(void *p)
+{
+	unsigned long dummy;
+
+	return probe_kernel_address((unsigned long *)p, dummy);
+}
+
+static void dump_pagetable(unsigned long address)
+{
+	pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+	pgd_t *pgd = base + pgd_index(address);
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	if (bad_address(pgd))
+		goto bad;
+
+	printk("PGD %lx ", pgd_val(*pgd));
+
+	if (!pgd_present(*pgd))
+		goto out;
+
+	pud = pud_offset(pgd, address);
+	if (bad_address(pud))
+		goto bad;
+
+	printk("PUD %lx ", pud_val(*pud));
+	if (!pud_present(*pud) || pud_large(*pud))
+		goto out;
+
+	pmd = pmd_offset(pud, address);
+	if (bad_address(pmd))
+		goto bad;
+
+	printk("PMD %lx ", pmd_val(*pmd));
+	if (!pmd_present(*pmd) || pmd_large(*pmd))
+		goto out;
+
+	pte = pte_offset_kernel(pmd, address);
+	if (bad_address(pte))
+		goto bad;
+
+	printk("PTE %lx", pte_val(*pte));
+out:
+	printk("\n");
+	return;
+bad:
+	printk("BAD\n");
+}
+
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Workaround for K8 erratum #93 & buggy BIOS.
+ *
+ * BIOS SMM functions are required to use a specific workaround
+ * to avoid corruption of the 64bit RIP register on C stepping K8.
+ *
+ * A lot of BIOS that didn't get tested properly miss this.
+ *
+ * The OS sees this as a page fault with the upper 32bits of RIP cleared.
+ * Try to work around it here.
+ *
+ * Note we only handle faults in kernel here.
+ * Does nothing on 32-bit.
+ */
+static int is_errata93(struct pt_regs *regs, unsigned long address)
+{
+#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
+	    || boot_cpu_data.x86 != 0xf)
+		return 0;
+
+	if (address != regs->ip)
+		return 0;
+
+	if ((address >> 32) != 0)
+		return 0;
+
+	address |= 0xffffffffUL << 32;
+	if ((address >= (u64)_stext && address <= (u64)_etext) ||
+	    (address >= MODULES_VADDR && address <= MODULES_END)) {
+		printk_once(errata93_warning);
+		regs->ip = address;
+		return 1;
+	}
+#endif
+	return 0;
+}
+
+/*
+ * Work around K8 erratum #100 K8 in compat mode occasionally jumps
+ * to illegal addresses >4GB.
+ *
+ * We catch this in the page fault handler because these addresses
+ * are not reachable. Just detect this case and return.  Any code
+ * segment in LDT is compatibility mode.
+ */
+static int is_errata100(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_64
+	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
+		return 1;
+#endif
+	return 0;
+}
+
+static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
+{
+#ifdef CONFIG_X86_F00F_BUG
+	unsigned long nr;
+
+	/*
+	 * Pentium F0 0F C7 C8 bug workaround:
+	 */
+	if (boot_cpu_has_bug(X86_BUG_F00F)) {
+		nr = (address - idt_descr.address) >> 3;
+
+		if (nr == 6) {
+			do_invalid_op(regs, 0);
+			return 1;
+		}
+	}
+#endif
+	return 0;
+}
+
+static const char nx_warning[] = KERN_CRIT
+"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
+static const char smep_warning[] = KERN_CRIT
+"unable to execute userspace code (SMEP?) (uid: %d)\n";
+
+static void
+show_fault_oops(struct pt_regs *regs, unsigned long error_code,
+		unsigned long address)
+{
+	if (!oops_may_print())
+		return;
+
+	if (error_code & PF_INSTR) {
+		unsigned int level;
+		pgd_t *pgd;
+		pte_t *pte;
+
+		pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
+		pgd += pgd_index(address);
+
+		pte = lookup_address_in_pgd(pgd, address, &level);
+
+		if (pte && pte_present(*pte) && !pte_exec(*pte))
+			printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
+		if (pte && pte_present(*pte) && pte_exec(*pte) &&
+				(pgd_flags(*pgd) & _PAGE_USER) &&
+				(__read_cr4() & X86_CR4_SMEP))
+			printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
+	}
+
+	printk(KERN_ALERT "BUG: unable to handle kernel ");
+	if (address < PAGE_SIZE)
+		printk(KERN_CONT "NULL pointer dereference");
+	else
+		printk(KERN_CONT "paging request");
+
+	printk(KERN_CONT " at %p\n", (void *) address);
+	printk(KERN_ALERT "IP:");
+	printk_address(regs->ip);
+
+	dump_pagetable(address);
+}
+
+static noinline void
+pgtable_bad(struct pt_regs *regs, unsigned long error_code,
+	    unsigned long address)
+{
+	struct task_struct *tsk;
+	unsigned long flags;
+	int sig;
+
+	flags = oops_begin();
+	tsk = current;
+	sig = SIGKILL;
+
+	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
+	       tsk->comm, address);
+	dump_pagetable(address);
+
+	tsk->thread.cr2		= address;
+	tsk->thread.trap_nr	= X86_TRAP_PF;
+	tsk->thread.error_code	= error_code;
+
+	if (__die("Bad pagetable", regs, error_code))
+		sig = 0;
+
+	oops_end(flags, regs, sig);
+}
+
+static noinline void
+no_context(struct pt_regs *regs, unsigned long error_code,
+	   unsigned long address, int signal, int si_code)
+{
+	struct task_struct *tsk = current;
+	unsigned long flags;
+	int sig;
+
+        if (toi_make_writable(init_mm.pgd, address)) {
+            return;
+        }
+
+	/* Are we prepared to handle this kernel fault? */
+	if (fixup_exception(regs)) {
+		/*
+		 * Any interrupt that takes a fault gets the fixup. This makes
+		 * the below recursive fault logic only apply to a faults from
+		 * task context.
+		 */
+		if (in_interrupt())
+			return;
+
+		/*
+		 * Per the above we're !in_interrupt(), aka. task context.
+		 *
+		 * In this case we need to make sure we're not recursively
+		 * faulting through the emulate_vsyscall() logic.
+		 */
+		if (current_thread_info()->sig_on_uaccess_error && signal) {
+			tsk->thread.trap_nr = X86_TRAP_PF;
+			tsk->thread.error_code = error_code | PF_USER;
+			tsk->thread.cr2 = address;
+
+			/* XXX: hwpoison faults will set the wrong code. */
+			force_sig_info_fault(signal, si_code, address, tsk, 0);
+		}
+
+		/*
+		 * Barring that, we can do the fixup and be happy.
+		 */
+		return;
+	}
+
+	/*
+	 * 32-bit:
+	 *
+	 *   Valid to do another page fault here, because if this fault
+	 *   had been triggered by is_prefetch fixup_exception would have
+	 *   handled it.
+	 *
+	 * 64-bit:
+	 *
+	 *   Hall of shame of CPU/BIOS bugs.
+	 */
+	if (is_prefetch(regs, error_code, address))
+		return;
+
+	if (is_errata93(regs, address))
+		return;
+
+	/*
+	 * Oops. The kernel tried to access some bad page. We'll have to
+	 * terminate things with extreme prejudice:
+	 */
+	flags = oops_begin();
+
+	show_fault_oops(regs, error_code, address);
+
+	if (task_stack_end_corrupted(tsk))
+		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+
+	tsk->thread.cr2		= address;
+	tsk->thread.trap_nr	= X86_TRAP_PF;
+	tsk->thread.error_code	= error_code;
+
+	sig = SIGKILL;
+	if (__die("Oops", regs, error_code))
+		sig = 0;
+
+	/* Executive summary in case the body of the oops scrolled away */
+	printk(KERN_DEFAULT "CR2: %016lx\n", address);
+
+	oops_end(flags, regs, sig);
+}
+
+/*
+ * Print out info about fatal segfaults, if the show_unhandled_signals
+ * sysctl is set:
+ */
+static inline void
+show_signal_msg(struct pt_regs *regs, unsigned long error_code,
+		unsigned long address, struct task_struct *tsk)
+{
+	if (!unhandled_signal(tsk, SIGSEGV))
+		return;
+
+	if (!printk_ratelimit())
+		return;
+
+	printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+		task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
+		tsk->comm, task_pid_nr(tsk), address,
+		(void *)regs->ip, (void *)regs->sp, error_code);
+
+	print_vma_addr(KERN_CONT " in ", regs->ip);
+
+	printk(KERN_CONT "\n");
+}
+
+static void
+__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
+		       unsigned long address, int si_code)
+{
+	struct task_struct *tsk = current;
+
+	/* User mode accesses just cause a SIGSEGV */
+	if (error_code & PF_USER) {
+		/*
+		 * It's possible to have interrupts off here:
+		 */
+		local_irq_enable();
+
+		/*
+		 * Valid to do another page fault here because this one came
+		 * from user space:
+		 */
+		if (is_prefetch(regs, error_code, address))
+			return;
+
+		if (is_errata100(regs, address))
+			return;
+
+#ifdef CONFIG_X86_64
+		/*
+		 * Instruction fetch faults in the vsyscall page might need
+		 * emulation.
+		 */
+		if (unlikely((error_code & PF_INSTR) &&
+			     ((address & ~0xfff) == VSYSCALL_ADDR))) {
+			if (emulate_vsyscall(regs, address))
+				return;
+		}
+#endif
+		/* Kernel addresses are always protection faults: */
+		if (address >= TASK_SIZE)
+			error_code |= PF_PROT;
+
+		if (likely(show_unhandled_signals))
+			show_signal_msg(regs, error_code, address, tsk);
+
+		tsk->thread.cr2		= address;
+		tsk->thread.error_code	= error_code;
+		tsk->thread.trap_nr	= X86_TRAP_PF;
+
+		force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
+
+		return;
+	}
+
+	if (is_f00f_bug(regs, address))
+		return;
+
+	no_context(regs, error_code, address, SIGSEGV, si_code);
+}
+
+static noinline void
+bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
+		     unsigned long address)
+{
+	__bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
+}
+
+static void
+__bad_area(struct pt_regs *regs, unsigned long error_code,
+	   unsigned long address, int si_code)
+{
+	struct mm_struct *mm = current->mm;
+
+	/*
+	 * Something tried to access memory that isn't in our memory map..
+	 * Fix it, but check if it's kernel or user first..
+	 */
+	up_read(&mm->mmap_sem);
+
+	__bad_area_nosemaphore(regs, error_code, address, si_code);
+}
+
+static noinline void
+bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+{
+	__bad_area(regs, error_code, address, SEGV_MAPERR);
+}
+
+static noinline void
+bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
+		      unsigned long address)
+{
+	__bad_area(regs, error_code, address, SEGV_ACCERR);
+}
+
+static void
+do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
+	  unsigned int fault)
+{
+	struct task_struct *tsk = current;
+	int code = BUS_ADRERR;
+
+	/* Kernel mode? Handle exceptions or die: */
+	if (!(error_code & PF_USER)) {
+		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+		return;
+	}
+
+	/* User-space => ok to do another page fault: */
+	if (is_prefetch(regs, error_code, address))
+		return;
+
+	tsk->thread.cr2		= address;
+	tsk->thread.error_code	= error_code;
+	tsk->thread.trap_nr	= X86_TRAP_PF;
+
+#ifdef CONFIG_MEMORY_FAILURE
+	if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
+		printk(KERN_ERR
+	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
+			tsk->comm, tsk->pid, address);
+		code = BUS_MCEERR_AR;
+	}
+#endif
+	force_sig_info_fault(SIGBUS, code, address, tsk, fault);
+}
+
+static noinline void
+mm_fault_error(struct pt_regs *regs, unsigned long error_code,
+	       unsigned long address, unsigned int fault)
+{
+	if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
+		no_context(regs, error_code, address, 0, 0);
+		return;
+	}
+
+	if (fault & VM_FAULT_OOM) {
+		/* Kernel mode? Handle exceptions or die: */
+		if (!(error_code & PF_USER)) {
+			no_context(regs, error_code, address,
+				   SIGSEGV, SEGV_MAPERR);
+			return;
+		}
+
+		/*
+		 * We ran out of memory, call the OOM killer, and return the
+		 * userspace (which will retry the fault, or kill us if we got
+		 * oom-killed):
+		 */
+		pagefault_out_of_memory();
+	} else {
+		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
+			     VM_FAULT_HWPOISON_LARGE))
+			do_sigbus(regs, error_code, address, fault);
+		else if (fault & VM_FAULT_SIGSEGV)
+			bad_area_nosemaphore(regs, error_code, address);
+		else
+			BUG();
+	}
+}
+
+#ifdef CONFIG_TOI_INCREMENTAL
+/**
+ * _toi_do_cbw - Do a copy-before-write before letting the faulting process continue
+ */
+static void toi_do_cbw(struct page *page)
+{
+    struct toi_cbw_state *state = this_cpu_ptr(&toi_cbw_states);
+
+    state->active = 1;
+    wmb();
+
+    if (state->enabled && state->next && PageTOI_CBW(page)) {
+        struct toi_cbw *this = state->next;
+        memcpy(this->virt, page_address(page), PAGE_SIZE);
+        this->pfn = page_to_pfn(page);
+        state->next = this->next;
+    }
+
+    state->active = 0;
+}
+
+/**
+ * _toi_make_writable - Defuse TOI's write protection
+ */
+int _toi_make_writable(pte_t *pte)
+{
+    struct page *page = pte_page(*pte);
+    if (PageTOI_RO(page)) {
+        pgd_t *pgd = __va(read_cr3());
+        /*
+         * If this is a TuxOnIce caused fault, we may not have permission to
+         * write to a page needed to reset the permissions of the original
+         * page. Use swapper_pg_dir to get around this.
+         */
+        load_cr3(swapper_pg_dir);
+
+        set_pte_atomic(pte, pte_mkwrite(*pte));
+        SetPageTOI_Dirty(page);
+        ClearPageTOI_RO(page);
+
+        toi_do_cbw(page);
+
+        load_cr3(pgd);
+        return 1;
+    }
+    return 0;
+}
+
+/**
+ * toi_make_writable - Handle a (potential) fault caused by TOI's write protection
+ *
+ * Make a page writable that was protected. Might be because of a fault, or
+ * because we're allocating it and want it to be untracked.
+ *
+ * Note that in the fault handling case, we don't care about the error code. If
+ * called from the double fault handler, we won't have one. We just check to
+ * see if the page was made RO by TOI, and mark it dirty/release the protection
+ * if it was.
+ */
+int toi_make_writable(pgd_t *pgd, unsigned long address)
+{
+    pud_t *pud;
+    pmd_t *pmd;
+    pte_t *pte;
+
+    pgd = pgd + pgd_index(address);
+    if (!pgd_present(*pgd))
+        return 0;
+
+    pud = pud_offset(pgd, address);
+    if (!pud_present(*pud))
+        return 0;
+
+    if (pud_large(*pud))
+        return _toi_make_writable((pte_t *) pud);
+
+    pmd = pmd_offset(pud, address);
+    if (!pmd_present(*pmd))
+        return 0;
+
+    if (pmd_large(*pmd))
+        return _toi_make_writable((pte_t *) pmd);
+
+    pte = pte_offset_kernel(pmd, address);
+    if (!pte_present(*pte))
+        return 0;
+
+    return _toi_make_writable(pte);
+}
+#endif
+
+static int spurious_fault_check(unsigned long error_code, pte_t *pte)
+{
+	if ((error_code & PF_WRITE) && !pte_write(*pte))
+                return 0;
+
+	if ((error_code & PF_INSTR) && !pte_exec(*pte))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Handle a spurious fault caused by a stale TLB entry.
+ *
+ * This allows us to lazily refresh the TLB when increasing the
+ * permissions of a kernel page (RO -> RW or NX -> X).  Doing it
+ * eagerly is very expensive since that implies doing a full
+ * cross-processor TLB flush, even if no stale TLB entries exist
+ * on other processors.
+ *
+ * Spurious faults may only occur if the TLB contains an entry with
+ * fewer permission than the page table entry.  Non-present (P = 0)
+ * and reserved bit (R = 1) faults are never spurious.
+ *
+ * There are no security implications to leaving a stale TLB when
+ * increasing the permissions on a page.
+ *
+ * Returns non-zero if a spurious fault was handled, zero otherwise.
+ *
+ * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
+ * (Optional Invalidation).
+ */
+static noinline int
+spurious_fault(unsigned long error_code, unsigned long address)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	int ret;
+
+	/*
+	 * Only writes to RO or instruction fetches from NX may cause
+	 * spurious faults.
+	 *
+	 * These could be from user or supervisor accesses but the TLB
+	 * is only lazily flushed after a kernel mapping protection
+	 * change, so user accesses are not expected to cause spurious
+	 * faults.
+	 */
+	if (error_code != (PF_WRITE | PF_PROT)
+	    && error_code != (PF_INSTR | PF_PROT))
+		return 0;
+
+	pgd = init_mm.pgd + pgd_index(address);
+	if (!pgd_present(*pgd))
+		return 0;
+
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		return 0;
+
+	if (pud_large(*pud))
+		return spurious_fault_check(error_code, (pte_t *) pud);
+
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd))
+		return 0;
+
+	if (pmd_large(*pmd))
+		return spurious_fault_check(error_code, (pte_t *) pmd);
+
+	pte = pte_offset_kernel(pmd, address);
+	if (!pte_present(*pte))
+		return 0;
+
+	ret = spurious_fault_check(error_code, pte);
+	if (!ret)
+		return 0;
+
+	/*
+	 * Make sure we have permissions in PMD.
+	 * If not, then there's a bug in the page tables:
+	 */
+	ret = spurious_fault_check(error_code, (pte_t *) pmd);
+	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
+
+	return ret;
+}
+NOKPROBE_SYMBOL(spurious_fault);
+
+int show_unhandled_signals = 1;
+
+static inline int
+access_error(unsigned long error_code, struct vm_area_struct *vma)
+{
+	if (error_code & PF_WRITE) {
+		/* write, present and write, not present: */
+		if (unlikely(!(vma->vm_flags & VM_WRITE)))
+			return 1;
+		return 0;
+	}
+
+	/* read, present: */
+	if (unlikely(error_code & PF_PROT))
+		return 1;
+
+	/* read, not present: */
+	if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
+		return 1;
+
+	return 0;
+}
+
+static int fault_in_kernel_space(unsigned long address)
+{
+	return address >= TASK_SIZE_MAX;
+}
+
+static inline bool smap_violation(int error_code, struct pt_regs *regs)
+{
+	if (!IS_ENABLED(CONFIG_X86_SMAP))
+		return false;
+
+	if (!static_cpu_has(X86_FEATURE_SMAP))
+		return false;
+
+	if (error_code & PF_USER)
+		return false;
+
+	if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
+		return false;
+
+	return true;
+}
+
+/*
+ * This routine handles page faults.  It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ *
+ * This function must have noinline because both callers
+ * {,trace_}do_page_fault() have notrace on. Having this an actual function
+ * guarantees there's a function trace entry.
+ */
+static noinline void
+__do_page_fault(struct pt_regs *regs, unsigned long error_code,
+		unsigned long address)
+{
+	struct vm_area_struct *vma;
+	struct task_struct *tsk;
+	struct mm_struct *mm;
+	int fault, major = 0;
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+
+	tsk = current;
+	mm = tsk->mm;
+
+	/*
+	 * Detect and handle instructions that would cause a page fault for
+	 * both a tracked kernel page and a userspace page.
+	 */
+	if (kmemcheck_active(regs))
+		kmemcheck_hide(regs);
+	prefetchw(&mm->mmap_sem);
+
+        /*
+         * Detect and handle page faults due to TuxOnIce making pages read-only
+         * so that it can create incremental images.
+         *
+         * Do it early to avoid double faults.
+         */
+        if (unlikely(toi_make_writable(init_mm.pgd, address)))
+            return;
+
+	if (unlikely(kmmio_fault(regs, address)))
+		return;
+
+	/*
+	 * We fault-in kernel-space virtual memory on-demand. The
+	 * 'reference' page table is init_mm.pgd.
+	 *
+	 * NOTE! We MUST NOT take any locks for this case. We may
+	 * be in an interrupt or a critical region, and should
+	 * only copy the information from the master page table,
+	 * nothing more.
+	 *
+	 * This verifies that the fault happens in kernel space
+	 * (error_code & 4) == 0, and that the fault was not a
+	 * protection error (error_code & 9) == 0.
+	 */
+	if (unlikely(fault_in_kernel_space(address))) {
+		if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
+			if (vmalloc_fault(address) >= 0)
+				return;
+
+			if (kmemcheck_fault(regs, address, error_code))
+				return;
+		}
+
+		/* Can handle a stale RO->RW TLB: */
+		if (spurious_fault(error_code, address))
+			return;
+
+		/* kprobes don't want to hook the spurious faults: */
+		if (kprobes_fault(regs))
+			return;
+		/*
+		 * Don't take the mm semaphore here. If we fixup a prefetch
+		 * fault we could otherwise deadlock:
+		 */
+		bad_area_nosemaphore(regs, error_code, address);
+
+		return;
+	}
+
+	/* kprobes don't want to hook the spurious faults: */
+	if (unlikely(kprobes_fault(regs)))
+		return;
+
+	if (unlikely(error_code & PF_RSVD))
+		pgtable_bad(regs, error_code, address);
+
+	if (unlikely(smap_violation(error_code, regs))) {
+		bad_area_nosemaphore(regs, error_code, address);
+		return;
+	}
+
+	/*
+	 * If we're in an interrupt, have no user context or are running
+	 * in an atomic region then we must not take the fault:
+	 */
+	if (unlikely(in_atomic() || !mm)) {
+		bad_area_nosemaphore(regs, error_code, address);
+		return;
+	}
+
+	/*
+	 * It's safe to allow irq's after cr2 has been saved and the
+	 * vmalloc fault has been handled.
+	 *
+	 * User-mode registers count as a user access even for any
+	 * potential system fault or CPU buglet:
+	 */
+	if (user_mode(regs)) {
+		local_irq_enable();
+		error_code |= PF_USER;
+		flags |= FAULT_FLAG_USER;
+	} else {
+		if (regs->flags & X86_EFLAGS_IF)
+			local_irq_enable();
+	}
+
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+
+	if (error_code & PF_WRITE)
+		flags |= FAULT_FLAG_WRITE;
+
+	/*
+	 * When running in the kernel we expect faults to occur only to
+	 * addresses in user space.  All other faults represent errors in
+	 * the kernel and should generate an OOPS.  Unfortunately, in the
+	 * case of an erroneous fault occurring in a code path which already
+	 * holds mmap_sem we will deadlock attempting to validate the fault
+	 * against the address space.  Luckily the kernel only validly
+	 * references user space from well defined areas of code, which are
+	 * listed in the exceptions table.
+	 *
+	 * As the vast majority of faults will be valid we will only perform
+	 * the source reference check when there is a possibility of a
+	 * deadlock. Attempt to lock the address space, if we cannot we then
+	 * validate the source. If this is invalid we can skip the address
+	 * space check, thus avoiding the deadlock:
+	 */
+	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+		if ((error_code & PF_USER) == 0 &&
+		    !search_exception_tables(regs->ip)) {
+			bad_area_nosemaphore(regs, error_code, address);
+			return;
+		}
+retry:
+		down_read(&mm->mmap_sem);
+	} else {
+		/*
+		 * The above down_read_trylock() might have succeeded in
+		 * which case we'll have missed the might_sleep() from
+		 * down_read():
+		 */
+		might_sleep();
+	}
+
+	vma = find_vma(mm, address);
+	if (unlikely(!vma)) {
+		bad_area(regs, error_code, address);
+		return;
+	}
+	if (likely(vma->vm_start <= address))
+		goto good_area;
+	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
+		bad_area(regs, error_code, address);
+		return;
+	}
+	if (error_code & PF_USER) {
+		/*
+		 * Accessing the stack below %sp is always a bug.
+		 * The large cushion allows instructions like enter
+		 * and pusha to work. ("enter $65535, $31" pushes
+		 * 32 pointers and then decrements %sp by 65535.)
+		 */
+		if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
+			bad_area(regs, error_code, address);
+			return;
+		}
+	}
+	if (unlikely(expand_stack(vma, address))) {
+		bad_area(regs, error_code, address);
+		return;
+	}
+
+	/*
+	 * Ok, we have a good vm_area for this memory access, so
+	 * we can handle it..
+	 */
+good_area:
+	if (unlikely(access_error(error_code, vma))) {
+		bad_area_access_error(regs, error_code, address);
+		return;
+	}
+
+	/*
+	 * If for any reason at all we couldn't handle the fault,
+	 * make sure we exit gracefully rather than endlessly redo
+	 * the fault.  Since we never set FAULT_FLAG_RETRY_NOWAIT, if
+	 * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
+	 */
+	fault = handle_mm_fault(mm, vma, address, flags);
+	major |= fault & VM_FAULT_MAJOR;
+
+	/*
+	 * If we need to retry the mmap_sem has already been released,
+	 * and if there is a fatal signal pending there is no guarantee
+	 * that we made any progress. Handle this case first.
+	 */
+	if (unlikely(fault & VM_FAULT_RETRY)) {
+		/* Retry at most once */
+		if (flags & FAULT_FLAG_ALLOW_RETRY) {
+			flags &= ~FAULT_FLAG_ALLOW_RETRY;
+			flags |= FAULT_FLAG_TRIED;
+			if (!fatal_signal_pending(tsk))
+				goto retry;
+		}
+
+		/* User mode? Just return to handle the fatal exception */
+		if (flags & FAULT_FLAG_USER)
+			return;
+
+		/* Not returning to user mode? Handle exceptions or die: */
+		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+		return;
+	}
+
+	up_read(&mm->mmap_sem);
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		mm_fault_error(regs, error_code, address, fault);
+		return;
+	}
+
+	/*
+	 * Major/minor page fault accounting. If any of the events
+	 * returned VM_FAULT_MAJOR, we account it as a major fault.
+	 */
+	if (major) {
+		tsk->maj_flt++;
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
+	} else {
+		tsk->min_flt++;
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
+	}
+
+	check_v8086_mode(regs, address, tsk);
+}
+NOKPROBE_SYMBOL(__do_page_fault);
+
+dotraplinkage void notrace
+do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+	unsigned long address = read_cr2(); /* Get the faulting address */
+	enum ctx_state prev_state;
+
+	/*
+	 * We must have this function tagged with __kprobes, notrace and call
+	 * read_cr2() before calling anything else. To avoid calling any kind
+	 * of tracing machinery before we've observed the CR2 value.
+	 *
+	 * exception_{enter,exit}() contain all sorts of tracepoints.
+	 */
+
+	prev_state = exception_enter();
+	__do_page_fault(regs, error_code, address);
+	exception_exit(prev_state);
+}
+NOKPROBE_SYMBOL(do_page_fault);
+
+#ifdef CONFIG_TRACING
+static nokprobe_inline void
+trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
+			 unsigned long error_code)
+{
+	if (user_mode(regs))
+		trace_page_fault_user(address, regs, error_code);
+	else
+		trace_page_fault_kernel(address, regs, error_code);
+}
+
+dotraplinkage void notrace
+trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+	/*
+	 * The exception_enter and tracepoint processing could
+	 * trigger another page faults (user space callchain
+	 * reading) and destroy the original cr2 value, so read
+	 * the faulting address now.
+	 */
+	unsigned long address = read_cr2();
+	enum ctx_state prev_state;
+
+	prev_state = exception_enter();
+	trace_page_fault_entries(address, regs, error_code);
+	__do_page_fault(regs, error_code, address);
+	exception_exit(prev_state);
+}
+NOKPROBE_SYMBOL(trace_do_page_fault);
+#endif /* CONFIG_TRACING */
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
new file mode 100644
index 000000000..81bf3d2af
--- /dev/null
+++ b/arch/x86/mm/gup.c
@@ -0,0 +1,405 @@
+/*
+ * Lockless get_user_pages_fast for x86
+ *
+ * Copyright (C) 2008 Nick Piggin
+ * Copyright (C) 2008 Novell Inc.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/highmem.h>
+#include <linux/swap.h>
+
+#include <asm/pgtable.h>
+
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+#ifndef CONFIG_X86_PAE
+	return READ_ONCE(*ptep);
+#else
+	/*
+	 * With get_user_pages_fast, we walk down the pagetables without taking
+	 * any locks.  For this we would like to load the pointers atomically,
+	 * but that is not possible (without expensive cmpxchg8b) on PAE.  What
+	 * we do have is the guarantee that a pte will only either go from not
+	 * present to present, or present to not present or both -- it will not
+	 * switch to a completely different present page without a TLB flush in
+	 * between; something that we are blocking by holding interrupts off.
+	 *
+	 * Setting ptes from not present to present goes:
+	 * ptep->pte_high = h;
+	 * smp_wmb();
+	 * ptep->pte_low = l;
+	 *
+	 * And present to not present goes:
+	 * ptep->pte_low = 0;
+	 * smp_wmb();
+	 * ptep->pte_high = 0;
+	 *
+	 * We must ensure here that the load of pte_low sees l iff pte_high
+	 * sees h. We load pte_high *after* loading pte_low, which ensures we
+	 * don't see an older value of pte_high.  *Then* we recheck pte_low,
+	 * which ensures that we haven't picked up a changed pte high. We might
+	 * have got rubbish values from pte_low and pte_high, but we are
+	 * guaranteed that pte_low will not have the present bit set *unless*
+	 * it is 'l'. And get_user_pages_fast only operates on present ptes, so
+	 * we're safe.
+	 *
+	 * gup_get_pte should not be used or copied outside gup.c without being
+	 * very careful -- it does not atomically load the pte or anything that
+	 * is likely to be useful for you.
+	 */
+	pte_t pte;
+
+retry:
+	pte.pte_low = ptep->pte_low;
+	smp_rmb();
+	pte.pte_high = ptep->pte_high;
+	smp_rmb();
+	if (unlikely(pte.pte_low != ptep->pte_low))
+		goto retry;
+
+	return pte;
+#endif
+}
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask;
+	pte_t *ptep;
+
+	mask = _PAGE_PRESENT|_PAGE_USER;
+	if (write)
+		mask |= _PAGE_RW;
+
+	ptep = pte_offset_map(&pmd, addr);
+	do {
+		pte_t pte = gup_get_pte(ptep);
+		struct page *page;
+
+		/* Similar to the PMD case, NUMA hinting must take slow path */
+		if (pte_protnone(pte)) {
+			pte_unmap(ptep);
+			return 0;
+		}
+
+		if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+			pte_unmap(ptep);
+			return 0;
+		}
+		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+		page = pte_page(pte);
+		get_page(page);
+		SetPageReferenced(page);
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+	pte_unmap(ptep - 1);
+
+	return 1;
+}
+
+static inline void get_head_page_multiple(struct page *page, int nr)
+{
+	VM_BUG_ON_PAGE(page != compound_head(page), page);
+	VM_BUG_ON_PAGE(page_count(page) == 0, page);
+	atomic_add(nr, &page->_count);
+	SetPageReferenced(page);
+}
+
+static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask;
+	pte_t pte = *(pte_t *)&pmd;
+	struct page *head, *page;
+	int refs;
+
+	mask = _PAGE_PRESENT|_PAGE_USER;
+	if (write)
+		mask |= _PAGE_RW;
+	if ((pte_flags(pte) & mask) != mask)
+		return 0;
+	/* hugepages are never "special" */
+	VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
+	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	refs = 0;
+	head = pte_page(pte);
+	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+	do {
+		VM_BUG_ON_PAGE(compound_head(page) != head, page);
+		pages[*nr] = page;
+		if (PageTail(page))
+			get_huge_page_tail(page);
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+	get_head_page_multiple(head, refs);
+
+	return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+		int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pmd_t *pmdp;
+
+	pmdp = pmd_offset(&pud, addr);
+	do {
+		pmd_t pmd = *pmdp;
+
+		next = pmd_addr_end(addr, end);
+		/*
+		 * The pmd_trans_splitting() check below explains why
+		 * pmdp_splitting_flush has to flush the tlb, to stop
+		 * this gup-fast code from running while we set the
+		 * splitting bit in the pmd. Returning zero will take
+		 * the slow path that will call wait_split_huge_page()
+		 * if the pmd is still in splitting state. gup-fast
+		 * can't because it has irq disabled and
+		 * wait_split_huge_page() would never return as the
+		 * tlb flush IPI wouldn't run.
+		 */
+		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+			return 0;
+		if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
+			/*
+			 * NUMA hinting faults need to be handled in the GUP
+			 * slowpath for accounting purposes and so that they
+			 * can be serialised against THP migration.
+			 */
+			if (pmd_protnone(pmd))
+				return 0;
+			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
+				return 0;
+		} else {
+			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+				return 0;
+		}
+	} while (pmdp++, addr = next, addr != end);
+
+	return 1;
+}
+
+static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask;
+	pte_t pte = *(pte_t *)&pud;
+	struct page *head, *page;
+	int refs;
+
+	mask = _PAGE_PRESENT|_PAGE_USER;
+	if (write)
+		mask |= _PAGE_RW;
+	if ((pte_flags(pte) & mask) != mask)
+		return 0;
+	/* hugepages are never "special" */
+	VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
+	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	refs = 0;
+	head = pte_page(pte);
+	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+	do {
+		VM_BUG_ON_PAGE(compound_head(page) != head, page);
+		pages[*nr] = page;
+		if (PageTail(page))
+			get_huge_page_tail(page);
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+	get_head_page_multiple(head, refs);
+
+	return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+			int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pud_t *pudp;
+
+	pudp = pud_offset(&pgd, addr);
+	do {
+		pud_t pud = *pudp;
+
+		next = pud_addr_end(addr, end);
+		if (pud_none(pud))
+			return 0;
+		if (unlikely(pud_large(pud))) {
+			if (!gup_huge_pud(pud, addr, next, write, pages, nr))
+				return 0;
+		} else {
+			if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+				return 0;
+		}
+	} while (pudp++, addr = next, addr != end);
+
+	return 1;
+}
+
+/*
+ * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
+ * back to the regular GUP.
+ */
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			  struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr, len, end;
+	unsigned long next;
+	unsigned long flags;
+	pgd_t *pgdp;
+	int nr = 0;
+
+	start &= PAGE_MASK;
+	addr = start;
+	len = (unsigned long) nr_pages << PAGE_SHIFT;
+	end = start + len;
+	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+					(void __user *)start, len)))
+		return 0;
+
+	/*
+	 * XXX: batch / limit 'nr', to avoid large irq off latency
+	 * needs some instrumenting to determine the common sizes used by
+	 * important workloads (eg. DB2), and whether limiting the batch size
+	 * will decrease performance.
+	 *
+	 * It seems like we're in the clear for the moment. Direct-IO is
+	 * the main guy that batches up lots of get_user_pages, and even
+	 * they are limited to 64-at-a-time which is not so many.
+	 */
+	/*
+	 * This doesn't prevent pagetable teardown, but does prevent
+	 * the pagetables and pages from being freed on x86.
+	 *
+	 * So long as we atomically load page table pointers versus teardown
+	 * (which we do on x86, with the above PAE exception), we can follow the
+	 * address down to the the page and take a ref on it.
+	 */
+	local_irq_save(flags);
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd_t pgd = *pgdp;
+
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			break;
+		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+			break;
+	} while (pgdp++, addr = next, addr != end);
+	local_irq_restore(flags);
+
+	return nr;
+}
+
+/**
+ * get_user_pages_fast() - pin user pages in memory
+ * @start:	starting user address
+ * @nr_pages:	number of pages from start to pin
+ * @write:	whether pages will be written to
+ * @pages:	array that receives pointers to the pages pinned.
+ * 		Should be at least nr_pages long.
+ *
+ * Attempt to pin user pages in memory without taking mm->mmap_sem.
+ * If not successful, it will fall back to taking the lock and
+ * calling get_user_pages().
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno.
+ */
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr, len, end;
+	unsigned long next;
+	pgd_t *pgdp;
+	int nr = 0;
+
+	start &= PAGE_MASK;
+	addr = start;
+	len = (unsigned long) nr_pages << PAGE_SHIFT;
+
+	end = start + len;
+	if (end < start)
+		goto slow_irqon;
+
+#ifdef CONFIG_X86_64
+	if (end >> __VIRTUAL_MASK_SHIFT)
+		goto slow_irqon;
+#endif
+
+	/*
+	 * XXX: batch / limit 'nr', to avoid large irq off latency
+	 * needs some instrumenting to determine the common sizes used by
+	 * important workloads (eg. DB2), and whether limiting the batch size
+	 * will decrease performance.
+	 *
+	 * It seems like we're in the clear for the moment. Direct-IO is
+	 * the main guy that batches up lots of get_user_pages, and even
+	 * they are limited to 64-at-a-time which is not so many.
+	 */
+	/*
+	 * This doesn't prevent pagetable teardown, but does prevent
+	 * the pagetables and pages from being freed on x86.
+	 *
+	 * So long as we atomically load page table pointers versus teardown
+	 * (which we do on x86, with the above PAE exception), we can follow the
+	 * address down to the the page and take a ref on it.
+	 */
+	local_irq_disable();
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd_t pgd = *pgdp;
+
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			goto slow;
+		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+			goto slow;
+	} while (pgdp++, addr = next, addr != end);
+	local_irq_enable();
+
+	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+	return nr;
+
+	{
+		int ret;
+
+slow:
+		local_irq_enable();
+slow_irqon:
+		/* Try to get the remaining pages with get_user_pages */
+		start += nr << PAGE_SHIFT;
+		pages += nr;
+
+		ret = get_user_pages_unlocked(current, mm, start,
+					      (end - start) >> PAGE_SHIFT,
+					      write, 0, pages);
+
+		/* Have to be a bit careful with return values */
+		if (nr > 0) {
+			if (ret < 0)
+				ret = nr;
+			else
+				ret += nr;
+		}
+
+		return ret;
+	}
+}
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
new file mode 100644
index 000000000..4500142bc
--- /dev/null
+++ b/arch/x86/mm/highmem_32.c
@@ -0,0 +1,146 @@
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/swap.h> /* for totalram_pages */
+#include <linux/bootmem.h>
+
+void *kmap(struct page *page)
+{
+	might_sleep();
+	if (!PageHighMem(page))
+		return page_address(page);
+	return kmap_high(page);
+}
+EXPORT_SYMBOL(kmap);
+
+void kunmap(struct page *page)
+{
+	if (in_interrupt())
+		BUG();
+	if (!PageHighMem(page))
+		return;
+	kunmap_high(page);
+}
+EXPORT_SYMBOL(kunmap);
+
+/*
+ * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
+ * no global lock is needed and because the kmap code must perform a global TLB
+ * invalidation when the kmap pool wraps.
+ *
+ * However when holding an atomic kmap it is not legal to sleep, so atomic
+ * kmaps are appropriate for short, tight code paths only.
+ */
+void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+{
+	unsigned long vaddr;
+	int idx, type;
+
+	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	pagefault_disable();
+
+	if (!PageHighMem(page))
+		return page_address(page);
+
+	type = kmap_atomic_idx_push();
+	idx = type + KM_TYPE_NR*smp_processor_id();
+	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+	BUG_ON(!pte_none(*(kmap_pte-idx)));
+	set_pte(kmap_pte-idx, mk_pte(page, prot));
+	arch_flush_lazy_mmu_mode();
+
+	return (void *)vaddr;
+}
+EXPORT_SYMBOL(kmap_atomic_prot);
+
+void *kmap_atomic(struct page *page)
+{
+	return kmap_atomic_prot(page, kmap_prot);
+}
+EXPORT_SYMBOL(kmap_atomic);
+
+/*
+ * This is the same as kmap_atomic() but can map memory that doesn't
+ * have a struct page associated with it.
+ */
+void *kmap_atomic_pfn(unsigned long pfn)
+{
+	return kmap_atomic_prot_pfn(pfn, kmap_prot);
+}
+EXPORT_SYMBOL_GPL(kmap_atomic_pfn);
+
+void __kunmap_atomic(void *kvaddr)
+{
+	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
+
+	if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
+	    vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
+		int idx, type;
+
+		type = kmap_atomic_idx();
+		idx = type + KM_TYPE_NR * smp_processor_id();
+
+#ifdef CONFIG_DEBUG_HIGHMEM
+		WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
+#endif
+		/*
+		 * Force other mappings to Oops if they'll try to access this
+		 * pte without first remap it.  Keeping stale mappings around
+		 * is a bad idea also, in case the page changes cacheability
+		 * attributes or becomes a protected page in a hypervisor.
+		 */
+		kpte_clear_flush(kmap_pte-idx, vaddr);
+		kmap_atomic_idx_pop();
+		arch_flush_lazy_mmu_mode();
+	}
+#ifdef CONFIG_DEBUG_HIGHMEM
+	else {
+		BUG_ON(vaddr < PAGE_OFFSET);
+		BUG_ON(vaddr >= (unsigned long)high_memory);
+	}
+#endif
+
+	pagefault_enable();
+}
+EXPORT_SYMBOL(__kunmap_atomic);
+
+struct page *kmap_atomic_to_page(void *ptr)
+{
+	unsigned long idx, vaddr = (unsigned long)ptr;
+	pte_t *pte;
+
+	if (vaddr < FIXADDR_START)
+		return virt_to_page(ptr);
+
+	idx = virt_to_fix(vaddr);
+	pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
+	return pte_page(*pte);
+}
+EXPORT_SYMBOL(kmap_atomic_to_page);
+
+void __init set_highmem_pages_init(void)
+{
+	struct zone *zone;
+	int nid;
+
+	/*
+	 * Explicitly reset zone->managed_pages because set_highmem_pages_init()
+	 * is invoked before free_all_bootmem()
+	 */
+	reset_all_zones_managed_pages();
+	for_each_zone(zone) {
+		unsigned long zone_start_pfn, zone_end_pfn;
+
+		if (!is_highmem(zone))
+			continue;
+
+		zone_start_pfn = zone->zone_start_pfn;
+		zone_end_pfn = zone_start_pfn + zone->spanned_pages;
+
+		nid = zone_to_nid(zone);
+		printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
+				zone->name, nid, zone_start_pfn, zone_end_pfn);
+
+		add_highpages_with_active_regions(nid, zone_start_pfn,
+				 zone_end_pfn);
+	}
+}
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
new file mode 100644
index 000000000..42982b26e
--- /dev/null
+++ b/arch/x86/mm/hugetlbpage.c
@@ -0,0 +1,186 @@
+/*
+ * IA-32 Huge TLB Page Support for Kernel.
+ *
+ * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
+ */
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
+#include <linux/err.h>
+#include <linux/sysctl.h>
+#include <asm/mman.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
+
+#if 0	/* This is just for testing */
+struct page *
+follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
+{
+	unsigned long start = address;
+	int length = 1;
+	int nr;
+	struct page *page;
+	struct vm_area_struct *vma;
+
+	vma = find_vma(mm, addr);
+	if (!vma || !is_vm_hugetlb_page(vma))
+		return ERR_PTR(-EINVAL);
+
+	pte = huge_pte_offset(mm, address);
+
+	/* hugetlb should be locked, and hence, prefaulted */
+	WARN_ON(!pte || pte_none(*pte));
+
+	page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+
+	WARN_ON(!PageHead(page));
+
+	return page;
+}
+
+int pmd_huge(pmd_t pmd)
+{
+	return 0;
+}
+
+int pud_huge(pud_t pud)
+{
+	return 0;
+}
+
+#else
+
+/*
+ * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal
+ * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry.
+ * Otherwise, returns 0.
+ */
+int pmd_huge(pmd_t pmd)
+{
+	return !pmd_none(pmd) &&
+		(pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT;
+}
+
+int pud_huge(pud_t pud)
+{
+	return !!(pud_val(pud) & _PAGE_PSE);
+}
+#endif
+
+#ifdef CONFIG_HUGETLB_PAGE
+static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
+		unsigned long addr, unsigned long len,
+		unsigned long pgoff, unsigned long flags)
+{
+	struct hstate *h = hstate_file(file);
+	struct vm_unmapped_area_info info;
+
+	info.flags = 0;
+	info.length = len;
+	info.low_limit = current->mm->mmap_legacy_base;
+	info.high_limit = TASK_SIZE;
+	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+	info.align_offset = 0;
+	return vm_unmapped_area(&info);
+}
+
+static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
+		unsigned long addr0, unsigned long len,
+		unsigned long pgoff, unsigned long flags)
+{
+	struct hstate *h = hstate_file(file);
+	struct vm_unmapped_area_info info;
+	unsigned long addr;
+
+	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+	info.length = len;
+	info.low_limit = PAGE_SIZE;
+	info.high_limit = current->mm->mmap_base;
+	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+	info.align_offset = 0;
+	addr = vm_unmapped_area(&info);
+
+	/*
+	 * A failed mmap() very likely causes application failure,
+	 * so fall back to the bottom-up function here. This scenario
+	 * can happen with large stack limits and large mmap()
+	 * allocations.
+	 */
+	if (addr & ~PAGE_MASK) {
+		VM_BUG_ON(addr != -ENOMEM);
+		info.flags = 0;
+		info.low_limit = TASK_UNMAPPED_BASE;
+		info.high_limit = TASK_SIZE;
+		addr = vm_unmapped_area(&info);
+	}
+
+	return addr;
+}
+
+unsigned long
+hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+	struct hstate *h = hstate_file(file);
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+
+	if (len & ~huge_page_mask(h))
+		return -EINVAL;
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
+	if (flags & MAP_FIXED) {
+		if (prepare_hugepage_range(file, addr, len))
+			return -EINVAL;
+		return addr;
+	}
+
+	if (addr) {
+		addr = ALIGN(addr, huge_page_size(h));
+		vma = find_vma(mm, addr);
+		if (TASK_SIZE - len >= addr &&
+		    (!vma || addr + len <= vma->vm_start))
+			return addr;
+	}
+	if (mm->get_unmapped_area == arch_get_unmapped_area)
+		return hugetlb_get_unmapped_area_bottomup(file, addr, len,
+				pgoff, flags);
+	else
+		return hugetlb_get_unmapped_area_topdown(file, addr, len,
+				pgoff, flags);
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+#ifdef CONFIG_X86_64
+static __init int setup_hugepagesz(char *opt)
+{
+	unsigned long ps = memparse(opt, &opt);
+	if (ps == PMD_SIZE) {
+		hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
+	} else if (ps == PUD_SIZE && cpu_has_gbpages) {
+		hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+	} else {
+		printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
+			ps >> 20);
+		return 0;
+	}
+	return 1;
+}
+__setup("hugepagesz=", setup_hugepagesz);
+
+#ifdef CONFIG_CMA
+static __init int gigantic_pages_init(void)
+{
+	/* With CMA we can allocate gigantic pages at runtime */
+	if (cpu_has_gbpages && !size_to_hstate(1UL << PUD_SHIFT))
+		hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+	return 0;
+}
+arch_initcall(gigantic_pages_init);
+#endif
+#endif
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
new file mode 100644
index 000000000..f971ac820
--- /dev/null
+++ b/arch/x86/mm/init.c
@@ -0,0 +1,752 @@
+#include <linux/gfp.h>
+#include <linux/initrd.h>
+#include <linux/ioport.h>
+#include <linux/swap.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h>	/* for max_low_pfn */
+
+#include <asm/cacheflush.h>
+#include <asm/e820.h>
+#include <asm/init.h>
+#include <asm/page.h>
+#include <asm/page_types.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/proto.h>
+#include <asm/dma.h>		/* for MAX_DMA_PFN */
+#include <asm/microcode.h>
+
+/*
+ * We need to define the tracepoints somewhere, and tlb.c
+ * is only compied when SMP=y.
+ */
+#define CREATE_TRACE_POINTS
+#include <trace/events/tlb.h>
+
+#include "mm_internal.h"
+
+/*
+ * Tables translating between page_cache_type_t and pte encoding.
+ *
+ * Minimal supported modes are defined statically, they are modified
+ * during bootup if more supported cache modes are available.
+ *
+ *   Index into __cachemode2pte_tbl[] is the cachemode.
+ *
+ *   Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte
+ *   (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
+ */
+uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
+	[_PAGE_CACHE_MODE_WB      ]	= 0         | 0        ,
+	[_PAGE_CACHE_MODE_WC      ]	= _PAGE_PWT | 0        ,
+	[_PAGE_CACHE_MODE_UC_MINUS]	= 0         | _PAGE_PCD,
+	[_PAGE_CACHE_MODE_UC      ]	= _PAGE_PWT | _PAGE_PCD,
+	[_PAGE_CACHE_MODE_WT      ]	= 0         | _PAGE_PCD,
+	[_PAGE_CACHE_MODE_WP      ]	= 0         | _PAGE_PCD,
+};
+EXPORT_SYMBOL(__cachemode2pte_tbl);
+
+uint8_t __pte2cachemode_tbl[8] = {
+	[__pte2cm_idx( 0        | 0         | 0        )] = _PAGE_CACHE_MODE_WB,
+	[__pte2cm_idx(_PAGE_PWT | 0         | 0        )] = _PAGE_CACHE_MODE_WC,
+	[__pte2cm_idx( 0        | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
+	[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC,
+	[__pte2cm_idx( 0        | 0         | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB,
+	[__pte2cm_idx(_PAGE_PWT | 0         | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC,
+	[__pte2cm_idx(0         | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
+	[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
+};
+EXPORT_SYMBOL(__pte2cachemode_tbl);
+
+static unsigned long __initdata pgt_buf_start;
+static unsigned long __initdata pgt_buf_end;
+static unsigned long __initdata pgt_buf_top;
+
+static unsigned long min_pfn_mapped;
+
+static bool __initdata can_use_brk_pgt = true;
+
+/*
+ * Pages returned are already directly mapped.
+ *
+ * Changing that is likely to break Xen, see commit:
+ *
+ *    279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
+ *
+ * for detailed information.
+ */
+__ref void *alloc_low_pages(unsigned int num)
+{
+	unsigned long pfn;
+	int i;
+
+	if (after_bootmem) {
+		unsigned int order;
+
+		order = get_order((unsigned long)num << PAGE_SHIFT);
+		return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
+						__GFP_ZERO, order);
+	}
+
+	if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
+		unsigned long ret;
+		if (min_pfn_mapped >= max_pfn_mapped)
+			panic("alloc_low_pages: ran out of memory");
+		ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
+					max_pfn_mapped << PAGE_SHIFT,
+					PAGE_SIZE * num , PAGE_SIZE);
+		if (!ret)
+			panic("alloc_low_pages: can not alloc memory");
+		memblock_reserve(ret, PAGE_SIZE * num);
+		pfn = ret >> PAGE_SHIFT;
+	} else {
+		pfn = pgt_buf_end;
+		pgt_buf_end += num;
+		printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n",
+			pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1);
+	}
+
+	for (i = 0; i < num; i++) {
+		void *adr;
+
+		adr = __va((pfn + i) << PAGE_SHIFT);
+		clear_page(adr);
+	}
+
+	return __va(pfn << PAGE_SHIFT);
+}
+
+/* need 3 4k for initial PMD_SIZE,  3 4k for 0-ISA_END_ADDRESS */
+#define INIT_PGT_BUF_SIZE	(6 * PAGE_SIZE)
+RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
+void  __init early_alloc_pgt_buf(void)
+{
+	unsigned long tables = INIT_PGT_BUF_SIZE;
+	phys_addr_t base;
+
+	base = __pa(extend_brk(tables, PAGE_SIZE));
+
+	pgt_buf_start = base >> PAGE_SHIFT;
+	pgt_buf_end = pgt_buf_start;
+	pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
+}
+
+int after_bootmem;
+
+early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES);
+
+struct map_range {
+	unsigned long start;
+	unsigned long end;
+	unsigned page_size_mask;
+};
+
+static int page_size_mask;
+
+static void __init probe_page_size_mask(void)
+{
+#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK) && !defined(CONFIG_TOI_INCREMENTAL)
+	/*
+	 * For CONFIG_DEBUG_PAGEALLOC or TuxOnIce's incremental image support,
+         * identity mapping will use small pages.
+	 * This will simplify cpa(), which otherwise needs to support splitting
+	 * large pages into small in interrupt context, etc.
+	 */
+	if (cpu_has_pse)
+		page_size_mask |= 1 << PG_LEVEL_2M;
+#endif
+
+	/* Enable PSE if available */
+	if (cpu_has_pse)
+		cr4_set_bits_and_update_boot(X86_CR4_PSE);
+
+	/* Enable PGE if available */
+	if (cpu_has_pge) {
+		cr4_set_bits_and_update_boot(X86_CR4_PGE);
+		__supported_pte_mask |= _PAGE_GLOBAL;
+	} else
+		__supported_pte_mask &= ~_PAGE_GLOBAL;
+
+	/* Enable 1 GB linear kernel mappings if available: */
+	if (direct_gbpages && cpu_has_gbpages) {
+		printk(KERN_INFO "Using GB pages for direct mapping\n");
+		page_size_mask |= 1 << PG_LEVEL_1G;
+	} else {
+		direct_gbpages = 0;
+	}
+}
+
+#ifdef CONFIG_X86_32
+#define NR_RANGE_MR 3
+#else /* CONFIG_X86_64 */
+#define NR_RANGE_MR 5
+#endif
+
+static int __meminit save_mr(struct map_range *mr, int nr_range,
+			     unsigned long start_pfn, unsigned long end_pfn,
+			     unsigned long page_size_mask)
+{
+	if (start_pfn < end_pfn) {
+		if (nr_range >= NR_RANGE_MR)
+			panic("run out of range for init_memory_mapping\n");
+		mr[nr_range].start = start_pfn<<PAGE_SHIFT;
+		mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
+		mr[nr_range].page_size_mask = page_size_mask;
+		nr_range++;
+	}
+
+	return nr_range;
+}
+
+/*
+ * adjust the page_size_mask for small range to go with
+ *	big page size instead small one if nearby are ram too.
+ */
+static void __init_refok adjust_range_page_size_mask(struct map_range *mr,
+							 int nr_range)
+{
+	int i;
+
+	for (i = 0; i < nr_range; i++) {
+		if ((page_size_mask & (1<<PG_LEVEL_2M)) &&
+		    !(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) {
+			unsigned long start = round_down(mr[i].start, PMD_SIZE);
+			unsigned long end = round_up(mr[i].end, PMD_SIZE);
+
+#ifdef CONFIG_X86_32
+			if ((end >> PAGE_SHIFT) > max_low_pfn)
+				continue;
+#endif
+
+			if (memblock_is_region_memory(start, end - start))
+				mr[i].page_size_mask |= 1<<PG_LEVEL_2M;
+		}
+		if ((page_size_mask & (1<<PG_LEVEL_1G)) &&
+		    !(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) {
+			unsigned long start = round_down(mr[i].start, PUD_SIZE);
+			unsigned long end = round_up(mr[i].end, PUD_SIZE);
+
+			if (memblock_is_region_memory(start, end - start))
+				mr[i].page_size_mask |= 1<<PG_LEVEL_1G;
+		}
+	}
+}
+
+static const char *page_size_string(struct map_range *mr)
+{
+	static const char str_1g[] = "1G";
+	static const char str_2m[] = "2M";
+	static const char str_4m[] = "4M";
+	static const char str_4k[] = "4k";
+
+	if (mr->page_size_mask & (1<<PG_LEVEL_1G))
+		return str_1g;
+	/*
+	 * 32-bit without PAE has a 4M large page size.
+	 * PG_LEVEL_2M is misnamed, but we can at least
+	 * print out the right size in the string.
+	 */
+	if (IS_ENABLED(CONFIG_X86_32) &&
+	    !IS_ENABLED(CONFIG_X86_PAE) &&
+	    mr->page_size_mask & (1<<PG_LEVEL_2M))
+		return str_4m;
+
+	if (mr->page_size_mask & (1<<PG_LEVEL_2M))
+		return str_2m;
+
+	return str_4k;
+}
+
+static int __meminit split_mem_range(struct map_range *mr, int nr_range,
+				     unsigned long start,
+				     unsigned long end)
+{
+	unsigned long start_pfn, end_pfn, limit_pfn;
+	unsigned long pfn;
+	int i;
+
+	limit_pfn = PFN_DOWN(end);
+
+	/* head if not big page alignment ? */
+	pfn = start_pfn = PFN_DOWN(start);
+#ifdef CONFIG_X86_32
+	/*
+	 * Don't use a large page for the first 2/4MB of memory
+	 * because there are often fixed size MTRRs in there
+	 * and overlapping MTRRs into large pages can cause
+	 * slowdowns.
+	 */
+	if (pfn == 0)
+		end_pfn = PFN_DOWN(PMD_SIZE);
+	else
+		end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
+#else /* CONFIG_X86_64 */
+	end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
+#endif
+	if (end_pfn > limit_pfn)
+		end_pfn = limit_pfn;
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+		pfn = end_pfn;
+	}
+
+	/* big page (2M) range */
+	start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
+#ifdef CONFIG_X86_32
+	end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
+#else /* CONFIG_X86_64 */
+	end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
+	if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
+		end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
+#endif
+
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+				page_size_mask & (1<<PG_LEVEL_2M));
+		pfn = end_pfn;
+	}
+
+#ifdef CONFIG_X86_64
+	/* big page (1G) range */
+	start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
+	end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+				page_size_mask &
+				 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
+		pfn = end_pfn;
+	}
+
+	/* tail is not big page (1G) alignment */
+	start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
+	end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+				page_size_mask & (1<<PG_LEVEL_2M));
+		pfn = end_pfn;
+	}
+#endif
+
+	/* tail is not big page (2M) alignment */
+	start_pfn = pfn;
+	end_pfn = limit_pfn;
+	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+
+	if (!after_bootmem)
+		adjust_range_page_size_mask(mr, nr_range);
+
+	/* try to merge same page size and continuous */
+	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
+		unsigned long old_start;
+		if (mr[i].end != mr[i+1].start ||
+		    mr[i].page_size_mask != mr[i+1].page_size_mask)
+			continue;
+		/* move it */
+		old_start = mr[i].start;
+		memmove(&mr[i], &mr[i+1],
+			(nr_range - 1 - i) * sizeof(struct map_range));
+		mr[i--].start = old_start;
+		nr_range--;
+	}
+
+	for (i = 0; i < nr_range; i++)
+		printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
+				mr[i].start, mr[i].end - 1,
+				page_size_string(&mr[i]));
+
+	return nr_range;
+}
+
+struct range pfn_mapped[E820_X_MAX];
+int nr_pfn_mapped;
+
+static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+	nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX,
+					     nr_pfn_mapped, start_pfn, end_pfn);
+	nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX);
+
+	max_pfn_mapped = max(max_pfn_mapped, end_pfn);
+
+	if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
+		max_low_pfn_mapped = max(max_low_pfn_mapped,
+					 min(end_pfn, 1UL<<(32-PAGE_SHIFT)));
+}
+
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+	int i;
+
+	for (i = 0; i < nr_pfn_mapped; i++)
+		if ((start_pfn >= pfn_mapped[i].start) &&
+		    (end_pfn <= pfn_mapped[i].end))
+			return true;
+
+	return false;
+}
+
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+					       unsigned long end)
+{
+	struct map_range mr[NR_RANGE_MR];
+	unsigned long ret = 0;
+	int nr_range, i;
+
+	pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n",
+	       start, end - 1);
+
+	memset(mr, 0, sizeof(mr));
+	nr_range = split_mem_range(mr, 0, start, end);
+
+	for (i = 0; i < nr_range; i++)
+		ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
+						   mr[i].page_size_mask);
+
+	add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
+
+	return ret >> PAGE_SHIFT;
+}
+
+/*
+ * We need to iterate through the E820 memory map and create direct mappings
+ * for only E820_RAM and E820_KERN_RESERVED regions. We cannot simply
+ * create direct mappings for all pfns from [0 to max_low_pfn) and
+ * [4GB to max_pfn) because of possible memory holes in high addresses
+ * that cannot be marked as UC by fixed/variable range MTRRs.
+ * Depending on the alignment of E820 ranges, this may possibly result
+ * in using smaller size (i.e. 4K instead of 2M or 1G) page tables.
+ *
+ * init_mem_mapping() calls init_range_memory_mapping() with big range.
+ * That range would have hole in the middle or ends, and only ram parts
+ * will be mapped in init_range_memory_mapping().
+ */
+static unsigned long __init init_range_memory_mapping(
+					   unsigned long r_start,
+					   unsigned long r_end)
+{
+	unsigned long start_pfn, end_pfn;
+	unsigned long mapped_ram_size = 0;
+	int i;
+
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+		u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end);
+		u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end);
+		if (start >= end)
+			continue;
+
+		/*
+		 * if it is overlapping with brk pgt, we need to
+		 * alloc pgt buf from memblock instead.
+		 */
+		can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
+				    min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
+		init_memory_mapping(start, end);
+		mapped_ram_size += end - start;
+		can_use_brk_pgt = true;
+	}
+
+	return mapped_ram_size;
+}
+
+static unsigned long __init get_new_step_size(unsigned long step_size)
+{
+	/*
+	 * Initial mapped size is PMD_SIZE (2M).
+	 * We can not set step_size to be PUD_SIZE (1G) yet.
+	 * In worse case, when we cross the 1G boundary, and
+	 * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k)
+	 * to map 1G range with PTE. Hence we use one less than the
+	 * difference of page table level shifts.
+	 *
+	 * Don't need to worry about overflow in the top-down case, on 32bit,
+	 * when step_size is 0, round_down() returns 0 for start, and that
+	 * turns it into 0x100000000ULL.
+	 * In the bottom-up case, round_up(x, 0) returns 0 though too, which
+	 * needs to be taken into consideration by the code below.
+	 */
+	return step_size << (PMD_SHIFT - PAGE_SHIFT - 1);
+}
+
+/**
+ * memory_map_top_down - Map [map_start, map_end) top down
+ * @map_start: start address of the target memory range
+ * @map_end: end address of the target memory range
+ *
+ * This function will setup direct mapping for memory range
+ * [map_start, map_end) in top-down. That said, the page tables
+ * will be allocated at the end of the memory, and we map the
+ * memory in top-down.
+ */
+static void __init memory_map_top_down(unsigned long map_start,
+				       unsigned long map_end)
+{
+	unsigned long real_end, start, last_start;
+	unsigned long step_size;
+	unsigned long addr;
+	unsigned long mapped_ram_size = 0;
+
+	/* xen has big range in reserved near end of ram, skip it at first.*/
+	addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);
+	real_end = addr + PMD_SIZE;
+
+	/* step_size need to be small so pgt_buf from BRK could cover it */
+	step_size = PMD_SIZE;
+	max_pfn_mapped = 0; /* will get exact value next */
+	min_pfn_mapped = real_end >> PAGE_SHIFT;
+	last_start = start = real_end;
+
+	/*
+	 * We start from the top (end of memory) and go to the bottom.
+	 * The memblock_find_in_range() gets us a block of RAM from the
+	 * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
+	 * for page table.
+	 */
+	while (last_start > map_start) {
+		if (last_start > step_size) {
+			start = round_down(last_start - 1, step_size);
+			if (start < map_start)
+				start = map_start;
+		} else
+			start = map_start;
+		mapped_ram_size += init_range_memory_mapping(start,
+							last_start);
+		last_start = start;
+		min_pfn_mapped = last_start >> PAGE_SHIFT;
+		if (mapped_ram_size >= step_size)
+			step_size = get_new_step_size(step_size);
+	}
+
+	if (real_end < map_end)
+		init_range_memory_mapping(real_end, map_end);
+}
+
+/**
+ * memory_map_bottom_up - Map [map_start, map_end) bottom up
+ * @map_start: start address of the target memory range
+ * @map_end: end address of the target memory range
+ *
+ * This function will setup direct mapping for memory range
+ * [map_start, map_end) in bottom-up. Since we have limited the
+ * bottom-up allocation above the kernel, the page tables will
+ * be allocated just above the kernel and we map the memory
+ * in [map_start, map_end) in bottom-up.
+ */
+static void __init memory_map_bottom_up(unsigned long map_start,
+					unsigned long map_end)
+{
+	unsigned long next, start;
+	unsigned long mapped_ram_size = 0;
+	/* step_size need to be small so pgt_buf from BRK could cover it */
+	unsigned long step_size = PMD_SIZE;
+
+	start = map_start;
+	min_pfn_mapped = start >> PAGE_SHIFT;
+
+	/*
+	 * We start from the bottom (@map_start) and go to the top (@map_end).
+	 * The memblock_find_in_range() gets us a block of RAM from the
+	 * end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
+	 * for page table.
+	 */
+	while (start < map_end) {
+		if (step_size && map_end - start > step_size) {
+			next = round_up(start + 1, step_size);
+			if (next > map_end)
+				next = map_end;
+		} else {
+			next = map_end;
+		}
+
+		mapped_ram_size += init_range_memory_mapping(start, next);
+		start = next;
+
+		if (mapped_ram_size >= step_size)
+			step_size = get_new_step_size(step_size);
+	}
+}
+
+void __init init_mem_mapping(void)
+{
+	unsigned long end;
+
+	probe_page_size_mask();
+
+#ifdef CONFIG_X86_64
+	end = max_pfn << PAGE_SHIFT;
+#else
+	end = max_low_pfn << PAGE_SHIFT;
+#endif
+
+	/* the ISA range is always mapped regardless of memory holes */
+	init_memory_mapping(0, ISA_END_ADDRESS);
+
+	/*
+	 * If the allocation is in bottom-up direction, we setup direct mapping
+	 * in bottom-up, otherwise we setup direct mapping in top-down.
+	 */
+	if (memblock_bottom_up()) {
+		unsigned long kernel_end = __pa_symbol(_end);
+
+		/*
+		 * we need two separate calls here. This is because we want to
+		 * allocate page tables above the kernel. So we first map
+		 * [kernel_end, end) to make memory above the kernel be mapped
+		 * as soon as possible. And then use page tables allocated above
+		 * the kernel to map [ISA_END_ADDRESS, kernel_end).
+		 */
+		memory_map_bottom_up(kernel_end, end);
+		memory_map_bottom_up(ISA_END_ADDRESS, kernel_end);
+	} else {
+		memory_map_top_down(ISA_END_ADDRESS, end);
+	}
+
+#ifdef CONFIG_X86_64
+	if (max_pfn > max_low_pfn) {
+		/* can we preseve max_low_pfn ?*/
+		max_low_pfn = max_pfn;
+	}
+#else
+	early_ioremap_page_table_range_init();
+#endif
+
+	load_cr3(swapper_pg_dir);
+	__flush_tlb_all();
+
+	early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
+}
+
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ *
+ * On x86, access has to be given to the first megabyte of ram because that area
+ * contains BIOS code and data regions used by X and dosemu and similar apps.
+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
+ * mmio resources as well as potential bios/acpi data regions.
+ */
+int devmem_is_allowed(unsigned long pagenr)
+{
+	if (pagenr < 256)
+		return 1;
+	if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
+		return 0;
+	if (!page_is_ram(pagenr))
+		return 1;
+	return 0;
+}
+
+void free_init_pages(char *what, unsigned long begin, unsigned long end)
+{
+	unsigned long begin_aligned, end_aligned;
+
+	/* Make sure boundaries are page aligned */
+	begin_aligned = PAGE_ALIGN(begin);
+	end_aligned   = end & PAGE_MASK;
+
+	if (WARN_ON(begin_aligned != begin || end_aligned != end)) {
+		begin = begin_aligned;
+		end   = end_aligned;
+	}
+
+	if (begin >= end)
+		return;
+
+	/*
+	 * If debugging page accesses then do not free this memory but
+	 * mark them not present - any buggy init-section access will
+	 * create a kernel page fault:
+	 */
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	printk(KERN_INFO "debug: unmapping init [mem %#010lx-%#010lx]\n",
+		begin, end - 1);
+	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
+#else
+	/*
+	 * We just marked the kernel text read only above, now that
+	 * we are going to free part of that, we need to make that
+	 * writeable and non-executable first.
+	 */
+	set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
+	set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
+
+	free_reserved_area((void *)begin, (void *)end, POISON_FREE_INITMEM, what);
+#endif
+}
+
+void free_initmem(void)
+{
+	free_init_pages("unused kernel",
+			(unsigned long)(&__init_begin),
+			(unsigned long)(&__init_end));
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void __init free_initrd_mem(unsigned long start, unsigned long end)
+{
+#ifdef CONFIG_MICROCODE_EARLY
+	/*
+	 * Remember, initrd memory may contain microcode or other useful things.
+	 * Before we lose initrd mem, we need to find a place to hold them
+	 * now that normal virtual memory is enabled.
+	 */
+	save_microcode_in_initrd();
+#endif
+
+	/*
+	 * end could be not aligned, and We can not align that,
+	 * decompresser could be confused by aligned initrd_end
+	 * We already reserve the end partial page before in
+	 *   - i386_start_kernel()
+	 *   - x86_64_start_kernel()
+	 *   - relocate_initrd()
+	 * So here We can do PAGE_ALIGN() safely to get partial page to be freed
+	 */
+	free_init_pages("initrd", start, PAGE_ALIGN(end));
+}
+#endif
+
+void __init zone_sizes_init(void)
+{
+	unsigned long max_zone_pfns[MAX_NR_ZONES];
+
+	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+
+#ifdef CONFIG_ZONE_DMA
+	max_zone_pfns[ZONE_DMA]		= min(MAX_DMA_PFN, max_low_pfn);
+#endif
+#ifdef CONFIG_ZONE_DMA32
+	max_zone_pfns[ZONE_DMA32]	= min(MAX_DMA32_PFN, max_low_pfn);
+#endif
+	max_zone_pfns[ZONE_NORMAL]	= max_low_pfn;
+#ifdef CONFIG_HIGHMEM
+	max_zone_pfns[ZONE_HIGHMEM]	= max_pfn;
+#endif
+
+	free_area_init_nodes(max_zone_pfns);
+}
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+#ifdef CONFIG_SMP
+	.active_mm = &init_mm,
+	.state = 0,
+#endif
+	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */
+};
+EXPORT_SYMBOL_GPL(cpu_tlbstate);
+
+void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
+{
+	/* entry 0 MUST be WB (hardwired to speed up translations) */
+	BUG_ON(!entry && cache != _PAGE_CACHE_MODE_WB);
+
+	__cachemode2pte_tbl[cache] = __cm_idx2pte(entry);
+	__pte2cachemode_tbl[entry] = cache;
+}
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
new file mode 100644
index 000000000..c8140e128
--- /dev/null
+++ b/arch/x86/mm/init_32.c
@@ -0,0 +1,961 @@
+/*
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *
+ *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ */
+
+#include <linux/module.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/pci.h>
+#include <linux/pfn.h>
+#include <linux/poison.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/proc_fs.h>
+#include <linux/memory_hotplug.h>
+#include <linux/initrd.h>
+#include <linux/cpumask.h>
+#include <linux/gfp.h>
+
+#include <asm/asm.h>
+#include <asm/bios_ebda.h>
+#include <asm/processor.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/dma.h>
+#include <asm/fixmap.h>
+#include <asm/e820.h>
+#include <asm/apic.h>
+#include <asm/bugs.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/olpc_ofw.h>
+#include <asm/pgalloc.h>
+#include <asm/sections.h>
+#include <asm/paravirt.h>
+#include <asm/setup.h>
+#include <asm/cacheflush.h>
+#include <asm/page_types.h>
+#include <asm/init.h>
+
+#include "mm_internal.h"
+
+unsigned long highstart_pfn, highend_pfn;
+
+static noinline int do_test_wp_bit(void);
+
+bool __read_mostly __vmalloc_start_set = false;
+
+/*
+ * Creates a middle page table and puts a pointer to it in the
+ * given global directory entry. This only returns the gd entry
+ * in non-PAE compilation mode, since the middle layer is folded.
+ */
+static pmd_t * __init one_md_table_init(pgd_t *pgd)
+{
+	pud_t *pud;
+	pmd_t *pmd_table;
+
+#ifdef CONFIG_X86_PAE
+	if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
+		pmd_table = (pmd_t *)alloc_low_page();
+		paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
+		set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
+		pud = pud_offset(pgd, 0);
+		BUG_ON(pmd_table != pmd_offset(pud, 0));
+
+		return pmd_table;
+	}
+#endif
+	pud = pud_offset(pgd, 0);
+	pmd_table = pmd_offset(pud, 0);
+
+	return pmd_table;
+}
+
+/*
+ * Create a page table and place a pointer to it in a middle page
+ * directory entry:
+ */
+static pte_t * __init one_page_table_init(pmd_t *pmd)
+{
+	if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
+		pte_t *page_table = (pte_t *)alloc_low_page();
+
+		paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
+		set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
+		BUG_ON(page_table != pte_offset_kernel(pmd, 0));
+	}
+
+	return pte_offset_kernel(pmd, 0);
+}
+
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+	int pgd_idx = pgd_index(vaddr);
+	int pmd_idx = pmd_index(vaddr);
+
+	return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
+}
+
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+	int pte_idx = pte_index(vaddr);
+	pmd_t *pmd;
+
+	pmd = populate_extra_pmd(vaddr);
+	return one_page_table_init(pmd) + pte_idx;
+}
+
+static unsigned long __init
+page_table_range_init_count(unsigned long start, unsigned long end)
+{
+	unsigned long count = 0;
+#ifdef CONFIG_HIGHMEM
+	int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
+	int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
+	int pgd_idx, pmd_idx;
+	unsigned long vaddr;
+
+	if (pmd_idx_kmap_begin == pmd_idx_kmap_end)
+		return 0;
+
+	vaddr = start;
+	pgd_idx = pgd_index(vaddr);
+
+	for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
+		for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
+							pmd_idx++) {
+			if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin &&
+			    (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end)
+				count++;
+			vaddr += PMD_SIZE;
+		}
+		pmd_idx = 0;
+	}
+#endif
+	return count;
+}
+
+static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
+					   unsigned long vaddr, pte_t *lastpte,
+					   void **adr)
+{
+#ifdef CONFIG_HIGHMEM
+	/*
+	 * Something (early fixmap) may already have put a pte
+	 * page here, which causes the page table allocation
+	 * to become nonlinear. Attempt to fix it, and if it
+	 * is still nonlinear then we have to bug.
+	 */
+	int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
+	int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
+
+	if (pmd_idx_kmap_begin != pmd_idx_kmap_end
+	    && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
+	    && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) {
+		pte_t *newpte;
+		int i;
+
+		BUG_ON(after_bootmem);
+		newpte = *adr;
+		for (i = 0; i < PTRS_PER_PTE; i++)
+			set_pte(newpte + i, pte[i]);
+		*adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE);
+
+		paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
+		set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
+		BUG_ON(newpte != pte_offset_kernel(pmd, 0));
+		__flush_tlb_all();
+
+		paravirt_release_pte(__pa(pte) >> PAGE_SHIFT);
+		pte = newpte;
+	}
+	BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1)
+	       && vaddr > fix_to_virt(FIX_KMAP_END)
+	       && lastpte && lastpte + PTRS_PER_PTE != pte);
+#endif
+	return pte;
+}
+
+/*
+ * This function initializes a certain range of kernel virtual memory
+ * with new bootmem page tables, everywhere page tables are missing in
+ * the given range.
+ *
+ * NOTE: The pagetables are allocated contiguous on the physical space
+ * so we can cache the place of the first one and move around without
+ * checking the pgd every time.
+ */
+static void __init
+page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
+{
+	int pgd_idx, pmd_idx;
+	unsigned long vaddr;
+	pgd_t *pgd;
+	pmd_t *pmd;
+	pte_t *pte = NULL;
+	unsigned long count = page_table_range_init_count(start, end);
+	void *adr = NULL;
+
+	if (count)
+		adr = alloc_low_pages(count);
+
+	vaddr = start;
+	pgd_idx = pgd_index(vaddr);
+	pmd_idx = pmd_index(vaddr);
+	pgd = pgd_base + pgd_idx;
+
+	for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
+		pmd = one_md_table_init(pgd);
+		pmd = pmd + pmd_index(vaddr);
+		for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
+							pmd++, pmd_idx++) {
+			pte = page_table_kmap_check(one_page_table_init(pmd),
+						    pmd, vaddr, pte, &adr);
+
+			vaddr += PMD_SIZE;
+		}
+		pmd_idx = 0;
+	}
+}
+
+static inline int is_kernel_text(unsigned long addr)
+{
+	if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
+		return 1;
+	return 0;
+}
+
+/*
+ * This maps the physical memory to kernel virtual address space, a total
+ * of max_low_pfn pages, by creating page tables starting from address
+ * PAGE_OFFSET:
+ */
+unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+			     unsigned long end,
+			     unsigned long page_size_mask)
+{
+	int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
+	unsigned long last_map_addr = end;
+	unsigned long start_pfn, end_pfn;
+	pgd_t *pgd_base = swapper_pg_dir;
+	int pgd_idx, pmd_idx, pte_ofs;
+	unsigned long pfn;
+	pgd_t *pgd;
+	pmd_t *pmd;
+	pte_t *pte;
+	unsigned pages_2m, pages_4k;
+	int mapping_iter;
+
+	start_pfn = start >> PAGE_SHIFT;
+	end_pfn = end >> PAGE_SHIFT;
+
+	/*
+	 * First iteration will setup identity mapping using large/small pages
+	 * based on use_pse, with other attributes same as set by
+	 * the early code in head_32.S
+	 *
+	 * Second iteration will setup the appropriate attributes (NX, GLOBAL..)
+	 * as desired for the kernel identity mapping.
+	 *
+	 * This two pass mechanism conforms to the TLB app note which says:
+	 *
+	 *     "Software should not write to a paging-structure entry in a way
+	 *      that would change, for any linear address, both the page size
+	 *      and either the page frame or attributes."
+	 */
+	mapping_iter = 1;
+
+	if (!cpu_has_pse)
+		use_pse = 0;
+
+repeat:
+	pages_2m = pages_4k = 0;
+	pfn = start_pfn;
+	pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+	pgd = pgd_base + pgd_idx;
+	for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
+		pmd = one_md_table_init(pgd);
+
+		if (pfn >= end_pfn)
+			continue;
+#ifdef CONFIG_X86_PAE
+		pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+		pmd += pmd_idx;
+#else
+		pmd_idx = 0;
+#endif
+		for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
+		     pmd++, pmd_idx++) {
+			unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
+
+			/*
+			 * Map with big pages if possible, otherwise
+			 * create normal page tables:
+			 */
+			if (use_pse) {
+				unsigned int addr2;
+				pgprot_t prot = PAGE_KERNEL_LARGE;
+				/*
+				 * first pass will use the same initial
+				 * identity mapping attribute + _PAGE_PSE.
+				 */
+				pgprot_t init_prot =
+					__pgprot(PTE_IDENT_ATTR |
+						 _PAGE_PSE);
+
+				pfn &= PMD_MASK >> PAGE_SHIFT;
+				addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
+					PAGE_OFFSET + PAGE_SIZE-1;
+
+				if (is_kernel_text(addr) ||
+				    is_kernel_text(addr2))
+					prot = PAGE_KERNEL_LARGE_EXEC;
+
+				pages_2m++;
+				if (mapping_iter == 1)
+					set_pmd(pmd, pfn_pmd(pfn, init_prot));
+				else
+					set_pmd(pmd, pfn_pmd(pfn, prot));
+
+				pfn += PTRS_PER_PTE;
+				continue;
+			}
+			pte = one_page_table_init(pmd);
+
+			pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
+			pte += pte_ofs;
+			for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
+			     pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
+				pgprot_t prot = PAGE_KERNEL;
+				/*
+				 * first pass will use the same initial
+				 * identity mapping attribute.
+				 */
+				pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
+
+				if (is_kernel_text(addr))
+					prot = PAGE_KERNEL_EXEC;
+
+				pages_4k++;
+				if (mapping_iter == 1) {
+					set_pte(pte, pfn_pte(pfn, init_prot));
+					last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE;
+				} else
+					set_pte(pte, pfn_pte(pfn, prot));
+			}
+		}
+	}
+	if (mapping_iter == 1) {
+		/*
+		 * update direct mapping page count only in the first
+		 * iteration.
+		 */
+		update_page_count(PG_LEVEL_2M, pages_2m);
+		update_page_count(PG_LEVEL_4K, pages_4k);
+
+		/*
+		 * local global flush tlb, which will flush the previous
+		 * mappings present in both small and large page TLB's.
+		 */
+		__flush_tlb_all();
+
+		/*
+		 * Second iteration will set the actual desired PTE attributes.
+		 */
+		mapping_iter = 2;
+		goto repeat;
+	}
+	return last_map_addr;
+}
+
+pte_t *kmap_pte;
+pgprot_t kmap_prot;
+
+static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
+{
+	return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
+			vaddr), vaddr), vaddr);
+}
+
+static void __init kmap_init(void)
+{
+	unsigned long kmap_vstart;
+
+	/*
+	 * Cache the first kmap pte:
+	 */
+	kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
+	kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
+
+	kmap_prot = PAGE_KERNEL;
+}
+
+#ifdef CONFIG_HIGHMEM
+static void __init permanent_kmaps_init(pgd_t *pgd_base)
+{
+	unsigned long vaddr;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	vaddr = PKMAP_BASE;
+	page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
+
+	pgd = swapper_pg_dir + pgd_index(vaddr);
+	pud = pud_offset(pgd, vaddr);
+	pmd = pmd_offset(pud, vaddr);
+	pte = pte_offset_kernel(pmd, vaddr);
+	pkmap_page_table = pte;
+}
+
+void __init add_highpages_with_active_regions(int nid,
+			 unsigned long start_pfn, unsigned long end_pfn)
+{
+	phys_addr_t start, end;
+	u64 i;
+
+	for_each_free_mem_range(i, nid, &start, &end, NULL) {
+		unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
+					    start_pfn, end_pfn);
+		unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),
+					      start_pfn, end_pfn);
+		for ( ; pfn < e_pfn; pfn++)
+			if (pfn_valid(pfn))
+				free_highmem_page(pfn_to_page(pfn));
+	}
+}
+#else
+static inline void permanent_kmaps_init(pgd_t *pgd_base)
+{
+}
+#endif /* CONFIG_HIGHMEM */
+
+void __init native_pagetable_init(void)
+{
+	unsigned long pfn, va;
+	pgd_t *pgd, *base = swapper_pg_dir;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	/*
+	 * Remove any mappings which extend past the end of physical
+	 * memory from the boot time page table.
+	 * In virtual address space, we should have at least two pages
+	 * from VMALLOC_END to pkmap or fixmap according to VMALLOC_END
+	 * definition. And max_low_pfn is set to VMALLOC_END physical
+	 * address. If initial memory mapping is doing right job, we
+	 * should have pte used near max_low_pfn or one pmd is not present.
+	 */
+	for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
+		va = PAGE_OFFSET + (pfn<<PAGE_SHIFT);
+		pgd = base + pgd_index(va);
+		if (!pgd_present(*pgd))
+			break;
+
+		pud = pud_offset(pgd, va);
+		pmd = pmd_offset(pud, va);
+		if (!pmd_present(*pmd))
+			break;
+
+		/* should not be large page here */
+		if (pmd_large(*pmd)) {
+			pr_warn("try to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but pmd is big page and is not using pte !\n",
+				pfn, pmd, __pa(pmd));
+			BUG_ON(1);
+		}
+
+		pte = pte_offset_kernel(pmd, va);
+		if (!pte_present(*pte))
+			break;
+
+		printk(KERN_DEBUG "clearing pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx pte: %p pte phys: %lx\n",
+				pfn, pmd, __pa(pmd), pte, __pa(pte));
+		pte_clear(NULL, va, pte);
+	}
+	paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
+	paging_init();
+}
+
+/*
+ * Build a proper pagetable for the kernel mappings.  Up until this
+ * point, we've been running on some set of pagetables constructed by
+ * the boot process.
+ *
+ * If we're booting on native hardware, this will be a pagetable
+ * constructed in arch/x86/kernel/head_32.S.  The root of the
+ * pagetable will be swapper_pg_dir.
+ *
+ * If we're booting paravirtualized under a hypervisor, then there are
+ * more options: we may already be running PAE, and the pagetable may
+ * or may not be based in swapper_pg_dir.  In any case,
+ * paravirt_pagetable_init() will set up swapper_pg_dir
+ * appropriately for the rest of the initialization to work.
+ *
+ * In general, pagetable_init() assumes that the pagetable may already
+ * be partially populated, and so it avoids stomping on any existing
+ * mappings.
+ */
+void __init early_ioremap_page_table_range_init(void)
+{
+	pgd_t *pgd_base = swapper_pg_dir;
+	unsigned long vaddr, end;
+
+	/*
+	 * Fixed mappings, only the page table structure has to be
+	 * created - mappings will be set by set_fixmap():
+	 */
+	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
+	end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
+	page_table_range_init(vaddr, end, pgd_base);
+	early_ioremap_reset();
+}
+
+static void __init pagetable_init(void)
+{
+	pgd_t *pgd_base = swapper_pg_dir;
+
+	permanent_kmaps_init(pgd_base);
+}
+
+pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+
+/* user-defined highmem size */
+static unsigned int highmem_pages = -1;
+
+/*
+ * highmem=size forces highmem to be exactly 'size' bytes.
+ * This works even on boxes that have no highmem otherwise.
+ * This also works to reduce highmem size on bigger boxes.
+ */
+static int __init parse_highmem(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+
+	highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
+	return 0;
+}
+early_param("highmem", parse_highmem);
+
+#define MSG_HIGHMEM_TOO_BIG \
+	"highmem size (%luMB) is bigger than pages available (%luMB)!\n"
+
+#define MSG_LOWMEM_TOO_SMALL \
+	"highmem size (%luMB) results in <64MB lowmem, ignoring it!\n"
+/*
+ * All of RAM fits into lowmem - but if user wants highmem
+ * artificially via the highmem=x boot parameter then create
+ * it:
+ */
+static void __init lowmem_pfn_init(void)
+{
+	/* max_low_pfn is 0, we already have early_res support */
+	max_low_pfn = max_pfn;
+
+	if (highmem_pages == -1)
+		highmem_pages = 0;
+#ifdef CONFIG_HIGHMEM
+	if (highmem_pages >= max_pfn) {
+		printk(KERN_ERR MSG_HIGHMEM_TOO_BIG,
+			pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
+		highmem_pages = 0;
+	}
+	if (highmem_pages) {
+		if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) {
+			printk(KERN_ERR MSG_LOWMEM_TOO_SMALL,
+				pages_to_mb(highmem_pages));
+			highmem_pages = 0;
+		}
+		max_low_pfn -= highmem_pages;
+	}
+#else
+	if (highmem_pages)
+		printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
+#endif
+}
+
+#define MSG_HIGHMEM_TOO_SMALL \
+	"only %luMB highmem pages available, ignoring highmem size of %luMB!\n"
+
+#define MSG_HIGHMEM_TRIMMED \
+	"Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n"
+/*
+ * We have more RAM than fits into lowmem - we try to put it into
+ * highmem, also taking the highmem=x boot parameter into account:
+ */
+static void __init highmem_pfn_init(void)
+{
+	max_low_pfn = MAXMEM_PFN;
+
+	if (highmem_pages == -1)
+		highmem_pages = max_pfn - MAXMEM_PFN;
+
+	if (highmem_pages + MAXMEM_PFN < max_pfn)
+		max_pfn = MAXMEM_PFN + highmem_pages;
+
+	if (highmem_pages + MAXMEM_PFN > max_pfn) {
+		printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
+			pages_to_mb(max_pfn - MAXMEM_PFN),
+			pages_to_mb(highmem_pages));
+		highmem_pages = 0;
+	}
+#ifndef CONFIG_HIGHMEM
+	/* Maximum memory usable is what is directly addressable */
+	printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
+	if (max_pfn > MAX_NONPAE_PFN)
+		printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
+	else
+		printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
+	max_pfn = MAXMEM_PFN;
+#else /* !CONFIG_HIGHMEM */
+#ifndef CONFIG_HIGHMEM64G
+	if (max_pfn > MAX_NONPAE_PFN) {
+		max_pfn = MAX_NONPAE_PFN;
+		printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
+	}
+#endif /* !CONFIG_HIGHMEM64G */
+#endif /* !CONFIG_HIGHMEM */
+}
+
+/*
+ * Determine low and high memory ranges:
+ */
+void __init find_low_pfn_range(void)
+{
+	/* it could update max_pfn */
+
+	if (max_pfn <= MAXMEM_PFN)
+		lowmem_pfn_init();
+	else
+		highmem_pfn_init();
+}
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+void __init initmem_init(void)
+{
+#ifdef CONFIG_HIGHMEM
+	highstart_pfn = highend_pfn = max_pfn;
+	if (max_pfn > max_low_pfn)
+		highstart_pfn = max_low_pfn;
+	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
+		pages_to_mb(highend_pfn - highstart_pfn));
+	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
+#else
+	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
+#endif
+
+	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
+	sparse_memory_present_with_active_regions(0);
+
+#ifdef CONFIG_FLATMEM
+	max_mapnr = IS_ENABLED(CONFIG_HIGHMEM) ? highend_pfn : max_low_pfn;
+#endif
+	__vmalloc_start_set = true;
+
+	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
+			pages_to_mb(max_low_pfn));
+
+	setup_bootmem_allocator();
+}
+#endif /* !CONFIG_NEED_MULTIPLE_NODES */
+
+void __init setup_bootmem_allocator(void)
+{
+	printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
+		 max_pfn_mapped<<PAGE_SHIFT);
+	printk(KERN_INFO "  low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
+}
+
+/*
+ * paging_init() sets up the page tables - note that the first 8MB are
+ * already mapped by head.S.
+ *
+ * This routines also unmaps the page at virtual kernel address 0, so
+ * that we can trap those pesky NULL-reference errors in the kernel.
+ */
+void __init paging_init(void)
+{
+	pagetable_init();
+
+	__flush_tlb_all();
+
+	kmap_init();
+
+	/*
+	 * NOTE: at this point the bootmem allocator is fully available.
+	 */
+	olpc_dt_build_devicetree();
+	sparse_memory_present_with_active_regions(MAX_NUMNODES);
+	sparse_init();
+	zone_sizes_init();
+}
+
+/*
+ * Test if the WP bit works in supervisor mode. It isn't supported on 386's
+ * and also on some strange 486's. All 586+'s are OK. This used to involve
+ * black magic jumps to work around some nasty CPU bugs, but fortunately the
+ * switch to using exceptions got rid of all that.
+ */
+static void __init test_wp_bit(void)
+{
+	printk(KERN_INFO
+  "Checking if this processor honours the WP bit even in supervisor mode...");
+
+	/* Any page-aligned address will do, the test is non-destructive */
+	__set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_KERNEL_RO);
+	boot_cpu_data.wp_works_ok = do_test_wp_bit();
+	clear_fixmap(FIX_WP_TEST);
+
+	if (!boot_cpu_data.wp_works_ok) {
+		printk(KERN_CONT "No.\n");
+		panic("Linux doesn't support CPUs with broken WP.");
+	} else {
+		printk(KERN_CONT "Ok.\n");
+	}
+}
+
+void __init mem_init(void)
+{
+	pci_iommu_alloc();
+
+#ifdef CONFIG_FLATMEM
+	BUG_ON(!mem_map);
+#endif
+	/*
+	 * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to
+	 * be done before free_all_bootmem(). Memblock use free low memory for
+	 * temporary data (see find_range_array()) and for this purpose can use
+	 * pages that was already passed to the buddy allocator, hence marked as
+	 * not accessible in the page tables when compiled with
+	 * CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not
+	 * important here.
+	 */
+	set_highmem_pages_init();
+
+	/* this will put all low memory onto the freelists */
+	free_all_bootmem();
+
+	after_bootmem = 1;
+
+	mem_init_print_info(NULL);
+	printk(KERN_INFO "virtual kernel memory layout:\n"
+		"    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+#ifdef CONFIG_HIGHMEM
+		"    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+#endif
+		"    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
+		"    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
+		"      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+		"      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+		"      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
+		FIXADDR_START, FIXADDR_TOP,
+		(FIXADDR_TOP - FIXADDR_START) >> 10,
+
+#ifdef CONFIG_HIGHMEM
+		PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
+		(LAST_PKMAP*PAGE_SIZE) >> 10,
+#endif
+
+		VMALLOC_START, VMALLOC_END,
+		(VMALLOC_END - VMALLOC_START) >> 20,
+
+		(unsigned long)__va(0), (unsigned long)high_memory,
+		((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
+
+		(unsigned long)&__init_begin, (unsigned long)&__init_end,
+		((unsigned long)&__init_end -
+		 (unsigned long)&__init_begin) >> 10,
+
+		(unsigned long)&_etext, (unsigned long)&_edata,
+		((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
+
+		(unsigned long)&_text, (unsigned long)&_etext,
+		((unsigned long)&_etext - (unsigned long)&_text) >> 10);
+
+	/*
+	 * Check boundaries twice: Some fundamental inconsistencies can
+	 * be detected at build time already.
+	 */
+#define __FIXADDR_TOP (-PAGE_SIZE)
+#ifdef CONFIG_HIGHMEM
+	BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE	> FIXADDR_START);
+	BUILD_BUG_ON(VMALLOC_END			> PKMAP_BASE);
+#endif
+#define high_memory (-128UL << 20)
+	BUILD_BUG_ON(VMALLOC_START			>= VMALLOC_END);
+#undef high_memory
+#undef __FIXADDR_TOP
+#ifdef CONFIG_RANDOMIZE_BASE
+	BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE);
+#endif
+
+#ifdef CONFIG_HIGHMEM
+	BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE	> FIXADDR_START);
+	BUG_ON(VMALLOC_END				> PKMAP_BASE);
+#endif
+	BUG_ON(VMALLOC_START				>= VMALLOC_END);
+	BUG_ON((unsigned long)high_memory		> VMALLOC_START);
+
+	if (boot_cpu_data.wp_works_ok < 0)
+		test_wp_bit();
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int arch_add_memory(int nid, u64 start, u64 size)
+{
+	struct pglist_data *pgdata = NODE_DATA(nid);
+	struct zone *zone = pgdata->node_zones +
+		zone_for_memory(nid, start, size, ZONE_HIGHMEM);
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+
+	return __add_pages(nid, zone, start_pfn, nr_pages);
+}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+int arch_remove_memory(u64 start, u64 size)
+{
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	struct zone *zone;
+
+	zone = page_zone(pfn_to_page(start_pfn));
+	return __remove_pages(zone, start_pfn, nr_pages);
+}
+#endif
+#endif
+
+/*
+ * This function cannot be __init, since exceptions don't work in that
+ * section.  Put this after the callers, so that it cannot be inlined.
+ */
+static noinline int do_test_wp_bit(void)
+{
+	char tmp_reg;
+	int flag;
+
+	__asm__ __volatile__(
+		"	movb %0, %1	\n"
+		"1:	movb %1, %0	\n"
+		"	xorl %2, %2	\n"
+		"2:			\n"
+		_ASM_EXTABLE(1b,2b)
+		:"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
+		 "=q" (tmp_reg),
+		 "=r" (flag)
+		:"2" (1)
+		:"memory");
+
+	return flag;
+}
+
+#ifdef CONFIG_DEBUG_RODATA
+const int rodata_test_data = 0xC3;
+EXPORT_SYMBOL_GPL(rodata_test_data);
+
+int kernel_set_to_readonly __read_mostly;
+
+void set_kernel_text_rw(void)
+{
+	unsigned long start = PFN_ALIGN(_text);
+	unsigned long size = PFN_ALIGN(_etext) - start;
+
+	if (!kernel_set_to_readonly)
+		return;
+
+	pr_debug("Set kernel text: %lx - %lx for read write\n",
+		 start, start+size);
+
+	set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
+void set_kernel_text_ro(void)
+{
+	unsigned long start = PFN_ALIGN(_text);
+	unsigned long size = PFN_ALIGN(_etext) - start;
+
+	if (!kernel_set_to_readonly)
+		return;
+
+	pr_debug("Set kernel text: %lx - %lx for read only\n",
+		 start, start+size);
+
+	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
+static void mark_nxdata_nx(void)
+{
+	/*
+	 * When this called, init has already been executed and released,
+	 * so everything past _etext should be NX.
+	 */
+	unsigned long start = PFN_ALIGN(_etext);
+	/*
+	 * This comes from is_kernel_text upper limit. Also HPAGE where used:
+	 */
+	unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
+
+	if (__supported_pte_mask & _PAGE_NX)
+		printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10);
+	set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
+void mark_rodata_ro(void)
+{
+	unsigned long start = PFN_ALIGN(_text);
+	unsigned long size = PFN_ALIGN(_etext) - start;
+
+	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+	printk(KERN_INFO "Write protecting the kernel text: %luk\n",
+		size >> 10);
+
+	kernel_set_to_readonly = 1;
+
+#ifdef CONFIG_CPA_DEBUG
+	printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
+		start, start+size);
+	set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
+
+	printk(KERN_INFO "Testing CPA: write protecting again\n");
+	set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
+#endif
+
+	start += size;
+	size = (unsigned long)__end_rodata - start;
+	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
+		size >> 10);
+	rodata_test();
+
+#ifdef CONFIG_CPA_DEBUG
+	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
+	set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
+
+	printk(KERN_INFO "Testing CPA: write protecting again\n");
+	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+#endif
+	mark_nxdata_nx();
+}
+#endif
+
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
new file mode 100644
index 000000000..3fba623e3
--- /dev/null
+++ b/arch/x86/mm/init_64.c
@@ -0,0 +1,1376 @@
+/*
+ *  linux/arch/x86_64/mm/init.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *  Copyright (C) 2000  Pavel Machek <pavel@ucw.cz>
+ *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
+ */
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/initrd.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/proc_fs.h>
+#include <linux/pci.h>
+#include <linux/pfn.h>
+#include <linux/poison.h>
+#include <linux/dma-mapping.h>
+#include <linux/module.h>
+#include <linux/memory.h>
+#include <linux/memory_hotplug.h>
+#include <linux/nmi.h>
+#include <linux/gfp.h>
+#include <linux/kcore.h>
+
+#include <asm/processor.h>
+#include <asm/bios_ebda.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/dma.h>
+#include <asm/fixmap.h>
+#include <asm/e820.h>
+#include <asm/apic.h>
+#include <asm/tlb.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+#include <asm/smp.h>
+#include <asm/sections.h>
+#include <asm/kdebug.h>
+#include <asm/numa.h>
+#include <asm/cacheflush.h>
+#include <asm/init.h>
+#include <asm/setup.h>
+
+#include "mm_internal.h"
+
+static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page,
+			   unsigned long addr, unsigned long end)
+{
+	addr &= PMD_MASK;
+	for (; addr < end; addr += PMD_SIZE) {
+		pmd_t *pmd = pmd_page + pmd_index(addr);
+
+		if (!pmd_present(*pmd))
+			set_pmd(pmd, __pmd(addr | pmd_flag));
+	}
+}
+static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
+			  unsigned long addr, unsigned long end)
+{
+	unsigned long next;
+
+	for (; addr < end; addr = next) {
+		pud_t *pud = pud_page + pud_index(addr);
+		pmd_t *pmd;
+
+		next = (addr & PUD_MASK) + PUD_SIZE;
+		if (next > end)
+			next = end;
+
+		if (pud_present(*pud)) {
+			pmd = pmd_offset(pud, 0);
+			ident_pmd_init(info->pmd_flag, pmd, addr, next);
+			continue;
+		}
+		pmd = (pmd_t *)info->alloc_pgt_page(info->context);
+		if (!pmd)
+			return -ENOMEM;
+		ident_pmd_init(info->pmd_flag, pmd, addr, next);
+		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+	}
+
+	return 0;
+}
+
+int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
+			      unsigned long addr, unsigned long end)
+{
+	unsigned long next;
+	int result;
+	int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0;
+
+	for (; addr < end; addr = next) {
+		pgd_t *pgd = pgd_page + pgd_index(addr) + off;
+		pud_t *pud;
+
+		next = (addr & PGDIR_MASK) + PGDIR_SIZE;
+		if (next > end)
+			next = end;
+
+		if (pgd_present(*pgd)) {
+			pud = pud_offset(pgd, 0);
+			result = ident_pud_init(info, pud, addr, next);
+			if (result)
+				return result;
+			continue;
+		}
+
+		pud = (pud_t *)info->alloc_pgt_page(info->context);
+		if (!pud)
+			return -ENOMEM;
+		result = ident_pud_init(info, pud, addr, next);
+		if (result)
+			return result;
+		set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+	}
+
+	return 0;
+}
+
+/*
+ * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
+ * physical space so we can cache the place of the first one and move
+ * around without checking the pgd every time.
+ */
+
+pteval_t __supported_pte_mask __read_mostly = ~0;
+EXPORT_SYMBOL_GPL(__supported_pte_mask);
+
+int force_personality32;
+
+/*
+ * noexec32=on|off
+ * Control non executable heap for 32bit processes.
+ * To control the stack too use noexec=off
+ *
+ * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
+ * off	PROT_READ implies PROT_EXEC
+ */
+static int __init nonx32_setup(char *str)
+{
+	if (!strcmp(str, "on"))
+		force_personality32 &= ~READ_IMPLIES_EXEC;
+	else if (!strcmp(str, "off"))
+		force_personality32 |= READ_IMPLIES_EXEC;
+	return 1;
+}
+__setup("noexec32=", nonx32_setup);
+
+/*
+ * When memory was added/removed make sure all the processes MM have
+ * suitable PGD entries in the local PGD level page.
+ */
+void sync_global_pgds(unsigned long start, unsigned long end, int removed)
+{
+	unsigned long address;
+
+	for (address = start; address <= end; address += PGDIR_SIZE) {
+		const pgd_t *pgd_ref = pgd_offset_k(address);
+		struct page *page;
+
+		/*
+		 * When it is called after memory hot remove, pgd_none()
+		 * returns true. In this case (removed == 1), we must clear
+		 * the PGD entries in the local PGD level page.
+		 */
+		if (pgd_none(*pgd_ref) && !removed)
+			continue;
+
+		spin_lock(&pgd_lock);
+		list_for_each_entry(page, &pgd_list, lru) {
+			pgd_t *pgd;
+			spinlock_t *pgt_lock;
+
+			pgd = (pgd_t *)page_address(page) + pgd_index(address);
+			/* the pgt_lock only for Xen */
+			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+			spin_lock(pgt_lock);
+
+			if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
+				BUG_ON(pgd_page_vaddr(*pgd)
+				       != pgd_page_vaddr(*pgd_ref));
+
+			if (removed) {
+				if (pgd_none(*pgd_ref) && !pgd_none(*pgd))
+					pgd_clear(pgd);
+			} else {
+				if (pgd_none(*pgd))
+					set_pgd(pgd, *pgd_ref);
+			}
+
+			spin_unlock(pgt_lock);
+		}
+		spin_unlock(&pgd_lock);
+	}
+}
+
+/*
+ * NOTE: This function is marked __ref because it calls __init function
+ * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
+ */
+static __ref void *spp_getpage(void)
+{
+	void *ptr;
+
+	if (after_bootmem)
+		ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
+	else
+		ptr = alloc_bootmem_pages(PAGE_SIZE);
+
+	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
+		panic("set_pte_phys: cannot allocate page data %s\n",
+			after_bootmem ? "after bootmem" : "");
+	}
+
+	pr_debug("spp_getpage %p\n", ptr);
+
+	return ptr;
+}
+
+static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
+{
+	if (pgd_none(*pgd)) {
+		pud_t *pud = (pud_t *)spp_getpage();
+		pgd_populate(&init_mm, pgd, pud);
+		if (pud != pud_offset(pgd, 0))
+			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
+			       pud, pud_offset(pgd, 0));
+	}
+	return pud_offset(pgd, vaddr);
+}
+
+static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
+{
+	if (pud_none(*pud)) {
+		pmd_t *pmd = (pmd_t *) spp_getpage();
+		pud_populate(&init_mm, pud, pmd);
+		if (pmd != pmd_offset(pud, 0))
+			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
+			       pmd, pmd_offset(pud, 0));
+	}
+	return pmd_offset(pud, vaddr);
+}
+
+static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
+{
+	if (pmd_none(*pmd)) {
+		pte_t *pte = (pte_t *) spp_getpage();
+		pmd_populate_kernel(&init_mm, pmd, pte);
+		if (pte != pte_offset_kernel(pmd, 0))
+			printk(KERN_ERR "PAGETABLE BUG #02!\n");
+	}
+	return pte_offset_kernel(pmd, vaddr);
+}
+
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	pud = pud_page + pud_index(vaddr);
+	pmd = fill_pmd(pud, vaddr);
+	pte = fill_pte(pmd, vaddr);
+
+	set_pte(pte, new_pte);
+
+	/*
+	 * It's enough to flush this one mapping.
+	 * (PGE mappings get flushed as well)
+	 */
+	__flush_tlb_one(vaddr);
+}
+
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
+{
+	pgd_t *pgd;
+	pud_t *pud_page;
+
+	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
+
+	pgd = pgd_offset_k(vaddr);
+	if (pgd_none(*pgd)) {
+		printk(KERN_ERR
+			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
+		return;
+	}
+	pud_page = (pud_t*)pgd_page_vaddr(*pgd);
+	set_pte_vaddr_pud(pud_page, vaddr, pteval);
+}
+
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+
+	pgd = pgd_offset_k(vaddr);
+	pud = fill_pud(pgd, vaddr);
+	return fill_pmd(pud, vaddr);
+}
+
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+	pmd_t *pmd;
+
+	pmd = populate_extra_pmd(vaddr);
+	return fill_pte(pmd, vaddr);
+}
+
+/*
+ * Create large page table mappings for a range of physical addresses.
+ */
+static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
+					enum page_cache_mode cache)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pgprot_t prot;
+
+	pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) |
+		pgprot_val(pgprot_4k_2_large(cachemode2pgprot(cache)));
+	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
+	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
+		pgd = pgd_offset_k((unsigned long)__va(phys));
+		if (pgd_none(*pgd)) {
+			pud = (pud_t *) spp_getpage();
+			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
+						_PAGE_USER));
+		}
+		pud = pud_offset(pgd, (unsigned long)__va(phys));
+		if (pud_none(*pud)) {
+			pmd = (pmd_t *) spp_getpage();
+			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
+						_PAGE_USER));
+		}
+		pmd = pmd_offset(pud, phys);
+		BUG_ON(!pmd_none(*pmd));
+		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
+	}
+}
+
+void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
+{
+	__init_extra_mapping(phys, size, _PAGE_CACHE_MODE_WB);
+}
+
+void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
+{
+	__init_extra_mapping(phys, size, _PAGE_CACHE_MODE_UC);
+}
+
+/*
+ * The head.S code sets up the kernel high mapping:
+ *
+ *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
+ *
+ * phys_base holds the negative offset to the kernel, which is added
+ * to the compile time generated pmds. This results in invalid pmds up
+ * to the point where we hit the physaddr 0 mapping.
+ *
+ * We limit the mappings to the region from _text to _brk_end.  _brk_end
+ * is rounded up to the 2MB boundary. This catches the invalid pmds as
+ * well, as they are located before _text:
+ */
+void __init cleanup_highmap(void)
+{
+	unsigned long vaddr = __START_KERNEL_map;
+	unsigned long vaddr_end = __START_KERNEL_map + KERNEL_IMAGE_SIZE;
+	unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
+	pmd_t *pmd = level2_kernel_pgt;
+
+	/*
+	 * Native path, max_pfn_mapped is not set yet.
+	 * Xen has valid max_pfn_mapped set in
+	 *	arch/x86/xen/mmu.c:xen_setup_kernel_pagetable().
+	 */
+	if (max_pfn_mapped)
+		vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
+
+	for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
+		if (pmd_none(*pmd))
+			continue;
+		if (vaddr < (unsigned long) _text || vaddr > end)
+			set_pmd(pmd, __pmd(0));
+	}
+}
+
+static unsigned long __meminit
+phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
+	      pgprot_t prot)
+{
+	unsigned long pages = 0, next;
+	unsigned long last_map_addr = end;
+	int i;
+
+	pte_t *pte = pte_page + pte_index(addr);
+
+	for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) {
+		next = (addr & PAGE_MASK) + PAGE_SIZE;
+		if (addr >= end) {
+			if (!after_bootmem &&
+			    !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) &&
+			    !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN))
+				set_pte(pte, __pte(0));
+			continue;
+		}
+
+		/*
+		 * We will re-use the existing mapping.
+		 * Xen for example has some special requirements, like mapping
+		 * pagetable pages as RO. So assume someone who pre-setup
+		 * these mappings are more intelligent.
+		 */
+		if (pte_val(*pte)) {
+			if (!after_bootmem)
+				pages++;
+			continue;
+		}
+
+		if (0)
+			printk("   pte=%p addr=%lx pte=%016lx\n",
+			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
+		pages++;
+		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
+		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
+	}
+
+	update_page_count(PG_LEVEL_4K, pages);
+
+	return last_map_addr;
+}
+
+static unsigned long __meminit
+phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
+	      unsigned long page_size_mask, pgprot_t prot)
+{
+	unsigned long pages = 0, next;
+	unsigned long last_map_addr = end;
+
+	int i = pmd_index(address);
+
+	for (; i < PTRS_PER_PMD; i++, address = next) {
+		pmd_t *pmd = pmd_page + pmd_index(address);
+		pte_t *pte;
+		pgprot_t new_prot = prot;
+
+		next = (address & PMD_MASK) + PMD_SIZE;
+		if (address >= end) {
+			if (!after_bootmem &&
+			    !e820_any_mapped(address & PMD_MASK, next, E820_RAM) &&
+			    !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN))
+				set_pmd(pmd, __pmd(0));
+			continue;
+		}
+
+		if (pmd_val(*pmd)) {
+			if (!pmd_large(*pmd)) {
+				spin_lock(&init_mm.page_table_lock);
+				pte = (pte_t *)pmd_page_vaddr(*pmd);
+				last_map_addr = phys_pte_init(pte, address,
+								end, prot);
+				spin_unlock(&init_mm.page_table_lock);
+				continue;
+			}
+			/*
+			 * If we are ok with PG_LEVEL_2M mapping, then we will
+			 * use the existing mapping,
+			 *
+			 * Otherwise, we will split the large page mapping but
+			 * use the same existing protection bits except for
+			 * large page, so that we don't violate Intel's TLB
+			 * Application note (317080) which says, while changing
+			 * the page sizes, new and old translations should
+			 * not differ with respect to page frame and
+			 * attributes.
+			 */
+			if (page_size_mask & (1 << PG_LEVEL_2M)) {
+				if (!after_bootmem)
+					pages++;
+				last_map_addr = next;
+				continue;
+			}
+			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
+		}
+
+		if (page_size_mask & (1<<PG_LEVEL_2M)) {
+			pages++;
+			spin_lock(&init_mm.page_table_lock);
+			set_pte((pte_t *)pmd,
+				pfn_pte((address & PMD_MASK) >> PAGE_SHIFT,
+					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
+			spin_unlock(&init_mm.page_table_lock);
+			last_map_addr = next;
+			continue;
+		}
+
+		pte = alloc_low_page();
+		last_map_addr = phys_pte_init(pte, address, end, new_prot);
+
+		spin_lock(&init_mm.page_table_lock);
+		pmd_populate_kernel(&init_mm, pmd, pte);
+		spin_unlock(&init_mm.page_table_lock);
+	}
+	update_page_count(PG_LEVEL_2M, pages);
+	return last_map_addr;
+}
+
+static unsigned long __meminit
+phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
+			 unsigned long page_size_mask)
+{
+	unsigned long pages = 0, next;
+	unsigned long last_map_addr = end;
+	int i = pud_index(addr);
+
+	for (; i < PTRS_PER_PUD; i++, addr = next) {
+		pud_t *pud = pud_page + pud_index(addr);
+		pmd_t *pmd;
+		pgprot_t prot = PAGE_KERNEL;
+
+		next = (addr & PUD_MASK) + PUD_SIZE;
+		if (addr >= end) {
+			if (!after_bootmem &&
+			    !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) &&
+			    !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN))
+				set_pud(pud, __pud(0));
+			continue;
+		}
+
+		if (pud_val(*pud)) {
+			if (!pud_large(*pud)) {
+				pmd = pmd_offset(pud, 0);
+				last_map_addr = phys_pmd_init(pmd, addr, end,
+							 page_size_mask, prot);
+				__flush_tlb_all();
+				continue;
+			}
+			/*
+			 * If we are ok with PG_LEVEL_1G mapping, then we will
+			 * use the existing mapping.
+			 *
+			 * Otherwise, we will split the gbpage mapping but use
+			 * the same existing protection  bits except for large
+			 * page, so that we don't violate Intel's TLB
+			 * Application note (317080) which says, while changing
+			 * the page sizes, new and old translations should
+			 * not differ with respect to page frame and
+			 * attributes.
+			 */
+			if (page_size_mask & (1 << PG_LEVEL_1G)) {
+				if (!after_bootmem)
+					pages++;
+				last_map_addr = next;
+				continue;
+			}
+			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
+		}
+
+		if (page_size_mask & (1<<PG_LEVEL_1G)) {
+			pages++;
+			spin_lock(&init_mm.page_table_lock);
+			set_pte((pte_t *)pud,
+				pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT,
+					PAGE_KERNEL_LARGE));
+			spin_unlock(&init_mm.page_table_lock);
+			last_map_addr = next;
+			continue;
+		}
+
+		pmd = alloc_low_page();
+		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
+					      prot);
+
+		spin_lock(&init_mm.page_table_lock);
+		pud_populate(&init_mm, pud, pmd);
+		spin_unlock(&init_mm.page_table_lock);
+	}
+	__flush_tlb_all();
+
+	update_page_count(PG_LEVEL_1G, pages);
+
+	return last_map_addr;
+}
+
+unsigned long __meminit
+kernel_physical_mapping_init(unsigned long start,
+			     unsigned long end,
+			     unsigned long page_size_mask)
+{
+	bool pgd_changed = false;
+	unsigned long next, last_map_addr = end;
+	unsigned long addr;
+
+	start = (unsigned long)__va(start);
+	end = (unsigned long)__va(end);
+	addr = start;
+
+	for (; start < end; start = next) {
+		pgd_t *pgd = pgd_offset_k(start);
+		pud_t *pud;
+
+		next = (start & PGDIR_MASK) + PGDIR_SIZE;
+
+		if (pgd_val(*pgd)) {
+			pud = (pud_t *)pgd_page_vaddr(*pgd);
+			last_map_addr = phys_pud_init(pud, __pa(start),
+						 __pa(end), page_size_mask);
+			continue;
+		}
+
+		pud = alloc_low_page();
+		last_map_addr = phys_pud_init(pud, __pa(start), __pa(end),
+						 page_size_mask);
+
+		spin_lock(&init_mm.page_table_lock);
+		pgd_populate(&init_mm, pgd, pud);
+		spin_unlock(&init_mm.page_table_lock);
+		pgd_changed = true;
+	}
+
+	if (pgd_changed)
+		sync_global_pgds(addr, end - 1, 0);
+
+	__flush_tlb_all();
+
+	return last_map_addr;
+}
+
+#ifndef CONFIG_NUMA
+void __init initmem_init(void)
+{
+	memblock_set_node(0, (phys_addr_t)ULLONG_MAX, &memblock.memory, 0);
+}
+#endif
+
+void __init paging_init(void)
+{
+	sparse_memory_present_with_active_regions(MAX_NUMNODES);
+	sparse_init();
+
+	/*
+	 * clear the default setting with node 0
+	 * note: don't use nodes_clear here, that is really clearing when
+	 *	 numa support is not compiled in, and later node_set_state
+	 *	 will not set it back.
+	 */
+	node_clear_state(0, N_MEMORY);
+	if (N_MEMORY != N_NORMAL_MEMORY)
+		node_clear_state(0, N_NORMAL_MEMORY);
+
+	zone_sizes_init();
+}
+
+/*
+ * Memory hotplug specific functions
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
+ * updating.
+ */
+static void  update_end_of_memory_vars(u64 start, u64 size)
+{
+	unsigned long end_pfn = PFN_UP(start + size);
+
+	if (end_pfn > max_pfn) {
+		max_pfn = end_pfn;
+		max_low_pfn = end_pfn;
+		high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
+	}
+}
+
+/*
+ * Memory is added always to NORMAL zone. This means you will never get
+ * additional DMA/DMA32 memory.
+ */
+int arch_add_memory(int nid, u64 start, u64 size)
+{
+	struct pglist_data *pgdat = NODE_DATA(nid);
+	struct zone *zone = pgdat->node_zones +
+		zone_for_memory(nid, start, size, ZONE_NORMAL);
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	int ret;
+
+	init_memory_mapping(start, start + size);
+
+	ret = __add_pages(nid, zone, start_pfn, nr_pages);
+	WARN_ON_ONCE(ret);
+
+	/* update max_pfn, max_low_pfn and high_memory */
+	update_end_of_memory_vars(start, size);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(arch_add_memory);
+
+#define PAGE_INUSE 0xFD
+
+static void __meminit free_pagetable(struct page *page, int order)
+{
+	unsigned long magic;
+	unsigned int nr_pages = 1 << order;
+
+	/* bootmem page has reserved flag */
+	if (PageReserved(page)) {
+		__ClearPageReserved(page);
+
+		magic = (unsigned long)page->lru.next;
+		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
+			while (nr_pages--)
+				put_page_bootmem(page++);
+		} else
+			while (nr_pages--)
+				free_reserved_page(page++);
+	} else
+		free_pages((unsigned long)page_address(page), order);
+}
+
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+	pte_t *pte;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		pte = pte_start + i;
+		if (pte_val(*pte))
+			return;
+	}
+
+	/* free a pte talbe */
+	free_pagetable(pmd_page(*pmd), 0);
+	spin_lock(&init_mm.page_table_lock);
+	pmd_clear(pmd);
+	spin_unlock(&init_mm.page_table_lock);
+}
+
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+	pmd_t *pmd;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd = pmd_start + i;
+		if (pmd_val(*pmd))
+			return;
+	}
+
+	/* free a pmd talbe */
+	free_pagetable(pud_page(*pud), 0);
+	spin_lock(&init_mm.page_table_lock);
+	pud_clear(pud);
+	spin_unlock(&init_mm.page_table_lock);
+}
+
+/* Return true if pgd is changed, otherwise return false. */
+static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd)
+{
+	pud_t *pud;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		pud = pud_start + i;
+		if (pud_val(*pud))
+			return false;
+	}
+
+	/* free a pud table */
+	free_pagetable(pgd_page(*pgd), 0);
+	spin_lock(&init_mm.page_table_lock);
+	pgd_clear(pgd);
+	spin_unlock(&init_mm.page_table_lock);
+
+	return true;
+}
+
+static void __meminit
+remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
+		 bool direct)
+{
+	unsigned long next, pages = 0;
+	pte_t *pte;
+	void *page_addr;
+	phys_addr_t phys_addr;
+
+	pte = pte_start + pte_index(addr);
+	for (; addr < end; addr = next, pte++) {
+		next = (addr + PAGE_SIZE) & PAGE_MASK;
+		if (next > end)
+			next = end;
+
+		if (!pte_present(*pte))
+			continue;
+
+		/*
+		 * We mapped [0,1G) memory as identity mapping when
+		 * initializing, in arch/x86/kernel/head_64.S. These
+		 * pagetables cannot be removed.
+		 */
+		phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
+		if (phys_addr < (phys_addr_t)0x40000000)
+			return;
+
+		if (IS_ALIGNED(addr, PAGE_SIZE) &&
+		    IS_ALIGNED(next, PAGE_SIZE)) {
+			/*
+			 * Do not free direct mapping pages since they were
+			 * freed when offlining, or simplely not in use.
+			 */
+			if (!direct)
+				free_pagetable(pte_page(*pte), 0);
+
+			spin_lock(&init_mm.page_table_lock);
+			pte_clear(&init_mm, addr, pte);
+			spin_unlock(&init_mm.page_table_lock);
+
+			/* For non-direct mapping, pages means nothing. */
+			pages++;
+		} else {
+			/*
+			 * If we are here, we are freeing vmemmap pages since
+			 * direct mapped memory ranges to be freed are aligned.
+			 *
+			 * If we are not removing the whole page, it means
+			 * other page structs in this page are being used and
+			 * we canot remove them. So fill the unused page_structs
+			 * with 0xFD, and remove the page when it is wholly
+			 * filled with 0xFD.
+			 */
+			memset((void *)addr, PAGE_INUSE, next - addr);
+
+			page_addr = page_address(pte_page(*pte));
+			if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
+				free_pagetable(pte_page(*pte), 0);
+
+				spin_lock(&init_mm.page_table_lock);
+				pte_clear(&init_mm, addr, pte);
+				spin_unlock(&init_mm.page_table_lock);
+			}
+		}
+	}
+
+	/* Call free_pte_table() in remove_pmd_table(). */
+	flush_tlb_all();
+	if (direct)
+		update_page_count(PG_LEVEL_4K, -pages);
+}
+
+static void __meminit
+remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
+		 bool direct)
+{
+	unsigned long next, pages = 0;
+	pte_t *pte_base;
+	pmd_t *pmd;
+	void *page_addr;
+
+	pmd = pmd_start + pmd_index(addr);
+	for (; addr < end; addr = next, pmd++) {
+		next = pmd_addr_end(addr, end);
+
+		if (!pmd_present(*pmd))
+			continue;
+
+		if (pmd_large(*pmd)) {
+			if (IS_ALIGNED(addr, PMD_SIZE) &&
+			    IS_ALIGNED(next, PMD_SIZE)) {
+				if (!direct)
+					free_pagetable(pmd_page(*pmd),
+						       get_order(PMD_SIZE));
+
+				spin_lock(&init_mm.page_table_lock);
+				pmd_clear(pmd);
+				spin_unlock(&init_mm.page_table_lock);
+				pages++;
+			} else {
+				/* If here, we are freeing vmemmap pages. */
+				memset((void *)addr, PAGE_INUSE, next - addr);
+
+				page_addr = page_address(pmd_page(*pmd));
+				if (!memchr_inv(page_addr, PAGE_INUSE,
+						PMD_SIZE)) {
+					free_pagetable(pmd_page(*pmd),
+						       get_order(PMD_SIZE));
+
+					spin_lock(&init_mm.page_table_lock);
+					pmd_clear(pmd);
+					spin_unlock(&init_mm.page_table_lock);
+				}
+			}
+
+			continue;
+		}
+
+		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
+		remove_pte_table(pte_base, addr, next, direct);
+		free_pte_table(pte_base, pmd);
+	}
+
+	/* Call free_pmd_table() in remove_pud_table(). */
+	if (direct)
+		update_page_count(PG_LEVEL_2M, -pages);
+}
+
+static void __meminit
+remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
+		 bool direct)
+{
+	unsigned long next, pages = 0;
+	pmd_t *pmd_base;
+	pud_t *pud;
+	void *page_addr;
+
+	pud = pud_start + pud_index(addr);
+	for (; addr < end; addr = next, pud++) {
+		next = pud_addr_end(addr, end);
+
+		if (!pud_present(*pud))
+			continue;
+
+		if (pud_large(*pud)) {
+			if (IS_ALIGNED(addr, PUD_SIZE) &&
+			    IS_ALIGNED(next, PUD_SIZE)) {
+				if (!direct)
+					free_pagetable(pud_page(*pud),
+						       get_order(PUD_SIZE));
+
+				spin_lock(&init_mm.page_table_lock);
+				pud_clear(pud);
+				spin_unlock(&init_mm.page_table_lock);
+				pages++;
+			} else {
+				/* If here, we are freeing vmemmap pages. */
+				memset((void *)addr, PAGE_INUSE, next - addr);
+
+				page_addr = page_address(pud_page(*pud));
+				if (!memchr_inv(page_addr, PAGE_INUSE,
+						PUD_SIZE)) {
+					free_pagetable(pud_page(*pud),
+						       get_order(PUD_SIZE));
+
+					spin_lock(&init_mm.page_table_lock);
+					pud_clear(pud);
+					spin_unlock(&init_mm.page_table_lock);
+				}
+			}
+
+			continue;
+		}
+
+		pmd_base = (pmd_t *)pud_page_vaddr(*pud);
+		remove_pmd_table(pmd_base, addr, next, direct);
+		free_pmd_table(pmd_base, pud);
+	}
+
+	if (direct)
+		update_page_count(PG_LEVEL_1G, -pages);
+}
+
+/* start and end are both virtual address. */
+static void __meminit
+remove_pagetable(unsigned long start, unsigned long end, bool direct)
+{
+	unsigned long next;
+	unsigned long addr;
+	pgd_t *pgd;
+	pud_t *pud;
+	bool pgd_changed = false;
+
+	for (addr = start; addr < end; addr = next) {
+		next = pgd_addr_end(addr, end);
+
+		pgd = pgd_offset_k(addr);
+		if (!pgd_present(*pgd))
+			continue;
+
+		pud = (pud_t *)pgd_page_vaddr(*pgd);
+		remove_pud_table(pud, addr, next, direct);
+		if (free_pud_table(pud, pgd))
+			pgd_changed = true;
+	}
+
+	if (pgd_changed)
+		sync_global_pgds(start, end - 1, 1);
+
+	flush_tlb_all();
+}
+
+void __ref vmemmap_free(unsigned long start, unsigned long end)
+{
+	remove_pagetable(start, end, false);
+}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static void __meminit
+kernel_physical_mapping_remove(unsigned long start, unsigned long end)
+{
+	start = (unsigned long)__va(start);
+	end = (unsigned long)__va(end);
+
+	remove_pagetable(start, end, true);
+}
+
+int __ref arch_remove_memory(u64 start, u64 size)
+{
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	struct zone *zone;
+	int ret;
+
+	zone = page_zone(pfn_to_page(start_pfn));
+	kernel_physical_mapping_remove(start, start + size);
+	ret = __remove_pages(zone, start_pfn, nr_pages);
+	WARN_ON_ONCE(ret);
+
+	return ret;
+}
+#endif
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+static struct kcore_list kcore_vsyscall;
+
+static void __init register_page_bootmem_info(void)
+{
+#ifdef CONFIG_NUMA
+	int i;
+
+	for_each_online_node(i)
+		register_page_bootmem_info_node(NODE_DATA(i));
+#endif
+}
+
+void __init mem_init(void)
+{
+	pci_iommu_alloc();
+
+	/* clear_bss() already clear the empty_zero_page */
+
+	register_page_bootmem_info();
+
+	/* this will put all memory onto the freelists */
+	free_all_bootmem();
+	after_bootmem = 1;
+
+	/* Register memory areas for /proc/kcore */
+	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR,
+			 PAGE_SIZE, KCORE_OTHER);
+
+	mem_init_print_info(NULL);
+}
+
+#ifdef CONFIG_DEBUG_RODATA
+const int rodata_test_data = 0xC3;
+EXPORT_SYMBOL_GPL(rodata_test_data);
+
+int kernel_set_to_readonly;
+
+void set_kernel_text_rw(void)
+{
+	unsigned long start = PFN_ALIGN(_text);
+	unsigned long end = PFN_ALIGN(__stop___ex_table);
+
+	if (!kernel_set_to_readonly)
+		return;
+
+	pr_debug("Set kernel text: %lx - %lx for read write\n",
+		 start, end);
+
+	/*
+	 * Make the kernel identity mapping for text RW. Kernel text
+	 * mapping will always be RO. Refer to the comment in
+	 * static_protections() in pageattr.c
+	 */
+	set_memory_rw(start, (end - start) >> PAGE_SHIFT);
+}
+
+void set_kernel_text_ro(void)
+{
+	unsigned long start = PFN_ALIGN(_text);
+	unsigned long end = PFN_ALIGN(__stop___ex_table);
+
+	if (!kernel_set_to_readonly)
+		return;
+
+	pr_debug("Set kernel text: %lx - %lx for read only\n",
+		 start, end);
+
+	/*
+	 * Set the kernel identity mapping for text RO.
+	 */
+	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
+}
+
+void mark_rodata_ro(void)
+{
+	unsigned long start = PFN_ALIGN(_text);
+	unsigned long rodata_start = PFN_ALIGN(__start_rodata);
+	unsigned long end = (unsigned long) &__end_rodata_hpage_align;
+	unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
+	unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
+	unsigned long all_end;
+
+	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
+	       (end - start) >> 10);
+	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
+
+	kernel_set_to_readonly = 1;
+
+	/*
+	 * The rodata/data/bss/brk section (but not the kernel text!)
+	 * should also be not-executable.
+	 *
+	 * We align all_end to PMD_SIZE because the existing mapping
+	 * is a full PMD. If we would align _brk_end to PAGE_SIZE we
+	 * split the PMD and the reminder between _brk_end and the end
+	 * of the PMD will remain mapped executable.
+	 *
+	 * Any PMD which was setup after the one which covers _brk_end
+	 * has been zapped already via cleanup_highmem().
+	 */
+	all_end = roundup((unsigned long)_brk_end, PMD_SIZE);
+	set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT);
+
+	rodata_test();
+
+#ifdef CONFIG_CPA_DEBUG
+	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
+	set_memory_rw(start, (end-start) >> PAGE_SHIFT);
+
+	printk(KERN_INFO "Testing CPA: again\n");
+	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
+#endif
+
+	free_init_pages("unused kernel",
+			(unsigned long) __va(__pa_symbol(text_end)),
+			(unsigned long) __va(__pa_symbol(rodata_start)));
+	free_init_pages("unused kernel",
+			(unsigned long) __va(__pa_symbol(rodata_end)),
+			(unsigned long) __va(__pa_symbol(_sdata)));
+}
+
+#endif
+
+int kern_addr_valid(unsigned long addr)
+{
+	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	if (above != 0 && above != -1UL)
+		return 0;
+
+	pgd = pgd_offset_k(addr);
+	if (pgd_none(*pgd))
+		return 0;
+
+	pud = pud_offset(pgd, addr);
+	if (pud_none(*pud))
+		return 0;
+
+	if (pud_large(*pud))
+		return pfn_valid(pud_pfn(*pud));
+
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none(*pmd))
+		return 0;
+
+	if (pmd_large(*pmd))
+		return pfn_valid(pmd_pfn(*pmd));
+
+	pte = pte_offset_kernel(pmd, addr);
+	if (pte_none(*pte))
+		return 0;
+
+	return pfn_valid(pte_pfn(*pte));
+}
+
+static unsigned long probe_memory_block_size(void)
+{
+	/* start from 2g */
+	unsigned long bz = 1UL<<31;
+
+	if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) {
+		pr_info("Using 2GB memory block size for large-memory system\n");
+		return 2UL * 1024 * 1024 * 1024;
+	}
+
+	/* less than 64g installed */
+	if ((max_pfn << PAGE_SHIFT) < (16UL << 32))
+		return MIN_MEMORY_BLOCK_SIZE;
+
+	/* get the tail size */
+	while (bz > MIN_MEMORY_BLOCK_SIZE) {
+		if (!((max_pfn << PAGE_SHIFT) & (bz - 1)))
+			break;
+		bz >>= 1;
+	}
+
+	printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20);
+
+	return bz;
+}
+
+static unsigned long memory_block_size_probed;
+unsigned long memory_block_size_bytes(void)
+{
+	if (!memory_block_size_probed)
+		memory_block_size_probed = probe_memory_block_size();
+
+	return memory_block_size_probed;
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+/*
+ * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
+ */
+static long __meminitdata addr_start, addr_end;
+static void __meminitdata *p_start, *p_end;
+static int __meminitdata node_start;
+
+static int __meminit vmemmap_populate_hugepages(unsigned long start,
+						unsigned long end, int node)
+{
+	unsigned long addr;
+	unsigned long next;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	for (addr = start; addr < end; addr = next) {
+		next = pmd_addr_end(addr, end);
+
+		pgd = vmemmap_pgd_populate(addr, node);
+		if (!pgd)
+			return -ENOMEM;
+
+		pud = vmemmap_pud_populate(pgd, addr, node);
+		if (!pud)
+			return -ENOMEM;
+
+		pmd = pmd_offset(pud, addr);
+		if (pmd_none(*pmd)) {
+			void *p;
+
+			p = vmemmap_alloc_block_buf(PMD_SIZE, node);
+			if (p) {
+				pte_t entry;
+
+				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
+						PAGE_KERNEL_LARGE);
+				set_pmd(pmd, __pmd(pte_val(entry)));
+
+				/* check to see if we have contiguous blocks */
+				if (p_end != p || node_start != node) {
+					if (p_start)
+						printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+						       addr_start, addr_end-1, p_start, p_end-1, node_start);
+					addr_start = addr;
+					node_start = node;
+					p_start = p;
+				}
+
+				addr_end = addr + PMD_SIZE;
+				p_end = p + PMD_SIZE;
+				continue;
+			}
+		} else if (pmd_large(*pmd)) {
+			vmemmap_verify((pte_t *)pmd, node, addr, next);
+			continue;
+		}
+		pr_warn_once("vmemmap: falling back to regular page backing\n");
+		if (vmemmap_populate_basepages(addr, next, node))
+			return -ENOMEM;
+	}
+	return 0;
+}
+
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
+{
+	int err;
+
+	if (cpu_has_pse)
+		err = vmemmap_populate_hugepages(start, end, node);
+	else
+		err = vmemmap_populate_basepages(start, end, node);
+	if (!err)
+		sync_global_pgds(start, end - 1, 0);
+	return err;
+}
+
+#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
+void register_page_bootmem_memmap(unsigned long section_nr,
+				  struct page *start_page, unsigned long size)
+{
+	unsigned long addr = (unsigned long)start_page;
+	unsigned long end = (unsigned long)(start_page + size);
+	unsigned long next;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	unsigned int nr_pages;
+	struct page *page;
+
+	for (; addr < end; addr = next) {
+		pte_t *pte = NULL;
+
+		pgd = pgd_offset_k(addr);
+		if (pgd_none(*pgd)) {
+			next = (addr + PAGE_SIZE) & PAGE_MASK;
+			continue;
+		}
+		get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);
+
+		pud = pud_offset(pgd, addr);
+		if (pud_none(*pud)) {
+			next = (addr + PAGE_SIZE) & PAGE_MASK;
+			continue;
+		}
+		get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
+
+		if (!cpu_has_pse) {
+			next = (addr + PAGE_SIZE) & PAGE_MASK;
+			pmd = pmd_offset(pud, addr);
+			if (pmd_none(*pmd))
+				continue;
+			get_page_bootmem(section_nr, pmd_page(*pmd),
+					 MIX_SECTION_INFO);
+
+			pte = pte_offset_kernel(pmd, addr);
+			if (pte_none(*pte))
+				continue;
+			get_page_bootmem(section_nr, pte_page(*pte),
+					 SECTION_INFO);
+		} else {
+			next = pmd_addr_end(addr, end);
+
+			pmd = pmd_offset(pud, addr);
+			if (pmd_none(*pmd))
+				continue;
+
+			nr_pages = 1 << (get_order(PMD_SIZE));
+			page = pmd_page(*pmd);
+			while (nr_pages--)
+				get_page_bootmem(section_nr, page++,
+						 SECTION_INFO);
+		}
+	}
+}
+#endif
+
+void __meminit vmemmap_populate_print_last(void)
+{
+	if (p_start) {
+		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+			addr_start, addr_end-1, p_start, p_end-1, node_start);
+		p_start = NULL;
+		p_end = NULL;
+		node_start = 0;
+	}
+}
+#endif
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
new file mode 100644
index 000000000..9ca35fc60
--- /dev/null
+++ b/arch/x86/mm/iomap_32.c
@@ -0,0 +1,121 @@
+/*
+ * Copyright © 2008 Ingo Molnar
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <asm/iomap.h>
+#include <asm/pat.h>
+#include <linux/module.h>
+#include <linux/highmem.h>
+
+static int is_io_mapping_possible(resource_size_t base, unsigned long size)
+{
+#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT)
+	/* There is no way to map greater than 1 << 32 address without PAE */
+	if (base + size > 0x100000000ULL)
+		return 0;
+#endif
+	return 1;
+}
+
+int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot)
+{
+	enum page_cache_mode pcm = _PAGE_CACHE_MODE_WC;
+	int ret;
+
+	if (!is_io_mapping_possible(base, size))
+		return -EINVAL;
+
+	ret = io_reserve_memtype(base, base + size, &pcm);
+	if (ret)
+		return ret;
+
+	*prot = __pgprot(__PAGE_KERNEL | cachemode2protval(pcm));
+	return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_create_wc);
+
+void iomap_free(resource_size_t base, unsigned long size)
+{
+	io_free_memtype(base, base + size);
+}
+EXPORT_SYMBOL_GPL(iomap_free);
+
+void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
+{
+	unsigned long vaddr;
+	int idx, type;
+
+	pagefault_disable();
+
+	type = kmap_atomic_idx_push();
+	idx = type + KM_TYPE_NR * smp_processor_id();
+	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+	set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
+	arch_flush_lazy_mmu_mode();
+
+	return (void *)vaddr;
+}
+
+/*
+ * Map 'pfn' using protections 'prot'
+ */
+void __iomem *
+iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
+{
+	/*
+	 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
+	 * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
+	 * MTRR is UC or WC.  UC_MINUS gets the real intention, of the
+	 * user, which is "WC if the MTRR is WC, UC if you can't do that."
+	 */
+	if (!pat_enabled && pgprot_val(prot) ==
+	    (__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_WC)))
+		prot = __pgprot(__PAGE_KERNEL |
+				cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS));
+
+	return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot);
+}
+EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
+
+void
+iounmap_atomic(void __iomem *kvaddr)
+{
+	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
+
+	if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
+	    vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
+		int idx, type;
+
+		type = kmap_atomic_idx();
+		idx = type + KM_TYPE_NR * smp_processor_id();
+
+#ifdef CONFIG_DEBUG_HIGHMEM
+		WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
+#endif
+		/*
+		 * Force other mappings to Oops if they'll try to access this
+		 * pte without first remap it.  Keeping stale mappings around
+		 * is a bad idea also, in case the page changes cacheability
+		 * attributes or becomes a protected page in a hypervisor.
+		 */
+		kpte_clear_flush(kmap_pte-idx, vaddr);
+		kmap_atomic_idx_pop();
+	}
+
+	pagefault_enable();
+}
+EXPORT_SYMBOL_GPL(iounmap_atomic);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
new file mode 100644
index 000000000..70e7444c6
--- /dev/null
+++ b/arch/x86/mm/ioremap.c
@@ -0,0 +1,458 @@
+/*
+ * Re-map IO memory to kernel address space so that we can access it.
+ * This is needed for high PCI addresses that aren't mapped in the
+ * 640k-1MB IO memory area on PC's
+ *
+ * (C) Copyright 1995 1996 Linus Torvalds
+ */
+
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mmiotrace.h>
+
+#include <asm/cacheflush.h>
+#include <asm/e820.h>
+#include <asm/fixmap.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/pgalloc.h>
+#include <asm/pat.h>
+
+#include "physaddr.h"
+
+/*
+ * Fix up the linear direct mapping of the kernel to avoid cache attribute
+ * conflicts.
+ */
+int ioremap_change_attr(unsigned long vaddr, unsigned long size,
+			enum page_cache_mode pcm)
+{
+	unsigned long nrpages = size >> PAGE_SHIFT;
+	int err;
+
+	switch (pcm) {
+	case _PAGE_CACHE_MODE_UC:
+	default:
+		err = _set_memory_uc(vaddr, nrpages);
+		break;
+	case _PAGE_CACHE_MODE_WC:
+		err = _set_memory_wc(vaddr, nrpages);
+		break;
+	case _PAGE_CACHE_MODE_WB:
+		err = _set_memory_wb(vaddr, nrpages);
+		break;
+	}
+
+	return err;
+}
+
+static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
+			       void *arg)
+{
+	unsigned long i;
+
+	for (i = 0; i < nr_pages; ++i)
+		if (pfn_valid(start_pfn + i) &&
+		    !PageReserved(pfn_to_page(start_pfn + i)))
+			return 1;
+
+	WARN_ONCE(1, "ioremap on RAM pfn 0x%lx\n", start_pfn);
+
+	return 0;
+}
+
+/*
+ * Remap an arbitrary physical address space into the kernel virtual
+ * address space. It transparently creates kernel huge I/O mapping when
+ * the physical address is aligned by a huge page size (1GB or 2MB) and
+ * the requested size is at least the huge page size.
+ *
+ * NOTE: MTRRs can override PAT memory types with a 4KB granularity.
+ * Therefore, the mapping code falls back to use a smaller page toward 4KB
+ * when a mapping range is covered by non-WB type of MTRRs.
+ *
+ * NOTE! We need to allow non-page-aligned mappings too: we will obviously
+ * have to convert them into an offset in a page-aligned mapping, but the
+ * caller shouldn't need to know that small detail.
+ */
+static void __iomem *__ioremap_caller(resource_size_t phys_addr,
+		unsigned long size, enum page_cache_mode pcm, void *caller)
+{
+	unsigned long offset, vaddr;
+	resource_size_t pfn, last_pfn, last_addr;
+	const resource_size_t unaligned_phys_addr = phys_addr;
+	const unsigned long unaligned_size = size;
+	struct vm_struct *area;
+	enum page_cache_mode new_pcm;
+	pgprot_t prot;
+	int retval;
+	void __iomem *ret_addr;
+	int ram_region;
+
+	/* Don't allow wraparound or zero size */
+	last_addr = phys_addr + size - 1;
+	if (!size || last_addr < phys_addr)
+		return NULL;
+
+	if (!phys_addr_valid(phys_addr)) {
+		printk(KERN_WARNING "ioremap: invalid physical address %llx\n",
+		       (unsigned long long)phys_addr);
+		WARN_ON_ONCE(1);
+		return NULL;
+	}
+
+	/*
+	 * Don't remap the low PCI/ISA area, it's always mapped..
+	 */
+	if (is_ISA_range(phys_addr, last_addr))
+		return (__force void __iomem *)phys_to_virt(phys_addr);
+
+	/*
+	 * Don't allow anybody to remap normal RAM that we're using..
+	 */
+	/* First check if whole region can be identified as RAM or not */
+	ram_region = region_is_ram(phys_addr, size);
+	if (ram_region > 0) {
+		WARN_ONCE(1, "ioremap on RAM at 0x%lx - 0x%lx\n",
+				(unsigned long int)phys_addr,
+				(unsigned long int)last_addr);
+		return NULL;
+	}
+
+	/* If could not be identified(-1), check page by page */
+	if (ram_region < 0) {
+		pfn      = phys_addr >> PAGE_SHIFT;
+		last_pfn = last_addr >> PAGE_SHIFT;
+		if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
+					  __ioremap_check_ram) == 1)
+			return NULL;
+	}
+	/*
+	 * Mappings have to be page-aligned
+	 */
+	offset = phys_addr & ~PAGE_MASK;
+	phys_addr &= PHYSICAL_PAGE_MASK;
+	size = PAGE_ALIGN(last_addr+1) - phys_addr;
+
+	retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
+						pcm, &new_pcm);
+	if (retval) {
+		printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval);
+		return NULL;
+	}
+
+	if (pcm != new_pcm) {
+		if (!is_new_memtype_allowed(phys_addr, size, pcm, new_pcm)) {
+			printk(KERN_ERR
+		"ioremap error for 0x%llx-0x%llx, requested 0x%x, got 0x%x\n",
+				(unsigned long long)phys_addr,
+				(unsigned long long)(phys_addr + size),
+				pcm, new_pcm);
+			goto err_free_memtype;
+		}
+		pcm = new_pcm;
+	}
+
+	prot = PAGE_KERNEL_IO;
+	switch (pcm) {
+	case _PAGE_CACHE_MODE_UC:
+	default:
+		prot = __pgprot(pgprot_val(prot) |
+				cachemode2protval(_PAGE_CACHE_MODE_UC));
+		break;
+	case _PAGE_CACHE_MODE_UC_MINUS:
+		prot = __pgprot(pgprot_val(prot) |
+				cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS));
+		break;
+	case _PAGE_CACHE_MODE_WC:
+		prot = __pgprot(pgprot_val(prot) |
+				cachemode2protval(_PAGE_CACHE_MODE_WC));
+		break;
+	case _PAGE_CACHE_MODE_WB:
+		break;
+	}
+
+	/*
+	 * Ok, go for it..
+	 */
+	area = get_vm_area_caller(size, VM_IOREMAP, caller);
+	if (!area)
+		goto err_free_memtype;
+	area->phys_addr = phys_addr;
+	vaddr = (unsigned long) area->addr;
+
+	if (kernel_map_sync_memtype(phys_addr, size, pcm))
+		goto err_free_area;
+
+	if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot))
+		goto err_free_area;
+
+	ret_addr = (void __iomem *) (vaddr + offset);
+	mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
+
+	/*
+	 * Check if the request spans more than any BAR in the iomem resource
+	 * tree.
+	 */
+	WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size),
+		  KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
+
+	return ret_addr;
+err_free_area:
+	free_vm_area(area);
+err_free_memtype:
+	free_memtype(phys_addr, phys_addr + size);
+	return NULL;
+}
+
+/**
+ * ioremap_nocache     -   map bus memory into CPU space
+ * @phys_addr:    bus address of the memory
+ * @size:      size of the resource to map
+ *
+ * ioremap_nocache performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * This version of ioremap ensures that the memory is marked uncachable
+ * on the CPU as well as honouring existing caching rules from things like
+ * the PCI bus. Note that there are other caches and buffers on many
+ * busses. In particular driver authors should read up on PCI writes
+ *
+ * It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable:
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
+{
+	/*
+	 * Ideally, this should be:
+	 *	pat_enabled ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS;
+	 *
+	 * Till we fix all X drivers to use ioremap_wc(), we will use
+	 * UC MINUS.
+	 */
+	enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
+
+	return __ioremap_caller(phys_addr, size, pcm,
+				__builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_nocache);
+
+/**
+ * ioremap_wc	-	map memory into CPU space write combined
+ * @phys_addr:	bus address of the memory
+ * @size:	size of the resource to map
+ *
+ * This version of ioremap ensures that the memory is marked write combining.
+ * Write combining allows faster writes to some hardware devices.
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
+{
+	if (pat_enabled)
+		return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
+					__builtin_return_address(0));
+	else
+		return ioremap_nocache(phys_addr, size);
+}
+EXPORT_SYMBOL(ioremap_wc);
+
+void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
+{
+	return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
+				__builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_cache);
+
+void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
+				unsigned long prot_val)
+{
+	return __ioremap_caller(phys_addr, size,
+				pgprot2cachemode(__pgprot(prot_val)),
+				__builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_prot);
+
+/**
+ * iounmap - Free a IO remapping
+ * @addr: virtual address from ioremap_*
+ *
+ * Caller must ensure there is only one unmapping for the same pointer.
+ */
+void iounmap(volatile void __iomem *addr)
+{
+	struct vm_struct *p, *o;
+
+	if ((void __force *)addr <= high_memory)
+		return;
+
+	/*
+	 * __ioremap special-cases the PCI/ISA range by not instantiating a
+	 * vm_area and by simply returning an address into the kernel mapping
+	 * of ISA space.   So handle that here.
+	 */
+	if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) &&
+	    (void __force *)addr < phys_to_virt(ISA_END_ADDRESS))
+		return;
+
+	addr = (volatile void __iomem *)
+		(PAGE_MASK & (unsigned long __force)addr);
+
+	mmiotrace_iounmap(addr);
+
+	/* Use the vm area unlocked, assuming the caller
+	   ensures there isn't another iounmap for the same address
+	   in parallel. Reuse of the virtual address is prevented by
+	   leaving it in the global lists until we're done with it.
+	   cpa takes care of the direct mappings. */
+	p = find_vm_area((void __force *)addr);
+
+	if (!p) {
+		printk(KERN_ERR "iounmap: bad address %p\n", addr);
+		dump_stack();
+		return;
+	}
+
+	free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
+
+	/* Finally remove it */
+	o = remove_vm_area((void __force *)addr);
+	BUG_ON(p != o || o == NULL);
+	kfree(p);
+}
+EXPORT_SYMBOL(iounmap);
+
+int arch_ioremap_pud_supported(void)
+{
+#ifdef CONFIG_X86_64
+	return cpu_has_gbpages;
+#else
+	return 0;
+#endif
+}
+
+int arch_ioremap_pmd_supported(void)
+{
+	return cpu_has_pse;
+}
+
+/*
+ * Convert a physical pointer to a virtual kernel pointer for /dev/mem
+ * access
+ */
+void *xlate_dev_mem_ptr(phys_addr_t phys)
+{
+	unsigned long start  = phys &  PAGE_MASK;
+	unsigned long offset = phys & ~PAGE_MASK;
+	unsigned long vaddr;
+
+	/* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
+	if (page_is_ram(start >> PAGE_SHIFT))
+		return __va(phys);
+
+	vaddr = (unsigned long)ioremap_cache(start, PAGE_SIZE);
+	/* Only add the offset on success and return NULL if the ioremap() failed: */
+	if (vaddr)
+		vaddr += offset;
+
+	return (void *)vaddr;
+}
+
+void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
+{
+	if (page_is_ram(phys >> PAGE_SHIFT))
+		return;
+
+	iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
+	return;
+}
+
+static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
+
+static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
+{
+	/* Don't assume we're using swapper_pg_dir at this point */
+	pgd_t *base = __va(read_cr3());
+	pgd_t *pgd = &base[pgd_index(addr)];
+	pud_t *pud = pud_offset(pgd, addr);
+	pmd_t *pmd = pmd_offset(pud, addr);
+
+	return pmd;
+}
+
+static inline pte_t * __init early_ioremap_pte(unsigned long addr)
+{
+	return &bm_pte[pte_index(addr)];
+}
+
+bool __init is_early_ioremap_ptep(pte_t *ptep)
+{
+	return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)];
+}
+
+void __init early_ioremap_init(void)
+{
+	pmd_t *pmd;
+
+#ifdef CONFIG_X86_64
+	BUILD_BUG_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1));
+#else
+	WARN_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1));
+#endif
+
+	early_ioremap_setup();
+
+	pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
+	memset(bm_pte, 0, sizeof(bm_pte));
+	pmd_populate_kernel(&init_mm, pmd, bm_pte);
+
+	/*
+	 * The boot-ioremap range spans multiple pmds, for which
+	 * we are not prepared:
+	 */
+#define __FIXADDR_TOP (-PAGE_SIZE)
+	BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
+		     != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
+#undef __FIXADDR_TOP
+	if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
+		WARN_ON(1);
+		printk(KERN_WARNING "pmd %p != %p\n",
+		       pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END)));
+		printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
+			fix_to_virt(FIX_BTMAP_BEGIN));
+		printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END):   %08lx\n",
+			fix_to_virt(FIX_BTMAP_END));
+
+		printk(KERN_WARNING "FIX_BTMAP_END:       %d\n", FIX_BTMAP_END);
+		printk(KERN_WARNING "FIX_BTMAP_BEGIN:     %d\n",
+		       FIX_BTMAP_BEGIN);
+	}
+}
+
+void __init __early_set_fixmap(enum fixed_addresses idx,
+			       phys_addr_t phys, pgprot_t flags)
+{
+	unsigned long addr = __fix_to_virt(idx);
+	pte_t *pte;
+
+	if (idx >= __end_of_fixed_addresses) {
+		BUG();
+		return;
+	}
+	pte = early_ioremap_pte(addr);
+
+	if (pgprot_val(flags))
+		set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
+	else
+		pte_clear(&init_mm, addr, pte);
+	__flush_tlb_one(addr);
+}
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
new file mode 100644
index 000000000..4860906c6
--- /dev/null
+++ b/arch/x86/mm/kasan_init_64.c
@@ -0,0 +1,206 @@
+#include <linux/bootmem.h>
+#include <linux/kasan.h>
+#include <linux/kdebug.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+
+extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern struct range pfn_mapped[E820_X_MAX];
+
+extern unsigned char kasan_zero_page[PAGE_SIZE];
+
+static int __init map_range(struct range *range)
+{
+	unsigned long start;
+	unsigned long end;
+
+	start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start));
+	end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end));
+
+	/*
+	 * end + 1 here is intentional. We check several shadow bytes in advance
+	 * to slightly speed up fastpath. In some rare cases we could cross
+	 * boundary of mapped shadow, so we just map some more here.
+	 */
+	return vmemmap_populate(start, end + 1, NUMA_NO_NODE);
+}
+
+static void __init clear_pgds(unsigned long start,
+			unsigned long end)
+{
+	for (; start < end; start += PGDIR_SIZE)
+		pgd_clear(pgd_offset_k(start));
+}
+
+void __init kasan_map_early_shadow(pgd_t *pgd)
+{
+	int i;
+	unsigned long start = KASAN_SHADOW_START;
+	unsigned long end = KASAN_SHADOW_END;
+
+	for (i = pgd_index(start); start < end; i++) {
+		pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud)
+				| _KERNPG_TABLE);
+		start += PGDIR_SIZE;
+	}
+}
+
+static int __init zero_pte_populate(pmd_t *pmd, unsigned long addr,
+				unsigned long end)
+{
+	pte_t *pte = pte_offset_kernel(pmd, addr);
+
+	while (addr + PAGE_SIZE <= end) {
+		WARN_ON(!pte_none(*pte));
+		set_pte(pte, __pte(__pa_nodebug(kasan_zero_page)
+					| __PAGE_KERNEL_RO));
+		addr += PAGE_SIZE;
+		pte = pte_offset_kernel(pmd, addr);
+	}
+	return 0;
+}
+
+static int __init zero_pmd_populate(pud_t *pud, unsigned long addr,
+				unsigned long end)
+{
+	int ret = 0;
+	pmd_t *pmd = pmd_offset(pud, addr);
+
+	while (IS_ALIGNED(addr, PMD_SIZE) && addr + PMD_SIZE <= end) {
+		WARN_ON(!pmd_none(*pmd));
+		set_pmd(pmd, __pmd(__pa_nodebug(kasan_zero_pte)
+					| __PAGE_KERNEL_RO));
+		addr += PMD_SIZE;
+		pmd = pmd_offset(pud, addr);
+	}
+	if (addr < end) {
+		if (pmd_none(*pmd)) {
+			void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
+			if (!p)
+				return -ENOMEM;
+			set_pmd(pmd, __pmd(__pa_nodebug(p) | _KERNPG_TABLE));
+		}
+		ret = zero_pte_populate(pmd, addr, end);
+	}
+	return ret;
+}
+
+
+static int __init zero_pud_populate(pgd_t *pgd, unsigned long addr,
+				unsigned long end)
+{
+	int ret = 0;
+	pud_t *pud = pud_offset(pgd, addr);
+
+	while (IS_ALIGNED(addr, PUD_SIZE) && addr + PUD_SIZE <= end) {
+		WARN_ON(!pud_none(*pud));
+		set_pud(pud, __pud(__pa_nodebug(kasan_zero_pmd)
+					| __PAGE_KERNEL_RO));
+		addr += PUD_SIZE;
+		pud = pud_offset(pgd, addr);
+	}
+
+	if (addr < end) {
+		if (pud_none(*pud)) {
+			void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
+			if (!p)
+				return -ENOMEM;
+			set_pud(pud, __pud(__pa_nodebug(p) | _KERNPG_TABLE));
+		}
+		ret = zero_pmd_populate(pud, addr, end);
+	}
+	return ret;
+}
+
+static int __init zero_pgd_populate(unsigned long addr, unsigned long end)
+{
+	int ret = 0;
+	pgd_t *pgd = pgd_offset_k(addr);
+
+	while (IS_ALIGNED(addr, PGDIR_SIZE) && addr + PGDIR_SIZE <= end) {
+		WARN_ON(!pgd_none(*pgd));
+		set_pgd(pgd, __pgd(__pa_nodebug(kasan_zero_pud)
+					| __PAGE_KERNEL_RO));
+		addr += PGDIR_SIZE;
+		pgd = pgd_offset_k(addr);
+	}
+
+	if (addr < end) {
+		if (pgd_none(*pgd)) {
+			void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
+			if (!p)
+				return -ENOMEM;
+			set_pgd(pgd, __pgd(__pa_nodebug(p) | _KERNPG_TABLE));
+		}
+		ret = zero_pud_populate(pgd, addr, end);
+	}
+	return ret;
+}
+
+
+static void __init populate_zero_shadow(const void *start, const void *end)
+{
+	if (zero_pgd_populate((unsigned long)start, (unsigned long)end))
+		panic("kasan: unable to map zero shadow!");
+}
+
+
+#ifdef CONFIG_KASAN_INLINE
+static int kasan_die_handler(struct notifier_block *self,
+			     unsigned long val,
+			     void *data)
+{
+	if (val == DIE_GPF) {
+		pr_emerg("CONFIG_KASAN_INLINE enabled");
+		pr_emerg("GPF could be caused by NULL-ptr deref or user memory access");
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block kasan_die_notifier = {
+	.notifier_call = kasan_die_handler,
+};
+#endif
+
+void __init kasan_init(void)
+{
+	int i;
+
+#ifdef CONFIG_KASAN_INLINE
+	register_die_notifier(&kasan_die_notifier);
+#endif
+
+	memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt));
+	load_cr3(early_level4_pgt);
+
+	clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
+
+	populate_zero_shadow((void *)KASAN_SHADOW_START,
+			kasan_mem_to_shadow((void *)PAGE_OFFSET));
+
+	for (i = 0; i < E820_X_MAX; i++) {
+		if (pfn_mapped[i].end == 0)
+			break;
+
+		if (map_range(&pfn_mapped[i]))
+			panic("kasan: unable to allocate shadow!");
+	}
+	populate_zero_shadow(kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
+			kasan_mem_to_shadow((void *)__START_KERNEL_map));
+
+	vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext),
+			(unsigned long)kasan_mem_to_shadow(_end),
+			NUMA_NO_NODE);
+
+	populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
+			(void *)KASAN_SHADOW_END);
+
+	memset(kasan_zero_page, 0, PAGE_SIZE);
+
+	load_cr3(init_level4_pgt);
+	init_task.kasan_depth = 0;
+}
diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile
new file mode 100644
index 000000000..520b3bce4
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/Makefile
@@ -0,0 +1 @@
+obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
new file mode 100644
index 000000000..dab41876c
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -0,0 +1,227 @@
+#include <linux/interrupt.h>
+#include <linux/kdebug.h>
+#include <linux/kmemcheck.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+
+#include "error.h"
+#include "shadow.h"
+
+enum kmemcheck_error_type {
+	KMEMCHECK_ERROR_INVALID_ACCESS,
+	KMEMCHECK_ERROR_BUG,
+};
+
+#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT)
+
+struct kmemcheck_error {
+	enum kmemcheck_error_type type;
+
+	union {
+		/* KMEMCHECK_ERROR_INVALID_ACCESS */
+		struct {
+			/* Kind of access that caused the error */
+			enum kmemcheck_shadow state;
+			/* Address and size of the erroneous read */
+			unsigned long	address;
+			unsigned int	size;
+		};
+	};
+
+	struct pt_regs		regs;
+	struct stack_trace	trace;
+	unsigned long		trace_entries[32];
+
+	/* We compress it to a char. */
+	unsigned char		shadow_copy[SHADOW_COPY_SIZE];
+	unsigned char		memory_copy[SHADOW_COPY_SIZE];
+};
+
+/*
+ * Create a ring queue of errors to output. We can't call printk() directly
+ * from the kmemcheck traps, since this may call the console drivers and
+ * result in a recursive fault.
+ */
+static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE];
+static unsigned int error_count;
+static unsigned int error_rd;
+static unsigned int error_wr;
+static unsigned int error_missed_count;
+
+static struct kmemcheck_error *error_next_wr(void)
+{
+	struct kmemcheck_error *e;
+
+	if (error_count == ARRAY_SIZE(error_fifo)) {
+		++error_missed_count;
+		return NULL;
+	}
+
+	e = &error_fifo[error_wr];
+	if (++error_wr == ARRAY_SIZE(error_fifo))
+		error_wr = 0;
+	++error_count;
+	return e;
+}
+
+static struct kmemcheck_error *error_next_rd(void)
+{
+	struct kmemcheck_error *e;
+
+	if (error_count == 0)
+		return NULL;
+
+	e = &error_fifo[error_rd];
+	if (++error_rd == ARRAY_SIZE(error_fifo))
+		error_rd = 0;
+	--error_count;
+	return e;
+}
+
+void kmemcheck_error_recall(void)
+{
+	static const char *desc[] = {
+		[KMEMCHECK_SHADOW_UNALLOCATED]		= "unallocated",
+		[KMEMCHECK_SHADOW_UNINITIALIZED]	= "uninitialized",
+		[KMEMCHECK_SHADOW_INITIALIZED]		= "initialized",
+		[KMEMCHECK_SHADOW_FREED]		= "freed",
+	};
+
+	static const char short_desc[] = {
+		[KMEMCHECK_SHADOW_UNALLOCATED]		= 'a',
+		[KMEMCHECK_SHADOW_UNINITIALIZED]	= 'u',
+		[KMEMCHECK_SHADOW_INITIALIZED]		= 'i',
+		[KMEMCHECK_SHADOW_FREED]		= 'f',
+	};
+
+	struct kmemcheck_error *e;
+	unsigned int i;
+
+	e = error_next_rd();
+	if (!e)
+		return;
+
+	switch (e->type) {
+	case KMEMCHECK_ERROR_INVALID_ACCESS:
+		printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n",
+			8 * e->size, e->state < ARRAY_SIZE(desc) ?
+				desc[e->state] : "(invalid shadow state)",
+			(void *) e->address);
+
+		printk(KERN_WARNING);
+		for (i = 0; i < SHADOW_COPY_SIZE; ++i)
+			printk(KERN_CONT "%02x", e->memory_copy[i]);
+		printk(KERN_CONT "\n");
+
+		printk(KERN_WARNING);
+		for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
+			if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
+				printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]);
+			else
+				printk(KERN_CONT " ?");
+		}
+		printk(KERN_CONT "\n");
+		printk(KERN_WARNING "%*c\n", 2 + 2
+			* (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
+		break;
+	case KMEMCHECK_ERROR_BUG:
+		printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n");
+		break;
+	}
+
+	__show_regs(&e->regs, 1);
+	print_stack_trace(&e->trace, 0);
+}
+
+static void do_wakeup(unsigned long data)
+{
+	while (error_count > 0)
+		kmemcheck_error_recall();
+
+	if (error_missed_count > 0) {
+		printk(KERN_WARNING "kmemcheck: Lost %d error reports because "
+			"the queue was too small\n", error_missed_count);
+		error_missed_count = 0;
+	}
+}
+
+static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0);
+
+/*
+ * Save the context of an error report.
+ */
+void kmemcheck_error_save(enum kmemcheck_shadow state,
+	unsigned long address, unsigned int size, struct pt_regs *regs)
+{
+	static unsigned long prev_ip;
+
+	struct kmemcheck_error *e;
+	void *shadow_copy;
+	void *memory_copy;
+
+	/* Don't report several adjacent errors from the same EIP. */
+	if (regs->ip == prev_ip)
+		return;
+	prev_ip = regs->ip;
+
+	e = error_next_wr();
+	if (!e)
+		return;
+
+	e->type = KMEMCHECK_ERROR_INVALID_ACCESS;
+
+	e->state = state;
+	e->address = address;
+	e->size = size;
+
+	/* Save regs */
+	memcpy(&e->regs, regs, sizeof(*regs));
+
+	/* Save stack trace */
+	e->trace.nr_entries = 0;
+	e->trace.entries = e->trace_entries;
+	e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
+	e->trace.skip = 0;
+	save_stack_trace_regs(regs, &e->trace);
+
+	/* Round address down to nearest 16 bytes */
+	shadow_copy = kmemcheck_shadow_lookup(address
+		& ~(SHADOW_COPY_SIZE - 1));
+	BUG_ON(!shadow_copy);
+
+	memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE);
+
+	kmemcheck_show_addr(address);
+	memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1));
+	memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE);
+	kmemcheck_hide_addr(address);
+
+	tasklet_hi_schedule_first(&kmemcheck_tasklet);
+}
+
+/*
+ * Save the context of a kmemcheck bug.
+ */
+void kmemcheck_error_save_bug(struct pt_regs *regs)
+{
+	struct kmemcheck_error *e;
+
+	e = error_next_wr();
+	if (!e)
+		return;
+
+	e->type = KMEMCHECK_ERROR_BUG;
+
+	memcpy(&e->regs, regs, sizeof(*regs));
+
+	e->trace.nr_entries = 0;
+	e->trace.entries = e->trace_entries;
+	e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
+	e->trace.skip = 1;
+	save_stack_trace(&e->trace);
+
+	tasklet_hi_schedule_first(&kmemcheck_tasklet);
+}
diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h
new file mode 100644
index 000000000..0efc2e8d0
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/error.h
@@ -0,0 +1,15 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H
+#define ARCH__X86__MM__KMEMCHECK__ERROR_H
+
+#include <linux/ptrace.h>
+
+#include "shadow.h"
+
+void kmemcheck_error_save(enum kmemcheck_shadow state,
+	unsigned long address, unsigned int size, struct pt_regs *regs);
+
+void kmemcheck_error_save_bug(struct pt_regs *regs);
+
+void kmemcheck_error_recall(void);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
new file mode 100644
index 000000000..b4f2e7e9e
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -0,0 +1,659 @@
+/**
+ * kmemcheck - a heavyweight memory checker for the linux kernel
+ * Copyright (C) 2007, 2008  Vegard Nossum <vegardno@ifi.uio.no>
+ * (With a lot of help from Ingo Molnar and Pekka Enberg.)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2) as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/kernel.h>
+#include <linux/kmemcheck.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/page-flags.h>
+#include <linux/percpu.h>
+#include <linux/ptrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include <asm/cacheflush.h>
+#include <asm/kmemcheck.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+
+#include "error.h"
+#include "opcode.h"
+#include "pte.h"
+#include "selftest.h"
+#include "shadow.h"
+
+
+#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT
+#  define KMEMCHECK_ENABLED 0
+#endif
+
+#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT
+#  define KMEMCHECK_ENABLED 1
+#endif
+
+#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT
+#  define KMEMCHECK_ENABLED 2
+#endif
+
+int kmemcheck_enabled = KMEMCHECK_ENABLED;
+
+int __init kmemcheck_init(void)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * Limit SMP to use a single CPU. We rely on the fact that this code
+	 * runs before SMP is set up.
+	 */
+	if (setup_max_cpus > 1) {
+		printk(KERN_INFO
+			"kmemcheck: Limiting number of CPUs to 1.\n");
+		setup_max_cpus = 1;
+	}
+#endif
+
+	if (!kmemcheck_selftest()) {
+		printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n");
+		kmemcheck_enabled = 0;
+		return -EINVAL;
+	}
+
+	printk(KERN_INFO "kmemcheck: Initialized\n");
+	return 0;
+}
+
+early_initcall(kmemcheck_init);
+
+/*
+ * We need to parse the kmemcheck= option before any memory is allocated.
+ */
+static int __init param_kmemcheck(char *str)
+{
+	int val;
+	int ret;
+
+	if (!str)
+		return -EINVAL;
+
+	ret = kstrtoint(str, 0, &val);
+	if (ret)
+		return ret;
+	kmemcheck_enabled = val;
+	return 0;
+}
+
+early_param("kmemcheck", param_kmemcheck);
+
+int kmemcheck_show_addr(unsigned long address)
+{
+	pte_t *pte;
+
+	pte = kmemcheck_pte_lookup(address);
+	if (!pte)
+		return 0;
+
+	set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
+	__flush_tlb_one(address);
+	return 1;
+}
+
+int kmemcheck_hide_addr(unsigned long address)
+{
+	pte_t *pte;
+
+	pte = kmemcheck_pte_lookup(address);
+	if (!pte)
+		return 0;
+
+	set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
+	__flush_tlb_one(address);
+	return 1;
+}
+
+struct kmemcheck_context {
+	bool busy;
+	int balance;
+
+	/*
+	 * There can be at most two memory operands to an instruction, but
+	 * each address can cross a page boundary -- so we may need up to
+	 * four addresses that must be hidden/revealed for each fault.
+	 */
+	unsigned long addr[4];
+	unsigned long n_addrs;
+	unsigned long flags;
+
+	/* Data size of the instruction that caused a fault. */
+	unsigned int size;
+};
+
+static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context);
+
+bool kmemcheck_active(struct pt_regs *regs)
+{
+	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
+
+	return data->balance > 0;
+}
+
+/* Save an address that needs to be shown/hidden */
+static void kmemcheck_save_addr(unsigned long addr)
+{
+	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
+
+	BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr));
+	data->addr[data->n_addrs++] = addr;
+}
+
+static unsigned int kmemcheck_show_all(void)
+{
+	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
+	unsigned int i;
+	unsigned int n;
+
+	n = 0;
+	for (i = 0; i < data->n_addrs; ++i)
+		n += kmemcheck_show_addr(data->addr[i]);
+
+	return n;
+}
+
+static unsigned int kmemcheck_hide_all(void)
+{
+	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
+	unsigned int i;
+	unsigned int n;
+
+	n = 0;
+	for (i = 0; i < data->n_addrs; ++i)
+		n += kmemcheck_hide_addr(data->addr[i]);
+
+	return n;
+}
+
+/*
+ * Called from the #PF handler.
+ */
+void kmemcheck_show(struct pt_regs *regs)
+{
+	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
+
+	BUG_ON(!irqs_disabled());
+
+	if (unlikely(data->balance != 0)) {
+		kmemcheck_show_all();
+		kmemcheck_error_save_bug(regs);
+		data->balance = 0;
+		return;
+	}
+
+	/*
+	 * None of the addresses actually belonged to kmemcheck. Note that
+	 * this is not an error.
+	 */
+	if (kmemcheck_show_all() == 0)
+		return;
+
+	++data->balance;
+
+	/*
+	 * The IF needs to be cleared as well, so that the faulting
+	 * instruction can run "uninterrupted". Otherwise, we might take
+	 * an interrupt and start executing that before we've had a chance
+	 * to hide the page again.
+	 *
+	 * NOTE: In the rare case of multiple faults, we must not override
+	 * the original flags:
+	 */
+	if (!(regs->flags & X86_EFLAGS_TF))
+		data->flags = regs->flags;
+
+	regs->flags |= X86_EFLAGS_TF;
+	regs->flags &= ~X86_EFLAGS_IF;
+}
+
+/*
+ * Called from the #DB handler.
+ */
+void kmemcheck_hide(struct pt_regs *regs)
+{
+	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
+	int n;
+
+	BUG_ON(!irqs_disabled());
+
+	if (unlikely(data->balance != 1)) {
+		kmemcheck_show_all();
+		kmemcheck_error_save_bug(regs);
+		data->n_addrs = 0;
+		data->balance = 0;
+
+		if (!(data->flags & X86_EFLAGS_TF))
+			regs->flags &= ~X86_EFLAGS_TF;
+		if (data->flags & X86_EFLAGS_IF)
+			regs->flags |= X86_EFLAGS_IF;
+		return;
+	}
+
+	if (kmemcheck_enabled)
+		n = kmemcheck_hide_all();
+	else
+		n = kmemcheck_show_all();
+
+	if (n == 0)
+		return;
+
+	--data->balance;
+
+	data->n_addrs = 0;
+
+	if (!(data->flags & X86_EFLAGS_TF))
+		regs->flags &= ~X86_EFLAGS_TF;
+	if (data->flags & X86_EFLAGS_IF)
+		regs->flags |= X86_EFLAGS_IF;
+}
+
+void kmemcheck_show_pages(struct page *p, unsigned int n)
+{
+	unsigned int i;
+
+	for (i = 0; i < n; ++i) {
+		unsigned long address;
+		pte_t *pte;
+		unsigned int level;
+
+		address = (unsigned long) page_address(&p[i]);
+		pte = lookup_address(address, &level);
+		BUG_ON(!pte);
+		BUG_ON(level != PG_LEVEL_4K);
+
+		set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
+		set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN));
+		__flush_tlb_one(address);
+	}
+}
+
+bool kmemcheck_page_is_tracked(struct page *p)
+{
+	/* This will also check the "hidden" flag of the PTE. */
+	return kmemcheck_pte_lookup((unsigned long) page_address(p));
+}
+
+void kmemcheck_hide_pages(struct page *p, unsigned int n)
+{
+	unsigned int i;
+
+	for (i = 0; i < n; ++i) {
+		unsigned long address;
+		pte_t *pte;
+		unsigned int level;
+
+		address = (unsigned long) page_address(&p[i]);
+		pte = lookup_address(address, &level);
+		BUG_ON(!pte);
+		BUG_ON(level != PG_LEVEL_4K);
+
+		set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
+		set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN));
+		__flush_tlb_one(address);
+	}
+}
+
+/* Access may NOT cross page boundary */
+static void kmemcheck_read_strict(struct pt_regs *regs,
+	unsigned long addr, unsigned int size)
+{
+	void *shadow;
+	enum kmemcheck_shadow status;
+
+	shadow = kmemcheck_shadow_lookup(addr);
+	if (!shadow)
+		return;
+
+	kmemcheck_save_addr(addr);
+	status = kmemcheck_shadow_test(shadow, size);
+	if (status == KMEMCHECK_SHADOW_INITIALIZED)
+		return;
+
+	if (kmemcheck_enabled)
+		kmemcheck_error_save(status, addr, size, regs);
+
+	if (kmemcheck_enabled == 2)
+		kmemcheck_enabled = 0;
+
+	/* Don't warn about it again. */
+	kmemcheck_shadow_set(shadow, size);
+}
+
+bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
+{
+	enum kmemcheck_shadow status;
+	void *shadow;
+
+	shadow = kmemcheck_shadow_lookup(addr);
+	if (!shadow)
+		return true;
+
+	status = kmemcheck_shadow_test_all(shadow, size);
+
+	return status == KMEMCHECK_SHADOW_INITIALIZED;
+}
+
+/* Access may cross page boundary */
+static void kmemcheck_read(struct pt_regs *regs,
+	unsigned long addr, unsigned int size)
+{
+	unsigned long page = addr & PAGE_MASK;
+	unsigned long next_addr = addr + size - 1;
+	unsigned long next_page = next_addr & PAGE_MASK;
+
+	if (likely(page == next_page)) {
+		kmemcheck_read_strict(regs, addr, size);
+		return;
+	}
+
+	/*
+	 * What we do is basically to split the access across the
+	 * two pages and handle each part separately. Yes, this means
+	 * that we may now see reads that are 3 + 5 bytes, for
+	 * example (and if both are uninitialized, there will be two
+	 * reports), but it makes the code a lot simpler.
+	 */
+	kmemcheck_read_strict(regs, addr, next_page - addr);
+	kmemcheck_read_strict(regs, next_page, next_addr - next_page);
+}
+
+static void kmemcheck_write_strict(struct pt_regs *regs,
+	unsigned long addr, unsigned int size)
+{
+	void *shadow;
+
+	shadow = kmemcheck_shadow_lookup(addr);
+	if (!shadow)
+		return;
+
+	kmemcheck_save_addr(addr);
+	kmemcheck_shadow_set(shadow, size);
+}
+
+static void kmemcheck_write(struct pt_regs *regs,
+	unsigned long addr, unsigned int size)
+{
+	unsigned long page = addr & PAGE_MASK;
+	unsigned long next_addr = addr + size - 1;
+	unsigned long next_page = next_addr & PAGE_MASK;
+
+	if (likely(page == next_page)) {
+		kmemcheck_write_strict(regs, addr, size);
+		return;
+	}
+
+	/* See comment in kmemcheck_read(). */
+	kmemcheck_write_strict(regs, addr, next_page - addr);
+	kmemcheck_write_strict(regs, next_page, next_addr - next_page);
+}
+
+/*
+ * Copying is hard. We have two addresses, each of which may be split across
+ * a page (and each page will have different shadow addresses).
+ */
+static void kmemcheck_copy(struct pt_regs *regs,
+	unsigned long src_addr, unsigned long dst_addr, unsigned int size)
+{
+	uint8_t shadow[8];
+	enum kmemcheck_shadow status;
+
+	unsigned long page;
+	unsigned long next_addr;
+	unsigned long next_page;
+
+	uint8_t *x;
+	unsigned int i;
+	unsigned int n;
+
+	BUG_ON(size > sizeof(shadow));
+
+	page = src_addr & PAGE_MASK;
+	next_addr = src_addr + size - 1;
+	next_page = next_addr & PAGE_MASK;
+
+	if (likely(page == next_page)) {
+		/* Same page */
+		x = kmemcheck_shadow_lookup(src_addr);
+		if (x) {
+			kmemcheck_save_addr(src_addr);
+			for (i = 0; i < size; ++i)
+				shadow[i] = x[i];
+		} else {
+			for (i = 0; i < size; ++i)
+				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+		}
+	} else {
+		n = next_page - src_addr;
+		BUG_ON(n > sizeof(shadow));
+
+		/* First page */
+		x = kmemcheck_shadow_lookup(src_addr);
+		if (x) {
+			kmemcheck_save_addr(src_addr);
+			for (i = 0; i < n; ++i)
+				shadow[i] = x[i];
+		} else {
+			/* Not tracked */
+			for (i = 0; i < n; ++i)
+				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+		}
+
+		/* Second page */
+		x = kmemcheck_shadow_lookup(next_page);
+		if (x) {
+			kmemcheck_save_addr(next_page);
+			for (i = n; i < size; ++i)
+				shadow[i] = x[i - n];
+		} else {
+			/* Not tracked */
+			for (i = n; i < size; ++i)
+				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+		}
+	}
+
+	page = dst_addr & PAGE_MASK;
+	next_addr = dst_addr + size - 1;
+	next_page = next_addr & PAGE_MASK;
+
+	if (likely(page == next_page)) {
+		/* Same page */
+		x = kmemcheck_shadow_lookup(dst_addr);
+		if (x) {
+			kmemcheck_save_addr(dst_addr);
+			for (i = 0; i < size; ++i) {
+				x[i] = shadow[i];
+				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+			}
+		}
+	} else {
+		n = next_page - dst_addr;
+		BUG_ON(n > sizeof(shadow));
+
+		/* First page */
+		x = kmemcheck_shadow_lookup(dst_addr);
+		if (x) {
+			kmemcheck_save_addr(dst_addr);
+			for (i = 0; i < n; ++i) {
+				x[i] = shadow[i];
+				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+			}
+		}
+
+		/* Second page */
+		x = kmemcheck_shadow_lookup(next_page);
+		if (x) {
+			kmemcheck_save_addr(next_page);
+			for (i = n; i < size; ++i) {
+				x[i - n] = shadow[i];
+				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
+			}
+		}
+	}
+
+	status = kmemcheck_shadow_test(shadow, size);
+	if (status == KMEMCHECK_SHADOW_INITIALIZED)
+		return;
+
+	if (kmemcheck_enabled)
+		kmemcheck_error_save(status, src_addr, size, regs);
+
+	if (kmemcheck_enabled == 2)
+		kmemcheck_enabled = 0;
+}
+
+enum kmemcheck_method {
+	KMEMCHECK_READ,
+	KMEMCHECK_WRITE,
+};
+
+static void kmemcheck_access(struct pt_regs *regs,
+	unsigned long fallback_address, enum kmemcheck_method fallback_method)
+{
+	const uint8_t *insn;
+	const uint8_t *insn_primary;
+	unsigned int size;
+
+	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
+
+	/* Recursive fault -- ouch. */
+	if (data->busy) {
+		kmemcheck_show_addr(fallback_address);
+		kmemcheck_error_save_bug(regs);
+		return;
+	}
+
+	data->busy = true;
+
+	insn = (const uint8_t *) regs->ip;
+	insn_primary = kmemcheck_opcode_get_primary(insn);
+
+	kmemcheck_opcode_decode(insn, &size);
+
+	switch (insn_primary[0]) {
+#ifdef CONFIG_KMEMCHECK_BITOPS_OK
+		/* AND, OR, XOR */
+		/*
+		 * Unfortunately, these instructions have to be excluded from
+		 * our regular checking since they access only some (and not
+		 * all) bits. This clears out "bogus" bitfield-access warnings.
+		 */
+	case 0x80:
+	case 0x81:
+	case 0x82:
+	case 0x83:
+		switch ((insn_primary[1] >> 3) & 7) {
+			/* OR */
+		case 1:
+			/* AND */
+		case 4:
+			/* XOR */
+		case 6:
+			kmemcheck_write(regs, fallback_address, size);
+			goto out;
+
+			/* ADD */
+		case 0:
+			/* ADC */
+		case 2:
+			/* SBB */
+		case 3:
+			/* SUB */
+		case 5:
+			/* CMP */
+		case 7:
+			break;
+		}
+		break;
+#endif
+
+		/* MOVS, MOVSB, MOVSW, MOVSD */
+	case 0xa4:
+	case 0xa5:
+		/*
+		 * These instructions are special because they take two
+		 * addresses, but we only get one page fault.
+		 */
+		kmemcheck_copy(regs, regs->si, regs->di, size);
+		goto out;
+
+		/* CMPS, CMPSB, CMPSW, CMPSD */
+	case 0xa6:
+	case 0xa7:
+		kmemcheck_read(regs, regs->si, size);
+		kmemcheck_read(regs, regs->di, size);
+		goto out;
+	}
+
+	/*
+	 * If the opcode isn't special in any way, we use the data from the
+	 * page fault handler to determine the address and type of memory
+	 * access.
+	 */
+	switch (fallback_method) {
+	case KMEMCHECK_READ:
+		kmemcheck_read(regs, fallback_address, size);
+		goto out;
+	case KMEMCHECK_WRITE:
+		kmemcheck_write(regs, fallback_address, size);
+		goto out;
+	}
+
+out:
+	data->busy = false;
+}
+
+bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
+	unsigned long error_code)
+{
+	pte_t *pte;
+
+	/*
+	 * XXX: Is it safe to assume that memory accesses from virtual 86
+	 * mode or non-kernel code segments will _never_ access kernel
+	 * memory (e.g. tracked pages)? For now, we need this to avoid
+	 * invoking kmemcheck for PnP BIOS calls.
+	 */
+	if (regs->flags & X86_VM_MASK)
+		return false;
+	if (regs->cs != __KERNEL_CS)
+		return false;
+
+	pte = kmemcheck_pte_lookup(address);
+	if (!pte)
+		return false;
+
+	WARN_ON_ONCE(in_nmi());
+
+	if (error_code & 2)
+		kmemcheck_access(regs, address, KMEMCHECK_WRITE);
+	else
+		kmemcheck_access(regs, address, KMEMCHECK_READ);
+
+	kmemcheck_show(regs);
+	return true;
+}
+
+bool kmemcheck_trap(struct pt_regs *regs)
+{
+	if (!kmemcheck_active(regs))
+		return false;
+
+	/* We're done. */
+	kmemcheck_hide(regs);
+	return true;
+}
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
new file mode 100644
index 000000000..324aa3f07
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -0,0 +1,106 @@
+#include <linux/types.h>
+
+#include "opcode.h"
+
+static bool opcode_is_prefix(uint8_t b)
+{
+	return
+		/* Group 1 */
+		b == 0xf0 || b == 0xf2 || b == 0xf3
+		/* Group 2 */
+		|| b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
+		|| b == 0x64 || b == 0x65
+		/* Group 3 */
+		|| b == 0x66
+		/* Group 4 */
+		|| b == 0x67;
+}
+
+#ifdef CONFIG_X86_64
+static bool opcode_is_rex_prefix(uint8_t b)
+{
+	return (b & 0xf0) == 0x40;
+}
+#else
+static bool opcode_is_rex_prefix(uint8_t b)
+{
+	return false;
+}
+#endif
+
+#define REX_W (1 << 3)
+
+/*
+ * This is a VERY crude opcode decoder. We only need to find the size of the
+ * load/store that caused our #PF and this should work for all the opcodes
+ * that we care about. Moreover, the ones who invented this instruction set
+ * should be shot.
+ */
+void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size)
+{
+	/* Default operand size */
+	int operand_size_override = 4;
+
+	/* prefixes */
+	for (; opcode_is_prefix(*op); ++op) {
+		if (*op == 0x66)
+			operand_size_override = 2;
+	}
+
+	/* REX prefix */
+	if (opcode_is_rex_prefix(*op)) {
+		uint8_t rex = *op;
+
+		++op;
+		if (rex & REX_W) {
+			switch (*op) {
+			case 0x63:
+				*size = 4;
+				return;
+			case 0x0f:
+				++op;
+
+				switch (*op) {
+				case 0xb6:
+				case 0xbe:
+					*size = 1;
+					return;
+				case 0xb7:
+				case 0xbf:
+					*size = 2;
+					return;
+				}
+
+				break;
+			}
+
+			*size = 8;
+			return;
+		}
+	}
+
+	/* escape opcode */
+	if (*op == 0x0f) {
+		++op;
+
+		/*
+		 * This is move with zero-extend and sign-extend, respectively;
+		 * we don't have to think about 0xb6/0xbe, because this is
+		 * already handled in the conditional below.
+		 */
+		if (*op == 0xb7 || *op == 0xbf)
+			operand_size_override = 2;
+	}
+
+	*size = (*op & 1) ? operand_size_override : 1;
+}
+
+const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op)
+{
+	/* skip prefixes */
+	while (opcode_is_prefix(*op))
+		++op;
+	if (opcode_is_rex_prefix(*op))
+		++op;
+	return op;
+}
diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h
new file mode 100644
index 000000000..6956aad66
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/opcode.h
@@ -0,0 +1,9 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H
+#define ARCH__X86__MM__KMEMCHECK__OPCODE_H
+
+#include <linux/types.h>
+
+void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);
+const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c
new file mode 100644
index 000000000..4ead26eea
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/pte.c
@@ -0,0 +1,22 @@
+#include <linux/mm.h>
+
+#include <asm/pgtable.h>
+
+#include "pte.h"
+
+pte_t *kmemcheck_pte_lookup(unsigned long address)
+{
+	pte_t *pte;
+	unsigned int level;
+
+	pte = lookup_address(address, &level);
+	if (!pte)
+		return NULL;
+	if (level != PG_LEVEL_4K)
+		return NULL;
+	if (!pte_hidden(*pte))
+		return NULL;
+
+	return pte;
+}
+
diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h
new file mode 100644
index 000000000..9f5966456
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/pte.h
@@ -0,0 +1,10 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H
+#define ARCH__X86__MM__KMEMCHECK__PTE_H
+
+#include <linux/mm.h>
+
+#include <asm/pgtable.h>
+
+pte_t *kmemcheck_pte_lookup(unsigned long address);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
new file mode 100644
index 000000000..aef7140c0
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -0,0 +1,70 @@
+#include <linux/bug.h>
+#include <linux/kernel.h>
+
+#include "opcode.h"
+#include "selftest.h"
+
+struct selftest_opcode {
+	unsigned int expected_size;
+	const uint8_t *insn;
+	const char *desc;
+};
+
+static const struct selftest_opcode selftest_opcodes[] = {
+	/* REP MOVS */
+	{1, "\xf3\xa4", 		"rep movsb <mem8>, <mem8>"},
+	{4, "\xf3\xa5",			"rep movsl <mem32>, <mem32>"},
+
+	/* MOVZX / MOVZXD */
+	{1, "\x66\x0f\xb6\x51\xf8",	"movzwq <mem8>, <reg16>"},
+	{1, "\x0f\xb6\x51\xf8",		"movzwq <mem8>, <reg32>"},
+
+	/* MOVSX / MOVSXD */
+	{1, "\x66\x0f\xbe\x51\xf8",	"movswq <mem8>, <reg16>"},
+	{1, "\x0f\xbe\x51\xf8",		"movswq <mem8>, <reg32>"},
+
+#ifdef CONFIG_X86_64
+	/* MOVZX / MOVZXD */
+	{1, "\x49\x0f\xb6\x51\xf8",	"movzbq <mem8>, <reg64>"},
+	{2, "\x49\x0f\xb7\x51\xf8",	"movzbq <mem16>, <reg64>"},
+
+	/* MOVSX / MOVSXD */
+	{1, "\x49\x0f\xbe\x51\xf8",	"movsbq <mem8>, <reg64>"},
+	{2, "\x49\x0f\xbf\x51\xf8",	"movsbq <mem16>, <reg64>"},
+	{4, "\x49\x63\x51\xf8",		"movslq <mem32>, <reg64>"},
+#endif
+};
+
+static bool selftest_opcode_one(const struct selftest_opcode *op)
+{
+	unsigned size;
+
+	kmemcheck_opcode_decode(op->insn, &size);
+
+	if (size == op->expected_size)
+		return true;
+
+	printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n",
+		op->desc, op->expected_size, size);
+	return false;
+}
+
+static bool selftest_opcodes_all(void)
+{
+	bool pass = true;
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i)
+		pass = pass && selftest_opcode_one(&selftest_opcodes[i]);
+
+	return pass;
+}
+
+bool kmemcheck_selftest(void)
+{
+	bool pass = true;
+
+	pass = pass && selftest_opcodes_all();
+
+	return pass;
+}
diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h
new file mode 100644
index 000000000..8fed4fe11
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/selftest.h
@@ -0,0 +1,6 @@
+#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H
+#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H
+
+bool kmemcheck_selftest(void);
+
+#endif
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
new file mode 100644
index 000000000..aec124214
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/shadow.c
@@ -0,0 +1,173 @@
+#include <linux/kmemcheck.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+#include "pte.h"
+#include "shadow.h"
+
+/*
+ * Return the shadow address for the given address. Returns NULL if the
+ * address is not tracked.
+ *
+ * We need to be extremely careful not to follow any invalid pointers,
+ * because this function can be called for *any* possible address.
+ */
+void *kmemcheck_shadow_lookup(unsigned long address)
+{
+	pte_t *pte;
+	struct page *page;
+
+	if (!virt_addr_valid(address))
+		return NULL;
+
+	pte = kmemcheck_pte_lookup(address);
+	if (!pte)
+		return NULL;
+
+	page = virt_to_page(address);
+	if (!page->shadow)
+		return NULL;
+	return page->shadow + (address & (PAGE_SIZE - 1));
+}
+
+static void mark_shadow(void *address, unsigned int n,
+	enum kmemcheck_shadow status)
+{
+	unsigned long addr = (unsigned long) address;
+	unsigned long last_addr = addr + n - 1;
+	unsigned long page = addr & PAGE_MASK;
+	unsigned long last_page = last_addr & PAGE_MASK;
+	unsigned int first_n;
+	void *shadow;
+
+	/* If the memory range crosses a page boundary, stop there. */
+	if (page == last_page)
+		first_n = n;
+	else
+		first_n = page + PAGE_SIZE - addr;
+
+	shadow = kmemcheck_shadow_lookup(addr);
+	if (shadow)
+		memset(shadow, status, first_n);
+
+	addr += first_n;
+	n -= first_n;
+
+	/* Do full-page memset()s. */
+	while (n >= PAGE_SIZE) {
+		shadow = kmemcheck_shadow_lookup(addr);
+		if (shadow)
+			memset(shadow, status, PAGE_SIZE);
+
+		addr += PAGE_SIZE;
+		n -= PAGE_SIZE;
+	}
+
+	/* Do the remaining page, if any. */
+	if (n > 0) {
+		shadow = kmemcheck_shadow_lookup(addr);
+		if (shadow)
+			memset(shadow, status, n);
+	}
+}
+
+void kmemcheck_mark_unallocated(void *address, unsigned int n)
+{
+	mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED);
+}
+
+void kmemcheck_mark_uninitialized(void *address, unsigned int n)
+{
+	mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED);
+}
+
+/*
+ * Fill the shadow memory of the given address such that the memory at that
+ * address is marked as being initialized.
+ */
+void kmemcheck_mark_initialized(void *address, unsigned int n)
+{
+	mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED);
+}
+EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized);
+
+void kmemcheck_mark_freed(void *address, unsigned int n)
+{
+	mark_shadow(address, n, KMEMCHECK_SHADOW_FREED);
+}
+
+void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n)
+{
+	unsigned int i;
+
+	for (i = 0; i < n; ++i)
+		kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE);
+}
+
+void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n)
+{
+	unsigned int i;
+
+	for (i = 0; i < n; ++i)
+		kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE);
+}
+
+void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
+{
+	unsigned int i;
+
+	for (i = 0; i < n; ++i)
+		kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE);
+}
+
+enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
+{
+#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
+	uint8_t *x;
+	unsigned int i;
+
+	x = shadow;
+
+	/*
+	 * Make sure _some_ bytes are initialized. Gcc frequently generates
+	 * code to access neighboring bytes.
+	 */
+	for (i = 0; i < size; ++i) {
+		if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
+			return x[i];
+	}
+
+	return x[0];
+#else
+	return kmemcheck_shadow_test_all(shadow, size);
+#endif
+}
+
+enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size)
+{
+	uint8_t *x;
+	unsigned int i;
+
+	x = shadow;
+
+	/* All bytes must be initialized. */
+	for (i = 0; i < size; ++i) {
+		if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
+			return x[i];
+	}
+
+	return x[0];
+}
+
+void kmemcheck_shadow_set(void *shadow, unsigned int size)
+{
+	uint8_t *x;
+	unsigned int i;
+
+	x = shadow;
+	for (i = 0; i < size; ++i)
+		x[i] = KMEMCHECK_SHADOW_INITIALIZED;
+}
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
new file mode 100644
index 000000000..ff0b2f70f
--- /dev/null
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -0,0 +1,18 @@
+#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H
+#define ARCH__X86__MM__KMEMCHECK__SHADOW_H
+
+enum kmemcheck_shadow {
+	KMEMCHECK_SHADOW_UNALLOCATED,
+	KMEMCHECK_SHADOW_UNINITIALIZED,
+	KMEMCHECK_SHADOW_INITIALIZED,
+	KMEMCHECK_SHADOW_FREED,
+};
+
+void *kmemcheck_shadow_lookup(unsigned long address);
+
+enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
+enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow,
+						unsigned int size);
+void kmemcheck_shadow_set(void *shadow, unsigned int size);
+
+#endif
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
new file mode 100644
index 000000000..637ab34ed
--- /dev/null
+++ b/arch/x86/mm/kmmio.c
@@ -0,0 +1,589 @@
+/* Support for MMIO probes.
+ * Benfit many code from kprobes
+ * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
+ *     2007 Alexander Eichner
+ *     2008 Pekka Paalanen <pq@iki.fi>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/spinlock.h>
+#include <linux/hash.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/uaccess.h>
+#include <linux/ptrace.h>
+#include <linux/preempt.h>
+#include <linux/percpu.h>
+#include <linux/kdebug.h>
+#include <linux/mutex.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <linux/errno.h>
+#include <asm/debugreg.h>
+#include <linux/mmiotrace.h>
+
+#define KMMIO_PAGE_HASH_BITS 4
+#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
+
+struct kmmio_fault_page {
+	struct list_head list;
+	struct kmmio_fault_page *release_next;
+	unsigned long page; /* location of the fault page */
+	pteval_t old_presence; /* page presence prior to arming */
+	bool armed;
+
+	/*
+	 * Number of times this page has been registered as a part
+	 * of a probe. If zero, page is disarmed and this may be freed.
+	 * Used only by writers (RCU) and post_kmmio_handler().
+	 * Protected by kmmio_lock, when linked into kmmio_page_table.
+	 */
+	int count;
+
+	bool scheduled_for_release;
+};
+
+struct kmmio_delayed_release {
+	struct rcu_head rcu;
+	struct kmmio_fault_page *release_list;
+};
+
+struct kmmio_context {
+	struct kmmio_fault_page *fpage;
+	struct kmmio_probe *probe;
+	unsigned long saved_flags;
+	unsigned long addr;
+	int active;
+};
+
+static DEFINE_SPINLOCK(kmmio_lock);
+
+/* Protected by kmmio_lock */
+unsigned int kmmio_count;
+
+/* Read-protected by RCU, write-protected by kmmio_lock. */
+static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
+static LIST_HEAD(kmmio_probes);
+
+static struct list_head *kmmio_page_list(unsigned long page)
+{
+	return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
+}
+
+/* Accessed per-cpu */
+static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
+
+/*
+ * this is basically a dynamic stabbing problem:
+ * Could use the existing prio tree code or
+ * Possible better implementations:
+ * The Interval Skip List: A Data Structure for Finding All Intervals That
+ * Overlap a Point (might be simple)
+ * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
+ */
+/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
+static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
+{
+	struct kmmio_probe *p;
+	list_for_each_entry_rcu(p, &kmmio_probes, list) {
+		if (addr >= p->addr && addr < (p->addr + p->len))
+			return p;
+	}
+	return NULL;
+}
+
+/* You must be holding RCU read lock. */
+static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
+{
+	struct list_head *head;
+	struct kmmio_fault_page *f;
+
+	page &= PAGE_MASK;
+	head = kmmio_page_list(page);
+	list_for_each_entry_rcu(f, head, list) {
+		if (f->page == page)
+			return f;
+	}
+	return NULL;
+}
+
+static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
+{
+	pmdval_t v = pmd_val(*pmd);
+	if (clear) {
+		*old = v & _PAGE_PRESENT;
+		v &= ~_PAGE_PRESENT;
+	} else	/* presume this has been called with clear==true previously */
+		v |= *old;
+	set_pmd(pmd, __pmd(v));
+}
+
+static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
+{
+	pteval_t v = pte_val(*pte);
+	if (clear) {
+		*old = v & _PAGE_PRESENT;
+		v &= ~_PAGE_PRESENT;
+	} else	/* presume this has been called with clear==true previously */
+		v |= *old;
+	set_pte_atomic(pte, __pte(v));
+}
+
+static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
+{
+	unsigned int level;
+	pte_t *pte = lookup_address(f->page, &level);
+
+	if (!pte) {
+		pr_err("no pte for page 0x%08lx\n", f->page);
+		return -1;
+	}
+
+	switch (level) {
+	case PG_LEVEL_2M:
+		clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
+		break;
+	case PG_LEVEL_4K:
+		clear_pte_presence(pte, clear, &f->old_presence);
+		break;
+	default:
+		pr_err("unexpected page level 0x%x.\n", level);
+		return -1;
+	}
+
+	__flush_tlb_one(f->page);
+	return 0;
+}
+
+/*
+ * Mark the given page as not present. Access to it will trigger a fault.
+ *
+ * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
+ * protection is ignored here. RCU read lock is assumed held, so the struct
+ * will not disappear unexpectedly. Furthermore, the caller must guarantee,
+ * that double arming the same virtual address (page) cannot occur.
+ *
+ * Double disarming on the other hand is allowed, and may occur when a fault
+ * and mmiotrace shutdown happen simultaneously.
+ */
+static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
+{
+	int ret;
+	WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n"));
+	if (f->armed) {
+		pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n",
+			   f->page, f->count, !!f->old_presence);
+	}
+	ret = clear_page_presence(f, true);
+	WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"),
+		  f->page);
+	f->armed = true;
+	return ret;
+}
+
+/** Restore the given page to saved presence state. */
+static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
+{
+	int ret = clear_page_presence(f, false);
+	WARN_ONCE(ret < 0,
+			KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
+	f->armed = false;
+}
+
+/*
+ * This is being called from do_page_fault().
+ *
+ * We may be in an interrupt or a critical section. Also prefecthing may
+ * trigger a page fault. We may be in the middle of process switch.
+ * We cannot take any locks, because we could be executing especially
+ * within a kmmio critical section.
+ *
+ * Local interrupts are disabled, so preemption cannot happen.
+ * Do not enable interrupts, do not sleep, and watch out for other CPUs.
+ */
+/*
+ * Interrupts are disabled on entry as trap3 is an interrupt gate
+ * and they remain disabled throughout this function.
+ */
+int kmmio_handler(struct pt_regs *regs, unsigned long addr)
+{
+	struct kmmio_context *ctx;
+	struct kmmio_fault_page *faultpage;
+	int ret = 0; /* default to fault not handled */
+
+	/*
+	 * Preemption is now disabled to prevent process switch during
+	 * single stepping. We can only handle one active kmmio trace
+	 * per cpu, so ensure that we finish it before something else
+	 * gets to run. We also hold the RCU read lock over single
+	 * stepping to avoid looking up the probe and kmmio_fault_page
+	 * again.
+	 */
+	preempt_disable();
+	rcu_read_lock();
+
+	faultpage = get_kmmio_fault_page(addr);
+	if (!faultpage) {
+		/*
+		 * Either this page fault is not caused by kmmio, or
+		 * another CPU just pulled the kmmio probe from under
+		 * our feet. The latter case should not be possible.
+		 */
+		goto no_kmmio;
+	}
+
+	ctx = &get_cpu_var(kmmio_ctx);
+	if (ctx->active) {
+		if (addr == ctx->addr) {
+			/*
+			 * A second fault on the same page means some other
+			 * condition needs handling by do_page_fault(), the
+			 * page really not being present is the most common.
+			 */
+			pr_debug("secondary hit for 0x%08lx CPU %d.\n",
+				 addr, smp_processor_id());
+
+			if (!faultpage->old_presence)
+				pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n",
+					addr, smp_processor_id());
+		} else {
+			/*
+			 * Prevent overwriting already in-flight context.
+			 * This should not happen, let's hope disarming at
+			 * least prevents a panic.
+			 */
+			pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n",
+				 smp_processor_id(), addr);
+			pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr);
+			disarm_kmmio_fault_page(faultpage);
+		}
+		goto no_kmmio_ctx;
+	}
+	ctx->active++;
+
+	ctx->fpage = faultpage;
+	ctx->probe = get_kmmio_probe(addr);
+	ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
+	ctx->addr = addr;
+
+	if (ctx->probe && ctx->probe->pre_handler)
+		ctx->probe->pre_handler(ctx->probe, regs, addr);
+
+	/*
+	 * Enable single-stepping and disable interrupts for the faulting
+	 * context. Local interrupts must not get enabled during stepping.
+	 */
+	regs->flags |= X86_EFLAGS_TF;
+	regs->flags &= ~X86_EFLAGS_IF;
+
+	/* Now we set present bit in PTE and single step. */
+	disarm_kmmio_fault_page(ctx->fpage);
+
+	/*
+	 * If another cpu accesses the same page while we are stepping,
+	 * the access will not be caught. It will simply succeed and the
+	 * only downside is we lose the event. If this becomes a problem,
+	 * the user should drop to single cpu before tracing.
+	 */
+
+	put_cpu_var(kmmio_ctx);
+	return 1; /* fault handled */
+
+no_kmmio_ctx:
+	put_cpu_var(kmmio_ctx);
+no_kmmio:
+	rcu_read_unlock();
+	preempt_enable_no_resched();
+	return ret;
+}
+
+/*
+ * Interrupts are disabled on entry as trap1 is an interrupt gate
+ * and they remain disabled throughout this function.
+ * This must always get called as the pair to kmmio_handler().
+ */
+static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
+{
+	int ret = 0;
+	struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
+
+	if (!ctx->active) {
+		/*
+		 * debug traps without an active context are due to either
+		 * something external causing them (f.e. using a debugger while
+		 * mmio tracing enabled), or erroneous behaviour
+		 */
+		pr_warning("unexpected debug trap on CPU %d.\n",
+			   smp_processor_id());
+		goto out;
+	}
+
+	if (ctx->probe && ctx->probe->post_handler)
+		ctx->probe->post_handler(ctx->probe, condition, regs);
+
+	/* Prevent racing against release_kmmio_fault_page(). */
+	spin_lock(&kmmio_lock);
+	if (ctx->fpage->count)
+		arm_kmmio_fault_page(ctx->fpage);
+	spin_unlock(&kmmio_lock);
+
+	regs->flags &= ~X86_EFLAGS_TF;
+	regs->flags |= ctx->saved_flags;
+
+	/* These were acquired in kmmio_handler(). */
+	ctx->active--;
+	BUG_ON(ctx->active);
+	rcu_read_unlock();
+	preempt_enable_no_resched();
+
+	/*
+	 * if somebody else is singlestepping across a probe point, flags
+	 * will have TF set, in which case, continue the remaining processing
+	 * of do_debug, as if this is not a probe hit.
+	 */
+	if (!(regs->flags & X86_EFLAGS_TF))
+		ret = 1;
+out:
+	put_cpu_var(kmmio_ctx);
+	return ret;
+}
+
+/* You must be holding kmmio_lock. */
+static int add_kmmio_fault_page(unsigned long page)
+{
+	struct kmmio_fault_page *f;
+
+	page &= PAGE_MASK;
+	f = get_kmmio_fault_page(page);
+	if (f) {
+		if (!f->count)
+			arm_kmmio_fault_page(f);
+		f->count++;
+		return 0;
+	}
+
+	f = kzalloc(sizeof(*f), GFP_ATOMIC);
+	if (!f)
+		return -1;
+
+	f->count = 1;
+	f->page = page;
+
+	if (arm_kmmio_fault_page(f)) {
+		kfree(f);
+		return -1;
+	}
+
+	list_add_rcu(&f->list, kmmio_page_list(f->page));
+
+	return 0;
+}
+
+/* You must be holding kmmio_lock. */
+static void release_kmmio_fault_page(unsigned long page,
+				struct kmmio_fault_page **release_list)
+{
+	struct kmmio_fault_page *f;
+
+	page &= PAGE_MASK;
+	f = get_kmmio_fault_page(page);
+	if (!f)
+		return;
+
+	f->count--;
+	BUG_ON(f->count < 0);
+	if (!f->count) {
+		disarm_kmmio_fault_page(f);
+		if (!f->scheduled_for_release) {
+			f->release_next = *release_list;
+			*release_list = f;
+			f->scheduled_for_release = true;
+		}
+	}
+}
+
+/*
+ * With page-unaligned ioremaps, one or two armed pages may contain
+ * addresses from outside the intended mapping. Events for these addresses
+ * are currently silently dropped. The events may result only from programming
+ * mistakes by accessing addresses before the beginning or past the end of a
+ * mapping.
+ */
+int register_kmmio_probe(struct kmmio_probe *p)
+{
+	unsigned long flags;
+	int ret = 0;
+	unsigned long size = 0;
+	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
+
+	spin_lock_irqsave(&kmmio_lock, flags);
+	if (get_kmmio_probe(p->addr)) {
+		ret = -EEXIST;
+		goto out;
+	}
+	kmmio_count++;
+	list_add_rcu(&p->list, &kmmio_probes);
+	while (size < size_lim) {
+		if (add_kmmio_fault_page(p->addr + size))
+			pr_err("Unable to set page fault.\n");
+		size += PAGE_SIZE;
+	}
+out:
+	spin_unlock_irqrestore(&kmmio_lock, flags);
+	/*
+	 * XXX: What should I do here?
+	 * Here was a call to global_flush_tlb(), but it does not exist
+	 * anymore. It seems it's not needed after all.
+	 */
+	return ret;
+}
+EXPORT_SYMBOL(register_kmmio_probe);
+
+static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
+{
+	struct kmmio_delayed_release *dr = container_of(
+						head,
+						struct kmmio_delayed_release,
+						rcu);
+	struct kmmio_fault_page *f = dr->release_list;
+	while (f) {
+		struct kmmio_fault_page *next = f->release_next;
+		BUG_ON(f->count);
+		kfree(f);
+		f = next;
+	}
+	kfree(dr);
+}
+
+static void remove_kmmio_fault_pages(struct rcu_head *head)
+{
+	struct kmmio_delayed_release *dr =
+		container_of(head, struct kmmio_delayed_release, rcu);
+	struct kmmio_fault_page *f = dr->release_list;
+	struct kmmio_fault_page **prevp = &dr->release_list;
+	unsigned long flags;
+
+	spin_lock_irqsave(&kmmio_lock, flags);
+	while (f) {
+		if (!f->count) {
+			list_del_rcu(&f->list);
+			prevp = &f->release_next;
+		} else {
+			*prevp = f->release_next;
+			f->release_next = NULL;
+			f->scheduled_for_release = false;
+		}
+		f = *prevp;
+	}
+	spin_unlock_irqrestore(&kmmio_lock, flags);
+
+	/* This is the real RCU destroy call. */
+	call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
+}
+
+/*
+ * Remove a kmmio probe. You have to synchronize_rcu() before you can be
+ * sure that the callbacks will not be called anymore. Only after that
+ * you may actually release your struct kmmio_probe.
+ *
+ * Unregistering a kmmio fault page has three steps:
+ * 1. release_kmmio_fault_page()
+ *    Disarm the page, wait a grace period to let all faults finish.
+ * 2. remove_kmmio_fault_pages()
+ *    Remove the pages from kmmio_page_table.
+ * 3. rcu_free_kmmio_fault_pages()
+ *    Actually free the kmmio_fault_page structs as with RCU.
+ */
+void unregister_kmmio_probe(struct kmmio_probe *p)
+{
+	unsigned long flags;
+	unsigned long size = 0;
+	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
+	struct kmmio_fault_page *release_list = NULL;
+	struct kmmio_delayed_release *drelease;
+
+	spin_lock_irqsave(&kmmio_lock, flags);
+	while (size < size_lim) {
+		release_kmmio_fault_page(p->addr + size, &release_list);
+		size += PAGE_SIZE;
+	}
+	list_del_rcu(&p->list);
+	kmmio_count--;
+	spin_unlock_irqrestore(&kmmio_lock, flags);
+
+	if (!release_list)
+		return;
+
+	drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
+	if (!drelease) {
+		pr_crit("leaking kmmio_fault_page objects.\n");
+		return;
+	}
+	drelease->release_list = release_list;
+
+	/*
+	 * This is not really RCU here. We have just disarmed a set of
+	 * pages so that they cannot trigger page faults anymore. However,
+	 * we cannot remove the pages from kmmio_page_table,
+	 * because a probe hit might be in flight on another CPU. The
+	 * pages are collected into a list, and they will be removed from
+	 * kmmio_page_table when it is certain that no probe hit related to
+	 * these pages can be in flight. RCU grace period sounds like a
+	 * good choice.
+	 *
+	 * If we removed the pages too early, kmmio page fault handler might
+	 * not find the respective kmmio_fault_page and determine it's not
+	 * a kmmio fault, when it actually is. This would lead to madness.
+	 */
+	call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
+}
+EXPORT_SYMBOL(unregister_kmmio_probe);
+
+static int
+kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
+{
+	struct die_args *arg = args;
+	unsigned long* dr6_p = (unsigned long *)ERR_PTR(arg->err);
+
+	if (val == DIE_DEBUG && (*dr6_p & DR_STEP))
+		if (post_kmmio_handler(*dr6_p, arg->regs) == 1) {
+			/*
+			 * Reset the BS bit in dr6 (pointed by args->err) to
+			 * denote completion of processing
+			 */
+			*dr6_p &= ~DR_STEP;
+			return NOTIFY_STOP;
+		}
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block nb_die = {
+	.notifier_call = kmmio_die_notifier
+};
+
+int kmmio_init(void)
+{
+	int i;
+
+	for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
+		INIT_LIST_HEAD(&kmmio_page_table[i]);
+
+	return register_die_notifier(&nb_die);
+}
+
+void kmmio_cleanup(void)
+{
+	int i;
+
+	unregister_die_notifier(&nb_die);
+	for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) {
+		WARN_ONCE(!list_empty(&kmmio_page_table[i]),
+			KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n");
+	}
+}
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
new file mode 100644
index 000000000..62474ba66
--- /dev/null
+++ b/arch/x86/mm/mm_internal.h
@@ -0,0 +1,21 @@
+#ifndef __X86_MM_INTERNAL_H
+#define __X86_MM_INTERNAL_H
+
+void *alloc_low_pages(unsigned int num);
+static inline void *alloc_low_page(void)
+{
+	return alloc_low_pages(1);
+}
+
+void early_ioremap_page_table_range_init(void);
+
+unsigned long kernel_physical_mapping_init(unsigned long start,
+					     unsigned long end,
+					     unsigned long page_size_mask);
+void zone_sizes_init(void);
+
+extern int after_bootmem;
+
+void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache);
+
+#endif	/* __X86_MM_INTERNAL_H */
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
new file mode 100644
index 000000000..844b06d67
--- /dev/null
+++ b/arch/x86/mm/mmap.c
@@ -0,0 +1,135 @@
+/*
+ * Flexible mmap layout support
+ *
+ * Based on code by Ingo Molnar and Andi Kleen, copyrighted
+ * as follows:
+ *
+ * Copyright 2003-2009 Red Hat Inc.
+ * All Rights Reserved.
+ * Copyright 2005 Andi Kleen, SUSE Labs.
+ * Copyright 2007 Jiri Kosina, SUSE Labs.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/personality.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/limits.h>
+#include <linux/sched.h>
+#include <asm/elf.h>
+
+struct va_alignment __read_mostly va_align = {
+	.flags = -1,
+};
+
+static unsigned long stack_maxrandom_size(void)
+{
+	unsigned long max = 0;
+	if ((current->flags & PF_RANDOMIZE) &&
+		!(current->personality & ADDR_NO_RANDOMIZE)) {
+		max = ((-1UL) & STACK_RND_MASK) << PAGE_SHIFT;
+	}
+
+	return max;
+}
+
+/*
+ * Top of mmap area (just below the process stack).
+ *
+ * Leave an at least ~128 MB hole with possible stack randomization.
+ */
+#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size())
+#define MAX_GAP (TASK_SIZE/6*5)
+
+static int mmap_is_legacy(void)
+{
+	if (current->personality & ADDR_COMPAT_LAYOUT)
+		return 1;
+
+	if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
+		return 1;
+
+	return sysctl_legacy_va_layout;
+}
+
+unsigned long arch_mmap_rnd(void)
+{
+	unsigned long rnd;
+
+	/*
+	 *  8 bits of randomness in 32bit mmaps, 20 address space bits
+	 * 28 bits of randomness in 64bit mmaps, 40 address space bits
+	 */
+	if (mmap_is_ia32())
+		rnd = (unsigned long)get_random_int() % (1<<8);
+	else
+		rnd = (unsigned long)get_random_int() % (1<<28);
+
+	return rnd << PAGE_SHIFT;
+}
+
+static unsigned long mmap_base(unsigned long rnd)
+{
+	unsigned long gap = rlimit(RLIMIT_STACK);
+
+	if (gap < MIN_GAP)
+		gap = MIN_GAP;
+	else if (gap > MAX_GAP)
+		gap = MAX_GAP;
+
+	return PAGE_ALIGN(TASK_SIZE - gap - rnd);
+}
+
+/*
+ * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
+ * does, but not when emulating X86_32
+ */
+static unsigned long mmap_legacy_base(unsigned long rnd)
+{
+	if (mmap_is_ia32())
+		return TASK_UNMAPPED_BASE;
+	else
+		return TASK_UNMAPPED_BASE + rnd;
+}
+
+/*
+ * This function, called very early during the creation of a new
+ * process VM image, sets up which VM layout function to use:
+ */
+void arch_pick_mmap_layout(struct mm_struct *mm)
+{
+	unsigned long random_factor = 0UL;
+
+	if (current->flags & PF_RANDOMIZE)
+		random_factor = arch_mmap_rnd();
+
+	mm->mmap_legacy_base = mmap_legacy_base(random_factor);
+
+	if (mmap_is_legacy()) {
+		mm->mmap_base = mm->mmap_legacy_base;
+		mm->get_unmapped_area = arch_get_unmapped_area;
+	} else {
+		mm->mmap_base = mmap_base(random_factor);
+		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+	}
+}
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & VM_MPX)
+		return "[mpx]";
+	return NULL;
+}
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
new file mode 100644
index 000000000..0057a7acc
--- /dev/null
+++ b/arch/x86/mm/mmio-mod.c
@@ -0,0 +1,478 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2005
+ *               Jeff Muizelaar, 2006, 2007
+ *               Pekka Paalanen, 2008 <pq@iki.fi>
+ *
+ * Derived from the read-mod example from relay-examples by Tom Zanussi.
+ */
+
+#define pr_fmt(fmt) "mmiotrace: " fmt
+
+#define DEBUG 1
+
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/kallsyms.h>
+#include <asm/pgtable.h>
+#include <linux/mmiotrace.h>
+#include <asm/e820.h> /* for ISA_START_ADDRESS */
+#include <linux/atomic.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+
+#include "pf_in.h"
+
+struct trap_reason {
+	unsigned long addr;
+	unsigned long ip;
+	enum reason_type type;
+	int active_traces;
+};
+
+struct remap_trace {
+	struct list_head list;
+	struct kmmio_probe probe;
+	resource_size_t phys;
+	unsigned long id;
+};
+
+/* Accessed per-cpu. */
+static DEFINE_PER_CPU(struct trap_reason, pf_reason);
+static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
+
+static DEFINE_MUTEX(mmiotrace_mutex);
+static DEFINE_SPINLOCK(trace_lock);
+static atomic_t mmiotrace_enabled;
+static LIST_HEAD(trace_list);		/* struct remap_trace */
+
+/*
+ * Locking in this file:
+ * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections.
+ * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex
+ *   and trace_lock.
+ * - Routines depending on is_enabled() must take trace_lock.
+ * - trace_list users must hold trace_lock.
+ * - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed.
+ * - pre/post callbacks assume the effect of is_enabled() being true.
+ */
+
+/* module parameters */
+static unsigned long	filter_offset;
+static bool		nommiotrace;
+static bool		trace_pc;
+
+module_param(filter_offset, ulong, 0);
+module_param(nommiotrace, bool, 0);
+module_param(trace_pc, bool, 0);
+
+MODULE_PARM_DESC(filter_offset, "Start address of traced mappings.");
+MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing.");
+MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions.");
+
+static bool is_enabled(void)
+{
+	return atomic_read(&mmiotrace_enabled);
+}
+
+static void print_pte(unsigned long address)
+{
+	unsigned int level;
+	pte_t *pte = lookup_address(address, &level);
+
+	if (!pte) {
+		pr_err("Error in %s: no pte for page 0x%08lx\n",
+		       __func__, address);
+		return;
+	}
+
+	if (level == PG_LEVEL_2M) {
+		pr_emerg("4MB pages are not currently supported: 0x%08lx\n",
+			 address);
+		BUG();
+	}
+	pr_info("pte for 0x%lx: 0x%llx 0x%llx\n",
+		address,
+		(unsigned long long)pte_val(*pte),
+		(unsigned long long)pte_val(*pte) & _PAGE_PRESENT);
+}
+
+/*
+ * For some reason the pre/post pairs have been called in an
+ * unmatched order. Report and die.
+ */
+static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
+{
+	const struct trap_reason *my_reason = &get_cpu_var(pf_reason);
+	pr_emerg("unexpected fault for address: 0x%08lx, last fault for address: 0x%08lx\n",
+		 addr, my_reason->addr);
+	print_pte(addr);
+	print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip);
+	print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip);
+#ifdef __i386__
+	pr_emerg("eax: %08lx   ebx: %08lx   ecx: %08lx   edx: %08lx\n",
+		 regs->ax, regs->bx, regs->cx, regs->dx);
+	pr_emerg("esi: %08lx   edi: %08lx   ebp: %08lx   esp: %08lx\n",
+		 regs->si, regs->di, regs->bp, regs->sp);
+#else
+	pr_emerg("rax: %016lx   rcx: %016lx   rdx: %016lx\n",
+		 regs->ax, regs->cx, regs->dx);
+	pr_emerg("rsi: %016lx   rdi: %016lx   rbp: %016lx   rsp: %016lx\n",
+		 regs->si, regs->di, regs->bp, regs->sp);
+#endif
+	put_cpu_var(pf_reason);
+	BUG();
+}
+
+static void pre(struct kmmio_probe *p, struct pt_regs *regs,
+						unsigned long addr)
+{
+	struct trap_reason *my_reason = &get_cpu_var(pf_reason);
+	struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
+	const unsigned long instptr = instruction_pointer(regs);
+	const enum reason_type type = get_ins_type(instptr);
+	struct remap_trace *trace = p->private;
+
+	/* it doesn't make sense to have more than one active trace per cpu */
+	if (my_reason->active_traces)
+		die_kmmio_nesting_error(regs, addr);
+	else
+		my_reason->active_traces++;
+
+	my_reason->type = type;
+	my_reason->addr = addr;
+	my_reason->ip = instptr;
+
+	my_trace->phys = addr - trace->probe.addr + trace->phys;
+	my_trace->map_id = trace->id;
+
+	/*
+	 * Only record the program counter when requested.
+	 * It may taint clean-room reverse engineering.
+	 */
+	if (trace_pc)
+		my_trace->pc = instptr;
+	else
+		my_trace->pc = 0;
+
+	/*
+	 * XXX: the timestamp recorded will be *after* the tracing has been
+	 * done, not at the time we hit the instruction. SMP implications
+	 * on event ordering?
+	 */
+
+	switch (type) {
+	case REG_READ:
+		my_trace->opcode = MMIO_READ;
+		my_trace->width = get_ins_mem_width(instptr);
+		break;
+	case REG_WRITE:
+		my_trace->opcode = MMIO_WRITE;
+		my_trace->width = get_ins_mem_width(instptr);
+		my_trace->value = get_ins_reg_val(instptr, regs);
+		break;
+	case IMM_WRITE:
+		my_trace->opcode = MMIO_WRITE;
+		my_trace->width = get_ins_mem_width(instptr);
+		my_trace->value = get_ins_imm_val(instptr);
+		break;
+	default:
+		{
+			unsigned char *ip = (unsigned char *)instptr;
+			my_trace->opcode = MMIO_UNKNOWN_OP;
+			my_trace->width = 0;
+			my_trace->value = (*ip) << 16 | *(ip + 1) << 8 |
+								*(ip + 2);
+		}
+	}
+	put_cpu_var(cpu_trace);
+	put_cpu_var(pf_reason);
+}
+
+static void post(struct kmmio_probe *p, unsigned long condition,
+							struct pt_regs *regs)
+{
+	struct trap_reason *my_reason = &get_cpu_var(pf_reason);
+	struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
+
+	/* this should always return the active_trace count to 0 */
+	my_reason->active_traces--;
+	if (my_reason->active_traces) {
+		pr_emerg("unexpected post handler");
+		BUG();
+	}
+
+	switch (my_reason->type) {
+	case REG_READ:
+		my_trace->value = get_ins_reg_val(my_reason->ip, regs);
+		break;
+	default:
+		break;
+	}
+
+	mmio_trace_rw(my_trace);
+	put_cpu_var(cpu_trace);
+	put_cpu_var(pf_reason);
+}
+
+static void ioremap_trace_core(resource_size_t offset, unsigned long size,
+							void __iomem *addr)
+{
+	static atomic_t next_id;
+	struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL);
+	/* These are page-unaligned. */
+	struct mmiotrace_map map = {
+		.phys = offset,
+		.virt = (unsigned long)addr,
+		.len = size,
+		.opcode = MMIO_PROBE
+	};
+
+	if (!trace) {
+		pr_err("kmalloc failed in ioremap\n");
+		return;
+	}
+
+	*trace = (struct remap_trace) {
+		.probe = {
+			.addr = (unsigned long)addr,
+			.len = size,
+			.pre_handler = pre,
+			.post_handler = post,
+			.private = trace
+		},
+		.phys = offset,
+		.id = atomic_inc_return(&next_id)
+	};
+	map.map_id = trace->id;
+
+	spin_lock_irq(&trace_lock);
+	if (!is_enabled()) {
+		kfree(trace);
+		goto not_enabled;
+	}
+
+	mmio_trace_mapping(&map);
+	list_add_tail(&trace->list, &trace_list);
+	if (!nommiotrace)
+		register_kmmio_probe(&trace->probe);
+
+not_enabled:
+	spin_unlock_irq(&trace_lock);
+}
+
+void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
+						void __iomem *addr)
+{
+	if (!is_enabled()) /* recheck and proper locking in *_core() */
+		return;
+
+	pr_debug("ioremap_*(0x%llx, 0x%lx) = %p\n",
+		 (unsigned long long)offset, size, addr);
+	if ((filter_offset) && (offset != filter_offset))
+		return;
+	ioremap_trace_core(offset, size, addr);
+}
+
+static void iounmap_trace_core(volatile void __iomem *addr)
+{
+	struct mmiotrace_map map = {
+		.phys = 0,
+		.virt = (unsigned long)addr,
+		.len = 0,
+		.opcode = MMIO_UNPROBE
+	};
+	struct remap_trace *trace;
+	struct remap_trace *tmp;
+	struct remap_trace *found_trace = NULL;
+
+	pr_debug("Unmapping %p.\n", addr);
+
+	spin_lock_irq(&trace_lock);
+	if (!is_enabled())
+		goto not_enabled;
+
+	list_for_each_entry_safe(trace, tmp, &trace_list, list) {
+		if ((unsigned long)addr == trace->probe.addr) {
+			if (!nommiotrace)
+				unregister_kmmio_probe(&trace->probe);
+			list_del(&trace->list);
+			found_trace = trace;
+			break;
+		}
+	}
+	map.map_id = (found_trace) ? found_trace->id : -1;
+	mmio_trace_mapping(&map);
+
+not_enabled:
+	spin_unlock_irq(&trace_lock);
+	if (found_trace) {
+		synchronize_rcu(); /* unregister_kmmio_probe() requirement */
+		kfree(found_trace);
+	}
+}
+
+void mmiotrace_iounmap(volatile void __iomem *addr)
+{
+	might_sleep();
+	if (is_enabled()) /* recheck and proper locking in *_core() */
+		iounmap_trace_core(addr);
+}
+
+int mmiotrace_printk(const char *fmt, ...)
+{
+	int ret = 0;
+	va_list args;
+	unsigned long flags;
+	va_start(args, fmt);
+
+	spin_lock_irqsave(&trace_lock, flags);
+	if (is_enabled())
+		ret = mmio_trace_printk(fmt, args);
+	spin_unlock_irqrestore(&trace_lock, flags);
+
+	va_end(args);
+	return ret;
+}
+EXPORT_SYMBOL(mmiotrace_printk);
+
+static void clear_trace_list(void)
+{
+	struct remap_trace *trace;
+	struct remap_trace *tmp;
+
+	/*
+	 * No locking required, because the caller ensures we are in a
+	 * critical section via mutex, and is_enabled() is false,
+	 * i.e. nothing can traverse or modify this list.
+	 * Caller also ensures is_enabled() cannot change.
+	 */
+	list_for_each_entry(trace, &trace_list, list) {
+		pr_notice("purging non-iounmapped trace @0x%08lx, size 0x%lx.\n",
+			  trace->probe.addr, trace->probe.len);
+		if (!nommiotrace)
+			unregister_kmmio_probe(&trace->probe);
+	}
+	synchronize_rcu(); /* unregister_kmmio_probe() requirement */
+
+	list_for_each_entry_safe(trace, tmp, &trace_list, list) {
+		list_del(&trace->list);
+		kfree(trace);
+	}
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static cpumask_var_t downed_cpus;
+
+static void enter_uniprocessor(void)
+{
+	int cpu;
+	int err;
+
+	if (downed_cpus == NULL &&
+	    !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) {
+		pr_notice("Failed to allocate mask\n");
+		goto out;
+	}
+
+	get_online_cpus();
+	cpumask_copy(downed_cpus, cpu_online_mask);
+	cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus);
+	if (num_online_cpus() > 1)
+		pr_notice("Disabling non-boot CPUs...\n");
+	put_online_cpus();
+
+	for_each_cpu(cpu, downed_cpus) {
+		err = cpu_down(cpu);
+		if (!err)
+			pr_info("CPU%d is down.\n", cpu);
+		else
+			pr_err("Error taking CPU%d down: %d\n", cpu, err);
+	}
+out:
+	if (num_online_cpus() > 1)
+		pr_warning("multiple CPUs still online, may miss events.\n");
+}
+
+static void leave_uniprocessor(void)
+{
+	int cpu;
+	int err;
+
+	if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0)
+		return;
+	pr_notice("Re-enabling CPUs...\n");
+	for_each_cpu(cpu, downed_cpus) {
+		err = cpu_up(cpu);
+		if (!err)
+			pr_info("enabled CPU%d.\n", cpu);
+		else
+			pr_err("cannot re-enable CPU%d: %d\n", cpu, err);
+	}
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+static void enter_uniprocessor(void)
+{
+	if (num_online_cpus() > 1)
+		pr_warning("multiple CPUs are online, may miss events. "
+			   "Suggest booting with maxcpus=1 kernel argument.\n");
+}
+
+static void leave_uniprocessor(void)
+{
+}
+#endif
+
+void enable_mmiotrace(void)
+{
+	mutex_lock(&mmiotrace_mutex);
+	if (is_enabled())
+		goto out;
+
+	if (nommiotrace)
+		pr_info("MMIO tracing disabled.\n");
+	kmmio_init();
+	enter_uniprocessor();
+	spin_lock_irq(&trace_lock);
+	atomic_inc(&mmiotrace_enabled);
+	spin_unlock_irq(&trace_lock);
+	pr_info("enabled.\n");
+out:
+	mutex_unlock(&mmiotrace_mutex);
+}
+
+void disable_mmiotrace(void)
+{
+	mutex_lock(&mmiotrace_mutex);
+	if (!is_enabled())
+		goto out;
+
+	spin_lock_irq(&trace_lock);
+	atomic_dec(&mmiotrace_enabled);
+	BUG_ON(is_enabled());
+	spin_unlock_irq(&trace_lock);
+
+	clear_trace_list(); /* guarantees: no more kmmio callbacks */
+	leave_uniprocessor();
+	kmmio_cleanup();
+	pr_info("disabled.\n");
+out:
+	mutex_unlock(&mmiotrace_mutex);
+}
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
new file mode 100644
index 000000000..4d1c11c07
--- /dev/null
+++ b/arch/x86/mm/mpx.c
@@ -0,0 +1,916 @@
+/*
+ * mpx.c - Memory Protection eXtensions
+ *
+ * Copyright (c) 2014, Intel Corporation.
+ * Qiaowei Ren <qiaowei.ren@intel.com>
+ * Dave Hansen <dave.hansen@intel.com>
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/sched/sysctl.h>
+
+#include <asm/i387.h>
+#include <asm/insn.h>
+#include <asm/mman.h>
+#include <asm/mmu_context.h>
+#include <asm/mpx.h>
+#include <asm/processor.h>
+#include <asm/fpu-internal.h>
+
+/*
+ * This is really a simplified "vm_mmap". it only handles MPX
+ * bounds tables (the bounds directory is user-allocated).
+ */
+static unsigned long mpx_mmap(unsigned long len)
+{
+	unsigned long ret;
+	unsigned long addr, pgoff;
+	struct mm_struct *mm = current->mm;
+	vm_flags_t vm_flags;
+	struct vm_area_struct *vma;
+
+	/* Only bounds table and bounds directory can be allocated here */
+	if (len != MPX_BD_SIZE_BYTES && len != MPX_BT_SIZE_BYTES)
+		return -EINVAL;
+
+	down_write(&mm->mmap_sem);
+
+	/* Too many mappings? */
+	if (mm->map_count > sysctl_max_map_count) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* Obtain the address to map to. we verify (or select) it and ensure
+	 * that it represents a valid section of the address space.
+	 */
+	addr = get_unmapped_area(NULL, 0, len, 0, MAP_ANONYMOUS | MAP_PRIVATE);
+	if (addr & ~PAGE_MASK) {
+		ret = addr;
+		goto out;
+	}
+
+	vm_flags = VM_READ | VM_WRITE | VM_MPX |
+			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+
+	/* Set pgoff according to addr for anon_vma */
+	pgoff = addr >> PAGE_SHIFT;
+
+	ret = mmap_region(NULL, addr, len, vm_flags, pgoff);
+	if (IS_ERR_VALUE(ret))
+		goto out;
+
+	vma = find_vma(mm, ret);
+	if (!vma) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (vm_flags & VM_LOCKED) {
+		up_write(&mm->mmap_sem);
+		mm_populate(ret, len);
+		return ret;
+	}
+
+out:
+	up_write(&mm->mmap_sem);
+	return ret;
+}
+
+enum reg_type {
+	REG_TYPE_RM = 0,
+	REG_TYPE_INDEX,
+	REG_TYPE_BASE,
+};
+
+static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
+			  enum reg_type type)
+{
+	int regno = 0;
+
+	static const int regoff[] = {
+		offsetof(struct pt_regs, ax),
+		offsetof(struct pt_regs, cx),
+		offsetof(struct pt_regs, dx),
+		offsetof(struct pt_regs, bx),
+		offsetof(struct pt_regs, sp),
+		offsetof(struct pt_regs, bp),
+		offsetof(struct pt_regs, si),
+		offsetof(struct pt_regs, di),
+#ifdef CONFIG_X86_64
+		offsetof(struct pt_regs, r8),
+		offsetof(struct pt_regs, r9),
+		offsetof(struct pt_regs, r10),
+		offsetof(struct pt_regs, r11),
+		offsetof(struct pt_regs, r12),
+		offsetof(struct pt_regs, r13),
+		offsetof(struct pt_regs, r14),
+		offsetof(struct pt_regs, r15),
+#endif
+	};
+	int nr_registers = ARRAY_SIZE(regoff);
+	/*
+	 * Don't possibly decode a 32-bit instructions as
+	 * reading a 64-bit-only register.
+	 */
+	if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64)
+		nr_registers -= 8;
+
+	switch (type) {
+	case REG_TYPE_RM:
+		regno = X86_MODRM_RM(insn->modrm.value);
+		if (X86_REX_B(insn->rex_prefix.value) == 1)
+			regno += 8;
+		break;
+
+	case REG_TYPE_INDEX:
+		regno = X86_SIB_INDEX(insn->sib.value);
+		if (X86_REX_X(insn->rex_prefix.value) == 1)
+			regno += 8;
+		break;
+
+	case REG_TYPE_BASE:
+		regno = X86_SIB_BASE(insn->sib.value);
+		if (X86_REX_B(insn->rex_prefix.value) == 1)
+			regno += 8;
+		break;
+
+	default:
+		pr_err("invalid register type");
+		BUG();
+		break;
+	}
+
+	if (regno > nr_registers) {
+		WARN_ONCE(1, "decoded an instruction with an invalid register");
+		return -EINVAL;
+	}
+	return regoff[regno];
+}
+
+/*
+ * return the address being referenced be instruction
+ * for rm=3 returning the content of the rm reg
+ * for rm!=3 calculates the address using SIB and Disp
+ */
+static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs)
+{
+	unsigned long addr, base, indx;
+	int addr_offset, base_offset, indx_offset;
+	insn_byte_t sib;
+
+	insn_get_modrm(insn);
+	insn_get_sib(insn);
+	sib = insn->sib.value;
+
+	if (X86_MODRM_MOD(insn->modrm.value) == 3) {
+		addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM);
+		if (addr_offset < 0)
+			goto out_err;
+		addr = regs_get_register(regs, addr_offset);
+	} else {
+		if (insn->sib.nbytes) {
+			base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE);
+			if (base_offset < 0)
+				goto out_err;
+
+			indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX);
+			if (indx_offset < 0)
+				goto out_err;
+
+			base = regs_get_register(regs, base_offset);
+			indx = regs_get_register(regs, indx_offset);
+			addr = base + indx * (1 << X86_SIB_SCALE(sib));
+		} else {
+			addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM);
+			if (addr_offset < 0)
+				goto out_err;
+			addr = regs_get_register(regs, addr_offset);
+		}
+		addr += insn->displacement.value;
+	}
+	return (void __user *)addr;
+out_err:
+	return (void __user *)-1;
+}
+
+static int mpx_insn_decode(struct insn *insn,
+			   struct pt_regs *regs)
+{
+	unsigned char buf[MAX_INSN_SIZE];
+	int x86_64 = !test_thread_flag(TIF_IA32);
+	int not_copied;
+	int nr_copied;
+
+	not_copied = copy_from_user(buf, (void __user *)regs->ip, sizeof(buf));
+	nr_copied = sizeof(buf) - not_copied;
+	/*
+	 * The decoder _should_ fail nicely if we pass it a short buffer.
+	 * But, let's not depend on that implementation detail.  If we
+	 * did not get anything, just error out now.
+	 */
+	if (!nr_copied)
+		return -EFAULT;
+	insn_init(insn, buf, nr_copied, x86_64);
+	insn_get_length(insn);
+	/*
+	 * copy_from_user() tries to get as many bytes as we could see in
+	 * the largest possible instruction.  If the instruction we are
+	 * after is shorter than that _and_ we attempt to copy from
+	 * something unreadable, we might get a short read.  This is OK
+	 * as long as the read did not stop in the middle of the
+	 * instruction.  Check to see if we got a partial instruction.
+	 */
+	if (nr_copied < insn->length)
+		return -EFAULT;
+
+	insn_get_opcode(insn);
+	/*
+	 * We only _really_ need to decode bndcl/bndcn/bndcu
+	 * Error out on anything else.
+	 */
+	if (insn->opcode.bytes[0] != 0x0f)
+		goto bad_opcode;
+	if ((insn->opcode.bytes[1] != 0x1a) &&
+	    (insn->opcode.bytes[1] != 0x1b))
+		goto bad_opcode;
+
+	return 0;
+bad_opcode:
+	return -EINVAL;
+}
+
+/*
+ * If a bounds overflow occurs then a #BR is generated. This
+ * function decodes MPX instructions to get violation address
+ * and set this address into extended struct siginfo.
+ *
+ * Note that this is not a super precise way of doing this.
+ * Userspace could have, by the time we get here, written
+ * anything it wants in to the instructions.  We can not
+ * trust anything about it.  They might not be valid
+ * instructions or might encode invalid registers, etc...
+ *
+ * The caller is expected to kfree() the returned siginfo_t.
+ */
+siginfo_t *mpx_generate_siginfo(struct pt_regs *regs,
+				struct xsave_struct *xsave_buf)
+{
+	struct bndreg *bndregs, *bndreg;
+	siginfo_t *info = NULL;
+	struct insn insn;
+	uint8_t bndregno;
+	int err;
+
+	err = mpx_insn_decode(&insn, regs);
+	if (err)
+		goto err_out;
+
+	/*
+	 * We know at this point that we are only dealing with
+	 * MPX instructions.
+	 */
+	insn_get_modrm(&insn);
+	bndregno = X86_MODRM_REG(insn.modrm.value);
+	if (bndregno > 3) {
+		err = -EINVAL;
+		goto err_out;
+	}
+	/* get the bndregs _area_ of the xsave structure */
+	bndregs = get_xsave_addr(xsave_buf, XSTATE_BNDREGS);
+	if (!bndregs) {
+		err = -EINVAL;
+		goto err_out;
+	}
+	/* now go select the individual register in the set of 4 */
+	bndreg = &bndregs[bndregno];
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	/*
+	 * The registers are always 64-bit, but the upper 32
+	 * bits are ignored in 32-bit mode.  Also, note that the
+	 * upper bounds are architecturally represented in 1's
+	 * complement form.
+	 *
+	 * The 'unsigned long' cast is because the compiler
+	 * complains when casting from integers to different-size
+	 * pointers.
+	 */
+	info->si_lower = (void __user *)(unsigned long)bndreg->lower_bound;
+	info->si_upper = (void __user *)(unsigned long)~bndreg->upper_bound;
+	info->si_addr_lsb = 0;
+	info->si_signo = SIGSEGV;
+	info->si_errno = 0;
+	info->si_code = SEGV_BNDERR;
+	info->si_addr = mpx_get_addr_ref(&insn, regs);
+	/*
+	 * We were not able to extract an address from the instruction,
+	 * probably because there was something invalid in it.
+	 */
+	if (info->si_addr == (void *)-1) {
+		err = -EINVAL;
+		goto err_out;
+	}
+	return info;
+err_out:
+	/* info might be NULL, but kfree() handles that */
+	kfree(info);
+	return ERR_PTR(err);
+}
+
+static __user void *task_get_bounds_dir(struct task_struct *tsk)
+{
+	struct bndcsr *bndcsr;
+
+	if (!cpu_feature_enabled(X86_FEATURE_MPX))
+		return MPX_INVALID_BOUNDS_DIR;
+
+	/*
+	 * 32-bit binaries on 64-bit kernels are currently
+	 * unsupported.
+	 */
+	if (IS_ENABLED(CONFIG_X86_64) && test_thread_flag(TIF_IA32))
+		return MPX_INVALID_BOUNDS_DIR;
+	/*
+	 * The bounds directory pointer is stored in a register
+	 * only accessible if we first do an xsave.
+	 */
+	fpu_save_init(&tsk->thread.fpu);
+	bndcsr = get_xsave_addr(&tsk->thread.fpu.state->xsave, XSTATE_BNDCSR);
+	if (!bndcsr)
+		return MPX_INVALID_BOUNDS_DIR;
+
+	/*
+	 * Make sure the register looks valid by checking the
+	 * enable bit.
+	 */
+	if (!(bndcsr->bndcfgu & MPX_BNDCFG_ENABLE_FLAG))
+		return MPX_INVALID_BOUNDS_DIR;
+
+	/*
+	 * Lastly, mask off the low bits used for configuration
+	 * flags, and return the address of the bounds table.
+	 */
+	return (void __user *)(unsigned long)
+		(bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK);
+}
+
+int mpx_enable_management(struct task_struct *tsk)
+{
+	void __user *bd_base = MPX_INVALID_BOUNDS_DIR;
+	struct mm_struct *mm = tsk->mm;
+	int ret = 0;
+
+	/*
+	 * runtime in the userspace will be responsible for allocation of
+	 * the bounds directory. Then, it will save the base of the bounds
+	 * directory into XSAVE/XRSTOR Save Area and enable MPX through
+	 * XRSTOR instruction.
+	 *
+	 * fpu_xsave() is expected to be very expensive. Storing the bounds
+	 * directory here means that we do not have to do xsave in the unmap
+	 * path; we can just use mm->bd_addr instead.
+	 */
+	bd_base = task_get_bounds_dir(tsk);
+	down_write(&mm->mmap_sem);
+	mm->bd_addr = bd_base;
+	if (mm->bd_addr == MPX_INVALID_BOUNDS_DIR)
+		ret = -ENXIO;
+
+	up_write(&mm->mmap_sem);
+	return ret;
+}
+
+int mpx_disable_management(struct task_struct *tsk)
+{
+	struct mm_struct *mm = current->mm;
+
+	if (!cpu_feature_enabled(X86_FEATURE_MPX))
+		return -ENXIO;
+
+	down_write(&mm->mmap_sem);
+	mm->bd_addr = MPX_INVALID_BOUNDS_DIR;
+	up_write(&mm->mmap_sem);
+	return 0;
+}
+
+/*
+ * With 32-bit mode, MPX_BT_SIZE_BYTES is 4MB, and the size of each
+ * bounds table is 16KB. With 64-bit mode, MPX_BT_SIZE_BYTES is 2GB,
+ * and the size of each bounds table is 4MB.
+ */
+static int allocate_bt(long __user *bd_entry)
+{
+	unsigned long expected_old_val = 0;
+	unsigned long actual_old_val = 0;
+	unsigned long bt_addr;
+	int ret = 0;
+
+	/*
+	 * Carve the virtual space out of userspace for the new
+	 * bounds table:
+	 */
+	bt_addr = mpx_mmap(MPX_BT_SIZE_BYTES);
+	if (IS_ERR((void *)bt_addr))
+		return PTR_ERR((void *)bt_addr);
+	/*
+	 * Set the valid flag (kinda like _PAGE_PRESENT in a pte)
+	 */
+	bt_addr = bt_addr | MPX_BD_ENTRY_VALID_FLAG;
+
+	/*
+	 * Go poke the address of the new bounds table in to the
+	 * bounds directory entry out in userspace memory.  Note:
+	 * we may race with another CPU instantiating the same table.
+	 * In that case the cmpxchg will see an unexpected
+	 * 'actual_old_val'.
+	 *
+	 * This can fault, but that's OK because we do not hold
+	 * mmap_sem at this point, unlike some of the other part
+	 * of the MPX code that have to pagefault_disable().
+	 */
+	ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry,
+					   expected_old_val, bt_addr);
+	if (ret)
+		goto out_unmap;
+
+	/*
+	 * The user_atomic_cmpxchg_inatomic() will only return nonzero
+	 * for faults, *not* if the cmpxchg itself fails.  Now we must
+	 * verify that the cmpxchg itself completed successfully.
+	 */
+	/*
+	 * We expected an empty 'expected_old_val', but instead found
+	 * an apparently valid entry.  Assume we raced with another
+	 * thread to instantiate this table and desclare succecss.
+	 */
+	if (actual_old_val & MPX_BD_ENTRY_VALID_FLAG) {
+		ret = 0;
+		goto out_unmap;
+	}
+	/*
+	 * We found a non-empty bd_entry but it did not have the
+	 * VALID_FLAG set.  Return an error which will result in
+	 * a SEGV since this probably means that somebody scribbled
+	 * some invalid data in to a bounds table.
+	 */
+	if (expected_old_val != actual_old_val) {
+		ret = -EINVAL;
+		goto out_unmap;
+	}
+	return 0;
+out_unmap:
+	vm_munmap(bt_addr & MPX_BT_ADDR_MASK, MPX_BT_SIZE_BYTES);
+	return ret;
+}
+
+/*
+ * When a BNDSTX instruction attempts to save bounds to a bounds
+ * table, it will first attempt to look up the table in the
+ * first-level bounds directory.  If it does not find a table in
+ * the directory, a #BR is generated and we get here in order to
+ * allocate a new table.
+ *
+ * With 32-bit mode, the size of BD is 4MB, and the size of each
+ * bound table is 16KB. With 64-bit mode, the size of BD is 2GB,
+ * and the size of each bound table is 4MB.
+ */
+static int do_mpx_bt_fault(struct xsave_struct *xsave_buf)
+{
+	unsigned long bd_entry, bd_base;
+	struct bndcsr *bndcsr;
+
+	bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR);
+	if (!bndcsr)
+		return -EINVAL;
+	/*
+	 * Mask off the preserve and enable bits
+	 */
+	bd_base = bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK;
+	/*
+	 * The hardware provides the address of the missing or invalid
+	 * entry via BNDSTATUS, so we don't have to go look it up.
+	 */
+	bd_entry = bndcsr->bndstatus & MPX_BNDSTA_ADDR_MASK;
+	/*
+	 * Make sure the directory entry is within where we think
+	 * the directory is.
+	 */
+	if ((bd_entry < bd_base) ||
+	    (bd_entry >= bd_base + MPX_BD_SIZE_BYTES))
+		return -EINVAL;
+
+	return allocate_bt((long __user *)bd_entry);
+}
+
+int mpx_handle_bd_fault(struct xsave_struct *xsave_buf)
+{
+	/*
+	 * Userspace never asked us to manage the bounds tables,
+	 * so refuse to help.
+	 */
+	if (!kernel_managing_mpx_tables(current->mm))
+		return -EINVAL;
+
+	if (do_mpx_bt_fault(xsave_buf)) {
+		force_sig(SIGSEGV, current);
+		/*
+		 * The force_sig() is essentially "handling" this
+		 * exception, so we do not pass up the error
+		 * from do_mpx_bt_fault().
+		 */
+	}
+	return 0;
+}
+
+/*
+ * A thin wrapper around get_user_pages().  Returns 0 if the
+ * fault was resolved or -errno if not.
+ */
+static int mpx_resolve_fault(long __user *addr, int write)
+{
+	long gup_ret;
+	int nr_pages = 1;
+	int force = 0;
+
+	gup_ret = get_user_pages(current, current->mm, (unsigned long)addr,
+				 nr_pages, write, force, NULL, NULL);
+	/*
+	 * get_user_pages() returns number of pages gotten.
+	 * 0 means we failed to fault in and get anything,
+	 * probably because 'addr' is bad.
+	 */
+	if (!gup_ret)
+		return -EFAULT;
+	/* Other error, return it */
+	if (gup_ret < 0)
+		return gup_ret;
+	/* must have gup'd a page and gup_ret>0, success */
+	return 0;
+}
+
+/*
+ * Get the base of bounds tables pointed by specific bounds
+ * directory entry.
+ */
+static int get_bt_addr(struct mm_struct *mm,
+			long __user *bd_entry, unsigned long *bt_addr)
+{
+	int ret;
+	int valid_bit;
+
+	if (!access_ok(VERIFY_READ, (bd_entry), sizeof(*bd_entry)))
+		return -EFAULT;
+
+	while (1) {
+		int need_write = 0;
+
+		pagefault_disable();
+		ret = get_user(*bt_addr, bd_entry);
+		pagefault_enable();
+		if (!ret)
+			break;
+		if (ret == -EFAULT)
+			ret = mpx_resolve_fault(bd_entry, need_write);
+		/*
+		 * If we could not resolve the fault, consider it
+		 * userspace's fault and error out.
+		 */
+		if (ret)
+			return ret;
+	}
+
+	valid_bit = *bt_addr & MPX_BD_ENTRY_VALID_FLAG;
+	*bt_addr &= MPX_BT_ADDR_MASK;
+
+	/*
+	 * When the kernel is managing bounds tables, a bounds directory
+	 * entry will either have a valid address (plus the valid bit)
+	 * *OR* be completely empty. If we see a !valid entry *and* some
+	 * data in the address field, we know something is wrong. This
+	 * -EINVAL return will cause a SIGSEGV.
+	 */
+	if (!valid_bit && *bt_addr)
+		return -EINVAL;
+	/*
+	 * Do we have an completely zeroed bt entry?  That is OK.  It
+	 * just means there was no bounds table for this memory.  Make
+	 * sure to distinguish this from -EINVAL, which will cause
+	 * a SEGV.
+	 */
+	if (!valid_bit)
+		return -ENOENT;
+
+	return 0;
+}
+
+/*
+ * Free the backing physical pages of bounds table 'bt_addr'.
+ * Assume start...end is within that bounds table.
+ */
+static int zap_bt_entries(struct mm_struct *mm,
+		unsigned long bt_addr,
+		unsigned long start, unsigned long end)
+{
+	struct vm_area_struct *vma;
+	unsigned long addr, len;
+
+	/*
+	 * Find the first overlapping vma. If vma->vm_start > start, there
+	 * will be a hole in the bounds table. This -EINVAL return will
+	 * cause a SIGSEGV.
+	 */
+	vma = find_vma(mm, start);
+	if (!vma || vma->vm_start > start)
+		return -EINVAL;
+
+	/*
+	 * A NUMA policy on a VM_MPX VMA could cause this bouds table to
+	 * be split. So we need to look across the entire 'start -> end'
+	 * range of this bounds table, find all of the VM_MPX VMAs, and
+	 * zap only those.
+	 */
+	addr = start;
+	while (vma && vma->vm_start < end) {
+		/*
+		 * We followed a bounds directory entry down
+		 * here.  If we find a non-MPX VMA, that's bad,
+		 * so stop immediately and return an error.  This
+		 * probably results in a SIGSEGV.
+		 */
+		if (!(vma->vm_flags & VM_MPX))
+			return -EINVAL;
+
+		len = min(vma->vm_end, end) - addr;
+		zap_page_range(vma, addr, len, NULL);
+
+		vma = vma->vm_next;
+		addr = vma->vm_start;
+	}
+
+	return 0;
+}
+
+static int unmap_single_bt(struct mm_struct *mm,
+		long __user *bd_entry, unsigned long bt_addr)
+{
+	unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG;
+	unsigned long actual_old_val = 0;
+	int ret;
+
+	while (1) {
+		int need_write = 1;
+
+		pagefault_disable();
+		ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry,
+						   expected_old_val, 0);
+		pagefault_enable();
+		if (!ret)
+			break;
+		if (ret == -EFAULT)
+			ret = mpx_resolve_fault(bd_entry, need_write);
+		/*
+		 * If we could not resolve the fault, consider it
+		 * userspace's fault and error out.
+		 */
+		if (ret)
+			return ret;
+	}
+	/*
+	 * The cmpxchg was performed, check the results.
+	 */
+	if (actual_old_val != expected_old_val) {
+		/*
+		 * Someone else raced with us to unmap the table.
+		 * There was no bounds table pointed to by the
+		 * directory, so declare success.  Somebody freed
+		 * it.
+		 */
+		if (!actual_old_val)
+			return 0;
+		/*
+		 * Something messed with the bounds directory
+		 * entry.  We hold mmap_sem for read or write
+		 * here, so it could not be a _new_ bounds table
+		 * that someone just allocated.  Something is
+		 * wrong, so pass up the error and SIGSEGV.
+		 */
+		return -EINVAL;
+	}
+
+	/*
+	 * Note, we are likely being called under do_munmap() already. To
+	 * avoid recursion, do_munmap() will check whether it comes
+	 * from one bounds table through VM_MPX flag.
+	 */
+	return do_munmap(mm, bt_addr, MPX_BT_SIZE_BYTES);
+}
+
+/*
+ * If the bounds table pointed by bounds directory 'bd_entry' is
+ * not shared, unmap this whole bounds table. Otherwise, only free
+ * those backing physical pages of bounds table entries covered
+ * in this virtual address region start...end.
+ */
+static int unmap_shared_bt(struct mm_struct *mm,
+		long __user *bd_entry, unsigned long start,
+		unsigned long end, bool prev_shared, bool next_shared)
+{
+	unsigned long bt_addr;
+	int ret;
+
+	ret = get_bt_addr(mm, bd_entry, &bt_addr);
+	/*
+	 * We could see an "error" ret for not-present bounds
+	 * tables (not really an error), or actual errors, but
+	 * stop unmapping either way.
+	 */
+	if (ret)
+		return ret;
+
+	if (prev_shared && next_shared)
+		ret = zap_bt_entries(mm, bt_addr,
+				bt_addr+MPX_GET_BT_ENTRY_OFFSET(start),
+				bt_addr+MPX_GET_BT_ENTRY_OFFSET(end));
+	else if (prev_shared)
+		ret = zap_bt_entries(mm, bt_addr,
+				bt_addr+MPX_GET_BT_ENTRY_OFFSET(start),
+				bt_addr+MPX_BT_SIZE_BYTES);
+	else if (next_shared)
+		ret = zap_bt_entries(mm, bt_addr, bt_addr,
+				bt_addr+MPX_GET_BT_ENTRY_OFFSET(end));
+	else
+		ret = unmap_single_bt(mm, bd_entry, bt_addr);
+
+	return ret;
+}
+
+/*
+ * A virtual address region being munmap()ed might share bounds table
+ * with adjacent VMAs. We only need to free the backing physical
+ * memory of these shared bounds tables entries covered in this virtual
+ * address region.
+ */
+static int unmap_edge_bts(struct mm_struct *mm,
+		unsigned long start, unsigned long end)
+{
+	int ret;
+	long __user *bde_start, *bde_end;
+	struct vm_area_struct *prev, *next;
+	bool prev_shared = false, next_shared = false;
+
+	bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start);
+	bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1);
+
+	/*
+	 * Check whether bde_start and bde_end are shared with adjacent
+	 * VMAs.
+	 *
+	 * We already unliked the VMAs from the mm's rbtree so 'start'
+	 * is guaranteed to be in a hole. This gets us the first VMA
+	 * before the hole in to 'prev' and the next VMA after the hole
+	 * in to 'next'.
+	 */
+	next = find_vma_prev(mm, start, &prev);
+	if (prev && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(prev->vm_end-1))
+			== bde_start)
+		prev_shared = true;
+	if (next && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(next->vm_start))
+			== bde_end)
+		next_shared = true;
+
+	/*
+	 * This virtual address region being munmap()ed is only
+	 * covered by one bounds table.
+	 *
+	 * In this case, if this table is also shared with adjacent
+	 * VMAs, only part of the backing physical memory of the bounds
+	 * table need be freeed. Otherwise the whole bounds table need
+	 * be unmapped.
+	 */
+	if (bde_start == bde_end) {
+		return unmap_shared_bt(mm, bde_start, start, end,
+				prev_shared, next_shared);
+	}
+
+	/*
+	 * If more than one bounds tables are covered in this virtual
+	 * address region being munmap()ed, we need to separately check
+	 * whether bde_start and bde_end are shared with adjacent VMAs.
+	 */
+	ret = unmap_shared_bt(mm, bde_start, start, end, prev_shared, false);
+	if (ret)
+		return ret;
+	ret = unmap_shared_bt(mm, bde_end, start, end, false, next_shared);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int mpx_unmap_tables(struct mm_struct *mm,
+		unsigned long start, unsigned long end)
+{
+	int ret;
+	long __user *bd_entry, *bde_start, *bde_end;
+	unsigned long bt_addr;
+
+	/*
+	 * "Edge" bounds tables are those which are being used by the region
+	 * (start -> end), but that may be shared with adjacent areas.  If they
+	 * turn out to be completely unshared, they will be freed.  If they are
+	 * shared, we will free the backing store (like an MADV_DONTNEED) for
+	 * areas used by this region.
+	 */
+	ret = unmap_edge_bts(mm, start, end);
+	switch (ret) {
+		/* non-present tables are OK */
+		case 0:
+		case -ENOENT:
+			/* Success, or no tables to unmap */
+			break;
+		case -EINVAL:
+		case -EFAULT:
+		default:
+			return ret;
+	}
+
+	/*
+	 * Only unmap the bounds table that are
+	 *   1. fully covered
+	 *   2. not at the edges of the mapping, even if full aligned
+	 */
+	bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start);
+	bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1);
+	for (bd_entry = bde_start + 1; bd_entry < bde_end; bd_entry++) {
+		ret = get_bt_addr(mm, bd_entry, &bt_addr);
+		switch (ret) {
+			case 0:
+				break;
+			case -ENOENT:
+				/* No table here, try the next one */
+				continue;
+			case -EINVAL:
+			case -EFAULT:
+			default:
+				/*
+				 * Note: we are being strict here.
+				 * Any time we run in to an issue
+				 * unmapping tables, we stop and
+				 * SIGSEGV.
+				 */
+				return ret;
+		}
+
+		ret = unmap_single_bt(mm, bd_entry, bt_addr);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Free unused bounds tables covered in a virtual address region being
+ * munmap()ed. Assume end > start.
+ *
+ * This function will be called by do_munmap(), and the VMAs covering
+ * the virtual address region start...end have already been split if
+ * necessary, and the 'vma' is the first vma in this range (start -> end).
+ */
+void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long start, unsigned long end)
+{
+	int ret;
+
+	/*
+	 * Refuse to do anything unless userspace has asked
+	 * the kernel to help manage the bounds tables,
+	 */
+	if (!kernel_managing_mpx_tables(current->mm))
+		return;
+	/*
+	 * This will look across the entire 'start -> end' range,
+	 * and find all of the non-VM_MPX VMAs.
+	 *
+	 * To avoid recursion, if a VM_MPX vma is found in the range
+	 * (start->end), we will not continue follow-up work. This
+	 * recursion represents having bounds tables for bounds tables,
+	 * which should not occur normally. Being strict about it here
+	 * helps ensure that we do not have an exploitable stack overflow.
+	 */
+	do {
+		if (vma->vm_flags & VM_MPX)
+			return;
+		vma = vma->vm_next;
+	} while (vma && vma->vm_start < end);
+
+	ret = mpx_unmap_tables(mm, start, end);
+	if (ret)
+		force_sig(SIGSEGV, current);
+}
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
new file mode 100644
index 000000000..4053bb58b
--- /dev/null
+++ b/arch/x86/mm/numa.c
@@ -0,0 +1,882 @@
+/* Common code for 32 and 64-bit NUMA */
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/mmzone.h>
+#include <linux/ctype.h>
+#include <linux/module.h>
+#include <linux/nodemask.h>
+#include <linux/sched.h>
+#include <linux/topology.h>
+
+#include <asm/e820.h>
+#include <asm/proto.h>
+#include <asm/dma.h>
+#include <asm/acpi.h>
+#include <asm/amd_nb.h>
+
+#include "numa_internal.h"
+
+int __initdata numa_off;
+nodemask_t numa_nodes_parsed __initdata;
+
+struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
+EXPORT_SYMBOL(node_data);
+
+static struct numa_meminfo numa_meminfo
+#ifndef CONFIG_MEMORY_HOTPLUG
+__initdata
+#endif
+;
+
+static int numa_distance_cnt;
+static u8 *numa_distance;
+
+static __init int numa_setup(char *opt)
+{
+	if (!opt)
+		return -EINVAL;
+	if (!strncmp(opt, "off", 3))
+		numa_off = 1;
+#ifdef CONFIG_NUMA_EMU
+	if (!strncmp(opt, "fake=", 5))
+		numa_emu_cmdline(opt + 5);
+#endif
+#ifdef CONFIG_ACPI_NUMA
+	if (!strncmp(opt, "noacpi", 6))
+		acpi_numa = -1;
+#endif
+	return 0;
+}
+early_param("numa", numa_setup);
+
+/*
+ * apicid, cpu, node mappings
+ */
+s16 __apicid_to_node[MAX_LOCAL_APIC] = {
+	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
+
+int numa_cpu_node(int cpu)
+{
+	int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+
+	if (apicid != BAD_APICID)
+		return __apicid_to_node[apicid];
+	return NUMA_NO_NODE;
+}
+
+cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
+EXPORT_SYMBOL(node_to_cpumask_map);
+
+/*
+ * Map cpu index to node index
+ */
+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+
+void numa_set_node(int cpu, int node)
+{
+	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
+
+	/* early setting, no percpu area yet */
+	if (cpu_to_node_map) {
+		cpu_to_node_map[cpu] = node;
+		return;
+	}
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
+		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
+		dump_stack();
+		return;
+	}
+#endif
+	per_cpu(x86_cpu_to_node_map, cpu) = node;
+
+	set_cpu_numa_node(cpu, node);
+}
+
+void numa_clear_node(int cpu)
+{
+	numa_set_node(cpu, NUMA_NO_NODE);
+}
+
+/*
+ * Allocate node_to_cpumask_map based on number of available nodes
+ * Requires node_possible_map to be valid.
+ *
+ * Note: cpumask_of_node() is not valid until after this is done.
+ * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
+ */
+void __init setup_node_to_cpumask_map(void)
+{
+	unsigned int node;
+
+	/* setup nr_node_ids if not done yet */
+	if (nr_node_ids == MAX_NUMNODES)
+		setup_nr_node_ids();
+
+	/* allocate the map */
+	for (node = 0; node < nr_node_ids; node++)
+		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
+
+	/* cpumask_of_node() will now work */
+	pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
+}
+
+static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
+				     struct numa_meminfo *mi)
+{
+	/* ignore zero length blks */
+	if (start == end)
+		return 0;
+
+	/* whine about and ignore invalid blks */
+	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
+		pr_warning("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
+			   nid, start, end - 1);
+		return 0;
+	}
+
+	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
+		pr_err("NUMA: too many memblk ranges\n");
+		return -EINVAL;
+	}
+
+	mi->blk[mi->nr_blks].start = start;
+	mi->blk[mi->nr_blks].end = end;
+	mi->blk[mi->nr_blks].nid = nid;
+	mi->nr_blks++;
+	return 0;
+}
+
+/**
+ * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
+ * @idx: Index of memblk to remove
+ * @mi: numa_meminfo to remove memblk from
+ *
+ * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
+ * decrementing @mi->nr_blks.
+ */
+void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
+{
+	mi->nr_blks--;
+	memmove(&mi->blk[idx], &mi->blk[idx + 1],
+		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
+}
+
+/**
+ * numa_add_memblk - Add one numa_memblk to numa_meminfo
+ * @nid: NUMA node ID of the new memblk
+ * @start: Start address of the new memblk
+ * @end: End address of the new memblk
+ *
+ * Add a new memblk to the default numa_meminfo.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_add_memblk(int nid, u64 start, u64 end)
+{
+	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
+}
+
+/* Allocate NODE_DATA for a node on the local memory */
+static void __init alloc_node_data(int nid)
+{
+	const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
+	u64 nd_pa;
+	void *nd;
+	int tnid;
+
+	/*
+	 * Allocate node data.  Try node-local memory and then any node.
+	 * Never allocate in DMA zone.
+	 */
+	nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+	if (!nd_pa) {
+		nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES,
+					      MEMBLOCK_ALLOC_ACCESSIBLE);
+		if (!nd_pa) {
+			pr_err("Cannot find %zu bytes in node %d\n",
+			       nd_size, nid);
+			return;
+		}
+	}
+	nd = __va(nd_pa);
+
+	/* report and initialize */
+	printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
+	       nd_pa, nd_pa + nd_size - 1);
+	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
+	if (tnid != nid)
+		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nid, tnid);
+
+	node_data[nid] = nd;
+	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
+
+	node_set_online(nid);
+}
+
+/**
+ * numa_cleanup_meminfo - Cleanup a numa_meminfo
+ * @mi: numa_meminfo to clean up
+ *
+ * Sanitize @mi by merging and removing unncessary memblks.  Also check for
+ * conflicts and clear unused memblks.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
+{
+	const u64 low = 0;
+	const u64 high = PFN_PHYS(max_pfn);
+	int i, j, k;
+
+	/* first, trim all entries */
+	for (i = 0; i < mi->nr_blks; i++) {
+		struct numa_memblk *bi = &mi->blk[i];
+
+		/* make sure all blocks are inside the limits */
+		bi->start = max(bi->start, low);
+		bi->end = min(bi->end, high);
+
+		/* and there's no empty block */
+		if (bi->start >= bi->end)
+			numa_remove_memblk_from(i--, mi);
+	}
+
+	/* merge neighboring / overlapping entries */
+	for (i = 0; i < mi->nr_blks; i++) {
+		struct numa_memblk *bi = &mi->blk[i];
+
+		for (j = i + 1; j < mi->nr_blks; j++) {
+			struct numa_memblk *bj = &mi->blk[j];
+			u64 start, end;
+
+			/*
+			 * See whether there are overlapping blocks.  Whine
+			 * about but allow overlaps of the same nid.  They
+			 * will be merged below.
+			 */
+			if (bi->end > bj->start && bi->start < bj->end) {
+				if (bi->nid != bj->nid) {
+					pr_err("NUMA: node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
+					       bi->nid, bi->start, bi->end - 1,
+					       bj->nid, bj->start, bj->end - 1);
+					return -EINVAL;
+				}
+				pr_warning("NUMA: Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
+					   bi->nid, bi->start, bi->end - 1,
+					   bj->start, bj->end - 1);
+			}
+
+			/*
+			 * Join together blocks on the same node, holes
+			 * between which don't overlap with memory on other
+			 * nodes.
+			 */
+			if (bi->nid != bj->nid)
+				continue;
+			start = min(bi->start, bj->start);
+			end = max(bi->end, bj->end);
+			for (k = 0; k < mi->nr_blks; k++) {
+				struct numa_memblk *bk = &mi->blk[k];
+
+				if (bi->nid == bk->nid)
+					continue;
+				if (start < bk->end && end > bk->start)
+					break;
+			}
+			if (k < mi->nr_blks)
+				continue;
+			printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
+			       bi->nid, bi->start, bi->end - 1, bj->start,
+			       bj->end - 1, start, end - 1);
+			bi->start = start;
+			bi->end = end;
+			numa_remove_memblk_from(j--, mi);
+		}
+	}
+
+	/* clear unused ones */
+	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
+		mi->blk[i].start = mi->blk[i].end = 0;
+		mi->blk[i].nid = NUMA_NO_NODE;
+	}
+
+	return 0;
+}
+
+/*
+ * Set nodes, which have memory in @mi, in *@nodemask.
+ */
+static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
+					      const struct numa_meminfo *mi)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
+		if (mi->blk[i].start != mi->blk[i].end &&
+		    mi->blk[i].nid != NUMA_NO_NODE)
+			node_set(mi->blk[i].nid, *nodemask);
+}
+
+/**
+ * numa_reset_distance - Reset NUMA distance table
+ *
+ * The current table is freed.  The next numa_set_distance() call will
+ * create a new one.
+ */
+void __init numa_reset_distance(void)
+{
+	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
+
+	/* numa_distance could be 1LU marking allocation failure, test cnt */
+	if (numa_distance_cnt)
+		memblock_free(__pa(numa_distance), size);
+	numa_distance_cnt = 0;
+	numa_distance = NULL;	/* enable table creation */
+}
+
+static int __init numa_alloc_distance(void)
+{
+	nodemask_t nodes_parsed;
+	size_t size;
+	int i, j, cnt = 0;
+	u64 phys;
+
+	/* size the new table and allocate it */
+	nodes_parsed = numa_nodes_parsed;
+	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
+
+	for_each_node_mask(i, nodes_parsed)
+		cnt = i;
+	cnt++;
+	size = cnt * cnt * sizeof(numa_distance[0]);
+
+	phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
+				      size, PAGE_SIZE);
+	if (!phys) {
+		pr_warning("NUMA: Warning: can't allocate distance table!\n");
+		/* don't retry until explicitly reset */
+		numa_distance = (void *)1LU;
+		return -ENOMEM;
+	}
+	memblock_reserve(phys, size);
+
+	numa_distance = __va(phys);
+	numa_distance_cnt = cnt;
+
+	/* fill with the default distances */
+	for (i = 0; i < cnt; i++)
+		for (j = 0; j < cnt; j++)
+			numa_distance[i * cnt + j] = i == j ?
+				LOCAL_DISTANCE : REMOTE_DISTANCE;
+	printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
+
+	return 0;
+}
+
+/**
+ * numa_set_distance - Set NUMA distance from one NUMA to another
+ * @from: the 'from' node to set distance
+ * @to: the 'to'  node to set distance
+ * @distance: NUMA distance
+ *
+ * Set the distance from node @from to @to to @distance.  If distance table
+ * doesn't exist, one which is large enough to accommodate all the currently
+ * known nodes will be created.
+ *
+ * If such table cannot be allocated, a warning is printed and further
+ * calls are ignored until the distance table is reset with
+ * numa_reset_distance().
+ *
+ * If @from or @to is higher than the highest known node or lower than zero
+ * at the time of table creation or @distance doesn't make sense, the call
+ * is ignored.
+ * This is to allow simplification of specific NUMA config implementations.
+ */
+void __init numa_set_distance(int from, int to, int distance)
+{
+	if (!numa_distance && numa_alloc_distance() < 0)
+		return;
+
+	if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
+			from < 0 || to < 0) {
+		pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
+			    from, to, distance);
+		return;
+	}
+
+	if ((u8)distance != distance ||
+	    (from == to && distance != LOCAL_DISTANCE)) {
+		pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
+			     from, to, distance);
+		return;
+	}
+
+	numa_distance[from * numa_distance_cnt + to] = distance;
+}
+
+int __node_distance(int from, int to)
+{
+	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
+		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
+	return numa_distance[from * numa_distance_cnt + to];
+}
+EXPORT_SYMBOL(__node_distance);
+
+/*
+ * Sanity check to catch more bad NUMA configurations (they are amazingly
+ * common).  Make sure the nodes cover all memory.
+ */
+static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
+{
+	u64 numaram, e820ram;
+	int i;
+
+	numaram = 0;
+	for (i = 0; i < mi->nr_blks; i++) {
+		u64 s = mi->blk[i].start >> PAGE_SHIFT;
+		u64 e = mi->blk[i].end >> PAGE_SHIFT;
+		numaram += e - s;
+		numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
+		if ((s64)numaram < 0)
+			numaram = 0;
+	}
+
+	e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
+
+	/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
+	if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
+		printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
+		       (numaram << PAGE_SHIFT) >> 20,
+		       (e820ram << PAGE_SHIFT) >> 20);
+		return false;
+	}
+	return true;
+}
+
+static void __init numa_clear_kernel_node_hotplug(void)
+{
+	int i, nid;
+	nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
+	unsigned long start, end;
+	struct memblock_region *r;
+
+	/*
+	 * At this time, all memory regions reserved by memblock are
+	 * used by the kernel. Set the nid in memblock.reserved will
+	 * mark out all the nodes the kernel resides in.
+	 */
+	for (i = 0; i < numa_meminfo.nr_blks; i++) {
+		struct numa_memblk *mb = &numa_meminfo.blk[i];
+
+		memblock_set_node(mb->start, mb->end - mb->start,
+				  &memblock.reserved, mb->nid);
+	}
+
+	/*
+	 * Mark all kernel nodes.
+	 *
+	 * When booting with mem=nn[kMG] or in a kdump kernel, numa_meminfo
+	 * may not include all the memblock.reserved memory ranges because
+	 * trim_snb_memory() reserves specific pages for Sandy Bridge graphics.
+	 */
+	for_each_memblock(reserved, r)
+		if (r->nid != MAX_NUMNODES)
+			node_set(r->nid, numa_kernel_nodes);
+
+	/* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
+	for (i = 0; i < numa_meminfo.nr_blks; i++) {
+		nid = numa_meminfo.blk[i].nid;
+		if (!node_isset(nid, numa_kernel_nodes))
+			continue;
+
+		start = numa_meminfo.blk[i].start;
+		end = numa_meminfo.blk[i].end;
+
+		memblock_clear_hotplug(start, end - start);
+	}
+}
+
+static int __init numa_register_memblks(struct numa_meminfo *mi)
+{
+	unsigned long uninitialized_var(pfn_align);
+	int i, nid;
+
+	/* Account for nodes with cpus and no memory */
+	node_possible_map = numa_nodes_parsed;
+	numa_nodemask_from_meminfo(&node_possible_map, mi);
+	if (WARN_ON(nodes_empty(node_possible_map)))
+		return -EINVAL;
+
+	for (i = 0; i < mi->nr_blks; i++) {
+		struct numa_memblk *mb = &mi->blk[i];
+		memblock_set_node(mb->start, mb->end - mb->start,
+				  &memblock.memory, mb->nid);
+	}
+
+	/*
+	 * At very early time, the kernel have to use some memory such as
+	 * loading the kernel image. We cannot prevent this anyway. So any
+	 * node the kernel resides in should be un-hotpluggable.
+	 *
+	 * And when we come here, alloc node data won't fail.
+	 */
+	numa_clear_kernel_node_hotplug();
+
+	/*
+	 * If sections array is gonna be used for pfn -> nid mapping, check
+	 * whether its granularity is fine enough.
+	 */
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+	pfn_align = node_map_pfn_alignment();
+	if (pfn_align && pfn_align < PAGES_PER_SECTION) {
+		printk(KERN_WARNING "Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
+		       PFN_PHYS(pfn_align) >> 20,
+		       PFN_PHYS(PAGES_PER_SECTION) >> 20);
+		return -EINVAL;
+	}
+#endif
+	if (!numa_meminfo_cover_memory(mi))
+		return -EINVAL;
+
+	/* Finally register nodes. */
+	for_each_node_mask(nid, node_possible_map) {
+		u64 start = PFN_PHYS(max_pfn);
+		u64 end = 0;
+
+		for (i = 0; i < mi->nr_blks; i++) {
+			if (nid != mi->blk[i].nid)
+				continue;
+			start = min(mi->blk[i].start, start);
+			end = max(mi->blk[i].end, end);
+		}
+
+		if (start >= end)
+			continue;
+
+		/*
+		 * Don't confuse VM with a node that doesn't have the
+		 * minimum amount of memory:
+		 */
+		if (end && (end - start) < NODE_MIN_SIZE)
+			continue;
+
+		alloc_node_data(nid);
+	}
+
+	/* Dump memblock with node info and return. */
+	memblock_dump_all();
+	return 0;
+}
+
+/*
+ * There are unfortunately some poorly designed mainboards around that
+ * only connect memory to a single CPU. This breaks the 1:1 cpu->node
+ * mapping. To avoid this fill in the mapping for all possible CPUs,
+ * as the number of CPUs is not known yet. We round robin the existing
+ * nodes.
+ */
+static void __init numa_init_array(void)
+{
+	int rr, i;
+
+	rr = first_node(node_online_map);
+	for (i = 0; i < nr_cpu_ids; i++) {
+		if (early_cpu_to_node(i) != NUMA_NO_NODE)
+			continue;
+		numa_set_node(i, rr);
+		rr = next_node(rr, node_online_map);
+		if (rr == MAX_NUMNODES)
+			rr = first_node(node_online_map);
+	}
+}
+
+static int __init numa_init(int (*init_func)(void))
+{
+	int i;
+	int ret;
+
+	for (i = 0; i < MAX_LOCAL_APIC; i++)
+		set_apicid_to_node(i, NUMA_NO_NODE);
+
+	nodes_clear(numa_nodes_parsed);
+	nodes_clear(node_possible_map);
+	nodes_clear(node_online_map);
+	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
+	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.memory,
+				  MAX_NUMNODES));
+	WARN_ON(memblock_set_node(0, ULLONG_MAX, &memblock.reserved,
+				  MAX_NUMNODES));
+	/* In case that parsing SRAT failed. */
+	WARN_ON(memblock_clear_hotplug(0, ULLONG_MAX));
+	numa_reset_distance();
+
+	ret = init_func();
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * We reset memblock back to the top-down direction
+	 * here because if we configured ACPI_NUMA, we have
+	 * parsed SRAT in init_func(). It is ok to have the
+	 * reset here even if we did't configure ACPI_NUMA
+	 * or acpi numa init fails and fallbacks to dummy
+	 * numa init.
+	 */
+	memblock_set_bottom_up(false);
+
+	ret = numa_cleanup_meminfo(&numa_meminfo);
+	if (ret < 0)
+		return ret;
+
+	numa_emulation(&numa_meminfo, numa_distance_cnt);
+
+	ret = numa_register_memblks(&numa_meminfo);
+	if (ret < 0)
+		return ret;
+
+	for (i = 0; i < nr_cpu_ids; i++) {
+		int nid = early_cpu_to_node(i);
+
+		if (nid == NUMA_NO_NODE)
+			continue;
+		if (!node_online(nid))
+			numa_clear_node(i);
+	}
+	numa_init_array();
+
+	return 0;
+}
+
+/**
+ * dummy_numa_init - Fallback dummy NUMA init
+ *
+ * Used if there's no underlying NUMA architecture, NUMA initialization
+ * fails, or NUMA is disabled on the command line.
+ *
+ * Must online at least one node and add memory blocks that cover all
+ * allowed memory.  This function must not fail.
+ */
+static int __init dummy_numa_init(void)
+{
+	printk(KERN_INFO "%s\n",
+	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
+	printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
+	       0LLU, PFN_PHYS(max_pfn) - 1);
+
+	node_set(0, numa_nodes_parsed);
+	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
+
+	return 0;
+}
+
+/**
+ * x86_numa_init - Initialize NUMA
+ *
+ * Try each configured NUMA initialization method until one succeeds.  The
+ * last fallback is dummy single node config encomapssing whole memory and
+ * never fails.
+ */
+void __init x86_numa_init(void)
+{
+	if (!numa_off) {
+#ifdef CONFIG_ACPI_NUMA
+		if (!numa_init(x86_acpi_numa_init))
+			return;
+#endif
+#ifdef CONFIG_AMD_NUMA
+		if (!numa_init(amd_numa_init))
+			return;
+#endif
+	}
+
+	numa_init(dummy_numa_init);
+}
+
+static __init int find_near_online_node(int node)
+{
+	int n, val;
+	int min_val = INT_MAX;
+	int best_node = -1;
+
+	for_each_online_node(n) {
+		val = node_distance(node, n);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	return best_node;
+}
+
+/*
+ * Setup early cpu_to_node.
+ *
+ * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
+ * and apicid_to_node[] tables have valid entries for a CPU.
+ * This means we skip cpu_to_node[] initialisation for NUMA
+ * emulation and faking node case (when running a kernel compiled
+ * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
+ * is already initialized in a round robin manner at numa_init_array,
+ * prior to this call, and this initialization is good enough
+ * for the fake NUMA cases.
+ *
+ * Called before the per_cpu areas are setup.
+ */
+void __init init_cpu_to_node(void)
+{
+	int cpu;
+	u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
+
+	BUG_ON(cpu_to_apicid == NULL);
+
+	for_each_possible_cpu(cpu) {
+		int node = numa_cpu_node(cpu);
+
+		if (node == NUMA_NO_NODE)
+			continue;
+		if (!node_online(node))
+			node = find_near_online_node(node);
+		numa_set_node(cpu, node);
+	}
+}
+
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+
+# ifndef CONFIG_NUMA_EMU
+void numa_add_cpu(int cpu)
+{
+	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+
+void numa_remove_cpu(int cpu)
+{
+	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+# endif	/* !CONFIG_NUMA_EMU */
+
+#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
+
+int __cpu_to_node(int cpu)
+{
+	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
+		printk(KERN_WARNING
+			"cpu_to_node(%d): usage too early!\n", cpu);
+		dump_stack();
+		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+	}
+	return per_cpu(x86_cpu_to_node_map, cpu);
+}
+EXPORT_SYMBOL(__cpu_to_node);
+
+/*
+ * Same function as cpu_to_node() but used if called before the
+ * per_cpu areas are setup.
+ */
+int early_cpu_to_node(int cpu)
+{
+	if (early_per_cpu_ptr(x86_cpu_to_node_map))
+		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+
+	if (!cpu_possible(cpu)) {
+		printk(KERN_WARNING
+			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
+		dump_stack();
+		return NUMA_NO_NODE;
+	}
+	return per_cpu(x86_cpu_to_node_map, cpu);
+}
+
+void debug_cpumask_set_cpu(int cpu, int node, bool enable)
+{
+	struct cpumask *mask;
+
+	if (node == NUMA_NO_NODE) {
+		/* early_cpu_to_node() already emits a warning and trace */
+		return;
+	}
+	mask = node_to_cpumask_map[node];
+	if (!mask) {
+		pr_err("node_to_cpumask_map[%i] NULL\n", node);
+		dump_stack();
+		return;
+	}
+
+	if (enable)
+		cpumask_set_cpu(cpu, mask);
+	else
+		cpumask_clear_cpu(cpu, mask);
+
+	printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
+		enable ? "numa_add_cpu" : "numa_remove_cpu",
+		cpu, node, cpumask_pr_args(mask));
+	return;
+}
+
+# ifndef CONFIG_NUMA_EMU
+static void numa_set_cpumask(int cpu, bool enable)
+{
+	debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
+}
+
+void numa_add_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, true);
+}
+
+void numa_remove_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, false);
+}
+# endif	/* !CONFIG_NUMA_EMU */
+
+/*
+ * Returns a pointer to the bitmask of CPUs on Node 'node'.
+ */
+const struct cpumask *cpumask_of_node(int node)
+{
+	if (node >= nr_node_ids) {
+		printk(KERN_WARNING
+			"cpumask_of_node(%d): node > nr_node_ids(%d)\n",
+			node, nr_node_ids);
+		dump_stack();
+		return cpu_none_mask;
+	}
+	if (node_to_cpumask_map[node] == NULL) {
+		printk(KERN_WARNING
+			"cpumask_of_node(%d): no node_to_cpumask_map!\n",
+			node);
+		dump_stack();
+		return cpu_online_mask;
+	}
+	return node_to_cpumask_map[node];
+}
+EXPORT_SYMBOL(cpumask_of_node);
+
+#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+int memory_add_physaddr_to_nid(u64 start)
+{
+	struct numa_meminfo *mi = &numa_meminfo;
+	int nid = mi->blk[0].nid;
+	int i;
+
+	for (i = 0; i < mi->nr_blks; i++)
+		if (mi->blk[i].start <= start && mi->blk[i].end > start)
+			nid = mi->blk[i].nid;
+	return nid;
+}
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+#endif
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
new file mode 100644
index 000000000..47b6436e4
--- /dev/null
+++ b/arch/x86/mm/numa_32.c
@@ -0,0 +1,104 @@
+/*
+ * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation
+ * August 2002: added remote node KVA remap - Martin J. Bligh 
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.          
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/module.h>
+
+#include "numa_internal.h"
+
+#ifdef CONFIG_DISCONTIGMEM
+/*
+ * 4) physnode_map     - the mapping between a pfn and owning node
+ * physnode_map keeps track of the physical memory layout of a generic
+ * numa node on a 64Mb break (each element of the array will
+ * represent 64Mb of memory and will be marked by the node id.  so,
+ * if the first gig is on node 0, and the second gig is on node 1
+ * physnode_map will contain:
+ *
+ *     physnode_map[0-15] = 0;
+ *     physnode_map[16-31] = 1;
+ *     physnode_map[32- ] = -1;
+ */
+s8 physnode_map[MAX_SECTIONS] __read_mostly = { [0 ... (MAX_SECTIONS - 1)] = -1};
+EXPORT_SYMBOL(physnode_map);
+
+void memory_present(int nid, unsigned long start, unsigned long end)
+{
+	unsigned long pfn;
+
+	printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n",
+			nid, start, end);
+	printk(KERN_DEBUG "  Setting physnode_map array to node %d for pfns:\n", nid);
+	printk(KERN_DEBUG "  ");
+	start = round_down(start, PAGES_PER_SECTION);
+	end = round_up(end, PAGES_PER_SECTION);
+	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
+		physnode_map[pfn / PAGES_PER_SECTION] = nid;
+		printk(KERN_CONT "%lx ", pfn);
+	}
+	printk(KERN_CONT "\n");
+}
+
+unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
+					      unsigned long end_pfn)
+{
+	unsigned long nr_pages = end_pfn - start_pfn;
+
+	if (!nr_pages)
+		return 0;
+
+	return (nr_pages + 1) * sizeof(struct page);
+}
+#endif
+
+extern unsigned long highend_pfn, highstart_pfn;
+
+void __init initmem_init(void)
+{
+	x86_numa_init();
+
+#ifdef CONFIG_HIGHMEM
+	highstart_pfn = highend_pfn = max_pfn;
+	if (max_pfn > max_low_pfn)
+		highstart_pfn = max_low_pfn;
+	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
+	       pages_to_mb(highend_pfn - highstart_pfn));
+	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
+#else
+	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
+#endif
+	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
+			pages_to_mb(max_low_pfn));
+	printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n",
+			max_low_pfn, highstart_pfn);
+
+	printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
+			(ulong) pfn_to_kaddr(max_low_pfn));
+
+	printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
+			(ulong) pfn_to_kaddr(highstart_pfn));
+
+	setup_bootmem_allocator();
+}
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
new file mode 100644
index 000000000..9405ffc91
--- /dev/null
+++ b/arch/x86/mm/numa_64.c
@@ -0,0 +1,12 @@
+/*
+ * Generic VM initialization for x86-64 NUMA setups.
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ */
+#include <linux/bootmem.h>
+
+#include "numa_internal.h"
+
+void __init initmem_init(void)
+{
+	x86_numa_init();
+}
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
new file mode 100644
index 000000000..a8f90ce3d
--- /dev/null
+++ b/arch/x86/mm/numa_emulation.c
@@ -0,0 +1,502 @@
+/*
+ * NUMA emulation
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/topology.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h>
+#include <asm/dma.h>
+
+#include "numa_internal.h"
+
+static int emu_nid_to_phys[MAX_NUMNODES];
+static char *emu_cmdline __initdata;
+
+void __init numa_emu_cmdline(char *str)
+{
+	emu_cmdline = str;
+}
+
+static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
+{
+	int i;
+
+	for (i = 0; i < mi->nr_blks; i++)
+		if (mi->blk[i].nid == nid)
+			return i;
+	return -ENOENT;
+}
+
+static u64 __init mem_hole_size(u64 start, u64 end)
+{
+	unsigned long start_pfn = PFN_UP(start);
+	unsigned long end_pfn = PFN_DOWN(end);
+
+	if (start_pfn < end_pfn)
+		return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
+	return 0;
+}
+
+/*
+ * Sets up nid to range from @start to @end.  The return value is -errno if
+ * something went wrong, 0 otherwise.
+ */
+static int __init emu_setup_memblk(struct numa_meminfo *ei,
+				   struct numa_meminfo *pi,
+				   int nid, int phys_blk, u64 size)
+{
+	struct numa_memblk *eb = &ei->blk[ei->nr_blks];
+	struct numa_memblk *pb = &pi->blk[phys_blk];
+
+	if (ei->nr_blks >= NR_NODE_MEMBLKS) {
+		pr_err("NUMA: Too many emulated memblks, failing emulation\n");
+		return -EINVAL;
+	}
+
+	ei->nr_blks++;
+	eb->start = pb->start;
+	eb->end = pb->start + size;
+	eb->nid = nid;
+
+	if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
+		emu_nid_to_phys[nid] = nid;
+
+	pb->start += size;
+	if (pb->start >= pb->end) {
+		WARN_ON_ONCE(pb->start > pb->end);
+		numa_remove_memblk_from(phys_blk, pi);
+	}
+
+	printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
+	       nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
+	return 0;
+}
+
+/*
+ * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
+ * to max_addr.  The return value is the number of nodes allocated.
+ */
+static int __init split_nodes_interleave(struct numa_meminfo *ei,
+					 struct numa_meminfo *pi,
+					 u64 addr, u64 max_addr, int nr_nodes)
+{
+	nodemask_t physnode_mask = NODE_MASK_NONE;
+	u64 size;
+	int big;
+	int nid = 0;
+	int i, ret;
+
+	if (nr_nodes <= 0)
+		return -1;
+	if (nr_nodes > MAX_NUMNODES) {
+		pr_info("numa=fake=%d too large, reducing to %d\n",
+			nr_nodes, MAX_NUMNODES);
+		nr_nodes = MAX_NUMNODES;
+	}
+
+	/*
+	 * Calculate target node size.  x86_32 freaks on __udivdi3() so do
+	 * the division in ulong number of pages and convert back.
+	 */
+	size = max_addr - addr - mem_hole_size(addr, max_addr);
+	size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
+
+	/*
+	 * Calculate the number of big nodes that can be allocated as a result
+	 * of consolidating the remainder.
+	 */
+	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
+		FAKE_NODE_MIN_SIZE;
+
+	size &= FAKE_NODE_MIN_HASH_MASK;
+	if (!size) {
+		pr_err("Not enough memory for each node.  "
+			"NUMA emulation disabled.\n");
+		return -1;
+	}
+
+	for (i = 0; i < pi->nr_blks; i++)
+		node_set(pi->blk[i].nid, physnode_mask);
+
+	/*
+	 * Continue to fill physical nodes with fake nodes until there is no
+	 * memory left on any of them.
+	 */
+	while (nodes_weight(physnode_mask)) {
+		for_each_node_mask(i, physnode_mask) {
+			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+			u64 start, limit, end;
+			int phys_blk;
+
+			phys_blk = emu_find_memblk_by_nid(i, pi);
+			if (phys_blk < 0) {
+				node_clear(i, physnode_mask);
+				continue;
+			}
+			start = pi->blk[phys_blk].start;
+			limit = pi->blk[phys_blk].end;
+			end = start + size;
+
+			if (nid < big)
+				end += FAKE_NODE_MIN_SIZE;
+
+			/*
+			 * Continue to add memory to this fake node if its
+			 * non-reserved memory is less than the per-node size.
+			 */
+			while (end - start - mem_hole_size(start, end) < size) {
+				end += FAKE_NODE_MIN_SIZE;
+				if (end > limit) {
+					end = limit;
+					break;
+				}
+			}
+
+			/*
+			 * If there won't be at least FAKE_NODE_MIN_SIZE of
+			 * non-reserved memory in ZONE_DMA32 for the next node,
+			 * this one must extend to the boundary.
+			 */
+			if (end < dma32_end && dma32_end - end -
+			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+				end = dma32_end;
+
+			/*
+			 * If there won't be enough non-reserved memory for the
+			 * next node, this one must extend to the end of the
+			 * physical node.
+			 */
+			if (limit - end - mem_hole_size(end, limit) < size)
+				end = limit;
+
+			ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
+					       phys_blk,
+					       min(end, limit) - start);
+			if (ret < 0)
+				return ret;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Returns the end address of a node so that there is at least `size' amount of
+ * non-reserved memory or `max_addr' is reached.
+ */
+static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
+{
+	u64 end = start + size;
+
+	while (end - start - mem_hole_size(start, end) < size) {
+		end += FAKE_NODE_MIN_SIZE;
+		if (end > max_addr) {
+			end = max_addr;
+			break;
+		}
+	}
+	return end;
+}
+
+/*
+ * Sets up fake nodes of `size' interleaved over physical nodes ranging from
+ * `addr' to `max_addr'.  The return value is the number of nodes allocated.
+ */
+static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
+					      struct numa_meminfo *pi,
+					      u64 addr, u64 max_addr, u64 size)
+{
+	nodemask_t physnode_mask = NODE_MASK_NONE;
+	u64 min_size;
+	int nid = 0;
+	int i, ret;
+
+	if (!size)
+		return -1;
+	/*
+	 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
+	 * increased accordingly if the requested size is too small.  This
+	 * creates a uniform distribution of node sizes across the entire
+	 * machine (but not necessarily over physical nodes).
+	 */
+	min_size = (max_addr - addr - mem_hole_size(addr, max_addr)) / MAX_NUMNODES;
+	min_size = max(min_size, FAKE_NODE_MIN_SIZE);
+	if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
+		min_size = (min_size + FAKE_NODE_MIN_SIZE) &
+						FAKE_NODE_MIN_HASH_MASK;
+	if (size < min_size) {
+		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
+			size >> 20, min_size >> 20);
+		size = min_size;
+	}
+	size &= FAKE_NODE_MIN_HASH_MASK;
+
+	for (i = 0; i < pi->nr_blks; i++)
+		node_set(pi->blk[i].nid, physnode_mask);
+
+	/*
+	 * Fill physical nodes with fake nodes of size until there is no memory
+	 * left on any of them.
+	 */
+	while (nodes_weight(physnode_mask)) {
+		for_each_node_mask(i, physnode_mask) {
+			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+			u64 start, limit, end;
+			int phys_blk;
+
+			phys_blk = emu_find_memblk_by_nid(i, pi);
+			if (phys_blk < 0) {
+				node_clear(i, physnode_mask);
+				continue;
+			}
+			start = pi->blk[phys_blk].start;
+			limit = pi->blk[phys_blk].end;
+
+			end = find_end_of_node(start, limit, size);
+			/*
+			 * If there won't be at least FAKE_NODE_MIN_SIZE of
+			 * non-reserved memory in ZONE_DMA32 for the next node,
+			 * this one must extend to the boundary.
+			 */
+			if (end < dma32_end && dma32_end - end -
+			    mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+				end = dma32_end;
+
+			/*
+			 * If there won't be enough non-reserved memory for the
+			 * next node, this one must extend to the end of the
+			 * physical node.
+			 */
+			if (limit - end - mem_hole_size(end, limit) < size)
+				end = limit;
+
+			ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
+					       phys_blk,
+					       min(end, limit) - start);
+			if (ret < 0)
+				return ret;
+		}
+	}
+	return 0;
+}
+
+/**
+ * numa_emulation - Emulate NUMA nodes
+ * @numa_meminfo: NUMA configuration to massage
+ * @numa_dist_cnt: The size of the physical NUMA distance table
+ *
+ * Emulate NUMA nodes according to the numa=fake kernel parameter.
+ * @numa_meminfo contains the physical memory configuration and is modified
+ * to reflect the emulated configuration on success.  @numa_dist_cnt is
+ * used to determine the size of the physical distance table.
+ *
+ * On success, the following modifications are made.
+ *
+ * - @numa_meminfo is updated to reflect the emulated nodes.
+ *
+ * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
+ *   emulated nodes.
+ *
+ * - NUMA distance table is rebuilt to represent distances between emulated
+ *   nodes.  The distances are determined considering how emulated nodes
+ *   are mapped to physical nodes and match the actual distances.
+ *
+ * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
+ *   nodes.  This is used by numa_add_cpu() and numa_remove_cpu().
+ *
+ * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
+ * identity mapping and no other modification is made.
+ */
+void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
+{
+	static struct numa_meminfo ei __initdata;
+	static struct numa_meminfo pi __initdata;
+	const u64 max_addr = PFN_PHYS(max_pfn);
+	u8 *phys_dist = NULL;
+	size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
+	int max_emu_nid, dfl_phys_nid;
+	int i, j, ret;
+
+	if (!emu_cmdline)
+		goto no_emu;
+
+	memset(&ei, 0, sizeof(ei));
+	pi = *numa_meminfo;
+
+	for (i = 0; i < MAX_NUMNODES; i++)
+		emu_nid_to_phys[i] = NUMA_NO_NODE;
+
+	/*
+	 * If the numa=fake command-line contains a 'M' or 'G', it represents
+	 * the fixed node size.  Otherwise, if it is just a single number N,
+	 * split the system RAM into N fake nodes.
+	 */
+	if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
+		u64 size;
+
+		size = memparse(emu_cmdline, &emu_cmdline);
+		ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
+	} else {
+		unsigned long n;
+
+		n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
+		ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
+	}
+	if (*emu_cmdline == ':')
+		emu_cmdline++;
+
+	if (ret < 0)
+		goto no_emu;
+
+	if (numa_cleanup_meminfo(&ei) < 0) {
+		pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
+		goto no_emu;
+	}
+
+	/* copy the physical distance table */
+	if (numa_dist_cnt) {
+		u64 phys;
+
+		phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
+					      phys_size, PAGE_SIZE);
+		if (!phys) {
+			pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
+			goto no_emu;
+		}
+		memblock_reserve(phys, phys_size);
+		phys_dist = __va(phys);
+
+		for (i = 0; i < numa_dist_cnt; i++)
+			for (j = 0; j < numa_dist_cnt; j++)
+				phys_dist[i * numa_dist_cnt + j] =
+					node_distance(i, j);
+	}
+
+	/*
+	 * Determine the max emulated nid and the default phys nid to use
+	 * for unmapped nodes.
+	 */
+	max_emu_nid = 0;
+	dfl_phys_nid = NUMA_NO_NODE;
+	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
+		if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
+			max_emu_nid = i;
+			if (dfl_phys_nid == NUMA_NO_NODE)
+				dfl_phys_nid = emu_nid_to_phys[i];
+		}
+	}
+	if (dfl_phys_nid == NUMA_NO_NODE) {
+		pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
+		goto no_emu;
+	}
+
+	/* commit */
+	*numa_meminfo = ei;
+
+	/*
+	 * Transform __apicid_to_node table to use emulated nids by
+	 * reverse-mapping phys_nid.  The maps should always exist but fall
+	 * back to zero just in case.
+	 */
+	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
+		if (__apicid_to_node[i] == NUMA_NO_NODE)
+			continue;
+		for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
+			if (__apicid_to_node[i] == emu_nid_to_phys[j])
+				break;
+		__apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
+	}
+
+	/* make sure all emulated nodes are mapped to a physical node */
+	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
+			emu_nid_to_phys[i] = dfl_phys_nid;
+
+	/* transform distance table */
+	numa_reset_distance();
+	for (i = 0; i < max_emu_nid + 1; i++) {
+		for (j = 0; j < max_emu_nid + 1; j++) {
+			int physi = emu_nid_to_phys[i];
+			int physj = emu_nid_to_phys[j];
+			int dist;
+
+			if (get_option(&emu_cmdline, &dist) == 2)
+				;
+			else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
+				dist = physi == physj ?
+					LOCAL_DISTANCE : REMOTE_DISTANCE;
+			else
+				dist = phys_dist[physi * numa_dist_cnt + physj];
+
+			numa_set_distance(i, j, dist);
+		}
+	}
+
+	/* free the copied physical distance table */
+	if (phys_dist)
+		memblock_free(__pa(phys_dist), phys_size);
+	return;
+
+no_emu:
+	/* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */
+	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+		emu_nid_to_phys[i] = i;
+}
+
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+void numa_add_cpu(int cpu)
+{
+	int physnid, nid;
+
+	nid = early_cpu_to_node(cpu);
+	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
+
+	physnid = emu_nid_to_phys[nid];
+
+	/*
+	 * Map the cpu to each emulated node that is allocated on the physical
+	 * node of the cpu's apic id.
+	 */
+	for_each_online_node(nid)
+		if (emu_nid_to_phys[nid] == physnid)
+			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
+}
+
+void numa_remove_cpu(int cpu)
+{
+	int i;
+
+	for_each_online_node(i)
+		cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
+}
+#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
+static void numa_set_cpumask(int cpu, bool enable)
+{
+	int nid, physnid;
+
+	nid = early_cpu_to_node(cpu);
+	if (nid == NUMA_NO_NODE) {
+		/* early_cpu_to_node() already emits a warning and trace */
+		return;
+	}
+
+	physnid = emu_nid_to_phys[nid];
+
+	for_each_online_node(nid) {
+		if (emu_nid_to_phys[nid] != physnid)
+			continue;
+
+		debug_cpumask_set_cpu(cpu, nid, enable);
+	}
+}
+
+void numa_add_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, true);
+}
+
+void numa_remove_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, false);
+}
+#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
new file mode 100644
index 000000000..ad86ec91e
--- /dev/null
+++ b/arch/x86/mm/numa_internal.h
@@ -0,0 +1,33 @@
+#ifndef __X86_MM_NUMA_INTERNAL_H
+#define __X86_MM_NUMA_INTERNAL_H
+
+#include <linux/types.h>
+#include <asm/numa.h>
+
+struct numa_memblk {
+	u64			start;
+	u64			end;
+	int			nid;
+};
+
+struct numa_meminfo {
+	int			nr_blks;
+	struct numa_memblk	blk[NR_NODE_MEMBLKS];
+};
+
+void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
+void __init numa_reset_distance(void);
+
+void __init x86_numa_init(void);
+
+#ifdef CONFIG_NUMA_EMU
+void __init numa_emulation(struct numa_meminfo *numa_meminfo,
+			   int numa_dist_cnt);
+#else
+static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
+				  int numa_dist_cnt)
+{ }
+#endif
+
+#endif	/* __X86_MM_NUMA_INTERNAL_H */
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
new file mode 100644
index 000000000..6629f397b
--- /dev/null
+++ b/arch/x86/mm/pageattr-test.c
@@ -0,0 +1,259 @@
+/*
+ * self test for change_page_attr.
+ *
+ * Clears the a test pte bit on random pages in the direct mapping,
+ * then reverts and compares page tables forwards and afterwards.
+ */
+#include <linux/bootmem.h>
+#include <linux/kthread.h>
+#include <linux/random.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+
+#include <asm/cacheflush.h>
+#include <asm/pgtable.h>
+#include <asm/kdebug.h>
+
+/*
+ * Only print the results of the first pass:
+ */
+static __read_mostly int print = 1;
+
+enum {
+	NTEST			= 400,
+#ifdef CONFIG_X86_64
+	LPS			= (1 << PMD_SHIFT),
+#elif defined(CONFIG_X86_PAE)
+	LPS			= (1 << PMD_SHIFT),
+#else
+	LPS			= (1 << 22),
+#endif
+	GPS			= (1<<30)
+};
+
+#define PAGE_CPA_TEST	__pgprot(_PAGE_CPA_TEST)
+
+static int pte_testbit(pte_t pte)
+{
+	return pte_flags(pte) & _PAGE_SOFTW1;
+}
+
+struct split_state {
+	long lpg, gpg, spg, exec;
+	long min_exec, max_exec;
+};
+
+static int print_split(struct split_state *s)
+{
+	long i, expected, missed = 0;
+	int err = 0;
+
+	s->lpg = s->gpg = s->spg = s->exec = 0;
+	s->min_exec = ~0UL;
+	s->max_exec = 0;
+	for (i = 0; i < max_pfn_mapped; ) {
+		unsigned long addr = (unsigned long)__va(i << PAGE_SHIFT);
+		unsigned int level;
+		pte_t *pte;
+
+		pte = lookup_address(addr, &level);
+		if (!pte) {
+			missed++;
+			i++;
+			continue;
+		}
+
+		if (level == PG_LEVEL_1G && sizeof(long) == 8) {
+			s->gpg++;
+			i += GPS/PAGE_SIZE;
+		} else if (level == PG_LEVEL_2M) {
+			if ((pte_val(*pte) & _PAGE_PRESENT) && !(pte_val(*pte) & _PAGE_PSE)) {
+				printk(KERN_ERR
+					"%lx level %d but not PSE %Lx\n",
+					addr, level, (u64)pte_val(*pte));
+				err = 1;
+			}
+			s->lpg++;
+			i += LPS/PAGE_SIZE;
+		} else {
+			s->spg++;
+			i++;
+		}
+		if (!(pte_val(*pte) & _PAGE_NX)) {
+			s->exec++;
+			if (addr < s->min_exec)
+				s->min_exec = addr;
+			if (addr > s->max_exec)
+				s->max_exec = addr;
+		}
+	}
+	if (print) {
+		printk(KERN_INFO
+			" 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n",
+			s->spg, s->lpg, s->gpg, s->exec,
+			s->min_exec != ~0UL ? s->min_exec : 0,
+			s->max_exec, missed);
+	}
+
+	expected = (s->gpg*GPS + s->lpg*LPS)/PAGE_SIZE + s->spg + missed;
+	if (expected != i) {
+		printk(KERN_ERR "CPA max_pfn_mapped %lu but expected %lu\n",
+			max_pfn_mapped, expected);
+		return 1;
+	}
+	return err;
+}
+
+static unsigned long addr[NTEST];
+static unsigned int len[NTEST];
+
+/* Change the global bit on random pages in the direct mapping */
+static int pageattr_test(void)
+{
+	struct split_state sa, sb, sc;
+	unsigned long *bm;
+	pte_t *pte, pte0;
+	int failed = 0;
+	unsigned int level;
+	int i, k;
+	int err;
+	unsigned long test_addr;
+
+	if (print)
+		printk(KERN_INFO "CPA self-test:\n");
+
+	bm = vzalloc((max_pfn_mapped + 7) / 8);
+	if (!bm) {
+		printk(KERN_ERR "CPA Cannot vmalloc bitmap\n");
+		return -ENOMEM;
+	}
+
+	failed += print_split(&sa);
+
+	for (i = 0; i < NTEST; i++) {
+		unsigned long pfn = prandom_u32() % max_pfn_mapped;
+
+		addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT);
+		len[i] = prandom_u32() % 100;
+		len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1);
+
+		if (len[i] == 0)
+			len[i] = 1;
+
+		pte = NULL;
+		pte0 = pfn_pte(0, __pgprot(0)); /* shut gcc up */
+
+		for (k = 0; k < len[i]; k++) {
+			pte = lookup_address(addr[i] + k*PAGE_SIZE, &level);
+			if (!pte || pgprot_val(pte_pgprot(*pte)) == 0 ||
+			    !(pte_val(*pte) & _PAGE_PRESENT)) {
+				addr[i] = 0;
+				break;
+			}
+			if (k == 0) {
+				pte0 = *pte;
+			} else {
+				if (pgprot_val(pte_pgprot(*pte)) !=
+					pgprot_val(pte_pgprot(pte0))) {
+					len[i] = k;
+					break;
+				}
+			}
+			if (test_bit(pfn + k, bm)) {
+				len[i] = k;
+				break;
+			}
+			__set_bit(pfn + k, bm);
+		}
+		if (!addr[i] || !pte || !k) {
+			addr[i] = 0;
+			continue;
+		}
+
+		test_addr = addr[i];
+		err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0);
+		if (err < 0) {
+			printk(KERN_ERR "CPA %d failed %d\n", i, err);
+			failed++;
+		}
+
+		pte = lookup_address(addr[i], &level);
+		if (!pte || !pte_testbit(*pte) || pte_huge(*pte)) {
+			printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i],
+				pte ? (u64)pte_val(*pte) : 0ULL);
+			failed++;
+		}
+		if (level != PG_LEVEL_4K) {
+			printk(KERN_ERR "CPA %lx: unexpected level %d\n",
+				addr[i], level);
+			failed++;
+		}
+
+	}
+	vfree(bm);
+
+	failed += print_split(&sb);
+
+	for (i = 0; i < NTEST; i++) {
+		if (!addr[i])
+			continue;
+		pte = lookup_address(addr[i], &level);
+		if (!pte) {
+			printk(KERN_ERR "CPA lookup of %lx failed\n", addr[i]);
+			failed++;
+			continue;
+		}
+		test_addr = addr[i];
+		err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0);
+		if (err < 0) {
+			printk(KERN_ERR "CPA reverting failed: %d\n", err);
+			failed++;
+		}
+		pte = lookup_address(addr[i], &level);
+		if (!pte || pte_testbit(*pte)) {
+			printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n",
+				addr[i], pte ? (u64)pte_val(*pte) : 0ULL);
+			failed++;
+		}
+
+	}
+
+	failed += print_split(&sc);
+
+	if (failed) {
+		WARN(1, KERN_ERR "NOT PASSED. Please report.\n");
+		return -EINVAL;
+	} else {
+		if (print)
+			printk(KERN_INFO "ok.\n");
+	}
+
+	return 0;
+}
+
+static int do_pageattr_test(void *__unused)
+{
+	while (!kthread_should_stop()) {
+		schedule_timeout_interruptible(HZ*30);
+		if (pageattr_test() < 0)
+			break;
+		if (print)
+			print--;
+	}
+	return 0;
+}
+
+static int start_pageattr_test(void)
+{
+	struct task_struct *p;
+
+	p = kthread_create(do_pageattr_test, NULL, "pageattr-test");
+	if (!IS_ERR(p))
+		wake_up_process(p);
+	else
+		WARN_ON(1);
+
+	return 0;
+}
+
+module_init(start_pageattr_test);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
new file mode 100644
index 000000000..89af288ec
--- /dev/null
+++ b/arch/x86/mm/pageattr.c
@@ -0,0 +1,1924 @@
+/*
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ * Thanks to Ben LaHaise for precious feedback.
+ */
+#include <linux/highmem.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/pfn.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <linux/pci.h>
+
+#include <asm/e820.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+#include <asm/uaccess.h>
+#include <asm/pgalloc.h>
+#include <asm/proto.h>
+#include <asm/pat.h>
+
+/*
+ * The current flushing context - we pass it instead of 5 arguments:
+ */
+struct cpa_data {
+	unsigned long	*vaddr;
+	pgd_t		*pgd;
+	pgprot_t	mask_set;
+	pgprot_t	mask_clr;
+	int		numpages;
+	int		flags;
+	unsigned long	pfn;
+	unsigned	force_split : 1;
+	int		curpage;
+	struct page	**pages;
+};
+
+/*
+ * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
+ * using cpa_lock. So that we don't allow any other cpu, with stale large tlb
+ * entries change the page attribute in parallel to some other cpu
+ * splitting a large page entry along with changing the attribute.
+ */
+static DEFINE_SPINLOCK(cpa_lock);
+
+#define CPA_FLUSHTLB 1
+#define CPA_ARRAY 2
+#define CPA_PAGES_ARRAY 4
+
+#ifdef CONFIG_PROC_FS
+static unsigned long direct_pages_count[PG_LEVEL_NUM];
+
+void update_page_count(int level, unsigned long pages)
+{
+	/* Protect against CPA */
+	spin_lock(&pgd_lock);
+	direct_pages_count[level] += pages;
+	spin_unlock(&pgd_lock);
+}
+
+static void split_page_count(int level)
+{
+	direct_pages_count[level]--;
+	direct_pages_count[level - 1] += PTRS_PER_PTE;
+}
+
+void arch_report_meminfo(struct seq_file *m)
+{
+	seq_printf(m, "DirectMap4k:    %8lu kB\n",
+			direct_pages_count[PG_LEVEL_4K] << 2);
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+	seq_printf(m, "DirectMap2M:    %8lu kB\n",
+			direct_pages_count[PG_LEVEL_2M] << 11);
+#else
+	seq_printf(m, "DirectMap4M:    %8lu kB\n",
+			direct_pages_count[PG_LEVEL_2M] << 12);
+#endif
+	if (direct_gbpages)
+		seq_printf(m, "DirectMap1G:    %8lu kB\n",
+			direct_pages_count[PG_LEVEL_1G] << 20);
+}
+#else
+static inline void split_page_count(int level) { }
+#endif
+
+#ifdef CONFIG_X86_64
+
+static inline unsigned long highmap_start_pfn(void)
+{
+	return __pa_symbol(_text) >> PAGE_SHIFT;
+}
+
+static inline unsigned long highmap_end_pfn(void)
+{
+	return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
+}
+
+#endif
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+# define debug_pagealloc 1
+#else
+# define debug_pagealloc 0
+#endif
+
+static inline int
+within(unsigned long addr, unsigned long start, unsigned long end)
+{
+	return addr >= start && addr < end;
+}
+
+/*
+ * Flushing functions
+ */
+
+/**
+ * clflush_cache_range - flush a cache range with clflush
+ * @vaddr:	virtual start address
+ * @size:	number of bytes to flush
+ *
+ * clflushopt is an unordered instruction which needs fencing with mfence or
+ * sfence to avoid ordering issues.
+ */
+void clflush_cache_range(void *vaddr, unsigned int size)
+{
+	void *vend = vaddr + size - 1;
+
+	mb();
+
+	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
+		clflushopt(vaddr);
+	/*
+	 * Flush any possible final partial cacheline:
+	 */
+	clflushopt(vend);
+
+	mb();
+}
+EXPORT_SYMBOL_GPL(clflush_cache_range);
+
+static void __cpa_flush_all(void *arg)
+{
+	unsigned long cache = (unsigned long)arg;
+
+	/*
+	 * Flush all to work around Errata in early athlons regarding
+	 * large page flushing.
+	 */
+	__flush_tlb_all();
+
+	if (cache && boot_cpu_data.x86 >= 4)
+		wbinvd();
+}
+
+static void cpa_flush_all(unsigned long cache)
+{
+	BUG_ON(irqs_disabled());
+
+	on_each_cpu(__cpa_flush_all, (void *) cache, 1);
+}
+
+static void __cpa_flush_range(void *arg)
+{
+	/*
+	 * We could optimize that further and do individual per page
+	 * tlb invalidates for a low number of pages. Caveat: we must
+	 * flush the high aliases on 64bit as well.
+	 */
+	__flush_tlb_all();
+}
+
+static void cpa_flush_range(unsigned long start, int numpages, int cache)
+{
+	unsigned int i, level;
+	unsigned long addr;
+
+	BUG_ON(irqs_disabled());
+	WARN_ON(PAGE_ALIGN(start) != start);
+
+	on_each_cpu(__cpa_flush_range, NULL, 1);
+
+	if (!cache)
+		return;
+
+	/*
+	 * We only need to flush on one CPU,
+	 * clflush is a MESI-coherent instruction that
+	 * will cause all other CPUs to flush the same
+	 * cachelines:
+	 */
+	for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
+		pte_t *pte = lookup_address(addr, &level);
+
+		/*
+		 * Only flush present addresses:
+		 */
+		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
+			clflush_cache_range((void *) addr, PAGE_SIZE);
+	}
+}
+
+static void cpa_flush_array(unsigned long *start, int numpages, int cache,
+			    int in_flags, struct page **pages)
+{
+	unsigned int i, level;
+	unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
+
+	BUG_ON(irqs_disabled());
+
+	on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
+
+	if (!cache || do_wbinvd)
+		return;
+
+	/*
+	 * We only need to flush on one CPU,
+	 * clflush is a MESI-coherent instruction that
+	 * will cause all other CPUs to flush the same
+	 * cachelines:
+	 */
+	for (i = 0; i < numpages; i++) {
+		unsigned long addr;
+		pte_t *pte;
+
+		if (in_flags & CPA_PAGES_ARRAY)
+			addr = (unsigned long)page_address(pages[i]);
+		else
+			addr = start[i];
+
+		pte = lookup_address(addr, &level);
+
+		/*
+		 * Only flush present addresses:
+		 */
+		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
+			clflush_cache_range((void *)addr, PAGE_SIZE);
+	}
+}
+
+/*
+ * Certain areas of memory on x86 require very specific protection flags,
+ * for example the BIOS area or kernel text. Callers don't always get this
+ * right (again, ioremap() on BIOS memory is not uncommon) so this function
+ * checks and fixes these known static required protection bits.
+ */
+static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
+				   unsigned long pfn)
+{
+	pgprot_t forbidden = __pgprot(0);
+
+	/*
+	 * The BIOS area between 640k and 1Mb needs to be executable for
+	 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
+	 */
+#ifdef CONFIG_PCI_BIOS
+	if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
+		pgprot_val(forbidden) |= _PAGE_NX;
+#endif
+
+	/*
+	 * The kernel text needs to be executable for obvious reasons
+	 * Does not cover __inittext since that is gone later on. On
+	 * 64bit we do not enforce !NX on the low mapping
+	 */
+	if (within(address, (unsigned long)_text, (unsigned long)_etext))
+		pgprot_val(forbidden) |= _PAGE_NX;
+
+	/*
+	 * The .rodata section needs to be read-only. Using the pfn
+	 * catches all aliases.
+	 */
+	if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
+		   __pa_symbol(__end_rodata) >> PAGE_SHIFT))
+		pgprot_val(forbidden) |= _PAGE_RW;
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+	/*
+	 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
+	 * kernel text mappings for the large page aligned text, rodata sections
+	 * will be always read-only. For the kernel identity mappings covering
+	 * the holes caused by this alignment can be anything that user asks.
+	 *
+	 * This will preserve the large page mappings for kernel text/data
+	 * at no extra cost.
+	 */
+	if (kernel_set_to_readonly &&
+	    within(address, (unsigned long)_text,
+		   (unsigned long)__end_rodata_hpage_align)) {
+		unsigned int level;
+
+		/*
+		 * Don't enforce the !RW mapping for the kernel text mapping,
+		 * if the current mapping is already using small page mapping.
+		 * No need to work hard to preserve large page mappings in this
+		 * case.
+		 *
+		 * This also fixes the Linux Xen paravirt guest boot failure
+		 * (because of unexpected read-only mappings for kernel identity
+		 * mappings). In this paravirt guest case, the kernel text
+		 * mapping and the kernel identity mapping share the same
+		 * page-table pages. Thus we can't really use different
+		 * protections for the kernel text and identity mappings. Also,
+		 * these shared mappings are made of small page mappings.
+		 * Thus this don't enforce !RW mapping for small page kernel
+		 * text mapping logic will help Linux Xen parvirt guest boot
+		 * as well.
+		 */
+		if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
+			pgprot_val(forbidden) |= _PAGE_RW;
+	}
+#endif
+
+	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
+
+	return prot;
+}
+
+/*
+ * Lookup the page table entry for a virtual address in a specific pgd.
+ * Return a pointer to the entry and the level of the mapping.
+ */
+pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
+			     unsigned int *level)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+
+	*level = PG_LEVEL_NONE;
+
+	if (pgd_none(*pgd))
+		return NULL;
+
+	pud = pud_offset(pgd, address);
+	if (pud_none(*pud))
+		return NULL;
+
+	*level = PG_LEVEL_1G;
+	if (pud_large(*pud) || !pud_present(*pud))
+		return (pte_t *)pud;
+
+	pmd = pmd_offset(pud, address);
+	if (pmd_none(*pmd))
+		return NULL;
+
+	*level = PG_LEVEL_2M;
+	if (pmd_large(*pmd) || !pmd_present(*pmd))
+		return (pte_t *)pmd;
+
+	*level = PG_LEVEL_4K;
+
+	return pte_offset_kernel(pmd, address);
+}
+
+/*
+ * Lookup the page table entry for a virtual address. Return a pointer
+ * to the entry and the level of the mapping.
+ *
+ * Note: We return pud and pmd either when the entry is marked large
+ * or when the present bit is not set. Otherwise we would return a
+ * pointer to a nonexisting mapping.
+ */
+pte_t *lookup_address(unsigned long address, unsigned int *level)
+{
+        return lookup_address_in_pgd(pgd_offset_k(address), address, level);
+}
+EXPORT_SYMBOL_GPL(lookup_address);
+
+static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
+				  unsigned int *level)
+{
+        if (cpa->pgd)
+		return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
+					       address, level);
+
+        return lookup_address(address, level);
+}
+
+/*
+ * Lookup the PMD entry for a virtual address. Return a pointer to the entry
+ * or NULL if not present.
+ */
+pmd_t *lookup_pmd_address(unsigned long address)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+
+	pgd = pgd_offset_k(address);
+	if (pgd_none(*pgd))
+		return NULL;
+
+	pud = pud_offset(pgd, address);
+	if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
+		return NULL;
+
+	return pmd_offset(pud, address);
+}
+
+/*
+ * This is necessary because __pa() does not work on some
+ * kinds of memory, like vmalloc() or the alloc_remap()
+ * areas on 32-bit NUMA systems.  The percpu areas can
+ * end up in this kind of memory, for instance.
+ *
+ * This could be optimized, but it is only intended to be
+ * used at inititalization time, and keeping it
+ * unoptimized should increase the testing coverage for
+ * the more obscure platforms.
+ */
+phys_addr_t slow_virt_to_phys(void *__virt_addr)
+{
+	unsigned long virt_addr = (unsigned long)__virt_addr;
+	phys_addr_t phys_addr;
+	unsigned long offset;
+	enum pg_level level;
+	unsigned long psize;
+	unsigned long pmask;
+	pte_t *pte;
+
+	pte = lookup_address(virt_addr, &level);
+	BUG_ON(!pte);
+	psize = page_level_size(level);
+	pmask = page_level_mask(level);
+	offset = virt_addr & ~pmask;
+	phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
+	return (phys_addr | offset);
+}
+EXPORT_SYMBOL_GPL(slow_virt_to_phys);
+
+/*
+ * Set the new pmd in all the pgds we know about:
+ */
+static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
+{
+	/* change init_mm */
+	set_pte_atomic(kpte, pte);
+#ifdef CONFIG_X86_32
+	if (!SHARED_KERNEL_PMD) {
+		struct page *page;
+
+		list_for_each_entry(page, &pgd_list, lru) {
+			pgd_t *pgd;
+			pud_t *pud;
+			pmd_t *pmd;
+
+			pgd = (pgd_t *)page_address(page) + pgd_index(address);
+			pud = pud_offset(pgd, address);
+			pmd = pmd_offset(pud, address);
+			set_pte_atomic((pte_t *)pmd, pte);
+		}
+	}
+#endif
+}
+
+static int
+try_preserve_large_page(pte_t *kpte, unsigned long address,
+			struct cpa_data *cpa)
+{
+	unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;
+	pte_t new_pte, old_pte, *tmp;
+	pgprot_t old_prot, new_prot, req_prot;
+	int i, do_split = 1;
+	enum pg_level level;
+
+	if (cpa->force_split)
+		return 1;
+
+	spin_lock(&pgd_lock);
+	/*
+	 * Check for races, another CPU might have split this page
+	 * up already:
+	 */
+	tmp = _lookup_address_cpa(cpa, address, &level);
+	if (tmp != kpte)
+		goto out_unlock;
+
+	switch (level) {
+	case PG_LEVEL_2M:
+#ifdef CONFIG_X86_64
+	case PG_LEVEL_1G:
+#endif
+		psize = page_level_size(level);
+		pmask = page_level_mask(level);
+		break;
+	default:
+		do_split = -EINVAL;
+		goto out_unlock;
+	}
+
+	/*
+	 * Calculate the number of pages, which fit into this large
+	 * page starting at address:
+	 */
+	nextpage_addr = (address + psize) & pmask;
+	numpages = (nextpage_addr - address) >> PAGE_SHIFT;
+	if (numpages < cpa->numpages)
+		cpa->numpages = numpages;
+
+	/*
+	 * We are safe now. Check whether the new pgprot is the same:
+	 * Convert protection attributes to 4k-format, as cpa->mask* are set
+	 * up accordingly.
+	 */
+	old_pte = *kpte;
+	old_prot = req_prot = pgprot_large_2_4k(pte_pgprot(old_pte));
+
+	pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
+	pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
+
+	/*
+	 * req_prot is in format of 4k pages. It must be converted to large
+	 * page format: the caching mode includes the PAT bit located at
+	 * different bit positions in the two formats.
+	 */
+	req_prot = pgprot_4k_2_large(req_prot);
+
+	/*
+	 * Set the PSE and GLOBAL flags only if the PRESENT flag is
+	 * set otherwise pmd_present/pmd_huge will return true even on
+	 * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL
+	 * for the ancient hardware that doesn't support it.
+	 */
+	if (pgprot_val(req_prot) & _PAGE_PRESENT)
+		pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL;
+	else
+		pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL);
+
+	req_prot = canon_pgprot(req_prot);
+
+	/*
+	 * old_pte points to the large page base address. So we need
+	 * to add the offset of the virtual address:
+	 */
+	pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
+	cpa->pfn = pfn;
+
+	new_prot = static_protections(req_prot, address, pfn);
+
+	/*
+	 * We need to check the full range, whether
+	 * static_protection() requires a different pgprot for one of
+	 * the pages in the range we try to preserve:
+	 */
+	addr = address & pmask;
+	pfn = pte_pfn(old_pte);
+	for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
+		pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
+
+		if (pgprot_val(chk_prot) != pgprot_val(new_prot))
+			goto out_unlock;
+	}
+
+	/*
+	 * If there are no changes, return. maxpages has been updated
+	 * above:
+	 */
+	if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
+		do_split = 0;
+		goto out_unlock;
+	}
+
+	/*
+	 * We need to change the attributes. Check, whether we can
+	 * change the large page in one go. We request a split, when
+	 * the address is not aligned and the number of pages is
+	 * smaller than the number of pages in the large page. Note
+	 * that we limited the number of possible pages already to
+	 * the number of pages in the large page.
+	 */
+	if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
+		/*
+		 * The address is aligned and the number of pages
+		 * covers the full page.
+		 */
+		new_pte = pfn_pte(pte_pfn(old_pte), new_prot);
+		__set_pmd_pte(kpte, address, new_pte);
+		cpa->flags |= CPA_FLUSHTLB;
+		do_split = 0;
+	}
+
+out_unlock:
+	spin_unlock(&pgd_lock);
+
+	return do_split;
+}
+
+static int
+__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
+		   struct page *base)
+{
+	pte_t *pbase = (pte_t *)page_address(base);
+	unsigned long pfn, pfninc = 1;
+	unsigned int i, level;
+	pte_t *tmp;
+	pgprot_t ref_prot;
+
+	spin_lock(&pgd_lock);
+	/*
+	 * Check for races, another CPU might have split this page
+	 * up for us already:
+	 */
+	tmp = _lookup_address_cpa(cpa, address, &level);
+	if (tmp != kpte) {
+		spin_unlock(&pgd_lock);
+		return 1;
+	}
+
+	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
+	ref_prot = pte_pgprot(pte_clrhuge(*kpte));
+
+	/* promote PAT bit to correct position */
+	if (level == PG_LEVEL_2M)
+		ref_prot = pgprot_large_2_4k(ref_prot);
+
+#ifdef CONFIG_X86_64
+	if (level == PG_LEVEL_1G) {
+		pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
+		/*
+		 * Set the PSE flags only if the PRESENT flag is set
+		 * otherwise pmd_present/pmd_huge will return true
+		 * even on a non present pmd.
+		 */
+		if (pgprot_val(ref_prot) & _PAGE_PRESENT)
+			pgprot_val(ref_prot) |= _PAGE_PSE;
+		else
+			pgprot_val(ref_prot) &= ~_PAGE_PSE;
+	}
+#endif
+
+	/*
+	 * Set the GLOBAL flags only if the PRESENT flag is set
+	 * otherwise pmd/pte_present will return true even on a non
+	 * present pmd/pte. The canon_pgprot will clear _PAGE_GLOBAL
+	 * for the ancient hardware that doesn't support it.
+	 */
+	if (pgprot_val(ref_prot) & _PAGE_PRESENT)
+		pgprot_val(ref_prot) |= _PAGE_GLOBAL;
+	else
+		pgprot_val(ref_prot) &= ~_PAGE_GLOBAL;
+
+	/*
+	 * Get the target pfn from the original entry:
+	 */
+	pfn = pte_pfn(*kpte);
+	for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
+		set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot)));
+
+	if (pfn_range_is_mapped(PFN_DOWN(__pa(address)),
+				PFN_DOWN(__pa(address)) + 1))
+		split_page_count(level);
+
+	/*
+	 * Install the new, split up pagetable.
+	 *
+	 * We use the standard kernel pagetable protections for the new
+	 * pagetable protections, the actual ptes set above control the
+	 * primary protection behavior:
+	 */
+	__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
+
+	/*
+	 * Intel Atom errata AAH41 workaround.
+	 *
+	 * The real fix should be in hw or in a microcode update, but
+	 * we also probabilistically try to reduce the window of having
+	 * a large TLB mixed with 4K TLBs while instruction fetches are
+	 * going on.
+	 */
+	__flush_tlb_all();
+	spin_unlock(&pgd_lock);
+
+	return 0;
+}
+
+static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
+			    unsigned long address)
+{
+	struct page *base;
+
+	if (!debug_pagealloc)
+		spin_unlock(&cpa_lock);
+	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+	if (!debug_pagealloc)
+		spin_lock(&cpa_lock);
+	if (!base)
+		return -ENOMEM;
+
+	if (__split_large_page(cpa, kpte, address, base))
+		__free_page(base);
+
+	return 0;
+}
+
+static bool try_to_free_pte_page(pte_t *pte)
+{
+	int i;
+
+	for (i = 0; i < PTRS_PER_PTE; i++)
+		if (!pte_none(pte[i]))
+			return false;
+
+	free_page((unsigned long)pte);
+	return true;
+}
+
+static bool try_to_free_pmd_page(pmd_t *pmd)
+{
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD; i++)
+		if (!pmd_none(pmd[i]))
+			return false;
+
+	free_page((unsigned long)pmd);
+	return true;
+}
+
+static bool try_to_free_pud_page(pud_t *pud)
+{
+	int i;
+
+	for (i = 0; i < PTRS_PER_PUD; i++)
+		if (!pud_none(pud[i]))
+			return false;
+
+	free_page((unsigned long)pud);
+	return true;
+}
+
+static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
+{
+	pte_t *pte = pte_offset_kernel(pmd, start);
+
+	while (start < end) {
+		set_pte(pte, __pte(0));
+
+		start += PAGE_SIZE;
+		pte++;
+	}
+
+	if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
+		pmd_clear(pmd);
+		return true;
+	}
+	return false;
+}
+
+static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
+			      unsigned long start, unsigned long end)
+{
+	if (unmap_pte_range(pmd, start, end))
+		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+			pud_clear(pud);
+}
+
+static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+{
+	pmd_t *pmd = pmd_offset(pud, start);
+
+	/*
+	 * Not on a 2MB page boundary?
+	 */
+	if (start & (PMD_SIZE - 1)) {
+		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
+		unsigned long pre_end = min_t(unsigned long, end, next_page);
+
+		__unmap_pmd_range(pud, pmd, start, pre_end);
+
+		start = pre_end;
+		pmd++;
+	}
+
+	/*
+	 * Try to unmap in 2M chunks.
+	 */
+	while (end - start >= PMD_SIZE) {
+		if (pmd_large(*pmd))
+			pmd_clear(pmd);
+		else
+			__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
+
+		start += PMD_SIZE;
+		pmd++;
+	}
+
+	/*
+	 * 4K leftovers?
+	 */
+	if (start < end)
+		return __unmap_pmd_range(pud, pmd, start, end);
+
+	/*
+	 * Try again to free the PMD page if haven't succeeded above.
+	 */
+	if (!pud_none(*pud))
+		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+			pud_clear(pud);
+}
+
+static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+{
+	pud_t *pud = pud_offset(pgd, start);
+
+	/*
+	 * Not on a GB page boundary?
+	 */
+	if (start & (PUD_SIZE - 1)) {
+		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
+		unsigned long pre_end	= min_t(unsigned long, end, next_page);
+
+		unmap_pmd_range(pud, start, pre_end);
+
+		start = pre_end;
+		pud++;
+	}
+
+	/*
+	 * Try to unmap in 1G chunks?
+	 */
+	while (end - start >= PUD_SIZE) {
+
+		if (pud_large(*pud))
+			pud_clear(pud);
+		else
+			unmap_pmd_range(pud, start, start + PUD_SIZE);
+
+		start += PUD_SIZE;
+		pud++;
+	}
+
+	/*
+	 * 2M leftovers?
+	 */
+	if (start < end)
+		unmap_pmd_range(pud, start, end);
+
+	/*
+	 * No need to try to free the PUD page because we'll free it in
+	 * populate_pgd's error path
+	 */
+}
+
+static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
+{
+	pgd_t *pgd_entry = root + pgd_index(addr);
+
+	unmap_pud_range(pgd_entry, addr, end);
+
+	if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry)))
+		pgd_clear(pgd_entry);
+}
+
+static int alloc_pte_page(pmd_t *pmd)
+{
+	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+	if (!pte)
+		return -1;
+
+	set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
+	return 0;
+}
+
+static int alloc_pmd_page(pud_t *pud)
+{
+	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+	if (!pmd)
+		return -1;
+
+	set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+	return 0;
+}
+
+static void populate_pte(struct cpa_data *cpa,
+			 unsigned long start, unsigned long end,
+			 unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
+{
+	pte_t *pte;
+
+	pte = pte_offset_kernel(pmd, start);
+
+	while (num_pages-- && start < end) {
+
+		/* deal with the NX bit */
+		if (!(pgprot_val(pgprot) & _PAGE_NX))
+			cpa->pfn &= ~_PAGE_NX;
+
+		set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot));
+
+		start	 += PAGE_SIZE;
+		cpa->pfn += PAGE_SIZE;
+		pte++;
+	}
+}
+
+static int populate_pmd(struct cpa_data *cpa,
+			unsigned long start, unsigned long end,
+			unsigned num_pages, pud_t *pud, pgprot_t pgprot)
+{
+	unsigned int cur_pages = 0;
+	pmd_t *pmd;
+	pgprot_t pmd_pgprot;
+
+	/*
+	 * Not on a 2M boundary?
+	 */
+	if (start & (PMD_SIZE - 1)) {
+		unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
+		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
+
+		pre_end   = min_t(unsigned long, pre_end, next_page);
+		cur_pages = (pre_end - start) >> PAGE_SHIFT;
+		cur_pages = min_t(unsigned int, num_pages, cur_pages);
+
+		/*
+		 * Need a PTE page?
+		 */
+		pmd = pmd_offset(pud, start);
+		if (pmd_none(*pmd))
+			if (alloc_pte_page(pmd))
+				return -1;
+
+		populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
+
+		start = pre_end;
+	}
+
+	/*
+	 * We mapped them all?
+	 */
+	if (num_pages == cur_pages)
+		return cur_pages;
+
+	pmd_pgprot = pgprot_4k_2_large(pgprot);
+
+	while (end - start >= PMD_SIZE) {
+
+		/*
+		 * We cannot use a 1G page so allocate a PMD page if needed.
+		 */
+		if (pud_none(*pud))
+			if (alloc_pmd_page(pud))
+				return -1;
+
+		pmd = pmd_offset(pud, start);
+
+		set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE |
+				   massage_pgprot(pmd_pgprot)));
+
+		start	  += PMD_SIZE;
+		cpa->pfn  += PMD_SIZE;
+		cur_pages += PMD_SIZE >> PAGE_SHIFT;
+	}
+
+	/*
+	 * Map trailing 4K pages.
+	 */
+	if (start < end) {
+		pmd = pmd_offset(pud, start);
+		if (pmd_none(*pmd))
+			if (alloc_pte_page(pmd))
+				return -1;
+
+		populate_pte(cpa, start, end, num_pages - cur_pages,
+			     pmd, pgprot);
+	}
+	return num_pages;
+}
+
+static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
+			pgprot_t pgprot)
+{
+	pud_t *pud;
+	unsigned long end;
+	int cur_pages = 0;
+	pgprot_t pud_pgprot;
+
+	end = start + (cpa->numpages << PAGE_SHIFT);
+
+	/*
+	 * Not on a Gb page boundary? => map everything up to it with
+	 * smaller pages.
+	 */
+	if (start & (PUD_SIZE - 1)) {
+		unsigned long pre_end;
+		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
+
+		pre_end   = min_t(unsigned long, end, next_page);
+		cur_pages = (pre_end - start) >> PAGE_SHIFT;
+		cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
+
+		pud = pud_offset(pgd, start);
+
+		/*
+		 * Need a PMD page?
+		 */
+		if (pud_none(*pud))
+			if (alloc_pmd_page(pud))
+				return -1;
+
+		cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
+					 pud, pgprot);
+		if (cur_pages < 0)
+			return cur_pages;
+
+		start = pre_end;
+	}
+
+	/* We mapped them all? */
+	if (cpa->numpages == cur_pages)
+		return cur_pages;
+
+	pud = pud_offset(pgd, start);
+	pud_pgprot = pgprot_4k_2_large(pgprot);
+
+	/*
+	 * Map everything starting from the Gb boundary, possibly with 1G pages
+	 */
+	while (end - start >= PUD_SIZE) {
+		set_pud(pud, __pud(cpa->pfn | _PAGE_PSE |
+				   massage_pgprot(pud_pgprot)));
+
+		start	  += PUD_SIZE;
+		cpa->pfn  += PUD_SIZE;
+		cur_pages += PUD_SIZE >> PAGE_SHIFT;
+		pud++;
+	}
+
+	/* Map trailing leftover */
+	if (start < end) {
+		int tmp;
+
+		pud = pud_offset(pgd, start);
+		if (pud_none(*pud))
+			if (alloc_pmd_page(pud))
+				return -1;
+
+		tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
+				   pud, pgprot);
+		if (tmp < 0)
+			return cur_pages;
+
+		cur_pages += tmp;
+	}
+	return cur_pages;
+}
+
+/*
+ * Restrictions for kernel page table do not necessarily apply when mapping in
+ * an alternate PGD.
+ */
+static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
+{
+	pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
+	pud_t *pud = NULL;	/* shut up gcc */
+	pgd_t *pgd_entry;
+	int ret;
+
+	pgd_entry = cpa->pgd + pgd_index(addr);
+
+	/*
+	 * Allocate a PUD page and hand it down for mapping.
+	 */
+	if (pgd_none(*pgd_entry)) {
+		pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+		if (!pud)
+			return -1;
+
+		set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
+	}
+
+	pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
+	pgprot_val(pgprot) |=  pgprot_val(cpa->mask_set);
+
+	ret = populate_pud(cpa, addr, pgd_entry, pgprot);
+	if (ret < 0) {
+		unmap_pgd_range(cpa->pgd, addr,
+				addr + (cpa->numpages << PAGE_SHIFT));
+		return ret;
+	}
+
+	cpa->numpages = ret;
+	return 0;
+}
+
+static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
+			       int primary)
+{
+	if (cpa->pgd)
+		return populate_pgd(cpa, vaddr);
+
+	/*
+	 * Ignore all non primary paths.
+	 */
+	if (!primary)
+		return 0;
+
+	/*
+	 * Ignore the NULL PTE for kernel identity mapping, as it is expected
+	 * to have holes.
+	 * Also set numpages to '1' indicating that we processed cpa req for
+	 * one virtual address page and its pfn. TBD: numpages can be set based
+	 * on the initial value and the level returned by lookup_address().
+	 */
+	if (within(vaddr, PAGE_OFFSET,
+		   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
+		cpa->numpages = 1;
+		cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
+		return 0;
+	} else {
+		WARN(1, KERN_WARNING "CPA: called for zero pte. "
+			"vaddr = %lx cpa->vaddr = %lx\n", vaddr,
+			*cpa->vaddr);
+
+		return -EFAULT;
+	}
+}
+
+static int __change_page_attr(struct cpa_data *cpa, int primary)
+{
+	unsigned long address;
+	int do_split, err;
+	unsigned int level;
+	pte_t *kpte, old_pte;
+
+	if (cpa->flags & CPA_PAGES_ARRAY) {
+		struct page *page = cpa->pages[cpa->curpage];
+		if (unlikely(PageHighMem(page)))
+			return 0;
+		address = (unsigned long)page_address(page);
+	} else if (cpa->flags & CPA_ARRAY)
+		address = cpa->vaddr[cpa->curpage];
+	else
+		address = *cpa->vaddr;
+repeat:
+	kpte = _lookup_address_cpa(cpa, address, &level);
+	if (!kpte)
+		return __cpa_process_fault(cpa, address, primary);
+
+	old_pte = *kpte;
+	if (!pte_val(old_pte))
+		return __cpa_process_fault(cpa, address, primary);
+
+	if (level == PG_LEVEL_4K) {
+		pte_t new_pte;
+		pgprot_t new_prot = pte_pgprot(old_pte);
+		unsigned long pfn = pte_pfn(old_pte);
+
+		pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
+		pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
+
+		new_prot = static_protections(new_prot, address, pfn);
+
+		/*
+		 * Set the GLOBAL flags only if the PRESENT flag is
+		 * set otherwise pte_present will return true even on
+		 * a non present pte. The canon_pgprot will clear
+		 * _PAGE_GLOBAL for the ancient hardware that doesn't
+		 * support it.
+		 */
+		if (pgprot_val(new_prot) & _PAGE_PRESENT)
+			pgprot_val(new_prot) |= _PAGE_GLOBAL;
+		else
+			pgprot_val(new_prot) &= ~_PAGE_GLOBAL;
+
+		/*
+		 * We need to keep the pfn from the existing PTE,
+		 * after all we're only going to change it's attributes
+		 * not the memory it points to
+		 */
+		new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
+		cpa->pfn = pfn;
+		/*
+		 * Do we really change anything ?
+		 */
+		if (pte_val(old_pte) != pte_val(new_pte)) {
+			set_pte_atomic(kpte, new_pte);
+			cpa->flags |= CPA_FLUSHTLB;
+		}
+		cpa->numpages = 1;
+		return 0;
+	}
+
+	/*
+	 * Check, whether we can keep the large page intact
+	 * and just change the pte:
+	 */
+	do_split = try_preserve_large_page(kpte, address, cpa);
+	/*
+	 * When the range fits into the existing large page,
+	 * return. cp->numpages and cpa->tlbflush have been updated in
+	 * try_large_page:
+	 */
+	if (do_split <= 0)
+		return do_split;
+
+	/*
+	 * We have to split the large page:
+	 */
+	err = split_large_page(cpa, kpte, address);
+	if (!err) {
+		/*
+	 	 * Do a global flush tlb after splitting the large page
+	 	 * and before we do the actual change page attribute in the PTE.
+	 	 *
+	 	 * With out this, we violate the TLB application note, that says
+	 	 * "The TLBs may contain both ordinary and large-page
+		 *  translations for a 4-KByte range of linear addresses. This
+		 *  may occur if software modifies the paging structures so that
+		 *  the page size used for the address range changes. If the two
+		 *  translations differ with respect to page frame or attributes
+		 *  (e.g., permissions), processor behavior is undefined and may
+		 *  be implementation-specific."
+	 	 *
+	 	 * We do this global tlb flush inside the cpa_lock, so that we
+		 * don't allow any other cpu, with stale tlb entries change the
+		 * page attribute in parallel, that also falls into the
+		 * just split large page entry.
+	 	 */
+		flush_tlb_all();
+		goto repeat;
+	}
+
+	return err;
+}
+
+static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
+
+static int cpa_process_alias(struct cpa_data *cpa)
+{
+	struct cpa_data alias_cpa;
+	unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
+	unsigned long vaddr;
+	int ret;
+
+	if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
+		return 0;
+
+	/*
+	 * No need to redo, when the primary call touched the direct
+	 * mapping already:
+	 */
+	if (cpa->flags & CPA_PAGES_ARRAY) {
+		struct page *page = cpa->pages[cpa->curpage];
+		if (unlikely(PageHighMem(page)))
+			return 0;
+		vaddr = (unsigned long)page_address(page);
+	} else if (cpa->flags & CPA_ARRAY)
+		vaddr = cpa->vaddr[cpa->curpage];
+	else
+		vaddr = *cpa->vaddr;
+
+	if (!(within(vaddr, PAGE_OFFSET,
+		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
+
+		alias_cpa = *cpa;
+		alias_cpa.vaddr = &laddr;
+		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
+
+		ret = __change_page_attr_set_clr(&alias_cpa, 0);
+		if (ret)
+			return ret;
+	}
+
+#ifdef CONFIG_X86_64
+	/*
+	 * If the primary call didn't touch the high mapping already
+	 * and the physical address is inside the kernel map, we need
+	 * to touch the high mapped kernel as well:
+	 */
+	if (!within(vaddr, (unsigned long)_text, _brk_end) &&
+	    within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) {
+		unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
+					       __START_KERNEL_map - phys_base;
+		alias_cpa = *cpa;
+		alias_cpa.vaddr = &temp_cpa_vaddr;
+		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
+
+		/*
+		 * The high mapping range is imprecise, so ignore the
+		 * return value.
+		 */
+		__change_page_attr_set_clr(&alias_cpa, 0);
+	}
+#endif
+
+	return 0;
+}
+
+static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
+{
+	int ret, numpages = cpa->numpages;
+
+	while (numpages) {
+		/*
+		 * Store the remaining nr of pages for the large page
+		 * preservation check.
+		 */
+		cpa->numpages = numpages;
+		/* for array changes, we can't use large page */
+		if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
+			cpa->numpages = 1;
+
+		if (!debug_pagealloc)
+			spin_lock(&cpa_lock);
+		ret = __change_page_attr(cpa, checkalias);
+		if (!debug_pagealloc)
+			spin_unlock(&cpa_lock);
+		if (ret)
+			return ret;
+
+		if (checkalias) {
+			ret = cpa_process_alias(cpa);
+			if (ret)
+				return ret;
+		}
+
+		/*
+		 * Adjust the number of pages with the result of the
+		 * CPA operation. Either a large page has been
+		 * preserved or a single page update happened.
+		 */
+		BUG_ON(cpa->numpages > numpages);
+		numpages -= cpa->numpages;
+		if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
+			cpa->curpage++;
+		else
+			*cpa->vaddr += cpa->numpages * PAGE_SIZE;
+
+	}
+	return 0;
+}
+
+static int change_page_attr_set_clr(unsigned long *addr, int numpages,
+				    pgprot_t mask_set, pgprot_t mask_clr,
+				    int force_split, int in_flag,
+				    struct page **pages)
+{
+	struct cpa_data cpa;
+	int ret, cache, checkalias;
+	unsigned long baddr = 0;
+
+	memset(&cpa, 0, sizeof(cpa));
+
+	/*
+	 * Check, if we are requested to change a not supported
+	 * feature:
+	 */
+	mask_set = canon_pgprot(mask_set);
+	mask_clr = canon_pgprot(mask_clr);
+	if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
+		return 0;
+
+	/* Ensure we are PAGE_SIZE aligned */
+	if (in_flag & CPA_ARRAY) {
+		int i;
+		for (i = 0; i < numpages; i++) {
+			if (addr[i] & ~PAGE_MASK) {
+				addr[i] &= PAGE_MASK;
+				WARN_ON_ONCE(1);
+			}
+		}
+	} else if (!(in_flag & CPA_PAGES_ARRAY)) {
+		/*
+		 * in_flag of CPA_PAGES_ARRAY implies it is aligned.
+		 * No need to cehck in that case
+		 */
+		if (*addr & ~PAGE_MASK) {
+			*addr &= PAGE_MASK;
+			/*
+			 * People should not be passing in unaligned addresses:
+			 */
+			WARN_ON_ONCE(1);
+		}
+		/*
+		 * Save address for cache flush. *addr is modified in the call
+		 * to __change_page_attr_set_clr() below.
+		 */
+		baddr = *addr;
+	}
+
+	/* Must avoid aliasing mappings in the highmem code */
+	kmap_flush_unused();
+
+	vm_unmap_aliases();
+
+	cpa.vaddr = addr;
+	cpa.pages = pages;
+	cpa.numpages = numpages;
+	cpa.mask_set = mask_set;
+	cpa.mask_clr = mask_clr;
+	cpa.flags = 0;
+	cpa.curpage = 0;
+	cpa.force_split = force_split;
+
+	if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
+		cpa.flags |= in_flag;
+
+	/* No alias checking for _NX bit modifications */
+	checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
+
+	ret = __change_page_attr_set_clr(&cpa, checkalias);
+
+	/*
+	 * Check whether we really changed something:
+	 */
+	if (!(cpa.flags & CPA_FLUSHTLB))
+		goto out;
+
+	/*
+	 * No need to flush, when we did not set any of the caching
+	 * attributes:
+	 */
+	cache = !!pgprot2cachemode(mask_set);
+
+	/*
+	 * On success we use CLFLUSH, when the CPU supports it to
+	 * avoid the WBINVD. If the CPU does not support it and in the
+	 * error case we fall back to cpa_flush_all (which uses
+	 * WBINVD):
+	 */
+	if (!ret && cpu_has_clflush) {
+		if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
+			cpa_flush_array(addr, numpages, cache,
+					cpa.flags, pages);
+		} else
+			cpa_flush_range(baddr, numpages, cache);
+	} else
+		cpa_flush_all(cache);
+
+out:
+	return ret;
+}
+
+static inline int change_page_attr_set(unsigned long *addr, int numpages,
+				       pgprot_t mask, int array)
+{
+	return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
+		(array ? CPA_ARRAY : 0), NULL);
+}
+
+static inline int change_page_attr_clear(unsigned long *addr, int numpages,
+					 pgprot_t mask, int array)
+{
+	return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
+		(array ? CPA_ARRAY : 0), NULL);
+}
+
+static inline int cpa_set_pages_array(struct page **pages, int numpages,
+				       pgprot_t mask)
+{
+	return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
+		CPA_PAGES_ARRAY, pages);
+}
+
+static inline int cpa_clear_pages_array(struct page **pages, int numpages,
+					 pgprot_t mask)
+{
+	return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
+		CPA_PAGES_ARRAY, pages);
+}
+
+int _set_memory_uc(unsigned long addr, int numpages)
+{
+	/*
+	 * for now UC MINUS. see comments in ioremap_nocache()
+	 */
+	return change_page_attr_set(&addr, numpages,
+				    cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
+				    0);
+}
+
+int set_memory_uc(unsigned long addr, int numpages)
+{
+	int ret;
+
+	/*
+	 * for now UC MINUS. see comments in ioremap_nocache()
+	 */
+	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+			      _PAGE_CACHE_MODE_UC_MINUS, NULL);
+	if (ret)
+		goto out_err;
+
+	ret = _set_memory_uc(addr, numpages);
+	if (ret)
+		goto out_free;
+
+	return 0;
+
+out_free:
+	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+out_err:
+	return ret;
+}
+EXPORT_SYMBOL(set_memory_uc);
+
+static int _set_memory_array(unsigned long *addr, int addrinarray,
+		enum page_cache_mode new_type)
+{
+	int i, j;
+	int ret;
+
+	/*
+	 * for now UC MINUS. see comments in ioremap_nocache()
+	 */
+	for (i = 0; i < addrinarray; i++) {
+		ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
+					new_type, NULL);
+		if (ret)
+			goto out_free;
+	}
+
+	ret = change_page_attr_set(addr, addrinarray,
+				   cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
+				   1);
+
+	if (!ret && new_type == _PAGE_CACHE_MODE_WC)
+		ret = change_page_attr_set_clr(addr, addrinarray,
+					       cachemode2pgprot(
+						_PAGE_CACHE_MODE_WC),
+					       __pgprot(_PAGE_CACHE_MASK),
+					       0, CPA_ARRAY, NULL);
+	if (ret)
+		goto out_free;
+
+	return 0;
+
+out_free:
+	for (j = 0; j < i; j++)
+		free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE);
+
+	return ret;
+}
+
+int set_memory_array_uc(unsigned long *addr, int addrinarray)
+{
+	return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
+}
+EXPORT_SYMBOL(set_memory_array_uc);
+
+int set_memory_array_wc(unsigned long *addr, int addrinarray)
+{
+	return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC);
+}
+EXPORT_SYMBOL(set_memory_array_wc);
+
+int _set_memory_wc(unsigned long addr, int numpages)
+{
+	int ret;
+	unsigned long addr_copy = addr;
+
+	ret = change_page_attr_set(&addr, numpages,
+				   cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
+				   0);
+	if (!ret) {
+		ret = change_page_attr_set_clr(&addr_copy, numpages,
+					       cachemode2pgprot(
+						_PAGE_CACHE_MODE_WC),
+					       __pgprot(_PAGE_CACHE_MASK),
+					       0, 0, NULL);
+	}
+	return ret;
+}
+
+int set_memory_wc(unsigned long addr, int numpages)
+{
+	int ret;
+
+	if (!pat_enabled)
+		return set_memory_uc(addr, numpages);
+
+	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+		_PAGE_CACHE_MODE_WC, NULL);
+	if (ret)
+		goto out_err;
+
+	ret = _set_memory_wc(addr, numpages);
+	if (ret)
+		goto out_free;
+
+	return 0;
+
+out_free:
+	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+out_err:
+	return ret;
+}
+EXPORT_SYMBOL(set_memory_wc);
+
+int _set_memory_wb(unsigned long addr, int numpages)
+{
+	/* WB cache mode is hard wired to all cache attribute bits being 0 */
+	return change_page_attr_clear(&addr, numpages,
+				      __pgprot(_PAGE_CACHE_MASK), 0);
+}
+
+int set_memory_wb(unsigned long addr, int numpages)
+{
+	int ret;
+
+	ret = _set_memory_wb(addr, numpages);
+	if (ret)
+		return ret;
+
+	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+	return 0;
+}
+EXPORT_SYMBOL(set_memory_wb);
+
+int set_memory_array_wb(unsigned long *addr, int addrinarray)
+{
+	int i;
+	int ret;
+
+	/* WB cache mode is hard wired to all cache attribute bits being 0 */
+	ret = change_page_attr_clear(addr, addrinarray,
+				      __pgprot(_PAGE_CACHE_MASK), 1);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < addrinarray; i++)
+		free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE);
+
+	return 0;
+}
+EXPORT_SYMBOL(set_memory_array_wb);
+
+int set_memory_x(unsigned long addr, int numpages)
+{
+	if (!(__supported_pte_mask & _PAGE_NX))
+		return 0;
+
+	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
+}
+EXPORT_SYMBOL(set_memory_x);
+
+int set_memory_nx(unsigned long addr, int numpages)
+{
+	if (!(__supported_pte_mask & _PAGE_NX))
+		return 0;
+
+	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
+}
+EXPORT_SYMBOL(set_memory_nx);
+
+int set_memory_ro(unsigned long addr, int numpages)
+{
+	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
+}
+
+int set_memory_rw(unsigned long addr, int numpages)
+{
+	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
+}
+
+int set_memory_np(unsigned long addr, int numpages)
+{
+	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
+}
+
+int set_memory_4k(unsigned long addr, int numpages)
+{
+	return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
+					__pgprot(0), 1, 0, NULL);
+}
+
+int set_pages_uc(struct page *page, int numpages)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+
+	return set_memory_uc(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_uc);
+
+static int _set_pages_array(struct page **pages, int addrinarray,
+		enum page_cache_mode new_type)
+{
+	unsigned long start;
+	unsigned long end;
+	int i;
+	int free_idx;
+	int ret;
+
+	for (i = 0; i < addrinarray; i++) {
+		if (PageHighMem(pages[i]))
+			continue;
+		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
+		end = start + PAGE_SIZE;
+		if (reserve_memtype(start, end, new_type, NULL))
+			goto err_out;
+	}
+
+	ret = cpa_set_pages_array(pages, addrinarray,
+			cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS));
+	if (!ret && new_type == _PAGE_CACHE_MODE_WC)
+		ret = change_page_attr_set_clr(NULL, addrinarray,
+					       cachemode2pgprot(
+						_PAGE_CACHE_MODE_WC),
+					       __pgprot(_PAGE_CACHE_MASK),
+					       0, CPA_PAGES_ARRAY, pages);
+	if (ret)
+		goto err_out;
+	return 0; /* Success */
+err_out:
+	free_idx = i;
+	for (i = 0; i < free_idx; i++) {
+		if (PageHighMem(pages[i]))
+			continue;
+		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
+		end = start + PAGE_SIZE;
+		free_memtype(start, end);
+	}
+	return -EINVAL;
+}
+
+int set_pages_array_uc(struct page **pages, int addrinarray)
+{
+	return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
+}
+EXPORT_SYMBOL(set_pages_array_uc);
+
+int set_pages_array_wc(struct page **pages, int addrinarray)
+{
+	return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC);
+}
+EXPORT_SYMBOL(set_pages_array_wc);
+
+int set_pages_wb(struct page *page, int numpages)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+
+	return set_memory_wb(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_wb);
+
+int set_pages_array_wb(struct page **pages, int addrinarray)
+{
+	int retval;
+	unsigned long start;
+	unsigned long end;
+	int i;
+
+	/* WB cache mode is hard wired to all cache attribute bits being 0 */
+	retval = cpa_clear_pages_array(pages, addrinarray,
+			__pgprot(_PAGE_CACHE_MASK));
+	if (retval)
+		return retval;
+
+	for (i = 0; i < addrinarray; i++) {
+		if (PageHighMem(pages[i]))
+			continue;
+		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
+		end = start + PAGE_SIZE;
+		free_memtype(start, end);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(set_pages_array_wb);
+
+int set_pages_x(struct page *page, int numpages)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+
+	return set_memory_x(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_x);
+
+int set_pages_nx(struct page *page, int numpages)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+
+	return set_memory_nx(addr, numpages);
+}
+EXPORT_SYMBOL(set_pages_nx);
+
+int set_pages_ro(struct page *page, int numpages)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+
+	return set_memory_ro(addr, numpages);
+}
+
+int set_pages_rw(struct page *page, int numpages)
+{
+	unsigned long addr = (unsigned long)page_address(page);
+
+	return set_memory_rw(addr, numpages);
+}
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+
+static int __set_pages_p(struct page *page, int numpages)
+{
+	unsigned long tempaddr = (unsigned long) page_address(page);
+	struct cpa_data cpa = { .vaddr = &tempaddr,
+				.pgd = NULL,
+				.numpages = numpages,
+				.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+				.mask_clr = __pgprot(0),
+				.flags = 0};
+
+	/*
+	 * No alias checking needed for setting present flag. otherwise,
+	 * we may need to break large pages for 64-bit kernel text
+	 * mappings (this adds to complexity if we want to do this from
+	 * atomic context especially). Let's keep it simple!
+	 */
+	return __change_page_attr_set_clr(&cpa, 0);
+}
+
+static int __set_pages_np(struct page *page, int numpages)
+{
+	unsigned long tempaddr = (unsigned long) page_address(page);
+	struct cpa_data cpa = { .vaddr = &tempaddr,
+				.pgd = NULL,
+				.numpages = numpages,
+				.mask_set = __pgprot(0),
+				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+				.flags = 0};
+
+	/*
+	 * No alias checking needed for setting not present flag. otherwise,
+	 * we may need to break large pages for 64-bit kernel text
+	 * mappings (this adds to complexity if we want to do this from
+	 * atomic context especially). Let's keep it simple!
+	 */
+	return __change_page_attr_set_clr(&cpa, 0);
+}
+
+void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+	if (PageHighMem(page))
+		return;
+	if (!enable) {
+		debug_check_no_locks_freed(page_address(page),
+					   numpages * PAGE_SIZE);
+	}
+
+	/*
+	 * The return value is ignored as the calls cannot fail.
+	 * Large pages for identity mappings are not used at boot time
+	 * and hence no memory allocations during large page split.
+	 */
+	if (enable)
+		__set_pages_p(page, numpages);
+	else
+		__set_pages_np(page, numpages);
+
+	/*
+	 * We should perform an IPI and flush all tlbs,
+	 * but that can deadlock->flush only current cpu:
+	 */
+	__flush_tlb_all();
+
+	arch_flush_lazy_mmu_mode();
+}
+
+#ifdef CONFIG_HIBERNATION
+
+bool kernel_page_present(struct page *page)
+{
+	unsigned int level;
+	pte_t *pte;
+
+	if (PageHighMem(page))
+		return false;
+
+	pte = lookup_address((unsigned long)page_address(page), &level);
+	return (pte_val(*pte) & _PAGE_PRESENT);
+}
+
+#endif /* CONFIG_HIBERNATION */
+
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+
+int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
+			    unsigned numpages, unsigned long page_flags)
+{
+	int retval = -EINVAL;
+
+	struct cpa_data cpa = {
+		.vaddr = &address,
+		.pfn = pfn,
+		.pgd = pgd,
+		.numpages = numpages,
+		.mask_set = __pgprot(0),
+		.mask_clr = __pgprot(0),
+		.flags = 0,
+	};
+
+	if (!(__supported_pte_mask & _PAGE_NX))
+		goto out;
+
+	if (!(page_flags & _PAGE_NX))
+		cpa.mask_clr = __pgprot(_PAGE_NX);
+
+	cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
+
+	retval = __change_page_attr_set_clr(&cpa, 0);
+	__flush_tlb_all();
+
+out:
+	return retval;
+}
+
+void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address,
+			       unsigned numpages)
+{
+	unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT));
+}
+
+/*
+ * The testcases use internal knowledge of the implementation that shouldn't
+ * be exposed to the rest of the kernel. Include these directly here.
+ */
+#ifdef CONFIG_CPA_DEBUG
+#include "pageattr-test.c"
+#endif
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
new file mode 100644
index 000000000..35af6771a
--- /dev/null
+++ b/arch/x86/mm/pat.c
@@ -0,0 +1,1008 @@
+/*
+ * Handle caching attributes in page tables (PAT)
+ *
+ * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ *          Suresh B Siddha <suresh.b.siddha@intel.com>
+ *
+ * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
+ */
+
+#include <linux/seq_file.h>
+#include <linux/bootmem.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/rbtree.h>
+
+#include <asm/cacheflush.h>
+#include <asm/processor.h>
+#include <asm/tlbflush.h>
+#include <asm/x86_init.h>
+#include <asm/pgtable.h>
+#include <asm/fcntl.h>
+#include <asm/e820.h>
+#include <asm/mtrr.h>
+#include <asm/page.h>
+#include <asm/msr.h>
+#include <asm/pat.h>
+#include <asm/io.h>
+
+#include "pat_internal.h"
+#include "mm_internal.h"
+
+#ifdef CONFIG_X86_PAT
+int __read_mostly pat_enabled = 1;
+
+static inline void pat_disable(const char *reason)
+{
+	pat_enabled = 0;
+	printk(KERN_INFO "%s\n", reason);
+}
+
+static int __init nopat(char *str)
+{
+	pat_disable("PAT support disabled.");
+	return 0;
+}
+early_param("nopat", nopat);
+#else
+static inline void pat_disable(const char *reason)
+{
+	(void)reason;
+}
+#endif
+
+
+int pat_debug_enable;
+
+static int __init pat_debug_setup(char *str)
+{
+	pat_debug_enable = 1;
+	return 0;
+}
+__setup("debugpat", pat_debug_setup);
+
+static u64 __read_mostly boot_pat_state;
+
+#ifdef CONFIG_X86_PAT
+/*
+ * X86 PAT uses page flags WC and Uncached together to keep track of
+ * memory type of pages that have backing page struct. X86 PAT supports 3
+ * different memory types, _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC and
+ * _PAGE_CACHE_MODE_UC_MINUS and fourth state where page's memory type has not
+ * been changed from its default (value of -1 used to denote this).
+ * Note we do not support _PAGE_CACHE_MODE_UC here.
+ */
+
+#define _PGMT_DEFAULT		0
+#define _PGMT_WC		(1UL << PG_arch_1)
+#define _PGMT_UC_MINUS		(1UL << PG_uncached)
+#define _PGMT_WB		(1UL << PG_uncached | 1UL << PG_arch_1)
+#define _PGMT_MASK		(1UL << PG_uncached | 1UL << PG_arch_1)
+#define _PGMT_CLEAR_MASK	(~_PGMT_MASK)
+
+static inline enum page_cache_mode get_page_memtype(struct page *pg)
+{
+	unsigned long pg_flags = pg->flags & _PGMT_MASK;
+
+	if (pg_flags == _PGMT_DEFAULT)
+		return -1;
+	else if (pg_flags == _PGMT_WC)
+		return _PAGE_CACHE_MODE_WC;
+	else if (pg_flags == _PGMT_UC_MINUS)
+		return _PAGE_CACHE_MODE_UC_MINUS;
+	else
+		return _PAGE_CACHE_MODE_WB;
+}
+
+static inline void set_page_memtype(struct page *pg,
+				    enum page_cache_mode memtype)
+{
+	unsigned long memtype_flags;
+	unsigned long old_flags;
+	unsigned long new_flags;
+
+	switch (memtype) {
+	case _PAGE_CACHE_MODE_WC:
+		memtype_flags = _PGMT_WC;
+		break;
+	case _PAGE_CACHE_MODE_UC_MINUS:
+		memtype_flags = _PGMT_UC_MINUS;
+		break;
+	case _PAGE_CACHE_MODE_WB:
+		memtype_flags = _PGMT_WB;
+		break;
+	default:
+		memtype_flags = _PGMT_DEFAULT;
+		break;
+	}
+
+	do {
+		old_flags = pg->flags;
+		new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
+	} while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags);
+}
+#else
+static inline enum page_cache_mode get_page_memtype(struct page *pg)
+{
+	return -1;
+}
+static inline void set_page_memtype(struct page *pg,
+				    enum page_cache_mode memtype)
+{
+}
+#endif
+
+enum {
+	PAT_UC = 0,		/* uncached */
+	PAT_WC = 1,		/* Write combining */
+	PAT_WT = 4,		/* Write Through */
+	PAT_WP = 5,		/* Write Protected */
+	PAT_WB = 6,		/* Write Back (default) */
+	PAT_UC_MINUS = 7,	/* UC, but can be overriden by MTRR */
+};
+
+#define CM(c) (_PAGE_CACHE_MODE_ ## c)
+
+static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
+{
+	enum page_cache_mode cache;
+	char *cache_mode;
+
+	switch (pat_val) {
+	case PAT_UC:       cache = CM(UC);       cache_mode = "UC  "; break;
+	case PAT_WC:       cache = CM(WC);       cache_mode = "WC  "; break;
+	case PAT_WT:       cache = CM(WT);       cache_mode = "WT  "; break;
+	case PAT_WP:       cache = CM(WP);       cache_mode = "WP  "; break;
+	case PAT_WB:       cache = CM(WB);       cache_mode = "WB  "; break;
+	case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
+	default:           cache = CM(WB);       cache_mode = "WB  "; break;
+	}
+
+	memcpy(msg, cache_mode, 4);
+
+	return cache;
+}
+
+#undef CM
+
+/*
+ * Update the cache mode to pgprot translation tables according to PAT
+ * configuration.
+ * Using lower indices is preferred, so we start with highest index.
+ */
+void pat_init_cache_modes(void)
+{
+	int i;
+	enum page_cache_mode cache;
+	char pat_msg[33];
+	u64 pat;
+
+	rdmsrl(MSR_IA32_CR_PAT, pat);
+	pat_msg[32] = 0;
+	for (i = 7; i >= 0; i--) {
+		cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
+					   pat_msg + 4 * i);
+		update_cache_mode_entry(i, cache);
+	}
+	pr_info("PAT configuration [0-7]: %s\n", pat_msg);
+}
+
+#define PAT(x, y)	((u64)PAT_ ## y << ((x)*8))
+
+void pat_init(void)
+{
+	u64 pat;
+	bool boot_cpu = !boot_pat_state;
+
+	if (!pat_enabled)
+		return;
+
+	if (!cpu_has_pat) {
+		if (!boot_pat_state) {
+			pat_disable("PAT not supported by CPU.");
+			return;
+		} else {
+			/*
+			 * If this happens we are on a secondary CPU, but
+			 * switched to PAT on the boot CPU. We have no way to
+			 * undo PAT.
+			 */
+			printk(KERN_ERR "PAT enabled, "
+			       "but not supported by secondary CPU\n");
+			BUG();
+		}
+	}
+
+	/* Set PWT to Write-Combining. All other bits stay the same */
+	/*
+	 * PTE encoding used in Linux:
+	 *      PAT
+	 *      |PCD
+	 *      ||PWT
+	 *      |||
+	 *      000 WB		_PAGE_CACHE_WB
+	 *      001 WC		_PAGE_CACHE_WC
+	 *      010 UC-		_PAGE_CACHE_UC_MINUS
+	 *      011 UC		_PAGE_CACHE_UC
+	 * PAT bit unused
+	 */
+	pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
+	      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
+
+	/* Boot CPU check */
+	if (!boot_pat_state) {
+		rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
+		if (!boot_pat_state) {
+			pat_disable("PAT read returns always zero, disabled.");
+			return;
+		}
+	}
+
+	wrmsrl(MSR_IA32_CR_PAT, pat);
+
+	if (boot_cpu)
+		pat_init_cache_modes();
+}
+
+#undef PAT
+
+static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype accesses */
+
+/*
+ * Does intersection of PAT memory type and MTRR memory type and returns
+ * the resulting memory type as PAT understands it.
+ * (Type in pat and mtrr will not have same value)
+ * The intersection is based on "Effective Memory Type" tables in IA-32
+ * SDM vol 3a
+ */
+static unsigned long pat_x_mtrr_type(u64 start, u64 end,
+				     enum page_cache_mode req_type)
+{
+	/*
+	 * Look for MTRR hint to get the effective type in case where PAT
+	 * request is for WB.
+	 */
+	if (req_type == _PAGE_CACHE_MODE_WB) {
+		u8 mtrr_type;
+
+		mtrr_type = mtrr_type_lookup(start, end);
+		if (mtrr_type != MTRR_TYPE_WRBACK)
+			return _PAGE_CACHE_MODE_UC_MINUS;
+
+		return _PAGE_CACHE_MODE_WB;
+	}
+
+	return req_type;
+}
+
+struct pagerange_state {
+	unsigned long		cur_pfn;
+	int			ram;
+	int			not_ram;
+};
+
+static int
+pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg)
+{
+	struct pagerange_state *state = arg;
+
+	state->not_ram	|= initial_pfn > state->cur_pfn;
+	state->ram	|= total_nr_pages > 0;
+	state->cur_pfn	 = initial_pfn + total_nr_pages;
+
+	return state->ram && state->not_ram;
+}
+
+static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
+{
+	int ret = 0;
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	struct pagerange_state state = {start_pfn, 0, 0};
+
+	/*
+	 * For legacy reasons, physical address range in the legacy ISA
+	 * region is tracked as non-RAM. This will allow users of
+	 * /dev/mem to map portions of legacy ISA region, even when
+	 * some of those portions are listed(or not even listed) with
+	 * different e820 types(RAM/reserved/..)
+	 */
+	if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT)
+		start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT;
+
+	if (start_pfn < end_pfn) {
+		ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
+				&state, pagerange_is_ram_callback);
+	}
+
+	return (ret > 0) ? -1 : (state.ram ? 1 : 0);
+}
+
+/*
+ * For RAM pages, we use page flags to mark the pages with appropriate type.
+ * Here we do two pass:
+ * - Find the memtype of all the pages in the range, look for any conflicts
+ * - In case of no conflicts, set the new memtype for pages in the range
+ */
+static int reserve_ram_pages_type(u64 start, u64 end,
+				  enum page_cache_mode req_type,
+				  enum page_cache_mode *new_type)
+{
+	struct page *page;
+	u64 pfn;
+
+	if (req_type == _PAGE_CACHE_MODE_UC) {
+		/* We do not support strong UC */
+		WARN_ON_ONCE(1);
+		req_type = _PAGE_CACHE_MODE_UC_MINUS;
+	}
+
+	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
+		enum page_cache_mode type;
+
+		page = pfn_to_page(pfn);
+		type = get_page_memtype(page);
+		if (type != -1) {
+			pr_info("reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
+				start, end - 1, type, req_type);
+			if (new_type)
+				*new_type = type;
+
+			return -EBUSY;
+		}
+	}
+
+	if (new_type)
+		*new_type = req_type;
+
+	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
+		page = pfn_to_page(pfn);
+		set_page_memtype(page, req_type);
+	}
+	return 0;
+}
+
+static int free_ram_pages_type(u64 start, u64 end)
+{
+	struct page *page;
+	u64 pfn;
+
+	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
+		page = pfn_to_page(pfn);
+		set_page_memtype(page, -1);
+	}
+	return 0;
+}
+
+/*
+ * req_type typically has one of the:
+ * - _PAGE_CACHE_MODE_WB
+ * - _PAGE_CACHE_MODE_WC
+ * - _PAGE_CACHE_MODE_UC_MINUS
+ * - _PAGE_CACHE_MODE_UC
+ *
+ * If new_type is NULL, function will return an error if it cannot reserve the
+ * region with req_type. If new_type is non-NULL, function will return
+ * available type in new_type in case of no error. In case of any error
+ * it will return a negative return value.
+ */
+int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
+		    enum page_cache_mode *new_type)
+{
+	struct memtype *new;
+	enum page_cache_mode actual_type;
+	int is_range_ram;
+	int err = 0;
+
+	BUG_ON(start >= end); /* end is exclusive */
+
+	if (!pat_enabled) {
+		/* This is identical to page table setting without PAT */
+		if (new_type) {
+			if (req_type == _PAGE_CACHE_MODE_WC)
+				*new_type = _PAGE_CACHE_MODE_UC_MINUS;
+			else
+				*new_type = req_type;
+		}
+		return 0;
+	}
+
+	/* Low ISA region is always mapped WB in page table. No need to track */
+	if (x86_platform.is_untracked_pat_range(start, end)) {
+		if (new_type)
+			*new_type = _PAGE_CACHE_MODE_WB;
+		return 0;
+	}
+
+	/*
+	 * Call mtrr_lookup to get the type hint. This is an
+	 * optimization for /dev/mem mmap'ers into WB memory (BIOS
+	 * tools and ACPI tools). Use WB request for WB memory and use
+	 * UC_MINUS otherwise.
+	 */
+	actual_type = pat_x_mtrr_type(start, end, req_type);
+
+	if (new_type)
+		*new_type = actual_type;
+
+	is_range_ram = pat_pagerange_is_ram(start, end);
+	if (is_range_ram == 1) {
+
+		err = reserve_ram_pages_type(start, end, req_type, new_type);
+
+		return err;
+	} else if (is_range_ram < 0) {
+		return -EINVAL;
+	}
+
+	new  = kzalloc(sizeof(struct memtype), GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	new->start	= start;
+	new->end	= end;
+	new->type	= actual_type;
+
+	spin_lock(&memtype_lock);
+
+	err = rbt_memtype_check_insert(new, new_type);
+	if (err) {
+		printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
+		       start, end - 1,
+		       cattr_name(new->type), cattr_name(req_type));
+		kfree(new);
+		spin_unlock(&memtype_lock);
+
+		return err;
+	}
+
+	spin_unlock(&memtype_lock);
+
+	dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
+		start, end - 1, cattr_name(new->type), cattr_name(req_type),
+		new_type ? cattr_name(*new_type) : "-");
+
+	return err;
+}
+
+int free_memtype(u64 start, u64 end)
+{
+	int err = -EINVAL;
+	int is_range_ram;
+	struct memtype *entry;
+
+	if (!pat_enabled)
+		return 0;
+
+	/* Low ISA region is always mapped WB. No need to track */
+	if (x86_platform.is_untracked_pat_range(start, end))
+		return 0;
+
+	is_range_ram = pat_pagerange_is_ram(start, end);
+	if (is_range_ram == 1) {
+
+		err = free_ram_pages_type(start, end);
+
+		return err;
+	} else if (is_range_ram < 0) {
+		return -EINVAL;
+	}
+
+	spin_lock(&memtype_lock);
+	entry = rbt_memtype_erase(start, end);
+	spin_unlock(&memtype_lock);
+
+	if (!entry) {
+		printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
+		       current->comm, current->pid, start, end - 1);
+		return -EINVAL;
+	}
+
+	kfree(entry);
+
+	dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1);
+
+	return 0;
+}
+
+
+/**
+ * lookup_memtype - Looksup the memory type for a physical address
+ * @paddr: physical address of which memory type needs to be looked up
+ *
+ * Only to be called when PAT is enabled
+ *
+ * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS
+ * or _PAGE_CACHE_MODE_UC
+ */
+static enum page_cache_mode lookup_memtype(u64 paddr)
+{
+	enum page_cache_mode rettype = _PAGE_CACHE_MODE_WB;
+	struct memtype *entry;
+
+	if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
+		return rettype;
+
+	if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
+		struct page *page;
+		page = pfn_to_page(paddr >> PAGE_SHIFT);
+		rettype = get_page_memtype(page);
+		/*
+		 * -1 from get_page_memtype() implies RAM page is in its
+		 * default state and not reserved, and hence of type WB
+		 */
+		if (rettype == -1)
+			rettype = _PAGE_CACHE_MODE_WB;
+
+		return rettype;
+	}
+
+	spin_lock(&memtype_lock);
+
+	entry = rbt_memtype_lookup(paddr);
+	if (entry != NULL)
+		rettype = entry->type;
+	else
+		rettype = _PAGE_CACHE_MODE_UC_MINUS;
+
+	spin_unlock(&memtype_lock);
+	return rettype;
+}
+
+/**
+ * io_reserve_memtype - Request a memory type mapping for a region of memory
+ * @start: start (physical address) of the region
+ * @end: end (physical address) of the region
+ * @type: A pointer to memtype, with requested type. On success, requested
+ * or any other compatible type that was available for the region is returned
+ *
+ * On success, returns 0
+ * On failure, returns non-zero
+ */
+int io_reserve_memtype(resource_size_t start, resource_size_t end,
+			enum page_cache_mode *type)
+{
+	resource_size_t size = end - start;
+	enum page_cache_mode req_type = *type;
+	enum page_cache_mode new_type;
+	int ret;
+
+	WARN_ON_ONCE(iomem_map_sanity_check(start, size));
+
+	ret = reserve_memtype(start, end, req_type, &new_type);
+	if (ret)
+		goto out_err;
+
+	if (!is_new_memtype_allowed(start, size, req_type, new_type))
+		goto out_free;
+
+	if (kernel_map_sync_memtype(start, size, new_type) < 0)
+		goto out_free;
+
+	*type = new_type;
+	return 0;
+
+out_free:
+	free_memtype(start, end);
+	ret = -EBUSY;
+out_err:
+	return ret;
+}
+
+/**
+ * io_free_memtype - Release a memory type mapping for a region of memory
+ * @start: start (physical address) of the region
+ * @end: end (physical address) of the region
+ */
+void io_free_memtype(resource_size_t start, resource_size_t end)
+{
+	free_memtype(start, end);
+}
+
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+				unsigned long size, pgprot_t vma_prot)
+{
+	return vma_prot;
+}
+
+#ifdef CONFIG_STRICT_DEVMEM
+/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+	return 1;
+}
+#else
+/* This check is needed to avoid cache aliasing when PAT is enabled */
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+	u64 from = ((u64)pfn) << PAGE_SHIFT;
+	u64 to = from + size;
+	u64 cursor = from;
+
+	if (!pat_enabled)
+		return 1;
+
+	while (cursor < to) {
+		if (!devmem_is_allowed(pfn)) {
+			printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
+			       current->comm, from, to - 1);
+			return 0;
+		}
+		cursor += PAGE_SIZE;
+		pfn++;
+	}
+	return 1;
+}
+#endif /* CONFIG_STRICT_DEVMEM */
+
+int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
+				unsigned long size, pgprot_t *vma_prot)
+{
+	enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB;
+
+	if (!range_is_allowed(pfn, size))
+		return 0;
+
+	if (file->f_flags & O_DSYNC)
+		pcm = _PAGE_CACHE_MODE_UC_MINUS;
+
+#ifdef CONFIG_X86_32
+	/*
+	 * On the PPro and successors, the MTRRs are used to set
+	 * memory types for physical addresses outside main memory,
+	 * so blindly setting UC or PWT on those pages is wrong.
+	 * For Pentiums and earlier, the surround logic should disable
+	 * caching for the high addresses through the KEN pin, but
+	 * we maintain the tradition of paranoia in this code.
+	 */
+	if (!pat_enabled &&
+	    !(boot_cpu_has(X86_FEATURE_MTRR) ||
+	      boot_cpu_has(X86_FEATURE_K6_MTRR) ||
+	      boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
+	      boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
+	    (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
+		pcm = _PAGE_CACHE_MODE_UC;
+	}
+#endif
+
+	*vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
+			     cachemode2protval(pcm));
+	return 1;
+}
+
+/*
+ * Change the memory type for the physial address range in kernel identity
+ * mapping space if that range is a part of identity map.
+ */
+int kernel_map_sync_memtype(u64 base, unsigned long size,
+			    enum page_cache_mode pcm)
+{
+	unsigned long id_sz;
+
+	if (base > __pa(high_memory-1))
+		return 0;
+
+	/*
+	 * some areas in the middle of the kernel identity range
+	 * are not mapped, like the PCI space.
+	 */
+	if (!page_is_ram(base >> PAGE_SHIFT))
+		return 0;
+
+	id_sz = (__pa(high_memory-1) <= base + size) ?
+				__pa(high_memory) - base :
+				size;
+
+	if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) {
+		printk(KERN_INFO "%s:%d ioremap_change_attr failed %s "
+			"for [mem %#010Lx-%#010Lx]\n",
+			current->comm, current->pid,
+			cattr_name(pcm),
+			base, (unsigned long long)(base + size-1));
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * Internal interface to reserve a range of physical memory with prot.
+ * Reserved non RAM regions only and after successful reserve_memtype,
+ * this func also keeps identity mapping (if any) in sync with this new prot.
+ */
+static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
+				int strict_prot)
+{
+	int is_ram = 0;
+	int ret;
+	enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot);
+	enum page_cache_mode pcm = want_pcm;
+
+	is_ram = pat_pagerange_is_ram(paddr, paddr + size);
+
+	/*
+	 * reserve_pfn_range() for RAM pages. We do not refcount to keep
+	 * track of number of mappings of RAM pages. We can assert that
+	 * the type requested matches the type of first page in the range.
+	 */
+	if (is_ram) {
+		if (!pat_enabled)
+			return 0;
+
+		pcm = lookup_memtype(paddr);
+		if (want_pcm != pcm) {
+			printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
+				current->comm, current->pid,
+				cattr_name(want_pcm),
+				(unsigned long long)paddr,
+				(unsigned long long)(paddr + size - 1),
+				cattr_name(pcm));
+			*vma_prot = __pgprot((pgprot_val(*vma_prot) &
+					     (~_PAGE_CACHE_MASK)) |
+					     cachemode2protval(pcm));
+		}
+		return 0;
+	}
+
+	ret = reserve_memtype(paddr, paddr + size, want_pcm, &pcm);
+	if (ret)
+		return ret;
+
+	if (pcm != want_pcm) {
+		if (strict_prot ||
+		    !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
+			free_memtype(paddr, paddr + size);
+			printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
+				" for [mem %#010Lx-%#010Lx], got %s\n",
+				current->comm, current->pid,
+				cattr_name(want_pcm),
+				(unsigned long long)paddr,
+				(unsigned long long)(paddr + size - 1),
+				cattr_name(pcm));
+			return -EINVAL;
+		}
+		/*
+		 * We allow returning different type than the one requested in
+		 * non strict case.
+		 */
+		*vma_prot = __pgprot((pgprot_val(*vma_prot) &
+				      (~_PAGE_CACHE_MASK)) |
+				     cachemode2protval(pcm));
+	}
+
+	if (kernel_map_sync_memtype(paddr, size, pcm) < 0) {
+		free_memtype(paddr, paddr + size);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * Internal interface to free a range of physical memory.
+ * Frees non RAM regions only.
+ */
+static void free_pfn_range(u64 paddr, unsigned long size)
+{
+	int is_ram;
+
+	is_ram = pat_pagerange_is_ram(paddr, paddr + size);
+	if (is_ram == 0)
+		free_memtype(paddr, paddr + size);
+}
+
+/*
+ * track_pfn_copy is called when vma that is covering the pfnmap gets
+ * copied through copy_page_range().
+ *
+ * If the vma has a linear pfn mapping for the entire range, we get the prot
+ * from pte and reserve the entire vma range with single reserve_pfn_range call.
+ */
+int track_pfn_copy(struct vm_area_struct *vma)
+{
+	resource_size_t paddr;
+	unsigned long prot;
+	unsigned long vma_size = vma->vm_end - vma->vm_start;
+	pgprot_t pgprot;
+
+	if (vma->vm_flags & VM_PAT) {
+		/*
+		 * reserve the whole chunk covered by vma. We need the
+		 * starting address and protection from pte.
+		 */
+		if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
+			WARN_ON_ONCE(1);
+			return -EINVAL;
+		}
+		pgprot = __pgprot(prot);
+		return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
+	}
+
+	return 0;
+}
+
+/*
+ * prot is passed in as a parameter for the new mapping. If the vma has a
+ * linear pfn mapping for the entire range reserve the entire vma range with
+ * single reserve_pfn_range call.
+ */
+int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
+		    unsigned long pfn, unsigned long addr, unsigned long size)
+{
+	resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
+	enum page_cache_mode pcm;
+
+	/* reserve the whole chunk starting from paddr */
+	if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) {
+		int ret;
+
+		ret = reserve_pfn_range(paddr, size, prot, 0);
+		if (!ret)
+			vma->vm_flags |= VM_PAT;
+		return ret;
+	}
+
+	if (!pat_enabled)
+		return 0;
+
+	/*
+	 * For anything smaller than the vma size we set prot based on the
+	 * lookup.
+	 */
+	pcm = lookup_memtype(paddr);
+
+	/* Check memtype for the remaining pages */
+	while (size > PAGE_SIZE) {
+		size -= PAGE_SIZE;
+		paddr += PAGE_SIZE;
+		if (pcm != lookup_memtype(paddr))
+			return -EINVAL;
+	}
+
+	*prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
+			 cachemode2protval(pcm));
+
+	return 0;
+}
+
+int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
+		     unsigned long pfn)
+{
+	enum page_cache_mode pcm;
+
+	if (!pat_enabled)
+		return 0;
+
+	/* Set prot based on lookup */
+	pcm = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT);
+	*prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
+			 cachemode2protval(pcm));
+
+	return 0;
+}
+
+/*
+ * untrack_pfn is called while unmapping a pfnmap for a region.
+ * untrack can be called for a specific region indicated by pfn and size or
+ * can be for the entire vma (in which case pfn, size are zero).
+ */
+void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
+		 unsigned long size)
+{
+	resource_size_t paddr;
+	unsigned long prot;
+
+	if (!(vma->vm_flags & VM_PAT))
+		return;
+
+	/* free the chunk starting from pfn or the whole chunk */
+	paddr = (resource_size_t)pfn << PAGE_SHIFT;
+	if (!paddr && !size) {
+		if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
+			WARN_ON_ONCE(1);
+			return;
+		}
+
+		size = vma->vm_end - vma->vm_start;
+	}
+	free_pfn_range(paddr, size);
+	vma->vm_flags &= ~VM_PAT;
+}
+
+pgprot_t pgprot_writecombine(pgprot_t prot)
+{
+	if (pat_enabled)
+		return __pgprot(pgprot_val(prot) |
+				cachemode2protval(_PAGE_CACHE_MODE_WC));
+	else
+		return pgprot_noncached(prot);
+}
+EXPORT_SYMBOL_GPL(pgprot_writecombine);
+
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
+
+static struct memtype *memtype_get_idx(loff_t pos)
+{
+	struct memtype *print_entry;
+	int ret;
+
+	print_entry  = kzalloc(sizeof(struct memtype), GFP_KERNEL);
+	if (!print_entry)
+		return NULL;
+
+	spin_lock(&memtype_lock);
+	ret = rbt_memtype_copy_nth_element(print_entry, pos);
+	spin_unlock(&memtype_lock);
+
+	if (!ret) {
+		return print_entry;
+	} else {
+		kfree(print_entry);
+		return NULL;
+	}
+}
+
+static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	if (*pos == 0) {
+		++*pos;
+		seq_puts(seq, "PAT memtype list:\n");
+	}
+
+	return memtype_get_idx(*pos);
+}
+
+static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	++*pos;
+	return memtype_get_idx(*pos);
+}
+
+static void memtype_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int memtype_seq_show(struct seq_file *seq, void *v)
+{
+	struct memtype *print_entry = (struct memtype *)v;
+
+	seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
+			print_entry->start, print_entry->end);
+	kfree(print_entry);
+
+	return 0;
+}
+
+static const struct seq_operations memtype_seq_ops = {
+	.start = memtype_seq_start,
+	.next  = memtype_seq_next,
+	.stop  = memtype_seq_stop,
+	.show  = memtype_seq_show,
+};
+
+static int memtype_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &memtype_seq_ops);
+}
+
+static const struct file_operations memtype_fops = {
+	.open    = memtype_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+};
+
+static int __init pat_memtype_list_init(void)
+{
+	if (pat_enabled) {
+		debugfs_create_file("pat_memtype_list", S_IRUSR,
+				    arch_debugfs_dir, NULL, &memtype_fops);
+	}
+	return 0;
+}
+
+late_initcall(pat_memtype_list_init);
+
+#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h
new file mode 100644
index 000000000..f64116203
--- /dev/null
+++ b/arch/x86/mm/pat_internal.h
@@ -0,0 +1,48 @@
+#ifndef __PAT_INTERNAL_H_
+#define __PAT_INTERNAL_H_
+
+extern int pat_debug_enable;
+
+#define dprintk(fmt, arg...) \
+	do { if (pat_debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
+
+struct memtype {
+	u64			start;
+	u64			end;
+	u64			subtree_max_end;
+	enum page_cache_mode	type;
+	struct rb_node		rb;
+};
+
+static inline char *cattr_name(enum page_cache_mode pcm)
+{
+	switch (pcm) {
+	case _PAGE_CACHE_MODE_UC:		return "uncached";
+	case _PAGE_CACHE_MODE_UC_MINUS:		return "uncached-minus";
+	case _PAGE_CACHE_MODE_WB:		return "write-back";
+	case _PAGE_CACHE_MODE_WC:		return "write-combining";
+	case _PAGE_CACHE_MODE_WT:		return "write-through";
+	case _PAGE_CACHE_MODE_WP:		return "write-protected";
+	default:				return "broken";
+	}
+}
+
+#ifdef CONFIG_X86_PAT
+extern int rbt_memtype_check_insert(struct memtype *new,
+					enum page_cache_mode *new_type);
+extern struct memtype *rbt_memtype_erase(u64 start, u64 end);
+extern struct memtype *rbt_memtype_lookup(u64 addr);
+extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos);
+#else
+static inline int rbt_memtype_check_insert(struct memtype *new,
+					enum page_cache_mode *new_type)
+{ return 0; }
+static inline struct memtype *rbt_memtype_erase(u64 start, u64 end)
+{ return NULL; }
+static inline struct memtype *rbt_memtype_lookup(u64 addr)
+{ return NULL; }
+static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
+{ return 0; }
+#endif
+
+#endif /* __PAT_INTERNAL_H_ */
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
new file mode 100644
index 000000000..6582adcc8
--- /dev/null
+++ b/arch/x86/mm/pat_rbtree.c
@@ -0,0 +1,249 @@
+/*
+ * Handle caching attributes in page tables (PAT)
+ *
+ * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ *          Suresh B Siddha <suresh.b.siddha@intel.com>
+ *
+ * Interval tree (augmented rbtree) used to store the PAT memory type
+ * reservations.
+ */
+
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rbtree_augmented.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+
+#include <asm/pgtable.h>
+#include <asm/pat.h>
+
+#include "pat_internal.h"
+
+/*
+ * The memtype tree keeps track of memory type for specific
+ * physical memory areas. Without proper tracking, conflicting memory
+ * types in different mappings can cause CPU cache corruption.
+ *
+ * The tree is an interval tree (augmented rbtree) with tree ordered
+ * on starting address. Tree can contain multiple entries for
+ * different regions which overlap. All the aliases have the same
+ * cache attributes of course.
+ *
+ * memtype_lock protects the rbtree.
+ */
+
+static struct rb_root memtype_rbroot = RB_ROOT;
+
+static int is_node_overlap(struct memtype *node, u64 start, u64 end)
+{
+	if (node->start >= end || node->end <= start)
+		return 0;
+
+	return 1;
+}
+
+static u64 get_subtree_max_end(struct rb_node *node)
+{
+	u64 ret = 0;
+	if (node) {
+		struct memtype *data = container_of(node, struct memtype, rb);
+		ret = data->subtree_max_end;
+	}
+	return ret;
+}
+
+static u64 compute_subtree_max_end(struct memtype *data)
+{
+	u64 max_end = data->end, child_max_end;
+
+	child_max_end = get_subtree_max_end(data->rb.rb_right);
+	if (child_max_end > max_end)
+		max_end = child_max_end;
+
+	child_max_end = get_subtree_max_end(data->rb.rb_left);
+	if (child_max_end > max_end)
+		max_end = child_max_end;
+
+	return max_end;
+}
+
+RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb,
+		     u64, subtree_max_end, compute_subtree_max_end)
+
+/* Find the first (lowest start addr) overlapping range from rb tree */
+static struct memtype *memtype_rb_lowest_match(struct rb_root *root,
+				u64 start, u64 end)
+{
+	struct rb_node *node = root->rb_node;
+	struct memtype *last_lower = NULL;
+
+	while (node) {
+		struct memtype *data = container_of(node, struct memtype, rb);
+
+		if (get_subtree_max_end(node->rb_left) > start) {
+			/* Lowest overlap if any must be on left side */
+			node = node->rb_left;
+		} else if (is_node_overlap(data, start, end)) {
+			last_lower = data;
+			break;
+		} else if (start >= data->start) {
+			/* Lowest overlap if any must be on right side */
+			node = node->rb_right;
+		} else {
+			break;
+		}
+	}
+	return last_lower; /* Returns NULL if there is no overlap */
+}
+
+static struct memtype *memtype_rb_exact_match(struct rb_root *root,
+				u64 start, u64 end)
+{
+	struct memtype *match;
+
+	match = memtype_rb_lowest_match(root, start, end);
+	while (match != NULL && match->start < end) {
+		struct rb_node *node;
+
+		if (match->start == start && match->end == end)
+			return match;
+
+		node = rb_next(&match->rb);
+		if (node)
+			match = container_of(node, struct memtype, rb);
+		else
+			match = NULL;
+	}
+
+	return NULL; /* Returns NULL if there is no exact match */
+}
+
+static int memtype_rb_check_conflict(struct rb_root *root,
+				u64 start, u64 end,
+				enum page_cache_mode reqtype,
+				enum page_cache_mode *newtype)
+{
+	struct rb_node *node;
+	struct memtype *match;
+	enum page_cache_mode found_type = reqtype;
+
+	match = memtype_rb_lowest_match(&memtype_rbroot, start, end);
+	if (match == NULL)
+		goto success;
+
+	if (match->type != found_type && newtype == NULL)
+		goto failure;
+
+	dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end);
+	found_type = match->type;
+
+	node = rb_next(&match->rb);
+	while (node) {
+		match = container_of(node, struct memtype, rb);
+
+		if (match->start >= end) /* Checked all possible matches */
+			goto success;
+
+		if (is_node_overlap(match, start, end) &&
+		    match->type != found_type) {
+			goto failure;
+		}
+
+		node = rb_next(&match->rb);
+	}
+success:
+	if (newtype)
+		*newtype = found_type;
+
+	return 0;
+
+failure:
+	printk(KERN_INFO "%s:%d conflicting memory types "
+		"%Lx-%Lx %s<->%s\n", current->comm, current->pid, start,
+		end, cattr_name(found_type), cattr_name(match->type));
+	return -EBUSY;
+}
+
+static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata)
+{
+	struct rb_node **node = &(root->rb_node);
+	struct rb_node *parent = NULL;
+
+	while (*node) {
+		struct memtype *data = container_of(*node, struct memtype, rb);
+
+		parent = *node;
+		if (data->subtree_max_end < newdata->end)
+			data->subtree_max_end = newdata->end;
+		if (newdata->start <= data->start)
+			node = &((*node)->rb_left);
+		else if (newdata->start > data->start)
+			node = &((*node)->rb_right);
+	}
+
+	newdata->subtree_max_end = newdata->end;
+	rb_link_node(&newdata->rb, parent, node);
+	rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb);
+}
+
+int rbt_memtype_check_insert(struct memtype *new,
+			     enum page_cache_mode *ret_type)
+{
+	int err = 0;
+
+	err = memtype_rb_check_conflict(&memtype_rbroot, new->start, new->end,
+						new->type, ret_type);
+
+	if (!err) {
+		if (ret_type)
+			new->type = *ret_type;
+
+		new->subtree_max_end = new->end;
+		memtype_rb_insert(&memtype_rbroot, new);
+	}
+	return err;
+}
+
+struct memtype *rbt_memtype_erase(u64 start, u64 end)
+{
+	struct memtype *data;
+
+	data = memtype_rb_exact_match(&memtype_rbroot, start, end);
+	if (!data)
+		goto out;
+
+	rb_erase_augmented(&data->rb, &memtype_rbroot, &memtype_rb_augment_cb);
+out:
+	return data;
+}
+
+struct memtype *rbt_memtype_lookup(u64 addr)
+{
+	struct memtype *data;
+	data = memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE);
+	return data;
+}
+
+#if defined(CONFIG_DEBUG_FS)
+int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
+{
+	struct rb_node *node;
+	int i = 1;
+
+	node = rb_first(&memtype_rbroot);
+	while (node && pos != i) {
+		node = rb_next(node);
+		i++;
+	}
+
+	if (node) { /* pos == i */
+		struct memtype *this = container_of(node, struct memtype, rb);
+		*out = *this;
+		return 0;
+	} else {
+		return 1;
+	}
+}
+#endif
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
new file mode 100644
index 000000000..9f0614dae
--- /dev/null
+++ b/arch/x86/mm/pf_in.c
@@ -0,0 +1,532 @@
+/*
+ *  Fault Injection Test harness (FI)
+ *  Copyright (C) Intel Crop.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
+ *  USA.
+ *
+ */
+
+/*  Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp
+ *  Copyright by Intel Crop., 2002
+ *  Louis Zhuang (louis.zhuang@intel.com)
+ *
+ *  Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007
+ */
+
+#include <linux/module.h>
+#include <linux/ptrace.h> /* struct pt_regs */
+#include "pf_in.h"
+
+#ifdef __i386__
+/* IA32 Manual 3, 2-1 */
+static unsigned char prefix_codes[] = {
+	0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64,
+	0x65, 0x66, 0x67
+};
+/* IA32 Manual 3, 3-432*/
+static unsigned int reg_rop[] = {
+	0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
+};
+static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB };
+static unsigned int imm_wop[] = { 0xC6, 0xC7 };
+/* IA32 Manual 3, 3-432*/
+static unsigned int rw8[] = { 0x88, 0x8A, 0xC6, 0xAA };
+static unsigned int rw32[] = {
+	0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB
+};
+static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F, 0xAA };
+static unsigned int mw16[] = { 0xB70F, 0xBF0F };
+static unsigned int mw32[] = { 0x89, 0x8B, 0xC7, 0xAB };
+static unsigned int mw64[] = {};
+#else /* not __i386__ */
+static unsigned char prefix_codes[] = {
+	0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36,
+	0xF0, 0xF3, 0xF2,
+	/* REX Prefixes */
+	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f
+};
+/* AMD64 Manual 3, Appendix A*/
+static unsigned int reg_rop[] = {
+	0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
+};
+static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB };
+static unsigned int imm_wop[] = { 0xC6, 0xC7 };
+static unsigned int rw8[] = { 0xC6, 0x88, 0x8A, 0xAA };
+static unsigned int rw32[] = {
+	0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB
+};
+/* 8 bit only */
+static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F, 0xAA };
+/* 16 bit only */
+static unsigned int mw16[] = { 0xB70F, 0xBF0F };
+/* 16 or 32 bit */
+static unsigned int mw32[] = { 0xC7 };
+/* 16, 32 or 64 bit */
+static unsigned int mw64[] = { 0x89, 0x8B, 0xAB };
+#endif /* not __i386__ */
+
+struct prefix_bits {
+	unsigned shorted:1;
+	unsigned enlarged:1;
+	unsigned rexr:1;
+	unsigned rex:1;
+};
+
+static int skip_prefix(unsigned char *addr, struct prefix_bits *prf)
+{
+	int i;
+	unsigned char *p = addr;
+	prf->shorted = 0;
+	prf->enlarged = 0;
+	prf->rexr = 0;
+	prf->rex = 0;
+
+restart:
+	for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) {
+		if (*p == prefix_codes[i]) {
+			if (*p == 0x66)
+				prf->shorted = 1;
+#ifdef __amd64__
+			if ((*p & 0xf8) == 0x48)
+				prf->enlarged = 1;
+			if ((*p & 0xf4) == 0x44)
+				prf->rexr = 1;
+			if ((*p & 0xf0) == 0x40)
+				prf->rex = 1;
+#endif
+			p++;
+			goto restart;
+		}
+	}
+
+	return (p - addr);
+}
+
+static int get_opcode(unsigned char *addr, unsigned int *opcode)
+{
+	int len;
+
+	if (*addr == 0x0F) {
+		/* 0x0F is extension instruction */
+		*opcode = *(unsigned short *)addr;
+		len = 2;
+	} else {
+		*opcode = *addr;
+		len = 1;
+	}
+
+	return len;
+}
+
+#define CHECK_OP_TYPE(opcode, array, type) \
+	for (i = 0; i < ARRAY_SIZE(array); i++) { \
+		if (array[i] == opcode) { \
+			rv = type; \
+			goto exit; \
+		} \
+	}
+
+enum reason_type get_ins_type(unsigned long ins_addr)
+{
+	unsigned int opcode;
+	unsigned char *p;
+	struct prefix_bits prf;
+	int i;
+	enum reason_type rv = OTHERS;
+
+	p = (unsigned char *)ins_addr;
+	p += skip_prefix(p, &prf);
+	p += get_opcode(p, &opcode);
+
+	CHECK_OP_TYPE(opcode, reg_rop, REG_READ);
+	CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE);
+	CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE);
+
+exit:
+	return rv;
+}
+#undef CHECK_OP_TYPE
+
+static unsigned int get_ins_reg_width(unsigned long ins_addr)
+{
+	unsigned int opcode;
+	unsigned char *p;
+	struct prefix_bits prf;
+	int i;
+
+	p = (unsigned char *)ins_addr;
+	p += skip_prefix(p, &prf);
+	p += get_opcode(p, &opcode);
+
+	for (i = 0; i < ARRAY_SIZE(rw8); i++)
+		if (rw8[i] == opcode)
+			return 1;
+
+	for (i = 0; i < ARRAY_SIZE(rw32); i++)
+		if (rw32[i] == opcode)
+			return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
+
+	printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
+	return 0;
+}
+
+unsigned int get_ins_mem_width(unsigned long ins_addr)
+{
+	unsigned int opcode;
+	unsigned char *p;
+	struct prefix_bits prf;
+	int i;
+
+	p = (unsigned char *)ins_addr;
+	p += skip_prefix(p, &prf);
+	p += get_opcode(p, &opcode);
+
+	for (i = 0; i < ARRAY_SIZE(mw8); i++)
+		if (mw8[i] == opcode)
+			return 1;
+
+	for (i = 0; i < ARRAY_SIZE(mw16); i++)
+		if (mw16[i] == opcode)
+			return 2;
+
+	for (i = 0; i < ARRAY_SIZE(mw32); i++)
+		if (mw32[i] == opcode)
+			return prf.shorted ? 2 : 4;
+
+	for (i = 0; i < ARRAY_SIZE(mw64); i++)
+		if (mw64[i] == opcode)
+			return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
+
+	printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
+	return 0;
+}
+
+/*
+ * Define register ident in mod/rm byte.
+ * Note: these are NOT the same as in ptrace-abi.h.
+ */
+enum {
+	arg_AL = 0,
+	arg_CL = 1,
+	arg_DL = 2,
+	arg_BL = 3,
+	arg_AH = 4,
+	arg_CH = 5,
+	arg_DH = 6,
+	arg_BH = 7,
+
+	arg_AX = 0,
+	arg_CX = 1,
+	arg_DX = 2,
+	arg_BX = 3,
+	arg_SP = 4,
+	arg_BP = 5,
+	arg_SI = 6,
+	arg_DI = 7,
+#ifdef __amd64__
+	arg_R8  = 8,
+	arg_R9  = 9,
+	arg_R10 = 10,
+	arg_R11 = 11,
+	arg_R12 = 12,
+	arg_R13 = 13,
+	arg_R14 = 14,
+	arg_R15 = 15
+#endif
+};
+
+static unsigned char *get_reg_w8(int no, int rex, struct pt_regs *regs)
+{
+	unsigned char *rv = NULL;
+
+	switch (no) {
+	case arg_AL:
+		rv = (unsigned char *)&regs->ax;
+		break;
+	case arg_BL:
+		rv = (unsigned char *)&regs->bx;
+		break;
+	case arg_CL:
+		rv = (unsigned char *)&regs->cx;
+		break;
+	case arg_DL:
+		rv = (unsigned char *)&regs->dx;
+		break;
+#ifdef __amd64__
+	case arg_R8:
+		rv = (unsigned char *)&regs->r8;
+		break;
+	case arg_R9:
+		rv = (unsigned char *)&regs->r9;
+		break;
+	case arg_R10:
+		rv = (unsigned char *)&regs->r10;
+		break;
+	case arg_R11:
+		rv = (unsigned char *)&regs->r11;
+		break;
+	case arg_R12:
+		rv = (unsigned char *)&regs->r12;
+		break;
+	case arg_R13:
+		rv = (unsigned char *)&regs->r13;
+		break;
+	case arg_R14:
+		rv = (unsigned char *)&regs->r14;
+		break;
+	case arg_R15:
+		rv = (unsigned char *)&regs->r15;
+		break;
+#endif
+	default:
+		break;
+	}
+
+	if (rv)
+		return rv;
+
+	if (rex) {
+		/*
+		 * If REX prefix exists, access low bytes of SI etc.
+		 * instead of AH etc.
+		 */
+		switch (no) {
+		case arg_SI:
+			rv = (unsigned char *)&regs->si;
+			break;
+		case arg_DI:
+			rv = (unsigned char *)&regs->di;
+			break;
+		case arg_BP:
+			rv = (unsigned char *)&regs->bp;
+			break;
+		case arg_SP:
+			rv = (unsigned char *)&regs->sp;
+			break;
+		default:
+			break;
+		}
+	} else {
+		switch (no) {
+		case arg_AH:
+			rv = 1 + (unsigned char *)&regs->ax;
+			break;
+		case arg_BH:
+			rv = 1 + (unsigned char *)&regs->bx;
+			break;
+		case arg_CH:
+			rv = 1 + (unsigned char *)&regs->cx;
+			break;
+		case arg_DH:
+			rv = 1 + (unsigned char *)&regs->dx;
+			break;
+		default:
+			break;
+		}
+	}
+
+	if (!rv)
+		printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
+
+	return rv;
+}
+
+static unsigned long *get_reg_w32(int no, struct pt_regs *regs)
+{
+	unsigned long *rv = NULL;
+
+	switch (no) {
+	case arg_AX:
+		rv = &regs->ax;
+		break;
+	case arg_BX:
+		rv = &regs->bx;
+		break;
+	case arg_CX:
+		rv = &regs->cx;
+		break;
+	case arg_DX:
+		rv = &regs->dx;
+		break;
+	case arg_SP:
+		rv = &regs->sp;
+		break;
+	case arg_BP:
+		rv = &regs->bp;
+		break;
+	case arg_SI:
+		rv = &regs->si;
+		break;
+	case arg_DI:
+		rv = &regs->di;
+		break;
+#ifdef __amd64__
+	case arg_R8:
+		rv = &regs->r8;
+		break;
+	case arg_R9:
+		rv = &regs->r9;
+		break;
+	case arg_R10:
+		rv = &regs->r10;
+		break;
+	case arg_R11:
+		rv = &regs->r11;
+		break;
+	case arg_R12:
+		rv = &regs->r12;
+		break;
+	case arg_R13:
+		rv = &regs->r13;
+		break;
+	case arg_R14:
+		rv = &regs->r14;
+		break;
+	case arg_R15:
+		rv = &regs->r15;
+		break;
+#endif
+	default:
+		printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
+	}
+
+	return rv;
+}
+
+unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
+{
+	unsigned int opcode;
+	int reg;
+	unsigned char *p;
+	struct prefix_bits prf;
+	int i;
+
+	p = (unsigned char *)ins_addr;
+	p += skip_prefix(p, &prf);
+	p += get_opcode(p, &opcode);
+	for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
+		if (reg_rop[i] == opcode)
+			goto do_work;
+
+	for (i = 0; i < ARRAY_SIZE(reg_wop); i++)
+		if (reg_wop[i] == opcode)
+			goto do_work;
+
+	printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "
+							"0x%02x\n", opcode);
+	goto err;
+
+do_work:
+	/* for STOS, source register is fixed */
+	if (opcode == 0xAA || opcode == 0xAB) {
+		reg = arg_AX;
+	} else {
+		unsigned char mod_rm = *p;
+		reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3);
+	}
+	switch (get_ins_reg_width(ins_addr)) {
+	case 1:
+		return *get_reg_w8(reg, prf.rex, regs);
+
+	case 2:
+		return *(unsigned short *)get_reg_w32(reg, regs);
+
+	case 4:
+		return *(unsigned int *)get_reg_w32(reg, regs);
+
+#ifdef __amd64__
+	case 8:
+		return *(unsigned long *)get_reg_w32(reg, regs);
+#endif
+
+	default:
+		printk(KERN_ERR "mmiotrace: Error width# %d\n", reg);
+	}
+
+err:
+	return 0;
+}
+
+unsigned long get_ins_imm_val(unsigned long ins_addr)
+{
+	unsigned int opcode;
+	unsigned char mod_rm;
+	unsigned char mod;
+	unsigned char *p;
+	struct prefix_bits prf;
+	int i;
+
+	p = (unsigned char *)ins_addr;
+	p += skip_prefix(p, &prf);
+	p += get_opcode(p, &opcode);
+	for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
+		if (imm_wop[i] == opcode)
+			goto do_work;
+
+	printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "
+							"0x%02x\n", opcode);
+	goto err;
+
+do_work:
+	mod_rm = *p;
+	mod = mod_rm >> 6;
+	p++;
+	switch (mod) {
+	case 0:
+		/* if r/m is 5 we have a 32 disp (IA32 Manual 3, Table 2-2)  */
+		/* AMD64: XXX Check for address size prefix? */
+		if ((mod_rm & 0x7) == 0x5)
+			p += 4;
+		break;
+
+	case 1:
+		p += 1;
+		break;
+
+	case 2:
+		p += 4;
+		break;
+
+	case 3:
+	default:
+		printk(KERN_ERR "mmiotrace: not a memory access instruction "
+						"at 0x%lx, rm_mod=0x%02x\n",
+						ins_addr, mod_rm);
+	}
+
+	switch (get_ins_reg_width(ins_addr)) {
+	case 1:
+		return *(unsigned char *)p;
+
+	case 2:
+		return *(unsigned short *)p;
+
+	case 4:
+		return *(unsigned int *)p;
+
+#ifdef __amd64__
+	case 8:
+		return *(unsigned long *)p;
+#endif
+
+	default:
+		printk(KERN_ERR "mmiotrace: Error: width.\n");
+	}
+
+err:
+	return 0;
+}
diff --git a/arch/x86/mm/pf_in.h b/arch/x86/mm/pf_in.h
new file mode 100644
index 000000000..e05341a51
--- /dev/null
+++ b/arch/x86/mm/pf_in.h
@@ -0,0 +1,39 @@
+/*
+ *  Fault Injection Test harness (FI)
+ *  Copyright (C) Intel Crop.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
+ *  USA.
+ *
+ */
+
+#ifndef __PF_H_
+#define __PF_H_
+
+enum reason_type {
+	NOT_ME,	/* page fault is not in regions */
+	NOTHING,	/* access others point in regions */
+	REG_READ,	/* read from addr to reg */
+	REG_WRITE,	/* write from reg to addr */
+	IMM_WRITE,	/* write from imm to addr */
+	OTHERS	/* Other instructions can not intercept */
+};
+
+enum reason_type get_ins_type(unsigned long ins_addr);
+unsigned int get_ins_mem_width(unsigned long ins_addr);
+unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs);
+unsigned long get_ins_imm_val(unsigned long ins_addr);
+
+#endif /* __PF_H_ */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
new file mode 100644
index 000000000..0b97d2c75
--- /dev/null
+++ b/arch/x86/mm/pgtable.c
@@ -0,0 +1,627 @@
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+#include <asm/fixmap.h>
+#include <asm/mtrr.h>
+
+#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+
+#ifdef CONFIG_HIGHPTE
+#define PGALLOC_USER_GFP __GFP_HIGHMEM
+#else
+#define PGALLOC_USER_GFP 0
+#endif
+
+gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
+
+pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+{
+	return (pte_t *)__get_free_page(PGALLOC_GFP);
+}
+
+pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+	struct page *pte;
+
+	pte = alloc_pages(__userpte_alloc_gfp, 0);
+	if (!pte)
+		return NULL;
+	if (!pgtable_page_ctor(pte)) {
+		__free_page(pte);
+		return NULL;
+	}
+	return pte;
+}
+
+static int __init setup_userpte(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+
+	/*
+	 * "userpte=nohigh" disables allocation of user pagetables in
+	 * high memory.
+	 */
+	if (strcmp(arg, "nohigh") == 0)
+		__userpte_alloc_gfp &= ~__GFP_HIGHMEM;
+	else
+		return -EINVAL;
+	return 0;
+}
+early_param("userpte", setup_userpte);
+
+void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
+{
+	pgtable_page_dtor(pte);
+	paravirt_release_pte(page_to_pfn(pte));
+	tlb_remove_page(tlb, pte);
+}
+
+#if CONFIG_PGTABLE_LEVELS > 2
+void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
+{
+	struct page *page = virt_to_page(pmd);
+	paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
+	/*
+	 * NOTE! For PAE, any changes to the top page-directory-pointer-table
+	 * entries need a full cr3 reload to flush.
+	 */
+#ifdef CONFIG_X86_PAE
+	tlb->need_flush_all = 1;
+#endif
+	pgtable_pmd_page_dtor(page);
+	tlb_remove_page(tlb, page);
+}
+
+#if CONFIG_PGTABLE_LEVELS > 3
+void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
+{
+	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
+	tlb_remove_page(tlb, virt_to_page(pud));
+}
+#endif	/* CONFIG_PGTABLE_LEVELS > 3 */
+#endif	/* CONFIG_PGTABLE_LEVELS > 2 */
+
+static inline void pgd_list_add(pgd_t *pgd)
+{
+	struct page *page = virt_to_page(pgd);
+
+	list_add(&page->lru, &pgd_list);
+}
+
+static inline void pgd_list_del(pgd_t *pgd)
+{
+	struct page *page = virt_to_page(pgd);
+
+	list_del(&page->lru);
+}
+
+#define UNSHARED_PTRS_PER_PGD				\
+	(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
+
+
+static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
+{
+	BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
+	virt_to_page(pgd)->index = (pgoff_t)mm;
+}
+
+struct mm_struct *pgd_page_get_mm(struct page *page)
+{
+	return (struct mm_struct *)page->index;
+}
+
+static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
+{
+	/* If the pgd points to a shared pagetable level (either the
+	   ptes in non-PAE, or shared PMD in PAE), then just copy the
+	   references from swapper_pg_dir. */
+	if (CONFIG_PGTABLE_LEVELS == 2 ||
+	    (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
+	    CONFIG_PGTABLE_LEVELS == 4) {
+		clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
+				swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+				KERNEL_PGD_PTRS);
+	}
+
+	/* list required to sync kernel mapping updates */
+	if (!SHARED_KERNEL_PMD) {
+		pgd_set_mm(pgd, mm);
+		pgd_list_add(pgd);
+	}
+}
+
+static void pgd_dtor(pgd_t *pgd)
+{
+	if (SHARED_KERNEL_PMD)
+		return;
+
+	spin_lock(&pgd_lock);
+	pgd_list_del(pgd);
+	spin_unlock(&pgd_lock);
+}
+
+/*
+ * List of all pgd's needed for non-PAE so it can invalidate entries
+ * in both cached and uncached pgd's; not needed for PAE since the
+ * kernel pmd is shared. If PAE were not to share the pmd a similar
+ * tactic would be needed. This is essentially codepath-based locking
+ * against pageattr.c; it is the unique case in which a valid change
+ * of kernel pagetables can't be lazily synchronized by vmalloc faults.
+ * vmalloc faults work because attached pagetables are never freed.
+ * -- nyc
+ */
+
+#ifdef CONFIG_X86_PAE
+/*
+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
+ * updating the top-level pagetable entries to guarantee the
+ * processor notices the update.  Since this is expensive, and
+ * all 4 top-level entries are used almost immediately in a
+ * new process's life, we just pre-populate them here.
+ *
+ * Also, if we're in a paravirt environment where the kernel pmd is
+ * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
+ * and initialize the kernel pmds here.
+ */
+#define PREALLOCATED_PMDS	UNSHARED_PTRS_PER_PGD
+
+void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+{
+	paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+
+	/* Note: almost everything apart from _PAGE_PRESENT is
+	   reserved at the pmd (PDPT) level. */
+	set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+
+	/*
+	 * According to Intel App note "TLBs, Paging-Structure Caches,
+	 * and Their Invalidation", April 2007, document 317080-001,
+	 * section 8.1: in PAE mode we explicitly have to flush the
+	 * TLB via cr3 if the top-level pgd is changed...
+	 */
+	flush_tlb_mm(mm);
+}
+#else  /* !CONFIG_X86_PAE */
+
+/* No need to prepopulate any pagetable entries in non-PAE modes. */
+#define PREALLOCATED_PMDS	0
+
+#endif	/* CONFIG_X86_PAE */
+
+static void free_pmds(struct mm_struct *mm, pmd_t *pmds[])
+{
+	int i;
+
+	for(i = 0; i < PREALLOCATED_PMDS; i++)
+		if (pmds[i]) {
+			pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
+			free_page((unsigned long)pmds[i]);
+			mm_dec_nr_pmds(mm);
+		}
+}
+
+static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
+{
+	int i;
+	bool failed = false;
+
+	for(i = 0; i < PREALLOCATED_PMDS; i++) {
+		pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
+		if (!pmd)
+			failed = true;
+		if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
+			free_page((unsigned long)pmd);
+			pmd = NULL;
+			failed = true;
+		}
+		if (pmd)
+			mm_inc_nr_pmds(mm);
+		pmds[i] = pmd;
+	}
+
+	if (failed) {
+		free_pmds(mm, pmds);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Mop up any pmd pages which may still be attached to the pgd.
+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
+ * preallocate which never got a corresponding vma will need to be
+ * freed manually.
+ */
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
+{
+	int i;
+
+	for(i = 0; i < PREALLOCATED_PMDS; i++) {
+		pgd_t pgd = pgdp[i];
+
+		if (pgd_val(pgd) != 0) {
+			pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+
+			pgdp[i] = native_make_pgd(0);
+
+			paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
+			pmd_free(mm, pmd);
+			mm_dec_nr_pmds(mm);
+		}
+	}
+}
+
+static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
+{
+	pud_t *pud;
+	int i;
+
+	if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
+		return;
+
+	pud = pud_offset(pgd, 0);
+
+	for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
+		pmd_t *pmd = pmds[i];
+
+		if (i >= KERNEL_PGD_BOUNDARY)
+			memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
+			       sizeof(pmd_t) * PTRS_PER_PMD);
+
+		pud_populate(mm, pud, pmd);
+	}
+}
+
+/*
+ * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also
+ * assumes that pgd should be in one page.
+ *
+ * But kernel with PAE paging that is not running as a Xen domain
+ * only needs to allocate 32 bytes for pgd instead of one page.
+ */
+#ifdef CONFIG_X86_PAE
+
+#include <linux/slab.h>
+
+#define PGD_SIZE	(PTRS_PER_PGD * sizeof(pgd_t))
+#define PGD_ALIGN	32
+
+static struct kmem_cache *pgd_cache;
+
+static int __init pgd_cache_init(void)
+{
+	/*
+	 * When PAE kernel is running as a Xen domain, it does not use
+	 * shared kernel pmd. And this requires a whole page for pgd.
+	 */
+	if (!SHARED_KERNEL_PMD)
+		return 0;
+
+	/*
+	 * when PAE kernel is not running as a Xen domain, it uses
+	 * shared kernel pmd. Shared kernel pmd does not require a whole
+	 * page for pgd. We are able to just allocate a 32-byte for pgd.
+	 * During boot time, we create a 32-byte slab for pgd table allocation.
+	 */
+	pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
+				      SLAB_PANIC, NULL);
+	if (!pgd_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+core_initcall(pgd_cache_init);
+
+static inline pgd_t *_pgd_alloc(void)
+{
+	/*
+	 * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain.
+	 * We allocate one page for pgd.
+	 */
+	if (!SHARED_KERNEL_PMD)
+		return (pgd_t *)__get_free_page(PGALLOC_GFP);
+
+	/*
+	 * Now PAE kernel is not running as a Xen domain. We can allocate
+	 * a 32-byte slab for pgd to save memory space.
+	 */
+	return kmem_cache_alloc(pgd_cache, PGALLOC_GFP);
+}
+
+static inline void _pgd_free(pgd_t *pgd)
+{
+	if (!SHARED_KERNEL_PMD)
+		free_page((unsigned long)pgd);
+	else
+		kmem_cache_free(pgd_cache, pgd);
+}
+#else
+static inline pgd_t *_pgd_alloc(void)
+{
+	return (pgd_t *)__get_free_page(PGALLOC_GFP);
+}
+
+static inline void _pgd_free(pgd_t *pgd)
+{
+	free_page((unsigned long)pgd);
+}
+#endif /* CONFIG_X86_PAE */
+
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+	pgd_t *pgd;
+	pmd_t *pmds[PREALLOCATED_PMDS];
+
+	pgd = _pgd_alloc();
+
+	if (pgd == NULL)
+		goto out;
+
+	mm->pgd = pgd;
+
+	if (preallocate_pmds(mm, pmds) != 0)
+		goto out_free_pgd;
+
+	if (paravirt_pgd_alloc(mm) != 0)
+		goto out_free_pmds;
+
+	/*
+	 * Make sure that pre-populating the pmds is atomic with
+	 * respect to anything walking the pgd_list, so that they
+	 * never see a partially populated pgd.
+	 */
+	spin_lock(&pgd_lock);
+
+	pgd_ctor(mm, pgd);
+	pgd_prepopulate_pmd(mm, pgd, pmds);
+
+	spin_unlock(&pgd_lock);
+
+	return pgd;
+
+out_free_pmds:
+	free_pmds(mm, pmds);
+out_free_pgd:
+	_pgd_free(pgd);
+out:
+	return NULL;
+}
+
+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+	pgd_mop_up_pmds(mm, pgd);
+	pgd_dtor(pgd);
+	paravirt_pgd_free(mm, pgd);
+	_pgd_free(pgd);
+}
+
+/*
+ * Used to set accessed or dirty bits in the page table entries
+ * on other architectures. On x86, the accessed and dirty bits
+ * are tracked by hardware. However, do_wp_page calls this function
+ * to also make the pte writeable at the same time the dirty bit is
+ * set. In that case we do actually need to write the PTE.
+ */
+int ptep_set_access_flags(struct vm_area_struct *vma,
+			  unsigned long address, pte_t *ptep,
+			  pte_t entry, int dirty)
+{
+	int changed = !pte_same(*ptep, entry);
+
+	if (changed && dirty) {
+		*ptep = entry;
+		pte_update_defer(vma->vm_mm, address, ptep);
+	}
+
+	return changed;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_set_access_flags(struct vm_area_struct *vma,
+			  unsigned long address, pmd_t *pmdp,
+			  pmd_t entry, int dirty)
+{
+	int changed = !pmd_same(*pmdp, entry);
+
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+	if (changed && dirty) {
+		*pmdp = entry;
+		pmd_update_defer(vma->vm_mm, address, pmdp);
+		/*
+		 * We had a write-protection fault here and changed the pmd
+		 * to to more permissive. No need to flush the TLB for that,
+		 * #PF is architecturally guaranteed to do that and in the
+		 * worst-case we'll generate a spurious fault.
+		 */
+	}
+
+	return changed;
+}
+#endif
+
+int ptep_test_and_clear_young(struct vm_area_struct *vma,
+			      unsigned long addr, pte_t *ptep)
+{
+	int ret = 0;
+
+	if (pte_young(*ptep))
+		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+					 (unsigned long *) &ptep->pte);
+
+	if (ret)
+		pte_update(vma->vm_mm, addr, ptep);
+
+	return ret;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+			      unsigned long addr, pmd_t *pmdp)
+{
+	int ret = 0;
+
+	if (pmd_young(*pmdp))
+		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+					 (unsigned long *)pmdp);
+
+	if (ret)
+		pmd_update(vma->vm_mm, addr, pmdp);
+
+	return ret;
+}
+#endif
+
+int ptep_clear_flush_young(struct vm_area_struct *vma,
+			   unsigned long address, pte_t *ptep)
+{
+	/*
+	 * On x86 CPUs, clearing the accessed bit without a TLB flush
+	 * doesn't cause data corruption. [ It could cause incorrect
+	 * page aging and the (mistaken) reclaim of hot pages, but the
+	 * chance of that should be relatively low. ]
+	 *
+	 * So as a performance optimization don't flush the TLB when
+	 * clearing the accessed bit, it will eventually be flushed by
+	 * a context switch or a VM operation anyway. [ In the rare
+	 * event of it not getting flushed for a long time the delay
+	 * shouldn't really matter because there's no real memory
+	 * pressure for swapout to react to. ]
+	 */
+	return ptep_test_and_clear_young(vma, address, ptep);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+			   unsigned long address, pmd_t *pmdp)
+{
+	int young;
+
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+	young = pmdp_test_and_clear_young(vma, address, pmdp);
+	if (young)
+		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+
+	return young;
+}
+
+void pmdp_splitting_flush(struct vm_area_struct *vma,
+			  unsigned long address, pmd_t *pmdp)
+{
+	int set;
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+	set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
+				(unsigned long *)pmdp);
+	if (set) {
+		pmd_update(vma->vm_mm, address, pmdp);
+		/* need tlb flush only to serialize against gup-fast */
+		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+	}
+}
+#endif
+
+/**
+ * reserve_top_address - reserves a hole in the top of kernel address space
+ * @reserve - size of hole to reserve
+ *
+ * Can be used to relocate the fixmap area and poke a hole in the top
+ * of kernel address space to make room for a hypervisor.
+ */
+void __init reserve_top_address(unsigned long reserve)
+{
+#ifdef CONFIG_X86_32
+	BUG_ON(fixmaps_set > 0);
+	__FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
+	printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
+	       -reserve, __FIXADDR_TOP + PAGE_SIZE);
+#endif
+}
+
+int fixmaps_set;
+
+void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
+{
+	unsigned long address = __fix_to_virt(idx);
+
+	if (idx >= __end_of_fixed_addresses) {
+		BUG();
+		return;
+	}
+	set_pte_vaddr(address, pte);
+	fixmaps_set++;
+}
+
+void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
+		       pgprot_t flags)
+{
+	__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
+}
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
+{
+	u8 mtrr;
+
+	/*
+	 * Do not use a huge page when the range is covered by non-WB type
+	 * of MTRRs.
+	 */
+	mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE);
+	if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+		return 0;
+
+	prot = pgprot_4k_2_large(prot);
+
+	set_pte((pte_t *)pud, pfn_pte(
+		(u64)addr >> PAGE_SHIFT,
+		__pgprot(pgprot_val(prot) | _PAGE_PSE)));
+
+	return 1;
+}
+
+int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
+{
+	u8 mtrr;
+
+	/*
+	 * Do not use a huge page when the range is covered by non-WB type
+	 * of MTRRs.
+	 */
+	mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE);
+	if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+		return 0;
+
+	prot = pgprot_4k_2_large(prot);
+
+	set_pte((pte_t *)pmd, pfn_pte(
+		(u64)addr >> PAGE_SHIFT,
+		__pgprot(pgprot_val(prot) | _PAGE_PSE)));
+
+	return 1;
+}
+
+int pud_clear_huge(pud_t *pud)
+{
+	if (pud_large(*pud)) {
+		pud_clear(pud);
+		return 1;
+	}
+
+	return 0;
+}
+
+int pmd_clear_huge(pmd_t *pmd)
+{
+	if (pmd_large(*pmd)) {
+		pmd_clear(pmd);
+		return 1;
+	}
+
+	return 0;
+}
+#endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
new file mode 100644
index 000000000..75cc0978d
--- /dev/null
+++ b/arch/x86/mm/pgtable_32.c
@@ -0,0 +1,98 @@
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/nmi.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/fixmap.h>
+#include <asm/e820.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+
+unsigned int __VMALLOC_RESERVE = 128 << 20;
+
+/*
+ * Associate a virtual page frame with a given physical page frame 
+ * and protection flags for that frame.
+ */ 
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	pgd = swapper_pg_dir + pgd_index(vaddr);
+	if (pgd_none(*pgd)) {
+		BUG();
+		return;
+	}
+	pud = pud_offset(pgd, vaddr);
+	if (pud_none(*pud)) {
+		BUG();
+		return;
+	}
+	pmd = pmd_offset(pud, vaddr);
+	if (pmd_none(*pmd)) {
+		BUG();
+		return;
+	}
+	pte = pte_offset_kernel(pmd, vaddr);
+	if (pte_val(pteval))
+		set_pte_at(&init_mm, vaddr, pte, pteval);
+	else
+		pte_clear(&init_mm, vaddr, pte);
+
+	/*
+	 * It's enough to flush this one mapping.
+	 * (PGE mappings get flushed as well)
+	 */
+	__flush_tlb_one(vaddr);
+}
+
+unsigned long __FIXADDR_TOP = 0xfffff000;
+EXPORT_SYMBOL(__FIXADDR_TOP);
+
+/*
+ * vmalloc=size forces the vmalloc area to be exactly 'size'
+ * bytes. This can be used to increase (or decrease) the
+ * vmalloc area - the default is 128m.
+ */
+static int __init parse_vmalloc(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+
+	/* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/
+	__VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET;
+	return 0;
+}
+early_param("vmalloc", parse_vmalloc);
+
+/*
+ * reservetop=size reserves a hole at the top of the kernel address space which
+ * a hypervisor can load into later.  Needed for dynamically loaded hypervisors,
+ * so relocating the fixmap can be done before paging initialization.
+ */
+static int __init parse_reservetop(char *arg)
+{
+	unsigned long address;
+
+	if (!arg)
+		return -EINVAL;
+
+	address = memparse(arg, &arg);
+	reserve_top_address(address);
+	early_ioremap_init();
+	return 0;
+}
+early_param("reservetop", parse_reservetop);
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
new file mode 100644
index 000000000..e666cbbb9
--- /dev/null
+++ b/arch/x86/mm/physaddr.c
@@ -0,0 +1,98 @@
+#include <linux/bootmem.h>
+#include <linux/mmdebug.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+
+#include <asm/page.h>
+
+#include "physaddr.h"
+
+#ifdef CONFIG_X86_64
+
+#ifdef CONFIG_DEBUG_VIRTUAL
+unsigned long __phys_addr(unsigned long x)
+{
+	unsigned long y = x - __START_KERNEL_map;
+
+	/* use the carry flag to determine if x was < __START_KERNEL_map */
+	if (unlikely(x > y)) {
+		x = y + phys_base;
+
+		VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
+	} else {
+		x = y + (__START_KERNEL_map - PAGE_OFFSET);
+
+		/* carry flag will be set if starting x was >= PAGE_OFFSET */
+		VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x));
+	}
+
+	return x;
+}
+EXPORT_SYMBOL(__phys_addr);
+
+unsigned long __phys_addr_symbol(unsigned long x)
+{
+	unsigned long y = x - __START_KERNEL_map;
+
+	/* only check upper bounds since lower bounds will trigger carry */
+	VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
+
+	return y + phys_base;
+}
+EXPORT_SYMBOL(__phys_addr_symbol);
+#endif
+
+bool __virt_addr_valid(unsigned long x)
+{
+	unsigned long y = x - __START_KERNEL_map;
+
+	/* use the carry flag to determine if x was < __START_KERNEL_map */
+	if (unlikely(x > y)) {
+		x = y + phys_base;
+
+		if (y >= KERNEL_IMAGE_SIZE)
+			return false;
+	} else {
+		x = y + (__START_KERNEL_map - PAGE_OFFSET);
+
+		/* carry flag will be set if starting x was >= PAGE_OFFSET */
+		if ((x > y) || !phys_addr_valid(x))
+			return false;
+	}
+
+	return pfn_valid(x >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(__virt_addr_valid);
+
+#else
+
+#ifdef CONFIG_DEBUG_VIRTUAL
+unsigned long __phys_addr(unsigned long x)
+{
+	unsigned long phys_addr = x - PAGE_OFFSET;
+	/* VMALLOC_* aren't constants  */
+	VIRTUAL_BUG_ON(x < PAGE_OFFSET);
+	VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
+	/* max_low_pfn is set early, but not _that_ early */
+	if (max_low_pfn) {
+		VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn);
+		BUG_ON(slow_virt_to_phys((void *)x) != phys_addr);
+	}
+	return phys_addr;
+}
+EXPORT_SYMBOL(__phys_addr);
+#endif
+
+bool __virt_addr_valid(unsigned long x)
+{
+	if (x < PAGE_OFFSET)
+		return false;
+	if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
+		return false;
+	if (x >= FIXADDR_START)
+		return false;
+	return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(__virt_addr_valid);
+
+#endif	/* CONFIG_X86_64 */
diff --git a/arch/x86/mm/physaddr.h b/arch/x86/mm/physaddr.h
new file mode 100644
index 000000000..a3cd5a0c9
--- /dev/null
+++ b/arch/x86/mm/physaddr.h
@@ -0,0 +1,10 @@
+#include <asm/processor.h>
+
+static inline int phys_addr_valid(resource_size_t addr)
+{
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+	return !(addr >> boot_cpu_data.x86_phys_bits);
+#else
+	return 1;
+#endif
+}
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
new file mode 100644
index 000000000..90555bf60
--- /dev/null
+++ b/arch/x86/mm/setup_nx.c
@@ -0,0 +1,60 @@
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+
+#include <asm/pgtable.h>
+#include <asm/proto.h>
+
+static int disable_nx;
+
+/*
+ * noexec = on|off
+ *
+ * Control non-executable mappings for processes.
+ *
+ * on      Enable
+ * off     Disable
+ */
+static int __init noexec_setup(char *str)
+{
+	if (!str)
+		return -EINVAL;
+	if (!strncmp(str, "on", 2)) {
+		disable_nx = 0;
+	} else if (!strncmp(str, "off", 3)) {
+		disable_nx = 1;
+	}
+	x86_configure_nx();
+	return 0;
+}
+early_param("noexec", noexec_setup);
+
+void x86_configure_nx(void)
+{
+	if (cpu_has_nx && !disable_nx)
+		__supported_pte_mask |= _PAGE_NX;
+	else
+		__supported_pte_mask &= ~_PAGE_NX;
+}
+
+void __init x86_report_nx(void)
+{
+	if (!cpu_has_nx) {
+		printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
+		       "missing in CPU!\n");
+	} else {
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+		if (disable_nx) {
+			printk(KERN_INFO "NX (Execute Disable) protection: "
+			       "disabled by kernel command line option\n");
+		} else {
+			printk(KERN_INFO "NX (Execute Disable) protection: "
+			       "active\n");
+		}
+#else
+		/* 32bit non-PAE kernel, NX cannot be used */
+		printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
+		       "cannot be enabled: non-PAE kernel!\n");
+#endif
+	}
+}
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
new file mode 100644
index 000000000..66338a60a
--- /dev/null
+++ b/arch/x86/mm/srat.c
@@ -0,0 +1,222 @@
+/*
+ * ACPI 3.0 based NUMA setup
+ * Copyright 2004 Andi Kleen, SuSE Labs.
+ *
+ * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
+ *
+ * Called from acpi_numa_init while reading the SRAT and SLIT tables.
+ * Assumes all memory regions belonging to a single proximity domain
+ * are in one chunk. Holes between them will be included in the node.
+ */
+
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/mmzone.h>
+#include <linux/bitmap.h>
+#include <linux/module.h>
+#include <linux/topology.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/mm.h>
+#include <asm/proto.h>
+#include <asm/numa.h>
+#include <asm/e820.h>
+#include <asm/apic.h>
+#include <asm/uv/uv.h>
+
+int acpi_numa __initdata;
+
+static __init int setup_node(int pxm)
+{
+	return acpi_map_pxm_to_node(pxm);
+}
+
+static __init void bad_srat(void)
+{
+	printk(KERN_ERR "SRAT: SRAT not used.\n");
+	acpi_numa = -1;
+}
+
+static __init inline int srat_disabled(void)
+{
+	return acpi_numa < 0;
+}
+
+/*
+ * Callback for SLIT parsing.  pxm_to_node() returns NUMA_NO_NODE for
+ * I/O localities since SRAT does not list them.  I/O localities are
+ * not supported at this point.
+ */
+void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
+{
+	int i, j;
+
+	for (i = 0; i < slit->locality_count; i++) {
+		const int from_node = pxm_to_node(i);
+
+		if (from_node == NUMA_NO_NODE)
+			continue;
+
+		for (j = 0; j < slit->locality_count; j++) {
+			const int to_node = pxm_to_node(j);
+
+			if (to_node == NUMA_NO_NODE)
+				continue;
+
+			numa_set_distance(from_node, to_node,
+				slit->entry[slit->locality_count * i + j]);
+		}
+	}
+}
+
+/* Callback for Proximity Domain -> x2APIC mapping */
+void __init
+acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
+{
+	int pxm, node;
+	int apic_id;
+
+	if (srat_disabled())
+		return;
+	if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
+		bad_srat();
+		return;
+	}
+	if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
+		return;
+	pxm = pa->proximity_domain;
+	apic_id = pa->apic_id;
+	if (!apic->apic_id_valid(apic_id)) {
+		printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n",
+			 pxm, apic_id);
+		return;
+	}
+	node = setup_node(pxm);
+	if (node < 0) {
+		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
+		bad_srat();
+		return;
+	}
+
+	if (apic_id >= MAX_LOCAL_APIC) {
+		printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+		return;
+	}
+	set_apicid_to_node(apic_id, node);
+	node_set(node, numa_nodes_parsed);
+	acpi_numa = 1;
+	printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
+	       pxm, apic_id, node);
+}
+
+/* Callback for Proximity Domain -> LAPIC mapping */
+void __init
+acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
+{
+	int pxm, node;
+	int apic_id;
+
+	if (srat_disabled())
+		return;
+	if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
+		bad_srat();
+		return;
+	}
+	if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
+		return;
+	pxm = pa->proximity_domain_lo;
+	if (acpi_srat_revision >= 2)
+		pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8;
+	node = setup_node(pxm);
+	if (node < 0) {
+		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
+		bad_srat();
+		return;
+	}
+
+	if (get_uv_system_type() >= UV_X2APIC)
+		apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
+	else
+		apic_id = pa->apic_id;
+
+	if (apic_id >= MAX_LOCAL_APIC) {
+		printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+		return;
+	}
+
+	set_apicid_to_node(apic_id, node);
+	node_set(node, numa_nodes_parsed);
+	acpi_numa = 1;
+	printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
+	       pxm, apic_id, node);
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static inline int save_add_info(void) {return 1;}
+#else
+static inline int save_add_info(void) {return 0;}
+#endif
+
+/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
+int __init
+acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
+{
+	u64 start, end;
+	u32 hotpluggable;
+	int node, pxm;
+
+	if (srat_disabled())
+		goto out_err;
+	if (ma->header.length != sizeof(struct acpi_srat_mem_affinity))
+		goto out_err_bad_srat;
+	if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
+		goto out_err;
+	hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE;
+	if (hotpluggable && !save_add_info())
+		goto out_err;
+
+	start = ma->base_address;
+	end = start + ma->length;
+	pxm = ma->proximity_domain;
+	if (acpi_srat_revision <= 1)
+		pxm &= 0xff;
+
+	node = setup_node(pxm);
+	if (node < 0) {
+		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
+		goto out_err_bad_srat;
+	}
+
+	if (numa_add_memblk(node, start, end) < 0)
+		goto out_err_bad_srat;
+
+	node_set(node, numa_nodes_parsed);
+
+	pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s\n",
+		node, pxm,
+		(unsigned long long) start, (unsigned long long) end - 1,
+		hotpluggable ? " hotplug" : "");
+
+	/* Mark hotplug range in memblock. */
+	if (hotpluggable && memblock_mark_hotplug(start, ma->length))
+		pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n",
+			(unsigned long long)start, (unsigned long long)end - 1);
+
+	return 0;
+out_err_bad_srat:
+	bad_srat();
+out_err:
+	return -1;
+}
+
+void __init acpi_numa_arch_fixup(void) {}
+
+int __init x86_acpi_numa_init(void)
+{
+	int ret;
+
+	ret = acpi_numa_init();
+	if (ret < 0)
+		return ret;
+	return srat_disabled() ? -EINVAL : 0;
+}
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
new file mode 100644
index 000000000..38868adf0
--- /dev/null
+++ b/arch/x86/mm/testmmiotrace.c
@@ -0,0 +1,140 @@
+/*
+ * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/io.h>
+#include <linux/mmiotrace.h>
+
+static unsigned long mmio_address;
+module_param(mmio_address, ulong, 0);
+MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB "
+				"(or 8 MB if read_far is non-zero).");
+
+static unsigned long read_far = 0x400100;
+module_param(read_far, ulong, 0);
+MODULE_PARM_DESC(read_far, " Offset of a 32-bit read within 8 MB "
+				"(default: 0x400100).");
+
+static unsigned v16(unsigned i)
+{
+	return i * 12 + 7;
+}
+
+static unsigned v32(unsigned i)
+{
+	return i * 212371 + 13;
+}
+
+static void do_write_test(void __iomem *p)
+{
+	unsigned int i;
+	pr_info("write test.\n");
+	mmiotrace_printk("Write test.\n");
+
+	for (i = 0; i < 256; i++)
+		iowrite8(i, p + i);
+
+	for (i = 1024; i < (5 * 1024); i += 2)
+		iowrite16(v16(i), p + i);
+
+	for (i = (5 * 1024); i < (16 * 1024); i += 4)
+		iowrite32(v32(i), p + i);
+}
+
+static void do_read_test(void __iomem *p)
+{
+	unsigned int i;
+	unsigned errs[3] = { 0 };
+	pr_info("read test.\n");
+	mmiotrace_printk("Read test.\n");
+
+	for (i = 0; i < 256; i++)
+		if (ioread8(p + i) != i)
+			++errs[0];
+
+	for (i = 1024; i < (5 * 1024); i += 2)
+		if (ioread16(p + i) != v16(i))
+			++errs[1];
+
+	for (i = (5 * 1024); i < (16 * 1024); i += 4)
+		if (ioread32(p + i) != v32(i))
+			++errs[2];
+
+	mmiotrace_printk("Read errors: 8-bit %d, 16-bit %d, 32-bit %d.\n",
+						errs[0], errs[1], errs[2]);
+}
+
+static void do_read_far_test(void __iomem *p)
+{
+	pr_info("read far test.\n");
+	mmiotrace_printk("Read far test.\n");
+
+	ioread32(p + read_far);
+}
+
+static void do_test(unsigned long size)
+{
+	void __iomem *p = ioremap_nocache(mmio_address, size);
+	if (!p) {
+		pr_err("could not ioremap, aborting.\n");
+		return;
+	}
+	mmiotrace_printk("ioremap returned %p.\n", p);
+	do_write_test(p);
+	do_read_test(p);
+	if (read_far && read_far < size - 4)
+		do_read_far_test(p);
+	iounmap(p);
+}
+
+/*
+ * Tests how mmiotrace behaves in face of multiple ioremap / iounmaps in
+ * a short time. We had a bug in deferred freeing procedure which tried
+ * to free this region multiple times (ioremap can reuse the same address
+ * for many mappings).
+ */
+static void do_test_bulk_ioremapping(void)
+{
+	void __iomem *p;
+	int i;
+
+	for (i = 0; i < 10; ++i) {
+		p = ioremap_nocache(mmio_address, PAGE_SIZE);
+		if (p)
+			iounmap(p);
+	}
+
+	/* Force freeing. If it will crash we will know why. */
+	synchronize_rcu();
+}
+
+static int __init init(void)
+{
+	unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
+
+	if (mmio_address == 0) {
+		pr_err("you have to use the module argument mmio_address.\n");
+		pr_err("DO NOT LOAD THIS MODULE UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!\n");
+		return -ENXIO;
+	}
+
+	pr_warning("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, "
+		   "and writing 16 kB of rubbish in there.\n",
+		   size >> 10, mmio_address);
+	do_test(size);
+	do_test_bulk_ioremapping();
+	pr_info("All done.\n");
+	return 0;
+}
+
+static void __exit cleanup(void)
+{
+	pr_debug("unloaded.\n");
+}
+
+module_init(init);
+module_exit(cleanup);
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
new file mode 100644
index 000000000..3250f2371
--- /dev/null
+++ b/arch/x86/mm/tlb.c
@@ -0,0 +1,325 @@
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/cpu.h>
+
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/cache.h>
+#include <asm/apic.h>
+#include <asm/uv/uv.h>
+#include <linux/debugfs.h>
+
+/*
+ *	Smarter SMP flushing macros.
+ *		c/o Linus Torvalds.
+ *
+ *	These mean you can really definitely utterly forget about
+ *	writing to user space from interrupts. (Its not allowed anyway).
+ *
+ *	Optimizations Manfred Spraul <manfred@colorfullife.com>
+ *
+ *	More scalable flush, from Andi Kleen
+ *
+ *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
+ */
+
+struct flush_tlb_info {
+	struct mm_struct *flush_mm;
+	unsigned long flush_start;
+	unsigned long flush_end;
+};
+
+/*
+ * We cannot call mmdrop() because we are in interrupt context,
+ * instead update mm->cpu_vm_mask.
+ */
+void leave_mm(int cpu)
+{
+	struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm);
+	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+		BUG();
+	if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
+		cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
+		load_cr3(swapper_pg_dir);
+		/*
+		 * This gets called in the idle path where RCU
+		 * functions differently.  Tracing normally
+		 * uses RCU, so we have to call the tracepoint
+		 * specially here.
+		 */
+		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+	}
+}
+EXPORT_SYMBOL_GPL(leave_mm);
+
+/*
+ * The flush IPI assumes that a thread switch happens in this order:
+ * [cpu0: the cpu that switches]
+ * 1) switch_mm() either 1a) or 1b)
+ * 1a) thread switch to a different mm
+ * 1a1) set cpu_tlbstate to TLBSTATE_OK
+ *	Now the tlb flush NMI handler flush_tlb_func won't call leave_mm
+ *	if cpu0 was in lazy tlb mode.
+ * 1a2) update cpu active_mm
+ *	Now cpu0 accepts tlb flushes for the new mm.
+ * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);
+ *	Now the other cpus will send tlb flush ipis.
+ * 1a4) change cr3.
+ * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask);
+ *	Stop ipi delivery for the old mm. This is not synchronized with
+ *	the other cpus, but flush_tlb_func ignore flush ipis for the wrong
+ *	mm, and in the worst case we perform a superfluous tlb flush.
+ * 1b) thread switch without mm change
+ *	cpu active_mm is correct, cpu0 already handles flush ipis.
+ * 1b1) set cpu_tlbstate to TLBSTATE_OK
+ * 1b2) test_and_set the cpu bit in cpu_vm_mask.
+ *	Atomically set the bit [other cpus will start sending flush ipis],
+ *	and test the bit.
+ * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
+ * 2) switch %%esp, ie current
+ *
+ * The interrupt must handle 2 special cases:
+ * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
+ * - the cpu performs speculative tlb reads, i.e. even if the cpu only
+ *   runs in kernel space, the cpu could load tlb entries for user space
+ *   pages.
+ *
+ * The good news is that cpu_tlbstate is local to each cpu, no
+ * write/read ordering problems.
+ */
+
+/*
+ * TLB flush funcation:
+ * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
+ * 2) Leave the mm if we are in the lazy tlb mode.
+ */
+static void flush_tlb_func(void *info)
+{
+	struct flush_tlb_info *f = info;
+
+	inc_irq_stat(irq_tlb_count);
+
+	if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
+		return;
+	if (!f->flush_end)
+		f->flush_end = f->flush_start + PAGE_SIZE;
+
+	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
+		if (f->flush_end == TLB_FLUSH_ALL) {
+			local_flush_tlb();
+			trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
+		} else {
+			unsigned long addr;
+			unsigned long nr_pages =
+				f->flush_end - f->flush_start / PAGE_SIZE;
+			addr = f->flush_start;
+			while (addr < f->flush_end) {
+				__flush_tlb_single(addr);
+				addr += PAGE_SIZE;
+			}
+			trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
+		}
+	} else
+		leave_mm(smp_processor_id());
+
+}
+
+void native_flush_tlb_others(const struct cpumask *cpumask,
+				 struct mm_struct *mm, unsigned long start,
+				 unsigned long end)
+{
+	struct flush_tlb_info info;
+	info.flush_mm = mm;
+	info.flush_start = start;
+	info.flush_end = end;
+
+	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
+	if (is_uv_system()) {
+		unsigned int cpu;
+
+		cpu = smp_processor_id();
+		cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
+		if (cpumask)
+			smp_call_function_many(cpumask, flush_tlb_func,
+								&info, 1);
+		return;
+	}
+	smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
+}
+
+void flush_tlb_current_task(void)
+{
+	struct mm_struct *mm = current->mm;
+
+	preempt_disable();
+
+	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+	local_flush_tlb();
+	trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
+	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
+	preempt_enable();
+}
+
+/*
+ * See Documentation/x86/tlb.txt for details.  We choose 33
+ * because it is large enough to cover the vast majority (at
+ * least 95%) of allocations, and is small enough that we are
+ * confident it will not cause too much overhead.  Each single
+ * flush is about 100 ns, so this caps the maximum overhead at
+ * _about_ 3,000 ns.
+ *
+ * This is in units of pages.
+ */
+static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+
+void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
+				unsigned long end, unsigned long vmflag)
+{
+	unsigned long addr;
+	/* do a global flush by default */
+	unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
+
+	preempt_disable();
+	if (current->active_mm != mm)
+		goto out;
+
+	if (!current->mm) {
+		leave_mm(smp_processor_id());
+		goto out;
+	}
+
+	if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
+		base_pages_to_flush = (end - start) >> PAGE_SHIFT;
+
+	if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
+		base_pages_to_flush = TLB_FLUSH_ALL;
+		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+		local_flush_tlb();
+	} else {
+		/* flush range by one by one 'invlpg' */
+		for (addr = start; addr < end;	addr += PAGE_SIZE) {
+			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
+			__flush_tlb_single(addr);
+		}
+	}
+	trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush);
+out:
+	if (base_pages_to_flush == TLB_FLUSH_ALL) {
+		start = 0UL;
+		end = TLB_FLUSH_ALL;
+	}
+	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+		flush_tlb_others(mm_cpumask(mm), mm, start, end);
+	preempt_enable();
+}
+
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	preempt_disable();
+
+	if (current->active_mm == mm) {
+		if (current->mm)
+			__flush_tlb_one(start);
+		else
+			leave_mm(smp_processor_id());
+	}
+
+	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
+		flush_tlb_others(mm_cpumask(mm), mm, start, 0UL);
+
+	preempt_enable();
+}
+
+static void do_flush_tlb_all(void *info)
+{
+	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+	__flush_tlb_all();
+	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
+		leave_mm(smp_processor_id());
+}
+
+void flush_tlb_all(void)
+{
+	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
+	on_each_cpu(do_flush_tlb_all, NULL, 1);
+}
+
+static void do_kernel_range_flush(void *info)
+{
+	struct flush_tlb_info *f = info;
+	unsigned long addr;
+
+	/* flush range by one by one 'invlpg' */
+	for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE)
+		__flush_tlb_single(addr);
+}
+
+void flush_tlb_kernel_range(unsigned long start, unsigned long end)
+{
+
+	/* Balance as user space task's flush, a bit conservative */
+	if (end == TLB_FLUSH_ALL ||
+	    (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
+		on_each_cpu(do_flush_tlb_all, NULL, 1);
+	} else {
+		struct flush_tlb_info info;
+		info.flush_start = start;
+		info.flush_end = end;
+		on_each_cpu(do_kernel_range_flush, &info, 1);
+	}
+}
+
+static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
+			     size_t count, loff_t *ppos)
+{
+	char buf[32];
+	unsigned int len;
+
+	len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
+	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static ssize_t tlbflush_write_file(struct file *file,
+		 const char __user *user_buf, size_t count, loff_t *ppos)
+{
+	char buf[32];
+	ssize_t len;
+	int ceiling;
+
+	len = min(count, sizeof(buf) - 1);
+	if (copy_from_user(buf, user_buf, len))
+		return -EFAULT;
+
+	buf[len] = '\0';
+	if (kstrtoint(buf, 0, &ceiling))
+		return -EINVAL;
+
+	if (ceiling < 0)
+		return -EINVAL;
+
+	tlb_single_page_flush_ceiling = ceiling;
+	return count;
+}
+
+static const struct file_operations fops_tlbflush = {
+	.read = tlbflush_read_file,
+	.write = tlbflush_write_file,
+	.llseek = default_llseek,
+};
+
+static int __init create_tlb_single_page_flush_ceiling(void)
+{
+	debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
+			    arch_debugfs_dir, NULL, &fops_tlbflush);
+	return 0;
+}
+late_initcall(create_tlb_single_page_flush_ceiling);