summaryrefslogtreecommitdiff
path: root/arch/x86/mm
diff options
context:
space:
mode:
authorAndré Fabian Silva Delgado <emulatorman@parabola.nu>2016-10-20 00:10:27 -0300
committerAndré Fabian Silva Delgado <emulatorman@parabola.nu>2016-10-20 00:10:27 -0300
commitd0b2f91bede3bd5e3d24dd6803e56eee959c1797 (patch)
tree7fee4ab0509879c373c4f2cbd5b8a5be5b4041ee /arch/x86/mm
parente914f8eb445e8f74b00303c19c2ffceaedd16a05 (diff)
Linux-libre 4.8.2-gnupck-4.8.2-gnu
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/Makefile1
-rw-r--r--arch/x86/mm/amdtopology.c1
-rw-r--r--arch/x86/mm/dump_pagetables.c22
-rw-r--r--arch/x86/mm/extable.c15
-rw-r--r--arch/x86/mm/fault.c113
-rw-r--r--arch/x86/mm/highmem_32.c2
-rw-r--r--arch/x86/mm/ident_map.c19
-rw-r--r--arch/x86/mm/init.c28
-rw-r--r--arch/x86/mm/init_32.c1
-rw-r--r--arch/x86/mm/init_64.c205
-rw-r--r--arch/x86/mm/iomap_32.c2
-rw-r--r--arch/x86/mm/ioremap.c1
-rw-r--r--arch/x86/mm/kaslr.c172
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c1
-rw-r--r--arch/x86/mm/kmemcheck/shadow.c2
-rw-r--r--arch/x86/mm/kmmio.c2
-rw-r--r--arch/x86/mm/mmio-mod.c2
-rw-r--r--arch/x86/mm/numa.c3
-rw-r--r--arch/x86/mm/numa_32.c2
-rw-r--r--arch/x86/mm/pageattr.c49
-rw-r--r--arch/x86/mm/pat.c6
-rw-r--r--arch/x86/mm/pat_rbtree.c1
-rw-r--r--arch/x86/mm/pf_in.c1
-rw-r--r--arch/x86/mm/pgtable.c10
-rw-r--r--arch/x86/mm/pgtable_32.c3
-rw-r--r--arch/x86/mm/physaddr.c2
-rw-r--r--arch/x86/mm/srat.c118
-rw-r--r--arch/x86/mm/tlb.c2
28 files changed, 375 insertions, 411 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 62c0043a5..96d2b847e 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -37,4 +37,5 @@ obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
+obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index 2ca15b59f..ba47524f5 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -9,7 +9,6 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/string.h>
-#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 99bfb1928..ea9c49ada 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -14,7 +14,7 @@
#include <linux/debugfs.h>
#include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/seq_file.h>
#include <asm/pgtable.h>
@@ -72,9 +72,9 @@ static struct addr_marker address_markers[] = {
{ 0, "User Space" },
#ifdef CONFIG_X86_64
{ 0x8000000000000000UL, "Kernel Space" },
- { PAGE_OFFSET, "Low Kernel Mapping" },
- { VMALLOC_START, "vmalloc() Area" },
- { VMEMMAP_START, "Vmemmap" },
+ { 0/* PAGE_OFFSET */, "Low Kernel Mapping" },
+ { 0/* VMALLOC_START */, "vmalloc() Area" },
+ { 0/* VMEMMAP_START */, "Vmemmap" },
# ifdef CONFIG_X86_ESPFIX64
{ ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
# endif
@@ -434,8 +434,16 @@ void ptdump_walk_pgd_level_checkwx(void)
static int __init pt_dump_init(void)
{
+ /*
+ * Various markers are not compile-time constants, so assign them
+ * here.
+ */
+#ifdef CONFIG_X86_64
+ address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET;
+ address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
+ address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
+#endif
#ifdef CONFIG_X86_32
- /* Not a compile-time constant on x86-32 */
address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
# ifdef CONFIG_HIGHMEM
@@ -446,8 +454,4 @@ static int __init pt_dump_init(void)
return 0;
}
-
__initcall(pt_dump_init);
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
-MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables");
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 4bb53b89f..832b98f82 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -1,6 +1,7 @@
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/traps.h>
+#include <asm/kdebug.h>
typedef bool (*ex_handler_t)(const struct exception_table_entry *,
struct pt_regs *, int);
@@ -37,7 +38,7 @@ bool ex_handler_ext(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
/* Special hack for uaccess_err */
- current_thread_info()->uaccess_err = 1;
+ current->thread.uaccess_err = 1;
regs->ip = ex_fixup_addr(fixup);
return true;
}
@@ -46,8 +47,9 @@ EXPORT_SYMBOL(ex_handler_ext);
bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
- WARN_ONCE(1, "unchecked MSR access error: RDMSR from 0x%x\n",
- (unsigned int)regs->cx);
+ if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pF)\n",
+ (unsigned int)regs->cx, regs->ip, (void *)regs->ip))
+ show_stack_regs(regs);
/* Pretend that the read succeeded and returned 0. */
regs->ip = ex_fixup_addr(fixup);
@@ -60,9 +62,10 @@ EXPORT_SYMBOL(ex_handler_rdmsr_unsafe);
bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
- WARN_ONCE(1, "unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x)\n",
- (unsigned int)regs->cx,
- (unsigned int)regs->dx, (unsigned int)regs->ax);
+ if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pF)\n",
+ (unsigned int)regs->cx, (unsigned int)regs->dx,
+ (unsigned int)regs->ax, regs->ip, (void *)regs->ip))
+ show_stack_regs(regs);
/* Pretend that the write succeeded. */
regs->ip = ex_fixup_addr(fixup);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 441a68fcd..dc8023060 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -13,7 +13,6 @@
#include <linux/hugetlb.h> /* hstate_index_to_shift */
#include <linux/prefetch.h> /* prefetchw */
#include <linux/context_tracking.h> /* exception_enter(), ... */
-#include <linux/tuxonice.h> /* incremental image support */
#include <linux/uaccess.h> /* faulthandler_disabled() */
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
@@ -440,7 +439,7 @@ static noinline int vmalloc_fault(unsigned long address)
* happen within a race in page table update. In the later
* case just flush:
*/
- pgd = pgd_offset(current->active_mm, address);
+ pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address);
pgd_ref = pgd_offset_k(address);
if (pgd_none(*pgd_ref))
return -1;
@@ -722,10 +721,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
/* No context means no VMA to pass down */
struct vm_area_struct *vma = NULL;
- if (toi_make_writable(init_mm.pgd, address)) {
- return;
- }
-
/* Are we prepared to handle this kernel fault? */
if (fixup_exception(regs, X86_TRAP_PF)) {
/*
@@ -742,7 +737,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
* In this case we need to make sure we're not recursively
* faulting through the emulate_vsyscall() logic.
*/
- if (current_thread_info()->sig_on_uaccess_error && signal) {
+ if (current->thread.sig_on_uaccess_err && signal) {
tsk->thread.trap_nr = X86_TRAP_PF;
tsk->thread.error_code = error_code | PF_USER;
tsk->thread.cr2 = address;
@@ -1013,101 +1008,10 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
}
}
-#ifdef CONFIG_TOI_INCREMENTAL
-/**
- * _toi_do_cbw - Do a copy-before-write before letting the faulting process continue
- */
-static void toi_do_cbw(struct page *page)
-{
- struct toi_cbw_state *state = this_cpu_ptr(&toi_cbw_states);
-
- state->active = 1;
- wmb();
-
- if (state->enabled && state->next && PageTOI_CBW(page)) {
- struct toi_cbw *this = state->next;
- memcpy(this->virt, page_address(page), PAGE_SIZE);
- this->pfn = page_to_pfn(page);
- state->next = this->next;
- }
-
- state->active = 0;
-}
-
-/**
- * _toi_make_writable - Defuse TOI's write protection
- */
-int _toi_make_writable(pte_t *pte)
-{
- struct page *page = pte_page(*pte);
- if (PageTOI_RO(page)) {
- pgd_t *pgd = __va(read_cr3());
- /*
- * If this is a TuxOnIce caused fault, we may not have permission to
- * write to a page needed to reset the permissions of the original
- * page. Use swapper_pg_dir to get around this.
- */
- load_cr3(swapper_pg_dir);
-
- set_pte_atomic(pte, pte_mkwrite(*pte));
- SetPageTOI_Dirty(page);
- ClearPageTOI_RO(page);
-
- toi_do_cbw(page);
-
- load_cr3(pgd);
- return 1;
- }
- return 0;
-}
-
-/**
- * toi_make_writable - Handle a (potential) fault caused by TOI's write protection
- *
- * Make a page writable that was protected. Might be because of a fault, or
- * because we're allocating it and want it to be untracked.
- *
- * Note that in the fault handling case, we don't care about the error code. If
- * called from the double fault handler, we won't have one. We just check to
- * see if the page was made RO by TOI, and mark it dirty/release the protection
- * if it was.
- */
-int toi_make_writable(pgd_t *pgd, unsigned long address)
-{
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- pgd = pgd + pgd_index(address);
- if (!pgd_present(*pgd))
- return 0;
-
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- return 0;
-
- if (pud_large(*pud))
- return _toi_make_writable((pte_t *) pud);
-
- pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd))
- return 0;
-
- if (pmd_large(*pmd))
- return _toi_make_writable((pte_t *) pmd);
-
- pte = pte_offset_kernel(pmd, address);
- if (!pte_present(*pte))
- return 0;
-
- return _toi_make_writable(pte);
-}
-#endif
-
static int spurious_fault_check(unsigned long error_code, pte_t *pte)
{
if ((error_code & PF_WRITE) && !pte_write(*pte))
- return 0;
+ return 0;
if ((error_code & PF_INSTR) && !pte_exec(*pte))
return 0;
@@ -1287,15 +1191,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
kmemcheck_hide(regs);
prefetchw(&mm->mmap_sem);
- /*
- * Detect and handle page faults due to TuxOnIce making pages read-only
- * so that it can create incremental images.
- *
- * Do it early to avoid double faults.
- */
- if (unlikely(toi_make_writable(init_mm.pgd, address)))
- return;
-
if (unlikely(kmmio_fault(regs, address)))
return;
@@ -1458,7 +1353,7 @@ good_area:
* the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
* we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
*/
- fault = handle_mm_fault(mm, vma, address, flags);
+ fault = handle_mm_fault(vma, address, flags);
major |= fault & VM_FAULT_MAJOR;
/*
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index a6d739258..6d18b70ed 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -1,5 +1,5 @@
#include <linux/highmem.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/swap.h> /* for totalram_pages */
#include <linux/bootmem.h>
diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index ec21796ac..4473cb4f8 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -3,15 +3,17 @@
* included by both the compressed kernel and the regular kernel.
*/
-static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page,
+static void ident_pmd_init(struct x86_mapping_info *info, pmd_t *pmd_page,
unsigned long addr, unsigned long end)
{
addr &= PMD_MASK;
for (; addr < end; addr += PMD_SIZE) {
pmd_t *pmd = pmd_page + pmd_index(addr);
- if (!pmd_present(*pmd))
- set_pmd(pmd, __pmd(addr | pmd_flag));
+ if (pmd_present(*pmd))
+ continue;
+
+ set_pmd(pmd, __pmd((addr - info->offset) | info->pmd_flag));
}
}
@@ -30,13 +32,13 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
if (pud_present(*pud)) {
pmd = pmd_offset(pud, 0);
- ident_pmd_init(info->pmd_flag, pmd, addr, next);
+ ident_pmd_init(info, pmd, addr, next);
continue;
}
pmd = (pmd_t *)info->alloc_pgt_page(info->context);
if (!pmd)
return -ENOMEM;
- ident_pmd_init(info->pmd_flag, pmd, addr, next);
+ ident_pmd_init(info, pmd, addr, next);
set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
}
@@ -44,14 +46,15 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
}
int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
- unsigned long addr, unsigned long end)
+ unsigned long pstart, unsigned long pend)
{
+ unsigned long addr = pstart + info->offset;
+ unsigned long end = pend + info->offset;
unsigned long next;
int result;
- int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0;
for (; addr < end; addr = next) {
- pgd_t *pgd = pgd_page + pgd_index(addr) + off;
+ pgd_t *pgd = pgd_page + pgd_index(addr);
pud_t *pud;
next = (addr & PGDIR_MASK) + PGDIR_SIZE;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 4459e4c4b..d28a2d741 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -17,6 +17,7 @@
#include <asm/proto.h>
#include <asm/dma.h> /* for MAX_DMA_PFN */
#include <asm/microcode.h>
+#include <asm/kaslr.h>
/*
* We need to define the tracepoints somewhere, and tlb.c
@@ -121,8 +122,18 @@ __ref void *alloc_low_pages(unsigned int num)
return __va(pfn << PAGE_SHIFT);
}
-/* need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS */
-#define INIT_PGT_BUF_SIZE (6 * PAGE_SIZE)
+/*
+ * By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS.
+ * With KASLR memory randomization, depending on the machine e820 memory
+ * and the PUD alignment. We may need twice more pages when KASLR memory
+ * randomization is enabled.
+ */
+#ifndef CONFIG_RANDOMIZE_MEMORY
+#define INIT_PGD_PAGE_COUNT 6
+#else
+#define INIT_PGD_PAGE_COUNT 12
+#endif
+#define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE)
RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
void __init early_alloc_pgt_buf(void)
{
@@ -150,10 +161,10 @@ static int page_size_mask;
static void __init probe_page_size_mask(void)
{
-#if !defined(CONFIG_KMEMCHECK) && !defined(CONFIG_TOI_INCREMENTAL)
+#if !defined(CONFIG_KMEMCHECK)
/*
- * For CONFIG_KMEMCHECK, TuxOnIce's incremental image support or
- * pagealloc debugging, identity mapping will use small pages.
+ * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will
+ * use small pages.
* This will simplify cpa(), which otherwise needs to support splitting
* large pages into small in interrupt context, etc.
*/
@@ -207,7 +218,7 @@ static int __meminit save_mr(struct map_range *mr, int nr_range,
* adjust the page_size_mask for small range to go with
* big page size instead small one if nearby are ram too.
*/
-static void __init_refok adjust_range_page_size_mask(struct map_range *mr,
+static void __ref adjust_range_page_size_mask(struct map_range *mr,
int nr_range)
{
int i;
@@ -395,7 +406,7 @@ bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
* This runs before bootmem is initialized and gets pages directly from
* the physical memory. To access them they are temporarily mapped.
*/
-unsigned long __init_refok init_memory_mapping(unsigned long start,
+unsigned long __ref init_memory_mapping(unsigned long start,
unsigned long end)
{
struct map_range mr[NR_RANGE_MR];
@@ -590,6 +601,9 @@ void __init init_mem_mapping(void)
/* the ISA range is always mapped regardless of memory holes */
init_memory_mapping(0, ISA_END_ADDRESS);
+ /* Init the trampoline, possibly with KASLR memory offset */
+ init_trampoline();
+
/*
* If the allocation is in bottom-up direction, we setup direct mapping
* in bottom-up, otherwise we setup direct mapping in top-down.
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 84df150ee..cf8059016 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -5,7 +5,6 @@
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
*/
-#include <linux/module.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index bce2e5d9e..14b9dd71d 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -27,7 +27,6 @@
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
-#include <linux/module.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/memremap.h>
@@ -328,22 +327,30 @@ void __init cleanup_highmap(void)
}
}
+/*
+ * Create PTE level page table mapping for physical addresses.
+ * It returns the last physical address mapped.
+ */
static unsigned long __meminit
-phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
+phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
pgprot_t prot)
{
- unsigned long pages = 0, next;
- unsigned long last_map_addr = end;
+ unsigned long pages = 0, paddr_next;
+ unsigned long paddr_last = paddr_end;
+ pte_t *pte;
int i;
- pte_t *pte = pte_page + pte_index(addr);
+ pte = pte_page + pte_index(paddr);
+ i = pte_index(paddr);
- for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) {
- next = (addr & PAGE_MASK) + PAGE_SIZE;
- if (addr >= end) {
+ for (; i < PTRS_PER_PTE; i++, paddr = paddr_next, pte++) {
+ paddr_next = (paddr & PAGE_MASK) + PAGE_SIZE;
+ if (paddr >= paddr_end) {
if (!after_bootmem &&
- !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) &&
- !e820_any_mapped(addr & PAGE_MASK, next, E820_RESERVED_KERN))
+ !e820_any_mapped(paddr & PAGE_MASK, paddr_next,
+ E820_RAM) &&
+ !e820_any_mapped(paddr & PAGE_MASK, paddr_next,
+ E820_RESERVED_KERN))
set_pte(pte, __pte(0));
continue;
}
@@ -354,54 +361,61 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
* pagetable pages as RO. So assume someone who pre-setup
* these mappings are more intelligent.
*/
- if (pte_val(*pte)) {
+ if (!pte_none(*pte)) {
if (!after_bootmem)
pages++;
continue;
}
if (0)
- printk(" pte=%p addr=%lx pte=%016lx\n",
- pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
+ pr_info(" pte=%p addr=%lx pte=%016lx\n", pte, paddr,
+ pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte);
pages++;
- set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
- last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
+ set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
+ paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE;
}
update_page_count(PG_LEVEL_4K, pages);
- return last_map_addr;
+ return paddr_last;
}
+/*
+ * Create PMD level page table mapping for physical addresses. The virtual
+ * and physical address have to be aligned at this level.
+ * It returns the last physical address mapped.
+ */
static unsigned long __meminit
-phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
+phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
unsigned long page_size_mask, pgprot_t prot)
{
- unsigned long pages = 0, next;
- unsigned long last_map_addr = end;
+ unsigned long pages = 0, paddr_next;
+ unsigned long paddr_last = paddr_end;
- int i = pmd_index(address);
+ int i = pmd_index(paddr);
- for (; i < PTRS_PER_PMD; i++, address = next) {
- pmd_t *pmd = pmd_page + pmd_index(address);
+ for (; i < PTRS_PER_PMD; i++, paddr = paddr_next) {
+ pmd_t *pmd = pmd_page + pmd_index(paddr);
pte_t *pte;
pgprot_t new_prot = prot;
- next = (address & PMD_MASK) + PMD_SIZE;
- if (address >= end) {
+ paddr_next = (paddr & PMD_MASK) + PMD_SIZE;
+ if (paddr >= paddr_end) {
if (!after_bootmem &&
- !e820_any_mapped(address & PMD_MASK, next, E820_RAM) &&
- !e820_any_mapped(address & PMD_MASK, next, E820_RESERVED_KERN))
+ !e820_any_mapped(paddr & PMD_MASK, paddr_next,
+ E820_RAM) &&
+ !e820_any_mapped(paddr & PMD_MASK, paddr_next,
+ E820_RESERVED_KERN))
set_pmd(pmd, __pmd(0));
continue;
}
- if (pmd_val(*pmd)) {
+ if (!pmd_none(*pmd)) {
if (!pmd_large(*pmd)) {
spin_lock(&init_mm.page_table_lock);
pte = (pte_t *)pmd_page_vaddr(*pmd);
- last_map_addr = phys_pte_init(pte, address,
- end, prot);
+ paddr_last = phys_pte_init(pte, paddr,
+ paddr_end, prot);
spin_unlock(&init_mm.page_table_lock);
continue;
}
@@ -420,7 +434,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
if (page_size_mask & (1 << PG_LEVEL_2M)) {
if (!after_bootmem)
pages++;
- last_map_addr = next;
+ paddr_last = paddr_next;
continue;
}
new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
@@ -430,51 +444,65 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
pages++;
spin_lock(&init_mm.page_table_lock);
set_pte((pte_t *)pmd,
- pfn_pte((address & PMD_MASK) >> PAGE_SHIFT,
+ pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT,
__pgprot(pgprot_val(prot) | _PAGE_PSE)));
spin_unlock(&init_mm.page_table_lock);
- last_map_addr = next;
+ paddr_last = paddr_next;
continue;
}
pte = alloc_low_page();
- last_map_addr = phys_pte_init(pte, address, end, new_prot);
+ paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot);
spin_lock(&init_mm.page_table_lock);
pmd_populate_kernel(&init_mm, pmd, pte);
spin_unlock(&init_mm.page_table_lock);
}
update_page_count(PG_LEVEL_2M, pages);
- return last_map_addr;
+ return paddr_last;
}
+/*
+ * Create PUD level page table mapping for physical addresses. The virtual
+ * and physical address do not have to be aligned at this level. KASLR can
+ * randomize virtual addresses up to this level.
+ * It returns the last physical address mapped.
+ */
static unsigned long __meminit
-phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
- unsigned long page_size_mask)
+phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
+ unsigned long page_size_mask)
{
- unsigned long pages = 0, next;
- unsigned long last_map_addr = end;
- int i = pud_index(addr);
+ unsigned long pages = 0, paddr_next;
+ unsigned long paddr_last = paddr_end;
+ unsigned long vaddr = (unsigned long)__va(paddr);
+ int i = pud_index(vaddr);
- for (; i < PTRS_PER_PUD; i++, addr = next) {
- pud_t *pud = pud_page + pud_index(addr);
+ for (; i < PTRS_PER_PUD; i++, paddr = paddr_next) {
+ pud_t *pud;
pmd_t *pmd;
pgprot_t prot = PAGE_KERNEL;
- next = (addr & PUD_MASK) + PUD_SIZE;
- if (addr >= end) {
+ vaddr = (unsigned long)__va(paddr);
+ pud = pud_page + pud_index(vaddr);
+ paddr_next = (paddr & PUD_MASK) + PUD_SIZE;
+
+ if (paddr >= paddr_end) {
if (!after_bootmem &&
- !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) &&
- !e820_any_mapped(addr & PUD_MASK, next, E820_RESERVED_KERN))
+ !e820_any_mapped(paddr & PUD_MASK, paddr_next,
+ E820_RAM) &&
+ !e820_any_mapped(paddr & PUD_MASK, paddr_next,
+ E820_RESERVED_KERN))
set_pud(pud, __pud(0));
continue;
}
- if (pud_val(*pud)) {
+ if (!pud_none(*pud)) {
if (!pud_large(*pud)) {
pmd = pmd_offset(pud, 0);
- last_map_addr = phys_pmd_init(pmd, addr, end,
- page_size_mask, prot);
+ paddr_last = phys_pmd_init(pmd, paddr,
+ paddr_end,
+ page_size_mask,
+ prot);
__flush_tlb_all();
continue;
}
@@ -493,7 +521,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
if (page_size_mask & (1 << PG_LEVEL_1G)) {
if (!after_bootmem)
pages++;
- last_map_addr = next;
+ paddr_last = paddr_next;
continue;
}
prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
@@ -503,16 +531,16 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
pages++;
spin_lock(&init_mm.page_table_lock);
set_pte((pte_t *)pud,
- pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT,
+ pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT,
PAGE_KERNEL_LARGE));
spin_unlock(&init_mm.page_table_lock);
- last_map_addr = next;
+ paddr_last = paddr_next;
continue;
}
pmd = alloc_low_page();
- last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
- prot);
+ paddr_last = phys_pmd_init(pmd, paddr, paddr_end,
+ page_size_mask, prot);
spin_lock(&init_mm.page_table_lock);
pud_populate(&init_mm, pud, pmd);
@@ -522,38 +550,44 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
update_page_count(PG_LEVEL_1G, pages);
- return last_map_addr;
+ return paddr_last;
}
+/*
+ * Create page table mapping for the physical memory for specific physical
+ * addresses. The virtual and physical addresses have to be aligned on PMD level
+ * down. It returns the last physical address mapped.
+ */
unsigned long __meminit
-kernel_physical_mapping_init(unsigned long start,
- unsigned long end,
+kernel_physical_mapping_init(unsigned long paddr_start,
+ unsigned long paddr_end,
unsigned long page_size_mask)
{
bool pgd_changed = false;
- unsigned long next, last_map_addr = end;
- unsigned long addr;
+ unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;
- start = (unsigned long)__va(start);
- end = (unsigned long)__va(end);
- addr = start;
+ paddr_last = paddr_end;
+ vaddr = (unsigned long)__va(paddr_start);
+ vaddr_end = (unsigned long)__va(paddr_end);
+ vaddr_start = vaddr;
- for (; start < end; start = next) {
- pgd_t *pgd = pgd_offset_k(start);
+ for (; vaddr < vaddr_end; vaddr = vaddr_next) {
+ pgd_t *pgd = pgd_offset_k(vaddr);
pud_t *pud;
- next = (start & PGDIR_MASK) + PGDIR_SIZE;
+ vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
- last_map_addr = phys_pud_init(pud, __pa(start),
- __pa(end), page_size_mask);
+ paddr_last = phys_pud_init(pud, __pa(vaddr),
+ __pa(vaddr_end),
+ page_size_mask);
continue;
}
pud = alloc_low_page();
- last_map_addr = phys_pud_init(pud, __pa(start), __pa(end),
- page_size_mask);
+ paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
+ page_size_mask);
spin_lock(&init_mm.page_table_lock);
pgd_populate(&init_mm, pgd, pud);
@@ -562,11 +596,11 @@ kernel_physical_mapping_init(unsigned long start,
}
if (pgd_changed)
- sync_global_pgds(addr, end - 1, 0);
+ sync_global_pgds(vaddr_start, vaddr_end - 1, 0);
__flush_tlb_all();
- return last_map_addr;
+ return paddr_last;
}
#ifndef CONFIG_NUMA
@@ -673,7 +707,7 @@ static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
for (i = 0; i < PTRS_PER_PTE; i++) {
pte = pte_start + i;
- if (pte_val(*pte))
+ if (!pte_none(*pte))
return;
}
@@ -691,7 +725,7 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
for (i = 0; i < PTRS_PER_PMD; i++) {
pmd = pmd_start + i;
- if (pmd_val(*pmd))
+ if (!pmd_none(*pmd))
return;
}
@@ -702,27 +736,6 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
spin_unlock(&init_mm.page_table_lock);
}
-/* Return true if pgd is changed, otherwise return false. */
-static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd)
-{
- pud_t *pud;
- int i;
-
- for (i = 0; i < PTRS_PER_PUD; i++) {
- pud = pud_start + i;
- if (pud_val(*pud))
- return false;
- }
-
- /* free a pud table */
- free_pagetable(pgd_page(*pgd), 0);
- spin_lock(&init_mm.page_table_lock);
- pgd_clear(pgd);
- spin_unlock(&init_mm.page_table_lock);
-
- return true;
-}
-
static void __meminit
remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
bool direct)
@@ -913,7 +926,6 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct)
unsigned long addr;
pgd_t *pgd;
pud_t *pud;
- bool pgd_changed = false;
for (addr = start; addr < end; addr = next) {
next = pgd_addr_end(addr, end);
@@ -924,13 +936,8 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct)
pud = (pud_t *)pgd_page_vaddr(*pgd);
remove_pud_table(pud, addr, next, direct);
- if (free_pud_table(pud, pgd))
- pgd_changed = true;
}
- if (pgd_changed)
- sync_global_pgds(start, end - 1, 1);
-
flush_tlb_all();
}
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 9c0ff045f..ada98b39b 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -18,7 +18,7 @@
#include <asm/iomap.h>
#include <asm/pat.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/highmem.h>
static int is_io_mapping_possible(resource_size_t base, unsigned long size)
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index f0894910b..7aaa26358 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -9,7 +9,6 @@
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/io.h>
-#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mmiotrace.h>
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
new file mode 100644
index 000000000..bda8d5eef
--- /dev/null
+++ b/arch/x86/mm/kaslr.c
@@ -0,0 +1,172 @@
+/*
+ * This file implements KASLR memory randomization for x86_64. It randomizes
+ * the virtual address space of kernel memory regions (physical memory
+ * mapping, vmalloc & vmemmap) for x86_64. This security feature mitigates
+ * exploits relying on predictable kernel addresses.
+ *
+ * Entropy is generated using the KASLR early boot functions now shared in
+ * the lib directory (originally written by Kees Cook). Randomization is
+ * done on PGD & PUD page table levels to increase possible addresses. The
+ * physical memory mapping code was adapted to support PUD level virtual
+ * addresses. This implementation on the best configuration provides 30,000
+ * possible virtual addresses in average for each memory region. An additional
+ * low memory page is used to ensure each CPU can start with a PGD aligned
+ * virtual address (for realmode).
+ *
+ * The order of each memory region is not changed. The feature looks at
+ * the available space for the regions based on different configuration
+ * options and randomizes the base and space between each. The size of the
+ * physical memory mapping is the available physical memory.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/random.h>
+
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/setup.h>
+#include <asm/kaslr.h>
+
+#include "mm_internal.h"
+
+#define TB_SHIFT 40
+
+/*
+ * Virtual address start and end range for randomization. The end changes base
+ * on configuration to have the highest amount of space for randomization.
+ * It increases the possible random position for each randomized region.
+ *
+ * You need to add an if/def entry if you introduce a new memory region
+ * compatible with KASLR. Your entry must be in logical order with memory
+ * layout. For example, ESPFIX is before EFI because its virtual address is
+ * before. You also need to add a BUILD_BUG_ON in kernel_randomize_memory to
+ * ensure that this order is correct and won't be changed.
+ */
+static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;
+static const unsigned long vaddr_end = VMEMMAP_START;
+
+/* Default values */
+unsigned long page_offset_base = __PAGE_OFFSET_BASE;
+EXPORT_SYMBOL(page_offset_base);
+unsigned long vmalloc_base = __VMALLOC_BASE;
+EXPORT_SYMBOL(vmalloc_base);
+
+/*
+ * Memory regions randomized by KASLR (except modules that use a separate logic
+ * earlier during boot). The list is ordered based on virtual addresses. This
+ * order is kept after randomization.
+ */
+static __initdata struct kaslr_memory_region {
+ unsigned long *base;
+ unsigned long size_tb;
+} kaslr_regions[] = {
+ { &page_offset_base, 64/* Maximum */ },
+ { &vmalloc_base, VMALLOC_SIZE_TB },
+};
+
+/* Get size in bytes used by the memory region */
+static inline unsigned long get_padding(struct kaslr_memory_region *region)
+{
+ return (region->size_tb << TB_SHIFT);
+}
+
+/*
+ * Apply no randomization if KASLR was disabled at boot or if KASAN
+ * is enabled. KASAN shadow mappings rely on regions being PGD aligned.
+ */
+static inline bool kaslr_memory_enabled(void)
+{
+ return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN);
+}
+
+/* Initialize base and padding for each memory region randomized with KASLR */
+void __init kernel_randomize_memory(void)
+{
+ size_t i;
+ unsigned long vaddr = vaddr_start;
+ unsigned long rand, memory_tb;
+ struct rnd_state rand_state;
+ unsigned long remain_entropy;
+
+ if (!kaslr_memory_enabled())
+ return;
+
+ /*
+ * Update Physical memory mapping to available and
+ * add padding if needed (especially for memory hotplug support).
+ */
+ BUG_ON(kaslr_regions[0].base != &page_offset_base);
+ memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
+ CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
+
+ /* Adapt phyiscal memory region size based on available memory */
+ if (memory_tb < kaslr_regions[0].size_tb)
+ kaslr_regions[0].size_tb = memory_tb;
+
+ /* Calculate entropy available between regions */
+ remain_entropy = vaddr_end - vaddr_start;
+ for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++)
+ remain_entropy -= get_padding(&kaslr_regions[i]);
+
+ prandom_seed_state(&rand_state, kaslr_get_random_long("Memory"));
+
+ for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++) {
+ unsigned long entropy;
+
+ /*
+ * Select a random virtual address using the extra entropy
+ * available.
+ */
+ entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
+ prandom_bytes_state(&rand_state, &rand, sizeof(rand));
+ entropy = (rand % (entropy + 1)) & PUD_MASK;
+ vaddr += entropy;
+ *kaslr_regions[i].base = vaddr;
+
+ /*
+ * Jump the region and add a minimum padding based on
+ * randomization alignment.
+ */
+ vaddr += get_padding(&kaslr_regions[i]);
+ vaddr = round_up(vaddr + 1, PUD_SIZE);
+ remain_entropy -= entropy;
+ }
+}
+
+/*
+ * Create PGD aligned trampoline table to allow real mode initialization
+ * of additional CPUs. Consume only 1 low memory page.
+ */
+void __meminit init_trampoline(void)
+{
+ unsigned long paddr, paddr_next;
+ pgd_t *pgd;
+ pud_t *pud_page, *pud_page_tramp;
+ int i;
+
+ if (!kaslr_memory_enabled()) {
+ init_trampoline_default();
+ return;
+ }
+
+ pud_page_tramp = alloc_low_page();
+
+ paddr = 0;
+ pgd = pgd_offset_k((unsigned long)__va(paddr));
+ pud_page = (pud_t *) pgd_page_vaddr(*pgd);
+
+ for (i = pud_index(paddr); i < PTRS_PER_PUD; i++, paddr = paddr_next) {
+ pud_t *pud, *pud_tramp;
+ unsigned long vaddr = (unsigned long)__va(paddr);
+
+ pud_tramp = pud_page_tramp + pud_index(paddr);
+ pud = pud_page + pud_index(vaddr);
+ paddr_next = (paddr & PUD_MASK) + PUD_SIZE;
+
+ *pud_tramp = *pud;
+ }
+
+ set_pgd(&trampoline_pgd_entry,
+ __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
+}
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index b4f2e7e9e..4515bae36 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -14,7 +14,6 @@
#include <linux/kernel.h>
#include <linux/kmemcheck.h>
#include <linux/mm.h>
-#include <linux/module.h>
#include <linux/page-flags.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
index aec124214..c2638a7d2 100644
--- a/arch/x86/mm/kmemcheck/shadow.c
+++ b/arch/x86/mm/kmemcheck/shadow.c
@@ -1,5 +1,5 @@
#include <linux/kmemcheck.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mm.h>
#include <asm/page.h>
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index ddb2244b0..afc47f5c9 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -11,7 +11,7 @@
#include <linux/rculist.h>
#include <linux/spinlock.h>
#include <linux/hash.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
#include <linux/ptrace.h>
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 0057a7acc..bef36622e 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -24,7 +24,7 @@
#define DEBUG 1
-#include <linux/module.h>
+#include <linux/moduleparam.h>
#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 9c086c571..fb682108f 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -1,4 +1,5 @@
/* Common code for 32 and 64-bit NUMA */
+#include <linux/acpi.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
@@ -7,7 +8,6 @@
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
-#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/topology.h>
@@ -15,7 +15,6 @@
#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
-#include <asm/acpi.h>
#include <asm/amd_nb.h>
#include "numa_internal.h"
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 47b6436e4..6b7ce6279 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -24,7 +24,7 @@
#include <linux/bootmem.h>
#include <linux/memblock.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include "numa_internal.h"
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index b2659b9d0..e3353c97d 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -101,7 +101,8 @@ static inline unsigned long highmap_start_pfn(void)
static inline unsigned long highmap_end_pfn(void)
{
- return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
+ /* Do not reference physical address outside the kernel. */
+ return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT;
}
#endif
@@ -112,6 +113,12 @@ within(unsigned long addr, unsigned long start, unsigned long end)
return addr >= start && addr < end;
}
+static inline int
+within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
+{
+ return addr >= start && addr <= end;
+}
+
/*
* Flushing functions
*/
@@ -746,18 +753,6 @@ static bool try_to_free_pmd_page(pmd_t *pmd)
return true;
}
-static bool try_to_free_pud_page(pud_t *pud)
-{
- int i;
-
- for (i = 0; i < PTRS_PER_PUD; i++)
- if (!pud_none(pud[i]))
- return false;
-
- free_page((unsigned long)pud);
- return true;
-}
-
static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
{
pte_t *pte = pte_offset_kernel(pmd, start);
@@ -871,16 +866,6 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
*/
}
-static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
-{
- pgd_t *pgd_entry = root + pgd_index(addr);
-
- unmap_pud_range(pgd_entry, addr, end);
-
- if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry)))
- pgd_clear(pgd_entry);
-}
-
static int alloc_pte_page(pmd_t *pmd)
{
pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
@@ -1113,7 +1098,12 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
ret = populate_pud(cpa, addr, pgd_entry, pgprot);
if (ret < 0) {
- unmap_pgd_range(cpa->pgd, addr,
+ /*
+ * Leave the PUD page in place in case some other CPU or thread
+ * already found it, but remove any useless entries we just
+ * added to it.
+ */
+ unmap_pud_range(pgd_entry, addr,
addr + (cpa->numpages << PAGE_SHIFT));
return ret;
}
@@ -1185,7 +1175,7 @@ repeat:
return __cpa_process_fault(cpa, address, primary);
old_pte = *kpte;
- if (!pte_val(old_pte))
+ if (pte_none(old_pte))
return __cpa_process_fault(cpa, address, primary);
if (level == PG_LEVEL_4K) {
@@ -1316,7 +1306,8 @@ static int cpa_process_alias(struct cpa_data *cpa)
* to touch the high mapped kernel as well:
*/
if (!within(vaddr, (unsigned long)_text, _brk_end) &&
- within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) {
+ within_inclusive(cpa->pfn, highmap_start_pfn(),
+ highmap_end_pfn())) {
unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
__START_KERNEL_map - phys_base;
alias_cpa = *cpa;
@@ -1992,12 +1983,6 @@ out:
return retval;
}
-void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address,
- unsigned numpages)
-{
- unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT));
-}
-
/*
* The testcases use internal knowledge of the implementation that shouldn't
* be exposed to the rest of the kernel. Include these directly here.
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 5431a32f7..170cc4ff0 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -11,7 +11,6 @@
#include <linux/bootmem.h>
#include <linux/debugfs.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/mm.h>
@@ -755,11 +754,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
return 1;
while (cursor < to) {
- if (!devmem_is_allowed(pfn)) {
- pr_info("x86/PAT: Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
- current->comm, from, to - 1);
+ if (!devmem_is_allowed(pfn))
return 0;
- }
cursor += PAGE_SIZE;
pfn++;
}
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index 2f7702253..de391b7bc 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -11,7 +11,6 @@
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/rbtree_augmented.h>
#include <linux/sched.h>
#include <linux/gfp.h>
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index 9f0614dae..a23586953 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -26,7 +26,6 @@
* Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007
*/
-#include <linux/module.h>
#include <linux/ptrace.h> /* struct pt_regs */
#include "pf_in.h"
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index aa0ff4b02..3feec5af4 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -6,7 +6,7 @@
#include <asm/fixmap.h>
#include <asm/mtrr.h>
-#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO
+#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO)
#ifdef CONFIG_HIGHPTE
#define PGALLOC_USER_GFP __GFP_HIGHMEM
@@ -18,7 +18,7 @@ gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
- return (pte_t *)__get_free_page(PGALLOC_GFP);
+ return (pte_t *)__get_free_page(PGALLOC_GFP & ~__GFP_ACCOUNT);
}
pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
@@ -207,9 +207,13 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
{
int i;
bool failed = false;
+ gfp_t gfp = PGALLOC_GFP;
+
+ if (mm == &init_mm)
+ gfp &= ~__GFP_ACCOUNT;
for(i = 0; i < PREALLOCATED_PMDS; i++) {
- pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
+ pmd_t *pmd = (pmd_t *)__get_free_page(gfp);
if (!pmd)
failed = true;
if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 75cc0978d..9adce7768 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -8,7 +8,6 @@
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
-#include <linux/module.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -47,7 +46,7 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
return;
}
pte = pte_offset_kernel(pmd, vaddr);
- if (pte_val(pteval))
+ if (!pte_none(pteval))
set_pte_at(&init_mm, vaddr, pte, pteval);
else
pte_clear(&init_mm, vaddr, pte);
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
index e666cbbb9..cfc3b9121 100644
--- a/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@ -1,6 +1,6 @@
#include <linux/bootmem.h>
#include <linux/mmdebug.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/mm.h>
#include <asm/page.h>
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index b5f821881..35fe69529 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -13,10 +13,8 @@
#include <linux/acpi.h>
#include <linux/mmzone.h>
#include <linux/bitmap.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/topology.h>
-#include <linux/bootmem.h>
-#include <linux/memblock.h>
#include <linux/mm.h>
#include <asm/proto.h>
#include <asm/numa.h>
@@ -24,51 +22,6 @@
#include <asm/apic.h>
#include <asm/uv/uv.h>
-int acpi_numa __initdata;
-
-static __init int setup_node(int pxm)
-{
- return acpi_map_pxm_to_node(pxm);
-}
-
-static __init void bad_srat(void)
-{
- printk(KERN_ERR "SRAT: SRAT not used.\n");
- acpi_numa = -1;
-}
-
-static __init inline int srat_disabled(void)
-{
- return acpi_numa < 0;
-}
-
-/*
- * Callback for SLIT parsing. pxm_to_node() returns NUMA_NO_NODE for
- * I/O localities since SRAT does not list them. I/O localities are
- * not supported at this point.
- */
-void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
-{
- int i, j;
-
- for (i = 0; i < slit->locality_count; i++) {
- const int from_node = pxm_to_node(i);
-
- if (from_node == NUMA_NO_NODE)
- continue;
-
- for (j = 0; j < slit->locality_count; j++) {
- const int to_node = pxm_to_node(j);
-
- if (to_node == NUMA_NO_NODE)
- continue;
-
- numa_set_distance(from_node, to_node,
- slit->entry[slit->locality_count * i + j]);
- }
- }
-}
-
/* Callback for Proximity Domain -> x2APIC mapping */
void __init
acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
@@ -91,7 +44,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
pxm, apic_id);
return;
}
- node = setup_node(pxm);
+ node = acpi_map_pxm_to_node(pxm);
if (node < 0) {
printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
bad_srat();
@@ -104,7 +57,6 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
}
set_apicid_to_node(apic_id, node);
node_set(node, numa_nodes_parsed);
- acpi_numa = 1;
printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
pxm, apic_id, node);
}
@@ -127,7 +79,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
pxm = pa->proximity_domain_lo;
if (acpi_srat_revision >= 2)
pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8;
- node = setup_node(pxm);
+ node = acpi_map_pxm_to_node(pxm);
if (node < 0) {
printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
bad_srat();
@@ -146,74 +98,10 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
set_apicid_to_node(apic_id, node);
node_set(node, numa_nodes_parsed);
- acpi_numa = 1;
printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
pxm, apic_id, node);
}
-#ifdef CONFIG_MEMORY_HOTPLUG
-static inline int save_add_info(void) {return 1;}
-#else
-static inline int save_add_info(void) {return 0;}
-#endif
-
-/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
-int __init
-acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
-{
- u64 start, end;
- u32 hotpluggable;
- int node, pxm;
-
- if (srat_disabled())
- goto out_err;
- if (ma->header.length != sizeof(struct acpi_srat_mem_affinity))
- goto out_err_bad_srat;
- if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
- goto out_err;
- hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE;
- if (hotpluggable && !save_add_info())
- goto out_err;
-
- start = ma->base_address;
- end = start + ma->length;
- pxm = ma->proximity_domain;
- if (acpi_srat_revision <= 1)
- pxm &= 0xff;
-
- node = setup_node(pxm);
- if (node < 0) {
- printk(KERN_ERR "SRAT: Too many proximity domains.\n");
- goto out_err_bad_srat;
- }
-
- if (numa_add_memblk(node, start, end) < 0)
- goto out_err_bad_srat;
-
- node_set(node, numa_nodes_parsed);
-
- pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n",
- node, pxm,
- (unsigned long long) start, (unsigned long long) end - 1,
- hotpluggable ? " hotplug" : "",
- ma->flags & ACPI_SRAT_MEM_NON_VOLATILE ? " non-volatile" : "");
-
- /* Mark hotplug range in memblock. */
- if (hotpluggable && memblock_mark_hotplug(start, ma->length))
- pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n",
- (unsigned long long)start, (unsigned long long)end - 1);
-
- max_possible_pfn = max(max_possible_pfn, PFN_UP(end - 1));
-
- return 0;
-out_err_bad_srat:
- bad_srat();
-out_err:
- return -1;
-}
-
-void __init acpi_numa_arch_fixup(void) {}
-
int __init x86_acpi_numa_init(void)
{
int ret;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5643fd0b1..4dbe65622 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -4,7 +4,7 @@
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/cpu.h>
#include <asm/tlbflush.h>