summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig9
-rw-r--r--mm/Kconfig.debug68
-rw-r--r--mm/Makefile19
-rw-r--r--mm/backing-dev.c8
-rw-r--r--mm/balloon_compaction.c4
-rw-r--r--mm/bootmem.c7
-rw-r--r--mm/compaction.c327
-rw-r--r--mm/debug.c173
-rw-r--r--mm/debug_page_ref.c54
-rw-r--r--mm/dmapool.c18
-rw-r--r--mm/fadvise.c8
-rw-r--r--mm/failslab.c12
-rw-r--r--mm/filemap.c301
-rw-r--r--mm/frame_vector.c2
-rw-r--r--mm/gup.c91
-rw-r--r--mm/huge_memory.c455
-rw-r--r--mm/hugetlb.c13
-rw-r--r--mm/internal.h34
-rw-r--r--mm/kasan/Makefile1
-rw-r--r--mm/kasan/kasan.c162
-rw-r--r--mm/kasan/kasan.h37
-rw-r--r--mm/kasan/report.c68
-rw-r--r--mm/kmemcheck.c6
-rw-r--r--mm/kmemleak-test.c2
-rw-r--r--mm/kmemleak.c32
-rw-r--r--mm/ksm.c27
-rw-r--r--mm/madvise.c25
-rw-r--r--mm/memblock.c11
-rw-r--r--mm/memcontrol.c239
-rw-r--r--mm/memory-failure.c66
-rw-r--r--mm/memory.c170
-rw-r--r--mm/memory_hotplug.c86
-rw-r--r--mm/mempolicy.c14
-rw-r--r--mm/mempool.c36
-rw-r--r--mm/migrate.c56
-rw-r--r--mm/mincore.c8
-rw-r--r--mm/mm_init.c7
-rw-r--r--mm/mmap.c176
-rw-r--r--mm/mmu_notifier.c2
-rw-r--r--mm/mprotect.c21
-rw-r--r--mm/mremap.c4
-rw-r--r--mm/nobootmem.c4
-rw-r--r--mm/nommu.c149
-rw-r--r--mm/oom_kill.c219
-rw-r--r--mm/page-writeback.c74
-rw-r--r--mm/page_alloc.c489
-rw-r--r--mm/page_ext.c10
-rw-r--r--mm/page_io.c117
-rw-r--r--mm/page_isolation.c2
-rw-r--r--mm/page_owner.c99
-rw-r--r--mm/page_poison.c (renamed from mm/debug-pagealloc.c)67
-rw-r--r--mm/percpu-km.c6
-rw-r--r--mm/percpu.c43
-rw-r--r--mm/pgtable-generic.c14
-rw-r--r--mm/process_vm_access.c11
-rw-r--r--mm/quicklist.c2
-rw-r--r--mm/readahead.c20
-rw-r--r--mm/rmap.c119
-rw-r--r--mm/shmem.c181
-rw-r--r--mm/slab.c1162
-rw-r--r--mm/slab.h99
-rw-r--r--mm/slab_common.c28
-rw-r--r--mm/slub.c353
-rw-r--r--mm/sparse-vmemmap.c8
-rw-r--r--mm/sparse.c21
-rw-r--r--mm/swap.c19
-rw-r--r--mm/swap_cgroup.c5
-rw-r--r--mm/swap_state.c12
-rw-r--r--mm/swapfile.c34
-rw-r--r--mm/truncate.c46
-rw-r--r--mm/userfaultfd.c7
-rw-r--r--mm/util.c128
-rw-r--r--mm/vmalloc.c31
-rw-r--r--mm/vmscan.c232
-rw-r--r--mm/vmstat.c17
-rw-r--r--mm/workingset.c170
-rw-r--r--mm/zsmalloc.c36
-rw-r--r--mm/zswap.c4
78 files changed, 4061 insertions, 2836 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 03cbfa072..989f8f3d7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -187,7 +187,6 @@ config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on ARCH_ENABLE_MEMORY_HOTPLUG
- depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
config MEMORY_HOTPLUG_SPARSE
def_bool y
@@ -652,10 +651,9 @@ config IDLE_PAGE_TRACKING
config ZONE_DEVICE
bool "Device memory (pmem, etc...) hotplug support" if EXPERT
- default !ZONE_DMA
- depends on !ZONE_DMA
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
+ depends on SPARSEMEM_VMEMMAP
depends on X86_64 #arch_add_memory() comprehends device memory
help
@@ -669,3 +667,8 @@ config ZONE_DEVICE
config FRAME_VECTOR
bool
+
+config ARCH_USES_HIGH_VMA_FLAGS
+ bool
+config ARCH_HAS_PKEYS
+ bool
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 957d3da53..22f4cd96a 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -16,8 +16,8 @@ config DEBUG_PAGEALLOC
select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
---help---
Unmap pages from the kernel linear mapping after free_pages().
- This results in a large slowdown, but helps to find certain types
- of memory corruption.
+ Depending on runtime enablement, this results in a small or large
+ slowdown, but helps to find certain types of memory corruption.
For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
fill the pages with poison patterns after free_pages() and verify
@@ -26,5 +26,69 @@ config DEBUG_PAGEALLOC
that would result in incorrect warnings of memory corruption after
a resume because free pages are not saved to the suspend image.
+ By default this option will have a small overhead, e.g. by not
+ allowing the kernel mapping to be backed by large pages on some
+ architectures. Even bigger overhead comes when the debugging is
+ enabled by DEBUG_PAGEALLOC_ENABLE_DEFAULT or the debug_pagealloc
+ command line parameter.
+
+config DEBUG_PAGEALLOC_ENABLE_DEFAULT
+ bool "Enable debug page memory allocations by default?"
+ default n
+ depends on DEBUG_PAGEALLOC
+ ---help---
+ Enable debug page memory allocations by default? This value
+ can be overridden by debug_pagealloc=off|on.
+
config PAGE_POISONING
+ bool "Poison pages after freeing"
+ select PAGE_EXTENSION
+ select PAGE_POISONING_NO_SANITY if HIBERNATION
+ ---help---
+ Fill the pages with poison patterns after free_pages() and verify
+ the patterns before alloc_pages. The filling of the memory helps
+ reduce the risk of information leaks from freed data. This does
+ have a potential performance impact.
+
+ Note that "poison" here is not the same thing as the "HWPoison"
+ for CONFIG_MEMORY_FAILURE. This is software poisoning only.
+
+ If unsure, say N
+
+config PAGE_POISONING_NO_SANITY
+ depends on PAGE_POISONING
+ bool "Only poison, don't sanity check"
+ ---help---
+ Skip the sanity checking on alloc, only fill the pages with
+ poison on free. This reduces some of the overhead of the
+ poisoning feature.
+
+ If you are only interested in sanitization, say Y. Otherwise
+ say N.
+
+config PAGE_POISONING_ZERO
+ bool "Use zero for poisoning instead of random data"
+ depends on PAGE_POISONING
+ ---help---
+ Instead of using the existing poison value, fill the pages with
+ zeros. This makes it harder to detect when errors are occurring
+ due to sanitization but the zeroing at free means that it is
+ no longer necessary to write zeros when GFP_ZERO is used on
+ allocation.
+
+ Enabling page poisoning with this option will disable hibernation
+
+ If unsure, say N
bool
+
+config DEBUG_PAGE_REF
+ bool "Enable tracepoint to track down page reference manipulation"
+ depends on DEBUG_KERNEL
+ depends on TRACEPOINTS
+ ---help---
+ This is a feature to add tracepoints for tracking down page reference
+ manipulation. This tracking is useful to diagnose functional failure
+ due to migration failures caused by page reference mismatches. Be
+ careful when enabling this feature because it adds about 30 KB to the
+ kernel code. However the runtime performance overhead is virtually
+ nil until the tracepoints are actually enabled.
diff --git a/mm/Makefile b/mm/Makefile
index e3a53f5ba..0f6ae638a 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -3,8 +3,24 @@
#
KASAN_SANITIZE_slab_common.o := n
+KASAN_SANITIZE_slab.o := n
KASAN_SANITIZE_slub.o := n
+# These files are disabled because they produce non-interesting and/or
+# flaky coverage that is not a function of syscall inputs. E.g. slab is out of
+# free pages, or a task is migrated between nodes.
+KCOV_INSTRUMENT_slab_common.o := n
+KCOV_INSTRUMENT_slob.o := n
+KCOV_INSTRUMENT_slab.o := n
+KCOV_INSTRUMENT_slub.o := n
+KCOV_INSTRUMENT_page_alloc.o := n
+KCOV_INSTRUMENT_debug-pagealloc.o := n
+KCOV_INSTRUMENT_kmemleak.o := n
+KCOV_INSTRUMENT_kmemcheck.o := n
+KCOV_INSTRUMENT_memcontrol.o := n
+KCOV_INSTRUMENT_mmzone.o := n
+KCOV_INSTRUMENT_vmstat.o := n
+
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
@@ -48,7 +64,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
obj-$(CONFIG_SLOB) += slob.o
obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
obj-$(CONFIG_KSM) += ksm.o
-obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
+obj-$(CONFIG_PAGE_POISONING) += page_poison.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
@@ -81,3 +97,4 @@ obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
+obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c554d173a..0c6317b7d 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -898,7 +898,7 @@ static atomic_t nr_wb_congested[2];
void clear_wb_congested(struct bdi_writeback_congested *congested, int sync)
{
wait_queue_head_t *wqh = &congestion_wqh[sync];
- enum wb_state bit;
+ enum wb_congested_state bit;
bit = sync ? WB_sync_congested : WB_async_congested;
if (test_and_clear_bit(bit, &congested->state))
@@ -911,7 +911,7 @@ EXPORT_SYMBOL(clear_wb_congested);
void set_wb_congested(struct bdi_writeback_congested *congested, int sync)
{
- enum wb_state bit;
+ enum wb_congested_state bit;
bit = sync ? WB_sync_congested : WB_async_congested;
if (!test_and_set_bit(bit, &congested->state))
@@ -1026,8 +1026,8 @@ int pdflush_proc_obsolete(struct ctl_table *table, int write,
if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
return -EFAULT;
- printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
- table->procname);
+ pr_warn_once("%s exported in /proc is scheduled for removal\n",
+ table->procname);
*lenp = 2;
*ppos += *lenp;
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 300117f1a..57b3e9bd6 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -13,10 +13,10 @@
/*
* balloon_page_enqueue - allocates a new page and inserts it into the balloon
* page list.
- * @b_dev_info: balloon device decriptor where we will insert a new page to
+ * @b_dev_info: balloon device descriptor where we will insert a new page to
*
* Driver must call it to properly allocate a new enlisted balloon page
- * before definetively removing it from the guest system.
+ * before definitively removing it from the guest system.
* This function returns the page address for the recently enqueued page or
* NULL in the case we fail to allocate a new page this turn.
*/
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 91e32bc85..0aa7dda52 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -50,8 +50,7 @@ early_param("bootmem_debug", bootmem_debug_setup);
#define bdebug(fmt, args...) ({ \
if (unlikely(bootmem_debug)) \
- printk(KERN_INFO \
- "bootmem::%s " fmt, \
+ pr_info("bootmem::%s " fmt, \
__func__, ## args); \
})
@@ -680,7 +679,7 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
/*
* Whoops, we cannot satisfy the allocation request.
*/
- printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ pr_alert("bootmem alloc of %lu bytes failed!\n", size);
panic("Out of memory");
return NULL;
}
@@ -755,7 +754,7 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
if (ptr)
return ptr;
- printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ pr_alert("bootmem alloc of %lu bytes failed!\n", size);
panic("Out of memory");
return NULL;
}
diff --git a/mm/compaction.c b/mm/compaction.c
index 29fb26970..f8e925eb4 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -7,6 +7,7 @@
*
* Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
*/
+#include <linux/cpu.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
@@ -17,6 +18,8 @@
#include <linux/balloon_compaction.h>
#include <linux/page-isolation.h>
#include <linux/kasan.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
#include "internal.h"
#ifdef CONFIG_COMPACTION
@@ -71,49 +74,6 @@ static inline bool migrate_async_suitable(int migratetype)
return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
}
-/*
- * Check that the whole (or subset of) a pageblock given by the interval of
- * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
- * with the migration of free compaction scanner. The scanners then need to
- * use only pfn_valid_within() check for arches that allow holes within
- * pageblocks.
- *
- * Return struct page pointer of start_pfn, or NULL if checks were not passed.
- *
- * It's possible on some configurations to have a setup like node0 node1 node0
- * i.e. it's possible that all pages within a zones range of pages do not
- * belong to a single zone. We assume that a border between node0 and node1
- * can occur within a single pageblock, but not a node0 node1 node0
- * interleaving within a single pageblock. It is therefore sufficient to check
- * the first and last page of a pageblock and avoid checking each individual
- * page in a pageblock.
- */
-static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
- unsigned long end_pfn, struct zone *zone)
-{
- struct page *start_page;
- struct page *end_page;
-
- /* end_pfn is one past the range we are checking */
- end_pfn--;
-
- if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
- return NULL;
-
- start_page = pfn_to_page(start_pfn);
-
- if (page_zone(start_page) != zone)
- return NULL;
-
- end_page = pfn_to_page(end_pfn);
-
- /* This gives a shorter code than deriving page_zone(end_page) */
- if (page_zone_id(start_page) != page_zone_id(end_page))
- return NULL;
-
- return start_page;
-}
-
#ifdef CONFIG_COMPACTION
/* Do not skip compaction more than 64 times */
@@ -200,7 +160,8 @@ static void reset_cached_positions(struct zone *zone)
{
zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
- zone->compact_cached_free_pfn = zone_end_pfn(zone);
+ zone->compact_cached_free_pfn =
+ round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages);
}
/*
@@ -554,13 +515,17 @@ unsigned long
isolate_freepages_range(struct compact_control *cc,
unsigned long start_pfn, unsigned long end_pfn)
{
- unsigned long isolated, pfn, block_end_pfn;
+ unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
LIST_HEAD(freelist);
pfn = start_pfn;
+ block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+ if (block_start_pfn < cc->zone->zone_start_pfn)
+ block_start_pfn = cc->zone->zone_start_pfn;
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
for (; pfn < end_pfn; pfn += isolated,
+ block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
/* Protect pfn from changing by isolate_freepages_block */
unsigned long isolate_start_pfn = pfn;
@@ -573,11 +538,13 @@ isolate_freepages_range(struct compact_control *cc,
* scanning range to right one.
*/
if (pfn >= block_end_pfn) {
+ block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
block_end_pfn = min(block_end_pfn, end_pfn);
}
- if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+ if (!pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, cc->zone))
break;
isolated = isolate_freepages_block(cc, &isolate_start_pfn,
@@ -863,18 +830,23 @@ unsigned long
isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
unsigned long end_pfn)
{
- unsigned long pfn, block_end_pfn;
+ unsigned long pfn, block_start_pfn, block_end_pfn;
/* Scan block by block. First and last block may be incomplete */
pfn = start_pfn;
+ block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+ if (block_start_pfn < cc->zone->zone_start_pfn)
+ block_start_pfn = cc->zone->zone_start_pfn;
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
for (; pfn < end_pfn; pfn = block_end_pfn,
+ block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
block_end_pfn = min(block_end_pfn, end_pfn);
- if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+ if (!pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, cc->zone))
continue;
pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
@@ -1095,7 +1067,9 @@ int sysctl_compact_unevictable_allowed __read_mostly = 1;
static isolate_migrate_t isolate_migratepages(struct zone *zone,
struct compact_control *cc)
{
- unsigned long low_pfn, end_pfn;
+ unsigned long block_start_pfn;
+ unsigned long block_end_pfn;
+ unsigned long low_pfn;
unsigned long isolate_start_pfn;
struct page *page;
const isolate_mode_t isolate_mode =
@@ -1107,16 +1081,21 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
* initialized by compact_zone()
*/
low_pfn = cc->migrate_pfn;
+ block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1);
+ if (block_start_pfn < zone->zone_start_pfn)
+ block_start_pfn = zone->zone_start_pfn;
/* Only scan within a pageblock boundary */
- end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
+ block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
/*
* Iterate over whole pageblocks until we find the first suitable.
* Do not cross the free scanner.
*/
- for (; end_pfn <= cc->free_pfn;
- low_pfn = end_pfn, end_pfn += pageblock_nr_pages) {
+ for (; block_end_pfn <= cc->free_pfn;
+ low_pfn = block_end_pfn,
+ block_start_pfn = block_end_pfn,
+ block_end_pfn += pageblock_nr_pages) {
/*
* This can potentially iterate a massively long zone with
@@ -1127,7 +1106,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
&& compact_should_abort(cc))
break;
- page = pageblock_pfn_to_page(low_pfn, end_pfn, zone);
+ page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
+ zone);
if (!page)
continue;
@@ -1146,8 +1126,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
/* Perform the isolation */
isolate_start_pfn = low_pfn;
- low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
- isolate_mode);
+ low_pfn = isolate_migratepages_block(cc, low_pfn,
+ block_end_pfn, isolate_mode);
if (!low_pfn || cc->contended) {
acct_isolated(zone, cc);
@@ -1203,11 +1183,11 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
/*
* Mark that the PG_migrate_skip information should be cleared
- * by kswapd when it goes to sleep. kswapd does not set the
+ * by kswapd when it goes to sleep. kcompactd does not set the
* flag itself as the decision to be clear should be directly
* based on an allocation request.
*/
- if (!current_is_kswapd())
+ if (cc->direct_compaction)
zone->compact_blockskip_flush = true;
return COMPACT_COMPLETE;
@@ -1350,10 +1330,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
/*
* Clear pageblock skip if there were failures recently and compaction
- * is about to be retried after being deferred. kswapd does not do
- * this reset as it'll reset the cached information when going to sleep.
+ * is about to be retried after being deferred.
*/
- if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+ if (compaction_restarting(zone, cc->order))
__reset_isolation_suitable(zone);
/*
@@ -1363,11 +1342,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
*/
cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
cc->free_pfn = zone->compact_cached_free_pfn;
- if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
- cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
+ if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
+ cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages);
zone->compact_cached_free_pfn = cc->free_pfn;
}
- if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
+ if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
cc->migrate_pfn = start_pfn;
zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
@@ -1489,6 +1468,7 @@ static unsigned long compact_zone_order(struct zone *zone, int order,
.mode = mode,
.alloc_flags = alloc_flags,
.classzone_idx = classzone_idx,
+ .direct_compaction = true,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
@@ -1751,4 +1731,225 @@ void compaction_unregister_node(struct node *node)
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */
+static inline bool kcompactd_work_requested(pg_data_t *pgdat)
+{
+ return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
+}
+
+static bool kcompactd_node_suitable(pg_data_t *pgdat)
+{
+ int zoneid;
+ struct zone *zone;
+ enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;
+
+ for (zoneid = 0; zoneid <= classzone_idx; zoneid++) {
+ zone = &pgdat->node_zones[zoneid];
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
+ classzone_idx) == COMPACT_CONTINUE)
+ return true;
+ }
+
+ return false;
+}
+
+static void kcompactd_do_work(pg_data_t *pgdat)
+{
+ /*
+ * With no special task, compact all zones so that a page of requested
+ * order is allocatable.
+ */
+ int zoneid;
+ struct zone *zone;
+ struct compact_control cc = {
+ .order = pgdat->kcompactd_max_order,
+ .classzone_idx = pgdat->kcompactd_classzone_idx,
+ .mode = MIGRATE_SYNC_LIGHT,
+ .ignore_skip_hint = true,
+
+ };
+ bool success = false;
+
+ trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
+ cc.classzone_idx);
+ count_vm_event(KCOMPACTD_WAKE);
+
+ for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) {
+ int status;
+
+ zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ if (compaction_deferred(zone, cc.order))
+ continue;
+
+ if (compaction_suitable(zone, cc.order, 0, zoneid) !=
+ COMPACT_CONTINUE)
+ continue;
+
+ cc.nr_freepages = 0;
+ cc.nr_migratepages = 0;
+ cc.zone = zone;
+ INIT_LIST_HEAD(&cc.freepages);
+ INIT_LIST_HEAD(&cc.migratepages);
+
+ if (kthread_should_stop())
+ return;
+ status = compact_zone(zone, &cc);
+
+ if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone),
+ cc.classzone_idx, 0)) {
+ success = true;
+ compaction_defer_reset(zone, cc.order, false);
+ } else if (status == COMPACT_COMPLETE) {
+ /*
+ * We use sync migration mode here, so we defer like
+ * sync direct compaction does.
+ */
+ defer_compaction(zone, cc.order);
+ }
+
+ VM_BUG_ON(!list_empty(&cc.freepages));
+ VM_BUG_ON(!list_empty(&cc.migratepages));
+ }
+
+ /*
+ * Regardless of success, we are done until woken up next. But remember
+ * the requested order/classzone_idx in case it was higher/tighter than
+ * our current ones
+ */
+ if (pgdat->kcompactd_max_order <= cc.order)
+ pgdat->kcompactd_max_order = 0;
+ if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx)
+ pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+}
+
+void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
+{
+ if (!order)
+ return;
+
+ if (pgdat->kcompactd_max_order < order)
+ pgdat->kcompactd_max_order = order;
+
+ if (pgdat->kcompactd_classzone_idx > classzone_idx)
+ pgdat->kcompactd_classzone_idx = classzone_idx;
+
+ if (!waitqueue_active(&pgdat->kcompactd_wait))
+ return;
+
+ if (!kcompactd_node_suitable(pgdat))
+ return;
+
+ trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
+ classzone_idx);
+ wake_up_interruptible(&pgdat->kcompactd_wait);
+}
+
+/*
+ * The background compaction daemon, started as a kernel thread
+ * from the init process.
+ */
+static int kcompactd(void *p)
+{
+ pg_data_t *pgdat = (pg_data_t*)p;
+ struct task_struct *tsk = current;
+
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+
+ if (!cpumask_empty(cpumask))
+ set_cpus_allowed_ptr(tsk, cpumask);
+
+ set_freezable();
+
+ pgdat->kcompactd_max_order = 0;
+ pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+
+ while (!kthread_should_stop()) {
+ trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
+ wait_event_freezable(pgdat->kcompactd_wait,
+ kcompactd_work_requested(pgdat));
+
+ kcompactd_do_work(pgdat);
+ }
+
+ return 0;
+}
+
+/*
+ * This kcompactd start function will be called by init and node-hot-add.
+ * On node-hot-add, kcompactd is moved to proper cpus if cpus are hot-added.
+ */
+int kcompactd_run(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+ int ret = 0;
+
+ if (pgdat->kcompactd)
+ return 0;
+
+ pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
+ if (IS_ERR(pgdat->kcompactd)) {
+ pr_err("Failed to start kcompactd on node %d\n", nid);
+ ret = PTR_ERR(pgdat->kcompactd);
+ pgdat->kcompactd = NULL;
+ }
+ return ret;
+}
+
+/*
+ * Called by memory hotplug when all memory in a node is offlined. Caller must
+ * hold mem_hotplug_begin/end().
+ */
+void kcompactd_stop(int nid)
+{
+ struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;
+
+ if (kcompactd) {
+ kthread_stop(kcompactd);
+ NODE_DATA(nid)->kcompactd = NULL;
+ }
+}
+
+/*
+ * It's optimal to keep kcompactd on the same CPUs as their memory, but
+ * not required for correctness. So if the last cpu in a node goes
+ * away, we get changed to run anywhere: as the first one comes back,
+ * restore their cpu bindings.
+ */
+static int cpu_callback(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
+{
+ int nid;
+
+ if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
+ for_each_node_state(nid, N_MEMORY) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+ const struct cpumask *mask;
+
+ mask = cpumask_of_node(pgdat->node_id);
+
+ if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
+ /* One of our CPUs online: restore mask */
+ set_cpus_allowed_ptr(pgdat->kcompactd, mask);
+ }
+ }
+ return NOTIFY_OK;
+}
+
+static int __init kcompactd_init(void)
+{
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY)
+ kcompactd_run(nid);
+ hotcpu_notifier(cpu_callback, 0);
+ return 0;
+}
+subsys_initcall(kcompactd_init)
+
#endif /* CONFIG_COMPACTION */
diff --git a/mm/debug.c b/mm/debug.c
index 5c6da0ffd..8865bfb41 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -9,97 +9,52 @@
#include <linux/mm.h>
#include <linux/trace_events.h>
#include <linux/memcontrol.h>
-
-static const struct trace_print_flags pageflag_names[] = {
- {1UL << PG_locked, "locked" },
- {1UL << PG_error, "error" },
- {1UL << PG_referenced, "referenced" },
- {1UL << PG_uptodate, "uptodate" },
- {1UL << PG_dirty, "dirty" },
- {1UL << PG_lru, "lru" },
- {1UL << PG_active, "active" },
- {1UL << PG_slab, "slab" },
- {1UL << PG_owner_priv_1, "owner_priv_1" },
- {1UL << PG_arch_1, "arch_1" },
- {1UL << PG_reserved, "reserved" },
- {1UL << PG_private, "private" },
- {1UL << PG_private_2, "private_2" },
- {1UL << PG_writeback, "writeback" },
- {1UL << PG_head, "head" },
- {1UL << PG_swapcache, "swapcache" },
- {1UL << PG_mappedtodisk, "mappedtodisk" },
- {1UL << PG_reclaim, "reclaim" },
- {1UL << PG_swapbacked, "swapbacked" },
- {1UL << PG_unevictable, "unevictable" },
-#ifdef CONFIG_MMU
- {1UL << PG_mlocked, "mlocked" },
-#endif
-#ifdef CONFIG_ARCH_USES_PG_UNCACHED
- {1UL << PG_uncached, "uncached" },
-#endif
-#ifdef CONFIG_MEMORY_FAILURE
- {1UL << PG_hwpoison, "hwpoison" },
-#endif
-#ifdef CONFIG_TOI_INCREMENTAL
- {1UL << PG_toi_untracked, "toi_untracked" },
- {1UL << PG_toi_ro, "toi_ro" },
- {1UL << PG_toi_cbw, "toi_cbw" },
- {1UL << PG_toi_dirty, "toi_dirty" },
-#endif
-#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
- {1UL << PG_young, "young" },
- {1UL << PG_idle, "idle" },
-#endif
+#include <trace/events/mmflags.h>
+#include <linux/migrate.h>
+#include <linux/page_owner.h>
+
+#include "internal.h"
+
+char *migrate_reason_names[MR_TYPES] = {
+ "compaction",
+ "memory_failure",
+ "memory_hotplug",
+ "syscall_or_cpuset",
+ "mempolicy_mbind",
+ "numa_misplaced",
+ "cma",
};
-static void dump_flags(unsigned long flags,
- const struct trace_print_flags *names, int count)
-{
- const char *delim = "";
- unsigned long mask;
- int i;
-
- pr_emerg("flags: %#lx(", flags);
-
- /* remove zone id */
- flags &= (1UL << NR_PAGEFLAGS) - 1;
-
- for (i = 0; i < count && flags; i++) {
-
- mask = names[i].mask;
- if ((flags & mask) != mask)
- continue;
-
- flags &= ~mask;
- pr_cont("%s%s", delim, names[i].name);
- delim = "|";
- }
+const struct trace_print_flags pageflag_names[] = {
+ __def_pageflag_names,
+ {0, NULL}
+};
- /* check for left over flags */
- if (flags)
- pr_cont("%s%#lx", delim, flags);
+const struct trace_print_flags gfpflag_names[] = {
+ __def_gfpflag_names,
+ {0, NULL}
+};
- pr_cont(")\n");
-}
+const struct trace_print_flags vmaflag_names[] = {
+ __def_vmaflag_names,
+ {0, NULL}
+};
-void dump_page_badflags(struct page *page, const char *reason,
- unsigned long badflags)
+void __dump_page(struct page *page, const char *reason)
{
pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
- page, atomic_read(&page->_count), page_mapcount(page),
+ page, page_ref_count(page), page_mapcount(page),
page->mapping, page->index);
if (PageCompound(page))
pr_cont(" compound_mapcount: %d", compound_mapcount(page));
pr_cont("\n");
- BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
- dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
+ BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
+
+ pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags);
+
if (reason)
pr_alert("page dumped because: %s\n", reason);
- if (page->flags & badflags) {
- pr_alert("bad because of flags:\n");
- dump_flags(page->flags & badflags,
- pageflag_names, ARRAY_SIZE(pageflag_names));
- }
+
#ifdef CONFIG_MEMCG
if (page->mem_cgroup)
pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
@@ -108,67 +63,26 @@ void dump_page_badflags(struct page *page, const char *reason,
void dump_page(struct page *page, const char *reason)
{
- dump_page_badflags(page, reason, 0);
+ __dump_page(page, reason);
+ dump_page_owner(page);
}
EXPORT_SYMBOL(dump_page);
#ifdef CONFIG_DEBUG_VM
-static const struct trace_print_flags vmaflags_names[] = {
- {VM_READ, "read" },
- {VM_WRITE, "write" },
- {VM_EXEC, "exec" },
- {VM_SHARED, "shared" },
- {VM_MAYREAD, "mayread" },
- {VM_MAYWRITE, "maywrite" },
- {VM_MAYEXEC, "mayexec" },
- {VM_MAYSHARE, "mayshare" },
- {VM_GROWSDOWN, "growsdown" },
- {VM_PFNMAP, "pfnmap" },
- {VM_DENYWRITE, "denywrite" },
- {VM_LOCKONFAULT, "lockonfault" },
- {VM_LOCKED, "locked" },
- {VM_IO, "io" },
- {VM_SEQ_READ, "seqread" },
- {VM_RAND_READ, "randread" },
- {VM_DONTCOPY, "dontcopy" },
- {VM_DONTEXPAND, "dontexpand" },
- {VM_ACCOUNT, "account" },
- {VM_NORESERVE, "noreserve" },
- {VM_HUGETLB, "hugetlb" },
-#if defined(CONFIG_X86)
- {VM_PAT, "pat" },
-#elif defined(CONFIG_PPC)
- {VM_SAO, "sao" },
-#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64)
- {VM_GROWSUP, "growsup" },
-#elif !defined(CONFIG_MMU)
- {VM_MAPPED_COPY, "mappedcopy" },
-#else
- {VM_ARCH_1, "arch_1" },
-#endif
- {VM_DONTDUMP, "dontdump" },
-#ifdef CONFIG_MEM_SOFT_DIRTY
- {VM_SOFTDIRTY, "softdirty" },
-#endif
- {VM_MIXEDMAP, "mixedmap" },
- {VM_HUGEPAGE, "hugepage" },
- {VM_NOHUGEPAGE, "nohugepage" },
- {VM_MERGEABLE, "mergeable" },
-};
-
void dump_vma(const struct vm_area_struct *vma)
{
pr_emerg("vma %p start %p end %p\n"
"next %p prev %p mm %p\n"
"prot %lx anon_vma %p vm_ops %p\n"
- "pgoff %lx file %p private_data %p\n",
+ "pgoff %lx file %p private_data %p\n"
+ "flags: %#lx(%pGv)\n",
vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
vma->vm_prev, vma->vm_mm,
(unsigned long)pgprot_val(vma->vm_page_prot),
vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
- vma->vm_file, vma->vm_private_data);
- dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names));
+ vma->vm_file, vma->vm_private_data,
+ vma->vm_flags, &vma->vm_flags);
}
EXPORT_SYMBOL(dump_vma);
@@ -202,7 +116,7 @@ void dump_mm(const struct mm_struct *mm)
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
"tlb_flush_pending %d\n"
#endif
- "%s", /* This is here to hold the comma */
+ "def_flags: %#lx(%pGv)\n",
mm, mm->mmap, mm->vmacache_seqnum, mm->task_size,
#ifdef CONFIG_MMU
@@ -236,11 +150,8 @@ void dump_mm(const struct mm_struct *mm)
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
mm->tlb_flush_pending,
#endif
- "" /* This is here to not have a comma! */
- );
-
- dump_flags(mm->def_flags, vmaflags_names,
- ARRAY_SIZE(vmaflags_names));
+ mm->def_flags, &mm->def_flags
+ );
}
#endif /* CONFIG_DEBUG_VM */
diff --git a/mm/debug_page_ref.c b/mm/debug_page_ref.c
new file mode 100644
index 000000000..1aef3d562
--- /dev/null
+++ b/mm/debug_page_ref.c
@@ -0,0 +1,54 @@
+#include <linux/mm_types.h>
+#include <linux/tracepoint.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/page_ref.h>
+
+void __page_ref_set(struct page *page, int v)
+{
+ trace_page_ref_set(page, v);
+}
+EXPORT_SYMBOL(__page_ref_set);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_set);
+
+void __page_ref_mod(struct page *page, int v)
+{
+ trace_page_ref_mod(page, v);
+}
+EXPORT_SYMBOL(__page_ref_mod);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_mod);
+
+void __page_ref_mod_and_test(struct page *page, int v, int ret)
+{
+ trace_page_ref_mod_and_test(page, v, ret);
+}
+EXPORT_SYMBOL(__page_ref_mod_and_test);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_and_test);
+
+void __page_ref_mod_and_return(struct page *page, int v, int ret)
+{
+ trace_page_ref_mod_and_return(page, v, ret);
+}
+EXPORT_SYMBOL(__page_ref_mod_and_return);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_and_return);
+
+void __page_ref_mod_unless(struct page *page, int v, int u)
+{
+ trace_page_ref_mod_unless(page, v, u);
+}
+EXPORT_SYMBOL(__page_ref_mod_unless);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_unless);
+
+void __page_ref_freeze(struct page *page, int v, int ret)
+{
+ trace_page_ref_freeze(page, v, ret);
+}
+EXPORT_SYMBOL(__page_ref_freeze);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_freeze);
+
+void __page_ref_unfreeze(struct page *page, int v)
+{
+ trace_page_ref_unfreeze(page, v);
+}
+EXPORT_SYMBOL(__page_ref_unfreeze);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_unfreeze);
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 57312b5d6..abcbfe86c 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -294,8 +294,7 @@ void dma_pool_destroy(struct dma_pool *pool)
"dma_pool_destroy %s, %p busy\n",
pool->name, page->vaddr);
else
- printk(KERN_ERR
- "dma_pool_destroy %s, %p busy\n",
+ pr_err("dma_pool_destroy %s, %p busy\n",
pool->name, page->vaddr);
/* leak the still-in-use consistent memory */
list_del(&page->page_list);
@@ -424,7 +423,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
"dma_pool_free %s, %p/%lx (bad dma)\n",
pool->name, vaddr, (unsigned long)dma);
else
- printk(KERN_ERR "dma_pool_free %s, %p/%lx (bad dma)\n",
+ pr_err("dma_pool_free %s, %p/%lx (bad dma)\n",
pool->name, vaddr, (unsigned long)dma);
return;
}
@@ -438,8 +437,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
"dma_pool_free %s, %p (bad vaddr)/%Lx\n",
pool->name, vaddr, (unsigned long long)dma);
else
- printk(KERN_ERR
- "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
+ pr_err("dma_pool_free %s, %p (bad vaddr)/%Lx\n",
pool->name, vaddr, (unsigned long long)dma);
return;
}
@@ -452,13 +450,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
}
spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
- dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
- "already free\n", pool->name,
- (unsigned long long)dma);
+ dev_err(pool->dev, "dma_pool_free %s, dma %Lx already free\n",
+ pool->name, (unsigned long long)dma);
else
- printk(KERN_ERR "dma_pool_free %s, dma %Lx "
- "already free\n", pool->name,
- (unsigned long long)dma);
+ pr_err("dma_pool_free %s, dma %Lx already free\n",
+ pool->name, (unsigned long long)dma);
return;
}
}
diff --git a/mm/fadvise.c b/mm/fadvise.c
index b8a5bc66b..b8024fa71 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -97,8 +97,8 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
break;
case POSIX_FADV_WILLNEED:
/* First and last PARTIAL page! */
- start_index = offset >> PAGE_CACHE_SHIFT;
- end_index = endbyte >> PAGE_CACHE_SHIFT;
+ start_index = offset >> PAGE_SHIFT;
+ end_index = endbyte >> PAGE_SHIFT;
/* Careful about overflow on the "+1" */
nrpages = end_index - start_index + 1;
@@ -124,8 +124,8 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
* preserved on the expectation that it is better to preserve
* needed memory than to discard unneeded memory.
*/
- start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
- end_index = (endbyte >> PAGE_CACHE_SHIFT);
+ start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT;
+ end_index = (endbyte >> PAGE_SHIFT);
if (end_index >= start_index) {
unsigned long count = invalidate_mapping_pages(mapping,
diff --git a/mm/failslab.c b/mm/failslab.c
index 79171b4a5..b0fac98cd 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,5 +1,7 @@
#include <linux/fault-inject.h>
#include <linux/slab.h>
+#include <linux/mm.h>
+#include "slab.h"
static struct {
struct fault_attr attr;
@@ -11,18 +13,22 @@ static struct {
.cache_filter = false,
};
-bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags)
+bool should_failslab(struct kmem_cache *s, gfp_t gfpflags)
{
+ /* No fault-injection for bootstrap cache */
+ if (unlikely(s == kmem_cache))
+ return false;
+
if (gfpflags & __GFP_NOFAIL)
return false;
if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM))
return false;
- if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB))
+ if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB))
return false;
- return should_fail(&failslab.attr, size);
+ return should_fail(&failslab.attr, s->object_size);
}
static int __init setup_failslab(char *str)
diff --git a/mm/filemap.c b/mm/filemap.c
index f800f8749..31f4b0d40 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -101,7 +101,7 @@
* ->tree_lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
* ->inode->i_lock (page_remove_rmap->set_page_dirty)
- * ->memcg->move_lock (page_remove_rmap->mem_cgroup_begin_page_stat)
+ * ->memcg->move_lock (page_remove_rmap->lock_page_memcg)
* bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
@@ -176,11 +176,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
/*
* Delete a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
- * is safe. The caller must hold the mapping's tree_lock and
- * mem_cgroup_begin_page_stat().
+ * is safe. The caller must hold the mapping's tree_lock.
*/
-void __delete_from_page_cache(struct page *page, void *shadow,
- struct mem_cgroup *memcg)
+void __delete_from_page_cache(struct page *page, void *shadow)
{
struct address_space *mapping = page->mapping;
@@ -239,8 +237,7 @@ void __delete_from_page_cache(struct page *page, void *shadow,
* anyway will be cleared before returning page into buddy allocator.
*/
if (WARN_ON_ONCE(PageDirty(page)))
- account_page_cleaned(page, mapping, memcg,
- inode_to_wb(mapping->host));
+ account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
}
/**
@@ -254,7 +251,6 @@ void __delete_from_page_cache(struct page *page, void *shadow,
void delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
- struct mem_cgroup *memcg;
unsigned long flags;
void (*freepage)(struct page *);
@@ -263,15 +259,13 @@ void delete_from_page_cache(struct page *page)
freepage = mapping->a_ops->freepage;
- memcg = mem_cgroup_begin_page_stat(page);
spin_lock_irqsave(&mapping->tree_lock, flags);
- __delete_from_page_cache(page, NULL, memcg);
+ __delete_from_page_cache(page, NULL);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
if (freepage)
freepage(page);
- page_cache_release(page);
+ put_page(page);
}
EXPORT_SYMBOL(delete_from_page_cache);
@@ -358,8 +352,8 @@ EXPORT_SYMBOL(filemap_flush);
static int __filemap_fdatawait_range(struct address_space *mapping,
loff_t start_byte, loff_t end_byte)
{
- pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
- pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
+ pgoff_t index = start_byte >> PAGE_SHIFT;
+ pgoff_t end = end_byte >> PAGE_SHIFT;
struct pagevec pvec;
int nr_pages;
int ret = 0;
@@ -551,19 +545,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
if (!error) {
struct address_space *mapping = old->mapping;
void (*freepage)(struct page *);
- struct mem_cgroup *memcg;
unsigned long flags;
pgoff_t offset = old->index;
freepage = mapping->a_ops->freepage;
- page_cache_get(new);
+ get_page(new);
new->mapping = mapping;
new->index = offset;
- memcg = mem_cgroup_begin_page_stat(old);
spin_lock_irqsave(&mapping->tree_lock, flags);
- __delete_from_page_cache(old, NULL, memcg);
+ __delete_from_page_cache(old, NULL);
error = radix_tree_insert(&mapping->page_tree, offset, new);
BUG_ON(error);
mapping->nrpages++;
@@ -576,12 +568,11 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
if (PageSwapBacked(new))
__inc_zone_page_state(new, NR_SHMEM);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
- mem_cgroup_replace_page(old, new);
+ mem_cgroup_migrate(old, new);
radix_tree_preload_end();
if (freepage)
freepage(old);
- page_cache_release(old);
+ put_page(old);
}
return error;
@@ -595,7 +586,7 @@ static int page_cache_tree_insert(struct address_space *mapping,
void **slot;
int error;
- error = __radix_tree_create(&mapping->page_tree, page->index,
+ error = __radix_tree_create(&mapping->page_tree, page->index, 0,
&node, &slot);
if (error)
return error;
@@ -660,7 +651,7 @@ static int __add_to_page_cache_locked(struct page *page,
return error;
}
- page_cache_get(page);
+ get_page(page);
page->mapping = mapping;
page->index = offset;
@@ -684,7 +675,7 @@ err_insert:
spin_unlock_irq(&mapping->tree_lock);
if (!huge)
mem_cgroup_cancel_charge(page, memcg, false);
- page_cache_release(page);
+ put_page(page);
return error;
}
@@ -1092,7 +1083,7 @@ repeat:
* include/linux/pagemap.h for details.
*/
if (unlikely(page != *pagep)) {
- page_cache_release(page);
+ put_page(page);
goto repeat;
}
}
@@ -1130,7 +1121,7 @@ repeat:
/* Has the page been truncated? */
if (unlikely(page->mapping != mapping)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
goto repeat;
}
VM_BUG_ON_PAGE(page->index != offset, page);
@@ -1177,7 +1168,7 @@ repeat:
if (fgp_flags & FGP_LOCK) {
if (fgp_flags & FGP_NOWAIT) {
if (!trylock_page(page)) {
- page_cache_release(page);
+ put_page(page);
return NULL;
}
} else {
@@ -1187,7 +1178,7 @@ repeat:
/* Has the page been truncated? */
if (unlikely(page->mapping != mapping)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
goto repeat;
}
VM_BUG_ON_PAGE(page->index != offset, page);
@@ -1218,7 +1209,7 @@ no_page:
err = add_to_page_cache_lru(page, mapping, offset,
gfp_mask & GFP_RECLAIM_MASK);
if (unlikely(err)) {
- page_cache_release(page);
+ put_page(page);
page = NULL;
if (err == -EEXIST)
goto repeat;
@@ -1264,7 +1255,6 @@ unsigned find_get_entries(struct address_space *mapping,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
struct page *page;
repeat:
@@ -1272,8 +1262,10 @@ repeat:
if (unlikely(!page))
continue;
if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
/*
* A shadow entry of a recently evicted page, a swap
* entry from shmem/tmpfs or a DAX entry. Return it
@@ -1286,7 +1278,7 @@ repeat:
/* Has the page moved? */
if (unlikely(page != *slot)) {
- page_cache_release(page);
+ put_page(page);
goto repeat;
}
export:
@@ -1326,7 +1318,6 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
struct page *page;
repeat:
@@ -1336,13 +1327,8 @@ repeat:
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- WARN_ON(iter.index);
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
* A shadow entry of a recently evicted page,
@@ -1357,7 +1343,7 @@ repeat:
/* Has the page moved? */
if (unlikely(page != *slot)) {
- page_cache_release(page);
+ put_page(page);
goto repeat;
}
@@ -1393,7 +1379,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
struct page *page;
repeat:
@@ -1404,12 +1389,8 @@ repeat:
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
* A shadow entry of a recently evicted page,
@@ -1424,7 +1405,7 @@ repeat:
/* Has the page moved? */
if (unlikely(page != *slot)) {
- page_cache_release(page);
+ put_page(page);
goto repeat;
}
@@ -1434,7 +1415,7 @@ repeat:
* negatives, which is just confusing to the caller.
*/
if (page->mapping == NULL || page->index != iter.index) {
- page_cache_release(page);
+ put_page(page);
break;
}
@@ -1469,7 +1450,6 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_tagged(slot, &mapping->page_tree,
&iter, *index, tag) {
struct page *page;
@@ -1480,12 +1460,8 @@ repeat:
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
* A shadow entry of a recently evicted page.
@@ -1506,7 +1482,7 @@ repeat:
/* Has the page moved? */
if (unlikely(page != *slot)) {
- page_cache_release(page);
+ put_page(page);
goto repeat;
}
@@ -1548,7 +1524,6 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_tagged(slot, &mapping->page_tree,
&iter, start, tag) {
struct page *page;
@@ -1558,12 +1533,8 @@ repeat:
continue;
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
@@ -1578,7 +1549,7 @@ repeat:
/* Has the page moved? */
if (unlikely(page != *slot)) {
- page_cache_release(page);
+ put_page(page);
goto repeat;
}
export:
@@ -1639,11 +1610,11 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
unsigned int prev_offset;
int error = 0;
- index = *ppos >> PAGE_CACHE_SHIFT;
- prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
- prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
- last_index = (*ppos + iter->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
- offset = *ppos & ~PAGE_CACHE_MASK;
+ index = *ppos >> PAGE_SHIFT;
+ prev_index = ra->prev_pos >> PAGE_SHIFT;
+ prev_offset = ra->prev_pos & (PAGE_SIZE-1);
+ last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
+ offset = *ppos & ~PAGE_MASK;
for (;;) {
struct page *page;
@@ -1668,7 +1639,16 @@ find_page:
index, last_index - index);
}
if (!PageUptodate(page)) {
- if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
+ /*
+ * See comment in do_read_cache_page on why
+ * wait_on_page_locked is used to avoid unnecessary
+ * serialisations and why it's safe.
+ */
+ wait_on_page_locked_killable(page);
+ if (PageUptodate(page))
+ goto page_ok;
+
+ if (inode->i_blkbits == PAGE_SHIFT ||
!mapping->a_ops->is_partially_uptodate)
goto page_not_up_to_date;
if (!trylock_page(page))
@@ -1692,18 +1672,18 @@ page_ok:
*/
isize = i_size_read(inode);
- end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ end_index = (isize - 1) >> PAGE_SHIFT;
if (unlikely(!isize || index > end_index)) {
- page_cache_release(page);
+ put_page(page);
goto out;
}
/* nr is the maximum number of bytes to copy from this page */
- nr = PAGE_CACHE_SIZE;
+ nr = PAGE_SIZE;
if (index == end_index) {
- nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+ nr = ((isize - 1) & ~PAGE_MASK) + 1;
if (nr <= offset) {
- page_cache_release(page);
+ put_page(page);
goto out;
}
}
@@ -1731,11 +1711,11 @@ page_ok:
ret = copy_page_to_iter(page, offset, nr, iter);
offset += ret;
- index += offset >> PAGE_CACHE_SHIFT;
- offset &= ~PAGE_CACHE_MASK;
+ index += offset >> PAGE_SHIFT;
+ offset &= ~PAGE_MASK;
prev_offset = offset;
- page_cache_release(page);
+ put_page(page);
written += ret;
if (!iov_iter_count(iter))
goto out;
@@ -1755,7 +1735,7 @@ page_not_up_to_date_locked:
/* Did it get truncated before we got the lock? */
if (!page->mapping) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
continue;
}
@@ -1777,7 +1757,7 @@ readpage:
if (unlikely(error)) {
if (error == AOP_TRUNCATED_PAGE) {
- page_cache_release(page);
+ put_page(page);
error = 0;
goto find_page;
}
@@ -1794,7 +1774,7 @@ readpage:
* invalidate_mapping_pages got it
*/
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
goto find_page;
}
unlock_page(page);
@@ -1809,7 +1789,7 @@ readpage:
readpage_error:
/* UHHUH! A synchronous read error occurred. Report it */
- page_cache_release(page);
+ put_page(page);
goto out;
no_cached_page:
@@ -1825,7 +1805,7 @@ no_cached_page:
error = add_to_page_cache_lru(page, mapping, index,
mapping_gfp_constraint(mapping, GFP_KERNEL));
if (error) {
- page_cache_release(page);
+ put_page(page);
if (error == -EEXIST) {
error = 0;
goto find_page;
@@ -1837,10 +1817,10 @@ no_cached_page:
out:
ra->prev_pos = prev_index;
- ra->prev_pos <<= PAGE_CACHE_SHIFT;
+ ra->prev_pos <<= PAGE_SHIFT;
ra->prev_pos |= prev_offset;
- *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
+ *ppos = ((loff_t)index << PAGE_SHIFT) + offset;
file_accessed(filp);
return written ? written : error;
}
@@ -1860,15 +1840,16 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
ssize_t retval = 0;
loff_t *ppos = &iocb->ki_pos;
loff_t pos = *ppos;
+ size_t count = iov_iter_count(iter);
+
+ if (!count)
+ goto out; /* skip atime */
if (iocb->ki_flags & IOCB_DIRECT) {
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- size_t count = iov_iter_count(iter);
loff_t size;
- if (!count)
- goto out; /* skip atime */
size = i_size_read(inode);
retval = filemap_write_and_wait_range(mapping, pos,
pos + count - 1);
@@ -1931,7 +1912,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
else if (ret == -EEXIST)
ret = 0; /* losing race to add is OK */
- page_cache_release(page);
+ put_page(page);
} while (ret == AOP_TRUNCATED_PAGE);
@@ -2041,8 +2022,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
loff_t size;
int ret = 0;
- size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
- if (offset >= size >> PAGE_CACHE_SHIFT)
+ size = round_up(i_size_read(inode), PAGE_SIZE);
+ if (offset >= size >> PAGE_SHIFT)
return VM_FAULT_SIGBUS;
/*
@@ -2068,7 +2049,7 @@ retry_find:
}
if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
- page_cache_release(page);
+ put_page(page);
return ret | VM_FAULT_RETRY;
}
@@ -2091,10 +2072,10 @@ retry_find:
* Found the page and have a reference on it.
* We must recheck i_size under page lock.
*/
- size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
- if (unlikely(offset >= size >> PAGE_CACHE_SHIFT)) {
+ size = round_up(i_size_read(inode), PAGE_SIZE);
+ if (unlikely(offset >= size >> PAGE_SHIFT)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return VM_FAULT_SIGBUS;
}
@@ -2139,7 +2120,7 @@ page_not_uptodate:
if (!PageUptodate(page))
error = -EIO;
}
- page_cache_release(page);
+ put_page(page);
if (!error || error == AOP_TRUNCATED_PAGE)
goto retry_find;
@@ -2171,10 +2152,11 @@ repeat:
if (unlikely(!page))
goto next;
if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- break;
- else
- goto next;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+ goto next;
}
if (!page_cache_get_speculative(page))
@@ -2182,7 +2164,7 @@ repeat:
/* Has the page moved? */
if (unlikely(page != *slot)) {
- page_cache_release(page);
+ put_page(page);
goto repeat;
}
@@ -2196,8 +2178,8 @@ repeat:
if (page->mapping != mapping || !PageUptodate(page))
goto unlock;
- size = round_up(i_size_read(mapping->host), PAGE_CACHE_SIZE);
- if (page->index >= size >> PAGE_CACHE_SHIFT)
+ size = round_up(i_size_read(mapping->host), PAGE_SIZE);
+ if (page->index >= size >> PAGE_SHIFT)
goto unlock;
pte = vmf->pte + page->index - vmf->pgoff;
@@ -2213,7 +2195,7 @@ repeat:
unlock:
unlock_page(page);
skip:
- page_cache_release(page);
+ put_page(page);
next:
if (iter.index == vmf->max_pgoff)
break;
@@ -2296,14 +2278,14 @@ static struct page *wait_on_page_read(struct page *page)
if (!IS_ERR(page)) {
wait_on_page_locked(page);
if (!PageUptodate(page)) {
- page_cache_release(page);
+ put_page(page);
page = ERR_PTR(-EIO);
}
}
return page;
}
-static struct page *__read_cache_page(struct address_space *mapping,
+static struct page *do_read_cache_page(struct address_space *mapping,
pgoff_t index,
int (*filler)(void *, struct page *),
void *data,
@@ -2319,59 +2301,80 @@ repeat:
return ERR_PTR(-ENOMEM);
err = add_to_page_cache_lru(page, mapping, index, gfp);
if (unlikely(err)) {
- page_cache_release(page);
+ put_page(page);
if (err == -EEXIST)
goto repeat;
/* Presumably ENOMEM for radix tree node */
return ERR_PTR(err);
}
+
+filler:
err = filler(data, page);
if (err < 0) {
- page_cache_release(page);
- page = ERR_PTR(err);
- } else {
- page = wait_on_page_read(page);
+ put_page(page);
+ return ERR_PTR(err);
}
- }
- return page;
-}
-
-static struct page *do_read_cache_page(struct address_space *mapping,
- pgoff_t index,
- int (*filler)(void *, struct page *),
- void *data,
- gfp_t gfp)
-{
- struct page *page;
- int err;
+ page = wait_on_page_read(page);
+ if (IS_ERR(page))
+ return page;
+ goto out;
+ }
+ if (PageUptodate(page))
+ goto out;
-retry:
- page = __read_cache_page(mapping, index, filler, data, gfp);
- if (IS_ERR(page))
- return page;
+ /*
+ * Page is not up to date and may be locked due to one of the following:
+ * case a: Page is being filled and the page lock is held
+ * case b: Read/write error clearing the page uptodate status
+ * case c: Truncation in progress (page locked)
+ * case d: Reclaim in progress
+ *
+ * Case a, the page will be up to date when the page is unlocked.
+ * There is no need to serialise on the page lock here as the page
+ * is pinned so the lock gives no additional protection. Even if the
+ * page is truncated, the data is still valid if PageUptodate as
+ * it's a read vs truncate race.
+ * Case b, the page will not be up to date
+ * Case c, the page may be truncated but in itself, the data may still
+ * be valid after IO completes as it's a read vs truncate race. The
+ * operation must restart if the page is not uptodate on unlock but
+ * otherwise serialising on page lock to stabilise the mapping gives
+ * no additional guarantees to the caller as the page lock is
+ * released before return.
+ * Case d, similar to truncation. If reclaim holds the page lock, it
+ * will be a race with remove_mapping that determines if the mapping
+ * is valid on unlock but otherwise the data is valid and there is
+ * no need to serialise with page lock.
+ *
+ * As the page lock gives no additional guarantee, we optimistically
+ * wait on the page to be unlocked and check if it's up to date and
+ * use the page if it is. Otherwise, the page lock is required to
+ * distinguish between the different cases. The motivation is that we
+ * avoid spurious serialisations and wakeups when multiple processes
+ * wait on the same page for IO to complete.
+ */
+ wait_on_page_locked(page);
if (PageUptodate(page))
goto out;
+ /* Distinguish between all the cases under the safety of the lock */
lock_page(page);
+
+ /* Case c or d, restart the operation */
if (!page->mapping) {
unlock_page(page);
- page_cache_release(page);
- goto retry;
+ put_page(page);
+ goto repeat;
}
+
+ /* Someone else locked and filled the page in a very small window */
if (PageUptodate(page)) {
unlock_page(page);
goto out;
}
- err = filler(data, page);
- if (err < 0) {
- page_cache_release(page);
- return ERR_PTR(err);
- } else {
- page = wait_on_page_read(page);
- if (IS_ERR(page))
- return page;
- }
+ goto filler;
+
out:
mark_page_accessed(page);
return page;
@@ -2508,7 +2511,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
struct iov_iter data;
write_len = iov_iter_count(from);
- end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
+ end = (pos + write_len - 1) >> PAGE_SHIFT;
written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
if (written)
@@ -2522,7 +2525,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
*/
if (mapping->nrpages) {
written = invalidate_inode_pages2_range(mapping,
- pos >> PAGE_CACHE_SHIFT, end);
+ pos >> PAGE_SHIFT, end);
/*
* If a page can not be invalidated, return 0 to fall back
* to buffered write.
@@ -2547,7 +2550,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
*/
if (mapping->nrpages) {
invalidate_inode_pages2_range(mapping,
- pos >> PAGE_CACHE_SHIFT, end);
+ pos >> PAGE_SHIFT, end);
}
if (written > 0) {
@@ -2608,8 +2611,8 @@ ssize_t generic_perform_write(struct file *file,
size_t copied; /* Bytes copied from user */
void *fsdata;
- offset = (pos & (PAGE_CACHE_SIZE - 1));
- bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+ offset = (pos & (PAGE_SIZE - 1));
+ bytes = min_t(unsigned long, PAGE_SIZE - offset,
iov_iter_count(i));
again:
@@ -2662,7 +2665,7 @@ again:
* because not all segments in the iov can be copied at
* once without a pagefault.
*/
- bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+ bytes = min_t(unsigned long, PAGE_SIZE - offset,
iov_iter_single_seg_count(i));
goto again;
}
@@ -2749,8 +2752,8 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
iocb->ki_pos = endbyte + 1;
written += status;
invalidate_mapping_pages(mapping,
- pos >> PAGE_CACHE_SHIFT,
- endbyte >> PAGE_CACHE_SHIFT);
+ pos >> PAGE_SHIFT,
+ endbyte >> PAGE_SHIFT);
} else {
/*
* We don't know how much we wrote, so just return
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
index 7cf2b7163..381bb07ed 100644
--- a/mm/frame_vector.c
+++ b/mm/frame_vector.c
@@ -58,7 +58,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
vec->got_ref = true;
vec->is_pfns = false;
- ret = get_user_pages_locked(current, mm, start, nr_frames,
+ ret = get_user_pages_locked(start, nr_frames,
write, force, (struct page **)(vec->ptrs), &locked);
goto out;
}
diff --git a/mm/gup.c b/mm/gup.c
index 7bf19ffa2..c057784c8 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -14,6 +14,7 @@
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
+#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -363,6 +364,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
return -ENOENT;
if (*flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
+ if (*flags & FOLL_REMOTE)
+ fault_flags |= FAULT_FLAG_REMOTE;
if (nonblocking)
fault_flags |= FAULT_FLAG_ALLOW_RETRY;
if (*flags & FOLL_NOWAIT)
@@ -413,11 +416,13 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
vm_flags_t vm_flags = vma->vm_flags;
+ int write = (gup_flags & FOLL_WRITE);
+ int foreign = (gup_flags & FOLL_REMOTE);
if (vm_flags & (VM_IO | VM_PFNMAP))
return -EFAULT;
- if (gup_flags & FOLL_WRITE) {
+ if (write) {
if (!(vm_flags & VM_WRITE)) {
if (!(gup_flags & FOLL_FORCE))
return -EFAULT;
@@ -443,6 +448,12 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if (!(vm_flags & VM_MAYREAD))
return -EFAULT;
}
+ /*
+ * gups are always data accesses, not instruction
+ * fetches, so execute=false here
+ */
+ if (!arch_vma_access_permitted(vma, write, false, foreign))
+ return -EFAULT;
return 0;
}
@@ -609,6 +620,28 @@ next_page:
}
EXPORT_SYMBOL(__get_user_pages);
+bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags)
+{
+ bool write = !!(fault_flags & FAULT_FLAG_WRITE);
+ bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
+ vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
+
+ if (!(vm_flags & vma->vm_flags))
+ return false;
+
+ /*
+ * The architecture might have a hardware protection
+ * mechanism other than read/write that can deny access.
+ *
+ * gup always represents data access, not instruction
+ * fetches, so execute=false here:
+ */
+ if (!arch_vma_access_permitted(vma, write, false, foreign))
+ return false;
+
+ return true;
+}
+
/*
* fixup_user_fault() - manually resolve a user page fault
* @tsk: the task_struct to use for page fault accounting, or
@@ -644,7 +677,6 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
bool *unlocked)
{
struct vm_area_struct *vma;
- vm_flags_t vm_flags;
int ret, major = 0;
if (unlocked)
@@ -655,8 +687,7 @@ retry:
if (!vma || address < vma->vm_start)
return -EFAULT;
- vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
- if (!(vm_flags & vma->vm_flags))
+ if (!vma_permits_fault(vma, fault_flags))
return -EFAULT;
ret = handle_mm_fault(mm, vma, address, fault_flags);
@@ -807,13 +838,13 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
* if (locked)
* up_read(&mm->mmap_sem);
*/
-long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
+long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
int *locked)
{
- return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
- pages, NULL, locked, true, FOLL_TOUCH);
+ return __get_user_pages_locked(current, current->mm, start, nr_pages,
+ write, force, pages, NULL, locked, true,
+ FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages_locked);
@@ -860,17 +891,16 @@ EXPORT_SYMBOL(__get_user_pages_unlocked);
* or if "force" shall be set to 1 (get_user_pages_fast misses the
* "force" parameter).
*/
-long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
+long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages)
{
- return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
- force, pages, FOLL_TOUCH);
+ return __get_user_pages_unlocked(current, current->mm, start, nr_pages,
+ write, force, pages, FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages_unlocked);
/*
- * get_user_pages() - pin user pages in memory
+ * get_user_pages_remote() - pin user pages in memory
* @tsk: the task_struct to use for page fault accounting, or
* NULL if faults are not to be recorded.
* @mm: mm_struct of target mm
@@ -924,12 +954,30 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
* should use get_user_pages because it cannot pass
* FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
*/
-long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages, int write,
- int force, struct page **pages, struct vm_area_struct **vmas)
+long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ struct vm_area_struct **vmas)
{
return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
- pages, vmas, NULL, false, FOLL_TOUCH);
+ pages, vmas, NULL, false,
+ FOLL_TOUCH | FOLL_REMOTE);
+}
+EXPORT_SYMBOL(get_user_pages_remote);
+
+/*
+ * This is the same as get_user_pages_remote(), just with a
+ * less-flexible calling convention where we assume that the task
+ * and mm being operated on are the current task's. We also
+ * obviously don't pass FOLL_REMOTE in here.
+ */
+long get_user_pages(unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ struct vm_area_struct **vmas)
+{
+ return __get_user_pages_locked(current, current->mm, start, nr_pages,
+ write, force, pages, vmas, NULL, false,
+ FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages);
@@ -1058,7 +1106,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
* @addr: user address
*
* Returns struct page pointer of user page pinned for dump,
- * to be freed afterwards by page_cache_release() or put_page().
+ * to be freed afterwards by put_page().
*
* Returns NULL on any kind of failure - a hole must then be inserted into
* the corefile, to preserve alignment with its headers; and also returns
@@ -1144,6 +1192,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
pte_protnone(pte) || (write && !pte_write(pte)))
goto pte_unmap;
+ if (!arch_pte_access_permitted(pte, write))
+ goto pte_unmap;
+
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
head = compound_head(page);
@@ -1439,7 +1490,6 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
- struct mm_struct *mm = current->mm;
int nr, ret;
start &= PAGE_MASK;
@@ -1451,8 +1501,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
start += nr << PAGE_SHIFT;
pages += nr;
- ret = get_user_pages_unlocked(current, mm, start,
- nr_pages - nr, write, 0, pages);
+ ret = get_user_pages_unlocked(start, nr_pages - nr, write, 0, pages);
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a7db0a2db..b49ee126d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -78,12 +78,12 @@ unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
- (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
/* default scan 8*512 pte (or vmas) every 30 second */
-static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
+static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
@@ -98,7 +98,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
* it would have happened if the vma was large enough during page
* fault.
*/
-static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+static unsigned int khugepaged_max_ptes_none __read_mostly;
static int khugepaged(void *none);
static int khugepaged_slab_init(void);
@@ -168,8 +168,7 @@ static void set_recommended_min_free_kbytes(void)
if (recommended_min > min_free_kbytes) {
if (user_min_free_kbytes >= 0)
- pr_info("raising min_free_kbytes from %d to %lu "
- "to help transparent hugepage allocations\n",
+ pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
min_free_kbytes, recommended_min);
min_free_kbytes = recommended_min;
@@ -233,7 +232,7 @@ retry:
return READ_ONCE(huge_zero_page);
}
-static void put_huge_zero_page(void)
+void put_huge_zero_page(void)
{
/*
* Counter should never go to zero here. Only shrinker can put
@@ -270,37 +269,35 @@ static struct shrinker huge_zero_page_shrinker = {
#ifdef CONFIG_SYSFS
-static ssize_t double_flag_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf,
- enum transparent_hugepage_flag enabled,
- enum transparent_hugepage_flag req_madv)
-{
- if (test_bit(enabled, &transparent_hugepage_flags)) {
- VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
- return sprintf(buf, "[always] madvise never\n");
- } else if (test_bit(req_madv, &transparent_hugepage_flags))
- return sprintf(buf, "always [madvise] never\n");
- else
- return sprintf(buf, "always madvise [never]\n");
-}
-static ssize_t double_flag_store(struct kobject *kobj,
+static ssize_t triple_flag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count,
enum transparent_hugepage_flag enabled,
+ enum transparent_hugepage_flag deferred,
enum transparent_hugepage_flag req_madv)
{
- if (!memcmp("always", buf,
+ if (!memcmp("defer", buf,
+ min(sizeof("defer")-1, count))) {
+ if (enabled == deferred)
+ return -EINVAL;
+ clear_bit(enabled, &transparent_hugepage_flags);
+ clear_bit(req_madv, &transparent_hugepage_flags);
+ set_bit(deferred, &transparent_hugepage_flags);
+ } else if (!memcmp("always", buf,
min(sizeof("always")-1, count))) {
- set_bit(enabled, &transparent_hugepage_flags);
+ clear_bit(deferred, &transparent_hugepage_flags);
clear_bit(req_madv, &transparent_hugepage_flags);
+ set_bit(enabled, &transparent_hugepage_flags);
} else if (!memcmp("madvise", buf,
min(sizeof("madvise")-1, count))) {
clear_bit(enabled, &transparent_hugepage_flags);
+ clear_bit(deferred, &transparent_hugepage_flags);
set_bit(req_madv, &transparent_hugepage_flags);
} else if (!memcmp("never", buf,
min(sizeof("never")-1, count))) {
clear_bit(enabled, &transparent_hugepage_flags);
clear_bit(req_madv, &transparent_hugepage_flags);
+ clear_bit(deferred, &transparent_hugepage_flags);
} else
return -EINVAL;
@@ -310,17 +307,22 @@ static ssize_t double_flag_store(struct kobject *kobj,
static ssize_t enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return double_flag_show(kobj, attr, buf,
- TRANSPARENT_HUGEPAGE_FLAG,
- TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+ if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "[always] madvise never\n");
+ else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "always [madvise] never\n");
+ else
+ return sprintf(buf, "always madvise [never]\n");
}
+
static ssize_t enabled_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
ssize_t ret;
- ret = double_flag_store(kobj, attr, buf, count,
+ ret = triple_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_FLAG,
TRANSPARENT_HUGEPAGE_FLAG,
TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
@@ -378,16 +380,23 @@ static ssize_t single_flag_store(struct kobject *kobj,
static ssize_t defrag_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return double_flag_show(kobj, attr, buf,
- TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
- TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "[always] defer madvise never\n");
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "always [defer] madvise never\n");
+ else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "always defer [madvise] never\n");
+ else
+ return sprintf(buf, "always defer madvise [never]\n");
+
}
static ssize_t defrag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- return double_flag_store(kobj, attr, buf, count,
- TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
+ return triple_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
+ TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
}
static struct kobj_attribute defrag_attr =
@@ -660,6 +669,18 @@ static int __init hugepage_init(void)
return -EINVAL;
}
+ khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
+ khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
+ /*
+ * hugepages can't be allocated by the buddy allocator
+ */
+ MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
+ /*
+ * we use page->mapping and page->index in second tail page
+ * as list_head: assuming THP order >= 2
+ */
+ MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
+
err = hugepage_init_sysfs(&hugepage_kobj);
if (err)
goto err_sysfs;
@@ -764,7 +785,6 @@ void prep_transhuge_page(struct page *page)
* we use page->mapping and page->indexlru in second tail page
* as list_head: assuming THP order >= 2
*/
- BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
INIT_LIST_HEAD(page_deferred_list(page));
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
@@ -843,9 +863,30 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
return 0;
}
-static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
+/*
+ * If THP defrag is set to always then directly reclaim/compact as necessary
+ * If set to defer then do no direct reclaim (only __GFP_KSWAPD_RECLAIM) and defer to khugepaged
+ * If set to madvise and the VMA is flagged VM_HUGEPAGE then directly reclaim/compact
+ */
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+{
+ gfp_t reclaim_flags = 0;
+
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags) &&
+ (vma->vm_flags & VM_HUGEPAGE))
+ reclaim_flags = __GFP_DIRECT_RECLAIM;
+ else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
+ reclaim_flags = __GFP_KSWAPD_RECLAIM;
+ else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
+ reclaim_flags = __GFP_DIRECT_RECLAIM;
+
+ return GFP_TRANSHUGE | reclaim_flags;
+}
+
+/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
+static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
{
- return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_RECLAIM)) | extra_gfp;
+ return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0);
}
/* Caller must hold page table lock. */
@@ -919,7 +960,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
return ret;
}
- gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+ gfp = alloc_hugepage_direct_gfpmask(vma);
page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
if (unlikely(!page)) {
count_vm_event(THP_FAULT_FALLBACK);
@@ -1257,15 +1298,9 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
/*
* We can only reuse the page if nobody else maps the huge page or it's
- * part. We can do it by checking page_mapcount() on each sub-page, but
- * it's expensive.
- * The cheaper way is to check page_count() to be equal 1: every
- * mapcount takes page reference reference, so this way we can
- * guarantee, that the PMD is the only mapping.
- * This can give false negative if somebody pinned the page, but that's
- * fine.
+ * part.
*/
- if (page_mapcount(page) == 1 && page_count(page) == 1) {
+ if (page_trans_huge_mapcount(page, NULL) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1279,7 +1314,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
- huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+ huge_gfp = alloc_hugepage_direct_gfpmask(vma);
new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
} else
new_page = NULL;
@@ -1643,12 +1678,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (vma_is_dax(vma)) {
spin_unlock(ptl);
if (is_huge_zero_pmd(orig_pmd))
- put_huge_zero_page();
+ tlb_remove_page(tlb, pmd_page(orig_pmd));
} else if (is_huge_zero_pmd(orig_pmd)) {
pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
atomic_long_dec(&tlb->mm->nr_ptes);
spin_unlock(ptl);
- put_huge_zero_page();
+ tlb_remove_page(tlb, pmd_page(orig_pmd));
} else {
struct page *page = pmd_page(orig_pmd);
page_remove_rmap(page, true);
@@ -2038,7 +2073,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
if (pte_write(pteval)) {
writable = true;
} else {
- if (PageSwapCache(page) && !reuse_swap_page(page)) {
+ if (PageSwapCache(page) &&
+ !reuse_swap_page(page, NULL)) {
unlock_page(page);
result = SCAN_SWAP_CACHE_PAGE;
goto out;
@@ -2248,11 +2284,12 @@ static int khugepaged_find_target_node(void)
return 0;
}
-static inline struct page *alloc_hugepage(int defrag)
+static inline struct page *alloc_khugepaged_hugepage(void)
{
struct page *page;
- page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER);
+ page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
+ HPAGE_PMD_ORDER);
if (page)
prep_transhuge_page(page);
return page;
@@ -2263,7 +2300,7 @@ static struct page *khugepaged_alloc_hugepage(bool *wait)
struct page *hpage;
do {
- hpage = alloc_hugepage(khugepaged_defrag());
+ hpage = alloc_khugepaged_hugepage();
if (!hpage) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
if (!*wait)
@@ -2333,8 +2370,7 @@ static void collapse_huge_page(struct mm_struct *mm,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
/* Only allocate from the target node */
- gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
- __GFP_THISNODE;
+ gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE;
/* release the mmap_sem read lock. */
new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node);
@@ -2535,7 +2571,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
}
khugepaged_node_load[node]++;
if (!PageLRU(page)) {
- result = SCAN_SCAN_ABORT;
+ result = SCAN_PAGE_LRU;
goto out_unmap;
}
if (PageLocked(page)) {
@@ -2855,7 +2891,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
page = pmd_page(*pmd);
VM_BUG_ON_PAGE(!page_count(page), page);
- atomic_add(HPAGE_PMD_NR - 1, &page->_count);
+ page_ref_add(page, HPAGE_PMD_NR - 1);
write = pmd_write(*pmd);
young = pmd_young(*pmd);
dirty = pmd_dirty(*pmd);
@@ -2945,44 +2981,33 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long address)
+ unsigned long address, bool freeze)
{
spinlock_t *ptl;
struct mm_struct *mm = vma->vm_mm;
- struct page *page = NULL;
unsigned long haddr = address & HPAGE_PMD_MASK;
mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
ptl = pmd_lock(mm, pmd);
if (pmd_trans_huge(*pmd)) {
- page = pmd_page(*pmd);
+ struct page *page = pmd_page(*pmd);
if (PageMlocked(page))
- get_page(page);
- else
- page = NULL;
+ clear_page_mlock(page);
} else if (!pmd_devmap(*pmd))
goto out;
- __split_huge_pmd_locked(vma, pmd, haddr, false);
+ __split_huge_pmd_locked(vma, pmd, haddr, freeze);
out:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
- if (page) {
- lock_page(page);
- munlock_vma_page(page);
- unlock_page(page);
- put_page(page);
- }
}
-static void split_huge_pmd_address(struct vm_area_struct *vma,
- unsigned long address)
+void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
+ bool freeze, struct page *page)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
-
pgd = pgd_offset(vma->vm_mm, address);
if (!pgd_present(*pgd))
return;
@@ -2994,11 +3019,20 @@ static void split_huge_pmd_address(struct vm_area_struct *vma,
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)))
return;
+
+ /*
+ * If the caller asks to set up migration entries, we need a page to check
+ * the pmd against. Otherwise we can end up replacing the wrong page.
+ */
+ VM_BUG_ON(freeze && !page);
+ if (page && page != pmd_page(*pmd))
+ return;
+
/*
* Caller holds the mmap_sem write mode, so a huge pmd cannot
* materialize from under us.
*/
- split_huge_pmd(vma, pmd, address);
+ __split_huge_pmd(vma, pmd, address, freeze);
}
void vma_adjust_trans_huge(struct vm_area_struct *vma,
@@ -3014,7 +3048,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (start & ~HPAGE_PMD_MASK &&
(start & HPAGE_PMD_MASK) >= vma->vm_start &&
(start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_pmd_address(vma, start);
+ split_huge_pmd_address(vma, start, false, NULL);
/*
* If the new end address isn't hpage aligned and it could
@@ -3024,7 +3058,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (end & ~HPAGE_PMD_MASK &&
(end & HPAGE_PMD_MASK) >= vma->vm_start &&
(end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_pmd_address(vma, end);
+ split_huge_pmd_address(vma, end, false, NULL);
/*
* If we're also updating the vma->vm_next->vm_start, if the new
@@ -3038,208 +3072,58 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (nstart & ~HPAGE_PMD_MASK &&
(nstart & HPAGE_PMD_MASK) >= next->vm_start &&
(nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
- split_huge_pmd_address(next, nstart);
+ split_huge_pmd_address(next, nstart, false, NULL);
}
}
-static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
- unsigned long address)
+static void freeze_page(struct page *page)
{
- unsigned long haddr = address & HPAGE_PMD_MASK;
- spinlock_t *ptl;
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- int i, nr = HPAGE_PMD_NR;
-
- /* Skip pages which doesn't belong to the VMA */
- if (address < vma->vm_start) {
- int off = (vma->vm_start - address) >> PAGE_SHIFT;
- page += off;
- nr -= off;
- address = vma->vm_start;
- }
-
- pgd = pgd_offset(vma->vm_mm, address);
- if (!pgd_present(*pgd))
- return;
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- return;
- pmd = pmd_offset(pud, address);
- ptl = pmd_lock(vma->vm_mm, pmd);
- if (!pmd_present(*pmd)) {
- spin_unlock(ptl);
- return;
- }
- if (pmd_trans_huge(*pmd)) {
- if (page == pmd_page(*pmd))
- __split_huge_pmd_locked(vma, pmd, haddr, true);
- spin_unlock(ptl);
- return;
- }
- spin_unlock(ptl);
-
- pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
- for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
- pte_t entry, swp_pte;
- swp_entry_t swp_entry;
-
- /*
- * We've just crossed page table boundary: need to map next one.
- * It can happen if THP was mremaped to non PMD-aligned address.
- */
- if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
- pte_unmap_unlock(pte - 1, ptl);
- pmd = mm_find_pmd(vma->vm_mm, address);
- if (!pmd)
- return;
- pte = pte_offset_map_lock(vma->vm_mm, pmd,
- address, &ptl);
- }
-
- if (!pte_present(*pte))
- continue;
- if (page_to_pfn(page) != pte_pfn(*pte))
- continue;
- flush_cache_page(vma, address, page_to_pfn(page));
- entry = ptep_clear_flush(vma, address, pte);
- if (pte_dirty(entry))
- SetPageDirty(page);
- swp_entry = make_migration_entry(page, pte_write(entry));
- swp_pte = swp_entry_to_pte(swp_entry);
- if (pte_soft_dirty(entry))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- set_pte_at(vma->vm_mm, address, pte, swp_pte);
- page_remove_rmap(page, false);
- put_page(page);
- }
- pte_unmap_unlock(pte - 1, ptl);
-}
-
-static void freeze_page(struct anon_vma *anon_vma, struct page *page)
-{
- struct anon_vma_chain *avc;
- pgoff_t pgoff = page_to_pgoff(page);
+ enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK |
+ TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED;
+ int i, ret;
VM_BUG_ON_PAGE(!PageHead(page), page);
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
- pgoff + HPAGE_PMD_NR - 1) {
- unsigned long address = __vma_address(page, avc->vma);
-
- mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
- address, address + HPAGE_PMD_SIZE);
- freeze_page_vma(avc->vma, page, address);
- mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
- address, address + HPAGE_PMD_SIZE);
- }
-}
-
-static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
- unsigned long address)
-{
- spinlock_t *ptl;
- pmd_t *pmd;
- pte_t *pte, entry;
- swp_entry_t swp_entry;
- unsigned long haddr = address & HPAGE_PMD_MASK;
- int i, nr = HPAGE_PMD_NR;
-
- /* Skip pages which doesn't belong to the VMA */
- if (address < vma->vm_start) {
- int off = (vma->vm_start - address) >> PAGE_SHIFT;
- page += off;
- nr -= off;
- address = vma->vm_start;
- }
-
- pmd = mm_find_pmd(vma->vm_mm, address);
- if (!pmd)
- return;
-
- pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
- for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
- /*
- * We've just crossed page table boundary: need to map next one.
- * It can happen if THP was mremaped to non-PMD aligned address.
- */
- if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
- pte_unmap_unlock(pte - 1, ptl);
- pmd = mm_find_pmd(vma->vm_mm, address);
- if (!pmd)
- return;
- pte = pte_offset_map_lock(vma->vm_mm, pmd,
- address, &ptl);
- }
-
- if (!is_swap_pte(*pte))
- continue;
-
- swp_entry = pte_to_swp_entry(*pte);
- if (!is_migration_entry(swp_entry))
- continue;
- if (migration_entry_to_page(swp_entry) != page)
- continue;
-
- get_page(page);
- page_add_anon_rmap(page, vma, address, false);
-
- entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
- if (PageDirty(page))
- entry = pte_mkdirty(entry);
- if (is_write_migration_entry(swp_entry))
- entry = maybe_mkwrite(entry, vma);
-
- flush_dcache_page(page);
- set_pte_at(vma->vm_mm, address, pte, entry);
+ /* We only need TTU_SPLIT_HUGE_PMD once */
+ ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD);
+ for (i = 1; !ret && i < HPAGE_PMD_NR; i++) {
+ /* Cut short if the page is unmapped */
+ if (page_count(page) == 1)
+ return;
- /* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, address, pte);
+ ret = try_to_unmap(page + i, ttu_flags);
}
- pte_unmap_unlock(pte - 1, ptl);
+ VM_BUG_ON(ret);
}
-static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
+static void unfreeze_page(struct page *page)
{
- struct anon_vma_chain *avc;
- pgoff_t pgoff = page_to_pgoff(page);
-
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
- pgoff, pgoff + HPAGE_PMD_NR - 1) {
- unsigned long address = __vma_address(page, avc->vma);
+ int i;
- mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
- address, address + HPAGE_PMD_SIZE);
- unfreeze_page_vma(avc->vma, page, address);
- mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
- address, address + HPAGE_PMD_SIZE);
- }
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ remove_migration_ptes(page + i, page + i, true);
}
-static int __split_huge_page_tail(struct page *head, int tail,
+static void __split_huge_page_tail(struct page *head, int tail,
struct lruvec *lruvec, struct list_head *list)
{
- int mapcount;
struct page *page_tail = head + tail;
- mapcount = atomic_read(&page_tail->_mapcount) + 1;
- VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
+ VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
+ VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail);
/*
* tail_page->_count is zero and not changing from under us. But
* get_page_unless_zero() may be running from under us on the
- * tail_page. If we used atomic_set() below instead of atomic_add(), we
+ * tail_page. If we used atomic_set() below instead of atomic_inc(), we
* would then run atomic_set() concurrently with
* get_page_unless_zero(), and atomic_set() is implemented in C not
* using locked ops. spin_unlock on x86 sometime uses locked ops
* because of PPro errata 66, 92, so unless somebody can guarantee
* atomic_set() here would be safe on all archs (and not only on x86),
- * it's safer to use atomic_add().
+ * it's safer to use atomic_inc().
*/
- atomic_add(mapcount + 1, &page_tail->_count);
-
+ page_ref_inc(page_tail);
page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
page_tail->flags |= (head->flags &
@@ -3273,8 +3157,6 @@ static int __split_huge_page_tail(struct page *head, int tail,
page_tail->index = head->index + tail;
page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
lru_add_page_tail(head, page_tail, lruvec, list);
-
- return mapcount;
}
static void __split_huge_page(struct page *page, struct list_head *list)
@@ -3282,7 +3164,7 @@ static void __split_huge_page(struct page *page, struct list_head *list)
struct page *head = compound_head(page);
struct zone *zone = page_zone(head);
struct lruvec *lruvec;
- int i, tail_mapcount;
+ int i;
/* prevent PageLRU to go away from under us, and freeze lru stats */
spin_lock_irq(&zone->lru_lock);
@@ -3291,15 +3173,13 @@ static void __split_huge_page(struct page *page, struct list_head *list)
/* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(head);
- tail_mapcount = 0;
for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
- tail_mapcount += __split_huge_page_tail(head, i, lruvec, list);
- atomic_sub(tail_mapcount, &head->_count);
+ __split_huge_page_tail(head, i, lruvec, list);
ClearPageCompound(head);
spin_unlock_irq(&zone->lru_lock);
- unfreeze_page(page_anon_vma(head), head);
+ unfreeze_page(head);
for (i = 0; i < HPAGE_PMD_NR; i++) {
struct page *subpage = head + i;
@@ -3338,6 +3218,64 @@ int total_mapcount(struct page *page)
}
/*
+ * This calculates accurately how many mappings a transparent hugepage
+ * has (unlike page_mapcount() which isn't fully accurate). This full
+ * accuracy is primarily needed to know if copy-on-write faults can
+ * reuse the page and change the mapping to read-write instead of
+ * copying it. At the same time this returns the total_mapcount too.
+ *
+ * The function returns the highest mapcount any one of the subpages
+ * has. If the return value is one, even if different processes are
+ * mapping different subpages of the transparent hugepage, they can
+ * all reuse it, because each process is reusing a different subpage.
+ *
+ * The total_mapcount is instead counting all virtual mappings of the
+ * subpages. If the total_mapcount is equal to "one", it tells the
+ * caller all mappings belong to the same "mm" and in turn the
+ * anon_vma of the transparent hugepage can become the vma->anon_vma
+ * local one as no other process may be mapping any of the subpages.
+ *
+ * It would be more accurate to replace page_mapcount() with
+ * page_trans_huge_mapcount(), however we only use
+ * page_trans_huge_mapcount() in the copy-on-write faults where we
+ * need full accuracy to avoid breaking page pinning, because
+ * page_trans_huge_mapcount() is slower than page_mapcount().
+ */
+int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
+{
+ int i, ret, _total_mapcount, mapcount;
+
+ /* hugetlbfs shouldn't call it */
+ VM_BUG_ON_PAGE(PageHuge(page), page);
+
+ if (likely(!PageTransCompound(page))) {
+ mapcount = atomic_read(&page->_mapcount) + 1;
+ if (total_mapcount)
+ *total_mapcount = mapcount;
+ return mapcount;
+ }
+
+ page = compound_head(page);
+
+ _total_mapcount = ret = 0;
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ mapcount = atomic_read(&page[i]._mapcount) + 1;
+ ret = max(ret, mapcount);
+ _total_mapcount += mapcount;
+ }
+ if (PageDoubleMap(page)) {
+ ret -= 1;
+ _total_mapcount -= HPAGE_PMD_NR;
+ }
+ mapcount = compound_mapcount(page);
+ ret += mapcount;
+ _total_mapcount += mapcount;
+ if (total_mapcount)
+ *total_mapcount = _total_mapcount;
+ return ret;
+}
+
+/*
* This function splits huge page into normal pages. @page can point to any
* subpage of huge page to split. Split doesn't change the position of @page.
*
@@ -3395,7 +3333,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
mlocked = PageMlocked(page);
- freeze_page(anon_vma, head);
+ freeze_page(head);
VM_BUG_ON_PAGE(compound_mapcount(head), head);
/* Make sure the page is not on per-CPU pagevec as it takes pin */
@@ -3424,7 +3362,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
BUG();
} else {
spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
- unfreeze_page(anon_vma, head);
+ unfreeze_page(head);
ret = -EBUSY;
}
@@ -3459,6 +3397,7 @@ void deferred_split_huge_page(struct page *page)
spin_lock_irqsave(&pgdata->split_queue_lock, flags);
if (list_empty(page_deferred_list(page))) {
+ count_vm_event(THP_DEFERRED_SPLIT_PAGE);
list_add_tail(page_deferred_list(page), &pgdata->split_queue);
pgdata->split_queue_len++;
}
@@ -3566,7 +3505,7 @@ next:
}
}
- pr_info("%lu of %lu THP split", split, total);
+ pr_info("%lu of %lu THP split\n", split, total);
return 0;
}
@@ -3577,7 +3516,7 @@ static int __init split_huge_pages_debugfs(void)
{
void *ret;
- ret = debugfs_create_file("split_huge_pages", 0644, NULL, NULL,
+ ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
&split_huge_pages_fops);
if (!ret)
pr_warn("Failed to create split_huge_pages in debugfs");
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index aefba5a9c..19d0d08b3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2665,7 +2665,7 @@ void __init hugetlb_add_hstate(unsigned int order)
unsigned long i;
if (size_to_hstate(PAGE_SIZE << order)) {
- pr_warning("hugepagesz= specified twice, ignoring\n");
+ pr_warn("hugepagesz= specified twice, ignoring\n");
return;
}
BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
@@ -2701,8 +2701,7 @@ static int __init hugetlb_nrpages_setup(char *s)
mhp = &parsed_hstate->max_huge_pages;
if (mhp == last_mhp) {
- pr_warning("hugepages= specified twice without "
- "interleaving hugepagesz=, ignoring\n");
+ pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n");
return 1;
}
@@ -3347,7 +3346,7 @@ retry_avoidcopy:
old_page != pagecache_page)
outside_reserve = 1;
- page_cache_get(old_page);
+ get_page(old_page);
/*
* Drop page table lock as buddy allocator may be called. It will
@@ -3365,7 +3364,7 @@ retry_avoidcopy:
* may get SIGKILLed if it later faults.
*/
if (outside_reserve) {
- page_cache_release(old_page);
+ put_page(old_page);
BUG_ON(huge_pte_none(pte));
unmap_ref_private(mm, vma, old_page, address);
BUG_ON(huge_pte_none(pte));
@@ -3426,9 +3425,9 @@ retry_avoidcopy:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out_release_all:
- page_cache_release(new_page);
+ put_page(new_page);
out_release_old:
- page_cache_release(old_page);
+ put_page(old_page);
spin_lock(ptl); /* Caller expects lock to be held */
return ret;
diff --git a/mm/internal.h b/mm/internal.h
index a38a21ebd..b79abb672 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -14,6 +14,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
+#include <linux/tracepoint-defs.h>
/*
* The set of flags that only affect watermark checking and reclaim
@@ -37,10 +38,10 @@
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
-static inline void set_page_count(struct page *page, int v)
-{
- atomic_set(&page->_count, v);
-}
+void unmap_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ struct zap_details *details);
extern int __do_page_cache_readahead(struct address_space *mapping,
struct file *filp, pgoff_t offset, unsigned long nr_to_read,
@@ -63,7 +64,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra,
static inline void set_page_refcounted(struct page *page)
{
VM_BUG_ON_PAGE(PageTail(page), page);
- VM_BUG_ON_PAGE(atomic_read(&page->_count), page);
+ VM_BUG_ON_PAGE(page_ref_count(page), page);
set_page_count(page, 1);
}
@@ -131,13 +132,22 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)
return page_idx ^ (1 << order);
}
+extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
+ unsigned long end_pfn, struct zone *zone);
+
+static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
+ unsigned long end_pfn, struct zone *zone)
+{
+ if (zone->contiguous)
+ return pfn_to_page(start_pfn);
+
+ return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
+}
+
extern int __isolate_free_page(struct page *page, unsigned int order);
extern void __free_pages_bootmem(struct page *page, unsigned long pfn,
unsigned int order);
extern void prep_compound_page(struct page *page, unsigned int order);
-#ifdef CONFIG_MEMORY_FAILURE
-extern bool is_free_buddy_page(struct page *page);
-#endif
extern int user_min_free_kbytes;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -162,6 +172,7 @@ struct compact_control {
unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
+ bool direct_compaction; /* False from kcompactd or /proc/... */
int order; /* order a direct compactor needs */
const gfp_t gfp_mask; /* gfp mask of a direct compactor */
const int alloc_flags; /* alloc flags of a direct compactor */
@@ -380,7 +391,7 @@ extern int mminit_loglevel;
do { \
if (level < mminit_loglevel) { \
if (level <= MMINIT_WARNING) \
- printk(KERN_WARNING "mminit::" prefix " " fmt, ##arg); \
+ pr_warn("mminit::" prefix " " fmt, ##arg); \
else \
printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
} \
@@ -466,4 +477,9 @@ static inline void try_to_unmap_flush_dirty(void)
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
+extern const struct trace_print_flags pageflag_names[];
+extern const struct trace_print_flags vmaflag_names[];
+extern const struct trace_print_flags gfpflag_names[];
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index a61460d9f..131daadf4 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -1,5 +1,6 @@
KASAN_SANITIZE := n
UBSAN_SANITIZE_kasan.o := n
+KCOV_INSTRUMENT := n
CFLAGS_REMOVE_kasan.o = -pg
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 1ad20ade8..38f1dd79a 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -17,7 +17,9 @@
#define DISABLE_BRANCH_PROFILING
#include <linux/export.h>
+#include <linux/interrupt.h>
#include <linux/init.h>
+#include <linux/kasan.h>
#include <linux/kernel.h>
#include <linux/kmemleak.h>
#include <linux/linkage.h>
@@ -32,7 +34,6 @@
#include <linux/string.h>
#include <linux/types.h>
#include <linux/vmalloc.h>
-#include <linux/kasan.h>
#include "kasan.h"
#include "../slab.h"
@@ -334,6 +335,59 @@ void kasan_free_pages(struct page *page, unsigned int order)
KASAN_FREE_PAGE);
}
+#ifdef CONFIG_SLAB
+/*
+ * Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
+ * For larger allocations larger redzones are used.
+ */
+static size_t optimal_redzone(size_t object_size)
+{
+ int rz =
+ object_size <= 64 - 16 ? 16 :
+ object_size <= 128 - 32 ? 32 :
+ object_size <= 512 - 64 ? 64 :
+ object_size <= 4096 - 128 ? 128 :
+ object_size <= (1 << 14) - 256 ? 256 :
+ object_size <= (1 << 15) - 512 ? 512 :
+ object_size <= (1 << 16) - 1024 ? 1024 : 2048;
+ return rz;
+}
+
+void kasan_cache_create(struct kmem_cache *cache, size_t *size,
+ unsigned long *flags)
+{
+ int redzone_adjust;
+ /* Make sure the adjusted size is still less than
+ * KMALLOC_MAX_CACHE_SIZE.
+ * TODO: this check is only useful for SLAB, but not SLUB. We'll need
+ * to skip it for SLUB when it starts using kasan_cache_create().
+ */
+ if (*size > KMALLOC_MAX_CACHE_SIZE -
+ sizeof(struct kasan_alloc_meta) -
+ sizeof(struct kasan_free_meta))
+ return;
+ *flags |= SLAB_KASAN;
+ /* Add alloc meta. */
+ cache->kasan_info.alloc_meta_offset = *size;
+ *size += sizeof(struct kasan_alloc_meta);
+
+ /* Add free meta. */
+ if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor ||
+ cache->object_size < sizeof(struct kasan_free_meta)) {
+ cache->kasan_info.free_meta_offset = *size;
+ *size += sizeof(struct kasan_free_meta);
+ }
+ redzone_adjust = optimal_redzone(cache->object_size) -
+ (*size - cache->object_size);
+ if (redzone_adjust > 0)
+ *size += redzone_adjust;
+ *size = min(KMALLOC_MAX_CACHE_SIZE,
+ max(*size,
+ cache->object_size +
+ optimal_redzone(cache->object_size)));
+}
+#endif
+
void kasan_poison_slab(struct page *page)
{
kasan_poison_shadow(page_address(page),
@@ -351,11 +405,81 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
kasan_poison_shadow(object,
round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE),
KASAN_KMALLOC_REDZONE);
+#ifdef CONFIG_SLAB
+ if (cache->flags & SLAB_KASAN) {
+ struct kasan_alloc_meta *alloc_info =
+ get_alloc_info(cache, object);
+ alloc_info->state = KASAN_STATE_INIT;
+ }
+#endif
}
-void kasan_slab_alloc(struct kmem_cache *cache, void *object)
+#ifdef CONFIG_SLAB
+static inline int in_irqentry_text(unsigned long ptr)
{
- kasan_kmalloc(cache, object, cache->object_size);
+ return (ptr >= (unsigned long)&__irqentry_text_start &&
+ ptr < (unsigned long)&__irqentry_text_end) ||
+ (ptr >= (unsigned long)&__softirqentry_text_start &&
+ ptr < (unsigned long)&__softirqentry_text_end);
+}
+
+static inline void filter_irq_stacks(struct stack_trace *trace)
+{
+ int i;
+
+ if (!trace->nr_entries)
+ return;
+ for (i = 0; i < trace->nr_entries; i++)
+ if (in_irqentry_text(trace->entries[i])) {
+ /* Include the irqentry function into the stack. */
+ trace->nr_entries = i + 1;
+ break;
+ }
+}
+
+static inline depot_stack_handle_t save_stack(gfp_t flags)
+{
+ unsigned long entries[KASAN_STACK_DEPTH];
+ struct stack_trace trace = {
+ .nr_entries = 0,
+ .entries = entries,
+ .max_entries = KASAN_STACK_DEPTH,
+ .skip = 0
+ };
+
+ save_stack_trace(&trace);
+ filter_irq_stacks(&trace);
+ if (trace.nr_entries != 0 &&
+ trace.entries[trace.nr_entries-1] == ULONG_MAX)
+ trace.nr_entries--;
+
+ return depot_save_stack(&trace, flags);
+}
+
+static inline void set_track(struct kasan_track *track, gfp_t flags)
+{
+ track->pid = current->pid;
+ track->stack = save_stack(flags);
+}
+
+struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
+ const void *object)
+{
+ BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32);
+ return (void *)object + cache->kasan_info.alloc_meta_offset;
+}
+
+struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
+ const void *object)
+{
+ BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
+ return (void *)object + cache->kasan_info.free_meta_offset;
+}
+#endif
+
+void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags)
+{
+ kasan_kmalloc(cache, object, cache->object_size, flags);
}
void kasan_slab_free(struct kmem_cache *cache, void *object)
@@ -367,10 +491,22 @@ void kasan_slab_free(struct kmem_cache *cache, void *object)
if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
return;
+#ifdef CONFIG_SLAB
+ if (cache->flags & SLAB_KASAN) {
+ struct kasan_free_meta *free_info =
+ get_free_info(cache, object);
+ struct kasan_alloc_meta *alloc_info =
+ get_alloc_info(cache, object);
+ alloc_info->state = KASAN_STATE_FREE;
+ set_track(&free_info->track, GFP_NOWAIT);
+ }
+#endif
+
kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
}
-void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size)
+void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
+ gfp_t flags)
{
unsigned long redzone_start;
unsigned long redzone_end;
@@ -386,10 +522,20 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size)
kasan_unpoison_shadow(object, size);
kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
KASAN_KMALLOC_REDZONE);
+#ifdef CONFIG_SLAB
+ if (cache->flags & SLAB_KASAN) {
+ struct kasan_alloc_meta *alloc_info =
+ get_alloc_info(cache, object);
+
+ alloc_info->state = KASAN_STATE_ALLOC;
+ alloc_info->alloc_size = size;
+ set_track(&alloc_info->track, flags);
+ }
+#endif
}
EXPORT_SYMBOL(kasan_kmalloc);
-void kasan_kmalloc_large(const void *ptr, size_t size)
+void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags)
{
struct page *page;
unsigned long redzone_start;
@@ -408,7 +554,7 @@ void kasan_kmalloc_large(const void *ptr, size_t size)
KASAN_PAGE_REDZONE);
}
-void kasan_krealloc(const void *object, size_t size)
+void kasan_krealloc(const void *object, size_t size, gfp_t flags)
{
struct page *page;
@@ -418,9 +564,9 @@ void kasan_krealloc(const void *object, size_t size)
page = virt_to_head_page(object);
if (unlikely(!PageSlab(page)))
- kasan_kmalloc_large(object, size);
+ kasan_kmalloc_large(object, size, flags);
else
- kasan_kmalloc(page->slab_cache, object, size);
+ kasan_kmalloc(page->slab_cache, object, size, flags);
}
void kasan_kfree(void *ptr)
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 4f6c62e5c..30a2f0ba0 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -2,6 +2,7 @@
#define __MM_KASAN_KASAN_H
#include <linux/kasan.h>
+#include <linux/stackdepot.h>
#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1)
@@ -54,6 +55,42 @@ struct kasan_global {
#endif
};
+/*
+ * Structures to keep alloc and free tracks.
+ */
+
+enum kasan_state {
+ KASAN_STATE_INIT,
+ KASAN_STATE_ALLOC,
+ KASAN_STATE_FREE
+};
+
+#define KASAN_STACK_DEPTH 64
+
+struct kasan_track {
+ u32 pid;
+ depot_stack_handle_t stack;
+};
+
+struct kasan_alloc_meta {
+ struct kasan_track track;
+ u32 state : 2; /* enum kasan_state */
+ u32 alloc_size : 30;
+ u32 reserved;
+};
+
+struct kasan_free_meta {
+ /* Allocator freelist pointer, unused by KASAN. */
+ void **freelist;
+ struct kasan_track track;
+};
+
+struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
+ const void *object);
+struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
+ const void *object);
+
+
static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
{
return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 12f222d02..60869a5a0 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -18,6 +18,7 @@
#include <linux/printk.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/stackdepot.h>
#include <linux/stacktrace.h>
#include <linux/string.h>
#include <linux/types.h>
@@ -115,6 +116,53 @@ static inline bool init_task_stack_addr(const void *addr)
sizeof(init_thread_union.stack));
}
+#ifdef CONFIG_SLAB
+static void print_track(struct kasan_track *track)
+{
+ pr_err("PID = %u\n", track->pid);
+ if (track->stack) {
+ struct stack_trace trace;
+
+ depot_fetch_stack(track->stack, &trace);
+ print_stack_trace(&trace, 0);
+ } else {
+ pr_err("(stack is not available)\n");
+ }
+}
+
+static void object_err(struct kmem_cache *cache, struct page *page,
+ void *object, char *unused_reason)
+{
+ struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
+ struct kasan_free_meta *free_info;
+
+ dump_stack();
+ pr_err("Object at %p, in cache %s\n", object, cache->name);
+ if (!(cache->flags & SLAB_KASAN))
+ return;
+ switch (alloc_info->state) {
+ case KASAN_STATE_INIT:
+ pr_err("Object not allocated yet\n");
+ break;
+ case KASAN_STATE_ALLOC:
+ pr_err("Object allocated with size %u bytes.\n",
+ alloc_info->alloc_size);
+ pr_err("Allocation:\n");
+ print_track(&alloc_info->track);
+ break;
+ case KASAN_STATE_FREE:
+ pr_err("Object freed, allocated with size %u bytes\n",
+ alloc_info->alloc_size);
+ free_info = get_free_info(cache, object);
+ pr_err("Allocation:\n");
+ print_track(&alloc_info->track);
+ pr_err("Deallocation:\n");
+ print_track(&free_info->track);
+ break;
+ }
+}
+#endif
+
static void print_address_description(struct kasan_access_info *info)
{
const void *addr = info->access_addr;
@@ -126,17 +174,10 @@ static void print_address_description(struct kasan_access_info *info)
if (PageSlab(page)) {
void *object;
struct kmem_cache *cache = page->slab_cache;
- void *last_object;
-
- object = virt_to_obj(cache, page_address(page), addr);
- last_object = page_address(page) +
- page->objects * cache->size;
-
- if (unlikely(object > last_object))
- object = last_object; /* we hit into padding */
-
+ object = nearest_obj(cache, page,
+ (void *)info->access_addr);
object_err(cache, page, object,
- "kasan: bad access detected");
+ "kasan: bad access detected");
return;
}
dump_page(page, "kasan: bad access detected");
@@ -146,7 +187,6 @@ static void print_address_description(struct kasan_access_info *info)
if (!init_task_stack_addr(addr))
pr_err("Address belongs to variable %pS\n", addr);
}
-
dump_stack();
}
@@ -214,8 +254,7 @@ static void kasan_report_error(struct kasan_access_info *info)
*/
kasan_disable_current();
spin_lock_irqsave(&report_lock, flags);
- pr_err("================================="
- "=================================\n");
+ pr_err("==================================================================\n");
if (info->access_addr <
kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) {
if ((unsigned long)info->access_addr < PAGE_SIZE)
@@ -236,8 +275,7 @@ static void kasan_report_error(struct kasan_access_info *info)
print_address_description(info);
print_shadow_for_address(info->first_bad_addr);
}
- pr_err("================================="
- "=================================\n");
+ pr_err("==================================================================\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
spin_unlock_irqrestore(&report_lock, flags);
kasan_enable_current();
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index cab58bb59..5bf191756 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -20,8 +20,7 @@ void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order);
if (!shadow) {
if (printk_ratelimit())
- printk(KERN_ERR "kmemcheck: failed to allocate "
- "shadow bitmap\n");
+ pr_err("kmemcheck: failed to allocate shadow bitmap\n");
return;
}
@@ -60,6 +59,9 @@ void kmemcheck_free_shadow(struct page *page, int order)
void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
size_t size)
{
+ if (unlikely(!object)) /* Skip object if allocation failed */
+ return;
+
/*
* Has already been memset(), which initializes the shadow for us
* as well.
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index dcdcadb69..dd3c23a80 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -49,7 +49,7 @@ static int __init kmemleak_test_init(void)
struct test_node *elem;
int i;
- printk(KERN_INFO "Kmemleak testing\n");
+ pr_info("Kmemleak testing\n");
/* make some orphan objects */
pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 25c0ad36f..e6429926e 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -276,7 +276,7 @@ static void kmemleak_disable(void);
* Print a warning and dump the stack trace.
*/
#define kmemleak_warn(x...) do { \
- pr_warning(x); \
+ pr_warn(x); \
dump_stack(); \
kmemleak_warning = 1; \
} while (0)
@@ -543,7 +543,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
if (!object) {
- pr_warning("Cannot allocate a kmemleak_object structure\n");
+ pr_warn("Cannot allocate a kmemleak_object structure\n");
kmemleak_disable();
return NULL;
}
@@ -596,8 +596,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
else if (parent->pointer + parent->size <= ptr)
link = &parent->rb_node.rb_right;
else {
- kmemleak_stop("Cannot insert 0x%lx into the object "
- "search tree (overlaps existing)\n",
+ kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n",
ptr);
/*
* No need for parent->lock here since "parent" cannot
@@ -670,8 +669,8 @@ static void delete_object_part(unsigned long ptr, size_t size)
object = find_and_remove_object(ptr, 1);
if (!object) {
#ifdef DEBUG
- kmemleak_warn("Partially freeing unknown object at 0x%08lx "
- "(size %zu)\n", ptr, size);
+ kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
+ ptr, size);
#endif
return;
}
@@ -717,8 +716,8 @@ static void paint_ptr(unsigned long ptr, int color)
object = find_and_get_object(ptr, 0);
if (!object) {
- kmemleak_warn("Trying to color unknown object "
- "at 0x%08lx as %s\n", ptr,
+ kmemleak_warn("Trying to color unknown object at 0x%08lx as %s\n",
+ ptr,
(color == KMEMLEAK_GREY) ? "Grey" :
(color == KMEMLEAK_BLACK) ? "Black" : "Unknown");
return;
@@ -764,7 +763,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
if (!area) {
- pr_warning("Cannot allocate a scan area\n");
+ pr_warn("Cannot allocate a scan area\n");
goto out;
}
@@ -1463,8 +1462,8 @@ static void kmemleak_scan(void)
if (new_leaks) {
kmemleak_found_leaks = true;
- pr_info("%d new suspected memory leaks (see "
- "/sys/kernel/debug/kmemleak)\n", new_leaks);
+ pr_info("%d new suspected memory leaks (see /sys/kernel/debug/kmemleak)\n",
+ new_leaks);
}
}
@@ -1515,7 +1514,7 @@ static void start_scan_thread(void)
return;
scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak");
if (IS_ERR(scan_thread)) {
- pr_warning("Failed to create the scan thread\n");
+ pr_warn("Failed to create the scan thread\n");
scan_thread = NULL;
}
}
@@ -1795,8 +1794,7 @@ static void kmemleak_do_cleanup(struct work_struct *work)
if (!kmemleak_found_leaks)
__kmemleak_do_cleanup();
else
- pr_info("Kmemleak disabled without freeing internal data. "
- "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n");
+ pr_info("Kmemleak disabled without freeing internal data. Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\".\n");
}
static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup);
@@ -1874,8 +1872,8 @@ void __init kmemleak_init(void)
scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
if (crt_early_log > ARRAY_SIZE(early_log))
- pr_warning("Early log buffer exceeded (%d), please increase "
- "DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log);
+ pr_warn("Early log buffer exceeded (%d), please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n",
+ crt_early_log);
/* the kernel is still in UP mode, so disabling the IRQs is enough */
local_irq_save(flags);
@@ -1960,7 +1958,7 @@ static int __init kmemleak_late_init(void)
dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL,
&kmemleak_fops);
if (!dentry)
- pr_warning("Failed to create the debugfs kmemleak file\n");
+ pr_warn("Failed to create the debugfs kmemleak file\n");
mutex_lock(&scan_mutex);
start_scan_thread();
mutex_unlock(&scan_mutex);
diff --git a/mm/ksm.c b/mm/ksm.c
index ca6d2a06a..4786b4150 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -352,13 +352,17 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
/*
* We use break_ksm to break COW on a ksm page: it's a stripped down
*
- * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
+ * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
* put_page(page);
*
* but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
* in case the application has unmapped and remapped mm,addr meanwhile.
* Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
* mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
+ *
+ * FAULT_FLAG_REMOTE/FOLL_REMOTE are used because we do this outside the
+ * context of the process that owns 'vma'. We also do not want to enforce
+ * protection keys here anyway.
*/
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
@@ -367,12 +371,14 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
do {
cond_resched();
- page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);
+ page = follow_page(vma, addr,
+ FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
if (IS_ERR_OR_NULL(page))
break;
if (PageKsm(page))
ret = handle_mm_fault(vma->vm_mm, vma, addr,
- FAULT_FLAG_WRITE);
+ FAULT_FLAG_WRITE |
+ FAULT_FLAG_REMOTE);
else
ret = VM_FAULT_WRITE;
put_page(page);
@@ -777,6 +783,7 @@ static int unmerge_and_remove_all_rmap_items(void)
}
remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
+ up_read(&mm->mmap_sem);
spin_lock(&ksm_mmlist_lock);
ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
@@ -788,12 +795,9 @@ static int unmerge_and_remove_all_rmap_items(void)
free_mm_slot(mm_slot);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
- up_read(&mm->mmap_sem);
mmdrop(mm);
- } else {
+ } else
spin_unlock(&ksm_mmlist_lock);
- up_read(&mm->mmap_sem);
- }
}
/* Clean up stable nodes, but don't worry if some are still busy */
@@ -1657,8 +1661,15 @@ next_mm:
up_read(&mm->mmap_sem);
mmdrop(mm);
} else {
- spin_unlock(&ksm_mmlist_lock);
up_read(&mm->mmap_sem);
+ /*
+ * Do up_read(&mm->mmap_sem) first: once
+ * spin_unlock(&ksm_mmlist_lock) has run, the "mm" may
+ * already have been freed under us by __ksm_exit(),
+ * because the "mm_slot" is still hashed and
+ * ksm_scan.mm_slot doesn't point to it anymore.
+ */
+ spin_unlock(&ksm_mmlist_lock);
}
/* Repeat until we've completed scanning the whole list */
diff --git a/mm/madvise.c b/mm/madvise.c
index f56825b6d..07427d3fc 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -170,7 +170,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
vma, index);
if (page)
- page_cache_release(page);
+ put_page(page);
}
return 0;
@@ -204,14 +204,14 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
page = find_get_entry(mapping, index);
if (!radix_tree_exceptional_entry(page)) {
if (page)
- page_cache_release(page);
+ put_page(page);
continue;
}
swap = radix_to_swp_entry(page);
page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
NULL, 0);
if (page)
- page_cache_release(page);
+ put_page(page);
}
lru_add_drain(); /* Push any new pages onto the LRU now */
@@ -555,8 +555,9 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
}
pr_info("Injecting memory failure for page %#lx at %#lx\n",
page_to_pfn(p), start);
- /* Ignore return value for now */
- memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
+ ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
+ if (ret)
+ return ret;
}
return 0;
}
@@ -638,14 +639,28 @@ madvise_behavior_valid(int behavior)
* some pages ahead.
* MADV_DONTNEED - the application is finished with the given range,
* so the kernel can free resources associated with it.
+ * MADV_FREE - the application marks pages in the given range as lazy free,
+ * where actual purges are postponed until memory pressure happens.
* MADV_REMOVE - the application wants to free up the given range of
* pages and associated backing store.
* MADV_DONTFORK - omit this area from child's address space when forking:
* typically, to avoid COWing pages pinned by get_user_pages().
* MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
+ * MADV_HWPOISON - trigger memory error handler as if the given memory range
+ * were corrupted by unrecoverable hardware memory failure.
+ * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
* MADV_MERGEABLE - the application recommends that KSM try to merge pages in
* this area with pages of identical content from other such areas.
* MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
+ * MADV_HUGEPAGE - the application wants to back the given range by transparent
+ * huge pages in the future. Existing pages might be coalesced and
+ * new pages might be allocated as THP.
+ * MADV_NOHUGEPAGE - mark the given range as not worth being backed by
+ * transparent huge pages so the existing pages will not be
+ * coalesced into THP and new pages will not be allocated as THP.
+ * MADV_DONTDUMP - the application wants to prevent pages in the given range
+ * from being included in its core dump.
+ * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
*
* return values:
* zero - success
diff --git a/mm/memblock.c b/mm/memblock.c
index dd7989929..b570dddb4 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -238,8 +238,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
* so we use WARN_ONCE() here to see the stack trace if
* fail happens.
*/
- WARN_ONCE(1, "memblock: bottom-up allocation failed, "
- "memory hotunplug may be affected\n");
+ WARN_ONCE(1, "memblock: bottom-up allocation failed, memory hotunplug may be affected\n");
}
return __memblock_find_range_top_down(start, end, size, align, nid,
@@ -612,14 +611,12 @@ static int __init_memblock memblock_add_region(phys_addr_t base,
int nid,
unsigned long flags)
{
- struct memblock_type *type = &memblock.memory;
-
memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
(unsigned long long)base + size - 1,
flags, (void *)_RET_IP_);
- return memblock_add_range(type, base, size, nid, flags);
+ return memblock_add_range(&memblock.memory, base, size, nid, flags);
}
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
@@ -740,14 +737,12 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base,
int nid,
unsigned long flags)
{
- struct memblock_type *type = &memblock.reserved;
-
memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
(unsigned long long)base + size - 1,
flags, (void *)_RET_IP_);
- return memblock_add_range(type, base, size, nid, flags);
+ return memblock_add_range(&memblock.reserved, base, size, nid, flags);
}
int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a65ad1d59..fe787f5c4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -269,31 +269,6 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
return (memcg == root_mem_cgroup);
}
-/*
- * We restrict the id in the range of [1, 65535], so it can fit into
- * an unsigned short.
- */
-#define MEM_CGROUP_ID_MAX USHRT_MAX
-
-static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
-{
- return memcg->css.id;
-}
-
-/*
- * A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock(). The caller is responsible for calling
- * css_tryget_online() if the mem_cgroup is used for charging. (dropping
- * refcnt from swap can be called against removed memcg.)
- */
-static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
-{
- struct cgroup_subsys_state *css;
-
- css = css_from_id(id, &memory_cgrp_subsys);
- return mem_cgroup_from_css(css);
-}
-
#ifndef CONFIG_SLOB
/*
* This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
@@ -664,9 +639,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
}
-static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid,
- unsigned int lru_mask)
+unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+ int nid, unsigned int lru_mask)
{
unsigned long nr = 0;
int zid;
@@ -1177,12 +1151,9 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
*/
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
- /* oom_info_lock ensures that parallel ooms do not interleave */
- static DEFINE_MUTEX(oom_info_lock);
struct mem_cgroup *iter;
unsigned int i;
- mutex_lock(&oom_info_lock);
rcu_read_lock();
if (p) {
@@ -1226,7 +1197,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
pr_cont("\n");
}
- mutex_unlock(&oom_info_lock);
}
/*
@@ -1711,19 +1681,13 @@ cleanup:
}
/**
- * mem_cgroup_begin_page_stat - begin a page state statistics transaction
- * @page: page that is going to change accounted state
- *
- * This function must mark the beginning of an accounted page state
- * change to prevent double accounting when the page is concurrently
- * being moved to another memcg:
+ * lock_page_memcg - lock a page->mem_cgroup binding
+ * @page: the page
*
- * memcg = mem_cgroup_begin_page_stat(page);
- * if (TestClearPageState(page))
- * mem_cgroup_update_page_stat(memcg, state, -1);
- * mem_cgroup_end_page_stat(memcg);
+ * This function protects unlocked LRU pages from being moved to
+ * another cgroup and stabilizes their page->mem_cgroup binding.
*/
-struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
+void lock_page_memcg(struct page *page)
{
struct mem_cgroup *memcg;
unsigned long flags;
@@ -1732,25 +1696,18 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
* The RCU lock is held throughout the transaction. The fast
* path can get away without acquiring the memcg->move_lock
* because page moving starts with an RCU grace period.
- *
- * The RCU lock also protects the memcg from being freed when
- * the page state that is going to change is the only thing
- * preventing the page from being uncharged.
- * E.g. end-writeback clearing PageWriteback(), which allows
- * migration to go ahead and uncharge the page before the
- * account transaction might be complete.
*/
rcu_read_lock();
if (mem_cgroup_disabled())
- return NULL;
+ return;
again:
memcg = page->mem_cgroup;
if (unlikely(!memcg))
- return NULL;
+ return;
if (atomic_read(&memcg->moving_account) <= 0)
- return memcg;
+ return;
spin_lock_irqsave(&memcg->move_lock, flags);
if (memcg != page->mem_cgroup) {
@@ -1761,21 +1718,23 @@ again:
/*
* When charge migration first begins, we can have locked and
* unlocked page stat updates happening concurrently. Track
- * the task who has the lock for mem_cgroup_end_page_stat().
+ * the task who has the lock for unlock_page_memcg().
*/
memcg->move_lock_task = current;
memcg->move_lock_flags = flags;
- return memcg;
+ return;
}
-EXPORT_SYMBOL(mem_cgroup_begin_page_stat);
+EXPORT_SYMBOL(lock_page_memcg);
/**
- * mem_cgroup_end_page_stat - finish a page state statistics transaction
- * @memcg: the memcg that was accounted against
+ * unlock_page_memcg - unlock a page->mem_cgroup binding
+ * @page: the page
*/
-void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
+void unlock_page_memcg(struct page *page)
{
+ struct mem_cgroup *memcg = page->mem_cgroup;
+
if (memcg && memcg->move_lock_task == current) {
unsigned long flags = memcg->move_lock_flags;
@@ -1787,7 +1746,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
rcu_read_unlock();
}
-EXPORT_SYMBOL(mem_cgroup_end_page_stat);
+EXPORT_SYMBOL(unlock_page_memcg);
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -2363,9 +2322,6 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
struct page_counter *counter;
int ret;
- if (!memcg_kmem_online(memcg))
- return 0;
-
ret = try_charge(memcg, gfp, nr_pages);
if (ret)
return ret;
@@ -2384,10 +2340,11 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
{
struct mem_cgroup *memcg;
- int ret;
+ int ret = 0;
memcg = get_mem_cgroup_from_mm(current->mm);
- ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+ if (!mem_cgroup_is_root(memcg))
+ ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
css_put(&memcg->css);
return ret;
}
@@ -2757,39 +2714,48 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
return retval;
}
-static unsigned long tree_stat(struct mem_cgroup *memcg,
- enum mem_cgroup_stat_index idx)
+static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
{
struct mem_cgroup *iter;
- unsigned long val = 0;
+ int i;
- for_each_mem_cgroup_tree(iter, memcg)
- val += mem_cgroup_read_stat(iter, idx);
+ memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT);
- return val;
+ for_each_mem_cgroup_tree(iter, memcg) {
+ for (i = 0; i < MEMCG_NR_STAT; i++)
+ stat[i] += mem_cgroup_read_stat(iter, i);
+ }
}
-static unsigned long tree_events(struct mem_cgroup *memcg,
- enum mem_cgroup_events_index idx)
+static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
{
struct mem_cgroup *iter;
- unsigned long val = 0;
+ int i;
- for_each_mem_cgroup_tree(iter, memcg)
- val += mem_cgroup_read_events(iter, idx);
+ memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS);
- return val;
+ for_each_mem_cgroup_tree(iter, memcg) {
+ for (i = 0; i < MEMCG_NR_EVENTS; i++)
+ events[i] += mem_cgroup_read_events(iter, i);
+ }
}
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
- unsigned long val;
+ unsigned long val = 0;
if (mem_cgroup_is_root(memcg)) {
- val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
- val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
- if (swap)
- val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
+ struct mem_cgroup *iter;
+
+ for_each_mem_cgroup_tree(iter, memcg) {
+ val += mem_cgroup_read_stat(iter,
+ MEM_CGROUP_STAT_CACHE);
+ val += mem_cgroup_read_stat(iter,
+ MEM_CGROUP_STAT_RSS);
+ if (swap)
+ val += mem_cgroup_read_stat(iter,
+ MEM_CGROUP_STAT_SWAP);
+ }
} else {
if (!swap)
val = page_counter_read(&memcg->memory);
@@ -2855,6 +2821,9 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
{
int memcg_id;
+ if (cgroup_memory_nokmem)
+ return 0;
+
BUG_ON(memcg->kmemcg_id >= 0);
BUG_ON(memcg->kmem_state);
@@ -2875,24 +2844,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
return 0;
}
-static int memcg_propagate_kmem(struct mem_cgroup *parent,
- struct mem_cgroup *memcg)
-{
- int ret = 0;
-
- mutex_lock(&memcg_limit_mutex);
- /*
- * If the parent cgroup is not kmem-online now, it cannot be
- * onlined after this point, because it has at least one child
- * already.
- */
- if (memcg_kmem_online(parent) ||
- (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nokmem))
- ret = memcg_online_kmem(memcg);
- mutex_unlock(&memcg_limit_mutex);
- return ret;
-}
-
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
struct cgroup_subsys_state *css;
@@ -2951,10 +2902,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
}
}
#else
-static int memcg_propagate_kmem(struct mem_cgroup *parent, struct mem_cgroup *memcg)
-{
- return 0;
-}
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
return 0;
@@ -2970,22 +2917,10 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
unsigned long limit)
{
- int ret = 0;
+ int ret;
mutex_lock(&memcg_limit_mutex);
- /* Top-level cgroup doesn't propagate from root */
- if (!memcg_kmem_online(memcg)) {
- if (cgroup_is_populated(memcg->css.cgroup) ||
- (memcg->use_hierarchy && memcg_has_children(memcg)))
- ret = -EBUSY;
- if (ret)
- goto out;
- ret = memcg_online_kmem(memcg);
- if (ret)
- goto out;
- }
ret = page_counter_limit(&memcg->kmem, limit);
-out:
mutex_unlock(&memcg_limit_mutex);
return ret;
}
@@ -4236,7 +4171,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
return &memcg->css;
}
- error = memcg_propagate_kmem(parent, memcg);
+ error = memcg_online_kmem(memcg);
if (error)
goto fail;
@@ -4320,9 +4255,11 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
- mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
- memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
+ page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX);
+ page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX);
+ page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX);
+ page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX);
+ page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX);
memcg->low = 0;
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
@@ -4490,7 +4427,7 @@ static int mem_cgroup_move_account(struct page *page,
VM_BUG_ON(compound && !PageTransHuge(page));
/*
- * Prevent mem_cgroup_replace_page() from looking at
+ * Prevent mem_cgroup_migrate() from looking at
* page->mem_cgroup of its source page while we change it.
*/
ret = -EBUSY;
@@ -4932,9 +4869,9 @@ static void mem_cgroup_move_charge(void)
lru_add_drain_all();
/*
- * Signal mem_cgroup_begin_page_stat() to take the memcg's
- * move_lock while we're moving its pages to another memcg.
- * Then wait for already started RCU-only updates to finish.
+ * Signal lock_page_memcg() to take the memcg's move_lock
+ * while we're moving its pages to another memcg. Then wait
+ * for already started RCU-only updates to finish.
*/
atomic_inc(&mc.from->moving_account);
synchronize_rcu();
@@ -5150,6 +5087,8 @@ static int memory_events_show(struct seq_file *m, void *v)
static int memory_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long stat[MEMCG_NR_STAT];
+ unsigned long events[MEMCG_NR_EVENTS];
int i;
/*
@@ -5163,22 +5102,27 @@ static int memory_stat_show(struct seq_file *m, void *v)
* Current memory state:
*/
+ tree_stat(memcg, stat);
+ tree_events(memcg, events);
+
seq_printf(m, "anon %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE);
seq_printf(m, "file %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE);
+ seq_printf(m, "kernel_stack %llu\n",
+ (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE);
+ seq_printf(m, "slab %llu\n",
+ (u64)(stat[MEMCG_SLAB_RECLAIMABLE] +
+ stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
seq_printf(m, "sock %llu\n",
- (u64)tree_stat(memcg, MEMCG_SOCK) * PAGE_SIZE);
+ (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
seq_printf(m, "file_mapped %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) *
- PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE);
seq_printf(m, "file_dirty %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_DIRTY) *
- PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE);
seq_printf(m, "file_writeback %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) *
- PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE);
for (i = 0; i < NR_LRU_LISTS; i++) {
struct mem_cgroup *mi;
@@ -5190,12 +5134,17 @@ static int memory_stat_show(struct seq_file *m, void *v)
mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
}
+ seq_printf(m, "slab_reclaimable %llu\n",
+ (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE);
+ seq_printf(m, "slab_unreclaimable %llu\n",
+ (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
+
/* Accumulated memory events */
seq_printf(m, "pgfault %lu\n",
- tree_events(memcg, MEM_CGROUP_EVENTS_PGFAULT));
+ events[MEM_CGROUP_EVENTS_PGFAULT]);
seq_printf(m, "pgmajfault %lu\n",
- tree_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT));
+ events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
return 0;
}
@@ -5468,6 +5417,10 @@ static void uncharge_list(struct list_head *page_list)
struct list_head *next;
struct page *page;
+ /*
+ * Note that the list can be a single page->lru; hence the
+ * do-while loop instead of a simple list_for_each_entry().
+ */
next = page_list->next;
do {
unsigned int nr_pages = 1;
@@ -5554,16 +5507,16 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
}
/**
- * mem_cgroup_replace_page - migrate a charge to another page
- * @oldpage: currently charged page
- * @newpage: page to transfer the charge to
+ * mem_cgroup_migrate - charge a page's replacement
+ * @oldpage: currently circulating page
+ * @newpage: replacement page
*
- * Migrate the charge from @oldpage to @newpage.
+ * Charge @newpage as a replacement page for @oldpage. @oldpage will
+ * be uncharged upon free.
*
* Both pages must be locked, @newpage->mapping must be set up.
- * Either or both pages might be on the LRU already.
*/
-void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
+void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
{
struct mem_cgroup *memcg;
unsigned int nr_pages;
@@ -5596,7 +5549,7 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
page_counter_charge(&memcg->memsw, nr_pages);
css_get_many(&memcg->css, nr_pages);
- commit_charge(newpage, memcg, true);
+ commit_charge(newpage, memcg, false);
local_irq_disable();
mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ac595e7a3..ca5acee53 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -184,9 +184,8 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
struct siginfo si;
int ret;
- printk(KERN_ERR
- "MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
- pfn, t->comm, t->pid);
+ pr_err("MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
+ pfn, t->comm, t->pid);
si.si_signo = SIGBUS;
si.si_errno = 0;
si.si_addr = (void *)addr;
@@ -209,8 +208,8 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
}
if (ret < 0)
- printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
- t->comm, t->pid, ret);
+ pr_info("MCE: Error sending signal to %s:%d: %d\n",
+ t->comm, t->pid, ret);
return ret;
}
@@ -290,8 +289,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
} else {
tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
if (!tk) {
- printk(KERN_ERR
- "MCE: Out of memory while machine check handling\n");
+ pr_err("MCE: Out of memory while machine check handling\n");
return;
}
}
@@ -336,9 +334,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
* signal and then access the memory. Just kill it.
*/
if (fail || tk->addr_valid == 0) {
- printk(KERN_ERR
- "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
- pfn, tk->tsk->comm, tk->tsk->pid);
+ pr_err("MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
+ pfn, tk->tsk->comm, tk->tsk->pid);
force_sig(SIGKILL, tk->tsk);
}
@@ -350,9 +347,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
*/
else if (kill_proc(tk->tsk, tk->addr, trapno,
pfn, page, flags) < 0)
- printk(KERN_ERR
- "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
- pfn, tk->tsk->comm, tk->tsk->pid);
+ pr_err("MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
+ pfn, tk->tsk->comm, tk->tsk->pid);
}
put_task_struct(tk->tsk);
kfree(tk);
@@ -542,7 +538,7 @@ static int delete_from_lru_cache(struct page *p)
/*
* drop the page count elevated by isolate_lru_page()
*/
- page_cache_release(p);
+ put_page(p);
return 0;
}
return -EIO;
@@ -563,7 +559,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
*/
static int me_unknown(struct page *p, unsigned long pfn)
{
- printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
+ pr_err("MCE %#lx: Unknown page state\n", pfn);
return MF_FAILED;
}
@@ -608,8 +604,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
if (mapping->a_ops->error_remove_page) {
err = mapping->a_ops->error_remove_page(mapping, p);
if (err != 0) {
- printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
- pfn, err);
+ pr_info("MCE %#lx: Failed to punch page: %d\n",
+ pfn, err);
} else if (page_has_private(p) &&
!try_to_release_page(p, GFP_NOIO)) {
pr_info("MCE %#lx: failed to release buffers\n", pfn);
@@ -624,8 +620,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
if (invalidate_inode_page(p))
ret = MF_RECOVERED;
else
- printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
- pfn);
+ pr_info("MCE %#lx: Failed to invalidate\n", pfn);
}
return ret;
}
@@ -826,8 +821,6 @@ static struct page_state {
#undef lru
#undef swapbacked
#undef head
-#undef tail
-#undef compound
#undef slab
#undef reserved
@@ -856,8 +849,7 @@ static int page_action(struct page_state *ps, struct page *p,
if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
count--;
if (count != 0) {
- printk(KERN_ERR
- "MCE %#lx: %s still referenced by %d users\n",
+ pr_err("MCE %#lx: %s still referenced by %d users\n",
pfn, action_page_types[ps->type], count);
result = MF_FAILED;
}
@@ -896,7 +888,15 @@ int get_hwpoison_page(struct page *page)
}
}
- return get_page_unless_zero(head);
+ if (get_page_unless_zero(head)) {
+ if (head == compound_head(page))
+ return 1;
+
+ pr_info("MCE: %#lx cannot catch tail\n", page_to_pfn(page));
+ put_page(head);
+ }
+
+ return 0;
}
EXPORT_SYMBOL_GPL(get_hwpoison_page);
@@ -936,8 +936,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
}
if (PageSwapCache(p)) {
- printk(KERN_ERR
- "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
+ pr_err("MCE %#lx: keeping poisoned page in swap cache\n", pfn);
ttu |= TTU_IGNORE_HWPOISON;
}
@@ -955,8 +954,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
} else {
kill = 0;
ttu |= TTU_IGNORE_HWPOISON;
- printk(KERN_INFO
- "MCE %#lx: corrupted page was clean: dropped without side effects\n",
+ pr_info("MCE %#lx: corrupted page was clean: dropped without side effects\n",
pfn);
}
}
@@ -974,8 +972,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
ret = try_to_unmap(hpage, ttu);
if (ret != SWAP_SUCCESS)
- printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
- pfn, page_mapcount(hpage));
+ pr_err("MCE %#lx: failed to unmap page (mapcount=%d)\n",
+ pfn, page_mapcount(hpage));
/*
* Now that the dirty bit has been propagated to the
@@ -1042,16 +1040,14 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
panic("Memory failure from trap %d on page %lx", trapno, pfn);
if (!pfn_valid(pfn)) {
- printk(KERN_ERR
- "MCE %#lx: memory outside kernel control\n",
- pfn);
+ pr_err("MCE %#lx: memory outside kernel control\n", pfn);
return -ENXIO;
}
p = pfn_to_page(pfn);
orig_head = hpage = compound_head(p);
if (TestSetPageHWPoison(p)) {
- printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
+ pr_err("MCE %#lx: already hardware poisoned\n", pfn);
return 0;
}
@@ -1182,7 +1178,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* unpoison always clear PG_hwpoison inside page lock
*/
if (!PageHWPoison(p)) {
- printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
+ pr_err("MCE %#lx: just unpoisoned\n", pfn);
num_poisoned_pages_sub(nr_pages);
unlock_page(hpage);
put_hwpoison_page(hpage);
diff --git a/mm/memory.c b/mm/memory.c
index 869aa2cb2..dc696bcf0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -65,6 +65,7 @@
#include <linux/userfaultfd_k.h>
#include <asm/io.h>
+#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
@@ -562,8 +563,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
}
-int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
- pmd_t *pmd, unsigned long address)
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
spinlock_t *ptl;
pgtable_t new = pte_alloc_one(mm, address);
@@ -661,9 +661,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
return;
}
if (nr_unshown) {
- printk(KERN_ALERT
- "BUG: Bad page map: %lu messages suppressed\n",
- nr_unshown);
+ pr_alert("BUG: Bad page map: %lu messages suppressed\n",
+ nr_unshown);
nr_unshown = 0;
}
nr_shown = 0;
@@ -674,15 +673,13 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
index = linear_page_index(vma, addr);
- printk(KERN_ALERT
- "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
- current->comm,
- (long long)pte_val(pte), (long long)pmd_val(*pmd));
+ pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
+ current->comm,
+ (long long)pte_val(pte), (long long)pmd_val(*pmd));
if (page)
dump_page(page, "bad pte");
- printk(KERN_ALERT
- "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
- (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
+ pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
+ (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
/*
* Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
*/
@@ -1145,6 +1142,12 @@ again:
if (!PageAnon(page)) {
if (pte_dirty(ptent)) {
+ /*
+ * oom_reaper cannot tear down dirty
+ * pages
+ */
+ if (unlikely(details && details->ignore_dirty))
+ continue;
force_flush = 1;
set_page_dirty(page);
}
@@ -1163,8 +1166,8 @@ again:
}
continue;
}
- /* If details->check_mapping, we leave swap entries. */
- if (unlikely(details))
+ /* only check swap_entries if explicitly asked for in details */
+ if (unlikely(details && !details->check_swap_entries))
continue;
entry = pte_to_swp_entry(ptent);
@@ -1219,15 +1222,8 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE) {
-#ifdef CONFIG_DEBUG_VM
- if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
- pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
- __func__, addr, end,
- vma->vm_start,
- vma->vm_end);
- BUG();
- }
-#endif
+ VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
+ !rwsem_is_locked(&tlb->mm->mmap_sem), vma);
split_huge_pmd(vma, pmd, addr);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
@@ -1269,7 +1265,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
return addr;
}
-static void unmap_page_range(struct mmu_gather *tlb,
+void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
struct zap_details *details)
@@ -1277,9 +1273,6 @@ static void unmap_page_range(struct mmu_gather *tlb,
pgd_t *pgd;
unsigned long next;
- if (details && !details->check_mapping)
- details = NULL;
-
BUG_ON(addr >= end);
tlb_start_vma(tlb, vma);
pgd = pgd_offset(vma->vm_mm, addr);
@@ -1591,8 +1584,29 @@ out:
int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn)
{
+ return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_insert_pfn);
+
+/**
+ * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @pfn: source kernel pfn
+ * @pgprot: pgprot flags for the inserted page
+ *
+ * This is exactly like vm_insert_pfn, except that it allows drivers to
+ * override pgprot on a per-page basis.
+ *
+ * This only makes sense for IO mappings, and it makes no sense for
+ * COW mappings. In general, using multiple VMAs is preferable;
+ * vm_insert_pfn_prot should only be used if using multiple VMAs is
+ * impractical.
+ */
+int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, pgprot_t pgprot)
+{
int ret;
- pgprot_t pgprot = vma->vm_page_prot;
/*
* Technically, architectures with pte_special can avoid all these
* restrictions (same for remap_pfn_range). However we would like
@@ -1614,7 +1628,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
return ret;
}
-EXPORT_SYMBOL(vm_insert_pfn);
+EXPORT_SYMBOL(vm_insert_pfn_prot);
int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn)
@@ -1916,7 +1930,9 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
unsigned long end = addr + size;
int err;
- BUG_ON(addr >= end);
+ if (WARN_ON(addr >= end))
+ return -EINVAL;
+
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
@@ -2071,7 +2087,7 @@ static inline int wp_page_reuse(struct mm_struct *mm,
VM_BUG_ON_PAGE(PageAnon(page), page);
mapping = page->mapping;
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if ((dirtied || page_mkwrite) && mapping) {
/*
@@ -2205,7 +2221,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
}
if (new_page)
- page_cache_release(new_page);
+ put_page(new_page);
pte_unmap_unlock(page_table, ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -2220,14 +2236,14 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
munlock_vma_page(old_page);
unlock_page(old_page);
}
- page_cache_release(old_page);
+ put_page(old_page);
}
return page_copied ? VM_FAULT_WRITE : 0;
oom_free_new:
- page_cache_release(new_page);
+ put_page(new_page);
oom:
if (old_page)
- page_cache_release(old_page);
+ put_page(old_page);
return VM_FAULT_OOM;
}
@@ -2275,7 +2291,7 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
{
int page_mkwrite = 0;
- page_cache_get(old_page);
+ get_page(old_page);
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
int tmp;
@@ -2284,7 +2300,7 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
tmp = do_page_mkwrite(vma, old_page, address);
if (unlikely(!tmp || (tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- page_cache_release(old_page);
+ put_page(old_page);
return tmp;
}
/*
@@ -2298,7 +2314,7 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
pte_unmap_unlock(page_table, ptl);
- page_cache_release(old_page);
+ put_page(old_page);
return 0;
}
page_mkwrite = 1;
@@ -2357,8 +2373,9 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
* not dirty accountable.
*/
if (PageAnon(old_page) && !PageKsm(old_page)) {
+ int total_mapcount;
if (!trylock_page(old_page)) {
- page_cache_get(old_page);
+ get_page(old_page);
pte_unmap_unlock(page_table, ptl);
lock_page(old_page);
page_table = pte_offset_map_lock(mm, pmd, address,
@@ -2366,18 +2383,23 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
pte_unmap_unlock(page_table, ptl);
- page_cache_release(old_page);
+ put_page(old_page);
return 0;
}
- page_cache_release(old_page);
+ put_page(old_page);
}
- if (reuse_swap_page(old_page)) {
- /*
- * The page is all ours. Move it to our anon_vma so
- * the rmap code will not search our parent or siblings.
- * Protected against the rmap code by the page lock.
- */
- page_move_anon_rmap(old_page, vma, address);
+ if (reuse_swap_page(old_page, &total_mapcount)) {
+ if (total_mapcount == 1) {
+ /*
+ * The page is all ours. Move it to
+ * our anon_vma so the rmap code will
+ * not search our parent or siblings.
+ * Protected against the rmap code by
+ * the page lock.
+ */
+ page_move_anon_rmap(compound_head(old_page),
+ vma, address);
+ }
unlock_page(old_page);
return wp_page_reuse(mm, vma, address, page_table, ptl,
orig_pte, old_page, 0, 0);
@@ -2392,7 +2414,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* Ok, we need to copy. Oh, well..
*/
- page_cache_get(old_page);
+ get_page(old_page);
pte_unmap_unlock(page_table, ptl);
return wp_page_copy(mm, vma, address, page_table, pmd,
@@ -2417,7 +2439,6 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
vba = vma->vm_pgoff;
vea = vba + vma_pages(vma) - 1;
- /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
zba = details->first_index;
if (zba < vba)
zba = vba;
@@ -2452,7 +2473,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows)
{
- struct zap_details details;
+ struct zap_details details = { };
pgoff_t hba = holebegin >> PAGE_SHIFT;
pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -2602,7 +2623,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
inc_mm_counter_fast(mm, MM_ANONPAGES);
dec_mm_counter_fast(mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
- if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
+ if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
@@ -2636,7 +2657,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
* parallel locked swapcache.
*/
unlock_page(swapcache);
- page_cache_release(swapcache);
+ put_page(swapcache);
}
if (flags & FAULT_FLAG_WRITE) {
@@ -2658,10 +2679,10 @@ out_nomap:
out_page:
unlock_page(page);
out_release:
- page_cache_release(page);
+ put_page(page);
if (page != swapcache) {
unlock_page(swapcache);
- page_cache_release(swapcache);
+ put_page(swapcache);
}
return ret;
}
@@ -2769,7 +2790,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(page_table, ptl);
mem_cgroup_cancel_charge(page, memcg, false);
- page_cache_release(page);
+ put_page(page);
return handle_userfault(vma, address, flags,
VM_UFFD_MISSING);
}
@@ -2788,10 +2809,10 @@ unlock:
return 0;
release:
mem_cgroup_cancel_charge(page, memcg, false);
- page_cache_release(page);
+ put_page(page);
goto unlock;
oom_free_page:
- page_cache_release(page);
+ put_page(page);
oom:
return VM_FAULT_OOM;
}
@@ -2824,7 +2845,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
if (unlikely(PageHWPoison(vmf.page))) {
if (ret & VM_FAULT_LOCKED)
unlock_page(vmf.page);
- page_cache_release(vmf.page);
+ put_page(vmf.page);
return VM_FAULT_HWPOISON;
}
@@ -3013,7 +3034,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(!pte_same(*pte, orig_pte))) {
pte_unmap_unlock(pte, ptl);
unlock_page(fault_page);
- page_cache_release(fault_page);
+ put_page(fault_page);
return ret;
}
do_set_pte(vma, address, fault_page, pte, false, false);
@@ -3041,7 +3062,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) {
- page_cache_release(new_page);
+ put_page(new_page);
return VM_FAULT_OOM;
}
@@ -3058,7 +3079,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap_unlock(pte, ptl);
if (fault_page) {
unlock_page(fault_page);
- page_cache_release(fault_page);
+ put_page(fault_page);
} else {
/*
* The fault handler has no page to lock, so it holds
@@ -3074,7 +3095,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap_unlock(pte, ptl);
if (fault_page) {
unlock_page(fault_page);
- page_cache_release(fault_page);
+ put_page(fault_page);
} else {
/*
* The fault handler has no page to lock, so it holds
@@ -3085,7 +3106,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return ret;
uncharge_out:
mem_cgroup_cancel_charge(new_page, memcg, false);
- page_cache_release(new_page);
+ put_page(new_page);
return ret;
}
@@ -3113,7 +3134,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
tmp = do_page_mkwrite(vma, fault_page, address);
if (unlikely(!tmp ||
(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- page_cache_release(fault_page);
+ put_page(fault_page);
return tmp;
}
}
@@ -3122,7 +3143,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(!pte_same(*pte, orig_pte))) {
pte_unmap_unlock(pte, ptl);
unlock_page(fault_page);
- page_cache_release(fault_page);
+ put_page(fault_page);
return ret;
}
do_set_pte(vma, address, fault_page, pte, true, false);
@@ -3162,8 +3183,7 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
- pgoff_t pgoff = (((address & PAGE_MASK)
- - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ pgoff_t pgoff = linear_page_index(vma, address);
pte_unmap(page_table);
/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
@@ -3397,6 +3417,11 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pmd_t *pmd;
pte_t *pte;
+ if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
+ flags & FAULT_FLAG_INSTRUCTION,
+ flags & FAULT_FLAG_REMOTE))
+ return VM_FAULT_SIGSEGV;
+
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
@@ -3437,12 +3462,11 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
/*
- * Use __pte_alloc instead of pte_alloc_map, because we can't
+ * Use pte_alloc() instead of pte_alloc_map, because we can't
* run pte_offset_map on the pmd, if an huge pmd could
* materialize from under us from a different thread.
*/
- if (unlikely(pmd_none(*pmd)) &&
- unlikely(__pte_alloc(mm, vma, pmd, address)))
+ if (unlikely(pte_alloc(mm, pmd, address)))
return VM_FAULT_OOM;
/*
* If a huge pmd materialized under us just retry later. Use
@@ -3714,7 +3738,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
void *maddr;
struct page *page = NULL;
- ret = get_user_pages(tsk, mm, addr, 1,
+ ret = get_user_pages_remote(tsk, mm, addr, 1,
write, 1, &page, &vma);
if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
@@ -3750,7 +3774,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
buf, maddr + offset, bytes);
}
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
len -= bytes;
buf += bytes;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 4af58a3a8..aa34431c3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -33,6 +33,7 @@
#include <linux/hugetlb.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
+#include <linux/compaction.h>
#include <asm/tlbflush.h>
@@ -77,6 +78,9 @@ static struct {
#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map)
#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map)
+bool memhp_auto_online;
+EXPORT_SYMBOL_GPL(memhp_auto_online);
+
void get_online_mems(void)
{
might_sleep();
@@ -138,7 +142,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
res->name = "System RAM";
res->start = start;
res->end = start + size - 1;
- res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
if (request_resource(&iomem_resource, res) < 0) {
pr_debug("System RAM resource %pR cannot be added\n", res);
kfree(res);
@@ -163,7 +167,7 @@ void get_page_bootmem(unsigned long info, struct page *page,
page->lru.next = (struct list_head *) type;
SetPagePrivate(page);
set_page_private(page, info);
- atomic_inc(&page->_count);
+ page_ref_inc(page);
}
void put_page_bootmem(struct page *page)
@@ -174,7 +178,7 @@ void put_page_bootmem(struct page *page)
BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
- if (atomic_dec_return(&page->_count) == 1) {
+ if (page_ref_dec_return(page) == 1) {
ClearPagePrivate(page);
set_page_private(page, 0);
INIT_LIST_HEAD(&page->lru);
@@ -509,6 +513,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
int start_sec, end_sec;
struct vmem_altmap *altmap;
+ clear_zone_contiguous(zone);
+
/* during initialize mem_map, align hot-added range to section */
start_sec = pfn_to_section_nr(phys_start_pfn);
end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
@@ -521,7 +527,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
if (altmap->base_pfn != phys_start_pfn
|| vmem_altmap_offset(altmap) > nr_pages) {
pr_warn_once("memory add fail, invalid altmap\n");
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
altmap->alloc = 0;
}
@@ -539,7 +546,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
err = 0;
}
vmemmap_populate_print_last();
-
+out:
+ set_zone_contiguous(zone);
return err;
}
EXPORT_SYMBOL_GPL(__add_pages);
@@ -811,6 +819,8 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
}
}
+ clear_zone_contiguous(zone);
+
/*
* We can only remove entire sections
*/
@@ -826,6 +836,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
if (ret)
break;
}
+
+ set_zone_contiguous(zone);
+
return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);
@@ -1042,14 +1055,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
arg.nr_pages = nr_pages;
node_states_check_changes_online(nr_pages, zone, &arg);
- nid = pfn_to_nid(pfn);
+ nid = zone_to_nid(zone);
ret = memory_notify(MEM_GOING_ONLINE, &arg);
ret = notifier_to_errno(ret);
- if (ret) {
- memory_notify(MEM_CANCEL_ONLINE, &arg);
- return ret;
- }
+ if (ret)
+ goto failed_addition;
+
/*
* If this zone is not populated, then it is not in zonelist.
* This means the page allocator ignores this zone.
@@ -1067,12 +1079,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
if (need_zonelists_rebuild)
zone_pcp_reset(zone);
mutex_unlock(&zonelists_mutex);
- printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
- (unsigned long long) pfn << PAGE_SHIFT,
- (((unsigned long long) pfn + nr_pages)
- << PAGE_SHIFT) - 1);
- memory_notify(MEM_CANCEL_ONLINE, &arg);
- return ret;
+ goto failed_addition;
}
zone->present_pages += onlined_pages;
@@ -1082,7 +1089,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
pgdat_resize_unlock(zone->zone_pgdat, &flags);
if (onlined_pages) {
- node_states_set_node(zone_to_nid(zone), &arg);
+ node_states_set_node(nid, &arg);
if (need_zonelists_rebuild)
build_all_zonelists(NULL, NULL);
else
@@ -1093,8 +1100,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
init_per_zone_wmark_min();
- if (onlined_pages)
- kswapd_run(zone_to_nid(zone));
+ if (onlined_pages) {
+ kswapd_run(nid);
+ kcompactd_run(nid);
+ }
vm_total_pages = nr_free_pagecache_pages();
@@ -1103,6 +1112,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
if (onlined_pages)
memory_notify(MEM_ONLINE, &arg);
return 0;
+
+failed_addition:
+ pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
+ (unsigned long long) pfn << PAGE_SHIFT,
+ (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
+ memory_notify(MEM_CANCEL_ONLINE, &arg);
+ return ret;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
@@ -1261,8 +1277,13 @@ int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
return zone_default;
}
+static int online_memory_block(struct memory_block *mem, void *arg)
+{
+ return memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
+}
+
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-int __ref add_memory_resource(int nid, struct resource *res)
+int __ref add_memory_resource(int nid, struct resource *res, bool online)
{
u64 start, size;
pg_data_t *pgdat = NULL;
@@ -1322,6 +1343,11 @@ int __ref add_memory_resource(int nid, struct resource *res)
/* create new memmap entry */
firmware_map_add_hotplug(start, start + size, "System RAM");
+ /* online pages if requested */
+ if (online)
+ walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
+ NULL, online_memory_block);
+
goto out;
error:
@@ -1345,7 +1371,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
if (IS_ERR(res))
return PTR_ERR(res);
- ret = add_memory_resource(nid, res);
+ ret = add_memory_resource(nid, res, memhp_auto_online);
if (ret < 0)
release_memory_resource(res);
return ret;
@@ -1504,8 +1530,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
} else {
#ifdef CONFIG_DEBUG_VM
- printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
- pfn);
+ pr_alert("removing pfn %lx from LRU failed\n", pfn);
dump_page(page, "failed to remove from LRU");
#endif
put_page(page);
@@ -1833,7 +1858,7 @@ repeat:
ret = -EBUSY;
goto failed_removal;
}
- printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
+ pr_info("Offlined Pages %ld\n", offlined_pages);
/* Ok, all of our target is isolated.
We cannot do rollback at this point. */
offline_isolated_pages(start_pfn, end_pfn);
@@ -1858,8 +1883,10 @@ repeat:
zone_pcp_update(zone);
node_states_clear_node(node, &arg);
- if (arg.status_change_nid >= 0)
+ if (arg.status_change_nid >= 0) {
kswapd_stop(node);
+ kcompactd_stop(node);
+ }
vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();
@@ -1868,9 +1895,9 @@ repeat:
return 0;
failed_removal:
- printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
- (unsigned long long) start_pfn << PAGE_SHIFT,
- ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
+ pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n",
+ (unsigned long long) start_pfn << PAGE_SHIFT,
+ ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
memory_notify(MEM_CANCEL_OFFLINE, &arg);
/* pushback to free area */
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
@@ -1943,8 +1970,7 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
- pr_warn("removing memory fails, because memory "
- "[%pa-%pa] is onlined\n",
+ pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
&beginpa, &endpa);
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9a3f6b90e..36cc01bc9 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -643,7 +643,9 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
if (flags & MPOL_MF_LAZY) {
/* Similar to task_numa_work, skip inaccessible VMAs */
- if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
+ if (!is_vm_hugetlb_page(vma) &&
+ (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
+ !(vma->vm_flags & VM_MIXEDMAP))
change_prot_numa(vma, start, endvma);
return 1;
}
@@ -844,12 +846,12 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
}
}
-static int lookup_node(struct mm_struct *mm, unsigned long addr)
+static int lookup_node(unsigned long addr)
{
struct page *p;
int err;
- err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
+ err = get_user_pages(addr & PAGE_MASK, 1, 0, 0, &p, NULL);
if (err >= 0) {
err = page_to_nid(p);
put_page(p);
@@ -904,7 +906,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
if (flags & MPOL_F_NODE) {
if (flags & MPOL_F_ADDR) {
- err = lookup_node(mm, addr);
+ err = lookup_node(addr);
if (err < 0)
goto out;
*policy = err;
@@ -2557,9 +2559,7 @@ static void __init check_numabalancing_enable(void)
set_numabalancing_state(numabalancing_override == 1);
if (num_online_nodes() > 1 && !numabalancing_override) {
- pr_info("%s automatic NUMA balancing. "
- "Configure with numa_balancing= or the "
- "kernel.numa_balancing sysctl",
+ pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
numabalancing_default ? "Enabling" : "Disabling");
set_numabalancing_state(numabalancing_default);
}
diff --git a/mm/mempool.c b/mm/mempool.c
index 7924f4f58..9b7a14a79 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -112,12 +112,12 @@ static void kasan_poison_element(mempool_t *pool, void *element)
kasan_free_pages(element, (unsigned long)pool->pool_data);
}
-static void kasan_unpoison_element(mempool_t *pool, void *element)
+static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags)
{
if (pool->alloc == mempool_alloc_slab)
- kasan_slab_alloc(pool->pool_data, element);
+ kasan_slab_alloc(pool->pool_data, element, flags);
if (pool->alloc == mempool_kmalloc)
- kasan_krealloc(element, (size_t)pool->pool_data);
+ kasan_krealloc(element, (size_t)pool->pool_data, flags);
if (pool->alloc == mempool_alloc_pages)
kasan_alloc_pages(element, (unsigned long)pool->pool_data);
}
@@ -130,12 +130,12 @@ static void add_element(mempool_t *pool, void *element)
pool->elements[pool->curr_nr++] = element;
}
-static void *remove_element(mempool_t *pool)
+static void *remove_element(mempool_t *pool, gfp_t flags)
{
void *element = pool->elements[--pool->curr_nr];
BUG_ON(pool->curr_nr < 0);
- kasan_unpoison_element(pool, element);
+ kasan_unpoison_element(pool, element, flags);
check_element(pool, element);
return element;
}
@@ -154,7 +154,7 @@ void mempool_destroy(mempool_t *pool)
return;
while (pool->curr_nr) {
- void *element = remove_element(pool);
+ void *element = remove_element(pool, GFP_KERNEL);
pool->free(element, pool->pool_data);
}
kfree(pool->elements);
@@ -250,7 +250,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr)
spin_lock_irqsave(&pool->lock, flags);
if (new_min_nr <= pool->min_nr) {
while (new_min_nr < pool->curr_nr) {
- element = remove_element(pool);
+ element = remove_element(pool, GFP_KERNEL);
spin_unlock_irqrestore(&pool->lock, flags);
pool->free(element, pool->pool_data);
spin_lock_irqsave(&pool->lock, flags);
@@ -310,25 +310,36 @@ EXPORT_SYMBOL(mempool_resize);
* returns NULL. Note that due to preallocation, this function
* *never* fails when called from process contexts. (it might
* fail if called from an IRQ context.)
- * Note: using __GFP_ZERO is not supported.
+ * Note: neither __GFP_NOMEMALLOC nor __GFP_ZERO are supported.
*/
-void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
+void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
{
void *element;
unsigned long flags;
wait_queue_t wait;
gfp_t gfp_temp;
+ /* If oom killed, memory reserves are essential to prevent livelock */
+ VM_WARN_ON_ONCE(gfp_mask & __GFP_NOMEMALLOC);
+ /* No element size to zero on allocation */
VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
+
might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
- gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */
gfp_mask |= __GFP_NOWARN; /* failures are OK */
gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO);
repeat_alloc:
+ if (likely(pool->curr_nr)) {
+ /*
+ * Don't allocate from emergency reserves if there are
+ * elements available. This check is racy, but it will
+ * be rechecked each loop.
+ */
+ gfp_temp |= __GFP_NOMEMALLOC;
+ }
element = pool->alloc(gfp_temp, pool->pool_data);
if (likely(element != NULL))
@@ -336,7 +347,7 @@ repeat_alloc:
spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr)) {
- element = remove_element(pool);
+ element = remove_element(pool, gfp_temp);
spin_unlock_irqrestore(&pool->lock, flags);
/* paired with rmb in mempool_free(), read comment there */
smp_wmb();
@@ -352,11 +363,12 @@ repeat_alloc:
* We use gfp mask w/o direct reclaim or IO for the first round. If
* alloc failed with that and @pool was empty, retry immediately.
*/
- if (gfp_temp != gfp_mask) {
+ if ((gfp_temp & ~__GFP_NOMEMALLOC) != gfp_mask) {
spin_unlock_irqrestore(&pool->lock, flags);
gfp_temp = gfp_mask;
goto repeat_alloc;
}
+ gfp_temp = gfp_mask;
/* We must not sleep if !__GFP_DIRECT_RECLAIM */
if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
diff --git a/mm/migrate.c b/mm/migrate.c
index 625741faa..f9dfb18a4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -38,6 +38,7 @@
#include <linux/balloon_compaction.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
+#include <linux/page_owner.h>
#include <asm/tlbflush.h>
@@ -171,7 +172,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
else
page_add_file_rmap(new);
- if (vma->vm_flags & VM_LOCKED)
+ if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
mlock_vma_page(new);
/* No need to invalidate - it was non-present before */
@@ -186,14 +187,17 @@ out:
* Get rid of all migration entries and replace them by
* references to the indicated page.
*/
-static void remove_migration_ptes(struct page *old, struct page *new)
+void remove_migration_ptes(struct page *old, struct page *new, bool locked)
{
struct rmap_walk_control rwc = {
.rmap_one = remove_migration_pte,
.arg = old,
};
- rmap_walk(new, &rwc);
+ if (locked)
+ rmap_walk_locked(new, &rwc);
+ else
+ rmap_walk(new, &rwc);
}
/*
@@ -325,7 +329,6 @@ int migrate_page_move_mapping(struct address_space *mapping,
return -EAGAIN;
/* No turning back from here */
- set_page_memcg(newpage, page_memcg(page));
newpage->index = page->index;
newpage->mapping = page->mapping;
if (PageSwapBacked(page))
@@ -349,7 +352,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
return -EAGAIN;
}
- if (!page_freeze_refs(page, expected_count)) {
+ if (!page_ref_freeze(page, expected_count)) {
spin_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
@@ -363,7 +366,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
*/
if (mode == MIGRATE_ASYNC && head &&
!buffer_migrate_lock_buffers(head, mode)) {
- page_unfreeze_refs(page, expected_count);
+ page_ref_unfreeze(page, expected_count);
spin_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
@@ -372,7 +375,6 @@ int migrate_page_move_mapping(struct address_space *mapping,
* Now we know that no one else is looking at the page:
* no turning back from here.
*/
- set_page_memcg(newpage, page_memcg(page));
newpage->index = page->index;
newpage->mapping = page->mapping;
if (PageSwapBacked(page))
@@ -398,7 +400,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
* to one less reference.
* We know this isn't the last reference.
*/
- page_unfreeze_refs(page, expected_count - 1);
+ page_ref_unfreeze(page, expected_count - 1);
spin_unlock(&mapping->tree_lock);
/* Leave irq disabled to prevent preemption while updating stats */
@@ -452,21 +454,22 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
return -EAGAIN;
}
- if (!page_freeze_refs(page, expected_count)) {
+ if (!page_ref_freeze(page, expected_count)) {
spin_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
- set_page_memcg(newpage, page_memcg(page));
newpage->index = page->index;
newpage->mapping = page->mapping;
+
get_page(newpage);
radix_tree_replace_slot(pslot, newpage);
- page_unfreeze_refs(page, expected_count - 1);
+ page_ref_unfreeze(page, expected_count - 1);
spin_unlock_irq(&mapping->tree_lock);
+
return MIGRATEPAGE_SUCCESS;
}
@@ -578,6 +581,10 @@ void migrate_page_copy(struct page *newpage, struct page *page)
*/
if (PageWriteback(newpage))
end_page_writeback(newpage);
+
+ copy_page_owner(page, newpage);
+
+ mem_cgroup_migrate(page, newpage);
}
/************************************************************
@@ -698,7 +705,7 @@ static int writeout(struct address_space *mapping, struct page *page)
* At this point we know that the migration attempt cannot
* be successful.
*/
- remove_migration_ptes(page, page);
+ remove_migration_ptes(page, page, false);
rc = mapping->a_ops->writepage(page, &wbc);
@@ -772,7 +779,6 @@ static int move_to_new_page(struct page *newpage, struct page *page,
* page is freed; but stats require that PageAnon be left as PageAnon.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
- set_page_memcg(page, NULL);
if (!PageAnon(page))
page->mapping = NULL;
}
@@ -897,7 +903,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
if (page_was_mapped)
remove_migration_ptes(page,
- rc == MIGRATEPAGE_SUCCESS ? newpage : page);
+ rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);
out_unlock_both:
unlock_page(newpage);
@@ -952,8 +958,10 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
}
rc = __unmap_and_move(page, newpage, force, mode);
- if (rc == MIGRATEPAGE_SUCCESS)
+ if (rc == MIGRATEPAGE_SUCCESS) {
put_new_page = NULL;
+ set_page_owner_migrate_reason(newpage, reason);
+ }
out:
if (rc != -EAGAIN) {
@@ -1024,7 +1032,7 @@ out:
static int unmap_and_move_huge_page(new_page_t get_new_page,
free_page_t put_new_page, unsigned long private,
struct page *hpage, int force,
- enum migrate_mode mode)
+ enum migrate_mode mode, int reason)
{
int rc = -EAGAIN;
int *result = NULL;
@@ -1071,7 +1079,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (page_was_mapped)
remove_migration_ptes(hpage,
- rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage);
+ rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);
unlock_page(new_hpage);
@@ -1082,6 +1090,7 @@ put_anon:
if (rc == MIGRATEPAGE_SUCCESS) {
hugetlb_cgroup_migrate(hpage, new_hpage);
put_new_page = NULL;
+ set_page_owner_migrate_reason(new_hpage, reason);
}
unlock_page(hpage);
@@ -1154,7 +1163,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
if (PageHuge(page))
rc = unmap_and_move_huge_page(get_new_page,
put_new_page, private, page,
- pass > 2, mode);
+ pass > 2, mode, reason);
else
rc = unmap_and_move(get_new_page, put_new_page,
private, page, pass > 2, mode,
@@ -1773,7 +1782,10 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
put_page(new_page);
goto out_fail;
}
-
+ /*
+ * We are not sure a pending tlb flush here is for a huge page
+ * mapping or not. Hence use the tlb range variant
+ */
if (mm_tlb_flush_pending(mm))
flush_tlb_range(vma, mmun_start, mmun_end);
@@ -1829,12 +1841,11 @@ fail_putback:
page_add_anon_rmap(new_page, vma, mmun_start, true);
pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
set_pmd_at(mm, mmun_start, pmd, entry);
- flush_tlb_range(vma, mmun_start, mmun_end);
update_mmu_cache_pmd(vma, address, &entry);
if (page_count(page) != 2) {
set_pmd_at(mm, mmun_start, pmd, orig_entry);
- flush_tlb_range(vma, mmun_start, mmun_end);
+ flush_pmd_tlb_range(vma, mmun_start, mmun_end);
mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
update_mmu_cache_pmd(vma, address, &entry);
page_remove_rmap(new_page, true);
@@ -1842,9 +1853,8 @@ fail_putback:
}
mlock_migrate_page(new_page, page);
- set_page_memcg(new_page, page_memcg(page));
- set_page_memcg(page, NULL);
page_remove_rmap(page, true);
+ set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
diff --git a/mm/mincore.c b/mm/mincore.c
index 563f32045..c0b5ba965 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -75,7 +75,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
#endif
if (page) {
present = PageUptodate(page);
- page_cache_release(page);
+ put_page(page);
}
return present;
@@ -211,7 +211,7 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
* return values:
* zero - success
* -EFAULT - vec points to an illegal address
- * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE
+ * -EINVAL - addr is not a multiple of PAGE_SIZE
* -ENOMEM - Addresses in the range [addr, addr + len] are
* invalid for the address space of this process, or
* specify one or more pages which are not currently
@@ -226,14 +226,14 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
unsigned char *tmp;
/* Check the start address: needs to be page-aligned.. */
- if (start & ~PAGE_CACHE_MASK)
+ if (start & ~PAGE_MASK)
return -EINVAL;
/* ..and we need to be passed a valid user-space range */
if (!access_ok(VERIFY_READ, (void __user *) start, len))
return -ENOMEM;
- /* This also avoids any overflows on PAGE_CACHE_ALIGN */
+ /* This also avoids any overflows on PAGE_ALIGN */
pages = len >> PAGE_SHIFT;
pages += (offset_in_page(len)) != 0;
diff --git a/mm/mm_init.c b/mm/mm_init.c
index fdadf918d..5b72266b4 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -55,13 +55,12 @@ void __init mminit_verify_zonelist(void)
/* Iterate the zonelist */
for_each_zone_zonelist(zone, z, zonelist, zoneid) {
#ifdef CONFIG_NUMA
- printk(KERN_CONT "%d:%s ",
- zone->node, zone->name);
+ pr_cont("%d:%s ", zone->node, zone->name);
#else
- printk(KERN_CONT "0:%s ", zone->name);
+ pr_cont("0:%s ", zone->name);
#endif /* CONFIG_NUMA */
}
- printk(KERN_CONT "\n");
+ pr_cont("\n");
}
}
}
diff --git a/mm/mmap.c b/mm/mmap.c
index fdd163e22..7328b7436 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -37,12 +37,12 @@
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
#include <linux/rbtree_augmented.h>
-#include <linux/sched/sysctl.h>
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/printk.h>
#include <linux/userfaultfd_k.h>
#include <linux/moduleparam.h>
+#include <linux/pkeys.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -123,130 +123,6 @@ void vma_set_page_prot(struct vm_area_struct *vma)
}
}
-
-int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
-int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
-unsigned long sysctl_overcommit_kbytes __read_mostly;
-int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
-unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
-unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
-/*
- * Make sure vm_committed_as in one cacheline and not cacheline shared with
- * other variables. It can be updated by several CPUs frequently.
- */
-struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
-
-/*
- * The global memory commitment made in the system can be a metric
- * that can be used to drive ballooning decisions when Linux is hosted
- * as a guest. On Hyper-V, the host implements a policy engine for dynamically
- * balancing memory across competing virtual machines that are hosted.
- * Several metrics drive this policy engine including the guest reported
- * memory commitment.
- */
-unsigned long vm_memory_committed(void)
-{
- return percpu_counter_read_positive(&vm_committed_as);
-}
-EXPORT_SYMBOL_GPL(vm_memory_committed);
-
-/*
- * Check that a process has enough memory to allocate a new virtual
- * mapping. 0 means there is enough memory for the allocation to
- * succeed and -ENOMEM implies there is not.
- *
- * We currently support three overcommit policies, which are set via the
- * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
- *
- * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
- * Additional code 2002 Jul 20 by Robert Love.
- *
- * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
- *
- * Note this is a helper function intended to be used by LSMs which
- * wish to use this logic.
- */
-int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
-{
- long free, allowed, reserve;
-
- VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
- -(s64)vm_committed_as_batch * num_online_cpus(),
- "memory commitment underflow");
-
- vm_acct_memory(pages);
-
- /*
- * Sometimes we want to use more memory than we have
- */
- if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
- return 0;
-
- if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- free = global_page_state(NR_FREE_PAGES);
- free += global_page_state(NR_FILE_PAGES);
-
- /*
- * shmem pages shouldn't be counted as free in this
- * case, they can't be purged, only swapped out, and
- * that won't affect the overall amount of available
- * memory in the system.
- */
- free -= global_page_state(NR_SHMEM);
-
- free += get_nr_swap_pages();
-
- /*
- * Any slabs which are created with the
- * SLAB_RECLAIM_ACCOUNT flag claim to have contents
- * which are reclaimable, under pressure. The dentry
- * cache and most inode caches should fall into this
- */
- free += global_page_state(NR_SLAB_RECLAIMABLE);
-
- /*
- * Leave reserved pages. The pages are not for anonymous pages.
- */
- if (free <= totalreserve_pages)
- goto error;
- else
- free -= totalreserve_pages;
-
- /*
- * Reserve some for root
- */
- if (!cap_sys_admin)
- free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
- if (free > pages)
- return 0;
-
- goto error;
- }
-
- allowed = vm_commit_limit();
- /*
- * Reserve some for root
- */
- if (!cap_sys_admin)
- allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
- /*
- * Don't let a single process grow so big a user can't recover
- */
- if (mm) {
- reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
- allowed -= min_t(long, mm->total_vm / 32, reserve);
- }
-
- if (percpu_counter_read_positive(&vm_committed_as) < allowed)
- return 0;
-error:
- vm_unacct_memory(pages);
-
- return -ENOMEM;
-}
-
/*
* Requires inode->i_mapping->i_mmap_rwsem
*/
@@ -1270,6 +1146,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long pgoff, unsigned long *populate)
{
struct mm_struct *mm = current->mm;
+ int pkey = 0;
*populate = 0;
@@ -1309,11 +1186,17 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (offset_in_page(addr))
return addr;
+ if (prot == PROT_EXEC) {
+ pkey = execute_only_pkey(mm);
+ if (pkey < 0)
+ pkey = 0;
+ }
+
/* Do simple checking here so the lower-level routines won't have
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
- vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
+ vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_LOCKED)
@@ -2640,10 +2523,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
struct vm_area_struct *vma;
unsigned long populate = 0;
unsigned long ret = -EINVAL;
+ struct file *file, *prfile;
- pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. "
- "See Documentation/vm/remap_file_pages.txt.\n",
- current->comm, current->pid);
+ pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.txt.\n",
+ current->comm, current->pid);
if (prot)
return ret;
@@ -2708,9 +2591,26 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
}
vma_get_file(vma);
+ file = vma->vm_file;
+ prfile = vma->vm_prfile;
ret = do_mmap_pgoff(vma->vm_file, start, size,
prot, flags, pgoff, &populate);
- vma_fput(vma);
+ if (!IS_ERR_VALUE(ret) && file && prfile) {
+ struct vm_area_struct *new_vma;
+
+ new_vma = find_vma(mm, ret);
+ if (!new_vma->vm_prfile)
+ new_vma->vm_prfile = prfile;
+ if (new_vma != vma)
+ get_file(prfile);
+ }
+ /*
+ * two fput()s instead of vma_fput(vma),
+ * coz vma may not be available anymore.
+ */
+ fput(file);
+ if (prfile)
+ fput(prfile);
out:
up_write(&mm->mmap_sem);
if (populate)
@@ -3009,8 +2909,7 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
if (is_data_mapping(flags) &&
mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
if (ignore_rlimit_data)
- pr_warn_once("%s (%d): VmData %lu exceed data ulimit "
- "%lu. Will be forbidden soon.\n",
+ pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Will be forbidden soon.\n",
current->comm, current->pid,
(mm->data_vm + npages) << PAGE_SHIFT,
rlimit(RLIMIT_DATA));
@@ -3065,11 +2964,16 @@ static int special_mapping_fault(struct vm_area_struct *vma,
pgoff_t pgoff;
struct page **pages;
- if (vma->vm_ops == &legacy_special_mapping_vmops)
+ if (vma->vm_ops == &legacy_special_mapping_vmops) {
pages = vma->vm_private_data;
- else
- pages = ((struct vm_special_mapping *)vma->vm_private_data)->
- pages;
+ } else {
+ struct vm_special_mapping *sm = vma->vm_private_data;
+
+ if (sm->fault)
+ return sm->fault(sm, vma, vmf);
+
+ pages = sm->pages;
+ }
for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
pgoff--;
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 5fbdd367b..f4259e496 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -3,7 +3,7 @@
*
* Copyright (C) 2008 Qumranet, Inc.
* Copyright (C) 2008 SGI
- * Christoph Lameter <clameter@sgi.com>
+ * Christoph Lameter <cl@linux.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
diff --git a/mm/mprotect.c b/mm/mprotect.c
index f7cb3d4d9..b650c5412 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -24,6 +24,7 @@
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/ksm.h>
+#include <linux/pkeys.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
@@ -354,10 +355,13 @@ fail:
SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
unsigned long, prot)
{
- unsigned long vm_flags, nstart, end, tmp, reqprot;
+ unsigned long nstart, end, tmp, reqprot;
struct vm_area_struct *vma, *prev;
int error = -EINVAL;
const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
+ const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
+ (prot & PROT_READ);
+
prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
return -EINVAL;
@@ -374,13 +378,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
return -EINVAL;
reqprot = prot;
- /*
- * Does the application expect PROT_READ to imply PROT_EXEC:
- */
- if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
- prot |= PROT_EXEC;
-
- vm_flags = calc_vm_prot_bits(prot);
down_write(&current->mm->mmap_sem);
@@ -411,10 +408,15 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
for (nstart = start ; ; ) {
unsigned long newflags;
+ int pkey = arch_override_mprotect_pkey(vma, prot, -1);
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
- newflags = vm_flags;
+ /* Does the application expect PROT_READ to imply PROT_EXEC */
+ if (rier && (vma->vm_flags & VM_MAYEXEC))
+ prot |= PROT_EXEC;
+
+ newflags = calc_vm_prot_bits(prot, pkey);
newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
/* newflags >> 4 shift VM_MAY% in place of VM_% */
@@ -445,6 +447,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
error = -ENOMEM;
goto out;
}
+ prot = reqprot;
}
out:
up_write(&current->mm->mmap_sem);
diff --git a/mm/mremap.c b/mm/mremap.c
index 8eeba02fc..3fa0a467d 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -20,7 +20,6 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>
-#include <linux/sched/sysctl.h>
#include <linux/uaccess.h>
#include <linux/mm-arch-hooks.h>
@@ -214,8 +213,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
continue;
VM_BUG_ON(pmd_trans_huge(*old_pmd));
}
- if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
- new_pmd, new_addr))
+ if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr))
break;
next = (new_addr + PMD_SIZE) & PMD_MASK;
if (extent > next - new_addr)
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 99feb2b07..bd05a70f4 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -288,7 +288,7 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
/*
* Whoops, we cannot satisfy the allocation request.
*/
- printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ pr_alert("bootmem alloc of %lu bytes failed!\n", size);
panic("Out of memory");
return NULL;
}
@@ -360,7 +360,7 @@ static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
if (ptr)
return ptr;
- printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ pr_alert("bootmem alloc of %lu bytes failed!\n", size);
panic("Out of memory");
return NULL;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 1a4a06d09..4cfc2fc93 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -33,7 +33,6 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/audit.h>
-#include <linux/sched/sysctl.h>
#include <linux/printk.h>
#include <asm/uaccess.h>
@@ -48,33 +47,11 @@ struct page *mem_map;
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);
unsigned long highest_memmap_pfn;
-struct percpu_counter vm_committed_as;
-int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
-int sysctl_overcommit_ratio = 50; /* default is 50% */
-unsigned long sysctl_overcommit_kbytes __read_mostly;
-int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
-unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
-unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
int heap_stack_gap = 0;
atomic_long_t mmap_pages_allocated;
-/*
- * The global memory commitment made in the system can be a metric
- * that can be used to drive ballooning decisions when Linux is hosted
- * as a guest. On Hyper-V, the host implements a policy engine for dynamically
- * balancing memory across competing virtual machines that are hosted.
- * Several metrics drive this policy engine including the guest reported
- * memory commitment.
- */
-unsigned long vm_memory_committed(void)
-{
- return percpu_counter_read_positive(&vm_committed_as);
-}
-
-EXPORT_SYMBOL_GPL(vm_memory_committed);
-
EXPORT_SYMBOL(mem_map);
/* list of mapped, potentially shareable regions */
@@ -162,7 +139,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (pages) {
pages[i] = virt_to_page(start);
if (pages[i])
- page_cache_get(pages[i]);
+ get_page(pages[i]);
}
if (vmas)
vmas[i] = vma;
@@ -182,8 +159,7 @@ finish_or_fault:
* slab page or a secondary page from a compound page
* - don't permit access to VMAs that don't support it, such as I/O mappings
*/
-long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
+long get_user_pages(unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
struct vm_area_struct **vmas)
{
@@ -194,18 +170,16 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (force)
flags |= FOLL_FORCE;
- return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
- NULL);
+ return __get_user_pages(current, current->mm, start, nr_pages, flags,
+ pages, vmas, NULL);
}
EXPORT_SYMBOL(get_user_pages);
-long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- int write, int force, struct page **pages,
- int *locked)
+long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ int *locked)
{
- return get_user_pages(tsk, mm, start, nr_pages, write, force,
- pages, NULL);
+ return get_user_pages(start, nr_pages, write, force, pages, NULL);
}
EXPORT_SYMBOL(get_user_pages_locked);
@@ -216,19 +190,18 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
{
long ret;
down_read(&mm->mmap_sem);
- ret = get_user_pages(tsk, mm, start, nr_pages, write, force,
- pages, NULL);
+ ret = __get_user_pages(tsk, mm, start, nr_pages, gup_flags, pages,
+ NULL, NULL);
up_read(&mm->mmap_sem);
return ret;
}
EXPORT_SYMBOL(__get_user_pages_unlocked);
-long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
+long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages)
{
- return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
- force, pages, 0);
+ return __get_user_pages_unlocked(current, current->mm, start, nr_pages,
+ write, force, pages, 0);
}
EXPORT_SYMBOL(get_user_pages_unlocked);
@@ -1084,7 +1057,7 @@ static unsigned long determine_vm_flags(struct file *file,
{
unsigned long vm_flags;
- vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
+ vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags);
/* vm_flags |= mm->def_flags; */
if (!(capabilities & NOMMU_MAP_DIRECT)) {
@@ -1829,100 +1802,6 @@ void unmap_mapping_range(struct address_space *mapping,
}
EXPORT_SYMBOL(unmap_mapping_range);
-/*
- * Check that a process has enough memory to allocate a new virtual
- * mapping. 0 means there is enough memory for the allocation to
- * succeed and -ENOMEM implies there is not.
- *
- * We currently support three overcommit policies, which are set via the
- * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
- *
- * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
- * Additional code 2002 Jul 20 by Robert Love.
- *
- * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
- *
- * Note this is a helper function intended to be used by LSMs which
- * wish to use this logic.
- */
-int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
-{
- long free, allowed, reserve;
-
- vm_acct_memory(pages);
-
- /*
- * Sometimes we want to use more memory than we have
- */
- if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
- return 0;
-
- if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- free = global_page_state(NR_FREE_PAGES);
- free += global_page_state(NR_FILE_PAGES);
-
- /*
- * shmem pages shouldn't be counted as free in this
- * case, they can't be purged, only swapped out, and
- * that won't affect the overall amount of available
- * memory in the system.
- */
- free -= global_page_state(NR_SHMEM);
-
- free += get_nr_swap_pages();
-
- /*
- * Any slabs which are created with the
- * SLAB_RECLAIM_ACCOUNT flag claim to have contents
- * which are reclaimable, under pressure. The dentry
- * cache and most inode caches should fall into this
- */
- free += global_page_state(NR_SLAB_RECLAIMABLE);
-
- /*
- * Leave reserved pages. The pages are not for anonymous pages.
- */
- if (free <= totalreserve_pages)
- goto error;
- else
- free -= totalreserve_pages;
-
- /*
- * Reserve some for root
- */
- if (!cap_sys_admin)
- free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
- if (free > pages)
- return 0;
-
- goto error;
- }
-
- allowed = vm_commit_limit();
- /*
- * Reserve some 3% for root
- */
- if (!cap_sys_admin)
- allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
- /*
- * Don't let a single process grow so big a user can't recover
- */
- if (mm) {
- reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
- allowed -= min_t(long, mm->total_vm / 32, reserve);
- }
-
- if (percpu_counter_read_positive(&vm_committed_as) < allowed)
- return 0;
-
-error:
- vm_unacct_memory(pages);
-
- return -ENOMEM;
-}
-
int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
BUG();
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dc490c069..86349586e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -35,6 +35,11 @@
#include <linux/freezer.h>
#include <linux/ftrace.h>
#include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/init.h>
+
+#include <asm/tlb.h>
+#include "internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/oom.h>
@@ -287,9 +292,6 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
if (oom_task_origin(task))
return OOM_SCAN_SELECT;
- if (task_will_free_mem(task) && !is_sysrq_oom(oc))
- return OOM_SCAN_ABORT;
-
return OOM_SCAN_OK;
}
@@ -386,10 +388,10 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
static void dump_header(struct oom_control *oc, struct task_struct *p,
struct mem_cgroup *memcg)
{
- pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
- "oom_score_adj=%hd\n",
- current->comm, oc->gfp_mask, oc->order,
+ pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
+ current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
current->signal->oom_score_adj);
+
cpuset_print_current_mems_allowed();
dump_stack();
if (memcg)
@@ -408,6 +410,176 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
bool oom_killer_disabled __read_mostly;
+#define K(x) ((x) << (PAGE_SHIFT-10))
+
+#ifdef CONFIG_MMU
+/*
+ * OOM Reaper kernel thread which tries to reap the memory used by the OOM
+ * victim (if that is possible) to help the OOM killer to move on.
+ */
+static struct task_struct *oom_reaper_th;
+static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
+static struct task_struct *oom_reaper_list;
+static DEFINE_SPINLOCK(oom_reaper_lock);
+
+
+static bool __oom_reap_task(struct task_struct *tsk)
+{
+ struct mmu_gather tlb;
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+ struct task_struct *p;
+ struct zap_details details = {.check_swap_entries = true,
+ .ignore_dirty = true};
+ bool ret = true;
+
+ /*
+ * Make sure we find the associated mm_struct even when the particular
+ * thread has already terminated and cleared its mm.
+ * We might have race with exit path so consider our work done if there
+ * is no mm.
+ */
+ p = find_lock_task_mm(tsk);
+ if (!p)
+ return true;
+
+ mm = p->mm;
+ if (!atomic_inc_not_zero(&mm->mm_users)) {
+ task_unlock(p);
+ return true;
+ }
+
+ task_unlock(p);
+
+ if (!down_read_trylock(&mm->mmap_sem)) {
+ ret = false;
+ goto out;
+ }
+
+ tlb_gather_mmu(&tlb, mm, 0, -1);
+ for (vma = mm->mmap ; vma; vma = vma->vm_next) {
+ if (is_vm_hugetlb_page(vma))
+ continue;
+
+ /*
+ * mlocked VMAs require explicit munlocking before unmap.
+ * Let's keep it simple here and skip such VMAs.
+ */
+ if (vma->vm_flags & VM_LOCKED)
+ continue;
+
+ /*
+ * Only anonymous pages have a good chance to be dropped
+ * without additional steps which we cannot afford as we
+ * are OOM already.
+ *
+ * We do not even care about fs backed pages because all
+ * which are reclaimable have already been reclaimed and
+ * we do not want to block exit_mmap by keeping mm ref
+ * count elevated without a good reason.
+ */
+ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
+ unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
+ &details);
+ }
+ tlb_finish_mmu(&tlb, 0, -1);
+ pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
+ task_pid_nr(tsk), tsk->comm,
+ K(get_mm_counter(mm, MM_ANONPAGES)),
+ K(get_mm_counter(mm, MM_FILEPAGES)),
+ K(get_mm_counter(mm, MM_SHMEMPAGES)));
+ up_read(&mm->mmap_sem);
+
+ /*
+ * Clear TIF_MEMDIE because the task shouldn't be sitting on a
+ * reasonably reclaimable memory anymore. OOM killer can continue
+ * by selecting other victim if unmapping hasn't led to any
+ * improvements. This also means that selecting this task doesn't
+ * make any sense.
+ */
+ tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN;
+ exit_oom_victim(tsk);
+out:
+ mmput(mm);
+ return ret;
+}
+
+#define MAX_OOM_REAP_RETRIES 10
+static void oom_reap_task(struct task_struct *tsk)
+{
+ int attempts = 0;
+
+ /* Retry the down_read_trylock(mmap_sem) a few times */
+ while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk))
+ schedule_timeout_idle(HZ/10);
+
+ if (attempts > MAX_OOM_REAP_RETRIES) {
+ pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
+ task_pid_nr(tsk), tsk->comm);
+ debug_show_all_locks();
+ }
+
+ /* Drop a reference taken by wake_oom_reaper */
+ put_task_struct(tsk);
+}
+
+static int oom_reaper(void *unused)
+{
+ set_freezable();
+
+ while (true) {
+ struct task_struct *tsk = NULL;
+
+ wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
+ spin_lock(&oom_reaper_lock);
+ if (oom_reaper_list != NULL) {
+ tsk = oom_reaper_list;
+ oom_reaper_list = tsk->oom_reaper_list;
+ }
+ spin_unlock(&oom_reaper_lock);
+
+ if (tsk)
+ oom_reap_task(tsk);
+ }
+
+ return 0;
+}
+
+static void wake_oom_reaper(struct task_struct *tsk)
+{
+ if (!oom_reaper_th)
+ return;
+
+ /* tsk is already queued? */
+ if (tsk == oom_reaper_list || tsk->oom_reaper_list)
+ return;
+
+ get_task_struct(tsk);
+
+ spin_lock(&oom_reaper_lock);
+ tsk->oom_reaper_list = oom_reaper_list;
+ oom_reaper_list = tsk;
+ spin_unlock(&oom_reaper_lock);
+ wake_up(&oom_reaper_wait);
+}
+
+static int __init oom_init(void)
+{
+ oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
+ if (IS_ERR(oom_reaper_th)) {
+ pr_err("Unable to start OOM reaper %ld. Continuing regardless\n",
+ PTR_ERR(oom_reaper_th));
+ oom_reaper_th = NULL;
+ }
+ return 0;
+}
+subsys_initcall(oom_init)
+#else
+static void wake_oom_reaper(struct task_struct *tsk)
+{
+}
+#endif
+
/**
* mark_oom_victim - mark the given task as OOM victim
* @tsk: task to mark
@@ -434,9 +606,10 @@ void mark_oom_victim(struct task_struct *tsk)
/**
* exit_oom_victim - note the exit of an OOM victim
*/
-void exit_oom_victim(void)
+void exit_oom_victim(struct task_struct *tsk)
{
- clear_thread_flag(TIF_MEMDIE);
+ if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE))
+ return;
if (!atomic_dec_return(&oom_victims))
wake_up_all(&oom_victims_wait);
@@ -458,15 +631,11 @@ void exit_oom_victim(void)
bool oom_killer_disable(void)
{
/*
- * Make sure to not race with an ongoing OOM killer
- * and that the current is not the victim.
+ * Make sure to not race with an ongoing OOM killer. Check that the
+ * current is not killed (possibly due to sharing the victim's memory).
*/
- mutex_lock(&oom_lock);
- if (test_thread_flag(TIF_MEMDIE)) {
- mutex_unlock(&oom_lock);
+ if (mutex_lock_killable(&oom_lock))
return false;
- }
-
oom_killer_disabled = true;
mutex_unlock(&oom_lock);
@@ -501,7 +670,6 @@ static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
return false;
}
-#define K(x) ((x) << (PAGE_SHIFT-10))
/*
* Must be called while holding a reference to p, which will be released upon
* returning.
@@ -517,6 +685,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
unsigned int victim_points = 0;
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
+ bool can_oom_reap = true;
/*
* If the task is already exiting, don't alarm the sysadmin or kill
@@ -607,17 +776,23 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
continue;
if (same_thread_group(p, victim))
continue;
- if (unlikely(p->flags & PF_KTHREAD))
- continue;
- if (is_global_init(p))
- continue;
- if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+ if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) ||
+ p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+ /*
+ * We cannot use oom_reaper for the mm shared by this
+ * process because it wouldn't get killed and so the
+ * memory might be still used.
+ */
+ can_oom_reap = false;
continue;
-
+ }
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
}
rcu_read_unlock();
+ if (can_oom_reap)
+ wake_oom_reaper(victim);
+
mmdrop(mm);
put_task_struct(victim);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2a88657ea..0b68ee935 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1177,6 +1177,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
unsigned long balanced_dirty_ratelimit;
unsigned long step;
unsigned long x;
+ unsigned long shift;
/*
* The dirty rate will match the writeout rate in long term, except
@@ -1301,11 +1302,11 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
* rate itself is constantly fluctuating. So decrease the track speed
* when it gets close to the target. Helps eliminate pointless tremors.
*/
- step >>= dirty_ratelimit / (2 * step + 1);
- /*
- * Limit the tracking speed to avoid overshooting.
- */
- step = (step + 7) / 8;
+ shift = dirty_ratelimit / (2 * step + 1);
+ if (shift < BITS_PER_LONG)
+ step = DIV_ROUND_UP(step >> shift, 8);
+ else
+ step = 0;
if (dirty_ratelimit < balanced_dirty_ratelimit)
dirty_ratelimit += step;
@@ -2185,8 +2186,8 @@ int write_cache_pages(struct address_space *mapping,
cycled = 0;
end = -1;
} else {
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ index = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
cycled = 1; /* ignore range_cyclic tests */
@@ -2391,14 +2392,14 @@ int write_one_page(struct page *page, int wait)
wait_on_page_writeback(page);
if (clear_page_dirty_for_io(page)) {
- page_cache_get(page);
+ get_page(page);
ret = mapping->a_ops->writepage(page, &wbc);
if (ret == 0 && wait) {
wait_on_page_writeback(page);
if (PageError(page))
ret = -EIO;
}
- page_cache_release(page);
+ put_page(page);
} else {
unlock_page(page);
}
@@ -2419,12 +2420,11 @@ int __set_page_dirty_no_writeback(struct page *page)
/*
* Helper function for set_page_dirty family.
*
- * Caller must hold mem_cgroup_begin_page_stat().
+ * Caller must hold lock_page_memcg().
*
* NOTE: This relies on being atomic wrt interrupts.
*/
-void account_page_dirtied(struct page *page, struct address_space *mapping,
- struct mem_cgroup *memcg)
+void account_page_dirtied(struct page *page, struct address_space *mapping)
{
struct inode *inode = mapping->host;
@@ -2436,12 +2436,12 @@ void account_page_dirtied(struct page *page, struct address_space *mapping,
inode_attach_wb(inode, page);
wb = inode_to_wb(inode);
- mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+ mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY);
__inc_zone_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_DIRTIED);
__inc_wb_stat(wb, WB_RECLAIMABLE);
__inc_wb_stat(wb, WB_DIRTIED);
- task_io_account_write(PAGE_CACHE_SIZE);
+ task_io_account_write(PAGE_SIZE);
current->nr_dirtied++;
this_cpu_inc(bdp_ratelimits);
}
@@ -2451,16 +2451,16 @@ EXPORT_SYMBOL(account_page_dirtied);
/*
* Helper function for deaccounting dirty page without writeback.
*
- * Caller must hold mem_cgroup_begin_page_stat().
+ * Caller must hold lock_page_memcg().
*/
void account_page_cleaned(struct page *page, struct address_space *mapping,
- struct mem_cgroup *memcg, struct bdi_writeback *wb)
+ struct bdi_writeback *wb)
{
if (mapping_cap_account_dirty(mapping)) {
- mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+ mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_wb_stat(wb, WB_RECLAIMABLE);
- task_io_account_cancelled_write(PAGE_CACHE_SIZE);
+ task_io_account_cancelled_write(PAGE_SIZE);
}
}
@@ -2478,26 +2478,24 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
*/
int __set_page_dirty_nobuffers(struct page *page)
{
- struct mem_cgroup *memcg;
-
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page_mapping(page);
unsigned long flags;
if (!mapping) {
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
return 1;
}
spin_lock_irqsave(&mapping->tree_lock, flags);
BUG_ON(page_mapping(page) != mapping);
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
- account_page_dirtied(page, mapping, memcg);
+ account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree, page_index(page),
PAGECACHE_TAG_DIRTY);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
if (mapping->host) {
/* !PageAnon && !swapper_space */
@@ -2505,7 +2503,7 @@ int __set_page_dirty_nobuffers(struct page *page)
}
return 1;
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
return 0;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
@@ -2635,17 +2633,16 @@ void cancel_dirty_page(struct page *page)
if (mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
- struct mem_cgroup *memcg;
bool locked;
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
wb = unlocked_inode_to_wb_begin(inode, &locked);
if (TestClearPageDirty(page))
- account_page_cleaned(page, mapping, memcg, wb);
+ account_page_cleaned(page, mapping, wb);
unlocked_inode_to_wb_end(inode, locked);
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
} else {
ClearPageDirty(page);
}
@@ -2676,7 +2673,6 @@ int clear_page_dirty_for_io(struct page *page)
if (mapping && mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
- struct mem_cgroup *memcg;
bool locked;
/*
@@ -2714,16 +2710,14 @@ int clear_page_dirty_for_io(struct page *page)
* always locked coming in here, so we get the desired
* exclusion.
*/
- memcg = mem_cgroup_begin_page_stat(page);
wb = unlocked_inode_to_wb_begin(inode, &locked);
if (TestClearPageDirty(page)) {
- mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+ mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_wb_stat(wb, WB_RECLAIMABLE);
ret = 1;
}
unlocked_inode_to_wb_end(inode, locked);
- mem_cgroup_end_page_stat(memcg);
return ret;
}
return TestClearPageDirty(page);
@@ -2733,10 +2727,9 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- struct mem_cgroup *memcg;
int ret;
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (mapping) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2760,21 +2753,20 @@ int test_clear_page_writeback(struct page *page)
ret = TestClearPageWriteback(page);
}
if (ret) {
- mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+ mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
dec_zone_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_WRITTEN);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
return ret;
}
int __test_set_page_writeback(struct page *page, bool keep_write)
{
struct address_space *mapping = page_mapping(page);
- struct mem_cgroup *memcg;
int ret;
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (mapping) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2802,10 +2794,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
ret = TestSetPageWriteback(page);
}
if (!ret) {
- mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+ mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
inc_zone_page_state(page, NR_WRITEBACK);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7b387cc2a..8aaa0e57f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -224,6 +224,19 @@ static char * const zone_names[MAX_NR_ZONES] = {
#endif
};
+char * const migratetype_names[MIGRATE_TYPES] = {
+ "Unmovable",
+ "Movable",
+ "Reclaimable",
+ "HighAtomic",
+#ifdef CONFIG_CMA
+ "CMA",
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
+ "Isolate",
+#endif
+};
+
compound_page_dtor * const compound_page_dtors[] = {
NULL,
free_compound_page,
@@ -237,6 +250,7 @@ compound_page_dtor * const compound_page_dtors[] = {
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
+int watermark_scale_factor = 10;
static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
@@ -248,6 +262,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
static unsigned long __initdata required_kernelcore;
static unsigned long __initdata required_movablecore;
static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+static bool mirrored_kernelcore;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
@@ -294,13 +309,20 @@ static inline bool update_defer_init(pg_data_t *pgdat,
unsigned long pfn, unsigned long zone_end,
unsigned long *nr_initialised)
{
+ unsigned long max_initialise;
+
/* Always populate low zones for address-contrained allocations */
if (zone_end < pgdat_end_pfn(pgdat))
return true;
+ /*
+ * Initialise at least 2G of a node but also take into account that
+ * two large system hashes that can take up 1GB for 0.25TB/node.
+ */
+ max_initialise = max(2UL << (30 - PAGE_SHIFT),
+ (pgdat->node_spanned_pages >> 8));
- /* Initialise at least 2G of the highest zone */
(*nr_initialised)++;
- if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) &&
+ if ((*nr_initialised > max_initialise) &&
(pfn & (PAGES_PER_SECTION - 1)) == 0) {
pgdat->first_deferred_pfn = pfn;
return false;
@@ -417,7 +439,7 @@ static void bad_page(struct page *page, const char *reason,
goto out;
}
if (nr_unshown) {
- printk(KERN_ALERT
+ pr_alert(
"BUG: Bad page state: %lu messages suppressed\n",
nr_unshown);
nr_unshown = 0;
@@ -427,9 +449,14 @@ static void bad_page(struct page *page, const char *reason,
if (nr_shown++ == 0)
resume = jiffies + 60 * HZ;
- printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
+ pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
- dump_page_badflags(page, reason, bad_flags);
+ __dump_page(page, reason);
+ bad_flags &= page->flags;
+ if (bad_flags)
+ pr_alert("bad because of flags: %#lx(%pGp)\n",
+ bad_flags, &bad_flags);
+ dump_page_owner(page);
print_modules();
dump_stack();
@@ -478,7 +505,9 @@ void prep_compound_page(struct page *page, unsigned int order)
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
-bool _debug_pagealloc_enabled __read_mostly;
+bool _debug_pagealloc_enabled __read_mostly
+ = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
+EXPORT_SYMBOL(_debug_pagealloc_enabled);
bool _debug_guardpage_enabled __read_mostly;
static int __init early_debug_pagealloc(char *buf)
@@ -489,6 +518,9 @@ static int __init early_debug_pagealloc(char *buf)
if (strcmp(buf, "on") == 0)
_debug_pagealloc_enabled = true;
+ if (strcmp(buf, "off") == 0)
+ _debug_pagealloc_enabled = false;
+
return 0;
}
early_param("debug_pagealloc", early_debug_pagealloc);
@@ -520,11 +552,11 @@ static int __init debug_guardpage_minorder_setup(char *buf)
unsigned long res;
if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
- printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
+ pr_err("Bad debug_guardpage_minorder value\n");
return 0;
}
_debug_guardpage_minorder = res;
- printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
+ pr_info("Setting debug_guardpage_minorder to %lu\n", res);
return 0;
}
__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
@@ -762,7 +794,7 @@ static inline int free_pages_check(struct page *page)
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
- if (unlikely(atomic_read(&page->_count) != 0))
+ if (unlikely(page_ref_count(page) != 0))
bad_reason = "nonzero _count";
if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
@@ -978,7 +1010,7 @@ static inline void init_reserved_page(unsigned long pfn)
* marks the pages PageReserved. The remaining valid pages are later
* sent to the buddy page allocator.
*/
-void __meminit reserve_bootmem_region(unsigned long start, unsigned long end)
+void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long end_pfn = PFN_UP(end);
@@ -1029,6 +1061,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
PAGE_SIZE << order);
}
arch_free_page(page, order);
+ kernel_poison_pages(page, 1 << order, 0);
kernel_map_pages(page, 1 << order, 0);
return true;
@@ -1131,6 +1164,75 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
return __free_pages_boot_core(page, pfn, order);
}
+/*
+ * Check that the whole (or subset of) a pageblock given by the interval of
+ * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
+ * with the migration of free compaction scanner. The scanners then need to
+ * use only pfn_valid_within() check for arches that allow holes within
+ * pageblocks.
+ *
+ * Return struct page pointer of start_pfn, or NULL if checks were not passed.
+ *
+ * It's possible on some configurations to have a setup like node0 node1 node0
+ * i.e. it's possible that all pages within a zones range of pages do not
+ * belong to a single zone. We assume that a border between node0 and node1
+ * can occur within a single pageblock, but not a node0 node1 node0
+ * interleaving within a single pageblock. It is therefore sufficient to check
+ * the first and last page of a pageblock and avoid checking each individual
+ * page in a pageblock.
+ */
+struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
+ unsigned long end_pfn, struct zone *zone)
+{
+ struct page *start_page;
+ struct page *end_page;
+
+ /* end_pfn is one past the range we are checking */
+ end_pfn--;
+
+ if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
+ return NULL;
+
+ start_page = pfn_to_page(start_pfn);
+
+ if (page_zone(start_page) != zone)
+ return NULL;
+
+ end_page = pfn_to_page(end_pfn);
+
+ /* This gives a shorter code than deriving page_zone(end_page) */
+ if (page_zone_id(start_page) != page_zone_id(end_page))
+ return NULL;
+
+ return start_page;
+}
+
+void set_zone_contiguous(struct zone *zone)
+{
+ unsigned long block_start_pfn = zone->zone_start_pfn;
+ unsigned long block_end_pfn;
+
+ block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
+ for (; block_start_pfn < zone_end_pfn(zone);
+ block_start_pfn = block_end_pfn,
+ block_end_pfn += pageblock_nr_pages) {
+
+ block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
+
+ if (!__pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, zone))
+ return;
+ }
+
+ /* We confirm that there is no hole */
+ zone->contiguous = true;
+}
+
+void clear_zone_contiguous(struct zone *zone)
+{
+ zone->contiguous = false;
+}
+
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static void __init deferred_free_range(struct page *page,
unsigned long pfn, int nr_pages)
@@ -1281,9 +1383,13 @@ free_range:
pgdat_init_report_one_done();
return 0;
}
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
void __init page_alloc_init_late(void)
{
+ struct zone *zone;
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
int nid;
/* There will be num_node_state(N_MEMORY) threads */
@@ -1297,8 +1403,11 @@ void __init page_alloc_init_late(void)
/* Reinit limits that are based on free pages after the kernel is up */
files_maxfiles_init();
+#endif
+
+ for_each_populated_zone(zone)
+ set_zone_contiguous(zone);
}
-#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
#ifdef CONFIG_CMA
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
@@ -1387,7 +1496,7 @@ static inline int check_new_page(struct page *page)
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
- if (unlikely(atomic_read(&page->_count) != 0))
+ if (unlikely(page_ref_count(page) != 0))
bad_reason = "nonzero _count";
if (unlikely(page->flags & __PG_HWPOISON)) {
bad_reason = "HWPoisoned (hardware-corrupted)";
@@ -1408,10 +1517,17 @@ static inline int check_new_page(struct page *page)
return 0;
}
+static inline bool free_pages_prezeroed(bool poisoned)
+{
+ return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
+ page_poisoning_enabled() && poisoned;
+}
+
static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
int alloc_flags)
{
int i;
+ bool poisoned = true;
for (i = 0; i < (1 << order); i++) {
struct page *p = page + i;
@@ -1422,6 +1538,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
SetPageTOI_Untracked(p);
toi_make_writable(init_mm.pgd, (unsigned long) page_address(p));
}
+ if (poisoned)
+ poisoned &= page_is_poisoned(p);
}
set_page_private(page, 0);
@@ -1429,9 +1547,10 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
+ kernel_poison_pages(page, 1 << order, 1);
kasan_alloc_pages(page, order);
- if (gfp_flags & __GFP_ZERO)
+ if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO))
for (i = 0; i < (1 << order); i++)
clear_highpage(page + i);
@@ -2270,19 +2389,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
list_del(&page->lru);
pcp->count--;
} else {
- if (unlikely(gfp_flags & __GFP_NOFAIL)) {
- /*
- * __GFP_NOFAIL is not to be used in new code.
- *
- * All __GFP_NOFAIL callers should be fixed so that they
- * properly detect and handle allocation failures.
- *
- * We most definitely don't want callers attempting to
- * allocate greater than order-1 page units with
- * __GFP_NOFAIL.
- */
- WARN_ON_ONCE(order > 1);
- }
+ /*
+ * We most definitely don't want callers attempting to
+ * allocate greater than order-1 page units with __GFP_NOFAIL.
+ */
+ WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
spin_lock_irqsave(&zone->lock, flags);
page = NULL;
@@ -2722,9 +2833,8 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
va_end(args);
}
- pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n",
- current->comm, order, gfp_mask);
-
+ pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n",
+ current->comm, order, gfp_mask, &gfp_mask);
dump_stack();
if (!should_suppress_show_mem())
show_mem(filter);
@@ -2780,8 +2890,12 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
* XXX: Page reclaim didn't yield anything,
* and the OOM killer can't be invoked, but
* keep looping as per tradition.
+ *
+ * But do not keep looping if oom_killer_disable()
+ * was already called, for the system is trying to
+ * enter a quiescent state during suspend.
*/
- *did_some_progress = 1;
+ *did_some_progress = !oom_killer_disabled;
goto out;
}
if (pm_suspended_storage())
@@ -3040,14 +3154,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
- /*
- * If this allocation cannot block and it is for a specific node, then
- * fail early. There's no need to wakeup kswapd or retry for a
- * speculative node-specific allocation.
- */
- if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim)
- goto nopage;
-
retry:
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
@@ -3404,7 +3510,7 @@ refill:
/* Even if we own the page, we do not use atomic_set().
* This would break get_page_unless_zero() users.
*/
- atomic_add(size - 1, &page->_count);
+ page_ref_add(page, size - 1);
/* reset page count bias and offset to start of new frag */
nc->pfmemalloc = page_is_pfmemalloc(page);
@@ -3416,7 +3522,7 @@ refill:
if (unlikely(offset < 0)) {
page = virt_to_page(nc->va);
- if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
+ if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
goto refill;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
@@ -3424,7 +3530,7 @@ refill:
size = nc->size;
#endif
/* OK, page count is 0, we can safely set it */
- atomic_set(&page->_count, size);
+ set_page_count(page, size);
/* reset page count bias and offset to start of new frag */
nc->pagecnt_bias = size;
@@ -3635,6 +3741,49 @@ static inline void show_node(struct zone *zone)
printk("Node %d ", zone_to_nid(zone));
}
+long si_mem_available(void)
+{
+ long available;
+ unsigned long pagecache;
+ unsigned long wmark_low = 0;
+ unsigned long pages[NR_LRU_LISTS];
+ struct zone *zone;
+ int lru;
+
+ for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+ pages[lru] = global_page_state(NR_LRU_BASE + lru);
+
+ for_each_zone(zone)
+ wmark_low += zone->watermark[WMARK_LOW];
+
+ /*
+ * Estimate the amount of memory available for userspace allocations,
+ * without causing swapping.
+ */
+ available = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
+
+ /*
+ * Not all the page cache can be freed, otherwise the system will
+ * start swapping. Assume at least half of the page cache, or the
+ * low watermark worth of cache, needs to stay.
+ */
+ pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
+ pagecache -= min(pagecache / 2, wmark_low);
+ available += pagecache;
+
+ /*
+ * Part of the reclaimable slab consists of items that are in use,
+ * and cannot be freed. Cap this estimate at the low watermark.
+ */
+ available += global_page_state(NR_SLAB_RECLAIMABLE) -
+ min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+
+ if (available < 0)
+ available = 0;
+ return available;
+}
+EXPORT_SYMBOL_GPL(si_mem_available);
+
void si_meminfo(struct sysinfo *val)
{
val->totalram = totalram_pages;
@@ -3967,9 +4116,7 @@ static int __parse_numa_zonelist_order(char *s)
} else if (*s == 'z' || *s == 'Z') {
user_zonelist_order = ZONELIST_ORDER_ZONE;
} else {
- printk(KERN_WARNING
- "Ignoring invalid numa_zonelist_order value: "
- "%s\n", s);
+ pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s);
return -EINVAL;
}
return 0;
@@ -4433,12 +4580,11 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
else
page_group_by_mobility_disabled = 0;
- pr_info("Built %i zonelists in %s order, mobility grouping %s. "
- "Total pages: %ld\n",
- nr_online_nodes,
- zonelist_order_name[current_zonelist_order],
- page_group_by_mobility_disabled ? "off" : "on",
- vm_total_pages);
+ pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n",
+ nr_online_nodes,
+ zonelist_order_name[current_zonelist_order],
+ page_group_by_mobility_disabled ? "off" : "on",
+ vm_total_pages);
#ifdef CONFIG_NUMA
pr_info("Policy zone: %s\n", zone_names[policy_zone]);
#endif
@@ -4523,6 +4669,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
pg_data_t *pgdat = NODE_DATA(nid);
unsigned long pfn;
unsigned long nr_initialised = 0;
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ struct memblock_region *r = NULL, *tmp;
+#endif
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
@@ -4536,20 +4685,51 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
- * There can be holes in boot-time mem_map[]s
- * handed to this function. They do not
- * exist on hotplugged memory.
+ * There can be holes in boot-time mem_map[]s handed to this
+ * function. They do not exist on hotplugged memory.
+ */
+ if (context != MEMMAP_EARLY)
+ goto not_early;
+
+ if (!early_pfn_valid(pfn))
+ continue;
+ if (!early_pfn_in_nid(pfn, nid))
+ continue;
+ if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
+ break;
+
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ /*
+ * If mirrored_kernelcore is not set and ZONE_MOVABLE exists, the
+ * range from zone_movable_pfn[nid] to the end of each node should
+ * be ZONE_MOVABLE, not ZONE_NORMAL. Skip it.
*/
- if (context == MEMMAP_EARLY) {
- if (!early_pfn_valid(pfn))
+ if (!mirrored_kernelcore && zone_movable_pfn[nid])
+ if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid])
continue;
- if (!early_pfn_in_nid(pfn, nid))
+
+ /*
+ * Check the memblock attribute given by firmware, which can affect
+ * the kernel memory layout. If zone == ZONE_MOVABLE but the memory
+ * is mirrored, it's an overlapped memmap init. Skip it.
+ */
+ if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
+ if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
+ for_each_memblock(memory, tmp)
+ if (pfn < memblock_region_memory_end_pfn(tmp))
+ break;
+ r = tmp;
+ }
+ if (pfn >= memblock_region_memory_base_pfn(r) &&
+ memblock_is_mirror(r)) {
+ /* already initialized as NORMAL */
+ pfn = memblock_region_memory_end_pfn(r);
continue;
- if (!update_defer_init(pgdat, pfn, end_pfn,
- &nr_initialised))
- break;
+ }
}
+#endif
+not_early:
/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
@@ -4966,11 +5146,6 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
*zone_end_pfn = min(node_end_pfn,
arch_zone_highest_possible_pfn[movable_zone]);
- /* Adjust for ZONE_MOVABLE starting within this range */
- } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
- *zone_end_pfn > zone_movable_pfn[nid]) {
- *zone_end_pfn = zone_movable_pfn[nid];
-
/* Check if this whole range is within ZONE_MOVABLE */
} else if (*zone_start_pfn >= zone_movable_pfn[nid])
*zone_start_pfn = *zone_end_pfn;
@@ -4985,31 +5160,31 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
+ unsigned long *zone_start_pfn,
+ unsigned long *zone_end_pfn,
unsigned long *ignored)
{
- unsigned long zone_start_pfn, zone_end_pfn;
-
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
return 0;
/* Get the start and end of the zone */
- zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
- zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+ *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+ *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
- &zone_start_pfn, &zone_end_pfn);
+ zone_start_pfn, zone_end_pfn);
/* Check that this node has pages within the zone's required range */
- if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+ if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
return 0;
/* Move the zone boundaries inside the node if necessary */
- zone_end_pfn = min(zone_end_pfn, node_end_pfn);
- zone_start_pfn = max(zone_start_pfn, node_start_pfn);
+ *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
+ *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
/* Return the spanned pages */
- return zone_end_pfn - zone_start_pfn;
+ return *zone_end_pfn - *zone_start_pfn;
}
/*
@@ -5055,6 +5230,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
unsigned long zone_start_pfn, zone_end_pfn;
+ unsigned long nr_absent;
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
@@ -5066,7 +5242,39 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
&zone_start_pfn, &zone_end_pfn);
- return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+ nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+
+ /*
+ * ZONE_MOVABLE handling.
+ * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
+ * and vice versa.
+ */
+ if (zone_movable_pfn[nid]) {
+ if (mirrored_kernelcore) {
+ unsigned long start_pfn, end_pfn;
+ struct memblock_region *r;
+
+ for_each_memblock(memory, r) {
+ start_pfn = clamp(memblock_region_memory_base_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+ end_pfn = clamp(memblock_region_memory_end_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+
+ if (zone_type == ZONE_MOVABLE &&
+ memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
+
+ if (zone_type == ZONE_NORMAL &&
+ !memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
+ }
+ } else {
+ if (zone_type == ZONE_NORMAL)
+ nr_absent += node_end_pfn - zone_movable_pfn[nid];
+ }
+ }
+
+ return nr_absent;
}
#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
@@ -5074,8 +5282,18 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
+ unsigned long *zone_start_pfn,
+ unsigned long *zone_end_pfn,
unsigned long *zones_size)
{
+ unsigned int zone;
+
+ *zone_start_pfn = node_start_pfn;
+ for (zone = 0; zone < zone_type; zone++)
+ *zone_start_pfn += zones_size[zone];
+
+ *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
+
return zones_size[zone_type];
}
@@ -5104,15 +5322,22 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
+ unsigned long zone_start_pfn, zone_end_pfn;
unsigned long size, real_size;
size = zone_spanned_pages_in_node(pgdat->node_id, i,
node_start_pfn,
node_end_pfn,
+ &zone_start_pfn,
+ &zone_end_pfn,
zones_size);
real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
node_start_pfn, node_end_pfn,
zholes_size);
+ if (size)
+ zone->zone_start_pfn = zone_start_pfn;
+ else
+ zone->zone_start_pfn = 0;
zone->spanned_pages = size;
zone->present_pages = real_size;
@@ -5233,7 +5458,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
int nid = pgdat->node_id;
- unsigned long zone_start_pfn = pgdat->node_start_pfn;
int ret;
pgdat_resize_init(pgdat);
@@ -5249,11 +5473,15 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
#endif
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
+#ifdef CONFIG_COMPACTION
+ init_waitqueue_head(&pgdat->kcompactd_wait);
+#endif
pgdat_page_ext_init(pgdat);
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
+ unsigned long zone_start_pfn = zone->zone_start_pfn;
size = zone->spanned_pages;
realsize = freesize = zone->present_pages;
@@ -5272,8 +5500,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
" %s zone: %lu pages used for memmap\n",
zone_names[j], memmap_pages);
} else
- printk(KERN_WARNING
- " %s zone: %lu pages exceeds freesize %lu\n",
+ pr_warn(" %s zone: %lu pages exceeds freesize %lu\n",
zone_names[j], memmap_pages, freesize);
}
@@ -5322,7 +5549,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
ret = init_currently_empty_zone(zone, zone_start_pfn, size);
BUG_ON(ret);
memmap_init(size, nid, j, zone_start_pfn);
- zone_start_pfn += size;
}
}
@@ -5390,6 +5616,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
+#else
+ start_pfn = node_start_pfn;
#endif
calculate_node_totalpages(pgdat, start_pfn, end_pfn,
zones_size, zholes_size);
@@ -5480,8 +5708,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
min_pfn = min(min_pfn, start_pfn);
if (min_pfn == ULONG_MAX) {
- printk(KERN_WARNING
- "Could not find start_pfn for node %d\n", nid);
+ pr_warn("Could not find start_pfn for node %d\n", nid);
return 0;
}
@@ -5561,6 +5788,36 @@ static void __init find_zone_movable_pfns_for_nodes(void)
}
/*
+ * If kernelcore=mirror is specified, ignore movablecore option
+ */
+ if (mirrored_kernelcore) {
+ bool mem_below_4gb_not_mirrored = false;
+
+ for_each_memblock(memory, r) {
+ if (memblock_is_mirror(r))
+ continue;
+
+ nid = r->nid;
+
+ usable_startpfn = memblock_region_memory_base_pfn(r);
+
+ if (usable_startpfn < 0x100000) {
+ mem_below_4gb_not_mirrored = true;
+ continue;
+ }
+
+ zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+ min(usable_startpfn, zone_movable_pfn[nid]) :
+ usable_startpfn;
+ }
+
+ if (mem_below_4gb_not_mirrored)
+ pr_warn("This configuration results in unmirrored kernel memory.");
+
+ goto out2;
+ }
+
+ /*
* If movablecore=nn[KMG] was specified, calculate what size of
* kernelcore that corresponds so that memory usable for
* any allocation type is evenly spread. If both kernelcore
@@ -5820,6 +6077,12 @@ static int __init cmdline_parse_core(char *p, unsigned long *core)
*/
static int __init cmdline_parse_kernelcore(char *p)
{
+ /* parse kernelcore=mirror */
+ if (parse_option_str(p, "mirror")) {
+ mirrored_kernelcore = true;
+ return 0;
+ }
+
return cmdline_parse_core(p, &required_kernelcore);
}
@@ -5917,22 +6180,21 @@ void __init mem_init_print_info(const char *str)
#undef adj_init_size
- pr_info("Memory: %luK/%luK available "
- "(%luK kernel code, %luK rwdata, %luK rodata, "
- "%luK init, %luK bss, %luK reserved, %luK cma-reserved"
+ pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
#ifdef CONFIG_HIGHMEM
- ", %luK highmem"
+ ", %luK highmem"
#endif
- "%s%s)\n",
- nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
- codesize >> 10, datasize >> 10, rosize >> 10,
- (init_data_size + init_code_size) >> 10, bss_size >> 10,
- (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10),
- totalcma_pages << (PAGE_SHIFT-10),
+ "%s%s)\n",
+ nr_free_pages() << (PAGE_SHIFT - 10),
+ physpages << (PAGE_SHIFT - 10),
+ codesize >> 10, datasize >> 10, rosize >> 10,
+ (init_data_size + init_code_size) >> 10, bss_size >> 10,
+ (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
+ totalcma_pages << (PAGE_SHIFT - 10),
#ifdef CONFIG_HIGHMEM
- totalhigh_pages << (PAGE_SHIFT-10),
+ totalhigh_pages << (PAGE_SHIFT - 10),
#endif
- str ? ", " : "", str ? str : "");
+ str ? ", " : "", str ? str : "");
}
/**
@@ -6107,8 +6369,17 @@ static void __setup_per_zone_wmarks(void)
zone->watermark[WMARK_MIN] = tmp;
}
- zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
- zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+ /*
+ * Set the kswapd watermarks distance according to the
+ * scale factor in proportion to available memory, but
+ * ensure a minimum size on small systems.
+ */
+ tmp = max_t(u64, tmp >> 2,
+ mult_frac(zone->managed_pages,
+ watermark_scale_factor, 10000));
+
+ zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
+ zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
__mod_zone_page_state(zone, NR_ALLOC_BATCH,
high_wmark_pages(zone) - low_wmark_pages(zone) -
@@ -6249,6 +6520,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
+int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ int rc;
+
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (rc)
+ return rc;
+
+ if (write)
+ setup_per_zone_wmarks();
+
+ return 0;
+}
+
#ifdef CONFIG_NUMA
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
@@ -6440,11 +6726,8 @@ void *__init alloc_large_system_hash(const char *tablename,
if (!table)
panic("Failed to allocate %s hash table\n", tablename);
- printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
- tablename,
- (1UL << log2qty),
- ilog2(size) - PAGE_SHIFT,
- size);
+ pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n",
+ tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size);
if (_hash_shift)
*_hash_shift = log2qty;
@@ -6595,7 +6878,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* This check already skips compound tails of THP
* because their page->_count is zero at all time.
*/
- if (!atomic_read(&page->_count)) {
+ if (!page_ref_count(page)) {
if (PageBuddy(page))
iter += (1 << page_order(page)) - 1;
continue;
@@ -6945,8 +7228,8 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
BUG_ON(!PageBuddy(page));
order = page_order(page);
#ifdef CONFIG_DEBUG_VM
- printk(KERN_INFO "remove from free list %lx %d %lx\n",
- pfn, 1 << order, end_pfn);
+ pr_info("remove from free list %lx %d %lx\n",
+ pfn, 1 << order, end_pfn);
#endif
list_del(&page->lru);
rmv_page_order(page);
@@ -6959,7 +7242,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
}
#endif
-#ifdef CONFIG_MEMORY_FAILURE
bool is_free_buddy_page(struct page *page)
{
struct zone *zone = page_zone(page);
@@ -6978,4 +7260,3 @@ bool is_free_buddy_page(struct page *page)
return order < MAX_ORDER;
}
-#endif
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 292ca7b8d..2d864e64f 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -106,12 +106,15 @@ struct page_ext *lookup_page_ext(struct page *page)
struct page_ext *base;
base = NODE_DATA(page_to_nid(page))->node_page_ext;
-#ifdef CONFIG_DEBUG_VM
+#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
/*
* The sanity checks the page allocator does upon freeing a
* page can reach here before the page_ext arrays are
* allocated when feeding a range of pages to the allocator
* for the first time during bootup or memory hotplug.
+ *
+ * This check is also necessary for ensuring page poisoning
+ * works as expected when enabled.
*/
if (unlikely(!base))
return NULL;
@@ -180,12 +183,15 @@ struct page_ext *lookup_page_ext(struct page *page)
{
unsigned long pfn = page_to_pfn(page);
struct mem_section *section = __pfn_to_section(pfn);
-#ifdef CONFIG_DEBUG_VM
+#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
/*
* The sanity checks the page allocator does upon freeing a
* page can reach here before the page_ext arrays are
* allocated when feeding a range of pages to the allocator
* for the first time during bootup or memory hotplug.
+ *
+ * This check is also necessary for ensuring page poisoning
+ * works as expected when enabled.
*/
if (!section->page_ext)
return NULL;
diff --git a/mm/page_io.c b/mm/page_io.c
index b995a5ba5..985f23cfa 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -56,31 +56,20 @@ void end_swap_bio_write(struct bio *bio)
* Also clear PG_reclaim to avoid rotate_reclaimable_page()
*/
set_page_dirty(page);
- printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
- imajor(bio->bi_bdev->bd_inode),
- iminor(bio->bi_bdev->bd_inode),
- (unsigned long long)bio->bi_iter.bi_sector);
+ pr_alert("Write-error on swap-device (%u:%u:%llu)\n",
+ imajor(bio->bi_bdev->bd_inode),
+ iminor(bio->bi_bdev->bd_inode),
+ (unsigned long long)bio->bi_iter.bi_sector);
ClearPageReclaim(page);
}
end_page_writeback(page);
bio_put(bio);
}
-static void end_swap_bio_read(struct bio *bio)
+static void swap_slot_free_notify(struct page *page)
{
- struct page *page = bio->bi_io_vec[0].bv_page;
-
- if (bio->bi_error) {
- SetPageError(page);
- ClearPageUptodate(page);
- printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
- imajor(bio->bi_bdev->bd_inode),
- iminor(bio->bi_bdev->bd_inode),
- (unsigned long long)bio->bi_iter.bi_sector);
- goto out;
- }
-
- SetPageUptodate(page);
+ struct swap_info_struct *sis;
+ struct gendisk *disk;
/*
* There is no guarantee that the page is in swap cache - the software
@@ -88,42 +77,59 @@ static void end_swap_bio_read(struct bio *bio)
* swapcache page. So we must check PG_swapcache before proceeding with
* this optimization.
*/
- if (likely(PageSwapCache(page))) {
- struct swap_info_struct *sis;
+ if (unlikely(!PageSwapCache(page)))
+ return;
- sis = page_swap_info(page);
- if (sis->flags & SWP_BLKDEV) {
- /*
- * The swap subsystem performs lazy swap slot freeing,
- * expecting that the page will be swapped out again.
- * So we can avoid an unnecessary write if the page
- * isn't redirtied.
- * This is good for real swap storage because we can
- * reduce unnecessary I/O and enhance wear-leveling
- * if an SSD is used as the as swap device.
- * But if in-memory swap device (eg zram) is used,
- * this causes a duplicated copy between uncompressed
- * data in VM-owned memory and compressed data in
- * zram-owned memory. So let's free zram-owned memory
- * and make the VM-owned decompressed page *dirty*,
- * so the page should be swapped out somewhere again if
- * we again wish to reclaim it.
- */
- struct gendisk *disk = sis->bdev->bd_disk;
- if (disk->fops->swap_slot_free_notify) {
- swp_entry_t entry;
- unsigned long offset;
+ sis = page_swap_info(page);
+ if (!(sis->flags & SWP_BLKDEV))
+ return;
- entry.val = page_private(page);
- offset = swp_offset(entry);
+ /*
+ * The swap subsystem performs lazy swap slot freeing,
+ * expecting that the page will be swapped out again.
+ * So we can avoid an unnecessary write if the page
+ * isn't redirtied.
+ * This is good for real swap storage because we can
+ * reduce unnecessary I/O and enhance wear-leveling
+ * if an SSD is used as the swap device.
+ * But if in-memory swap device (eg zram) is used,
+ * this causes a duplicated copy between uncompressed
+ * data in VM-owned memory and compressed data in
+ * zram-owned memory. So let's free zram-owned memory
+ * and make the VM-owned decompressed page *dirty*,
+ * so the page should be swapped out somewhere again if
+ * we again wish to reclaim it.
+ */
+ disk = sis->bdev->bd_disk;
+ if (disk->fops->swap_slot_free_notify) {
+ swp_entry_t entry;
+ unsigned long offset;
- SetPageDirty(page);
- disk->fops->swap_slot_free_notify(sis->bdev,
- offset);
- }
- }
+ entry.val = page_private(page);
+ offset = swp_offset(entry);
+
+ SetPageDirty(page);
+ disk->fops->swap_slot_free_notify(sis->bdev,
+ offset);
}
+}
+
+static void end_swap_bio_read(struct bio *bio)
+{
+ struct page *page = bio->bi_io_vec[0].bv_page;
+ if (bio->bi_error) {
+ SetPageError(page);
+ ClearPageUptodate(page);
+ pr_alert("Read-error on swap-device (%u:%u:%llu)\n",
+ imajor(bio->bi_bdev->bd_inode),
+ iminor(bio->bi_bdev->bd_inode),
+ (unsigned long long)bio->bi_iter.bi_sector);
+ goto out;
+ }
+
+ SetPageUptodate(page);
+ swap_slot_free_notify(page);
out:
unlock_page(page);
bio_put(bio);
@@ -216,7 +222,7 @@ reprobe:
out:
return ret;
bad_bmap:
- printk(KERN_ERR "swapon: swapfile has holes\n");
+ pr_err("swapon: swapfile has holes\n");
ret = -EINVAL;
goto out;
}
@@ -246,7 +252,7 @@ out:
static sector_t swap_page_sector(struct page *page)
{
- return (sector_t)__page_file_index(page) << (PAGE_CACHE_SHIFT - 9);
+ return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9);
}
int __swap_writepage(struct page *page, struct writeback_control *wbc,
@@ -290,8 +296,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
*/
set_page_dirty(page);
ClearPageReclaim(page);
- pr_err_ratelimited("Write error on dio swapfile (%Lu)\n",
- page_file_offset(page));
+ pr_err_ratelimited("Write error on dio swapfile (%llu)\n",
+ page_file_offset(page));
}
end_page_writeback(page);
return ret;
@@ -347,6 +353,11 @@ int swap_readpage(struct page *page)
ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
if (!ret) {
+ if (trylock_page(page)) {
+ swap_slot_free_notify(page);
+ unlock_page(page);
+ }
+
count_vm_event(PSWPIN);
return 0;
}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 31555b689..c4f568206 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -215,7 +215,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
* all pages in [start_pfn...end_pfn) must be in the same zone.
* zone->lock must be held before call this.
*
- * Returns 1 if all pages in the range are isolated.
+ * Returns the last tested pfn.
*/
static unsigned long
__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 983c3a10f..ac3d8d129 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -5,10 +5,12 @@
#include <linux/bootmem.h>
#include <linux/stacktrace.h>
#include <linux/page_owner.h>
+#include <linux/jump_label.h>
+#include <linux/migrate.h>
#include "internal.h"
static bool page_owner_disabled = true;
-bool page_owner_inited __read_mostly;
+DEFINE_STATIC_KEY_FALSE(page_owner_inited);
static void init_early_allocated_pages(void);
@@ -37,7 +39,7 @@ static void init_page_owner(void)
if (page_owner_disabled)
return;
- page_owner_inited = true;
+ static_branch_enable(&page_owner_inited);
init_early_allocated_pages();
}
@@ -72,10 +74,18 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
page_ext->order = order;
page_ext->gfp_mask = gfp_mask;
page_ext->nr_entries = trace.nr_entries;
+ page_ext->last_migrate_reason = -1;
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
}
+void __set_page_owner_migrate_reason(struct page *page, int reason)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+
+ page_ext->last_migrate_reason = reason;
+}
+
gfp_t __get_page_owner_gfp(struct page *page)
{
struct page_ext *page_ext = lookup_page_ext(page);
@@ -83,6 +93,31 @@ gfp_t __get_page_owner_gfp(struct page *page)
return page_ext->gfp_mask;
}
+void __copy_page_owner(struct page *oldpage, struct page *newpage)
+{
+ struct page_ext *old_ext = lookup_page_ext(oldpage);
+ struct page_ext *new_ext = lookup_page_ext(newpage);
+ int i;
+
+ new_ext->order = old_ext->order;
+ new_ext->gfp_mask = old_ext->gfp_mask;
+ new_ext->nr_entries = old_ext->nr_entries;
+
+ for (i = 0; i < ARRAY_SIZE(new_ext->trace_entries); i++)
+ new_ext->trace_entries[i] = old_ext->trace_entries[i];
+
+ /*
+ * We don't clear the bit on the oldpage as it's going to be freed
+ * after migration. Until then, the info can be useful in case of
+ * a bug, and the overall stats will be off a bit only temporarily.
+ * Also, migrate_misplaced_transhuge_page() can still fail the
+ * migration and then we want the oldpage to retain the info. But
+ * in that case we also don't need to explicitly clear the info from
+ * the new page, which will be freed.
+ */
+ __set_bit(PAGE_EXT_OWNER, &new_ext->flags);
+}
+
static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
struct page *page, struct page_ext *page_ext)
@@ -100,8 +135,9 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
return -ENOMEM;
ret = snprintf(kbuf, count,
- "Page allocated via order %u, mask 0x%x\n",
- page_ext->order, page_ext->gfp_mask);
+ "Page allocated via order %u, mask %#x(%pGg)\n",
+ page_ext->order, page_ext->gfp_mask,
+ &page_ext->gfp_mask);
if (ret >= count)
goto err;
@@ -110,23 +146,12 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
pageblock_mt = get_pfnblock_migratetype(page, pfn);
page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
ret += snprintf(kbuf + ret, count - ret,
- "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n",
+ "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
pfn,
+ migratetype_names[page_mt],
pfn >> pageblock_order,
- pageblock_mt,
- pageblock_mt != page_mt ? "Fallback" : " ",
- PageLocked(page) ? "K" : " ",
- PageError(page) ? "E" : " ",
- PageReferenced(page) ? "R" : " ",
- PageUptodate(page) ? "U" : " ",
- PageDirty(page) ? "D" : " ",
- PageLRU(page) ? "L" : " ",
- PageActive(page) ? "A" : " ",
- PageSlab(page) ? "S" : " ",
- PageWriteback(page) ? "W" : " ",
- PageCompound(page) ? "C" : " ",
- PageSwapCache(page) ? "B" : " ",
- PageMappedToDisk(page) ? "M" : " ");
+ migratetype_names[pageblock_mt],
+ page->flags, &page->flags);
if (ret >= count)
goto err;
@@ -135,6 +160,14 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
if (ret >= count)
goto err;
+ if (page_ext->last_migrate_reason != -1) {
+ ret += snprintf(kbuf + ret, count - ret,
+ "Page has been migrated, last migrate reason: %s\n",
+ migrate_reason_names[page_ext->last_migrate_reason]);
+ if (ret >= count)
+ goto err;
+ }
+
ret += snprintf(kbuf + ret, count - ret, "\n");
if (ret >= count)
goto err;
@@ -150,6 +183,30 @@ err:
return -ENOMEM;
}
+void __dump_page_owner(struct page *page)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+ struct stack_trace trace = {
+ .nr_entries = page_ext->nr_entries,
+ .entries = &page_ext->trace_entries[0],
+ };
+ gfp_t gfp_mask = page_ext->gfp_mask;
+ int mt = gfpflags_to_migratetype(gfp_mask);
+
+ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
+ pr_alert("page_owner info is not active (free page?)\n");
+ return;
+ }
+
+ pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
+ page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask);
+ print_stack_trace(&trace, 0);
+
+ if (page_ext->last_migrate_reason != -1)
+ pr_alert("page has been migrated, last migrate reason: %s\n",
+ migrate_reason_names[page_ext->last_migrate_reason]);
+}
+
static ssize_t
read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
@@ -157,7 +214,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
struct page *page;
struct page_ext *page_ext;
- if (!page_owner_inited)
+ if (!static_branch_unlikely(&page_owner_inited))
return -EINVAL;
page = NULL;
@@ -305,7 +362,7 @@ static int __init pageowner_init(void)
{
struct dentry *dentry;
- if (!page_owner_inited) {
+ if (!static_branch_unlikely(&page_owner_inited)) {
pr_info("page_owner is disabled\n");
return 0;
}
diff --git a/mm/debug-pagealloc.c b/mm/page_poison.c
index 5bf5906ce..479e7ea2b 100644
--- a/mm/debug-pagealloc.c
+++ b/mm/page_poison.c
@@ -6,22 +6,48 @@
#include <linux/poison.h>
#include <linux/ratelimit.h>
-static bool page_poisoning_enabled __read_mostly;
+static bool __page_poisoning_enabled __read_mostly;
+static bool want_page_poisoning __read_mostly;
-static bool need_page_poisoning(void)
+static int early_page_poison_param(char *buf)
{
- if (!debug_pagealloc_enabled())
- return false;
+ if (!buf)
+ return -EINVAL;
+
+ if (strcmp(buf, "on") == 0)
+ want_page_poisoning = true;
+ else if (strcmp(buf, "off") == 0)
+ want_page_poisoning = false;
- return true;
+ return 0;
+}
+early_param("page_poison", early_page_poison_param);
+
+bool page_poisoning_enabled(void)
+{
+ return __page_poisoning_enabled;
+}
+
+static bool need_page_poisoning(void)
+{
+ return want_page_poisoning;
}
static void init_page_poisoning(void)
{
- if (!debug_pagealloc_enabled())
- return;
+ /*
+ * Page poisoning is debug page alloc for some arches. If either
+ * of those options is enabled, enable poisoning.
+ */
+ if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) {
+ if (!want_page_poisoning && !debug_pagealloc_enabled())
+ return;
+ } else {
+ if (!want_page_poisoning)
+ return;
+ }
- page_poisoning_enabled = true;
+ __page_poisoning_enabled = true;
}
struct page_ext_operations page_poisoning_ops = {
@@ -45,11 +71,14 @@ static inline void clear_page_poison(struct page *page)
__clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
}
-static inline bool page_poison(struct page *page)
+bool page_is_poisoned(struct page *page)
{
struct page_ext *page_ext;
page_ext = lookup_page_ext(page);
+ if (!page_ext)
+ return false;
+
return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
}
@@ -83,6 +112,9 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
unsigned char *start;
unsigned char *end;
+ if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY))
+ return;
+
start = memchr_inv(mem, PAGE_POISON, bytes);
if (!start)
return;
@@ -95,9 +127,9 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
if (!__ratelimit(&ratelimit))
return;
else if (start == end && single_bit_flip(*start, PAGE_POISON))
- printk(KERN_ERR "pagealloc: single bit error\n");
+ pr_err("pagealloc: single bit error\n");
else
- printk(KERN_ERR "pagealloc: memory corruption\n");
+ pr_err("pagealloc: memory corruption\n");
print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
end - start + 1, 1);
@@ -108,7 +140,7 @@ static void unpoison_page(struct page *page)
{
void *addr;
- if (!page_poison(page))
+ if (!page_is_poisoned(page))
return;
addr = kmap_atomic(page);
@@ -125,9 +157,9 @@ static void unpoison_pages(struct page *page, int n)
unpoison_page(page + i);
}
-void __kernel_map_pages(struct page *page, int numpages, int enable)
+void kernel_poison_pages(struct page *page, int numpages, int enable)
{
- if (!page_poisoning_enabled)
+ if (!page_poisoning_enabled())
return;
if (enable)
@@ -135,3 +167,10 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
else
poison_pages(page, numpages);
}
+
+#ifndef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC
+void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ /* This function does nothing, all work is done via poison pages */
+}
+#endif
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 10e3d0b8a..d66911ff4 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -95,7 +95,7 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
/* all units must be in a single group */
if (ai->nr_groups != 1) {
- printk(KERN_CRIT "percpu: can't handle more than one groups\n");
+ pr_crit("can't handle more than one group\n");
return -EINVAL;
}
@@ -103,8 +103,8 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
alloc_pages = roundup_pow_of_two(nr_pages);
if (alloc_pages > nr_pages)
- printk(KERN_WARNING "percpu: wasting %zu pages per chunk\n",
- alloc_pages - nr_pages);
+ pr_warn("wasting %zu pages per chunk\n",
+ alloc_pages - nr_pages);
return 0;
}
diff --git a/mm/percpu.c b/mm/percpu.c
index 2f040d0b8..ccc979953 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -53,6 +53,8 @@
* setup the first chunk containing the kernel static percpu area
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/err.h>
@@ -889,8 +891,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
size = ALIGN(size, 2);
if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
- WARN(true, "illegal size (%zu) or align (%zu) for "
- "percpu allocation\n", size, align);
+ WARN(true, "illegal size (%zu) or align (%zu) for percpu allocation\n",
+ size, align);
return NULL;
}
@@ -1034,11 +1036,11 @@ fail_unlock:
spin_unlock_irqrestore(&pcpu_lock, flags);
fail:
if (!is_atomic && warn_limit) {
- pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n",
- size, align, is_atomic, err);
+ pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
+ size, align, is_atomic, err);
dump_stack();
if (!--warn_limit)
- pr_info("PERCPU: limit reached, disable warning\n");
+ pr_info("limit reached, disable warning\n");
}
if (is_atomic) {
/* see the flag handling in pcpu_blance_workfn() */
@@ -1450,20 +1452,20 @@ static void pcpu_dump_alloc_info(const char *lvl,
for (alloc_end += gi->nr_units / upa;
alloc < alloc_end; alloc++) {
if (!(alloc % apl)) {
- printk(KERN_CONT "\n");
+ pr_cont("\n");
printk("%spcpu-alloc: ", lvl);
}
- printk(KERN_CONT "[%0*d] ", group_width, group);
+ pr_cont("[%0*d] ", group_width, group);
for (unit_end += upa; unit < unit_end; unit++)
if (gi->cpu_map[unit] != NR_CPUS)
- printk(KERN_CONT "%0*d ", cpu_width,
- gi->cpu_map[unit]);
+ pr_cont("%0*d ",
+ cpu_width, gi->cpu_map[unit]);
else
- printk(KERN_CONT "%s ", empty_str);
+ pr_cont("%s ", empty_str);
}
}
- printk(KERN_CONT "\n");
+ pr_cont("\n");
}
/**
@@ -1539,8 +1541,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
#define PCPU_SETUP_BUG_ON(cond) do { \
if (unlikely(cond)) { \
- pr_emerg("PERCPU: failed to initialize, %s", #cond); \
- pr_emerg("PERCPU: cpu_possible_mask=%*pb\n", \
+ pr_emerg("failed to initialize, %s\n", #cond); \
+ pr_emerg("cpu_possible_mask=%*pb\n", \
cpumask_pr_args(cpu_possible_mask)); \
pcpu_dump_alloc_info(KERN_EMERG, ai); \
BUG(); \
@@ -1724,7 +1726,7 @@ static int __init percpu_alloc_setup(char *str)
pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
else
- pr_warning("PERCPU: unknown allocator %s specified\n", str);
+ pr_warn("unknown allocator %s specified\n", str);
return 0;
}
@@ -2018,9 +2020,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
/* warn if maximum distance is further than 75% of vmalloc space */
if (max_distance > VMALLOC_TOTAL * 3 / 4) {
- pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
- "space 0x%lx\n", max_distance,
- VMALLOC_TOTAL);
+ pr_warn("max_distance=0x%zx too large for vmalloc space 0x%lx\n",
+ max_distance, VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
/* and fail if we have fallback */
rc = -EINVAL;
@@ -2028,7 +2029,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
#endif
}
- pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
+ pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
ai->dyn_size, ai->unit_size);
@@ -2102,8 +2103,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
if (!ptr) {
- pr_warning("PERCPU: failed to allocate %s page "
- "for cpu%u\n", psize_str, cpu);
+ pr_warn("failed to allocate %s page for cpu%u\n",
+ psize_str, cpu);
goto enomem;
}
/* kmemleak tracks the percpu allocations separately */
@@ -2142,7 +2143,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
}
/* we're ready, commit */
- pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
+ pr_info("%d %s pages/cpu @%p s%zu r%zu d%zu\n",
unit_pages, psize_str, vm.addr, ai->static_size,
ai->reserved_size, ai->dyn_size);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 06a005b97..71c5f9109 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -84,20 +84,6 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
-
-/*
- * ARCHes with special requirements for evicting THP backing TLB entries can
- * implement this. Otherwise also, it can help optimize normal TLB flush in
- * THP regime. stock flush_tlb_range() typically has optimization to nuke the
- * entire TLB if flush span is greater than a threshold, which will
- * likely be true for a single huge page. Thus a single thp flush will
- * invalidate the entire TLB which is not desirable.
- * e.g. see arch/arc: flush_pmd_tlb_range
- */
-#define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
-#endif
-
#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
int pmdp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 5d453e58d..07514d41e 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -98,9 +98,14 @@ static int process_vm_rw_single_vec(unsigned long addr,
int pages = min(nr_pages, max_pages_per_loop);
size_t bytes;
- /* Get the pages we're interested in */
- pages = get_user_pages_unlocked(task, mm, pa, pages,
- vm_write, 0, process_pages);
+ /*
+ * Get the pages we're interested in. We must
+ * add FOLL_REMOTE because task/mm might not be
+ * current/current->mm.
+ */
+ pages = __get_user_pages_unlocked(task, mm, pa, pages,
+ vm_write, 0, process_pages,
+ FOLL_REMOTE);
if (pages <= 0)
return -EFAULT;
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 942212970..daf6ff6e1 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -8,7 +8,7 @@
* improved on it.
*
* Copyright (C) 2007 SGI,
- * Christoph Lameter <clameter@sgi.com>
+ * Christoph Lameter <cl@linux.com>
* Generalized, added support for multiple lists and
* constructors / destructors.
*/
diff --git a/mm/readahead.c b/mm/readahead.c
index 20e58e820..40be3ae0a 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -47,11 +47,11 @@ static void read_cache_pages_invalidate_page(struct address_space *mapping,
if (!trylock_page(page))
BUG();
page->mapping = mapping;
- do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ do_invalidatepage(page, 0, PAGE_SIZE);
page->mapping = NULL;
unlock_page(page);
}
- page_cache_release(page);
+ put_page(page);
}
/*
@@ -93,14 +93,14 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
read_cache_pages_invalidate_page(mapping, page);
continue;
}
- page_cache_release(page);
+ put_page(page);
ret = filler(data, page);
if (unlikely(ret)) {
read_cache_pages_invalidate_pages(mapping, pages);
break;
}
- task_io_account_read(PAGE_CACHE_SIZE);
+ task_io_account_read(PAGE_SIZE);
}
return ret;
}
@@ -130,7 +130,7 @@ static int read_pages(struct address_space *mapping, struct file *filp,
mapping_gfp_constraint(mapping, GFP_KERNEL))) {
mapping->a_ops->readpage(filp, page);
}
- page_cache_release(page);
+ put_page(page);
}
ret = 0;
@@ -163,7 +163,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
if (isize == 0)
goto out;
- end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
+ end_index = ((isize - 1) >> PAGE_SHIFT);
/*
* Preallocate as many pages as we will need.
@@ -216,7 +216,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
while (nr_to_read) {
int err;
- unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;
+ unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;
if (this_chunk > nr_to_read)
this_chunk = nr_to_read;
@@ -425,7 +425,7 @@ ondemand_readahead(struct address_space *mapping,
* trivial case: (offset - prev_offset) == 1
* unaligned reads: (offset - prev_offset) == 0
*/
- prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT;
+ prev_offset = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
if (offset - prev_offset <= 1UL)
goto initial_readahead;
@@ -558,8 +558,8 @@ SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
if (f.file) {
if (f.file->f_mode & FMODE_READ) {
struct address_space *mapping = f.file->f_mapping;
- pgoff_t start = offset >> PAGE_CACHE_SHIFT;
- pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
+ pgoff_t start = offset >> PAGE_SHIFT;
+ pgoff_t end = (offset + count - 1) >> PAGE_SHIFT;
unsigned long len = end - start + 1;
ret = do_readahead(mapping, f.file, start, len);
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 79f3bf047..3ebf9c4c2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -569,19 +569,6 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
}
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
-static void percpu_flush_tlb_batch_pages(void *data)
-{
- /*
- * All TLB entries are flushed on the assumption that it is
- * cheaper to flush all TLBs and let them be refilled than
- * flushing individual PFNs. Note that we do not track mm's
- * to flush as that might simply be multiple full TLB flushes
- * for no gain.
- */
- count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
- flush_tlb_local();
-}
-
/*
* Flush TLB entries for recently unmapped pages from remote CPUs. It is
* important if a PTE was dirty when it was unmapped that it's flushed
@@ -598,15 +585,14 @@ void try_to_unmap_flush(void)
cpu = get_cpu();
- trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL);
-
- if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask))
- percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask);
-
- if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) {
- smp_call_function_many(&tlb_ubc->cpumask,
- percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true);
+ if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) {
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ local_flush_tlb();
+ trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
}
+
+ if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids)
+ flush_tlb_others(&tlb_ubc->cpumask, NULL, 0, TLB_FLUSH_ALL);
cpumask_clear(&tlb_ubc->cpumask);
tlb_ubc->flush_required = false;
tlb_ubc->writable = false;
@@ -1112,6 +1098,8 @@ void page_move_anon_rmap(struct page *page,
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_VMA(!anon_vma, vma);
+ if (IS_ENABLED(CONFIG_DEBUG_VM) && PageTransHuge(page))
+ address &= HPAGE_PMD_MASK;
VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
@@ -1287,21 +1275,17 @@ void page_add_new_anon_rmap(struct page *page,
*/
void page_add_file_rmap(struct page *page)
{
- struct mem_cgroup *memcg;
-
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (atomic_inc_and_test(&page->_mapcount)) {
__inc_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
+ mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
}
static void page_remove_file_rmap(struct page *page)
{
- struct mem_cgroup *memcg;
-
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
/* Hugepages are not counted in NR_FILE_MAPPED for now. */
if (unlikely(PageHuge(page))) {
@@ -1320,12 +1304,12 @@ static void page_remove_file_rmap(struct page *page)
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
__dec_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
+ mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
out:
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
}
static void page_remove_anon_compound_rmap(struct page *page)
@@ -1435,6 +1419,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
goto out;
+ if (flags & TTU_SPLIT_HUGE_PMD) {
+ split_huge_pmd_address(vma, address,
+ flags & TTU_MIGRATION, page);
+ /* check if we have anything to do after split */
+ if (page_mapcount(page) == 0)
+ goto out;
+ }
+
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
goto out;
@@ -1551,7 +1543,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
discard:
page_remove_rmap(page, PageHuge(page));
- page_cache_release(page);
+ put_page(page);
out_unmap:
pte_unmap_unlock(pte, ptl);
@@ -1580,10 +1572,10 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
return is_vma_temporary_stack(vma);
}
-static int page_not_mapped(struct page *page)
+static int page_mapcount_is_zero(struct page *page)
{
- return !page_mapped(page);
-};
+ return !page_mapcount(page);
+}
/**
* try_to_unmap - try to remove all page table mappings to a page
@@ -1610,12 +1602,10 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
.arg = &rp,
- .done = page_not_mapped,
+ .done = page_mapcount_is_zero,
.anon_lock = page_lock_anon_vma_read,
};
- VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page);
-
/*
* During exec, a temporary VMA is setup and later moved.
* The VMA is moved under the anon_vma lock but not the
@@ -1627,9 +1617,12 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page))
rwc.invalid_vma = invalid_migration_vma;
- ret = rmap_walk(page, &rwc);
+ if (flags & TTU_RMAP_LOCKED)
+ ret = rmap_walk_locked(page, &rwc);
+ else
+ ret = rmap_walk(page, &rwc);
- if (ret != SWAP_MLOCK && !page_mapped(page)) {
+ if (ret != SWAP_MLOCK && !page_mapcount(page)) {
ret = SWAP_SUCCESS;
if (rp.lazyfreed && !PageDirty(page))
ret = SWAP_LZFREE;
@@ -1637,6 +1630,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
return ret;
}
+static int page_not_mapped(struct page *page)
+{
+ return !page_mapped(page);
+};
+
/**
* try_to_munlock - try to munlock a page
* @page: the page to be munlocked
@@ -1719,14 +1717,21 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
*/
-static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
+static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
+ bool locked)
{
struct anon_vma *anon_vma;
pgoff_t pgoff;
struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
- anon_vma = rmap_walk_anon_lock(page, rwc);
+ if (locked) {
+ anon_vma = page_anon_vma(page);
+ /* Did the anon_vma disappear under us? */
+ VM_BUG_ON_PAGE(!anon_vma, page);
+ } else {
+ anon_vma = rmap_walk_anon_lock(page, rwc);
+ }
if (!anon_vma)
return ret;
@@ -1746,7 +1751,9 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
if (rwc->done && rwc->done(page))
break;
}
- anon_vma_unlock_read(anon_vma);
+
+ if (!locked)
+ anon_vma_unlock_read(anon_vma);
return ret;
}
@@ -1763,9 +1770,10 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
*/
-static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
+static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
+ bool locked)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = page_mapping(page);
pgoff_t pgoff;
struct vm_area_struct *vma;
int ret = SWAP_AGAIN;
@@ -1782,7 +1790,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
return ret;
pgoff = page_to_pgoff(page);
- i_mmap_lock_read(mapping);
+ if (!locked)
+ i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
@@ -1799,7 +1808,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
}
done:
- i_mmap_unlock_read(mapping);
+ if (!locked)
+ i_mmap_unlock_read(mapping);
return ret;
}
@@ -1808,9 +1818,20 @@ int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
if (unlikely(PageKsm(page)))
return rmap_walk_ksm(page, rwc);
else if (PageAnon(page))
- return rmap_walk_anon(page, rwc);
+ return rmap_walk_anon(page, rwc, false);
+ else
+ return rmap_walk_file(page, rwc, false);
+}
+
+/* Like rmap_walk, but caller holds relevant rmap lock */
+int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
+{
+ /* no ksm support for now */
+ VM_BUG_ON_PAGE(PageKsm(page), page);
+ if (PageAnon(page))
+ return rmap_walk_anon(page, rwc, true);
else
- return rmap_walk_file(page, rwc);
+ return rmap_walk_file(page, rwc, true);
}
#ifdef CONFIG_HUGETLB_PAGE
diff --git a/mm/shmem.c b/mm/shmem.c
index f64ab5f8c..83e44bb6f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -75,8 +75,8 @@ static struct vfsmount *shm_mnt;
#include "internal.h"
-#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
-#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
+#define BLOCKS_PER_PAGE (PAGE_SIZE/512)
+#define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT)
/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20
@@ -176,13 +176,13 @@ static inline int shmem_reacct_size(unsigned long flags,
static inline int shmem_acct_block(unsigned long flags)
{
return (flags & VM_NORESERVE) ?
- security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0;
+ security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_SIZE)) : 0;
}
static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
if (flags & VM_NORESERVE)
- vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
+ vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}
static const struct super_operations shmem_ops;
@@ -300,7 +300,7 @@ static int shmem_add_to_page_cache(struct page *page,
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- page_cache_get(page);
+ get_page(page);
page->mapping = mapping;
page->index = index;
@@ -318,7 +318,7 @@ static int shmem_add_to_page_cache(struct page *page,
} else {
page->mapping = NULL;
spin_unlock_irq(&mapping->tree_lock);
- page_cache_release(page);
+ put_page(page);
}
return error;
}
@@ -338,7 +338,7 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
__dec_zone_page_state(page, NR_FILE_PAGES);
__dec_zone_page_state(page, NR_SHMEM);
spin_unlock_irq(&mapping->tree_lock);
- page_cache_release(page);
+ put_page(page);
BUG_ON(error);
}
@@ -376,28 +376,23 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
rcu_read_lock();
-restart:
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
if (iter.index >= end)
break;
page = radix_tree_deref_slot(slot);
- /*
- * This should only be possible to happen at index 0, so we
- * don't need to reset the counter, nor do we risk infinite
- * restarts.
- */
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
if (radix_tree_exceptional_entry(page))
swapped++;
if (need_resched()) {
cond_resched_rcu();
- start = iter.index + 1;
- goto restart;
+ slot = radix_tree_iter_next(&iter);
}
}
@@ -479,10 +474,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
- pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
- unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
- unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
+ pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ pgoff_t end = (lend + 1) >> PAGE_SHIFT;
+ unsigned int partial_start = lstart & (PAGE_SIZE - 1);
+ unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1);
struct pagevec pvec;
pgoff_t indices[PAGEVEC_SIZE];
long nr_swaps_freed = 0;
@@ -535,7 +530,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
struct page *page = NULL;
shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
if (page) {
- unsigned int top = PAGE_CACHE_SIZE;
+ unsigned int top = PAGE_SIZE;
if (start > end) {
top = partial_end;
partial_end = 0;
@@ -543,7 +538,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
zero_user_segment(page, partial_start, top);
set_page_dirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
}
if (partial_end) {
@@ -553,7 +548,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
zero_user_segment(page, 0, partial_end);
set_page_dirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
}
if (start >= end)
@@ -838,7 +833,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
mem_cgroup_commit_charge(page, memcg, true, false);
out:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return error;
}
@@ -1085,7 +1080,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
if (!newpage)
return -ENOMEM;
- page_cache_get(newpage);
+ get_page(newpage);
copy_highpage(newpage, oldpage);
flush_dcache_page(newpage);
@@ -1116,7 +1111,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
*/
oldpage = newpage;
} else {
- mem_cgroup_replace_page(oldpage, newpage);
+ mem_cgroup_migrate(oldpage, newpage);
lru_cache_add_anon(newpage);
*pagep = newpage;
}
@@ -1125,8 +1120,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
set_page_private(oldpage, 0);
unlock_page(oldpage);
- page_cache_release(oldpage);
- page_cache_release(oldpage);
+ put_page(oldpage);
+ put_page(oldpage);
return error;
}
@@ -1150,7 +1145,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
int once = 0;
int alloced = 0;
- if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
+ if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
return -EFBIG;
repeat:
swap.val = 0;
@@ -1161,7 +1156,7 @@ repeat:
}
if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
- ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+ ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
error = -EINVAL;
goto unlock;
}
@@ -1174,7 +1169,7 @@ repeat:
if (sgp != SGP_READ)
goto clear;
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
page = NULL;
}
if (page || (sgp == SGP_READ && !swap.val)) {
@@ -1332,7 +1327,7 @@ clear:
/* Perhaps the file has been truncated since we checked */
if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
- ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+ ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
if (alloced) {
ClearPageDirty(page);
delete_from_page_cache(page);
@@ -1360,7 +1355,7 @@ failed:
unlock:
if (page) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
if (error == -ENOSPC && !once++) {
info = SHMEM_I(inode);
@@ -1584,7 +1579,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
{
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
/* i_mutex is held by caller */
if (unlikely(info->seals)) {
@@ -1608,16 +1603,16 @@ shmem_write_end(struct file *file, struct address_space *mapping,
i_size_write(inode, pos + copied);
if (!PageUptodate(page)) {
- if (copied < PAGE_CACHE_SIZE) {
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ if (copied < PAGE_SIZE) {
+ unsigned from = pos & (PAGE_SIZE - 1);
zero_user_segments(page, 0, from,
- from + copied, PAGE_CACHE_SIZE);
+ from + copied, PAGE_SIZE);
}
SetPageUptodate(page);
}
set_page_dirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return copied;
}
@@ -1642,8 +1637,8 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (!iter_is_iovec(to))
sgp = SGP_DIRTY;
- index = *ppos >> PAGE_CACHE_SHIFT;
- offset = *ppos & ~PAGE_CACHE_MASK;
+ index = *ppos >> PAGE_SHIFT;
+ offset = *ppos & ~PAGE_MASK;
for (;;) {
struct page *page = NULL;
@@ -1651,11 +1646,11 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
unsigned long nr, ret;
loff_t i_size = i_size_read(inode);
- end_index = i_size >> PAGE_CACHE_SHIFT;
+ end_index = i_size >> PAGE_SHIFT;
if (index > end_index)
break;
if (index == end_index) {
- nr = i_size & ~PAGE_CACHE_MASK;
+ nr = i_size & ~PAGE_MASK;
if (nr <= offset)
break;
}
@@ -1673,14 +1668,14 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
* We must evaluate after, since reads (unlike writes)
* are called without i_mutex protection against truncate
*/
- nr = PAGE_CACHE_SIZE;
+ nr = PAGE_SIZE;
i_size = i_size_read(inode);
- end_index = i_size >> PAGE_CACHE_SHIFT;
+ end_index = i_size >> PAGE_SHIFT;
if (index == end_index) {
- nr = i_size & ~PAGE_CACHE_MASK;
+ nr = i_size & ~PAGE_MASK;
if (nr <= offset) {
if (page)
- page_cache_release(page);
+ put_page(page);
break;
}
}
@@ -1701,7 +1696,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
mark_page_accessed(page);
} else {
page = ZERO_PAGE(0);
- page_cache_get(page);
+ get_page(page);
}
/*
@@ -1711,10 +1706,10 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
ret = copy_page_to_iter(page, offset, nr, to);
retval += ret;
offset += ret;
- index += offset >> PAGE_CACHE_SHIFT;
- offset &= ~PAGE_CACHE_MASK;
+ index += offset >> PAGE_SHIFT;
+ offset &= ~PAGE_MASK;
- page_cache_release(page);
+ put_page(page);
if (!iov_iter_count(to))
break;
if (ret < nr) {
@@ -1724,7 +1719,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
cond_resched();
}
- *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
+ *ppos = ((loff_t) index << PAGE_SHIFT) + offset;
file_accessed(file);
return retval ? retval : error;
}
@@ -1762,9 +1757,9 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
if (splice_grow_spd(pipe, &spd))
return -ENOMEM;
- index = *ppos >> PAGE_CACHE_SHIFT;
- loff = *ppos & ~PAGE_CACHE_MASK;
- req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ index = *ppos >> PAGE_SHIFT;
+ loff = *ppos & ~PAGE_MASK;
+ req_pages = (len + loff + PAGE_SIZE - 1) >> PAGE_SHIFT;
nr_pages = min(req_pages, spd.nr_pages_max);
spd.nr_pages = find_get_pages_contig(mapping, index,
@@ -1781,7 +1776,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
index++;
}
- index = *ppos >> PAGE_CACHE_SHIFT;
+ index = *ppos >> PAGE_SHIFT;
nr_pages = spd.nr_pages;
spd.nr_pages = 0;
@@ -1791,7 +1786,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
if (!len)
break;
- this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
+ this_len = min_t(unsigned long, len, PAGE_SIZE - loff);
page = spd.pages[page_nr];
if (!PageUptodate(page) || page->mapping != mapping) {
@@ -1800,19 +1795,19 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
if (error)
break;
unlock_page(page);
- page_cache_release(spd.pages[page_nr]);
+ put_page(spd.pages[page_nr]);
spd.pages[page_nr] = page;
}
isize = i_size_read(inode);
- end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ end_index = (isize - 1) >> PAGE_SHIFT;
if (unlikely(!isize || index > end_index))
break;
if (end_index == index) {
unsigned int plen;
- plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+ plen = ((isize - 1) & ~PAGE_MASK) + 1;
if (plen <= loff)
break;
@@ -1829,7 +1824,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
}
while (page_nr < nr_pages)
- page_cache_release(spd.pages[page_nr++]);
+ put_page(spd.pages[page_nr++]);
if (spd.nr_pages)
error = splice_to_pipe(pipe, &spd);
@@ -1911,10 +1906,10 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
else if (offset >= inode->i_size)
offset = -ENXIO;
else {
- start = offset >> PAGE_CACHE_SHIFT;
- end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ start = offset >> PAGE_SHIFT;
+ end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
new_offset = shmem_seek_hole_data(mapping, start, end, whence);
- new_offset <<= PAGE_CACHE_SHIFT;
+ new_offset <<= PAGE_SHIFT;
if (new_offset > offset) {
if (new_offset < inode->i_size)
offset = new_offset;
@@ -1949,12 +1944,13 @@ static void shmem_tag_pins(struct address_space *mapping)
start = 0;
rcu_read_lock();
-restart:
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
page = radix_tree_deref_slot(slot);
if (!page || radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
} else if (page_count(page) - page_mapcount(page) > 1) {
spin_lock_irq(&mapping->tree_lock);
radix_tree_tag_set(&mapping->page_tree, iter.index,
@@ -1964,8 +1960,7 @@ restart:
if (need_resched()) {
cond_resched_rcu();
- start = iter.index + 1;
- goto restart;
+ slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
@@ -2002,14 +1997,15 @@ static int shmem_wait_for_pins(struct address_space *mapping)
start = 0;
rcu_read_lock();
-restart:
radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter,
start, SHMEM_TAG_PINNED) {
page = radix_tree_deref_slot(slot);
if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
page = NULL;
}
@@ -2034,8 +2030,7 @@ restart:
continue_resched:
if (need_resched()) {
cond_resched_rcu();
- start = iter.index + 1;
- goto restart;
+ slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
@@ -2210,8 +2205,8 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
goto out;
}
- start = offset >> PAGE_CACHE_SHIFT;
- end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ start = offset >> PAGE_SHIFT;
+ end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
/* Try to avoid a swapstorm if len is impossible to satisfy */
if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
error = -ENOSPC;
@@ -2244,8 +2239,8 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
if (error) {
/* Remove the !PageUptodate pages we added */
shmem_undo_range(inode,
- (loff_t)start << PAGE_CACHE_SHIFT,
- (loff_t)index << PAGE_CACHE_SHIFT, true);
+ (loff_t)start << PAGE_SHIFT,
+ (loff_t)index << PAGE_SHIFT, true);
goto undone;
}
@@ -2266,7 +2261,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
*/
set_page_dirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
cond_resched();
}
@@ -2287,7 +2282,7 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
buf->f_type = TMPFS_MAGIC;
- buf->f_bsize = PAGE_CACHE_SIZE;
+ buf->f_bsize = PAGE_SIZE;
buf->f_namelen = NAME_MAX;
if (sbinfo->max_blocks) {
buf->f_blocks = sbinfo->max_blocks;
@@ -2530,7 +2525,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
struct shmem_inode_info *info;
len = strlen(symname) + 1;
- if (len > PAGE_CACHE_SIZE)
+ if (len > PAGE_SIZE)
return -ENAMETOOLONG;
inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE, 0);
@@ -2569,7 +2564,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
SetPageUptodate(page);
set_page_dirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
dir->i_size += BOGO_DIRENT_SIZE;
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
@@ -2825,9 +2820,8 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
if ((value = strchr(this_char,'=')) != NULL) {
*value++ = 0;
} else {
- printk(KERN_ERR
- "tmpfs: No value for mount option '%s'\n",
- this_char);
+ pr_err("tmpfs: No value for mount option '%s'\n",
+ this_char);
goto error;
}
@@ -2843,7 +2837,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
if (*rest)
goto bad_val;
sbinfo->max_blocks =
- DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
+ DIV_ROUND_UP(size, PAGE_SIZE);
} else if (!strcmp(this_char,"nr_blocks")) {
sbinfo->max_blocks = memparse(value, &rest);
if (*rest)
@@ -2882,8 +2876,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
if (mpol_parse_str(value, &mpol))
goto bad_val;
} else {
- printk(KERN_ERR "tmpfs: Bad mount option %s\n",
- this_char);
+ pr_err("tmpfs: Bad mount option %s\n", this_char);
goto error;
}
}
@@ -2891,7 +2884,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
return 0;
bad_val:
- printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
+ pr_err("tmpfs: Bad value '%s' for mount option '%s'\n",
value, this_char);
error:
mpol_put(mpol);
@@ -2949,7 +2942,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
if (sbinfo->max_blocks != shmem_default_max_blocks())
seq_printf(seq, ",size=%luk",
- sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10));
+ sbinfo->max_blocks << (PAGE_SHIFT - 10));
if (sbinfo->max_inodes != shmem_default_max_inodes())
seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
@@ -3091,8 +3084,8 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
sbinfo->free_inodes = sbinfo->max_inodes;
sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = TMPFS_MAGIC;
sb->s_op = &shmem_ops;
sb->s_time_gran = 1;
@@ -3288,14 +3281,14 @@ int __init shmem_init(void)
error = register_filesystem(&shmem_fs_type);
if (error) {
- printk(KERN_ERR "Could not register tmpfs\n");
+ pr_err("Could not register tmpfs\n");
goto out2;
}
shm_mnt = kern_mount(&shmem_fs_type);
if (IS_ERR(shm_mnt)) {
error = PTR_ERR(shm_mnt);
- printk(KERN_ERR "Could not kern_mount tmpfs\n");
+ pr_err("Could not kern_mount tmpfs\n");
goto out1;
}
return 0;
diff --git a/mm/slab.c b/mm/slab.c
index 621fbcb35..17e284897 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -169,12 +169,6 @@ typedef unsigned short freelist_idx_t;
#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1)
/*
- * true if a page was allocated from pfmemalloc reserves for network-based
- * swap
- */
-static bool pfmemalloc_active __read_mostly;
-
-/*
* struct array_cache
*
* Purpose:
@@ -195,10 +189,6 @@ struct array_cache {
* Must have this definition in here for the proper
* alignment of array_cache. Also simplifies accessing
* the entries.
- *
- * Entries should not be directly dereferenced as
- * entries belonging to slabs marked pfmemalloc will
- * have the lower bits set SLAB_OBJ_PFMEMALLOC
*/
};
@@ -207,33 +197,6 @@ struct alien_cache {
struct array_cache ac;
};
-#define SLAB_OBJ_PFMEMALLOC 1
-static inline bool is_obj_pfmemalloc(void *objp)
-{
- return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
-}
-
-static inline void set_obj_pfmemalloc(void **objp)
-{
- *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
- return;
-}
-
-static inline void clear_obj_pfmemalloc(void **objp)
-{
- *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
-}
-
-/*
- * bootstrap: The caches do not work without cpuarrays anymore, but the
- * cpuarrays are allocated from the generic caches...
- */
-#define BOOT_CPUCACHE_ENTRIES 1
-struct arraycache_init {
- struct array_cache cache;
- void *entries[BOOT_CPUCACHE_ENTRIES];
-};
-
/*
* Need this for bootstrapping a per node allocator.
*/
@@ -280,9 +243,10 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
} while (0)
+#define CFLGS_OBJFREELIST_SLAB (0x40000000UL)
#define CFLGS_OFF_SLAB (0x80000000UL)
+#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB)
#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
-#define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1))
#define BATCHREFILL_LIMIT 16
/*
@@ -390,36 +354,26 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
#endif
-#define OBJECT_FREE (0)
-#define OBJECT_ACTIVE (1)
-
#ifdef CONFIG_DEBUG_SLAB_LEAK
-static void set_obj_status(struct page *page, int idx, int val)
+static inline bool is_store_user_clean(struct kmem_cache *cachep)
{
- int freelist_size;
- char *status;
- struct kmem_cache *cachep = page->slab_cache;
-
- freelist_size = cachep->num * sizeof(freelist_idx_t);
- status = (char *)page->freelist + freelist_size;
- status[idx] = val;
+ return atomic_read(&cachep->store_user_clean) == 1;
}
-static inline unsigned int get_obj_status(struct page *page, int idx)
+static inline void set_store_user_clean(struct kmem_cache *cachep)
{
- int freelist_size;
- char *status;
- struct kmem_cache *cachep = page->slab_cache;
-
- freelist_size = cachep->num * sizeof(freelist_idx_t);
- status = (char *)page->freelist + freelist_size;
+ atomic_set(&cachep->store_user_clean, 1);
+}
- return status[idx];
+static inline void set_store_user_dirty(struct kmem_cache *cachep)
+{
+ if (is_store_user_clean(cachep))
+ atomic_set(&cachep->store_user_clean, 0);
}
#else
-static inline void set_obj_status(struct page *page, int idx, int val) {}
+static inline void set_store_user_dirty(struct kmem_cache *cachep) {}
#endif
@@ -457,6 +411,7 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache,
return reciprocal_divide(offset, cache->reciprocal_buffer_size);
}
+#define BOOT_CPUCACHE_ENTRIES 1
/* internal cache of cache description objs */
static struct kmem_cache kmem_cache_boot = {
.batchcount = 1,
@@ -475,61 +430,13 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
return this_cpu_ptr(cachep->cpu_cache);
}
-static size_t calculate_freelist_size(int nr_objs, size_t align)
-{
- size_t freelist_size;
-
- freelist_size = nr_objs * sizeof(freelist_idx_t);
- if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
- freelist_size += nr_objs * sizeof(char);
-
- if (align)
- freelist_size = ALIGN(freelist_size, align);
-
- return freelist_size;
-}
-
-static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
- size_t idx_size, size_t align)
-{
- int nr_objs;
- size_t remained_size;
- size_t freelist_size;
- int extra_space = 0;
-
- if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
- extra_space = sizeof(char);
- /*
- * Ignore padding for the initial guess. The padding
- * is at most @align-1 bytes, and @buffer_size is at
- * least @align. In the worst case, this result will
- * be one greater than the number of objects that fit
- * into the memory allocation when taking the padding
- * into account.
- */
- nr_objs = slab_size / (buffer_size + idx_size + extra_space);
-
- /*
- * This calculated number will be either the right
- * amount, or one greater than what we want.
- */
- remained_size = slab_size - nr_objs * buffer_size;
- freelist_size = calculate_freelist_size(nr_objs, align);
- if (remained_size < freelist_size)
- nr_objs--;
-
- return nr_objs;
-}
-
/*
* Calculate the number of objects and left-over bytes for a given buffer size.
*/
-static void cache_estimate(unsigned long gfporder, size_t buffer_size,
- size_t align, int flags, size_t *left_over,
- unsigned int *num)
+static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size,
+ unsigned long flags, size_t *left_over)
{
- int nr_objs;
- size_t mgmt_size;
+ unsigned int num;
size_t slab_size = PAGE_SIZE << gfporder;
/*
@@ -537,26 +444,28 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
* on it. For the latter case, the memory allocated for a
* slab is used for:
*
- * - One unsigned int for each object
- * - Padding to respect alignment of @align
* - @buffer_size bytes for each object
+ * - One freelist_idx_t for each object
+ *
+ * We don't need to consider alignment of freelist because
+ * freelist will be at the end of slab page. The objects will be
+ * at the correct alignment.
*
* If the slab management structure is off the slab, then the
* alignment will already be calculated into the size. Because
* the slabs are all pages aligned, the objects will be at the
* correct alignment when allocated.
*/
- if (flags & CFLGS_OFF_SLAB) {
- mgmt_size = 0;
- nr_objs = slab_size / buffer_size;
-
+ if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) {
+ num = slab_size / buffer_size;
+ *left_over = slab_size % buffer_size;
} else {
- nr_objs = calculate_nr_objs(slab_size, buffer_size,
- sizeof(freelist_idx_t), align);
- mgmt_size = calculate_freelist_size(nr_objs, align);
+ num = slab_size / (buffer_size + sizeof(freelist_idx_t));
+ *left_over = slab_size %
+ (buffer_size + sizeof(freelist_idx_t));
}
- *num = nr_objs;
- *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
+
+ return num;
}
#if DEBUG
@@ -565,7 +474,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
static void __slab_error(const char *function, struct kmem_cache *cachep,
char *msg)
{
- printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
+ pr_err("slab error in %s(): cache `%s': %s\n",
function, cachep->name, msg);
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
@@ -687,120 +596,21 @@ static struct array_cache *alloc_arraycache(int node, int entries,
return ac;
}
-static inline bool is_slab_pfmemalloc(struct page *page)
+static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep,
+ struct page *page, void *objp)
{
- return PageSlabPfmemalloc(page);
-}
-
-/* Clears pfmemalloc_active if no slabs have pfmalloc set */
-static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
- struct array_cache *ac)
-{
- struct kmem_cache_node *n = get_node(cachep, numa_mem_id());
- struct page *page;
- unsigned long flags;
-
- if (!pfmemalloc_active)
- return;
-
- spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->slabs_full, lru)
- if (is_slab_pfmemalloc(page))
- goto out;
-
- list_for_each_entry(page, &n->slabs_partial, lru)
- if (is_slab_pfmemalloc(page))
- goto out;
-
- list_for_each_entry(page, &n->slabs_free, lru)
- if (is_slab_pfmemalloc(page))
- goto out;
-
- pfmemalloc_active = false;
-out:
- spin_unlock_irqrestore(&n->list_lock, flags);
-}
-
-static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
- gfp_t flags, bool force_refill)
-{
- int i;
- void *objp = ac->entry[--ac->avail];
-
- /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
- if (unlikely(is_obj_pfmemalloc(objp))) {
- struct kmem_cache_node *n;
-
- if (gfp_pfmemalloc_allowed(flags)) {
- clear_obj_pfmemalloc(&objp);
- return objp;
- }
-
- /* The caller cannot use PFMEMALLOC objects, find another one */
- for (i = 0; i < ac->avail; i++) {
- /* If a !PFMEMALLOC object is found, swap them */
- if (!is_obj_pfmemalloc(ac->entry[i])) {
- objp = ac->entry[i];
- ac->entry[i] = ac->entry[ac->avail];
- ac->entry[ac->avail] = objp;
- return objp;
- }
- }
-
- /*
- * If there are empty slabs on the slabs_free list and we are
- * being forced to refill the cache, mark this one !pfmemalloc.
- */
- n = get_node(cachep, numa_mem_id());
- if (!list_empty(&n->slabs_free) && force_refill) {
- struct page *page = virt_to_head_page(objp);
- ClearPageSlabPfmemalloc(page);
- clear_obj_pfmemalloc(&objp);
- recheck_pfmemalloc_active(cachep, ac);
- return objp;
- }
-
- /* No !PFMEMALLOC objects available */
- ac->avail++;
- objp = NULL;
- }
-
- return objp;
-}
-
-static inline void *ac_get_obj(struct kmem_cache *cachep,
- struct array_cache *ac, gfp_t flags, bool force_refill)
-{
- void *objp;
-
- if (unlikely(sk_memalloc_socks()))
- objp = __ac_get_obj(cachep, ac, flags, force_refill);
- else
- objp = ac->entry[--ac->avail];
-
- return objp;
-}
-
-static noinline void *__ac_put_obj(struct kmem_cache *cachep,
- struct array_cache *ac, void *objp)
-{
- if (unlikely(pfmemalloc_active)) {
- /* Some pfmemalloc slabs exist, check if this is one */
- struct page *page = virt_to_head_page(objp);
- if (PageSlabPfmemalloc(page))
- set_obj_pfmemalloc(&objp);
- }
+ struct kmem_cache_node *n;
+ int page_node;
+ LIST_HEAD(list);
- return objp;
-}
+ page_node = page_to_nid(page);
+ n = get_node(cachep, page_node);
-static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
- void *objp)
-{
- if (unlikely(sk_memalloc_socks()))
- objp = __ac_put_obj(cachep, ac, objp);
+ spin_lock(&n->list_lock);
+ free_block(cachep, &objp, 1, page_node, &list);
+ spin_unlock(&n->list_lock);
- ac->entry[ac->avail++] = objp;
+ slabs_destroy(cachep, &list);
}
/*
@@ -860,7 +670,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
static inline gfp_t gfp_exact_node(gfp_t flags)
{
- return flags;
+ return flags & ~__GFP_NOFAIL;
}
#else /* CONFIG_NUMA */
@@ -1003,7 +813,7 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
STATS_INC_ACOVERFLOW(cachep);
__drain_alien_cache(cachep, ac, page_node, &list);
}
- ac_put_obj(cachep, ac, objp);
+ ac->entry[ac->avail++] = objp;
spin_unlock(&alien->lock);
slabs_destroy(cachep, &list);
} else {
@@ -1031,12 +841,12 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
}
/*
- * Construct gfp mask to allocate from a specific node but do not direct reclaim
- * or warn about failures. kswapd may still wake to reclaim in the background.
+ * Construct gfp mask to allocate from a specific node but do not reclaim or
+ * warn about failures.
*/
static inline gfp_t gfp_exact_node(gfp_t flags)
{
- return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM;
+ return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
}
#endif
@@ -1540,10 +1350,9 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs))
return;
- printk(KERN_WARNING
- "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
- nodeid, gfpflags);
- printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n",
+ pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
+ nodeid, gfpflags, &gfpflags);
+ pr_warn(" cache: %s, object size: %d, order: %d\n",
cachep->name, cachep->size, cachep->gfporder);
for_each_kmem_cache_node(cachep, node, n) {
@@ -1567,8 +1376,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
num_slabs += active_slabs;
num_objs = num_slabs * cachep->num;
- printk(KERN_WARNING
- " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
+ pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
node, active_slabs, num_slabs, active_objs, num_objs,
free_objects);
}
@@ -1604,10 +1412,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
return NULL;
}
- /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
- if (page_is_pfmemalloc(page))
- pfmemalloc_active = true;
-
nr_pages = (1 << cachep->gfporder);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
add_zone_page_state(page_zone(page),
@@ -1615,8 +1419,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
else
add_zone_page_state(page_zone(page),
NR_SLAB_UNRECLAIMABLE, nr_pages);
+
__SetPageSlab(page);
- if (page_is_pfmemalloc(page))
+ /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
+ if (sk_memalloc_socks() && page_is_pfmemalloc(page))
SetPageSlabPfmemalloc(page);
if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
@@ -1636,9 +1442,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
*/
static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
{
- const unsigned long nr_freed = (1 << cachep->gfporder);
+ int order = cachep->gfporder;
+ unsigned long nr_freed = (1 << order);
- kmemcheck_free_shadow(page, cachep->gfporder);
+ kmemcheck_free_shadow(page, order);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
sub_zone_page_state(page_zone(page),
@@ -1655,7 +1462,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += nr_freed;
- __free_kmem_pages(page, cachep->gfporder);
+ memcg_uncharge_slab(page, order, cachep);
+ __free_pages(page, order);
}
static void kmem_rcu_free(struct rcu_head *head)
@@ -1670,6 +1478,14 @@ static void kmem_rcu_free(struct rcu_head *head)
}
#if DEBUG
+static bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
+{
+ if (debug_pagealloc_enabled() && OFF_SLAB(cachep) &&
+ (cachep->size % PAGE_SIZE) == 0)
+ return true;
+
+ return false;
+}
#ifdef CONFIG_DEBUG_PAGEALLOC
static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
@@ -1703,6 +1519,23 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
}
*addr++ = 0x87654321;
}
+
+static void slab_kernel_map(struct kmem_cache *cachep, void *objp,
+ int map, unsigned long caller)
+{
+ if (!is_debug_pagealloc_cache(cachep))
+ return;
+
+ if (caller)
+ store_stackinfo(cachep, objp, caller);
+
+ kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map);
+}
+
+#else
+static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp,
+ int map, unsigned long caller) {}
+
#endif
static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
@@ -1720,7 +1553,7 @@ static void dump_line(char *data, int offset, int limit)
unsigned char error = 0;
int bad_count = 0;
- printk(KERN_ERR "%03x: ", offset);
+ pr_err("%03x: ", offset);
for (i = 0; i < limit; i++) {
if (data[offset + i] != POISON_FREE) {
error = data[offset + i];
@@ -1733,13 +1566,11 @@ static void dump_line(char *data, int offset, int limit)
if (bad_count == 1) {
error ^= POISON_FREE;
if (!(error & (error - 1))) {
- printk(KERN_ERR "Single bit error detected. Probably "
- "bad RAM.\n");
+ pr_err("Single bit error detected. Probably bad RAM.\n");
#ifdef CONFIG_X86
- printk(KERN_ERR "Run memtest86+ or a similar memory "
- "test tool.\n");
+ pr_err("Run memtest86+ or a similar memory test tool.\n");
#else
- printk(KERN_ERR "Run a memory test tool.\n");
+ pr_err("Run a memory test tool.\n");
#endif
}
}
@@ -1754,13 +1585,13 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
char *realobj;
if (cachep->flags & SLAB_RED_ZONE) {
- printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n",
- *dbg_redzone1(cachep, objp),
- *dbg_redzone2(cachep, objp));
+ pr_err("Redzone: 0x%llx/0x%llx\n",
+ *dbg_redzone1(cachep, objp),
+ *dbg_redzone2(cachep, objp));
}
if (cachep->flags & SLAB_STORE_USER) {
- printk(KERN_ERR "Last user: [<%p>](%pSR)\n",
+ pr_err("Last user: [<%p>](%pSR)\n",
*dbg_userword(cachep, objp),
*dbg_userword(cachep, objp));
}
@@ -1781,6 +1612,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
int size, i;
int lines = 0;
+ if (is_debug_pagealloc_cache(cachep))
+ return;
+
realobj = (char *)objp + obj_offset(cachep);
size = cachep->object_size;
@@ -1793,9 +1627,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
/* Mismatch ! */
/* Print header */
if (lines == 0) {
- printk(KERN_ERR
- "Slab corruption (%s): %s start=%p, len=%d\n",
- print_tainted(), cachep->name, realobj, size);
+ pr_err("Slab corruption (%s): %s start=%p, len=%d\n",
+ print_tainted(), cachep->name,
+ realobj, size);
print_objinfo(cachep, objp, 0);
}
/* Hexdump the affected line */
@@ -1822,15 +1656,13 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
if (objnr) {
objp = index_to_obj(cachep, page, objnr - 1);
realobj = (char *)objp + obj_offset(cachep);
- printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
- realobj, size);
+ pr_err("Prev obj: start=%p, len=%d\n", realobj, size);
print_objinfo(cachep, objp, 2);
}
if (objnr + 1 < cachep->num) {
objp = index_to_obj(cachep, page, objnr + 1);
realobj = (char *)objp + obj_offset(cachep);
- printk(KERN_ERR "Next obj: start=%p, len=%d\n",
- realobj, size);
+ pr_err("Next obj: start=%p, len=%d\n", realobj, size);
print_objinfo(cachep, objp, 2);
}
}
@@ -1842,28 +1674,24 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
struct page *page)
{
int i;
+
+ if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) {
+ poison_obj(cachep, page->freelist - obj_offset(cachep),
+ POISON_FREE);
+ }
+
for (i = 0; i < cachep->num; i++) {
void *objp = index_to_obj(cachep, page, i);
if (cachep->flags & SLAB_POISON) {
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if (cachep->size % PAGE_SIZE == 0 &&
- OFF_SLAB(cachep))
- kernel_map_pages(virt_to_page(objp),
- cachep->size / PAGE_SIZE, 1);
- else
- check_poison_obj(cachep, objp);
-#else
check_poison_obj(cachep, objp);
-#endif
+ slab_kernel_map(cachep, objp, 1, 0);
}
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
- slab_error(cachep, "start of a freed object "
- "was overwritten");
+ slab_error(cachep, "start of a freed object was overwritten");
if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
- slab_error(cachep, "end of a freed object "
- "was overwritten");
+ slab_error(cachep, "end of a freed object was overwritten");
}
}
}
@@ -1916,7 +1744,6 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
* calculate_slab_order - calculate size (page order) of slabs
* @cachep: pointer to the cache that is being created
* @size: size of objects to be created in this cache.
- * @align: required alignment for the objects.
* @flags: slab allocation flags
*
* Also calculates the number of objects per slab.
@@ -1926,9 +1753,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
* towards high-order requests, this should be changed.
*/
static size_t calculate_slab_order(struct kmem_cache *cachep,
- size_t size, size_t align, unsigned long flags)
+ size_t size, unsigned long flags)
{
- unsigned long offslab_limit;
size_t left_over = 0;
int gfporder;
@@ -1936,7 +1762,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
unsigned int num;
size_t remainder;
- cache_estimate(gfporder, size, align, flags, &remainder, &num);
+ num = cache_estimate(gfporder, size, flags, &remainder);
if (!num)
continue;
@@ -1945,19 +1771,24 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
break;
if (flags & CFLGS_OFF_SLAB) {
- size_t freelist_size_per_obj = sizeof(freelist_idx_t);
+ struct kmem_cache *freelist_cache;
+ size_t freelist_size;
+
+ freelist_size = num * sizeof(freelist_idx_t);
+ freelist_cache = kmalloc_slab(freelist_size, 0u);
+ if (!freelist_cache)
+ continue;
+
/*
- * Max number of objs-per-slab for caches which
- * use off-slab slabs. Needed to avoid a possible
- * looping condition in cache_grow().
+ * Needed to avoid possible looping condition
+ * in cache_grow()
*/
- if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
- freelist_size_per_obj += sizeof(char);
- offslab_limit = size;
- offslab_limit /= freelist_size_per_obj;
+ if (OFF_SLAB(freelist_cache))
+ continue;
- if (num > offslab_limit)
- break;
+ /* check if off slab has enough benefit */
+ if (freelist_cache->size > cachep->size / 2)
+ continue;
}
/* Found something acceptable - save it away */
@@ -2075,6 +1906,79 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
return cachep;
}
+static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
+ size_t size, unsigned long flags)
+{
+ size_t left;
+
+ cachep->num = 0;
+
+ if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU)
+ return false;
+
+ left = calculate_slab_order(cachep, size,
+ flags | CFLGS_OBJFREELIST_SLAB);
+ if (!cachep->num)
+ return false;
+
+ if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size)
+ return false;
+
+ cachep->colour = left / cachep->colour_off;
+
+ return true;
+}
+
+static bool set_off_slab_cache(struct kmem_cache *cachep,
+ size_t size, unsigned long flags)
+{
+ size_t left;
+
+ cachep->num = 0;
+
+ /*
+ * Always use on-slab management when SLAB_NOLEAKTRACE
+ * to avoid recursive calls into kmemleak.
+ */
+ if (flags & SLAB_NOLEAKTRACE)
+ return false;
+
+ /*
+ * Size is large, assume best to place the slab management obj
+ * off-slab (should allow better packing of objs).
+ */
+ left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB);
+ if (!cachep->num)
+ return false;
+
+ /*
+ * If the slab has been placed off-slab, and we have enough space then
+ * move it on-slab. This is at the expense of any extra colouring.
+ */
+ if (left >= cachep->num * sizeof(freelist_idx_t))
+ return false;
+
+ cachep->colour = left / cachep->colour_off;
+
+ return true;
+}
+
+static bool set_on_slab_cache(struct kmem_cache *cachep,
+ size_t size, unsigned long flags)
+{
+ size_t left;
+
+ cachep->num = 0;
+
+ left = calculate_slab_order(cachep, size, flags);
+ if (!cachep->num)
+ return false;
+
+ cachep->colour = left / cachep->colour_off;
+
+ return true;
+}
+
/**
* __kmem_cache_create - Create a cache.
* @cachep: cache management descriptor
@@ -2099,7 +2003,6 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
int
__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
- size_t left_over, freelist_size;
size_t ralign = BYTES_PER_WORD;
gfp_t gfp;
int err;
@@ -2119,8 +2022,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
if (!(flags & SLAB_DESTROY_BY_RCU))
flags |= SLAB_POISON;
#endif
- if (flags & SLAB_DESTROY_BY_RCU)
- BUG_ON(flags & SLAB_POISON);
#endif
/*
@@ -2152,6 +2053,10 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
* 4) Store it.
*/
cachep->align = ralign;
+ cachep->colour_off = cache_line_size();
+ /* Offset must be a multiple of the alignment. */
+ if (cachep->colour_off < cachep->align)
+ cachep->colour_off = cachep->align;
if (slab_is_available())
gfp = GFP_KERNEL;
@@ -2179,36 +2084,9 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
else
size += BYTES_PER_WORD;
}
-#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
- /*
- * To activate debug pagealloc, off-slab management is necessary
- * requirement. In early phase of initialization, small sized slab
- * doesn't get initialized so it would not be possible. So, we need
- * to check size >= 256. It guarantees that all necessary small
- * sized slab is initialized in current slab initialization sequence.
- */
- if (!slab_early_init && size >= kmalloc_size(INDEX_NODE) &&
- size >= 256 && cachep->object_size > cache_line_size() &&
- ALIGN(size, cachep->align) < PAGE_SIZE) {
- cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
- size = PAGE_SIZE;
- }
-#endif
#endif
- /*
- * Determine if the slab management is 'on' or 'off' slab.
- * (bootstrapping cannot cope with offslab caches so don't do
- * it too early on. Always use on-slab management when
- * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
- */
- if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init &&
- !(flags & SLAB_NOLEAKTRACE))
- /*
- * Size is large, assume best to place the slab management obj
- * off-slab (should allow better packing of objs).
- */
- flags |= CFLGS_OFF_SLAB;
+ kasan_cache_create(cachep, &size, &flags);
size = ALIGN(size, cachep->align);
/*
@@ -2218,42 +2096,46 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
- left_over = calculate_slab_order(cachep, size, cachep->align, flags);
-
- if (!cachep->num)
- return -E2BIG;
-
- freelist_size = calculate_freelist_size(cachep->num, cachep->align);
-
+#if DEBUG
/*
- * If the slab has been placed off-slab, and we have enough space then
- * move it on-slab. This is at the expense of any extra colouring.
+ * To activate debug pagealloc, off-slab management is necessary
+ * requirement. In early phase of initialization, small sized slab
+ * doesn't get initialized so it would not be possible. So, we need
+ * to check size >= 256. It guarantees that all necessary small
+ * sized slab is initialized in current slab initialization sequence.
*/
- if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {
- flags &= ~CFLGS_OFF_SLAB;
- left_over -= freelist_size;
+ if (debug_pagealloc_enabled() && (flags & SLAB_POISON) &&
+ size >= 256 && cachep->object_size > cache_line_size()) {
+ if (size < PAGE_SIZE || size % PAGE_SIZE == 0) {
+ size_t tmp_size = ALIGN(size, PAGE_SIZE);
+
+ if (set_off_slab_cache(cachep, tmp_size, flags)) {
+ flags |= CFLGS_OFF_SLAB;
+ cachep->obj_offset += tmp_size - size;
+ size = tmp_size;
+ goto done;
+ }
+ }
}
+#endif
- if (flags & CFLGS_OFF_SLAB) {
- /* really off slab. No need for manual alignment */
- freelist_size = calculate_freelist_size(cachep->num, 0);
+ if (set_objfreelist_slab_cache(cachep, size, flags)) {
+ flags |= CFLGS_OBJFREELIST_SLAB;
+ goto done;
+ }
-#ifdef CONFIG_PAGE_POISONING
- /* If we're going to use the generic kernel_map_pages()
- * poisoning, then it's going to smash the contents of
- * the redzone and userword anyhow, so switch them off.
- */
- if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
- flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
-#endif
+ if (set_off_slab_cache(cachep, size, flags)) {
+ flags |= CFLGS_OFF_SLAB;
+ goto done;
}
- cachep->colour_off = cache_line_size();
- /* Offset must be a multiple of the alignment. */
- if (cachep->colour_off < cachep->align)
- cachep->colour_off = cachep->align;
- cachep->colour = left_over / cachep->colour_off;
- cachep->freelist_size = freelist_size;
+ if (set_on_slab_cache(cachep, size, flags))
+ goto done;
+
+ return -E2BIG;
+
+done:
+ cachep->freelist_size = cachep->num * sizeof(freelist_idx_t);
cachep->flags = flags;
cachep->allocflags = __GFP_COMP;
if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
@@ -2261,16 +2143,21 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
cachep->size = size;
cachep->reciprocal_buffer_size = reciprocal_value(size);
- if (flags & CFLGS_OFF_SLAB) {
- cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
- /*
- * This is a possibility for one of the kmalloc_{dma,}_caches.
- * But since we go off slab only for object size greater than
- * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created
- * in ascending order,this should not happen at all.
- * But leave a BUG_ON for some lucky dude.
- */
- BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
+#if DEBUG
+ /*
+ * If we're going to use the generic kernel_map_pages()
+ * poisoning, then it's going to smash the contents of
+ * the redzone and userword anyhow, so switch them off.
+ */
+ if (IS_ENABLED(CONFIG_PAGE_POISONING) &&
+ (cachep->flags & SLAB_POISON) &&
+ is_debug_pagealloc_cache(cachep))
+ cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
+#endif
+
+ if (OFF_SLAB(cachep)) {
+ cachep->freelist_cache =
+ kmalloc_slab(cachep->freelist_size, 0u);
}
err = setup_cpu_cache(cachep, gfp);
@@ -2377,9 +2264,6 @@ static int drain_freelist(struct kmem_cache *cache,
}
page = list_entry(p, struct page, lru);
-#if DEBUG
- BUG_ON(page->active);
-#endif
list_del(&page->lru);
/*
* Safe to drop the lock. The slab is no longer linked
@@ -2454,18 +2338,23 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
void *freelist;
void *addr = page_address(page);
- if (OFF_SLAB(cachep)) {
+ page->s_mem = addr + colour_off;
+ page->active = 0;
+
+ if (OBJFREELIST_SLAB(cachep))
+ freelist = NULL;
+ else if (OFF_SLAB(cachep)) {
/* Slab management obj is off-slab. */
freelist = kmem_cache_alloc_node(cachep->freelist_cache,
local_flags, nodeid);
if (!freelist)
return NULL;
} else {
- freelist = addr + colour_off;
- colour_off += cachep->freelist_size;
+ /* We will use the last bytes of the slab for the freelist */
+ freelist = addr + (PAGE_SIZE << cachep->gfporder) -
+ cachep->freelist_size;
}
- page->active = 0;
- page->s_mem = addr + colour_off;
+
return freelist;
}
@@ -2480,17 +2369,14 @@ static inline void set_free_obj(struct page *page,
((freelist_idx_t *)(page->freelist))[idx] = val;
}
-static void cache_init_objs(struct kmem_cache *cachep,
- struct page *page)
+static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
{
+#if DEBUG
int i;
for (i = 0; i < cachep->num; i++) {
void *objp = index_to_obj(cachep, page, i);
-#if DEBUG
- /* need to poison the objs? */
- if (cachep->flags & SLAB_POISON)
- poison_obj(cachep, objp, POISON_FREE);
+
if (cachep->flags & SLAB_STORE_USER)
*dbg_userword(cachep, objp) = NULL;
@@ -2503,26 +2389,51 @@ static void cache_init_objs(struct kmem_cache *cachep,
* cache which they are a constructor for. Otherwise, deadlock.
* They must also be threaded.
*/
- if (cachep->ctor && !(cachep->flags & SLAB_POISON))
+ if (cachep->ctor && !(cachep->flags & SLAB_POISON)) {
+ kasan_unpoison_object_data(cachep,
+ objp + obj_offset(cachep));
cachep->ctor(objp + obj_offset(cachep));
+ kasan_poison_object_data(
+ cachep, objp + obj_offset(cachep));
+ }
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
- slab_error(cachep, "constructor overwrote the"
- " end of an object");
+ slab_error(cachep, "constructor overwrote the end of an object");
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
- slab_error(cachep, "constructor overwrote the"
- " start of an object");
+ slab_error(cachep, "constructor overwrote the start of an object");
}
- if ((cachep->size % PAGE_SIZE) == 0 &&
- OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
- kernel_map_pages(virt_to_page(objp),
- cachep->size / PAGE_SIZE, 0);
-#else
- if (cachep->ctor)
- cachep->ctor(objp);
+ /* need to poison the objs? */
+ if (cachep->flags & SLAB_POISON) {
+ poison_obj(cachep, objp, POISON_FREE);
+ slab_kernel_map(cachep, objp, 0, 0);
+ }
+ }
#endif
- set_obj_status(page, i, OBJECT_FREE);
+}
+
+static void cache_init_objs(struct kmem_cache *cachep,
+ struct page *page)
+{
+ int i;
+ void *objp;
+
+ cache_init_objs_debug(cachep, page);
+
+ if (OBJFREELIST_SLAB(cachep)) {
+ page->freelist = index_to_obj(cachep, page, cachep->num - 1) +
+ obj_offset(cachep);
+ }
+
+ for (i = 0; i < cachep->num; i++) {
+ /* constructor could break poison info */
+ if (DEBUG == 0 && cachep->ctor) {
+ objp = index_to_obj(cachep, page, i);
+ kasan_unpoison_object_data(cachep, objp);
+ cachep->ctor(objp);
+ kasan_poison_object_data(cachep, objp);
+ }
+
set_free_obj(page, i, i);
}
}
@@ -2537,40 +2448,41 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
}
}
-static void *slab_get_obj(struct kmem_cache *cachep, struct page *page,
- int nodeid)
+static void *slab_get_obj(struct kmem_cache *cachep, struct page *page)
{
void *objp;
objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
page->active++;
+
#if DEBUG
- WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
+ if (cachep->flags & SLAB_STORE_USER)
+ set_store_user_dirty(cachep);
#endif
return objp;
}
-static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
- void *objp, int nodeid)
+static void slab_put_obj(struct kmem_cache *cachep,
+ struct page *page, void *objp)
{
unsigned int objnr = obj_to_index(cachep, page, objp);
#if DEBUG
unsigned int i;
- /* Verify that the slab belongs to the intended node */
- WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
-
/* Verify double free bug */
for (i = page->active; i < cachep->num; i++) {
if (get_free_obj(page, i) == objnr) {
- printk(KERN_ERR "slab: double free detected in cache "
- "'%s', objp %p\n", cachep->name, objp);
+ pr_err("slab: double free detected in cache '%s', objp %p\n",
+ cachep->name, objp);
BUG();
}
}
#endif
page->active--;
+ if (!page->freelist)
+ page->freelist = objp + obj_offset(cachep);
+
set_free_obj(page, page->active, objnr);
}
@@ -2645,11 +2557,12 @@ static int cache_grow(struct kmem_cache *cachep,
/* Get slab management. */
freelist = alloc_slabmgmt(cachep, page, offset,
local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
- if (!freelist)
+ if (OFF_SLAB(cachep) && !freelist)
goto opps1;
slab_map_pages(cachep, page, freelist);
+ kasan_poison_slab(page);
cache_init_objs(cachep, page);
if (gfpflags_allow_blocking(local_flags))
@@ -2681,7 +2594,7 @@ failed:
static void kfree_debugcheck(const void *objp)
{
if (!virt_addr_valid(objp)) {
- printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
+ pr_err("kfree_debugcheck: out of range ptr %lxh\n",
(unsigned long)objp);
BUG();
}
@@ -2705,8 +2618,8 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
else
slab_error(cache, "memory outside object was overwritten");
- printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n",
- obj, redzone1, redzone2);
+ pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
+ obj, redzone1, redzone2);
}
static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
@@ -2726,27 +2639,19 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
*dbg_redzone1(cachep, objp) = RED_INACTIVE;
*dbg_redzone2(cachep, objp) = RED_INACTIVE;
}
- if (cachep->flags & SLAB_STORE_USER)
+ if (cachep->flags & SLAB_STORE_USER) {
+ set_store_user_dirty(cachep);
*dbg_userword(cachep, objp) = (void *)caller;
+ }
objnr = obj_to_index(cachep, page, objp);
BUG_ON(objnr >= cachep->num);
BUG_ON(objp != index_to_obj(cachep, page, objnr));
- set_obj_status(page, objnr, OBJECT_FREE);
if (cachep->flags & SLAB_POISON) {
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
- store_stackinfo(cachep, objp, caller);
- kernel_map_pages(virt_to_page(objp),
- cachep->size / PAGE_SIZE, 0);
- } else {
- poison_obj(cachep, objp, POISON_FREE);
- }
-#else
poison_obj(cachep, objp, POISON_FREE);
-#endif
+ slab_kernel_map(cachep, objp, 0, caller);
}
return objp;
}
@@ -2756,7 +2661,85 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
#define cache_free_debugcheck(x,objp,z) (objp)
#endif
-static struct page *get_first_slab(struct kmem_cache_node *n)
+static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
+ void **list)
+{
+#if DEBUG
+ void *next = *list;
+ void *objp;
+
+ while (next) {
+ objp = next - obj_offset(cachep);
+ next = *(void **)next;
+ poison_obj(cachep, objp, POISON_FREE);
+ }
+#endif
+}
+
+static inline void fixup_slab_list(struct kmem_cache *cachep,
+ struct kmem_cache_node *n, struct page *page,
+ void **list)
+{
+ /* move slabp to correct slabp list: */
+ list_del(&page->lru);
+ if (page->active == cachep->num) {
+ list_add(&page->lru, &n->slabs_full);
+ if (OBJFREELIST_SLAB(cachep)) {
+#if DEBUG
+ /* Poisoning will be done without holding the lock */
+ if (cachep->flags & SLAB_POISON) {
+ void **objp = page->freelist;
+
+ *objp = *list;
+ *list = objp;
+ }
+#endif
+ page->freelist = NULL;
+ }
+ } else
+ list_add(&page->lru, &n->slabs_partial);
+}
+
+/* Try to find non-pfmemalloc slab if needed */
+static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
+ struct page *page, bool pfmemalloc)
+{
+ if (!page)
+ return NULL;
+
+ if (pfmemalloc)
+ return page;
+
+ if (!PageSlabPfmemalloc(page))
+ return page;
+
+ /* No need to keep pfmemalloc slab if we have enough free objects */
+ if (n->free_objects > n->free_limit) {
+ ClearPageSlabPfmemalloc(page);
+ return page;
+ }
+
+ /* Move pfmemalloc slab to the end of list to speed up next search */
+ list_del(&page->lru);
+ if (!page->active)
+ list_add_tail(&page->lru, &n->slabs_free);
+ else
+ list_add_tail(&page->lru, &n->slabs_partial);
+
+ list_for_each_entry(page, &n->slabs_partial, lru) {
+ if (!PageSlabPfmemalloc(page))
+ return page;
+ }
+
+ list_for_each_entry(page, &n->slabs_free, lru) {
+ if (!PageSlabPfmemalloc(page))
+ return page;
+ }
+
+ return NULL;
+}
+
+static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
{
struct page *page;
@@ -2768,21 +2751,51 @@ static struct page *get_first_slab(struct kmem_cache_node *n)
struct page, lru);
}
+ if (sk_memalloc_socks())
+ return get_valid_first_slab(n, page, pfmemalloc);
+
return page;
}
-static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
- bool force_refill)
+static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
+ struct kmem_cache_node *n, gfp_t flags)
+{
+ struct page *page;
+ void *obj;
+ void *list = NULL;
+
+ if (!gfp_pfmemalloc_allowed(flags))
+ return NULL;
+
+ spin_lock(&n->list_lock);
+ page = get_first_slab(n, true);
+ if (!page) {
+ spin_unlock(&n->list_lock);
+ return NULL;
+ }
+
+ obj = slab_get_obj(cachep, page);
+ n->free_objects--;
+
+ fixup_slab_list(cachep, n, page, &list);
+
+ spin_unlock(&n->list_lock);
+ fixup_objfreelist_debug(cachep, &list);
+
+ return obj;
+}
+
+static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
int batchcount;
struct kmem_cache_node *n;
struct array_cache *ac;
int node;
+ void *list = NULL;
check_irq_off();
node = numa_mem_id();
- if (unlikely(force_refill))
- goto force_grow;
+
retry:
ac = cpu_cache_get(cachep);
batchcount = ac->batchcount;
@@ -2808,7 +2821,7 @@ retry:
while (batchcount > 0) {
struct page *page;
/* Get slab alloc is to come from. */
- page = get_first_slab(n);
+ page = get_first_slab(n, false);
if (!page)
goto must_grow;
@@ -2826,26 +2839,29 @@ retry:
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
- ac_put_obj(cachep, ac, slab_get_obj(cachep, page,
- node));
+ ac->entry[ac->avail++] = slab_get_obj(cachep, page);
}
- /* move slabp to correct slabp list: */
- list_del(&page->lru);
- if (page->active == cachep->num)
- list_add(&page->lru, &n->slabs_full);
- else
- list_add(&page->lru, &n->slabs_partial);
+ fixup_slab_list(cachep, n, page, &list);
}
must_grow:
n->free_objects -= ac->avail;
alloc_done:
spin_unlock(&n->list_lock);
+ fixup_objfreelist_debug(cachep, &list);
if (unlikely(!ac->avail)) {
int x;
-force_grow:
+
+ /* Check if we can use obj in pfmemalloc slab */
+ if (sk_memalloc_socks()) {
+ void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
+
+ if (obj)
+ return obj;
+ }
+
x = cache_grow(cachep, gfp_exact_node(flags), node, NULL);
/* cache_grow can reenable interrupts, then ac could change. */
@@ -2853,7 +2869,7 @@ force_grow:
node = numa_mem_id();
/* no objects in sight? abort */
- if (!x && (ac->avail == 0 || force_refill))
+ if (!x && ac->avail == 0)
return NULL;
if (!ac->avail) /* objects refilled by interrupt? */
@@ -2861,7 +2877,7 @@ force_grow:
}
ac->touched = 1;
- return ac_get_obj(cachep, ac, flags, force_refill);
+ return ac->entry[--ac->avail];
}
static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
@@ -2877,20 +2893,11 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
gfp_t flags, void *objp, unsigned long caller)
{
- struct page *page;
-
if (!objp)
return objp;
if (cachep->flags & SLAB_POISON) {
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
- kernel_map_pages(virt_to_page(objp),
- cachep->size / PAGE_SIZE, 1);
- else
- check_poison_obj(cachep, objp);
-#else
check_poison_obj(cachep, objp);
-#endif
+ slab_kernel_map(cachep, objp, 1, 0);
poison_obj(cachep, objp, POISON_INUSE);
}
if (cachep->flags & SLAB_STORE_USER)
@@ -2899,25 +2906,21 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
- slab_error(cachep, "double free, or memory outside"
- " object was overwritten");
- printk(KERN_ERR
- "%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
- objp, *dbg_redzone1(cachep, objp),
- *dbg_redzone2(cachep, objp));
+ slab_error(cachep, "double free, or memory outside object was overwritten");
+ pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
+ objp, *dbg_redzone1(cachep, objp),
+ *dbg_redzone2(cachep, objp));
}
*dbg_redzone1(cachep, objp) = RED_ACTIVE;
*dbg_redzone2(cachep, objp) = RED_ACTIVE;
}
- page = virt_to_head_page(objp);
- set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE);
objp += obj_offset(cachep);
if (cachep->ctor && cachep->flags & SLAB_POISON)
cachep->ctor(objp);
if (ARCH_SLAB_MINALIGN &&
((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
- printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
+ pr_err("0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
objp, (int)ARCH_SLAB_MINALIGN);
}
return objp;
@@ -2926,40 +2929,24 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
#endif
-static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
-{
- if (unlikely(cachep == kmem_cache))
- return false;
-
- return should_failslab(cachep->object_size, flags, cachep->flags);
-}
-
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *objp;
struct array_cache *ac;
- bool force_refill = false;
check_irq_off();
ac = cpu_cache_get(cachep);
if (likely(ac->avail)) {
ac->touched = 1;
- objp = ac_get_obj(cachep, ac, flags, false);
+ objp = ac->entry[--ac->avail];
- /*
- * Allow for the possibility all avail objects are not allowed
- * by the current flags
- */
- if (objp) {
- STATS_INC_ALLOCHIT(cachep);
- goto out;
- }
- force_refill = true;
+ STATS_INC_ALLOCHIT(cachep);
+ goto out;
}
STATS_INC_ALLOCMISS(cachep);
- objp = cache_alloc_refill(cachep, flags, force_refill);
+ objp = cache_alloc_refill(cachep, flags);
/*
* the 'ac' may be updated by cache_alloc_refill(),
* and kmemleak_erase() requires its correct value.
@@ -3097,6 +3084,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
struct page *page;
struct kmem_cache_node *n;
void *obj;
+ void *list = NULL;
int x;
VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES);
@@ -3106,7 +3094,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
retry:
check_irq_off();
spin_lock(&n->list_lock);
- page = get_first_slab(n);
+ page = get_first_slab(n, false);
if (!page)
goto must_grow;
@@ -3118,17 +3106,13 @@ retry:
BUG_ON(page->active == cachep->num);
- obj = slab_get_obj(cachep, page, nodeid);
+ obj = slab_get_obj(cachep, page);
n->free_objects--;
- /* move slabp to correct slabp list: */
- list_del(&page->lru);
- if (page->active == cachep->num)
- list_add(&page->lru, &n->slabs_full);
- else
- list_add(&page->lru, &n->slabs_partial);
+ fixup_slab_list(cachep, n, page, &list);
spin_unlock(&n->list_lock);
+ fixup_objfreelist_debug(cachep, &list);
goto done;
must_grow:
@@ -3152,14 +3136,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
int slab_node = numa_mem_id();
flags &= gfp_allowed_mask;
-
- lockdep_trace_alloc(flags);
-
- if (slab_should_failslab(cachep, flags))
+ cachep = slab_pre_alloc_hook(cachep, flags);
+ if (unlikely(!cachep))
return NULL;
- cachep = memcg_kmem_get_cache(cachep, flags);
-
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
@@ -3188,16 +3168,11 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
out:
local_irq_restore(save_flags);
ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
- kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
- flags);
- if (likely(ptr)) {
- kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size);
- if (unlikely(flags & __GFP_ZERO))
- memset(ptr, 0, cachep->object_size);
- }
+ if (unlikely(flags & __GFP_ZERO) && ptr)
+ memset(ptr, 0, cachep->object_size);
- memcg_kmem_put_cache(cachep);
+ slab_post_alloc_hook(cachep, flags, 1, &ptr);
return ptr;
}
@@ -3240,30 +3215,21 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
void *objp;
flags &= gfp_allowed_mask;
-
- lockdep_trace_alloc(flags);
-
- if (slab_should_failslab(cachep, flags))
+ cachep = slab_pre_alloc_hook(cachep, flags);
+ if (unlikely(!cachep))
return NULL;
- cachep = memcg_kmem_get_cache(cachep, flags);
-
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
objp = __do_cache_alloc(cachep, flags);
local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
- kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
- flags);
prefetchw(objp);
- if (likely(objp)) {
- kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
- if (unlikely(flags & __GFP_ZERO))
- memset(objp, 0, cachep->object_size);
- }
+ if (unlikely(flags & __GFP_ZERO) && objp)
+ memset(objp, 0, cachep->object_size);
- memcg_kmem_put_cache(cachep);
+ slab_post_alloc_hook(cachep, flags, 1, &objp);
return objp;
}
@@ -3281,13 +3247,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
void *objp;
struct page *page;
- clear_obj_pfmemalloc(&objpp[i]);
objp = objpp[i];
page = virt_to_head_page(objp);
list_del(&page->lru);
check_spinlock_acquired_node(cachep, node);
- slab_put_obj(cachep, page, objp, node);
+ slab_put_obj(cachep, page, objp);
STATS_DEC_ACTIVE(cachep);
n->free_objects++;
@@ -3317,9 +3282,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
LIST_HEAD(list);
batchcount = ac->batchcount;
-#if DEBUG
- BUG_ON(!batchcount || batchcount > ac->avail);
-#endif
+
check_irq_off();
n = get_node(cachep, node);
spin_lock(&n->list_lock);
@@ -3366,6 +3329,8 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
{
struct array_cache *ac = cpu_cache_get(cachep);
+ kasan_slab_free(cachep, objp);
+
check_irq_off();
kmemleak_free_recursive(objp, cachep->flags);
objp = cache_free_debugcheck(cachep, objp, caller);
@@ -3389,7 +3354,16 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
cache_flusharray(cachep, ac);
}
- ac_put_obj(cachep, ac, objp);
+ if (sk_memalloc_socks()) {
+ struct page *page = virt_to_head_page(objp);
+
+ if (unlikely(PageSlabPfmemalloc(page))) {
+ cache_free_pfmemalloc(cachep, page, objp);
+ return;
+ }
+ }
+
+ ac->entry[ac->avail++] = objp;
}
/**
@@ -3404,6 +3378,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *ret = slab_alloc(cachep, flags, _RET_IP_);
+ kasan_slab_alloc(cachep, ret, flags);
trace_kmem_cache_alloc(_RET_IP_, ret,
cachep->object_size, cachep->size, flags);
@@ -3411,16 +3386,53 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
}
EXPORT_SYMBOL(kmem_cache_alloc);
-void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+static __always_inline void
+cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p, unsigned long caller)
{
- __kmem_cache_free_bulk(s, size, p);
+ size_t i;
+
+ for (i = 0; i < size; i++)
+ p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller);
}
-EXPORT_SYMBOL(kmem_cache_free_bulk);
int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
- void **p)
+ void **p)
{
- return __kmem_cache_alloc_bulk(s, flags, size, p);
+ size_t i;
+
+ s = slab_pre_alloc_hook(s, flags);
+ if (!s)
+ return 0;
+
+ cache_alloc_debugcheck_before(s, flags);
+
+ local_irq_disable();
+ for (i = 0; i < size; i++) {
+ void *objp = __do_cache_alloc(s, flags);
+
+ if (unlikely(!objp))
+ goto error;
+ p[i] = objp;
+ }
+ local_irq_enable();
+
+ cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);
+
+ /* Clear memory outside IRQ disabled section */
+ if (unlikely(flags & __GFP_ZERO))
+ for (i = 0; i < size; i++)
+ memset(p[i], 0, s->object_size);
+
+ slab_post_alloc_hook(s, flags, size, p);
+ /* FIXME: Trace call missing. Christoph would like a bulk variant */
+ return size;
+error:
+ local_irq_enable();
+ cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
+ slab_post_alloc_hook(s, flags, i, p);
+ __kmem_cache_free_bulk(s, i, p);
+ return 0;
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
@@ -3432,6 +3444,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
ret = slab_alloc(cachep, flags, _RET_IP_);
+ kasan_kmalloc(cachep, ret, size, flags);
trace_kmalloc(_RET_IP_, ret,
size, cachep->size, flags);
return ret;
@@ -3455,6 +3468,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
+ kasan_slab_alloc(cachep, ret, flags);
trace_kmem_cache_alloc_node(_RET_IP_, ret,
cachep->object_size, cachep->size,
flags, nodeid);
@@ -3473,6 +3487,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
+ kasan_kmalloc(cachep, ret, size, flags);
trace_kmalloc_node(_RET_IP_, ret,
size, cachep->size,
flags, nodeid);
@@ -3485,11 +3500,15 @@ static __always_inline void *
__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
{
struct kmem_cache *cachep;
+ void *ret;
cachep = kmalloc_slab(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
- return kmem_cache_alloc_node_trace(cachep, flags, node, size);
+ ret = kmem_cache_alloc_node_trace(cachep, flags, node, size);
+ kasan_kmalloc(cachep, ret, size, flags);
+
+ return ret;
}
void *__kmalloc_node(size_t size, gfp_t flags, int node)
@@ -3523,6 +3542,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
return cachep;
ret = slab_alloc(cachep, flags, caller);
+ kasan_kmalloc(cachep, ret, size, flags);
trace_kmalloc(caller, ret,
size, cachep->size, flags);
@@ -3567,6 +3587,32 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
}
EXPORT_SYMBOL(kmem_cache_free);
+void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
+{
+ struct kmem_cache *s;
+ size_t i;
+
+ local_irq_disable();
+ for (i = 0; i < size; i++) {
+ void *objp = p[i];
+
+ if (!orig_s) /* called via kfree_bulk */
+ s = virt_to_cache(objp);
+ else
+ s = cache_from_obj(orig_s, objp);
+
+ debug_check_no_locks_freed(objp, s->object_size);
+ if (!(s->flags & SLAB_DEBUG_OBJECTS))
+ debug_check_no_obj_freed(objp, s->object_size);
+
+ __cache_free(s, objp, _RET_IP_);
+ }
+ local_irq_enable();
+
+ /* FIXME: add tracing */
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
/**
* kfree - free previously allocated memory
* @objp: pointer returned by kmalloc.
@@ -3812,7 +3858,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
skip_setup:
err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
if (err)
- printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
+ pr_err("enable_cpucache failed for %s, error %d\n",
cachep->name, -err);
return err;
}
@@ -3968,7 +4014,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
name = cachep->name;
if (error)
- printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
+ pr_err("slab: cache %s error: %s\n", name, error);
sinfo->active_objs = active_objs;
sinfo->num_objs = num_objs;
@@ -3996,8 +4042,7 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
unsigned long node_frees = cachep->node_frees;
unsigned long overflows = cachep->node_overflow;
- seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
- "%4lu %4lu %4lu %4lu %4lu",
+ seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu %4lu",
allocs, high, grown,
reaped, errors, max_freeable, node_allocs,
node_frees, overflows);
@@ -4102,15 +4147,34 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c,
struct page *page)
{
void *p;
- int i;
+ int i, j;
+ unsigned long v;
if (n[0] == n[1])
return;
for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
- if (get_obj_status(page, i) != OBJECT_ACTIVE)
+ bool active = true;
+
+ for (j = page->active; j < c->num; j++) {
+ if (get_free_obj(page, j) == i) {
+ active = false;
+ break;
+ }
+ }
+
+ if (!active)
continue;
- if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
+ /*
+ * probe_kernel_read() is used for DEBUG_PAGEALLOC. The page
+ * table mapping is established only when the object is
+ * actually allocated, so we could mistakenly access an
+ * unmapped object that is still in the cpu cache.
+ */
+ if (probe_kernel_read(&v, dbg_userword(c, p), sizeof(v)))
+ continue;
+
+ if (!add_caller(n, v))
return;
}
}
@@ -4146,21 +4210,31 @@ static int leaks_show(struct seq_file *m, void *p)
if (!(cachep->flags & SLAB_RED_ZONE))
return 0;
- /* OK, we can do it */
+ /*
+ * Set store_user_clean and start to grab stored user information
+ * for all objects on this cache. If some alloc/free requests come
+ * during the processing, the information would be wrong, so restart
+ * the whole processing.
+ */
+ do {
+ set_store_user_clean(cachep);
+ drain_cpu_caches(cachep);
- x[1] = 0;
+ x[1] = 0;
- for_each_kmem_cache_node(cachep, node, n) {
+ for_each_kmem_cache_node(cachep, node, n) {
- check_irq_on();
- spin_lock_irq(&n->list_lock);
+ check_irq_on();
+ spin_lock_irq(&n->list_lock);
+
+ list_for_each_entry(page, &n->slabs_full, lru)
+ handle_slab(x, cachep, page);
+ list_for_each_entry(page, &n->slabs_partial, lru)
+ handle_slab(x, cachep, page);
+ spin_unlock_irq(&n->list_lock);
+ }
+ } while (!is_store_user_clean(cachep));
- list_for_each_entry(page, &n->slabs_full, lru)
- handle_slab(x, cachep, page);
- list_for_each_entry(page, &n->slabs_partial, lru)
- handle_slab(x, cachep, page);
- spin_unlock_irq(&n->list_lock);
- }
name = cachep->name;
if (x[0] == x[1]) {
/* Increase the buffer size */
@@ -4240,10 +4314,18 @@ module_init(slab_proc_init);
*/
size_t ksize(const void *objp)
{
+ size_t size;
+
BUG_ON(!objp);
if (unlikely(objp == ZERO_SIZE_PTR))
return 0;
- return virt_to_cache(objp)->object_size;
+ size = virt_to_cache(objp)->object_size;
+ /* We assume that ksize callers could use the whole allocated area,
+ * so we need to unpoison this area.
+ */
+ kasan_krealloc(objp, size, GFP_NOWAIT);
+
+ return size;
}
EXPORT_SYMBOL(ksize);
diff --git a/mm/slab.h b/mm/slab.h
index 2eedacea4..5969769fb 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -38,6 +38,10 @@ struct kmem_cache {
#endif
#include <linux/memcontrol.h>
+#include <linux/fault-inject.h>
+#include <linux/kmemcheck.h>
+#include <linux/kasan.h>
+#include <linux/kmemleak.h>
/*
* State of the slab allocator.
@@ -121,7 +125,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
#elif defined(CONFIG_SLUB_DEBUG)
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
- SLAB_TRACE | SLAB_DEBUG_FREE)
+ SLAB_TRACE | SLAB_CONSISTENCY_CHECKS)
#else
#define SLAB_DEBUG_FLAGS (0)
#endif
@@ -168,7 +172,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
/*
* Generic implementation of bulk operations
* These are useful for situations in which the allocator cannot
- * perform optimizations. In that case segments of the objecct listed
+ * perform optimizations. In that case segments of the object listed
* may be allocated or freed using these operations.
*/
void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
@@ -242,12 +246,33 @@ static __always_inline int memcg_charge_slab(struct page *page,
gfp_t gfp, int order,
struct kmem_cache *s)
{
+ int ret;
+
if (!memcg_kmem_enabled())
return 0;
if (is_root_cache(s))
return 0;
- return __memcg_kmem_charge_memcg(page, gfp, order,
- s->memcg_params.memcg);
+
+ ret = __memcg_kmem_charge_memcg(page, gfp, order,
+ s->memcg_params.memcg);
+ if (ret)
+ return ret;
+
+ memcg_kmem_update_page_stat(page,
+ (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+ MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE,
+ 1 << order);
+ return 0;
+}
+
+static __always_inline void memcg_uncharge_slab(struct page *page, int order,
+ struct kmem_cache *s)
+{
+ memcg_kmem_update_page_stat(page,
+ (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+ MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE,
+ -(1 << order));
+ memcg_kmem_uncharge(page, order);
}
extern void slab_init_memcg_params(struct kmem_cache *);
@@ -290,6 +315,11 @@ static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order,
return 0;
}
+static inline void memcg_uncharge_slab(struct page *page, int order,
+ struct kmem_cache *s)
+{
+}
+
static inline void slab_init_memcg_params(struct kmem_cache *s)
{
}
@@ -307,7 +337,8 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
* to not do even the assignment. In that case, slab_equal_or_root
* will also be a constant.
*/
- if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE))
+ if (!memcg_kmem_enabled() &&
+ !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS))
return s;
page = virt_to_head_page(x);
@@ -321,6 +352,64 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
return s;
}
+static inline size_t slab_ksize(const struct kmem_cache *s)
+{
+#ifndef CONFIG_SLUB
+ return s->object_size;
+
+#else /* CONFIG_SLUB */
+# ifdef CONFIG_SLUB_DEBUG
+ /*
+ * Debugging requires use of the padding between object
+ * and whatever may come after it.
+ */
+ if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
+ return s->object_size;
+# endif
+ /*
+ * If we have the need to store the freelist pointer
+ * back there or track user information then we can
+ * only use the space before that information.
+ */
+ if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
+ return s->inuse;
+ /*
+ * Else we can use all the padding etc for the allocation
+ */
+ return s->size;
+#endif
+}
+
+static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
+ gfp_t flags)
+{
+ flags &= gfp_allowed_mask;
+ lockdep_trace_alloc(flags);
+ might_sleep_if(gfpflags_allow_blocking(flags));
+
+ if (should_failslab(s, flags))
+ return NULL;
+
+ return memcg_kmem_get_cache(s, flags);
+}
+
+static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p)
+{
+ size_t i;
+
+ flags &= gfp_allowed_mask;
+ for (i = 0; i < size; i++) {
+ void *object = p[i];
+
+ kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
+ kmemleak_alloc_recursive(object, s->object_size, 1,
+ s->flags, flags);
+ kasan_slab_alloc(s, object, flags);
+ }
+ memcg_kmem_put_cache(s);
+}
+
#ifndef CONFIG_SLOB
/*
* The slab lists for all objects.
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 065b7bdab..3239bfd75 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -35,7 +35,7 @@ struct kmem_cache *kmem_cache;
*/
#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
- SLAB_FAILSLAB)
+ SLAB_FAILSLAB | SLAB_KASAN)
#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
SLAB_NOTRACK | SLAB_ACCOUNT)
@@ -109,8 +109,12 @@ void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
{
size_t i;
- for (i = 0; i < nr; i++)
- kmem_cache_free(s, p[i]);
+ for (i = 0; i < nr; i++) {
+ if (s)
+ kmem_cache_free(s, p[i]);
+ else
+ kfree(p[i]);
+ }
}
int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
@@ -438,7 +442,7 @@ out_unlock:
panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
name, err);
else {
- printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d",
+ pr_warn("kmem_cache_create(%s) failed with error %d\n",
name, err);
dump_stack();
}
@@ -506,7 +510,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
* The memory cgroup could have been offlined while the cache
* creation work was pending.
*/
- if (!memcg_kmem_online(memcg))
+ if (memcg->kmem_state != KMEM_ONLINE)
goto out_unlock;
idx = memcg_cache_id(memcg);
@@ -722,8 +726,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
err = shutdown_cache(s, &release, &need_rcu_barrier);
if (err) {
- pr_err("kmem_cache_destroy %s: "
- "Slab cache still has objects\n", s->name);
+ pr_err("kmem_cache_destroy %s: Slab cache still has objects\n",
+ s->name);
dump_stack();
}
out_unlock:
@@ -1009,7 +1013,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
page = alloc_kmem_pages(flags, order);
ret = page ? page_address(page) : NULL;
kmemleak_alloc(ret, size, 1, flags);
- kasan_kmalloc_large(ret, size);
+ kasan_kmalloc_large(ret, size, flags);
return ret;
}
EXPORT_SYMBOL(kmalloc_order);
@@ -1043,13 +1047,11 @@ static void print_slabinfo_header(struct seq_file *m)
#else
seq_puts(m, "slabinfo - version: 2.1\n");
#endif
- seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
- "<objperslab> <pagesperslab>");
+ seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
#ifdef CONFIG_DEBUG_SLAB
- seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
- "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
+ seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> <error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
#endif
seq_putc(m, '\n');
@@ -1190,7 +1192,7 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size,
ks = ksize(p);
if (ks >= new_size) {
- kasan_krealloc((void *)p, new_size);
+ kasan_krealloc((void *)p, new_size, flags);
return (void *)p;
}
diff --git a/mm/slub.c b/mm/slub.c
index 741e50759..c910369e3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -124,6 +124,14 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
#endif
}
+static inline void *fixup_red_left(struct kmem_cache *s, void *p)
+{
+ if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
+ p += s->red_left_pad;
+
+ return p;
+}
+
static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
@@ -160,10 +168,18 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
*/
#define MAX_PARTIAL 10
-#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
+#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
SLAB_POISON | SLAB_STORE_USER)
/*
+ * These debug flags cannot use CMPXCHG because there might be consistency
+ * issues when checking or reading debug information
+ */
+#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
+ SLAB_TRACE)
+
+
+/*
* Debugging flags that require metadata to be stored in the slab. These get
* disabled when slub_debug=O is used and a cache's min order increases with
* metadata.
@@ -224,24 +240,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
* Core slab cache functions
*******************************************************************/
-/* Verify that a pointer has an address that is valid within a slab page */
-static inline int check_valid_pointer(struct kmem_cache *s,
- struct page *page, const void *object)
-{
- void *base;
-
- if (!object)
- return 1;
-
- base = page_address(page);
- if (object < base || object >= base + page->objects * s->size ||
- (object - base) % s->size) {
- return 0;
- }
-
- return 1;
-}
-
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
return *(void **)(object + s->offset);
@@ -256,11 +254,10 @@ static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
void *p;
-#ifdef CONFIG_DEBUG_PAGEALLOC
+ if (!debug_pagealloc_enabled())
+ return get_freepointer(s, object);
+
probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
-#else
- p = get_freepointer(s, object);
-#endif
return p;
}
@@ -271,12 +268,14 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
/* Loop over all objects in a slab */
#define for_each_object(__p, __s, __addr, __objects) \
- for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
- __p += (__s)->size)
+ for (__p = fixup_red_left(__s, __addr); \
+ __p < (__addr) + (__objects) * (__s)->size; \
+ __p += (__s)->size)
#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \
- for (__p = (__addr), __idx = 1; __idx <= __objects;\
- __p += (__s)->size, __idx++)
+ for (__p = fixup_red_left(__s, __addr), __idx = 1; \
+ __idx <= __objects; \
+ __p += (__s)->size, __idx++)
/* Determine object index from a given position */
static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
@@ -284,30 +283,6 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
return (p - addr) / s->size;
}
-static inline size_t slab_ksize(const struct kmem_cache *s)
-{
-#ifdef CONFIG_SLUB_DEBUG
- /*
- * Debugging requires use of the padding between object
- * and whatever may come after it.
- */
- if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
- return s->object_size;
-
-#endif
- /*
- * If we have the need to store the freelist pointer
- * back there or track user information then we can
- * only use the space before that information.
- */
- if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
- return s->inuse;
- /*
- * Else we can use all the padding etc for the allocation
- */
- return s->size;
-}
-
static inline int order_objects(int order, unsigned long size, int reserved)
{
return ((PAGE_SIZE << order) - reserved) / size;
@@ -458,6 +433,22 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
set_bit(slab_index(p, s, addr), map);
}
+static inline int size_from_object(struct kmem_cache *s)
+{
+ if (s->flags & SLAB_RED_ZONE)
+ return s->size - s->red_left_pad;
+
+ return s->size;
+}
+
+static inline void *restore_red_left(struct kmem_cache *s, void *p)
+{
+ if (s->flags & SLAB_RED_ZONE)
+ p -= s->red_left_pad;
+
+ return p;
+}
+
/*
* Debug settings:
*/
@@ -491,6 +482,26 @@ static inline void metadata_access_disable(void)
/*
* Object debugging
*/
+
+/* Verify that a pointer has an address that is valid within a slab page */
+static inline int check_valid_pointer(struct kmem_cache *s,
+ struct page *page, void *object)
+{
+ void *base;
+
+ if (!object)
+ return 1;
+
+ base = page_address(page);
+ object = restore_red_left(s, object);
+ if (object < base || object >= base + page->objects * s->size ||
+ (object - base) % s->size) {
+ return 0;
+ }
+
+ return 1;
+}
+
static void print_section(char *text, u8 *addr, unsigned int length)
{
metadata_access_enable();
@@ -630,7 +641,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
p, p - addr, get_freepointer(s, p));
- if (p > addr + 16)
+ if (s->flags & SLAB_RED_ZONE)
+ print_section("Redzone ", p - s->red_left_pad, s->red_left_pad);
+ else if (p > addr + 16)
print_section("Bytes b4 ", p - 16, 16);
print_section("Object ", p, min_t(unsigned long, s->object_size,
@@ -647,9 +660,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
if (s->flags & SLAB_STORE_USER)
off += 2 * sizeof(struct track);
- if (off != s->size)
+ if (off != size_from_object(s))
/* Beginning of the filler is the free pointer */
- print_section("Padding ", p + off, s->size - off);
+ print_section("Padding ", p + off, size_from_object(s) - off);
dump_stack();
}
@@ -679,6 +692,9 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
{
u8 *p = object;
+ if (s->flags & SLAB_RED_ZONE)
+ memset(p - s->red_left_pad, val, s->red_left_pad);
+
if (s->flags & __OBJECT_POISON) {
memset(p, POISON_FREE, s->object_size - 1);
p[s->object_size - 1] = POISON_END;
@@ -771,11 +787,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
/* We also have user information there */
off += 2 * sizeof(struct track);
- if (s->size == off)
+ if (size_from_object(s) == off)
return 1;
return check_bytes_and_report(s, page, p, "Object padding",
- p + off, POISON_INUSE, s->size - off);
+ p + off, POISON_INUSE, size_from_object(s) - off);
}
/* Check the pad bytes at the end of a slab page */
@@ -820,6 +836,10 @@ static int check_object(struct kmem_cache *s, struct page *page,
if (s->flags & SLAB_RED_ZONE) {
if (!check_bytes_and_report(s, page, object, "Redzone",
+ object - s->red_left_pad, val, s->red_left_pad))
+ return 0;
+
+ if (!check_bytes_and_report(s, page, object, "Redzone",
endobject, val, s->inuse - s->object_size))
return 0;
} else {
@@ -930,14 +950,14 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
max_objects = MAX_OBJS_PER_PAGE;
if (page->objects != max_objects) {
- slab_err(s, page, "Wrong number of objects. Found %d but "
- "should be %d", page->objects, max_objects);
+ slab_err(s, page, "Wrong number of objects. Found %d but should be %d",
+ page->objects, max_objects);
page->objects = max_objects;
slab_fix(s, "Number of objects adjusted.");
}
if (page->inuse != page->objects - nr) {
- slab_err(s, page, "Wrong object count. Counter is %d but "
- "counted were %d", page->inuse, page->objects - nr);
+ slab_err(s, page, "Wrong object count. Counter is %d but counted were %d",
+ page->inuse, page->objects - nr);
page->inuse = page->objects - nr;
slab_fix(s, "Object count adjusted.");
}
@@ -1031,20 +1051,32 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
init_tracking(s, object);
}
-static noinline int alloc_debug_processing(struct kmem_cache *s,
+static inline int alloc_consistency_checks(struct kmem_cache *s,
struct page *page,
void *object, unsigned long addr)
{
if (!check_slab(s, page))
- goto bad;
+ return 0;
if (!check_valid_pointer(s, page, object)) {
object_err(s, page, object, "Freelist Pointer check fails");
- goto bad;
+ return 0;
}
if (!check_object(s, page, object, SLUB_RED_INACTIVE))
- goto bad;
+ return 0;
+
+ return 1;
+}
+
+static noinline int alloc_debug_processing(struct kmem_cache *s,
+ struct page *page,
+ void *object, unsigned long addr)
+{
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!alloc_consistency_checks(s, page, object, addr))
+ goto bad;
+ }
/* Success perform special debug activities for allocs */
if (s->flags & SLAB_STORE_USER)
@@ -1067,42 +1099,26 @@ bad:
return 0;
}
-/* Supports checking bulk free of a constructed freelist */
-static noinline struct kmem_cache_node *free_debug_processing(
- struct kmem_cache *s, struct page *page,
- void *head, void *tail, int bulk_cnt,
- unsigned long addr, unsigned long *flags)
+static inline int free_consistency_checks(struct kmem_cache *s,
+ struct page *page, void *object, unsigned long addr)
{
- struct kmem_cache_node *n = get_node(s, page_to_nid(page));
- void *object = head;
- int cnt = 0;
-
- spin_lock_irqsave(&n->list_lock, *flags);
- slab_lock(page);
-
- if (!check_slab(s, page))
- goto fail;
-
-next_object:
- cnt++;
-
if (!check_valid_pointer(s, page, object)) {
slab_err(s, page, "Invalid object pointer 0x%p", object);
- goto fail;
+ return 0;
}
if (on_freelist(s, page, object)) {
object_err(s, page, object, "Object already free");
- goto fail;
+ return 0;
}
if (!check_object(s, page, object, SLUB_RED_ACTIVE))
- goto out;
+ return 0;
if (unlikely(s != page->slab_cache)) {
if (!PageSlab(page)) {
- slab_err(s, page, "Attempt to free object(0x%p) "
- "outside of slab", object);
+ slab_err(s, page, "Attempt to free object(0x%p) outside of slab",
+ object);
} else if (!page->slab_cache) {
pr_err("SLUB <none>: no slab for object 0x%p.\n",
object);
@@ -1110,7 +1126,37 @@ next_object:
} else
object_err(s, page, object,
"page slab pointer corrupt.");
- goto fail;
+ return 0;
+ }
+ return 1;
+}
+
+/* Supports checking bulk free of a constructed freelist */
+static noinline int free_debug_processing(
+ struct kmem_cache *s, struct page *page,
+ void *head, void *tail, int bulk_cnt,
+ unsigned long addr)
+{
+ struct kmem_cache_node *n = get_node(s, page_to_nid(page));
+ void *object = head;
+ int cnt = 0;
+ unsigned long uninitialized_var(flags);
+ int ret = 0;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ slab_lock(page);
+
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!check_slab(s, page))
+ goto out;
+ }
+
+next_object:
+ cnt++;
+
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!free_consistency_checks(s, page, object, addr))
+ goto out;
}
if (s->flags & SLAB_STORE_USER)
@@ -1124,23 +1170,18 @@ next_object:
object = get_freepointer(s, object);
goto next_object;
}
+ ret = 1;
+
out:
if (cnt != bulk_cnt)
slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
bulk_cnt, cnt);
slab_unlock(page);
- /*
- * Keep node_lock to preserve integrity
- * until the object is actually freed
- */
- return n;
-
-fail:
- slab_unlock(page);
- spin_unlock_irqrestore(&n->list_lock, *flags);
- slab_fix(s, "Object at 0x%p not freed", object);
- return NULL;
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ if (!ret)
+ slab_fix(s, "Object at 0x%p not freed", object);
+ return ret;
}
static int __init setup_slub_debug(char *str)
@@ -1172,7 +1213,7 @@ static int __init setup_slub_debug(char *str)
for (; *str && *str != ','; str++) {
switch (tolower(*str)) {
case 'f':
- slub_debug |= SLAB_DEBUG_FREE;
+ slub_debug |= SLAB_CONSISTENCY_CHECKS;
break;
case 'z':
slub_debug |= SLAB_RED_ZONE;
@@ -1231,10 +1272,10 @@ static inline void setup_object_debug(struct kmem_cache *s,
static inline int alloc_debug_processing(struct kmem_cache *s,
struct page *page, void *object, unsigned long addr) { return 0; }
-static inline struct kmem_cache_node *free_debug_processing(
+static inline int free_debug_processing(
struct kmem_cache *s, struct page *page,
void *head, void *tail, int bulk_cnt,
- unsigned long addr, unsigned long *flags) { return NULL; }
+ unsigned long addr) { return 0; }
static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
{ return 1; }
@@ -1272,7 +1313,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
{
kmemleak_alloc(ptr, size, 1, flags);
- kasan_kmalloc_large(ptr, size);
+ kasan_kmalloc_large(ptr, size, flags);
}
static inline void kfree_hook(const void *x)
@@ -1281,36 +1322,6 @@ static inline void kfree_hook(const void *x)
kasan_kfree_large(x);
}
-static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
- gfp_t flags)
-{
- flags &= gfp_allowed_mask;
- lockdep_trace_alloc(flags);
- might_sleep_if(gfpflags_allow_blocking(flags));
-
- if (should_failslab(s->object_size, flags, s->flags))
- return NULL;
-
- return memcg_kmem_get_cache(s, flags);
-}
-
-static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
- size_t size, void **p)
-{
- size_t i;
-
- flags &= gfp_allowed_mask;
- for (i = 0; i < size; i++) {
- void *object = p[i];
-
- kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
- kmemleak_alloc_recursive(object, s->object_size, 1,
- s->flags, flags);
- kasan_slab_alloc(s, object);
- }
- memcg_kmem_put_cache(s);
-}
-
static inline void slab_free_hook(struct kmem_cache *s, void *x)
{
kmemleak_free_recursive(x, s->flags);
@@ -1415,7 +1426,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
*/
alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
- alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM;
+ alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
page = alloc_slab_page(s, alloc_gfp, node, oo);
if (unlikely(!page)) {
@@ -1470,7 +1481,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
set_freepointer(s, p, NULL);
}
- page->freelist = start;
+ page->freelist = fixup_red_left(s, start);
page->inuse = page->objects;
page->frozen = 1;
@@ -1506,7 +1517,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
int order = compound_order(page);
int pages = 1 << order;
- if (kmem_cache_debug(s)) {
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
void *p;
slab_pad_check(s, page);
@@ -1528,7 +1539,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
page_mapcount_reset(page);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
- __free_kmem_pages(page, order);
+ memcg_uncharge_slab(page, order, s);
+ __free_pages(page, order);
}
#define need_reserve_slab_rcu \
@@ -2224,8 +2236,8 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
return;
- pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
- nid, gfpflags);
+ pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
+ nid, gfpflags, &gfpflags);
pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n",
s->name, s->object_size, s->size, oo_order(s->oo),
oo_order(s->min));
@@ -2584,7 +2596,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
{
void *ret = slab_alloc(s, gfpflags, _RET_IP_);
trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
- kasan_kmalloc(s, ret, size);
+ kasan_kmalloc(s, ret, size, gfpflags);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_trace);
@@ -2612,7 +2624,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
trace_kmalloc_node(_RET_IP_, ret,
size, s->size, gfpflags, node);
- kasan_kmalloc(s, ret, size);
+ kasan_kmalloc(s, ret, size, gfpflags);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
@@ -2642,8 +2654,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
stat(s, FREE_SLOWPATH);
if (kmem_cache_debug(s) &&
- !(n = free_debug_processing(s, page, head, tail, cnt,
- addr, &flags)))
+ !free_debug_processing(s, page, head, tail, cnt, addr))
return;
do {
@@ -2837,23 +2848,38 @@ int build_detached_freelist(struct kmem_cache *s, size_t size,
size_t first_skipped_index = 0;
int lookahead = 3;
void *object;
+ struct page *page;
/* Always re-init detached_freelist */
df->page = NULL;
do {
object = p[--size];
+ /* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */
} while (!object && size);
if (!object)
return 0;
- /* Support for memcg, compiler can optimize this out */
- df->s = cache_from_obj(s, object);
+ page = virt_to_head_page(object);
+ if (!s) {
+ /* Handle kmalloc'ed objects */
+ if (unlikely(!PageSlab(page))) {
+ BUG_ON(!PageCompound(page));
+ kfree_hook(object);
+ __free_kmem_pages(page, compound_order(page));
+ p[size] = NULL; /* mark object processed */
+ return size;
+ }
+ /* Derive kmem_cache from object */
+ df->s = page->slab_cache;
+ } else {
+ df->s = cache_from_obj(s, object); /* Support for memcg */
+ }
/* Start new detached freelist */
+ df->page = page;
set_freepointer(df->s, object, NULL);
- df->page = virt_to_head_page(object);
df->tail = object;
df->freelist = object;
p[size] = NULL; /* mark object processed */
@@ -3156,7 +3182,8 @@ static void early_kmem_cache_node_alloc(int node)
init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
init_tracking(kmem_cache_node, n);
#endif
- kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node));
+ kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node),
+ GFP_KERNEL);
init_kmem_cache_node(n);
inc_slabs_node(kmem_cache_node, node, page->objects);
@@ -3285,7 +3312,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
*/
size += 2 * sizeof(struct track);
- if (flags & SLAB_RED_ZONE)
+ if (flags & SLAB_RED_ZONE) {
/*
* Add some empty padding so that we can catch
* overwrites from earlier objects rather than let
@@ -3294,6 +3321,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
* of the object.
*/
size += sizeof(void *);
+
+ s->red_left_pad = sizeof(void *);
+ s->red_left_pad = ALIGN(s->red_left_pad, s->align);
+ size += s->red_left_pad;
+ }
#endif
/*
@@ -3357,7 +3389,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
- if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
+ if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0)
/* Enable fast mode */
s->flags |= __CMPXCHG_DOUBLE;
#endif
@@ -3408,10 +3440,9 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
free_kmem_cache_nodes(s);
error:
if (flags & SLAB_PANIC)
- panic("Cannot create slab %s size=%lu realsize=%u "
- "order=%u offset=%u flags=%lx\n",
- s->name, (unsigned long)s->size, s->size,
- oo_order(s->oo), s->offset, flags);
+ panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n",
+ s->name, (unsigned long)s->size, s->size,
+ oo_order(s->oo), s->offset, flags);
return -EINVAL;
}
@@ -3531,7 +3562,7 @@ void *__kmalloc(size_t size, gfp_t flags)
trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
- kasan_kmalloc(s, ret, size);
+ kasan_kmalloc(s, ret, size, flags);
return ret;
}
@@ -3576,7 +3607,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
- kasan_kmalloc(s, ret, size);
+ kasan_kmalloc(s, ret, size, flags);
return ret;
}
@@ -3605,7 +3636,7 @@ size_t ksize(const void *object)
size_t size = __ksize(object);
/* We assume that ksize callers could use whole allocated area,
so we need unpoison this area. */
- kasan_krealloc(object, size);
+ kasan_krealloc(object, size, GFP_NOWAIT);
return size;
}
EXPORT_SYMBOL(ksize);
@@ -4812,16 +4843,16 @@ SLAB_ATTR_RO(total_objects);
static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
}
static ssize_t sanity_checks_store(struct kmem_cache *s,
const char *buf, size_t length)
{
- s->flags &= ~SLAB_DEBUG_FREE;
+ s->flags &= ~SLAB_CONSISTENCY_CHECKS;
if (buf[0] == '1') {
s->flags &= ~__CMPXCHG_DOUBLE;
- s->flags |= SLAB_DEBUG_FREE;
+ s->flags |= SLAB_CONSISTENCY_CHECKS;
}
return length;
}
@@ -4865,7 +4896,6 @@ static ssize_t red_zone_store(struct kmem_cache *s,
s->flags &= ~SLAB_RED_ZONE;
if (buf[0] == '1') {
- s->flags &= ~__CMPXCHG_DOUBLE;
s->flags |= SLAB_RED_ZONE;
}
calculate_sizes(s, -1);
@@ -4886,7 +4916,6 @@ static ssize_t poison_store(struct kmem_cache *s,
s->flags &= ~SLAB_POISON;
if (buf[0] == '1') {
- s->flags &= ~__CMPXCHG_DOUBLE;
s->flags |= SLAB_POISON;
}
calculate_sizes(s, -1);
@@ -5356,7 +5385,7 @@ static char *create_unique_id(struct kmem_cache *s)
*p++ = 'd';
if (s->flags & SLAB_RECLAIM_ACCOUNT)
*p++ = 'a';
- if (s->flags & SLAB_DEBUG_FREE)
+ if (s->flags & SLAB_CONSISTENCY_CHECKS)
*p++ = 'F';
if (!(s->flags & SLAB_NOTRACK))
*p++ = 't';
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index b60802b3e..68885dcba 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -166,8 +166,8 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
int actual_node = early_pfn_to_nid(pfn);
if (node_distance(actual_node, node) > LOCAL_DISTANCE)
- printk(KERN_WARNING "[%lx-%lx] potential offnode "
- "page_structs\n", start, end - 1);
+ pr_warn("[%lx-%lx] potential offnode page_structs\n",
+ start, end - 1);
}
pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
@@ -292,8 +292,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
if (map_map[pnum])
continue;
ms = __nr_to_section(pnum);
- printk(KERN_ERR "%s: sparsemem memory map backing failed "
- "some memory will not be available.\n", __func__);
+ pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
+ __func__);
ms->section_mem_map = 0;
}
diff --git a/mm/sparse.c b/mm/sparse.c
index 3717ceed4..5d0cf4540 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -313,9 +313,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
if (usemap_nid != nid) {
- printk(KERN_INFO
- "node %d must be removed before remove section %ld\n",
- nid, usemap_snr);
+ pr_info("node %d must be removed before remove section %ld\n",
+ nid, usemap_snr);
return;
}
/*
@@ -324,10 +323,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
* gather other removable sections for dynamic partitioning.
* Just notify un-removable section's number here.
*/
- printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr,
- pgdat_snr, nid);
- printk(KERN_CONT
- " have a circular dependency on usemap and pgdat allocations\n");
+ pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n",
+ usemap_snr, pgdat_snr, nid);
}
#else
static unsigned long * __init
@@ -355,7 +352,7 @@ static void __init sparse_early_usemaps_alloc_node(void *data,
usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
size * usemap_count);
if (!usemap) {
- printk(KERN_WARNING "%s: allocation failed\n", __func__);
+ pr_warn("%s: allocation failed\n", __func__);
return;
}
@@ -428,8 +425,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
if (map_map[pnum])
continue;
ms = __nr_to_section(pnum);
- printk(KERN_ERR "%s: sparsemem memory map backing failed "
- "some memory will not be available.\n", __func__);
+ pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
+ __func__);
ms->section_mem_map = 0;
}
}
@@ -456,8 +453,8 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
if (map)
return map;
- printk(KERN_ERR "%s: sparsemem memory map backing failed "
- "some memory will not be available.\n", __func__);
+ pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
+ __func__);
ms->section_mem_map = 0;
return NULL;
}
diff --git a/mm/swap.c b/mm/swap.c
index 09fe5e977..03aacbcb0 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -114,7 +114,7 @@ void put_pages_list(struct list_head *pages)
victim = list_entry(pages->prev, struct page, lru);
list_del(&victim->lru);
- page_cache_release(victim);
+ put_page(victim);
}
}
EXPORT_SYMBOL(put_pages_list);
@@ -142,7 +142,7 @@ int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
return seg;
pages[seg] = kmap_to_page(kiov[seg].iov_base);
- page_cache_get(pages[seg]);
+ get_page(pages[seg]);
}
return seg;
@@ -236,7 +236,7 @@ void rotate_reclaimable_page(struct page *page)
struct pagevec *pvec;
unsigned long flags;
- page_cache_get(page);
+ get_page(page);
local_irq_save(flags);
pvec = this_cpu_ptr(&lru_rotate_pvecs);
if (!pagevec_add(pvec, page))
@@ -294,7 +294,7 @@ void activate_page(struct page *page)
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
- page_cache_get(page);
+ get_page(page);
if (!pagevec_add(pvec, page))
pagevec_lru_move_fn(pvec, __activate_page, NULL);
put_cpu_var(activate_page_pvecs);
@@ -389,7 +389,7 @@ static void __lru_cache_add(struct page *page)
{
struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
- page_cache_get(page);
+ get_page(page);
if (!pagevec_space(pvec))
__pagevec_lru_add(pvec);
pagevec_add(pvec, page);
@@ -646,7 +646,7 @@ void deactivate_page(struct page *page)
if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
- page_cache_get(page);
+ get_page(page);
if (!pagevec_add(pvec, page))
pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
put_cpu_var(lru_deactivate_pvecs);
@@ -698,7 +698,7 @@ void lru_add_drain_all(void)
}
/**
- * release_pages - batched page_cache_release()
+ * release_pages - batched put_page()
* @pages: array of pages to release
* @nr: number of pages
* @cold: whether the pages are cache cold
@@ -728,6 +728,11 @@ void release_pages(struct page **pages, int nr, bool cold)
zone = NULL;
}
+ if (is_huge_zero_page(page)) {
+ put_huge_zero_page();
+ continue;
+ }
+
page = compound_head(page);
if (!put_page_testzero(page))
continue;
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index b5f7f24b8..310ac0b8f 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -174,9 +174,8 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
return 0;
nomem:
- printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
- printk(KERN_INFO
- "swap_cgroup can be disabled by swapaccount=0 boot option\n");
+ pr_info("couldn't allocate enough memory for swap_cgroup\n");
+ pr_info("swap_cgroup can be disabled by swapaccount=0 boot option\n");
return -ENOMEM;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 69cb2464e..366ce3518 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -85,7 +85,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
VM_BUG_ON_PAGE(PageSwapCache(page), page);
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- page_cache_get(page);
+ get_page(page);
SetPageSwapCache(page);
set_page_private(page, entry.val);
@@ -109,7 +109,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
VM_BUG_ON(error == -EEXIST);
set_page_private(page, 0UL);
ClearPageSwapCache(page);
- page_cache_release(page);
+ put_page(page);
}
return error;
@@ -226,7 +226,7 @@ void delete_from_swap_cache(struct page *page)
spin_unlock_irq(&address_space->tree_lock);
swapcache_free(entry);
- page_cache_release(page);
+ put_page(page);
}
/*
@@ -252,7 +252,7 @@ static inline void free_swap_cache(struct page *page)
void free_page_and_swap_cache(struct page *page)
{
free_swap_cache(page);
- page_cache_release(page);
+ put_page(page);
}
/*
@@ -380,7 +380,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
} while (err != -ENOMEM);
if (new_page)
- page_cache_release(new_page);
+ put_page(new_page);
return found_page;
}
@@ -495,7 +495,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
continue;
if (offset != entry_offset)
SetPageReadahead(page);
- page_cache_release(page);
+ put_page(page);
}
blk_finish_plug(&plug);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 98d348347..dddefaeaa 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -48,6 +48,12 @@ static void free_swap_count_continuations(struct swap_info_struct *);
DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
atomic_long_t nr_swap_pages;
+/*
+ * Some modules use swappable objects and may try to swap them out under
+ * memory pressure (via the shrinker). Before doing so, they may wish to
+ * check to see if any swap space is available.
+ */
+EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
static int least_priority;
@@ -113,7 +119,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
ret = try_to_free_swap(page);
unlock_page(page);
}
- page_cache_release(page);
+ put_page(page);
return ret;
}
@@ -970,18 +976,19 @@ out:
* to it. And as a side-effect, free up its swap: because the old content
* on disk will never be read, and seeking back there to write new content
* later would only waste time away from clustering.
+ *
+ * NOTE: total_mapcount should not be relied upon by the caller if
+ * reuse_swap_page() returns false, but it may always be overwritten
+ * (see the other implementation for CONFIG_SWAP=n).
*/
-int reuse_swap_page(struct page *page)
+bool reuse_swap_page(struct page *page, int *total_mapcount)
{
int count;
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (unlikely(PageKsm(page)))
- return 0;
- /* The page is part of THP and cannot be reused */
- if (PageTransCompound(page))
- return 0;
- count = page_mapcount(page);
+ return false;
+ count = page_trans_huge_mapcount(page, total_mapcount);
if (count <= 1 && PageSwapCache(page)) {
count += page_swapcount(page);
if (count == 1 && !PageWriteback(page)) {
@@ -1048,7 +1055,7 @@ int free_swap_and_cache(swp_entry_t entry)
page = find_get_page(swap_address_space(entry),
entry.val);
if (page && !trylock_page(page)) {
- page_cache_release(page);
+ put_page(page);
page = NULL;
}
}
@@ -1065,7 +1072,7 @@ int free_swap_and_cache(swp_entry_t entry)
SetPageDirty(page);
}
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
return p != NULL;
}
@@ -1566,7 +1573,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
}
if (retval) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
break;
}
@@ -1618,7 +1625,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
*/
SetPageDirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
/*
* Make sure that we aren't completely killing
@@ -2580,8 +2587,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
(swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
- pr_info("Adding %uk swap on %s. "
- "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
+ pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
@@ -2623,7 +2629,7 @@ bad_swap:
out:
if (page && !IS_ERR(page)) {
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
if (name)
putname(name);
diff --git a/mm/truncate.c b/mm/truncate.c
index e3ee0e27c..b00272810 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -118,7 +118,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
return -EIO;
if (page_has_private(page))
- do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ do_invalidatepage(page, 0, PAGE_SIZE);
/*
* Some filesystems seem to re-dirty the page even after
@@ -159,8 +159,8 @@ int truncate_inode_page(struct address_space *mapping, struct page *page)
{
if (page_mapped(page)) {
unmap_mapping_range(mapping,
- (loff_t)page->index << PAGE_CACHE_SHIFT,
- PAGE_CACHE_SIZE, 0);
+ (loff_t)page->index << PAGE_SHIFT,
+ PAGE_SIZE, 0);
}
return truncate_complete_page(mapping, page);
}
@@ -241,8 +241,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
return;
/* Offsets within partial pages */
- partial_start = lstart & (PAGE_CACHE_SIZE - 1);
- partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
+ partial_start = lstart & (PAGE_SIZE - 1);
+ partial_end = (lend + 1) & (PAGE_SIZE - 1);
/*
* 'start' and 'end' always covers the range of pages to be fully
@@ -250,7 +250,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
* start of the range and 'partial_end' at the end of the range.
* Note that 'end' is exclusive while 'lend' is inclusive.
*/
- start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (lend == -1)
/*
* lend == -1 indicates end-of-file so we have to set 'end'
@@ -259,7 +259,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
*/
end = -1;
else
- end = (lend + 1) >> PAGE_CACHE_SHIFT;
+ end = (lend + 1) >> PAGE_SHIFT;
pagevec_init(&pvec, 0);
index = start;
@@ -298,7 +298,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
if (partial_start) {
struct page *page = find_lock_page(mapping, start - 1);
if (page) {
- unsigned int top = PAGE_CACHE_SIZE;
+ unsigned int top = PAGE_SIZE;
if (start > end) {
/* Truncation within a single page */
top = partial_end;
@@ -311,7 +311,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
do_invalidatepage(page, partial_start,
top - partial_start);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
}
if (partial_end) {
@@ -324,7 +324,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
do_invalidatepage(page, 0,
partial_end);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
}
/*
@@ -519,7 +519,6 @@ EXPORT_SYMBOL(invalidate_mapping_pages);
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
- struct mem_cgroup *memcg;
unsigned long flags;
if (page->mapping != mapping)
@@ -528,24 +527,21 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
return 0;
- memcg = mem_cgroup_begin_page_stat(page);
spin_lock_irqsave(&mapping->tree_lock, flags);
if (PageDirty(page))
goto failed;
BUG_ON(page_has_private(page));
- __delete_from_page_cache(page, NULL, memcg);
+ __delete_from_page_cache(page, NULL);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
if (mapping->a_ops->freepage)
mapping->a_ops->freepage(page);
- page_cache_release(page); /* pagecache ref */
+ put_page(page); /* pagecache ref */
return 1;
failed:
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
return 0;
}
@@ -612,18 +608,18 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
* Zap the rest of the file in one hit.
*/
unmap_mapping_range(mapping,
- (loff_t)index << PAGE_CACHE_SHIFT,
+ (loff_t)index << PAGE_SHIFT,
(loff_t)(1 + end - index)
- << PAGE_CACHE_SHIFT,
- 0);
+ << PAGE_SHIFT,
+ 0);
did_range_unmap = 1;
} else {
/*
* Just zap this page
*/
unmap_mapping_range(mapping,
- (loff_t)index << PAGE_CACHE_SHIFT,
- PAGE_CACHE_SIZE, 0);
+ (loff_t)index << PAGE_SHIFT,
+ PAGE_SIZE, 0);
}
}
BUG_ON(page_mapped(page));
@@ -748,14 +744,14 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
WARN_ON(to > inode->i_size);
- if (from >= to || bsize == PAGE_CACHE_SIZE)
+ if (from >= to || bsize == PAGE_SIZE)
return;
/* Page straddling @from will not have any hole block created? */
rounded_from = round_up(from, bsize);
- if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1)))
+ if (to <= rounded_from || !(rounded_from & (PAGE_SIZE - 1)))
return;
- index = from >> PAGE_CACHE_SHIFT;
+ index = from >> PAGE_SHIFT;
page = find_lock_page(inode->i_mapping, index);
/* Page not cached? Nothing to do */
if (!page)
@@ -767,7 +763,7 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
if (page_mkclean(page))
set_page_dirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
EXPORT_SYMBOL(pagecache_isize_extended);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 806b0c758..af817e506 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -93,7 +93,7 @@ out_release_uncharge_unlock:
pte_unmap_unlock(dst_pte, ptl);
mem_cgroup_cancel_charge(page, memcg, false);
out_release:
- page_cache_release(page);
+ put_page(page);
goto out;
}
@@ -230,8 +230,7 @@ retry:
break;
}
if (unlikely(pmd_none(dst_pmdval)) &&
- unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd,
- dst_addr))) {
+ unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) {
err = -ENOMEM;
break;
}
@@ -288,7 +287,7 @@ out_unlock:
up_read(&dst_mm->mmap_sem);
out:
if (page)
- page_cache_release(page);
+ put_page(page);
BUG_ON(copied < 0);
BUG_ON(err > 0);
BUG_ON(!copied && !err);
diff --git a/mm/util.c b/mm/util.c
index 4fb14ca5a..6cc81e7b8 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -283,9 +283,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
int __weak get_user_pages_fast(unsigned long start,
int nr_pages, int write, struct page **pages)
{
- struct mm_struct *mm = current->mm;
- return get_user_pages_unlocked(current, mm, start, nr_pages,
- write, 0, pages);
+ return get_user_pages_unlocked(start, nr_pages, write, 0, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
@@ -396,6 +394,13 @@ int __page_mapcount(struct page *page)
}
EXPORT_SYMBOL_GPL(__page_mapcount);
+int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
+int sysctl_overcommit_ratio __read_mostly = 50;
+unsigned long sysctl_overcommit_kbytes __read_mostly;
+int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
+unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
+unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
+
int overcommit_ratio_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
@@ -437,6 +442,123 @@ unsigned long vm_commit_limit(void)
return allowed;
}
+/*
+ * Make sure vm_committed_as sits in its own cacheline and does not share it
+ * with other variables. It can be updated by several CPUs frequently.
+ */
+struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
+
+/*
+ * The global memory commitment made in the system can be a metric
+ * that can be used to drive ballooning decisions when Linux is hosted
+ * as a guest. On Hyper-V, the host implements a policy engine for dynamically
+ * balancing memory across competing virtual machines that are hosted.
+ * Several metrics drive this policy engine including the guest reported
+ * memory commitment.
+ */
+unsigned long vm_memory_committed(void)
+{
+ return percpu_counter_read_positive(&vm_committed_as);
+}
+EXPORT_SYMBOL_GPL(vm_memory_committed);
+
+/*
+ * Check that a process has enough memory to allocate a new virtual
+ * mapping. 0 means there is enough memory for the allocation to
+ * succeed and -ENOMEM implies there is not.
+ *
+ * We currently support three overcommit policies, which are set via the
+ * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
+ *
+ * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
+ * Additional code 2002 Jul 20 by Robert Love.
+ *
+ * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
+ *
+ * Note this is a helper function intended to be used by LSMs which
+ * wish to use this logic.
+ */
+int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
+{
+ long free, allowed, reserve;
+
+ VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
+ -(s64)vm_committed_as_batch * num_online_cpus(),
+ "memory commitment underflow");
+
+ vm_acct_memory(pages);
+
+ /*
+ * Sometimes we want to use more memory than we have
+ */
+ if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
+ return 0;
+
+ if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
+ free = global_page_state(NR_FREE_PAGES);
+ free += global_page_state(NR_FILE_PAGES);
+
+ /*
+ * shmem pages shouldn't be counted as free in this
+ * case, they can't be purged, only swapped out, and
+ * that won't affect the overall amount of available
+ * memory in the system.
+ */
+ free -= global_page_state(NR_SHMEM);
+
+ free += get_nr_swap_pages();
+
+ /*
+ * Any slabs which are created with the
+ * SLAB_RECLAIM_ACCOUNT flag claim to have contents
+ * which are reclaimable, under pressure. The dentry
+ * cache and most inode caches should fall into this category.
+ */
+ free += global_page_state(NR_SLAB_RECLAIMABLE);
+
+ /*
+ * Leave reserved pages. The pages are not for anonymous pages.
+ */
+ if (free <= totalreserve_pages)
+ goto error;
+ else
+ free -= totalreserve_pages;
+
+ /*
+ * Reserve some for root
+ */
+ if (!cap_sys_admin)
+ free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
+
+ if (free > pages)
+ return 0;
+
+ goto error;
+ }
+
+ allowed = vm_commit_limit();
+ /*
+ * Reserve some for root
+ */
+ if (!cap_sys_admin)
+ allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
+
+ /*
+ * Don't let a single process grow so big a user can't recover
+ */
+ if (mm) {
+ reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+ allowed -= min_t(long, mm->total_vm / 32, reserve);
+ }
+
+ if (percpu_counter_read_positive(&vm_committed_as) < allowed)
+ return 0;
+error:
+ vm_unacct_memory(pages);
+
+ return -ENOMEM;
+}
+
/**
* get_cmdline() - copy the cmdline value to a buffer.
* @task: the task whose cmdline value to copy.
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index fb42a5bff..ae7d20b44 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -469,8 +469,8 @@ overflow:
goto retry;
}
if (printk_ratelimit())
- pr_warn("vmap allocation for size %lu failed: "
- "use vmalloc=<size> to increase size.\n", size);
+ pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
+ size);
kfree(va);
return ERR_PTR(-EBUSY);
}
@@ -531,22 +531,21 @@ static void unmap_vmap_area(struct vmap_area *va)
static void vmap_debug_free_range(unsigned long start, unsigned long end)
{
/*
- * Unmap page tables and force a TLB flush immediately if
- * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free
- * bugs similarly to those in linear kernel virtual address
- * space after a page has been freed.
+ * Unmap page tables and force a TLB flush immediately if pagealloc
+ * debugging is enabled. This catches use after free bugs similarly to
+ * those in linear kernel virtual address space after a page has been
+ * freed.
*
- * All the lazy freeing logic is still retained, in order to
- * minimise intrusiveness of this debugging feature.
+ * All the lazy freeing logic is still retained, in order to minimise
+ * intrusiveness of this debugging feature.
*
- * This is going to be *slow* (linear kernel virtual address
- * debugging doesn't do a broadcast TLB flush so it is a lot
- * faster).
+ * This is going to be *slow* (linear kernel virtual address debugging
+ * doesn't do a broadcast TLB flush so it is a lot faster).
*/
-#ifdef CONFIG_DEBUG_PAGEALLOC
- vunmap_page_range(start, end);
- flush_tlb_kernel_range(start, end);
-#endif
+ if (debug_pagealloc_enabled()) {
+ vunmap_page_range(start, end);
+ flush_tlb_kernel_range(start, end);
+ }
}
/*
@@ -1086,7 +1085,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
BUG_ON(!addr);
BUG_ON(addr < VMALLOC_START);
BUG_ON(addr > VMALLOC_END);
- BUG_ON(!IS_ALIGNED(addr, PAGE_SIZE));
+ BUG_ON(!PAGE_ALIGNED(addr));
debug_check_no_locks_freed(mem, size);
vmap_debug_free_range(addr, addr+size);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0da572753..87cbf6cd6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -195,25 +195,25 @@ static unsigned long zone_reclaimable_pages(struct zone *zone)
{
unsigned long nr;
- nr = zone_page_state(zone, NR_ACTIVE_FILE) +
- zone_page_state(zone, NR_INACTIVE_FILE) +
- zone_page_state(zone, NR_ISOLATED_FILE);
+ nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) +
+ zone_page_state_snapshot(zone, NR_INACTIVE_FILE) +
+ zone_page_state_snapshot(zone, NR_ISOLATED_FILE);
if (get_nr_swap_pages() > 0)
- nr += zone_page_state(zone, NR_ACTIVE_ANON) +
- zone_page_state(zone, NR_INACTIVE_ANON) +
- zone_page_state(zone, NR_ISOLATED_ANON);
+ nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) +
+ zone_page_state_snapshot(zone, NR_INACTIVE_ANON) +
+ zone_page_state_snapshot(zone, NR_ISOLATED_ANON);
return nr;
}
bool zone_reclaimable(struct zone *zone)
{
- return zone_page_state(zone, NR_PAGES_SCANNED) <
+ return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) <
zone_reclaimable_pages(zone) * 6;
}
-static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
if (!mem_cgroup_disabled())
return mem_cgroup_get_lru_size(lruvec, lru);
@@ -228,14 +228,6 @@ int register_shrinker(struct shrinker *shrinker)
{
size_t size = sizeof(*shrinker->nr_deferred);
- /*
- * If we only have one possible node in the system anyway, save
- * ourselves the trouble and disable NUMA aware behavior. This way we
- * will save memory and some small loop time later.
- */
- if (nr_node_ids == 1)
- shrinker->flags &= ~SHRINKER_NUMA_AWARE;
-
if (shrinker->flags & SHRINKER_NUMA_AWARE)
size *= nr_node_ids;
@@ -390,9 +382,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
*
* @memcg specifies the memory cgroup to target. If it is not NULL,
* only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
- * objects from the memory cgroup specified. Otherwise all shrinkers
- * are called, and memcg aware shrinkers are supposed to scan the
- * global list then.
+ * objects from the memory cgroup specified. Otherwise, only unaware
+ * shrinkers are called.
*
* @nr_scanned and @nr_eligible form a ratio that indicate how much of
* the available objects should be scanned. Page reclaim for example
@@ -412,7 +403,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
struct shrinker *shrinker;
unsigned long freed = 0;
- if (memcg && !memcg_kmem_online(memcg))
+ if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
return 0;
if (nr_scanned == 0)
@@ -436,7 +427,13 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
.memcg = memcg,
};
- if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE))
+ /*
+ * If kernel memory accounting is disabled, we ignore
+ * SHRINKER_MEMCG_AWARE flag and call all shrinkers
+ * passing NULL for memcg.
+ */
+ if (memcg_kmem_enabled() &&
+ !!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE))
continue;
if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
@@ -611,12 +608,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
bool reclaimed)
{
unsigned long flags;
- struct mem_cgroup *memcg;
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
- memcg = mem_cgroup_begin_page_stat(page);
spin_lock_irqsave(&mapping->tree_lock, flags);
/*
* The non racy check for a busy page.
@@ -643,11 +638,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
* Note that if SetPageDirty is always performed via set_page_dirty,
* and thus under tree_lock, then this ordering is not required.
*/
- if (!page_freeze_refs(page, 2))
+ if (!page_ref_freeze(page, 2))
goto cannot_free;
/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
if (unlikely(PageDirty(page))) {
- page_unfreeze_refs(page, 2);
+ page_ref_unfreeze(page, 2);
goto cannot_free;
}
@@ -656,7 +651,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
mem_cgroup_swapout(page, swap);
__delete_from_swap_cache(page);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
swapcache_free(swap);
} else {
void (*freepage)(struct page *);
@@ -682,9 +676,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (reclaimed && page_is_file_cache(page) &&
!mapping_exiting(mapping) && !dax_mapping(mapping))
shadow = workingset_eviction(mapping, page);
- __delete_from_page_cache(page, shadow, memcg);
+ __delete_from_page_cache(page, shadow);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
if (freepage != NULL)
freepage(page);
@@ -694,7 +687,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
cannot_free:
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
return 0;
}
@@ -712,7 +704,7 @@ int remove_mapping(struct address_space *mapping, struct page *page)
* drops the pagecache ref for us without requiring another
* atomic operation.
*/
- page_unfreeze_refs(page, 1);
+ page_ref_unfreeze(page, 1);
return 1;
}
return 0;
@@ -1931,8 +1923,8 @@ static bool inactive_file_is_low(struct lruvec *lruvec)
unsigned long inactive;
unsigned long active;
- inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
- active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
+ inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
+ active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
return active > inactive;
}
@@ -2071,7 +2063,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* system is under heavy pressure.
*/
if (!inactive_file_is_low(lruvec) &&
- get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
+ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -2097,10 +2089,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* anon in [0], file in [1]
*/
- anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
- get_lru_size(lruvec, LRU_INACTIVE_ANON);
- file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
- get_lru_size(lruvec, LRU_INACTIVE_FILE);
+ anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
+ lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
+ file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
+ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
spin_lock_irq(&zone->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
@@ -2138,7 +2130,7 @@ out:
unsigned long size;
unsigned long scan;
- size = get_lru_size(lruvec, lru);
+ size = lruvec_lru_size(lruvec, lru);
scan = size >> sc->priority;
if (!scan && pass && force_scan)
@@ -2990,18 +2982,23 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
} while (memcg);
}
-static bool zone_balanced(struct zone *zone, int order,
- unsigned long balance_gap, int classzone_idx)
+static bool zone_balanced(struct zone *zone, int order, bool highorder,
+ unsigned long balance_gap, int classzone_idx)
{
- if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
- balance_gap, classzone_idx))
- return false;
+ unsigned long mark = high_wmark_pages(zone) + balance_gap;
- if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone,
- order, 0, classzone_idx) == COMPACT_SKIPPED)
- return false;
+ /*
+ * When checking from pgdat_balanced(), kswapd should stop and sleep
+ * when it reaches the high order-0 watermark and let kcompactd take
+ * over. Other callers such as wakeup_kswapd() want to determine the
+ * true high-order watermark.
+ */
+ if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) {
+ mark += (1UL << order);
+ order = 0;
+ }
- return true;
+ return zone_watermark_ok_safe(zone, order, mark, classzone_idx);
}
/*
@@ -3051,7 +3048,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
continue;
}
- if (zone_balanced(zone, order, 0, i))
+ if (zone_balanced(zone, order, false, 0, i))
balanced_pages += zone->managed_pages;
else if (!order)
return false;
@@ -3105,10 +3102,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
*/
static bool kswapd_shrink_zone(struct zone *zone,
int classzone_idx,
- struct scan_control *sc,
- unsigned long *nr_attempted)
+ struct scan_control *sc)
{
- int testorder = sc->order;
unsigned long balance_gap;
bool lowmem_pressure;
@@ -3116,17 +3111,6 @@ static bool kswapd_shrink_zone(struct zone *zone,
sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
/*
- * Kswapd reclaims only single pages with compaction enabled. Trying
- * too hard to reclaim until contiguous free pages have become
- * available can hurt performance by evicting too much useful data
- * from memory. Do not reclaim more than needed for compaction.
- */
- if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
- compaction_suitable(zone, sc->order, 0, classzone_idx)
- != COMPACT_SKIPPED)
- testorder = 0;
-
- /*
* We put equal pressure on every zone, unless one zone has way too
* many pages free already. The "too many pages" is defined as the
* high wmark plus a "gap" where the gap is either the low
@@ -3140,15 +3124,12 @@ static bool kswapd_shrink_zone(struct zone *zone,
* reclaim is necessary
*/
lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
- if (!lowmem_pressure && zone_balanced(zone, testorder,
+ if (!lowmem_pressure && zone_balanced(zone, sc->order, false,
balance_gap, classzone_idx))
return true;
shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
- /* Account for the number of pages attempted to reclaim */
- *nr_attempted += sc->nr_to_reclaim;
-
clear_bit(ZONE_WRITEBACK, &zone->flags);
/*
@@ -3158,7 +3139,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
* waits.
*/
if (zone_reclaimable(zone) &&
- zone_balanced(zone, testorder, 0, classzone_idx)) {
+ zone_balanced(zone, sc->order, false, 0, classzone_idx)) {
clear_bit(ZONE_CONGESTED, &zone->flags);
clear_bit(ZONE_DIRTY, &zone->flags);
}
@@ -3170,7 +3151,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at high_wmark_pages(zone).
*
- * Returns the final order kswapd was reclaiming at
+ * Returns the highest zone idx kswapd was reclaiming at
*
* There is special handling here for zones which are full of pinned pages.
* This can happen if the pages are all mlocked, or if they are all used by
@@ -3187,8 +3168,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
* interoperates with the page allocator fallback scheme to ensure that aging
* of pages is balanced across the zones.
*/
-static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
- int *classzone_idx)
+static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
{
int i;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
@@ -3205,9 +3185,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
count_vm_event(PAGEOUTRUN);
do {
- unsigned long nr_attempted = 0;
bool raise_priority = true;
- bool pgdat_needs_compaction = (order > 0);
sc.nr_reclaimed = 0;
@@ -3242,7 +3220,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
break;
}
- if (!zone_balanced(zone, order, 0, 0)) {
+ if (!zone_balanced(zone, order, false, 0, 0)) {
end_zone = i;
break;
} else {
@@ -3258,24 +3236,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
if (i < 0)
goto out;
- for (i = 0; i <= end_zone; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- if (!populated_zone(zone))
- continue;
-
- /*
- * If any zone is currently balanced then kswapd will
- * not call compaction as it is expected that the
- * necessary pages are already available.
- */
- if (pgdat_needs_compaction &&
- zone_watermark_ok(zone, order,
- low_wmark_pages(zone),
- *classzone_idx, 0))
- pgdat_needs_compaction = false;
- }
-
/*
* If we're getting trouble reclaiming, start doing writepage
* even in laptop mode.
@@ -3319,8 +3279,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
* that that high watermark would be met at 100%
* efficiency.
*/
- if (kswapd_shrink_zone(zone, end_zone,
- &sc, &nr_attempted))
+ if (kswapd_shrink_zone(zone, end_zone, &sc))
raise_priority = false;
}
@@ -3333,49 +3292,29 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
pfmemalloc_watermark_ok(pgdat))
wake_up_all(&pgdat->pfmemalloc_wait);
- /*
- * Fragmentation may mean that the system cannot be rebalanced
- * for high-order allocations in all zones. If twice the
- * allocation size has been reclaimed and the zones are still
- * not balanced then recheck the watermarks at order-0 to
- * prevent kswapd reclaiming excessively. Assume that a
- * process requested a high-order can direct reclaim/compact.
- */
- if (order && sc.nr_reclaimed >= 2UL << order)
- order = sc.order = 0;
-
/* Check if kswapd should be suspending */
if (try_to_freeze() || kthread_should_stop())
break;
/*
- * Compact if necessary and kswapd is reclaiming at least the
- * high watermark number of pages as requsted
- */
- if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
- compact_pgdat(pgdat, order);
-
- /*
* Raise priority if scanning rate is too low or there was no
* progress in reclaiming pages
*/
if (raise_priority || !sc.nr_reclaimed)
sc.priority--;
} while (sc.priority >= 1 &&
- !pgdat_balanced(pgdat, order, *classzone_idx));
+ !pgdat_balanced(pgdat, order, classzone_idx));
out:
/*
- * Return the order we were reclaiming at so prepare_kswapd_sleep()
- * makes a decision on the order we were last reclaiming at. However,
- * if another caller entered the allocator slow path while kswapd
- * was awake, order will remain at the higher level
+ * Return the highest zone idx we were reclaiming at so
+ * prepare_kswapd_sleep() makes the same decisions as here.
*/
- *classzone_idx = end_zone;
- return order;
+ return end_zone;
}
-static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
+ int classzone_idx, int balanced_classzone_idx)
{
long remaining = 0;
DEFINE_WAIT(wait);
@@ -3386,7 +3325,22 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
/* Try to sleep for a short interval */
- if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+ if (prepare_kswapd_sleep(pgdat, order, remaining,
+ balanced_classzone_idx)) {
+ /*
+ * Compaction records what page blocks it recently failed to
+ * isolate pages from and skips them in the future scanning.
+ * When kswapd is going to sleep, pages have likely been freed
+ * and compaction may now succeed, so reset the cache.
+ */
+ reset_isolation_suitable(pgdat);
+
+ /*
+ * We have freed the memory, now we should compact it to make
+ * allocation of the requested order possible.
+ */
+ wakeup_kcompactd(pgdat, order, classzone_idx);
+
remaining = schedule_timeout(HZ/10);
finish_wait(&pgdat->kswapd_wait, &wait);
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -3396,7 +3350,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
* After a short sleep, check if it was a premature sleep. If not, then
* go fully to sleep until explicitly woken up.
*/
- if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+ if (prepare_kswapd_sleep(pgdat, order, remaining,
+ balanced_classzone_idx)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
/*
@@ -3409,14 +3364,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
*/
set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
- /*
- * Compaction records what page blocks it recently failed to
- * isolate pages from and skips them in the future scanning.
- * When kswapd is going to sleep, it is reasonable to assume
- * that pages and compaction may succeed so reset the cache.
- */
- reset_isolation_suitable(pgdat);
-
if (!kthread_should_stop())
schedule();
@@ -3446,7 +3393,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
static int kswapd(void *p)
{
unsigned long order, new_order;
- unsigned balanced_order;
int classzone_idx, new_classzone_idx;
int balanced_classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
@@ -3479,24 +3425,19 @@ static int kswapd(void *p)
set_freezable();
order = new_order = 0;
- balanced_order = 0;
classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
balanced_classzone_idx = classzone_idx;
for ( ; ; ) {
bool ret;
/*
- * If the last balance_pgdat was unsuccessful it's unlikely a
- * new request of a similar or harder type will succeed soon
- * so consider going to sleep on the basis we reclaimed at
+ * While we were reclaiming, there might have been another
+ * wakeup, so check the values.
*/
- if (balanced_classzone_idx >= new_classzone_idx &&
- balanced_order == new_order) {
- new_order = pgdat->kswapd_max_order;
- new_classzone_idx = pgdat->classzone_idx;
- pgdat->kswapd_max_order = 0;
- pgdat->classzone_idx = pgdat->nr_zones - 1;
- }
+ new_order = pgdat->kswapd_max_order;
+ new_classzone_idx = pgdat->classzone_idx;
+ pgdat->kswapd_max_order = 0;
+ pgdat->classzone_idx = pgdat->nr_zones - 1;
if (order < new_order || classzone_idx > new_classzone_idx) {
/*
@@ -3506,7 +3447,7 @@ static int kswapd(void *p)
order = new_order;
classzone_idx = new_classzone_idx;
} else {
- kswapd_try_to_sleep(pgdat, balanced_order,
+ kswapd_try_to_sleep(pgdat, order, classzone_idx,
balanced_classzone_idx);
order = pgdat->kswapd_max_order;
classzone_idx = pgdat->classzone_idx;
@@ -3526,9 +3467,8 @@ static int kswapd(void *p)
*/
if (!ret) {
trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
- balanced_classzone_idx = classzone_idx;
- balanced_order = balance_pgdat(pgdat, order,
- &balanced_classzone_idx);
+ balanced_classzone_idx = balance_pgdat(pgdat, order,
+ classzone_idx);
}
}
@@ -3563,7 +3503,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
}
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
- if (zone_balanced(zone, order, 0, 0))
+ if (zone_balanced(zone, order, true, 0, 0))
return;
trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 084c6725b..5e4300482 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -826,6 +826,7 @@ const char * const vmstat_text[] = {
"compact_stall",
"compact_fail",
"compact_success",
+ "compact_daemon_wake",
#endif
#ifdef CONFIG_HUGETLB_PAGE
@@ -847,6 +848,7 @@ const char * const vmstat_text[] = {
"thp_collapse_alloc_failed",
"thp_split_page",
"thp_split_page_failed",
+ "thp_deferred_split_page",
"thp_split_pmd",
"thp_zero_page_alloc",
"thp_zero_page_alloc_failed",
@@ -924,19 +926,6 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
#endif
#ifdef CONFIG_PROC_FS
-static char * const migratetype_names[MIGRATE_TYPES] = {
- "Unmovable",
- "Movable",
- "Reclaimable",
- "HighAtomic",
-#ifdef CONFIG_CMA
- "CMA",
-#endif
-#ifdef CONFIG_MEMORY_ISOLATION
- "Isolate",
-#endif
-};
-
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
struct zone *zone)
{
@@ -1133,7 +1122,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
#ifdef CONFIG_PAGE_OWNER
int mtype;
- if (!page_owner_inited)
+ if (!static_branch_unlikely(&page_owner_inited))
return;
drain_all_pages(NULL);
diff --git a/mm/workingset.c b/mm/workingset.c
index 61ead9e55..8a75f8d29 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -152,8 +152,25 @@
* refault distance will immediately activate the refaulting page.
*/
-static void *pack_shadow(unsigned long eviction, struct zone *zone)
+#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
+ ZONES_SHIFT + NODES_SHIFT + \
+ MEM_CGROUP_ID_SHIFT)
+#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
+
+/*
+ * Eviction timestamps need to be able to cover the full range of
+ * actionable refaults. However, bits are tight in the radix tree
+ * entry, and after storing the identifier for the lruvec there might
+ * not be enough left to represent every single actionable refault. In
+ * that case, we have to sacrifice granularity for distance, and group
+ * evictions into coarser buckets by shaving off lower timestamp bits.
+ */
+static unsigned int bucket_order __read_mostly;
+
+static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction)
{
+ eviction >>= bucket_order;
+ eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
@@ -161,45 +178,23 @@ static void *pack_shadow(unsigned long eviction, struct zone *zone)
return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
}
-static void unpack_shadow(void *shadow,
- struct zone **zone,
- unsigned long *distance)
+static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep,
+ unsigned long *evictionp)
{
unsigned long entry = (unsigned long)shadow;
- unsigned long eviction;
- unsigned long refault;
- unsigned long mask;
- int zid, nid;
+ int memcgid, nid, zid;
entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
zid = entry & ((1UL << ZONES_SHIFT) - 1);
entry >>= ZONES_SHIFT;
nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT;
- eviction = entry;
+ memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
+ entry >>= MEM_CGROUP_ID_SHIFT;
- *zone = NODE_DATA(nid)->node_zones + zid;
-
- refault = atomic_long_read(&(*zone)->inactive_age);
- mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT +
- RADIX_TREE_EXCEPTIONAL_SHIFT);
- /*
- * The unsigned subtraction here gives an accurate distance
- * across inactive_age overflows in most cases.
- *
- * There is a special case: usually, shadow entries have a
- * short lifetime and are either refaulted or reclaimed along
- * with the inode before they get too old. But it is not
- * impossible for the inactive_age to lap a shadow entry in
- * the field, which can then can result in a false small
- * refault distance, leading to a false activation should this
- * old entry actually refault again. However, earlier kernels
- * used to deactivate unconditionally with *every* reclaim
- * invocation for the longest time, so the occasional
- * inappropriate activation leading to pressure on the active
- * list is not a problem.
- */
- *distance = (refault - eviction) & mask;
+ *memcgidp = memcgid;
+ *zonep = NODE_DATA(nid)->node_zones + zid;
+ *evictionp = entry << bucket_order;
}
/**
@@ -212,11 +207,20 @@ static void unpack_shadow(void *shadow,
*/
void *workingset_eviction(struct address_space *mapping, struct page *page)
{
+ struct mem_cgroup *memcg = page_memcg(page);
struct zone *zone = page_zone(page);
+ int memcgid = mem_cgroup_id(memcg);
unsigned long eviction;
+ struct lruvec *lruvec;
- eviction = atomic_long_inc_return(&zone->inactive_age);
- return pack_shadow(eviction, zone);
+ /* Page is fully exclusive and pins page->mem_cgroup */
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ eviction = atomic_long_inc_return(&lruvec->inactive_age);
+ return pack_shadow(memcgid, zone, eviction);
}
/**
@@ -231,12 +235,64 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
bool workingset_refault(void *shadow)
{
unsigned long refault_distance;
+ unsigned long active_file;
+ struct mem_cgroup *memcg;
+ unsigned long eviction;
+ struct lruvec *lruvec;
+ unsigned long refault;
struct zone *zone;
+ int memcgid;
+
+ unpack_shadow(shadow, &memcgid, &zone, &eviction);
+
+ rcu_read_lock();
+ /*
+ * Look up the memcg associated with the stored ID. It might
+ * have been deleted since the page's eviction.
+ *
+ * Note that in rare events the ID could have been recycled
+ * for a new cgroup that refaults a shared page. This is
+ * impossible to tell from the available data. However, this
+ * should be a rare and limited disturbance, and activations
+ * are always speculative anyway. Ultimately, it's the aging
+ * algorithm's job to shake out the minimum access frequency
+ * for the active cache.
+ *
+ * XXX: On !CONFIG_MEMCG, this will always return NULL; it
+ * would be better if the root_mem_cgroup existed in all
+ * configurations instead.
+ */
+ memcg = mem_cgroup_from_id(memcgid);
+ if (!mem_cgroup_disabled() && !memcg) {
+ rcu_read_unlock();
+ return false;
+ }
+ lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ refault = atomic_long_read(&lruvec->inactive_age);
+ active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+ rcu_read_unlock();
+
+ /*
+ * The unsigned subtraction here gives an accurate distance
+ * across inactive_age overflows in most cases.
+ *
+ * There is a special case: usually, shadow entries have a
+ * short lifetime and are either refaulted or reclaimed along
+ * with the inode before they get too old. But it is not
+ * impossible for the inactive_age to lap a shadow entry in
+ * the field, which can then result in a false small
+ * refault distance, leading to a false activation should this
+ * old entry actually refault again. However, earlier kernels
+ * used to deactivate unconditionally with *every* reclaim
+ * invocation for the longest time, so the occasional
+ * inappropriate activation leading to pressure on the active
+ * list is not a problem.
+ */
+ refault_distance = (refault - eviction) & EVICTION_MASK;
- unpack_shadow(shadow, &zone, &refault_distance);
inc_zone_state(zone, WORKINGSET_REFAULT);
- if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) {
+ if (refault_distance <= active_file) {
inc_zone_state(zone, WORKINGSET_ACTIVATE);
return true;
}
@@ -249,7 +305,22 @@ bool workingset_refault(void *shadow)
*/
void workingset_activation(struct page *page)
{
- atomic_long_inc(&page_zone(page)->inactive_age);
+ struct lruvec *lruvec;
+
+ lock_page_memcg(page);
+ /*
+ * Filter non-memcg pages here, e.g. unmap can call
+ * mark_page_accessed() on VDSO pages.
+ *
+ * XXX: See workingset_refault() - this should return
+ * root_mem_cgroup even for !CONFIG_MEMCG.
+ */
+ if (!mem_cgroup_disabled() && !page_memcg(page))
+ goto out;
+ lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page));
+ atomic_long_inc(&lruvec->inactive_age);
+out:
+ unlock_page_memcg(page);
}
/*
@@ -278,7 +349,13 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
local_irq_enable();
- pages = node_present_pages(sc->nid);
+ if (memcg_kmem_enabled())
+ pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
+ LRU_ALL_FILE);
+ else
+ pages = node_page_state(sc->nid, NR_ACTIVE_FILE) +
+ node_page_state(sc->nid, NR_INACTIVE_FILE);
+
/*
* Active cache pages are limited to 50% of memory, and shadow
* entries that represent a refault distance bigger than that
@@ -387,7 +464,7 @@ static struct shrinker workingset_shadow_shrinker = {
.count_objects = count_shadow_nodes,
.scan_objects = scan_shadow_nodes,
.seeks = DEFAULT_SEEKS,
- .flags = SHRINKER_NUMA_AWARE,
+ .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
};
/*
@@ -398,8 +475,25 @@ static struct lock_class_key shadow_nodes_key;
static int __init workingset_init(void)
{
+ unsigned int timestamp_bits;
+ unsigned int max_order;
int ret;
+ BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
+ /*
+ * Calculate the eviction bucket size to cover the longest
+ * actionable refault distance, which is currently half of
+ * memory (totalram_pages/2). However, memory hotplug may add
+ * some more pages at runtime, so keep working with up to
+ * double the initial memory by using totalram_pages as-is.
+ */
+ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
+ max_order = fls_long(totalram_pages - 1);
+ if (max_order > timestamp_bits)
+ bucket_order = max_order - timestamp_bits;
+ printk("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
+ timestamp_bits, max_order, bucket_order);
+
ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
if (ret)
goto err;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 2d7c4c11f..fe47fbba9 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -281,7 +281,6 @@ struct mapping_area {
#endif
char *vm_addr; /* address of kmap_atomic()'ed pages */
enum zs_mapmode vm_mm; /* mapping mode */
- bool huge;
};
static int create_handle_cache(struct zs_pool *pool)
@@ -495,6 +494,8 @@ static void __exit zs_stat_exit(void)
debugfs_remove_recursive(zs_stat_root);
}
+static unsigned long zs_can_compact(struct size_class *class);
+
static int zs_stats_size_show(struct seq_file *s, void *v)
{
int i;
@@ -502,14 +503,15 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
struct size_class *class;
int objs_per_zspage;
unsigned long class_almost_full, class_almost_empty;
- unsigned long obj_allocated, obj_used, pages_used;
+ unsigned long obj_allocated, obj_used, pages_used, freeable;
unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
+ unsigned long total_freeable = 0;
- seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n",
+ seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n",
"class", "size", "almost_full", "almost_empty",
"obj_allocated", "obj_used", "pages_used",
- "pages_per_zspage");
+ "pages_per_zspage", "freeable");
for (i = 0; i < zs_size_classes; i++) {
class = pool->size_class[i];
@@ -522,6 +524,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
obj_used = zs_stat_get(class, OBJ_USED);
+ freeable = zs_can_compact(class);
spin_unlock(&class->lock);
objs_per_zspage = get_maxobj_per_zspage(class->size,
@@ -529,23 +532,25 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
pages_used = obj_allocated / objs_per_zspage *
class->pages_per_zspage;
- seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n",
+ seq_printf(s, " %5u %5u %11lu %12lu %13lu"
+ " %10lu %10lu %16d %8lu\n",
i, class->size, class_almost_full, class_almost_empty,
obj_allocated, obj_used, pages_used,
- class->pages_per_zspage);
+ class->pages_per_zspage, freeable);
total_class_almost_full += class_almost_full;
total_class_almost_empty += class_almost_empty;
total_objs += obj_allocated;
total_used_objs += obj_used;
total_pages += pages_used;
+ total_freeable += freeable;
}
seq_puts(s, "\n");
- seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n",
+ seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n",
"Total", "", total_class_almost_full,
total_class_almost_empty, total_objs,
- total_used_objs, total_pages);
+ total_used_objs, total_pages, "", total_freeable);
return 0;
}
@@ -1127,11 +1132,9 @@ static void __zs_unmap_object(struct mapping_area *area,
goto out;
buf = area->vm_buf;
- if (!area->huge) {
- buf = buf + ZS_HANDLE_SIZE;
- size -= ZS_HANDLE_SIZE;
- off += ZS_HANDLE_SIZE;
- }
+ buf = buf + ZS_HANDLE_SIZE;
+ size -= ZS_HANDLE_SIZE;
+ off += ZS_HANDLE_SIZE;
sizes[0] = PAGE_SIZE - off;
sizes[1] = size - sizes[0];
@@ -1732,10 +1735,13 @@ static struct page *isolate_source_page(struct size_class *class)
static unsigned long zs_can_compact(struct size_class *class)
{
unsigned long obj_wasted;
+ unsigned long obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
+ unsigned long obj_used = zs_stat_get(class, OBJ_USED);
- obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) -
- zs_stat_get(class, OBJ_USED);
+ if (obj_allocated <= obj_used)
+ return 0;
+ obj_wasted = obj_allocated - obj_used;
obj_wasted /= get_maxobj_per_zspage(class->size,
class->pages_per_zspage);
diff --git a/mm/zswap.c b/mm/zswap.c
index 340261946..de0f119b1 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -875,7 +875,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
case ZSWAP_SWAPCACHE_EXIST:
/* page is already in the swap cache, ignore for now */
- page_cache_release(page);
+ put_page(page);
ret = -EEXIST;
goto fail;
@@ -903,7 +903,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
/* start writeback */
__swap_writepage(page, &wbc, end_swap_bio_write);
- page_cache_release(page);
+ put_page(page);
zswap_written_back_pages++;
spin_lock(&tree->lock);