Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	726
1 file changed, 308 insertions(+), 418 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a3e27ba36..a2214c64e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -62,8 +62,8 @@
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
#include <linux/page_owner.h>
-#include <linux/tuxonice.h>
#include <linux/kthread.h>
+#include <linux/memcontrol.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -295,14 +295,6 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn)
return false;
}
-static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
-{
- if (pfn >= NODE_DATA(nid)->first_deferred_pfn)
- return true;
-
- return false;
-}
-
/*
* Returns false when the remaining initialisation should be deferred until
* later in the boot cycle when it can be parallelised.
@@ -342,11 +334,6 @@ static inline bool early_page_uninitialised(unsigned long pfn)
return false;
}
-static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
-{
- return false;
-}
-
static inline bool update_defer_init(pg_data_t *pgdat,
unsigned long pfn, unsigned long zone_end,
unsigned long *nr_initialised)
@@ -924,12 +911,6 @@ static void free_pages_check_bad(struct page *page)
if (unlikely(page->mem_cgroup))
bad_reason = "page still charged to cgroup";
#endif
- if (unlikely(PageTOI_Untracked(page))) {
- // Make it writable and included in image if allocated.
- ClearPageTOI_Untracked(page);
- // If it gets allocated, it will be dirty from TOI's POV.
- SetPageTOI_Dirty(page);
- }
bad_page(page, bad_reason, bad_flags);
}
@@ -1013,6 +994,8 @@ static __always_inline bool free_pages_prepare(struct page *page,
VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
+ if (compound)
+ ClearPageDoubleMap(page);
for (i = 1; i < (1 << order); i++) {
if (compound)
bad += free_tail_pages_check(page, page + i);
@@ -1023,8 +1006,10 @@ static __always_inline bool free_pages_prepare(struct page *page,
(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
}
}
- if (PageAnonHead(page))
+ if (PageMappingFlags(page))
page->mapping = NULL;
+ if (memcg_kmem_enabled() && PageKmemcg(page))
+ memcg_kmem_uncharge(page, order);
if (check_free)
bad += free_pages_check(page);
if (bad)
@@ -1091,9 +1076,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
spin_lock(&zone->lock);
isolated_pageblocks = has_isolate_pageblock(zone);
- nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+ nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
if (nr_scanned)
- __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+ __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
while (count) {
struct page *page;
@@ -1148,9 +1133,9 @@ static void free_one_page(struct zone *zone,
{
unsigned long nr_scanned;
spin_lock(&zone->lock);
- nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+ nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
if (nr_scanned)
- __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+ __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
@@ -1731,6 +1716,19 @@ static bool check_new_pages(struct page *page, unsigned int order)
return false;
}
+inline void post_alloc_hook(struct page *page, unsigned int order,
+ gfp_t gfp_flags)
+{
+ set_page_private(page, 0);
+ set_page_refcounted(page);
+
+ arch_alloc_page(page, order);
+ kernel_map_pages(page, 1 << order, 1);
+ kernel_poison_pages(page, 1 << order, 1);
+ kasan_alloc_pages(page, order);
+ set_page_owner(page, order, gfp_flags);
+}
+
static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
unsigned int alloc_flags)
{
@@ -1739,22 +1737,11 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
for (i = 0; i < (1 << order); i++) {
struct page *p = page + i;
- if (unlikely(toi_incremental_support() && gfp_flags & ___GFP_TOI_NOTRACK)) {
- // Make the page writable if it's protected, and set it to be untracked.
- SetPageTOI_Untracked(p);
- toi_make_writable(init_mm.pgd, (unsigned long) page_address(p));
- }
if (poisoned)
poisoned &= page_is_poisoned(p);
}
- set_page_private(page, 0);
- set_page_refcounted(page);
-
- arch_alloc_page(page, order);
- kernel_map_pages(page, 1 << order, 1);
- kernel_poison_pages(page, 1 << order, 1);
- kasan_alloc_pages(page, order);
+ post_alloc_hook(page, order, gfp_flags);
if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO))
for (i = 0; i < (1 << order); i++)
@@ -1763,8 +1750,6 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
- set_page_owner(page, order, gfp_flags);
-
/*
* page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
* allocate the page. The expectation is that the caller is taking
@@ -2473,7 +2458,6 @@ void free_hot_cold_page_list(struct list_head *list, bool cold)
void split_page(struct page *page, unsigned int order)
{
int i;
- gfp_t gfp_mask;
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!page_count(page), page);
@@ -2487,12 +2471,9 @@ void split_page(struct page *page, unsigned int order)
split_page(virt_to_page(page[0].shadow), order);
#endif
- gfp_mask = get_page_owner_gfp(page);
- set_page_owner(page, 0, gfp_mask);
- for (i = 1; i < (1 << order); i++) {
+ for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
- set_page_owner(page + i, 0, gfp_mask);
- }
+ split_page_owner(page, order);
}
EXPORT_SYMBOL_GPL(split_page);
@@ -2521,9 +2502,10 @@ int __isolate_free_page(struct page *page, unsigned int order)
zone->free_area[order].nr_free--;
rmv_page_order(page);
- set_page_owner(page, order, __GFP_MOVABLE);
-
- /* Set the pageblock if the isolated page is at least a pageblock */
+ /*
+ * Set the pageblock if the isolated page is at least half of a
+ * pageblock
+ */
if (order >= pageblock_order - 1) {
struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages) {
@@ -2539,33 +2521,6 @@ int __isolate_free_page(struct page *page, unsigned int order)
}
/*
- * Similar to split_page except the page is already free. As this is only
- * being used for migration, the migratetype of the block also changes.
- * As this is called with interrupts disabled, the caller is responsible
- * for calling arch_alloc_page() and kernel_map_page() after interrupts
- * are enabled.
- *
- * Note: this is probably too low level an operation for use in drivers.
- * Please consult with lkml before using this in your driver.
- */
-int split_free_page(struct page *page)
-{
- unsigned int order;
- int nr_pages;
-
- order = page_order(page);
-
- nr_pages = __isolate_free_page(page, order);
- if (!nr_pages)
- return 0;
-
- /* Split into individual pages */
- set_page_refcounted(page);
- split_page(page, order);
- return nr_pages;
-}
-
-/*
* Update NUMA hit/miss statistics
*
* Must be called with interrupts disabled.
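The hunk above drops split_free_page() in favour of having callers pair __isolate_free_page() with the post_alloc_hook() factored out earlier in this patch. A minimal sketch of what such a caller could look like, assuming it lives under mm/ with the mm/internal.h helpers (__isolate_free_page(), page_order(), post_alloc_hook()) in scope; isolate_and_split() is a hypothetical name for illustration, not a function added by this patch:

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/spinlock.h>
#include "internal.h"

/*
 * Hypothetical replacement for a split_free_page() user: isolate the
 * free page while zone->lock is held with interrupts disabled, then do
 * the allocation-side setup once interrupts are enabled again, as the
 * removed comment above requires.
 */
static int isolate_and_split(struct zone *zone, struct page *page)
{
	unsigned long flags;
	unsigned int order;
	int nr_pages;

	/* __isolate_free_page() expects zone->lock with IRQs off. */
	spin_lock_irqsave(&zone->lock, flags);
	order = page_order(page);
	nr_pages = __isolate_free_page(page, order);
	spin_unlock_irqrestore(&zone->lock, flags);

	if (!nr_pages)
		return 0;

	/*
	 * Formerly done by split_free_page(): reference count, poisoning,
	 * page owner, then the actual split.
	 */
	post_alloc_hook(page, order, __GFP_MOVABLE);
	if (order)
		split_page(page, order);

	return nr_pages;
}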
@@ -2630,7 +2585,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
else
page = list_first_entry(list, struct page, lru);
- __dec_zone_state(zone, NR_ALLOC_BATCH);
list_del(&page->lru);
pcp->count--;
@@ -2656,16 +2610,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
spin_unlock(&zone->lock);
if (!page)
goto failed;
- __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
__mod_zone_freepage_state(zone, -(1 << order),
get_pcppage_migratetype(page));
}
- if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
- !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
- set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
-
- __count_zone_vm_events(PGALLOC, zone, 1 << order);
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
local_irq_restore(flags);
@@ -2875,40 +2824,18 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
}
#ifdef CONFIG_NUMA
-static bool zone_local(struct zone *local_zone, struct zone *zone)
-{
- return local_zone->node == zone->node;
-}
-
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
RECLAIM_DISTANCE;
}
#else /* CONFIG_NUMA */
-static bool zone_local(struct zone *local_zone, struct zone *zone)
-{
- return true;
-}
-
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return true;
}
#endif /* CONFIG_NUMA */
-static void reset_alloc_batches(struct zone *preferred_zone)
-{
- struct zone *zone = preferred_zone->zone_pgdat->node_zones;
-
- do {
- mod_zone_page_state(zone, NR_ALLOC_BATCH,
- high_wmark_pages(zone) - low_wmark_pages(zone) -
- atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
- clear_bit(ZONE_FAIR_DEPLETED, &zone->flags);
- } while (zone++ != preferred_zone);
-}
-
/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
@@ -2919,10 +2846,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
{
struct zoneref *z = ac->preferred_zoneref;
struct zone *zone;
- bool fair_skipped = false;
- bool apply_fair = (alloc_flags & ALLOC_FAIR);
+ struct pglist_data *last_pgdat_dirty_limit = NULL;
-zonelist_scan:
/*
* Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed() comment in kernel/cpuset.c.
@@ -2937,50 +2862,33 @@ zonelist_scan:
!__cpuset_zone_allowed(zone, gfp_mask))
continue;
/*
- * Distribute pages in proportion to the individual
- * zone size to ensure fair page aging. The zone a
- * page was allocated in should have no effect on the
- * time the page has in memory before being reclaimed.
- */
- if (apply_fair) {
- if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
- fair_skipped = true;
- continue;
- }
- if (!zone_local(ac->preferred_zoneref->zone, zone)) {
- if (fair_skipped)
- goto reset_fair;
- apply_fair = false;
- }
- }
- /*
* When allocating a page cache page for writing, we
- * want to get it from a zone that is within its dirty
- * limit, such that no single zone holds more than its
+ * want to get it from a node that is within its dirty
+ * limit, such that no single node holds more than its
* proportional share of globally allowed dirty pages.
- * The dirty limits take into account the zone's
+ * The dirty limits take into account the node's
* lowmem reserves and high watermark so that kswapd
* should be able to balance it without having to
* write pages from its LRU list.
*
- * This may look like it could increase pressure on
- * lower zones by failing allocations in higher zones
- * before they are full. But the pages that do spill
- * over are limited as the lower zones are protected
- * by this very same mechanism. It should not become
- * a practical burden to them.
- *
* XXX: For now, allow allocations to potentially
- * exceed the per-zone dirty limit in the slowpath
+ * exceed the per-node dirty limit in the slowpath
* (spread_dirty_pages unset) before going into reclaim,
* which is important when on a NUMA setup the allowed
- * zones are together not big enough to reach the
+ * nodes are together not big enough to reach the
* global limit. The proper fix for these situations
- * will require awareness of zones in the
+ * will require awareness of nodes in the
* dirty-throttling and the flusher threads.
*/
- if (ac->spread_dirty_pages && !zone_dirty_ok(zone))
- continue;
+ if (ac->spread_dirty_pages) {
+ if (last_pgdat_dirty_limit == zone->zone_pgdat)
+ continue;
+
+ if (!node_dirty_ok(zone->zone_pgdat)) {
+ last_pgdat_dirty_limit = zone->zone_pgdat;
+ continue;
+ }
+ }
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
if (!zone_watermark_fast(zone, order, mark,
@@ -2992,16 +2900,16 @@ zonelist_scan:
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;
- if (zone_reclaim_mode == 0 ||
+ if (node_reclaim_mode == 0 ||
!zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue;
- ret = zone_reclaim(zone, gfp_mask, order);
+ ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
switch (ret) {
- case ZONE_RECLAIM_NOSCAN:
+ case NODE_RECLAIM_NOSCAN:
/* did not scan */
continue;
- case ZONE_RECLAIM_FULL:
+ case NODE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
@@ -3031,23 +2939,6 @@ try_this_zone:
}
}
- /*
- * The first pass makes sure allocations are spread fairly within the
- * local node. However, the local node might have free pages left
- * after the fairness batches are exhausted, and remote zones haven't
- * even been considered yet. Try once more without fairness, and
- * include remote zones now, before entering the slowpath and waking
- * kswapd: prefer spilling to a remote zone over swapping locally.
- */
- if (fair_skipped) {
-reset_fair:
- apply_fair = false;
- fair_skipped = false;
- reset_alloc_batches(ac->preferred_zoneref->zone);
- z = ac->preferred_zoneref;
- goto zonelist_scan;
- }
-
return NULL;
}
@@ -3117,6 +3008,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
struct oom_control oc = {
.zonelist = ac->zonelist,
.nodemask = ac->nodemask,
+ .memcg = NULL,
.gfp_mask = gfp_mask,
.order = order,
};
@@ -3191,7 +3083,6 @@ out:
return page;
}
-
/*
* Maximum number of compaction retries with progress before the OOM
* killer is considered the only way to move forward.
@@ -3203,17 +3094,16 @@ out:
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
- enum migrate_mode mode, enum compact_result *compact_result)
+ enum compact_priority prio, enum compact_result *compact_result)
{
struct page *page;
- int contended_compaction;
if (!order)
return NULL;
current->flags |= PF_MEMALLOC;
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
- mode, &contended_compaction);
+ prio);
current->flags &= ~PF_MEMALLOC;
if (*compact_result <= COMPACT_INACTIVE)
@@ -3225,8 +3115,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
*/
count_vm_event(COMPACTSTALL);
- page = get_page_from_freelist(gfp_mask, order,
- alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page) {
struct zone *zone = page_zone(page);
@@ -3243,24 +3132,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
*/
count_vm_event(COMPACTFAIL);
- /*
- * In all zones where compaction was attempted (and not
- * deferred or skipped), lock contention has been detected.
- * For THP allocation we do not want to disrupt the others
- * so we fallback to base pages instead.
- */
- if (contended_compaction == COMPACT_CONTENDED_LOCK)
- *compact_result = COMPACT_CONTENDED;
-
- /*
- * If compaction was aborted due to need_resched(), we do not
- * want to further increase allocation latency, unless it is
- * khugepaged trying to collapse.
- */
- if (contended_compaction == COMPACT_CONTENDED_SCHED
- && !(current->flags & PF_KTHREAD))
- *compact_result = COMPACT_CONTENDED;
-
cond_resched();
return NULL;
@@ -3270,7 +3141,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
- enum migrate_mode mode, enum compact_result *compact_result)
+ enum compact_priority prio, enum compact_result *compact_result)
{
*compact_result = COMPACT_SKIPPED;
return NULL;
@@ -3281,7 +3152,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
static inline bool
should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
enum compact_result compact_result,
- enum migrate_mode *migrate_mode,
+ enum compact_priority *compact_priority,
int compaction_retries)
{
struct zone *zone;
@@ -3348,8 +3219,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
return NULL;
retry:
- page = get_page_from_freelist(gfp_mask, order,
- alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
/*
* If an allocation failed after direct reclaim, it could be because
@@ -3370,10 +3240,14 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
{
struct zoneref *z;
struct zone *zone;
+ pg_data_t *last_pgdat = NULL;
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
- ac->high_zoneidx, ac->nodemask)
- wakeup_kswapd(zone, order, ac_classzone_idx(ac));
+ ac->high_zoneidx, ac->nodemask) {
+ if (last_pgdat != zone->zone_pgdat)
+ wakeup_kswapd(zone, order, ac->high_zoneidx);
+ last_pgdat = zone->zone_pgdat;
+ }
}
static inline unsigned int
@@ -3407,16 +3281,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
- if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
- if (gfp_mask & __GFP_MEMALLOC)
- alloc_flags |= ALLOC_NO_WATERMARKS;
- else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
- alloc_flags |= ALLOC_NO_WATERMARKS;
- else if (!in_interrupt() &&
- ((current->flags & PF_MEMALLOC) ||
- unlikely(test_thread_flag(TIF_MEMDIE))))
- alloc_flags |= ALLOC_NO_WATERMARKS;
- }
#ifdef CONFIG_CMA
if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
@@ -3426,12 +3290,19 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
{
- return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
-}
+ if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+ return false;
-static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
-{
- return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE;
+ if (gfp_mask & __GFP_MEMALLOC)
+ return true;
+ if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
+ return true;
+ if (!in_interrupt() &&
+ ((current->flags & PF_MEMALLOC) ||
+ unlikely(test_thread_flag(TIF_MEMDIE))))
+ return true;
+
+ return false;
}
/*
@@ -3467,10 +3338,10 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
return false;
/*
- * Keep reclaiming pages while there is a chance this will lead somewhere.
- * If none of the target zones can satisfy our allocation request even
- * if all reclaimable pages are considered then we are screwed and have
- * to go OOM.
+ * Keep reclaiming pages while there is a chance this will lead
+ * somewhere. If none of the target zones can satisfy our allocation
+ * request even if all reclaimable pages are considered then we are
+ * screwed and have to go OOM.
*/
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
@@ -3495,14 +3366,12 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
* prevent premature OOM
*/
if (!did_some_progress) {
- unsigned long writeback;
- unsigned long dirty;
+ unsigned long write_pending;
- writeback = zone_page_state_snapshot(zone,
- NR_WRITEBACK);
- dirty = zone_page_state_snapshot(zone, NR_FILE_DIRTY);
+ write_pending = zone_page_state_snapshot(zone,
+ NR_ZONE_WRITE_PENDING);
- if (2*(writeback + dirty) > reclaimable) {
+ if (2 * write_pending > reclaimable) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
return true;
}
@@ -3537,7 +3406,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct page *page = NULL;
unsigned int alloc_flags;
unsigned long did_some_progress;
- enum migrate_mode migration_mode = MIGRATE_ASYNC;
+ enum compact_priority compact_priority = DEF_COMPACT_PRIORITY;
enum compact_result compact_result;
int compaction_retries = 0;
int no_progress_loops = 0;
@@ -3561,42 +3430,88 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
-retry:
+ /*
+ * The fast path uses conservative alloc_flags to succeed only until
+ * kswapd needs to be woken up, and to avoid the cost of setting up
+ * alloc_flags precisely. So we do that now.
+ */
+ alloc_flags = gfp_to_alloc_flags(gfp_mask);
+
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
/*
- * OK, we're below the kswapd watermark and have kicked background
- * reclaim. Now things get more complex, so set up alloc_flags according
- * to how we want to proceed.
+ * The adjusted alloc_flags might result in immediate success, so try
+ * that first
*/
- alloc_flags = gfp_to_alloc_flags(gfp_mask);
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+ if (page)
+ goto got_pg;
+
+ /*
+ * For costly allocations, try direct compaction first, as it's likely
+ * that we have enough base pages and don't need to reclaim. Don't try
+ * that for allocations that are allowed to ignore watermarks, as the
+ * ALLOC_NO_WATERMARKS attempt didn't yet happen.
+ */
+ if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER &&
+ !gfp_pfmemalloc_allowed(gfp_mask)) {
+ page = __alloc_pages_direct_compact(gfp_mask, order,
+ alloc_flags, ac,
+ INIT_COMPACT_PRIORITY,
+ &compact_result);
+ if (page)
+ goto got_pg;
+
+ /*
+ * Checks for costly allocations with __GFP_NORETRY, which
+ * includes THP page fault allocations
+ */
+ if (gfp_mask & __GFP_NORETRY) {
+ /*
+ * If compaction is deferred for high-order allocations,
+ * it is because sync compaction recently failed. If
+ * this is the case and the caller requested a THP
+ * allocation, we do not want to heavily disrupt the
+ * system, so we fail the allocation instead of entering
+ * direct reclaim.
+ */
+ if (compact_result == COMPACT_DEFERRED)
+ goto nopage;
+
+ /*
+ * Looks like reclaim/compaction is worth trying, but
+ * sync compaction could be very expensive, so keep
+ * using async compaction.
+ */
+ compact_priority = INIT_COMPACT_PRIORITY;
+ }
+ }
+
+retry:
+ /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
+ if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+ wake_all_kswapds(order, ac);
+
+ if (gfp_pfmemalloc_allowed(gfp_mask))
+ alloc_flags = ALLOC_NO_WATERMARKS;
/*
* Reset the zonelist iterators if memory policies can be ignored.
* These allocations are high priority and system rather than user
* orientated.
*/
- if ((alloc_flags & ALLOC_NO_WATERMARKS) || !(alloc_flags & ALLOC_CPUSET)) {
+ if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
}
- /* This is the last chance, in general, before the goto nopage. */
- page = get_page_from_freelist(gfp_mask, order,
- alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
+ /* Attempt with potentially adjusted zonelist and alloc_flags */
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;
- /* Allocate without watermarks if the context allows */
- if (alloc_flags & ALLOC_NO_WATERMARKS) {
- page = get_page_from_freelist(gfp_mask, order,
- ALLOC_NO_WATERMARKS, ac);
- if (page)
- goto got_pg;
- }
-
/* Caller is not willing to reclaim, we can't balance anything */
if (!can_direct_reclaim) {
/*
@@ -3626,38 +3541,6 @@ retry:
if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
goto nopage;
- /*
- * Try direct compaction. The first pass is asynchronous. Subsequent
- * attempts after direct reclaim are synchronous
- */
- page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
- migration_mode,
- &compact_result);
- if (page)
- goto got_pg;
-
- /* Checks for THP-specific high-order allocations */
- if (is_thp_gfp_mask(gfp_mask)) {
- /*
- * If compaction is deferred for high-order allocations, it is
- * because sync compaction recently failed. If this is the case
- * and the caller requested a THP allocation, we do not want
- * to heavily disrupt the system, so we fail the allocation
- * instead of entering direct reclaim.
- */
- if (compact_result == COMPACT_DEFERRED)
- goto nopage;
-
- /*
- * Compaction is contended so rather back off than cause
- * excessive stalls.
- */
- if(compact_result == COMPACT_CONTENDED)
- goto nopage;
- }
-
- if (order && compaction_made_progress(compact_result))
- compaction_retries++;
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
@@ -3665,16 +3548,25 @@ retry:
if (page)
goto got_pg;
+ /* Try direct compaction and then allocating */
+ page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
+ compact_priority, &compact_result);
+ if (page)
+ goto got_pg;
+
+ if (order && compaction_made_progress(compact_result))
+ compaction_retries++;
+
/* Do not loop if specifically requested */
if (gfp_mask & __GFP_NORETRY)
- goto noretry;
+ goto nopage;
/*
* Do not retry costly high order allocations unless they are
* __GFP_REPEAT
*/
if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
- goto noretry;
+ goto nopage;
/*
* Costly allocations might have made progress but this doesn't mean
@@ -3698,7 +3590,7 @@ retry:
*/
if (did_some_progress > 0 &&
should_compact_retry(ac, order, alloc_flags,
- compact_result, &migration_mode,
+ compact_result, &compact_priority,
compaction_retries))
goto retry;
@@ -3713,25 +3605,6 @@ retry:
goto retry;
}
-noretry:
- /*
- * High-order allocations do not necessarily loop after direct reclaim
- * and reclaim/compaction depends on compaction being called after
- * reclaim so call directly if necessary.
- * It can become very expensive to allocate transparent hugepages at
- * fault, so use asynchronous memory compaction for THP unless it is
- * khugepaged trying to collapse. All other requests should tolerate
- * at least light sync migration.
- */
- if (is_thp_gfp_mask(gfp_mask) && !(current->flags & PF_KTHREAD))
- migration_mode = MIGRATE_ASYNC;
- else
- migration_mode = MIGRATE_SYNC_LIGHT;
- page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
- ac, migration_mode,
- &compact_result);
- if (page)
- goto got_pg;
nopage:
warn_alloc_failed(gfp_mask, order, NULL);
got_pg:
@@ -3747,7 +3620,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
{
struct page *page;
unsigned int cpuset_mems_cookie;
- unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR;
+ unsigned int alloc_flags = ALLOC_WMARK_LOW;
gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = {
.high_zoneidx = gfp_zone(gfp_mask),
@@ -3834,6 +3707,12 @@ no_zone:
}
out:
+ if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
+ unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
+ __free_pages(page, order);
+ page = NULL;
+ }
+
if (kmemcheck_enabled && page)
kmemcheck_pagealloc_alloc(page, order, gfp_mask);
@@ -3989,56 +3868,6 @@ void __free_page_frag(void *addr)
}
EXPORT_SYMBOL(__free_page_frag);
-/*
- * alloc_kmem_pages charges newly allocated pages to the kmem resource counter
- * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is
- * equivalent to alloc_pages.
- *
- * It should be used when the caller would like to use kmalloc, but since the
- * allocation is large, it has to fall back to the page allocator.
- */
-struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
-{
- struct page *page;
-
- page = alloc_pages(gfp_mask, order);
- if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
- __free_pages(page, order);
- page = NULL;
- }
- return page;
-}
-
-struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
-{
- struct page *page;
-
- page = alloc_pages_node(nid, gfp_mask, order);
- if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
- __free_pages(page, order);
- page = NULL;
- }
- return page;
-}
-
-/*
- * __free_kmem_pages and free_kmem_pages will free pages allocated with
- * alloc_kmem_pages.
- */
-void __free_kmem_pages(struct page *page, unsigned int order)
-{
- memcg_kmem_uncharge(page, order);
- __free_pages(page, order);
-}
-
-void free_kmem_pages(unsigned long addr, unsigned int order)
-{
- if (addr != 0) {
- VM_BUG_ON(!virt_addr_valid((void *)addr));
- __free_kmem_pages(virt_to_page((void *)addr), order);
- }
-}
-
static void *make_alloc_exact(unsigned long addr, unsigned int order,
size_t size)
{
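With alloc_kmem_pages()/free_kmem_pages() removed above, accounted high-order allocations go through the normal page allocator: __alloc_pages_nodemask() charges the page when __GFP_ACCOUNT is set (see the out: hunk earlier) and free_pages_prepare() uncharges it via the PageKmemcg check. A minimal sketch of the replacement call pattern; alloc_accounted_buffer()/free_accounted_buffer() are hypothetical names used only for illustration:

#include <linux/gfp.h>
#include <linux/mm.h>

/* Hypothetical caller that used to rely on alloc_kmem_pages(). */
static void *alloc_accounted_buffer(unsigned int order)
{
	struct page *page;

	/* __GFP_ACCOUNT makes the allocator charge the current memcg. */
	page = alloc_pages(GFP_KERNEL | __GFP_ACCOUNT, order);
	if (!page)
		return NULL;

	return page_address(page);
}

static void free_accounted_buffer(void *addr, unsigned int order)
{
	/* free_pages_prepare() uncharges PageKmemcg pages on free. */
	if (addr)
		__free_pages(virt_to_page(addr), order);
}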
@@ -4184,7 +4013,7 @@ long si_mem_available(void)
int lru;
for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
- pages[lru] = global_page_state(NR_LRU_BASE + lru);
+ pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
for_each_zone(zone)
wmark_low += zone->watermark[WMARK_LOW];
@@ -4220,7 +4049,7 @@ EXPORT_SYMBOL_GPL(si_mem_available);
void si_meminfo(struct sysinfo *val)
{
val->totalram = totalram_pages;
- val->sharedram = global_page_state(NR_SHMEM);
+ val->sharedram = global_node_page_state(NR_SHMEM);
val->freeram = global_page_state(NR_FREE_PAGES);
val->bufferram = nr_blockdev_pages();
val->totalhigh = totalhigh_pages;
@@ -4242,8 +4071,8 @@ void si_meminfo_node(struct sysinfo *val, int nid)
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
managed_pages += pgdat->node_zones[zone_type].managed_pages;
val->totalram = managed_pages;
- val->sharedram = node_page_state(nid, NR_SHMEM);
- val->freeram = node_page_state(nid, NR_FREE_PAGES);
+ val->sharedram = node_page_state(pgdat, NR_SHMEM);
+ val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
#ifdef CONFIG_HIGHMEM
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
@@ -4326,6 +4155,7 @@ void show_free_areas(unsigned int filter)
unsigned long free_pcp = 0;
int cpu;
struct zone *zone;
+ pg_data_t *pgdat;
for_each_populated_zone(zone) {
if (skip_free_areas_node(filter, zone_to_nid(zone)))
@@ -4341,26 +4171,73 @@ void show_free_areas(unsigned int filter)
" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
" free:%lu free_pcp:%lu free_cma:%lu\n",
- global_page_state(NR_ACTIVE_ANON),
- global_page_state(NR_INACTIVE_ANON),
- global_page_state(NR_ISOLATED_ANON),
- global_page_state(NR_ACTIVE_FILE),
- global_page_state(NR_INACTIVE_FILE),
- global_page_state(NR_ISOLATED_FILE),
- global_page_state(NR_UNEVICTABLE),
- global_page_state(NR_FILE_DIRTY),
- global_page_state(NR_WRITEBACK),
- global_page_state(NR_UNSTABLE_NFS),
+ global_node_page_state(NR_ACTIVE_ANON),
+ global_node_page_state(NR_INACTIVE_ANON),
+ global_node_page_state(NR_ISOLATED_ANON),
+ global_node_page_state(NR_ACTIVE_FILE),
+ global_node_page_state(NR_INACTIVE_FILE),
+ global_node_page_state(NR_ISOLATED_FILE),
+ global_node_page_state(NR_UNEVICTABLE),
+ global_node_page_state(NR_FILE_DIRTY),
+ global_node_page_state(NR_WRITEBACK),
+ global_node_page_state(NR_UNSTABLE_NFS),
global_page_state(NR_SLAB_RECLAIMABLE),
global_page_state(NR_SLAB_UNRECLAIMABLE),
- global_page_state(NR_FILE_MAPPED),
- global_page_state(NR_SHMEM),
+ global_node_page_state(NR_FILE_MAPPED),
+ global_node_page_state(NR_SHMEM),
global_page_state(NR_PAGETABLE),
global_page_state(NR_BOUNCE),
global_page_state(NR_FREE_PAGES),
free_pcp,
global_page_state(NR_FREE_CMA_PAGES));
+ for_each_online_pgdat(pgdat) {
+ printk("Node %d"
+ " active_anon:%lukB"
+ " inactive_anon:%lukB"
+ " active_file:%lukB"
+ " inactive_file:%lukB"
+ " unevictable:%lukB"
+ " isolated(anon):%lukB"
+ " isolated(file):%lukB"
+ " mapped:%lukB"
+ " dirty:%lukB"
+ " writeback:%lukB"
+ " shmem:%lukB"
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ " shmem_thp: %lukB"
+ " shmem_pmdmapped: %lukB"
+ " anon_thp: %lukB"
+#endif
+ " writeback_tmp:%lukB"
+ " unstable:%lukB"
+ " pages_scanned:%lu"
+ " all_unreclaimable? %s"
+ "\n",
+ pgdat->node_id,
+ K(node_page_state(pgdat, NR_ACTIVE_ANON)),
+ K(node_page_state(pgdat, NR_INACTIVE_ANON)),
+ K(node_page_state(pgdat, NR_ACTIVE_FILE)),
+ K(node_page_state(pgdat, NR_INACTIVE_FILE)),
+ K(node_page_state(pgdat, NR_UNEVICTABLE)),
+ K(node_page_state(pgdat, NR_ISOLATED_ANON)),
+ K(node_page_state(pgdat, NR_ISOLATED_FILE)),
+ K(node_page_state(pgdat, NR_FILE_MAPPED)),
+ K(node_page_state(pgdat, NR_FILE_DIRTY)),
+ K(node_page_state(pgdat, NR_WRITEBACK)),
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
+ K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
+ * HPAGE_PMD_NR),
+ K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
+#endif
+ K(node_page_state(pgdat, NR_SHMEM)),
+ K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
+ K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
+ node_page_state(pgdat, NR_PAGES_SCANNED),
+ !pgdat_reclaimable(pgdat) ? "yes" : "no");
+ }
+
for_each_populated_zone(zone) {
int i;
@@ -4382,61 +4259,41 @@ void show_free_areas(unsigned int filter)
" active_file:%lukB"
" inactive_file:%lukB"
" unevictable:%lukB"
- " isolated(anon):%lukB"
- " isolated(file):%lukB"
+ " writepending:%lukB"
" present:%lukB"
" managed:%lukB"
" mlocked:%lukB"
- " dirty:%lukB"
- " writeback:%lukB"
- " mapped:%lukB"
- " shmem:%lukB"
" slab_reclaimable:%lukB"
" slab_unreclaimable:%lukB"
" kernel_stack:%lukB"
" pagetables:%lukB"
- " unstable:%lukB"
" bounce:%lukB"
" free_pcp:%lukB"
" local_pcp:%ukB"
" free_cma:%lukB"
- " writeback_tmp:%lukB"
- " pages_scanned:%lu"
- " all_unreclaimable? %s"
"\n",
zone->name,
K(zone_page_state(zone, NR_FREE_PAGES)),
K(min_wmark_pages(zone)),
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
- K(zone_page_state(zone, NR_ACTIVE_ANON)),
- K(zone_page_state(zone, NR_INACTIVE_ANON)),
- K(zone_page_state(zone, NR_ACTIVE_FILE)),
- K(zone_page_state(zone, NR_INACTIVE_FILE)),
- K(zone_page_state(zone, NR_UNEVICTABLE)),
- K(zone_page_state(zone, NR_ISOLATED_ANON)),
- K(zone_page_state(zone, NR_ISOLATED_FILE)),
+ K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
+ K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
+ K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
+ K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
+ K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
+ K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
K(zone->present_pages),
K(zone->managed_pages),
K(zone_page_state(zone, NR_MLOCK)),
- K(zone_page_state(zone, NR_FILE_DIRTY)),
- K(zone_page_state(zone, NR_WRITEBACK)),
- K(zone_page_state(zone, NR_FILE_MAPPED)),
- K(zone_page_state(zone, NR_SHMEM)),
K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
- zone_page_state(zone, NR_KERNEL_STACK) *
- THREAD_SIZE / 1024,
+ zone_page_state(zone, NR_KERNEL_STACK_KB),
K(zone_page_state(zone, NR_PAGETABLE)),
- K(zone_page_state(zone, NR_UNSTABLE_NFS)),
K(zone_page_state(zone, NR_BOUNCE)),
K(free_pcp),
K(this_cpu_read(zone->pageset->pcp.count)),
- K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
- K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
- K(zone_page_state(zone, NR_PAGES_SCANNED)),
- (!zone_reclaimable(zone) ? "yes" : "no")
- );
+ K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
printk(" %ld", zone->lowmem_reserve[i]);
@@ -4478,7 +4335,7 @@ void show_free_areas(unsigned int filter)
hugetlb_show_meminfo();
- printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
+ printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
show_swap_cache_info();
}
@@ -4503,7 +4360,7 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
do {
zone_type--;
zone = pgdat->node_zones + zone_type;
- if (populated_zone(zone)) {
+ if (managed_zone(zone)) {
zoneref_set_zone(zone,
&zonelist->_zonerefs[nr_zones++]);
check_highest_zone(zone_type);
@@ -4741,7 +4598,7 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
for (j = 0; j < nr_nodes; j++) {
node = node_order[j];
z = &NODE_DATA(node)->node_zones[zone_type];
- if (populated_zone(z)) {
+ if (managed_zone(z)) {
zoneref_set_zone(z,
&zonelist->_zonerefs[pos++]);
check_highest_zone(zone_type);
@@ -4853,6 +4710,8 @@ int local_memory_node(int node)
}
#endif
+static void setup_min_unmapped_ratio(void);
+static void setup_min_slab_ratio(void);
#else /* CONFIG_NUMA */
static void set_zonelist_order(void)
@@ -5357,13 +5216,18 @@ static void __meminit setup_zone_pageset(struct zone *zone)
*/
void __init setup_per_cpu_pageset(void)
{
+ struct pglist_data *pgdat;
struct zone *zone;
for_each_populated_zone(zone)
setup_zone_pageset(zone);
+
+ for_each_online_pgdat(pgdat)
+ pgdat->per_cpu_nodestats =
+ alloc_percpu(struct per_cpu_nodestat);
}
-static noinline __init_refok
+static noinline __ref
int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
{
int i;
@@ -5918,6 +5782,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
init_waitqueue_head(&pgdat->kcompactd_wait);
#endif
pgdat_page_ext_init(pgdat);
+ spin_lock_init(&pgdat->lru_lock);
+ lruvec_init(node_lruvec(pgdat));
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
@@ -5967,21 +5833,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
#ifdef CONFIG_NUMA
zone->node = nid;
- zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
- / 100;
- zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
#endif
zone->name = zone_names[j];
+ zone->zone_pgdat = pgdat;
spin_lock_init(&zone->lock);
- spin_lock_init(&zone->lru_lock);
zone_seqlock_init(zone);
- zone->zone_pgdat = pgdat;
zone_pcp_init(zone);
- /* For bootup, initialized properly in watermark setup */
- mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
-
- lruvec_init(&zone->lruvec);
if (!size)
continue;
@@ -5993,7 +5851,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
}
}
-static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
+static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
{
unsigned long __maybe_unused start = 0;
unsigned long __maybe_unused offset = 0;
@@ -6047,11 +5905,12 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
unsigned long end_pfn = 0;
/* pg_data_t should be reset to zero when it's allocated */
- WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
+ WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
reset_deferred_meminit(pgdat);
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
+ pgdat->per_cpu_nodestats = NULL;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
@@ -6433,15 +6292,18 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
sizeof(arch_zone_lowest_possible_pfn));
memset(arch_zone_highest_possible_pfn, 0,
sizeof(arch_zone_highest_possible_pfn));
- arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
- arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
- for (i = 1; i < MAX_NR_ZONES; i++) {
+
+ start_pfn = find_min_pfn_with_active_regions();
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
if (i == ZONE_MOVABLE)
continue;
- arch_zone_lowest_possible_pfn[i] =
- arch_zone_highest_possible_pfn[i-1];
- arch_zone_highest_possible_pfn[i] =
- max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
+
+ end_pfn = max(max_zone_pfn[i], start_pfn);
+ arch_zone_lowest_possible_pfn[i] = start_pfn;
+ arch_zone_highest_possible_pfn[i] = end_pfn;
+
+ start_pfn = end_pfn;
}
arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
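The rewritten loop above derives each zone's [lowest, highest] PFN range by carrying end_pfn forward and clamping with max(), so an empty zone collapses to a zero-sized range instead of running backwards. A standalone illustration with made-up max_zone_pfn values (not taken from this patch):

#include <stdio.h>

#define MAX_NR_ZONES 3

int main(void)
{
	/* e.g. DMA up to pfn 4096, an empty zone, then NORMAL up to 262144 */
	unsigned long max_zone_pfn[MAX_NR_ZONES] = { 4096, 4096, 262144 };
	unsigned long lowest[MAX_NR_ZONES], highest[MAX_NR_ZONES];
	unsigned long start_pfn = 1;	/* stands in for find_min_pfn_with_active_regions() */
	unsigned long end_pfn;
	int i;

	for (i = 0; i < MAX_NR_ZONES; i++) {
		/* end_pfn = max(max_zone_pfn[i], start_pfn), as in the patch */
		end_pfn = max_zone_pfn[i] > start_pfn ? max_zone_pfn[i] : start_pfn;
		lowest[i] = start_pfn;
		highest[i] = end_pfn;
		start_pfn = end_pfn;
	}

	for (i = 0; i < MAX_NR_ZONES; i++)
		printf("zone %d: pfns [%lu, %lu)\n", i, lowest[i], highest[i]);

	return 0;
}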
@@ -6705,6 +6567,9 @@ static void calculate_totalreserve_pages(void)
enum zone_type i, j;
for_each_online_pgdat(pgdat) {
+
+ pgdat->totalreserve_pages = 0;
+
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
long max = 0;
@@ -6721,7 +6586,7 @@ static void calculate_totalreserve_pages(void)
if (max > zone->managed_pages)
max = zone->managed_pages;
- zone->totalreserve_pages = max;
+ pgdat->totalreserve_pages += max;
reserve_pages += max;
}
@@ -6822,10 +6687,6 @@ static void __setup_per_zone_wmarks(void)
zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
- __mod_zone_page_state(zone, NR_ALLOC_BATCH,
- high_wmark_pages(zone) - low_wmark_pages(zone) -
- atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
-
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -6892,6 +6753,12 @@ int __meminit init_per_zone_wmark_min(void)
setup_per_zone_wmarks();
refresh_zone_stat_thresholds();
setup_per_zone_lowmem_reserve();
+
+#ifdef CONFIG_NUMA
+ setup_min_unmapped_ratio();
+ setup_min_slab_ratio();
+#endif
+
return 0;
}
core_initcall(init_per_zone_wmark_min)
@@ -6933,35 +6800,58 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
}
#ifdef CONFIG_NUMA
+static void setup_min_unmapped_ratio(void)
+{
+ pg_data_t *pgdat;
+ struct zone *zone;
+
+ for_each_online_pgdat(pgdat)
+ pgdat->min_unmapped_pages = 0;
+
+ for_each_zone(zone)
+ zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
+ sysctl_min_unmapped_ratio) / 100;
+}
+
+
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
- struct zone *zone;
int rc;
rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (rc)
return rc;
- for_each_zone(zone)
- zone->min_unmapped_pages = (zone->managed_pages *
- sysctl_min_unmapped_ratio) / 100;
+ setup_min_unmapped_ratio();
+
return 0;
}
+static void setup_min_slab_ratio(void)
+{
+ pg_data_t *pgdat;
+ struct zone *zone;
+
+ for_each_online_pgdat(pgdat)
+ pgdat->min_slab_pages = 0;
+
+ for_each_zone(zone)
+ zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
+ sysctl_min_slab_ratio) / 100;
+}
+
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
- struct zone *zone;
int rc;
rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (rc)
return rc;
- for_each_zone(zone)
- zone->min_slab_pages = (zone->managed_pages *
- sysctl_min_slab_ratio) / 100;
+ setup_min_slab_ratio();
+
return 0;
}
#endif
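setup_min_unmapped_ratio() and setup_min_slab_ratio() above replace per-zone thresholds with per-node ones by summing each zone's contribution into its pgdat. A standalone arithmetic illustration of the min_unmapped accumulation, using made-up zone sizes and a 1% ratio:

#include <stdio.h>

int main(void)
{
	/* Two zones on one node, e.g. DMA and NORMAL, sizes in pages. */
	unsigned long managed_pages[] = { 4096, 1043456 };
	unsigned long sysctl_min_unmapped_ratio = 1;	/* percent */
	unsigned long min_unmapped_pages = 0;		/* accumulated per pgdat */
	unsigned int i;

	for (i = 0; i < 2; i++)
		min_unmapped_pages +=
			managed_pages[i] * sysctl_min_unmapped_ratio / 100;

	/* 40 + 10434 = 10474 pages for this node */
	printf("node min_unmapped_pages = %lu\n", min_unmapped_pages);
	return 0;
}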