From d0b2f91bede3bd5e3d24dd6803e56eee959c1797 Mon Sep 17 00:00:00 2001 From: André Fabian Silva Delgado Date: Thu, 20 Oct 2016 00:10:27 -0300 Subject: Linux-libre 4.8.2-gnu --- mm/vmscan.c | 1058 +++++++++++++++++++++++++++++++---------------------------- 1 file changed, 558 insertions(+), 500 deletions(-) (limited to 'mm/vmscan.c') diff --git a/mm/vmscan.c b/mm/vmscan.c index d12a0a63d..0fe8b7113 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -84,6 +84,9 @@ struct scan_control { /* Scan (total_size >> priority) pages at once */ int priority; + /* The highest zone to isolate pages for reclaim from */ + enum zone_type reclaim_idx; + unsigned int may_writepage:1; /* Can mapped pages be reclaimed? */ @@ -191,26 +194,44 @@ static bool sane_reclaim(struct scan_control *sc) } #endif +/* + * This misses isolated pages which are not accounted for to save counters. + * As the data only determines if reclaim or compaction continues, it is + * not expected that isolated pages will be a dominating factor. + */ unsigned long zone_reclaimable_pages(struct zone *zone) { unsigned long nr; - nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) + - zone_page_state_snapshot(zone, NR_INACTIVE_FILE) + - zone_page_state_snapshot(zone, NR_ISOLATED_FILE); + nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) + + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE); + if (get_nr_swap_pages() > 0) + nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) + + zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON); + + return nr; +} + +unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat) +{ + unsigned long nr; + + nr = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) + + node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) + + node_page_state_snapshot(pgdat, NR_ISOLATED_FILE); if (get_nr_swap_pages() > 0) - nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) + - zone_page_state_snapshot(zone, NR_INACTIVE_ANON) + - zone_page_state_snapshot(zone, NR_ISOLATED_ANON); + nr += node_page_state_snapshot(pgdat, NR_ACTIVE_ANON) + + node_page_state_snapshot(pgdat, NR_INACTIVE_ANON) + + node_page_state_snapshot(pgdat, NR_ISOLATED_ANON); return nr; } -bool zone_reclaimable(struct zone *zone) +bool pgdat_reclaimable(struct pglist_data *pgdat) { - return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) < - zone_reclaimable_pages(zone) * 6; + return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) < + pgdat_reclaimable_pages(pgdat) * 6; } unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) @@ -218,7 +239,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) if (!mem_cgroup_disabled()) return mem_cgroup_get_lru_size(lruvec, lru); - return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru); + return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); } /* @@ -593,7 +614,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, ClearPageReclaim(page); } trace_mm_vmscan_writepage(page); - inc_zone_page_state(page, NR_VMSCAN_WRITE); + inc_node_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -877,7 +898,7 @@ static void page_check_dirty_writeback(struct page *page, * shrink_page_list() returns the number of reclaimed pages */ static unsigned long shrink_page_list(struct list_head *page_list, - struct zone *zone, + struct pglist_data *pgdat, struct scan_control *sc, enum ttu_flags ttu_flags, unsigned long *ret_nr_dirty, @@ -917,7 +938,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep; VM_BUG_ON_PAGE(PageActive(page), page); - VM_BUG_ON_PAGE(page_zone(page) != zone, page); sc->nr_scanned++; @@ -996,7 +1016,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* Case 1 above */ if (current_is_kswapd() && PageReclaim(page) && - test_bit(ZONE_WRITEBACK, &zone->flags)) { + test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { nr_immediate++; goto keep_locked; @@ -1055,8 +1075,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* Adding to swap updated mapping */ mapping = page_mapping(page); + } else if (unlikely(PageTransHuge(page))) { + /* Split file THP */ + if (split_huge_page_to_list(page, page_list)) + goto keep_locked; } + VM_BUG_ON_PAGE(PageTransHuge(page), page); + /* * The page is mapped into the page tables of one or more * processes. Try to unmap it here. @@ -1086,14 +1112,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ if (page_is_file_cache(page) && (!current_is_kswapd() || - !test_bit(ZONE_DIRTY, &zone->flags))) { + !test_bit(PGDAT_DIRTY, &pgdat->flags))) { /* * Immediately reclaim when written back. * Similar in principal to deactivate_page() * except we already have the page isolated * and know it's dirty */ - inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE); + inc_node_page_state(page, NR_VMSCAN_IMMEDIATE); SetPageReclaim(page); goto keep_locked; @@ -1254,17 +1280,17 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, list_for_each_entry_safe(page, next, page_list, lru) { if (page_is_file_cache(page) && !PageDirty(page) && - !isolated_balloon_page(page)) { + !__PageMovable(page)) { ClearPageActive(page); list_move(&page->lru, &clean_pages); } } - ret = shrink_page_list(&clean_pages, zone, &sc, + ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, TTU_UNMAP|TTU_IGNORE_ACCESS, &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); list_splice(&clean_pages, page_list); - mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); + mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); return ret; } @@ -1342,8 +1368,31 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) return ret; } + /* - * zone->lru_lock is heavily contended. Some of the functions that + * Update LRU sizes after isolating pages. The LRU size updates must + * be complete before mem_cgroup_update_lru_size due to a santity check. + */ +static __always_inline void update_lru_sizes(struct lruvec *lruvec, + enum lru_list lru, unsigned long *nr_zone_taken, + unsigned long nr_taken) +{ + int zid; + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + if (!nr_zone_taken[zid]) + continue; + + __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); + } + +#ifdef CONFIG_MEMCG + mem_cgroup_update_lru_size(lruvec, lru, -nr_taken); +#endif +} + +/* + * zone_lru_lock is heavily contended. Some of the functions that * shrink the lists perform better by taking out a batch of pages * and working on them outside the LRU lock. * @@ -1369,10 +1418,13 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, { struct list_head *src = &lruvec->lists[lru]; unsigned long nr_taken = 0; - unsigned long scan; + unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; + unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; + unsigned long scan, nr_pages; + LIST_HEAD(pages_skipped); for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && - !list_empty(src); scan++) { + !list_empty(src);) { struct page *page; page = lru_to_page(src); @@ -1380,9 +1432,23 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, VM_BUG_ON_PAGE(!PageLRU(page), page); + if (page_zonenum(page) > sc->reclaim_idx) { + list_move(&page->lru, &pages_skipped); + nr_skipped[page_zonenum(page)]++; + continue; + } + + /* + * Account for scanned and skipped separetly to avoid the pgdat + * being prematurely marked unreclaimable by pgdat_reclaimable. + */ + scan++; + switch (__isolate_lru_page(page, mode)) { case 0: - nr_taken += hpage_nr_pages(page); + nr_pages = hpage_nr_pages(page); + nr_taken += nr_pages; + nr_zone_taken[page_zonenum(page)] += nr_pages; list_move(&page->lru, dst); break; @@ -1396,9 +1462,38 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, } } + /* + * Splice any skipped pages to the start of the LRU list. Note that + * this disrupts the LRU order when reclaiming for lower zones but + * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX + * scanning would soon rescan the same pages to skip and put the + * system at risk of premature OOM. + */ + if (!list_empty(&pages_skipped)) { + int zid; + unsigned long total_skipped = 0; + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + if (!nr_skipped[zid]) + continue; + + __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); + total_skipped += nr_skipped[zid]; + } + + /* + * Account skipped pages as a partial scan as the pgdat may be + * close to unreclaimable. If the LRU list is empty, account + * skipped pages as a full scan. + */ + scan += list_empty(src) ? total_skipped : total_skipped >> 2; + + list_splice(&pages_skipped, src); + } *nr_scanned = scan; - trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan, + trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan, nr_taken, mode, is_file_lru(lru)); + update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken); return nr_taken; } @@ -1438,8 +1533,8 @@ int isolate_lru_page(struct page *page) struct zone *zone = page_zone(page); struct lruvec *lruvec; - spin_lock_irq(&zone->lru_lock); - lruvec = mem_cgroup_page_lruvec(page, zone); + spin_lock_irq(zone_lru_lock(zone)); + lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); if (PageLRU(page)) { int lru = page_lru(page); get_page(page); @@ -1447,7 +1542,7 @@ int isolate_lru_page(struct page *page) del_page_from_lru_list(page, lruvec, lru); ret = 0; } - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(zone_lru_lock(zone)); } return ret; } @@ -1459,23 +1554,23 @@ int isolate_lru_page(struct page *page) * the LRU list will go small and be scanned faster than necessary, leading to * unnecessary swapping, thrashing and OOM. */ -static int too_many_isolated(struct zone *zone, int file, +static int too_many_isolated(struct pglist_data *pgdat, int file, struct scan_control *sc) { unsigned long inactive, isolated; - if (current_is_kswapd() || sc->hibernation_mode) + if (current_is_kswapd()) return 0; if (!sane_reclaim(sc)) return 0; if (file) { - inactive = zone_page_state(zone, NR_INACTIVE_FILE); - isolated = zone_page_state(zone, NR_ISOLATED_FILE); + inactive = node_page_state(pgdat, NR_INACTIVE_FILE); + isolated = node_page_state(pgdat, NR_ISOLATED_FILE); } else { - inactive = zone_page_state(zone, NR_INACTIVE_ANON); - isolated = zone_page_state(zone, NR_ISOLATED_ANON); + inactive = node_page_state(pgdat, NR_INACTIVE_ANON); + isolated = node_page_state(pgdat, NR_ISOLATED_ANON); } /* @@ -1493,7 +1588,7 @@ static noinline_for_stack void putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) { struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; - struct zone *zone = lruvec_zone(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); LIST_HEAD(pages_to_free); /* @@ -1506,13 +1601,13 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) VM_BUG_ON_PAGE(PageLRU(page), page); list_del(&page->lru); if (unlikely(!page_evictable(page))) { - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); putback_lru_page(page); - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); continue; } - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, pgdat); SetPageLRU(page); lru = page_lru(page); @@ -1529,10 +1624,10 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) del_page_from_lru_list(page, lruvec, lru); if (unlikely(PageCompound(page))) { - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); } else list_add(&page->lru, &pages_to_free); } @@ -1557,8 +1652,32 @@ static int current_may_throttle(void) bdi_write_congested(current->backing_dev_info); } +static bool inactive_reclaimable_pages(struct lruvec *lruvec, + struct scan_control *sc, enum lru_list lru) +{ + int zid; + struct zone *zone; + int file = is_file_lru(lru); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + if (!global_reclaim(sc)) + return true; + + for (zid = sc->reclaim_idx; zid >= 0; zid--) { + zone = &pgdat->node_zones[zid]; + if (!managed_zone(zone)) + continue; + + if (zone_page_state_snapshot(zone, NR_ZONE_LRU_BASE + + LRU_FILE * file) >= SWAP_CLUSTER_MAX) + return true; + } + + return false; +} + /* - * shrink_inactive_list() is a helper for shrink_zone(). It returns the number + * shrink_inactive_list() is a helper for shrink_node(). It returns the number * of reclaimed pages */ static noinline_for_stack unsigned long @@ -1576,10 +1695,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, unsigned long nr_immediate = 0; isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); - struct zone *zone = lruvec_zone(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; - while (unlikely(too_many_isolated(zone, file, sc))) { + if (!inactive_reclaimable_pages(lruvec, sc, lru)) + return 0; + + while (unlikely(too_many_isolated(pgdat, file, sc))) { congestion_wait(BLK_RW_ASYNC, HZ/10); /* We are about to die and free our memory. Return now. */ @@ -1594,48 +1716,45 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (!sc->may_writepage) isolate_mode |= ISOLATE_CLEAN; - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, &nr_scanned, sc, isolate_mode, lru); - update_lru_size(lruvec, lru, -nr_taken); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; if (global_reclaim(sc)) { - __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); + __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); if (current_is_kswapd()) - __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); + __count_vm_events(PGSCAN_KSWAPD, nr_scanned); else - __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); + __count_vm_events(PGSCAN_DIRECT, nr_scanned); } - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); if (nr_taken == 0) return 0; - nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, + nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP, &nr_dirty, &nr_unqueued_dirty, &nr_congested, &nr_writeback, &nr_immediate, false); - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); if (global_reclaim(sc)) { if (current_is_kswapd()) - __count_zone_vm_events(PGSTEAL_KSWAPD, zone, - nr_reclaimed); + __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed); else - __count_zone_vm_events(PGSTEAL_DIRECT, zone, - nr_reclaimed); + __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed); } putback_inactive_pages(lruvec, &page_list); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge_list(&page_list); free_hot_cold_page_list(&page_list, true); @@ -1655,7 +1774,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * are encountered in the nr_immediate check below. */ if (nr_writeback && nr_writeback == nr_taken) - set_bit(ZONE_WRITEBACK, &zone->flags); + set_bit(PGDAT_WRITEBACK, &pgdat->flags); /* * Legacy memcg will stall in page writeback so avoid forcibly @@ -1667,16 +1786,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * backed by a congested BDI and wait_iff_congested will stall. */ if (nr_dirty && nr_dirty == nr_congested) - set_bit(ZONE_CONGESTED, &zone->flags); + set_bit(PGDAT_CONGESTED, &pgdat->flags); /* * If dirty pages are scanned that are not queued for IO, it * implies that flushers are not keeping up. In this case, flag - * the zone ZONE_DIRTY and kswapd will start writing pages from + * the pgdat PGDAT_DIRTY and kswapd will start writing pages from * reclaim context. */ if (nr_unqueued_dirty == nr_taken) - set_bit(ZONE_DIRTY, &zone->flags); + set_bit(PGDAT_DIRTY, &pgdat->flags); /* * If kswapd scans pages marked marked for immediate @@ -1695,9 +1814,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, */ if (!sc->hibernation_mode && !current_is_kswapd() && current_may_throttle()) - wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); + wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10); - trace_mm_vmscan_lru_shrink_inactive(zone, nr_scanned, nr_reclaimed, + trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, + nr_scanned, nr_reclaimed, sc->priority, file); return nr_reclaimed; } @@ -1709,9 +1829,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * processes, from rmap. * * If the pages are mostly unmapped, the processing is fast and it is - * appropriate to hold zone->lru_lock across the whole operation. But if + * appropriate to hold zone_lru_lock across the whole operation. But if * the pages are mapped, the processing is slow (page_referenced()) so we - * should drop zone->lru_lock around each page. It's impossible to balance + * should drop zone_lru_lock around each page. It's impossible to balance * this, so instead we remove the pages from the LRU while processing them. * It is safe to rely on PG_active against the non-LRU pages in here because * nobody will play with that bit on a non-LRU page. @@ -1725,20 +1845,20 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, struct list_head *pages_to_free, enum lru_list lru) { - struct zone *zone = lruvec_zone(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); unsigned long pgmoved = 0; struct page *page; int nr_pages; while (!list_empty(list)) { page = lru_to_page(list); - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, pgdat); VM_BUG_ON_PAGE(PageLRU(page), page); SetPageLRU(page); nr_pages = hpage_nr_pages(page); - update_lru_size(lruvec, lru, nr_pages); + update_lru_size(lruvec, lru, page_zonenum(page), nr_pages); list_move(&page->lru, &lruvec->lists[lru]); pgmoved += nr_pages; @@ -1748,10 +1868,10 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, del_page_from_lru_list(page, lruvec, lru); if (unlikely(PageCompound(page))) { - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); } else list_add(&page->lru, pages_to_free); } @@ -1777,7 +1897,7 @@ static void shrink_active_list(unsigned long nr_to_scan, unsigned long nr_rotated = 0; isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); - struct zone *zone = lruvec_zone(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); lru_add_drain(); @@ -1786,20 +1906,19 @@ static void shrink_active_list(unsigned long nr_to_scan, if (!sc->may_writepage) isolate_mode |= ISOLATE_CLEAN; - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, isolate_mode, lru); - update_lru_size(lruvec, lru, -nr_taken); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; if (global_reclaim(sc)) - __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); - __count_zone_vm_events(PGREFILL, zone, nr_scanned); + __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); + __count_vm_events(PGREFILL, nr_scanned); - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); while (!list_empty(&l_hold)) { cond_resched(); @@ -1844,7 +1963,7 @@ static void shrink_active_list(unsigned long nr_to_scan, /* * Move pages back to the lru list. */ - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); /* * Count referenced pages from currently used mappings as rotated, * even though only some of them are actually re-activated. This @@ -1855,8 +1974,8 @@ static void shrink_active_list(unsigned long nr_to_scan, move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); - spin_unlock_irq(&zone->lru_lock); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); + spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge_list(&l_hold); free_hot_cold_page_list(&l_hold, true); @@ -1888,12 +2007,15 @@ static void shrink_active_list(unsigned long nr_to_scan, * 1TB 101 10GB * 10TB 320 32GB */ -static bool inactive_list_is_low(struct lruvec *lruvec, bool file) +static bool inactive_list_is_low(struct lruvec *lruvec, bool file, + struct scan_control *sc) { unsigned long inactive_ratio; unsigned long inactive; unsigned long active; unsigned long gb; + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + int zid; /* * If we don't have swap space, anonymous page deactivation @@ -1905,6 +2027,27 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file) inactive = lruvec_lru_size(lruvec, file * LRU_FILE); active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE); + /* + * For zone-constrained allocations, it is necessary to check if + * deactivations are required for lowmem to be reclaimed. This + * calculates the inactive/active pages available in eligible zones. + */ + for (zid = sc->reclaim_idx + 1; zid < MAX_NR_ZONES; zid++) { + struct zone *zone = &pgdat->node_zones[zid]; + unsigned long inactive_zone, active_zone; + + if (!managed_zone(zone)) + continue; + + inactive_zone = zone_page_state(zone, + NR_ZONE_LRU_BASE + (file * LRU_FILE)); + active_zone = zone_page_state(zone, + NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE); + + inactive -= min(inactive, inactive_zone); + active -= min(active, active_zone); + } + gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) inactive_ratio = int_sqrt(10 * gb); @@ -1918,7 +2061,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc) { if (is_active_lru(lru)) { - if (inactive_list_is_low(lruvec, is_file_lru(lru))) + if (inactive_list_is_low(lruvec, is_file_lru(lru), sc)) shrink_active_list(nr_to_scan, lruvec, sc, lru); return 0; } @@ -1950,7 +2093,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; u64 denominator = 0; /* gcc */ - struct zone *zone = lruvec_zone(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); unsigned long anon_prio, file_prio; enum scan_balance scan_balance; unsigned long anon, file; @@ -1971,7 +2114,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * well. */ if (current_is_kswapd()) { - if (!zone_reclaimable(zone)) + if (!pgdat_reclaimable(pgdat)) force_scan = true; if (!mem_cgroup_online(memcg)) force_scan = true; @@ -2017,14 +2160,24 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * anon pages. Try to detect this based on file LRU size. */ if (global_reclaim(sc)) { - unsigned long zonefile; - unsigned long zonefree; + unsigned long pgdatfile; + unsigned long pgdatfree; + int z; + unsigned long total_high_wmark = 0; + + pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); + pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_FILE); + + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = &pgdat->node_zones[z]; + if (!managed_zone(zone)) + continue; - zonefree = zone_page_state(zone, NR_FREE_PAGES); - zonefile = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE); + total_high_wmark += high_wmark_pages(zone); + } - if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) { + if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) { scan_balance = SCAN_ANON; goto out; } @@ -2039,7 +2192,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * lruvec even if it has plenty of old anonymous pages unless the * system is under heavy pressure. */ - if (!inactive_list_is_low(lruvec, true) && + if (!inactive_list_is_low(lruvec, true, sc) && lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { scan_balance = SCAN_FILE; goto out; @@ -2071,7 +2224,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(&pgdat->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { reclaim_stat->recent_scanned[0] /= 2; reclaim_stat->recent_rotated[0] /= 2; @@ -2092,7 +2245,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); fp /= reclaim_stat->recent_rotated[1] + 1; - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); fraction[0] = ap; fraction[1] = fp; @@ -2151,12 +2304,12 @@ out: } /* - * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. + * This is a basic per-node page freer. Used by both kswapd and direct reclaim. */ -static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg, +static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg, struct scan_control *sc, unsigned long *lru_pages) { - struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; unsigned long nr_to_scan; @@ -2262,7 +2415,7 @@ static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg, * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_list_is_low(lruvec, false)) + if (inactive_list_is_low(lruvec, false, sc)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); @@ -2287,16 +2440,14 @@ static bool in_reclaim_compaction(struct scan_control *sc) * calls try_to_compact_zone() that it will have enough free pages to succeed. * It will give up earlier than that if there is difficulty reclaiming pages. */ -static inline bool should_continue_reclaim(struct zone *zone, +static inline bool should_continue_reclaim(struct pglist_data *pgdat, unsigned long nr_reclaimed, unsigned long nr_scanned, struct scan_control *sc) { unsigned long pages_for_compaction; unsigned long inactive_lru_pages; - - if (nr_reclaimed && nr_scanned && sc->nr_to_reclaim >= sc->nr_reclaimed) - return true; + int z; /* If not in reclaim/compaction mode, stop */ if (!in_reclaim_compaction(sc)) @@ -2330,25 +2481,32 @@ static inline bool should_continue_reclaim(struct zone *zone, * inactive lists are large enough, continue reclaiming */ pages_for_compaction = (2UL << sc->order); - inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE); + inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); if (get_nr_swap_pages() > 0) - inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON); + inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); if (sc->nr_reclaimed < pages_for_compaction && inactive_lru_pages > pages_for_compaction) return true; /* If compaction would go ahead or the allocation would succeed, stop */ - switch (compaction_suitable(zone, sc->order, 0, 0)) { - case COMPACT_PARTIAL: - case COMPACT_CONTINUE: - return false; - default: - return true; + for (z = 0; z <= sc->reclaim_idx; z++) { + struct zone *zone = &pgdat->node_zones[z]; + if (!managed_zone(zone)) + continue; + + switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) { + case COMPACT_PARTIAL: + case COMPACT_CONTINUE: + return false; + default: + /* check next zone */ + ; + } } + return true; } -static bool shrink_zone(struct zone *zone, struct scan_control *sc, - bool is_classzone) +static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) { struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long nr_reclaimed, nr_scanned; @@ -2357,10 +2515,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, do { struct mem_cgroup *root = sc->target_mem_cgroup; struct mem_cgroup_reclaim_cookie reclaim = { - .zone = zone, + .pgdat = pgdat, .priority = sc->priority, }; - unsigned long zone_lru_pages = 0; + unsigned long node_lru_pages = 0; struct mem_cgroup *memcg; nr_reclaimed = sc->nr_reclaimed; @@ -2381,11 +2539,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; - shrink_zone_memcg(zone, memcg, sc, &lru_pages); - zone_lru_pages += lru_pages; + shrink_node_memcg(pgdat, memcg, sc, &lru_pages); + node_lru_pages += lru_pages; - if (memcg && is_classzone) - shrink_slab(sc->gfp_mask, zone_to_nid(zone), + if (memcg) + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->nr_scanned - scanned, lru_pages); @@ -2397,7 +2555,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, /* * Direct reclaim and kswapd have to scan all memory * cgroups to fulfill the overall scan target for the - * zone. + * node. * * Limit reclaim, on the other hand, only cares about * nr_to_reclaim pages to be reclaimed and it will @@ -2415,10 +2573,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, * Shrink the slab caches in the same proportion that * the eligible LRU pages were scanned. */ - if (global_reclaim(sc) && is_classzone) - shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL, + if (global_reclaim(sc)) + shrink_slab(sc->gfp_mask, pgdat->node_id, NULL, sc->nr_scanned - nr_scanned, - zone_lru_pages); + node_lru_pages); if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; @@ -2433,7 +2591,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, if (sc->nr_reclaimed - nr_reclaimed) reclaimable = true; - } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, + } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); return reclaimable; @@ -2443,9 +2601,9 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, * Returns true if compaction should go ahead for a high-order request, or * the high-order allocation would succeed without compaction. */ -static inline bool compaction_ready(struct zone *zone, int order, int classzone_idx) +static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { - unsigned long balance_gap, watermark; + unsigned long watermark; bool watermark_ok; /* @@ -2454,23 +2612,21 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_ * there is a buffer of free pages available to give compaction * a reasonable chance of completing and allocating the page */ - balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( - zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); - watermark = high_wmark_pages(zone) + balance_gap + (2UL << order); - watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, classzone_idx); + watermark = high_wmark_pages(zone) + (2UL << sc->order); + watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx); /* * If compaction is deferred, reclaim up to a point where * compaction will have a chance of success when re-enabled */ - if (compaction_deferred(zone, order)) + if (compaction_deferred(zone, sc->order)) return watermark_ok; /* * If compaction is not ready to start and allocation is not likely * to succeed without it, then keep reclaiming. */ - if (compaction_suitable(zone, order, 0, classzone_idx) == COMPACT_SKIPPED) + if (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx) == COMPACT_SKIPPED) return false; return watermark_ok; @@ -2481,14 +2637,6 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_ * try to reclaim pages from zones which will satisfy the caller's allocation * request. * - * We reclaim from a zone even if that zone is over high_wmark_pages(zone). - * Because: - * a) The caller may be trying to free *extra* pages to satisfy a higher-order - * allocation or - * b) The target zone may be at high_wmark_pages(zone) but the lower zones - * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' - * zone defense algorithm. - * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. */ @@ -2499,7 +2647,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; gfp_t orig_mask; - enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); + pg_data_t *last_pgdat = NULL; /* * If the number of buffer_heads in the machine exceeds the maximum @@ -2507,21 +2655,13 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * highmem pages could be pinning lowmem pages storing buffer_heads */ orig_mask = sc->gfp_mask; - if (buffer_heads_over_limit) + if (buffer_heads_over_limit) { sc->gfp_mask |= __GFP_HIGHMEM; + sc->reclaim_idx = gfp_zone(sc->gfp_mask); + } for_each_zone_zonelist_nodemask(zone, z, zonelist, - gfp_zone(sc->gfp_mask), sc->nodemask) { - enum zone_type classzone_idx; - - if (!populated_zone(zone)) - continue; - - classzone_idx = requested_highidx; - while (!populated_zone(zone->zone_pgdat->node_zones + - classzone_idx)) - classzone_idx--; - + sc->reclaim_idx, sc->nodemask) { /* * Take care memory controller reclaiming has small influence * to global LRU. @@ -2532,7 +2672,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) continue; if (sc->priority != DEF_PRIORITY && - !zone_reclaimable(zone)) + !pgdat_reclaimable(zone->zone_pgdat)) continue; /* Let kswapd poll it */ /* @@ -2546,12 +2686,20 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) */ if (IS_ENABLED(CONFIG_COMPACTION) && sc->order > PAGE_ALLOC_COSTLY_ORDER && - zonelist_zone_idx(z) <= requested_highidx && - compaction_ready(zone, sc->order, requested_highidx)) { + compaction_ready(zone, sc)) { sc->compaction_ready = true; continue; } + /* + * Shrink each node in the zonelist once. If the + * zonelist is ordered by zone (not the default) then a + * node may be shrunk multiple times but in that case + * the user prefers lower zones being preserved. + */ + if (zone->zone_pgdat == last_pgdat) + continue; + /* * This steals pages from memory cgroups over softlimit * and returns the number of reclaimed pages and @@ -2559,7 +2707,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * and balancing, not for a memcg's limit. */ nr_soft_scanned = 0; - nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat, sc->order, sc->gfp_mask, &nr_soft_scanned); sc->nr_reclaimed += nr_soft_reclaimed; @@ -2567,7 +2715,11 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) /* need some check for avoid more shrink_zone() */ } - shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); + /* See comment about same check for global reclaim above */ + if (zone->zone_pgdat == last_pgdat) + continue; + last_pgdat = zone->zone_pgdat; + shrink_node(zone->zone_pgdat, sc); } /* @@ -2599,17 +2751,11 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, int initial_priority = sc->priority; unsigned long total_scanned = 0; unsigned long writeback_threshold; - -#ifdef CONFIG_FREEZER - if (unlikely(pm_freezing && !sc->hibernation_mode)) - return 0; -#endif - retry: delayacct_freepages_start(); if (global_reclaim(sc)) - count_vm_event(ALLOCSTALL); + __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); do { vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, @@ -2675,8 +2821,8 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; - if (!populated_zone(zone) || - zone_reclaimable_pages(zone) == 0) + if (!managed_zone(zone) || + pgdat_reclaimable_pages(pgdat) == 0) continue; pfmemalloc_reserve += min_wmark_pages(zone); @@ -2691,7 +2837,7 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) /* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { - pgdat->classzone_idx = min(pgdat->classzone_idx, + pgdat->kswapd_classzone_idx = min(pgdat->kswapd_classzone_idx, (enum zone_type)ZONE_NORMAL); wake_up_interruptible(&pgdat->kswapd_wait); } @@ -2799,6 +2945,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .reclaim_idx = gfp_zone(gfp_mask), .order = order, .nodemask = nodemask, .priority = DEF_PRIORITY, @@ -2817,7 +2964,8 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, trace_mm_vmscan_direct_reclaim_begin(order, sc.may_writepage, - gfp_mask); + gfp_mask, + sc.reclaim_idx); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); @@ -2828,9 +2976,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_MEMCG -unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, +unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, gfp_t gfp_mask, bool noswap, - struct zone *zone, + pg_data_t *pgdat, unsigned long *nr_scanned) { struct scan_control sc = { @@ -2838,6 +2986,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, .target_mem_cgroup = memcg, .may_writepage = !laptop_mode, .may_unmap = 1, + .reclaim_idx = MAX_NR_ZONES - 1, .may_swap = !noswap, }; unsigned long lru_pages; @@ -2847,16 +2996,17 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, sc.may_writepage, - sc.gfp_mask); + sc.gfp_mask, + sc.reclaim_idx); /* * NOTE: Although we can get the priority field, using it * here is not a good idea, since it limits the pages we can scan. - * if we don't reclaim here, the shrink_zone from balance_pgdat + * if we don't reclaim here, the shrink_node from balance_pgdat * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ - shrink_zone_memcg(zone, memcg, &sc, &lru_pages); + shrink_node_memcg(pgdat, memcg, &sc, &lru_pages); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -2876,6 +3026,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), + .reclaim_idx = MAX_NR_ZONES - 1, .target_mem_cgroup = memcg, .priority = DEF_PRIORITY, .may_writepage = !laptop_mode, @@ -2894,7 +3045,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, trace_mm_vmscan_memcg_reclaim_begin(0, sc.may_writepage, - sc.gfp_mask); + sc.gfp_mask, + sc.reclaim_idx); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); @@ -2904,7 +3056,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, } #endif -static void age_active_anon(struct zone *zone, struct scan_control *sc) +static void age_active_anon(struct pglist_data *pgdat, + struct scan_control *sc) { struct mem_cgroup *memcg; @@ -2913,9 +3066,9 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc) memcg = mem_cgroup_iter(NULL, NULL, NULL); do { - struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); - if (inactive_list_is_low(lruvec, false)) + if (inactive_list_is_low(lruvec, false, sc)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); @@ -2923,82 +3076,21 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc) } while (memcg); } -static bool zone_balanced(struct zone *zone, int order, bool highorder, - unsigned long balance_gap, int classzone_idx) +static bool zone_balanced(struct zone *zone, int order, int classzone_idx) { - unsigned long mark = high_wmark_pages(zone) + balance_gap; + unsigned long mark = high_wmark_pages(zone); + + if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx)) + return false; /* - * When checking from pgdat_balanced(), kswapd should stop and sleep - * when it reaches the high order-0 watermark and let kcompactd take - * over. Other callers such as wakeup_kswapd() want to determine the - * true high-order watermark. + * If any eligible zone is balanced then the node is not considered + * to be congested or dirty */ - if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) { - mark += (1UL << order); - order = 0; - } - - return zone_watermark_ok_safe(zone, order, mark, classzone_idx); -} - -/* - * pgdat_balanced() is used when checking if a node is balanced. - * - * For order-0, all zones must be balanced! - * - * For high-order allocations only zones that meet watermarks and are in a - * zone allowed by the callers classzone_idx are added to balanced_pages. The - * total of balanced pages must be at least 25% of the zones allowed by - * classzone_idx for the node to be considered balanced. Forcing all zones to - * be balanced for high orders can cause excessive reclaim when there are - * imbalanced zones. - * The choice of 25% is due to - * o a 16M DMA zone that is balanced will not balance a zone on any - * reasonable sized machine - * o On all other machines, the top zone must be at least a reasonable - * percentage of the middle zones. For example, on 32-bit x86, highmem - * would need to be at least 256M for it to be balance a whole node. - * Similarly, on x86-64 the Normal zone would need to be at least 1G - * to balance a node on its own. These seemed like reasonable ratios. - */ -static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) -{ - unsigned long managed_pages = 0; - unsigned long balanced_pages = 0; - int i; - - /* Check the watermark levels */ - for (i = 0; i <= classzone_idx; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - managed_pages += zone->managed_pages; - - /* - * A special case here: - * - * balance_pgdat() skips over all_unreclaimable after - * DEF_PRIORITY. Effectively, it considers them balanced so - * they must be considered balanced here as well! - */ - if (!zone_reclaimable(zone)) { - balanced_pages += zone->managed_pages; - continue; - } - - if (zone_balanced(zone, order, false, 0, i)) - balanced_pages += zone->managed_pages; - else if (!order) - return false; - } + clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags); + clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags); - if (order) - return balanced_pages >= (managed_pages >> 2); - else - return true; + return true; } /* @@ -3007,12 +3099,9 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) * * Returns true if kswapd is ready to sleep */ -static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, - int classzone_idx) +static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) { - /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ - if (remaining) - return false; + int i; /* * The throttled processes are normally woken up in balance_pgdat() as @@ -3030,91 +3119,81 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, if (waitqueue_active(&pgdat->pfmemalloc_wait)) wake_up_all(&pgdat->pfmemalloc_wait); - return pgdat_balanced(pgdat, order, classzone_idx); + for (i = 0; i <= classzone_idx; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (!managed_zone(zone)) + continue; + + if (!zone_balanced(zone, order, classzone_idx)) + return false; + } + + return true; } /* - * kswapd shrinks the zone by the number of pages required to reach - * the high watermark. + * kswapd shrinks a node of pages that are at or below the highest usable + * zone that is currently unbalanced. * * Returns true if kswapd scanned at least the requested number of pages to * reclaim or if the lack of progress was due to pages under writeback. * This is used to determine if the scanning priority needs to be raised. */ -static bool kswapd_shrink_zone(struct zone *zone, - int classzone_idx, +static bool kswapd_shrink_node(pg_data_t *pgdat, struct scan_control *sc) { - unsigned long balance_gap; - bool lowmem_pressure; + struct zone *zone; + int z; - /* Reclaim above the high watermark. */ - sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); + /* Reclaim a number of pages proportional to the number of zones */ + sc->nr_to_reclaim = 0; + for (z = 0; z <= sc->reclaim_idx; z++) { + zone = pgdat->node_zones + z; + if (!managed_zone(zone)) + continue; - /* - * We put equal pressure on every zone, unless one zone has way too - * many pages free already. The "too many pages" is defined as the - * high wmark plus a "gap" where the gap is either the low - * watermark or 1% of the zone, whichever is smaller. - */ - balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( - zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); + sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX); + } /* - * If there is no low memory pressure or the zone is balanced then no - * reclaim is necessary + * Historically care was taken to put equal pressure on all zones but + * now pressure is applied based on node LRU order. */ - lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); - if (!lowmem_pressure && zone_balanced(zone, sc->order, false, - balance_gap, classzone_idx)) - return true; - - shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); - - clear_bit(ZONE_WRITEBACK, &zone->flags); + shrink_node(pgdat, sc); /* - * If a zone reaches its high watermark, consider it to be no longer - * congested. It's possible there are dirty pages backed by congested - * BDIs but as pressure is relieved, speculatively avoid congestion - * waits. + * Fragmentation may mean that the system cannot be rebalanced for + * high-order allocations. If twice the allocation size has been + * reclaimed then recheck watermarks only at order-0 to prevent + * excessive reclaim. Assume that a process requested a high-order + * can direct reclaim/compact. */ - if (zone_reclaimable(zone) && - zone_balanced(zone, sc->order, false, 0, classzone_idx)) { - clear_bit(ZONE_CONGESTED, &zone->flags); - clear_bit(ZONE_DIRTY, &zone->flags); - } + if (sc->order && sc->nr_reclaimed >= 2UL << sc->order) + sc->order = 0; return sc->nr_scanned >= sc->nr_to_reclaim; } /* - * For kswapd, balance_pgdat() will work across all this node's zones until - * they are all at high_wmark_pages(zone). - * - * Returns the highest zone idx kswapd was reclaiming at + * For kswapd, balance_pgdat() will reclaim pages across a node from zones + * that are eligible for use by the caller until at least one zone is + * balanced. * - * There is special handling here for zones which are full of pinned pages. - * This can happen if the pages are all mlocked, or if they are all used by - * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. - * What we do is to detect the case where all pages in the zone have been - * scanned twice and there has been zero successful reclaim. Mark the zone as - * dead and from now on, only perform a short scan. Basically we're polling - * the zone for when the problem goes away. + * Returns the order kswapd finished reclaiming at. * * kswapd scans the zones in the highmem->normal->dma direction. It skips * zones which have free_pages > high_wmark_pages(zone), but once a zone is - * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the - * lower zones regardless of the number of free pages in the lower zones. This - * interoperates with the page allocator fallback scheme to ensure that aging - * of pages is balanced across the zones. + * found to have free_pages <= high_wmark_pages(zone), any page is that zone + * or lower is eligible for reclaim until at least one usable zone is + * balanced. */ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) { int i; - int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; + struct zone *zone; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .order = order, @@ -3129,100 +3208,77 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) bool raise_priority = true; sc.nr_reclaimed = 0; + sc.reclaim_idx = classzone_idx; /* - * Scan in the highmem->dma direction for the highest - * zone which needs scanning + * If the number of buffer_heads exceeds the maximum allowed + * then consider reclaiming from all zones. This has a dual + * purpose -- on 64-bit systems it is expected that + * buffer_heads are stripped during active rotation. On 32-bit + * systems, highmem pages can pin lowmem memory and shrinking + * buffers can relieve lowmem pressure. Reclaim may still not + * go ahead if all eligible zones for the original allocation + * request are balanced to avoid excessive reclaim from kswapd. */ - for (i = pgdat->nr_zones - 1; i >= 0; i--) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - if (sc.priority != DEF_PRIORITY && - !zone_reclaimable(zone)) - continue; - - /* - * Do some background aging of the anon list, to give - * pages a chance to be referenced before reclaiming. - */ - age_active_anon(zone, &sc); + if (buffer_heads_over_limit) { + for (i = MAX_NR_ZONES - 1; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (!managed_zone(zone)) + continue; - /* - * If the number of buffer_heads in the machine - * exceeds the maximum allowed level and this node - * has a highmem zone, force kswapd to reclaim from - * it to relieve lowmem pressure. - */ - if (buffer_heads_over_limit && is_highmem_idx(i)) { - end_zone = i; + sc.reclaim_idx = i; break; } + } - if (!zone_balanced(zone, order, false, 0, 0)) { - end_zone = i; - break; - } else { - /* - * If balanced, clear the dirty and congested - * flags - */ - clear_bit(ZONE_CONGESTED, &zone->flags); - clear_bit(ZONE_DIRTY, &zone->flags); - } + /* + * Only reclaim if there are no eligible zones. Check from + * high to low zone as allocations prefer higher zones. + * Scanning from low to high zone would allow congestion to be + * cleared during a very small window when a small low + * zone was balanced even under extreme pressure when the + * overall node may be congested. Note that sc.reclaim_idx + * is not used as buffer_heads_over_limit may have adjusted + * it. + */ + for (i = classzone_idx; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (!managed_zone(zone)) + continue; + + if (zone_balanced(zone, sc.order, classzone_idx)) + goto out; } - if (i < 0) - goto out; + /* + * Do some background aging of the anon list, to give + * pages a chance to be referenced before reclaiming. All + * pages are rotated regardless of classzone as this is + * about consistent aging. + */ + age_active_anon(pgdat, &sc); /* * If we're getting trouble reclaiming, start doing writepage * even in laptop mode. */ - if (sc.priority < DEF_PRIORITY - 2) + if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat)) sc.may_writepage = 1; + /* Call soft limit reclaim before calling shrink_node. */ + sc.nr_scanned = 0; + nr_soft_scanned = 0; + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order, + sc.gfp_mask, &nr_soft_scanned); + sc.nr_reclaimed += nr_soft_reclaimed; + /* - * Now scan the zone in the dma->highmem direction, stopping - * at the last zone which needs scanning. - * - * We do this because the page allocator works in the opposite - * direction. This prevents the page allocator from allocating - * pages behind kswapd's direction of progress, which would - * cause too much scanning of the lower zones. + * There should be no need to raise the scanning priority if + * enough pages are already being scanned that that high + * watermark would be met at 100% efficiency. */ - for (i = 0; i <= end_zone; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - if (sc.priority != DEF_PRIORITY && - !zone_reclaimable(zone)) - continue; - - sc.nr_scanned = 0; - - nr_soft_scanned = 0; - /* - * Call soft limit reclaim before calling shrink_zone. - */ - nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, - order, sc.gfp_mask, - &nr_soft_scanned); - sc.nr_reclaimed += nr_soft_reclaimed; - - /* - * There should be no need to raise the scanning - * priority if enough pages are already being scanned - * that that high watermark would be met at 100% - * efficiency. - */ - if (kswapd_shrink_zone(zone, end_zone, &sc)) - raise_priority = false; - } + if (kswapd_shrink_node(pgdat, &sc)) + raise_priority = false; /* * If the low watermark is met there is no need for processes @@ -3243,19 +3299,20 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) */ if (raise_priority || !sc.nr_reclaimed) sc.priority--; - } while (sc.priority >= 1 && - !pgdat_balanced(pgdat, order, classzone_idx)); + } while (sc.priority >= 1); out: /* - * Return the highest zone idx we were reclaiming at so - * prepare_kswapd_sleep() makes the same decisions as here. + * Return the order kswapd stopped reclaiming at as + * prepare_kswapd_sleep() takes it into account. If another caller + * entered the allocator slow path while kswapd was awake, order will + * remain at the higher level. */ - return end_zone; + return sc.order; } -static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, - int classzone_idx, int balanced_classzone_idx) +static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, + unsigned int classzone_idx) { long remaining = 0; DEFINE_WAIT(wait); @@ -3266,8 +3323,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); /* Try to sleep for a short interval */ - if (prepare_kswapd_sleep(pgdat, order, remaining, - balanced_classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { /* * Compaction records what page blocks it recently failed to * isolate pages from and skips them in the future scanning. @@ -3280,9 +3336,20 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, * We have freed the memory, now we should compact it to make * allocation of the requested order possible. */ - wakeup_kcompactd(pgdat, order, classzone_idx); + wakeup_kcompactd(pgdat, alloc_order, classzone_idx); remaining = schedule_timeout(HZ/10); + + /* + * If woken prematurely then reset kswapd_classzone_idx and + * order. The values will either be from a wakeup request or + * the previous request that slept prematurely. + */ + if (remaining) { + pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); + pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order); + } + finish_wait(&pgdat->kswapd_wait, &wait); prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); } @@ -3291,8 +3358,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, * After a short sleep, check if it was a premature sleep. If not, then * go fully to sleep until explicitly woken up. */ - if (prepare_kswapd_sleep(pgdat, order, remaining, - balanced_classzone_idx)) { + if (!remaining && + prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* @@ -3333,9 +3400,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, */ static int kswapd(void *p) { - unsigned long order, new_order; - int classzone_idx, new_classzone_idx; - int balanced_classzone_idx; + unsigned int alloc_order, reclaim_order, classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; @@ -3365,38 +3430,20 @@ static int kswapd(void *p) tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; set_freezable(); - order = new_order = 0; - classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; - balanced_classzone_idx = classzone_idx; + pgdat->kswapd_order = alloc_order = reclaim_order = 0; + pgdat->kswapd_classzone_idx = classzone_idx = 0; for ( ; ; ) { bool ret; - /* - * While we were reclaiming, there might have been another - * wakeup, so check the values. - */ - new_order = pgdat->kswapd_max_order; - new_classzone_idx = pgdat->classzone_idx; - pgdat->kswapd_max_order = 0; - pgdat->classzone_idx = pgdat->nr_zones - 1; +kswapd_try_sleep: + kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, + classzone_idx); - if (order < new_order || classzone_idx > new_classzone_idx) { - /* - * Don't sleep if someone wants a larger 'order' - * allocation or has tigher zone constraints - */ - order = new_order; - classzone_idx = new_classzone_idx; - } else { - kswapd_try_to_sleep(pgdat, order, classzone_idx, - balanced_classzone_idx); - order = pgdat->kswapd_max_order; - classzone_idx = pgdat->classzone_idx; - new_order = order; - new_classzone_idx = classzone_idx; - pgdat->kswapd_max_order = 0; - pgdat->classzone_idx = pgdat->nr_zones - 1; - } + /* Read the new order and classzone_idx */ + alloc_order = reclaim_order = pgdat->kswapd_order; + classzone_idx = pgdat->kswapd_classzone_idx; + pgdat->kswapd_order = 0; + pgdat->kswapd_classzone_idx = 0; ret = try_to_freeze(); if (kthread_should_stop()) @@ -3406,11 +3453,25 @@ static int kswapd(void *p) * We can speed up thawing tasks if we don't call balance_pgdat * after returning from the refrigerator */ - if (!ret) { - trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - balanced_classzone_idx = balance_pgdat(pgdat, order, - classzone_idx); - } + if (ret) + continue; + + /* + * Reclaim begins at the requested order but if a high-order + * reclaim fails then kswapd falls back to reclaiming for + * order-0. If that happens, kswapd will consider sleeping + * for the order it finished reclaiming at (reclaim_order) + * but kcompactd is woken to compact for the original + * request (alloc_order). + */ + trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, + alloc_order); + reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); + if (reclaim_order < alloc_order) + goto kswapd_try_sleep; + + alloc_order = reclaim_order = pgdat->kswapd_order; + classzone_idx = pgdat->kswapd_classzone_idx; } tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); @@ -3426,26 +3487,28 @@ static int kswapd(void *p) void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) { pg_data_t *pgdat; + int z; - if (!populated_zone(zone)) - return; - -#ifdef CONFIG_FREEZER - if (pm_freezing) + if (!managed_zone(zone)) return; -#endif if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) return; pgdat = zone->zone_pgdat; - if (pgdat->kswapd_max_order < order) { - pgdat->kswapd_max_order = order; - pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); - } + pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); + pgdat->kswapd_order = max(pgdat->kswapd_order, order); if (!waitqueue_active(&pgdat->kswapd_wait)) return; - if (zone_balanced(zone, order, true, 0, 0)) - return; + + /* Only wake kswapd if all zones are unbalanced */ + for (z = 0; z <= classzone_idx; z++) { + zone = pgdat->node_zones + z; + if (!managed_zone(zone)) + continue; + + if (zone_balanced(zone, order, classzone_idx)) + return; + } trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); wake_up_interruptible(&pgdat->kswapd_wait); @@ -3460,12 +3523,13 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) * LRU order by reclaiming preferentially * inactive > active > active referenced > active mapped */ -unsigned long shrink_memory_mask(unsigned long nr_to_reclaim, gfp_t mask) +unsigned long shrink_all_memory(unsigned long nr_to_reclaim) { struct reclaim_state reclaim_state; struct scan_control sc = { .nr_to_reclaim = nr_to_reclaim, .gfp_mask = GFP_HIGHUSER_MOVABLE, + .reclaim_idx = MAX_NR_ZONES - 1, .priority = DEF_PRIORITY, .may_writepage = 1, .may_unmap = 1, @@ -3489,11 +3553,6 @@ unsigned long shrink_memory_mask(unsigned long nr_to_reclaim, gfp_t mask) return nr_reclaimed; } - -unsigned long shrink_all_memory(unsigned long nr_to_reclaim) -{ - return shrink_memory_mask(nr_to_reclaim, GFP_HIGHUSER_MOVABLE); -} #endif /* CONFIG_HIBERNATION */ /* It's optimal to keep kswapds on the same CPUs as their memory, but @@ -3572,12 +3631,12 @@ module_init(kswapd_init) #ifdef CONFIG_NUMA /* - * Zone reclaim mode + * Node reclaim mode * - * If non-zero call zone_reclaim when the number of free pages falls below + * If non-zero call node_reclaim when the number of free pages falls below * the watermarks. */ -int zone_reclaim_mode __read_mostly; +int node_reclaim_mode __read_mostly; #define RECLAIM_OFF 0 #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ @@ -3585,14 +3644,14 @@ int zone_reclaim_mode __read_mostly; #define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ /* - * Priority for ZONE_RECLAIM. This determines the fraction of pages + * Priority for NODE_RECLAIM. This determines the fraction of pages * of a node considered for each zone_reclaim. 4 scans 1/16th of * a zone. */ -#define ZONE_RECLAIM_PRIORITY 4 +#define NODE_RECLAIM_PRIORITY 4 /* - * Percentage of pages in a zone that must be unmapped for zone_reclaim to + * Percentage of pages in a zone that must be unmapped for node_reclaim to * occur. */ int sysctl_min_unmapped_ratio = 1; @@ -3603,11 +3662,11 @@ int sysctl_min_unmapped_ratio = 1; */ int sysctl_min_slab_ratio = 5; -static inline unsigned long zone_unmapped_file_pages(struct zone *zone) +static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat) { - unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); - unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + - zone_page_state(zone, NR_ACTIVE_FILE); + unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED); + unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) + + node_page_state(pgdat, NR_ACTIVE_FILE); /* * It's possible for there to be more file mapped pages than @@ -3618,7 +3677,7 @@ static inline unsigned long zone_unmapped_file_pages(struct zone *zone) } /* Work out how many page cache pages we can reclaim in this reclaim_mode */ -static unsigned long zone_pagecache_reclaimable(struct zone *zone) +static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat) { unsigned long nr_pagecache_reclaimable; unsigned long delta = 0; @@ -3626,17 +3685,17 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone) /* * If RECLAIM_UNMAP is set, then all file pages are considered * potentially reclaimable. Otherwise, we have to worry about - * pages like swapcache and zone_unmapped_file_pages() provides + * pages like swapcache and node_unmapped_file_pages() provides * a better estimate */ - if (zone_reclaim_mode & RECLAIM_UNMAP) - nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); + if (node_reclaim_mode & RECLAIM_UNMAP) + nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES); else - nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); + nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat); /* If we can't clean pages, remove dirty pages from consideration */ - if (!(zone_reclaim_mode & RECLAIM_WRITE)) - delta += zone_page_state(zone, NR_FILE_DIRTY); + if (!(node_reclaim_mode & RECLAIM_WRITE)) + delta += node_page_state(pgdat, NR_FILE_DIRTY); /* Watch for any possible underflows due to delta */ if (unlikely(delta > nr_pagecache_reclaimable)) @@ -3646,22 +3705,24 @@ static unsigned long zone_pagecache_reclaimable(struct zone *zone) } /* - * Try to free up some pages from this zone through reclaim. + * Try to free up some pages from this node through reclaim. */ -static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) { /* Minimum pages needed in order to stay on node */ const unsigned long nr_pages = 1 << order; struct task_struct *p = current; struct reclaim_state reclaim_state; + int classzone_idx = gfp_zone(gfp_mask); struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), .order = order, - .priority = ZONE_RECLAIM_PRIORITY, - .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP), + .priority = NODE_RECLAIM_PRIORITY, + .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), + .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), .may_swap = 1, + .reclaim_idx = classzone_idx, }; cond_resched(); @@ -3675,13 +3736,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { + if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) { /* * Free memory by calling shrink zone with increasing * priorities until we have enough memory freed. */ do { - shrink_zone(zone, &sc, true); + shrink_node(pgdat, &sc); } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); } @@ -3691,49 +3752,47 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) return sc.nr_reclaimed >= nr_pages; } -int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) { - int node_id; int ret; /* - * Zone reclaim reclaims unmapped file backed pages and + * Node reclaim reclaims unmapped file backed pages and * slab pages if we are over the defined limits. * * A small portion of unmapped file backed pages is needed for * file I/O otherwise pages read by file I/O will be immediately - * thrown out if the zone is overallocated. So we do not reclaim - * if less than a specified percentage of the zone is used by + * thrown out if the node is overallocated. So we do not reclaim + * if less than a specified percentage of the node is used by * unmapped file backed pages. */ - if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && - zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) - return ZONE_RECLAIM_FULL; + if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && + sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) + return NODE_RECLAIM_FULL; - if (!zone_reclaimable(zone)) - return ZONE_RECLAIM_FULL; + if (!pgdat_reclaimable(pgdat)) + return NODE_RECLAIM_FULL; /* * Do not scan if the allocation should not be delayed. */ if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) - return ZONE_RECLAIM_NOSCAN; + return NODE_RECLAIM_NOSCAN; /* - * Only run zone reclaim on the local zone or on zones that do not + * Only run node reclaim on the local node or on nodes that do not * have associated processors. This will favor the local processor * over remote processors and spread off node memory allocations * as wide as possible. */ - node_id = zone_to_nid(zone); - if (node_state(node_id, N_CPU) && node_id != numa_node_id()) - return ZONE_RECLAIM_NOSCAN; + if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id()) + return NODE_RECLAIM_NOSCAN; - if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags)) - return ZONE_RECLAIM_NOSCAN; + if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags)) + return NODE_RECLAIM_NOSCAN; - ret = __zone_reclaim(zone, gfp_mask, order); - clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags); + ret = __node_reclaim(pgdat, gfp_mask, order); + clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags); if (!ret) count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); @@ -3772,24 +3831,23 @@ int page_evictable(struct page *page) void check_move_unevictable_pages(struct page **pages, int nr_pages) { struct lruvec *lruvec; - struct zone *zone = NULL; + struct pglist_data *pgdat = NULL; int pgscanned = 0; int pgrescued = 0; int i; for (i = 0; i < nr_pages; i++) { struct page *page = pages[i]; - struct zone *pagezone; + struct pglist_data *pagepgdat = page_pgdat(page); pgscanned++; - pagezone = page_zone(page); - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); + if (pagepgdat != pgdat) { + if (pgdat) + spin_unlock_irq(&pgdat->lru_lock); + pgdat = pagepgdat; + spin_lock_irq(&pgdat->lru_lock); } - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, pgdat); if (!PageLRU(page) || !PageUnevictable(page)) continue; @@ -3805,10 +3863,10 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) } } - if (zone) { + if (pgdat) { __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(&pgdat->lru_lock); } } #endif /* CONFIG_SHMEM */ -- cgit v1.2.3-54-g00ecf