diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 451 |
1 files changed, 223 insertions, 228 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fcb5b8cb4..4be518d4e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -132,15 +132,11 @@ static const char * const mem_cgroup_lru_names[] = { * their hierarchy representation */ -struct mem_cgroup_tree_per_zone { +struct mem_cgroup_tree_per_node { struct rb_root rb_root; spinlock_t lock; }; -struct mem_cgroup_tree_per_node { - struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; -}; - struct mem_cgroup_tree { struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; }; @@ -323,15 +319,6 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key); #endif /* !CONFIG_SLOB */ -static struct mem_cgroup_per_zone * -mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) -{ - int nid = zone_to_nid(zone); - int zid = zone_idx(zone); - - return &memcg->nodeinfo[nid]->zoneinfo[zid]; -} - /** * mem_cgroup_css_from_page - css of the memcg associated with a page * @page: page of interest @@ -383,37 +370,35 @@ ino_t page_cgroup_ino(struct page *page) return ino; } -static struct mem_cgroup_per_zone * -mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) +static struct mem_cgroup_per_node * +mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page) { int nid = page_to_nid(page); - int zid = page_zonenum(page); - return &memcg->nodeinfo[nid]->zoneinfo[zid]; + return memcg->nodeinfo[nid]; } -static struct mem_cgroup_tree_per_zone * -soft_limit_tree_node_zone(int nid, int zid) +static struct mem_cgroup_tree_per_node * +soft_limit_tree_node(int nid) { - return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; + return soft_limit_tree.rb_tree_per_node[nid]; } -static struct mem_cgroup_tree_per_zone * +static struct mem_cgroup_tree_per_node * soft_limit_tree_from_page(struct page *page) { int nid = page_to_nid(page); - int zid = page_zonenum(page); - return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; + return soft_limit_tree.rb_tree_per_node[nid]; } -static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz, +static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, + struct mem_cgroup_tree_per_node *mctz, unsigned long new_usage_in_excess) { struct rb_node **p = &mctz->rb_root.rb_node; struct rb_node *parent = NULL; - struct mem_cgroup_per_zone *mz_node; + struct mem_cgroup_per_node *mz_node; if (mz->on_tree) return; @@ -423,7 +408,7 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, return; while (*p) { parent = *p; - mz_node = rb_entry(parent, struct mem_cgroup_per_zone, + mz_node = rb_entry(parent, struct mem_cgroup_per_node, tree_node); if (mz->usage_in_excess < mz_node->usage_in_excess) p = &(*p)->rb_left; @@ -439,8 +424,8 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, mz->on_tree = true; } -static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) +static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, + struct mem_cgroup_tree_per_node *mctz) { if (!mz->on_tree) return; @@ -448,8 +433,8 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, mz->on_tree = false; } -static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, - struct mem_cgroup_tree_per_zone *mctz) +static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, + struct mem_cgroup_tree_per_node *mctz) { unsigned long flags; @@ -473,8 +458,8 @@ static unsigned long soft_limit_excess(struct mem_cgroup *memcg) static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) { unsigned long excess; - struct mem_cgroup_per_zone *mz; - struct mem_cgroup_tree_per_zone *mctz; + struct mem_cgroup_per_node *mz; + struct mem_cgroup_tree_per_node *mctz; mctz = soft_limit_tree_from_page(page); /* @@ -482,7 +467,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) * because their event counter is not touched. */ for (; memcg; memcg = parent_mem_cgroup(memcg)) { - mz = mem_cgroup_page_zoneinfo(memcg, page); + mz = mem_cgroup_page_nodeinfo(memcg, page); excess = soft_limit_excess(memcg); /* * We have to update the tree if mz is on RB-tree or @@ -507,24 +492,22 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) { - struct mem_cgroup_tree_per_zone *mctz; - struct mem_cgroup_per_zone *mz; - int nid, zid; + struct mem_cgroup_tree_per_node *mctz; + struct mem_cgroup_per_node *mz; + int nid; for_each_node(nid) { - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; - mctz = soft_limit_tree_node_zone(nid, zid); - mem_cgroup_remove_exceeded(mz, mctz); - } + mz = mem_cgroup_nodeinfo(memcg, nid); + mctz = soft_limit_tree_node(nid); + mem_cgroup_remove_exceeded(mz, mctz); } } -static struct mem_cgroup_per_zone * -__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) +static struct mem_cgroup_per_node * +__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) { struct rb_node *rightmost = NULL; - struct mem_cgroup_per_zone *mz; + struct mem_cgroup_per_node *mz; retry: mz = NULL; @@ -532,7 +515,7 @@ retry: if (!rightmost) goto done; /* Nothing to reclaim from */ - mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); + mz = rb_entry(rightmost, struct mem_cgroup_per_node, tree_node); /* * Remove the node now but someone else can add it back, * we will to add it back at the end of reclaim to its correct @@ -546,10 +529,10 @@ done: return mz; } -static struct mem_cgroup_per_zone * -mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) +static struct mem_cgroup_per_node * +mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) { - struct mem_cgroup_per_zone *mz; + struct mem_cgroup_per_node *mz; spin_lock_irq(&mctz->lock); mz = __mem_cgroup_largest_soft_limit_node(mctz); @@ -643,20 +626,16 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) { unsigned long nr = 0; - int zid; + struct mem_cgroup_per_node *mz; + enum lru_list lru; VM_BUG_ON((unsigned)nid >= nr_node_ids); - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - struct mem_cgroup_per_zone *mz; - enum lru_list lru; - - for_each_lru(lru) { - if (!(BIT(lru) & lru_mask)) - continue; - mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; - nr += mz->lru_size[lru]; - } + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; + mz = mem_cgroup_nodeinfo(memcg, nid); + nr += mz->lru_size[lru]; } return nr; } @@ -809,9 +788,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, rcu_read_lock(); if (reclaim) { - struct mem_cgroup_per_zone *mz; + struct mem_cgroup_per_node *mz; - mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); + mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id); iter = &mz->iter[reclaim->priority]; if (prev && reclaim->generation != iter->generation) @@ -910,19 +889,17 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) { struct mem_cgroup *memcg = dead_memcg; struct mem_cgroup_reclaim_iter *iter; - struct mem_cgroup_per_zone *mz; - int nid, zid; + struct mem_cgroup_per_node *mz; + int nid; int i; while ((memcg = parent_mem_cgroup(memcg))) { for_each_node(nid) { - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; - for (i = 0; i <= DEF_PRIORITY; i++) { - iter = &mz->iter[i]; - cmpxchg(&iter->position, - dead_memcg, NULL); - } + mz = mem_cgroup_nodeinfo(memcg, nid); + for (i = 0; i <= DEF_PRIORITY; i++) { + iter = &mz->iter[i]; + cmpxchg(&iter->position, + dead_memcg, NULL); } } } @@ -944,39 +921,6 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) iter = mem_cgroup_iter(NULL, iter, NULL)) /** - * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg - * @zone: zone of the wanted lruvec - * @memcg: memcg of the wanted lruvec - * - * Returns the lru list vector holding pages for the given @zone and - * @mem. This can be the global zone lruvec, if the memory controller - * is disabled. - */ -struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, - struct mem_cgroup *memcg) -{ - struct mem_cgroup_per_zone *mz; - struct lruvec *lruvec; - - if (mem_cgroup_disabled()) { - lruvec = &zone->lruvec; - goto out; - } - - mz = mem_cgroup_zone_zoneinfo(memcg, zone); - lruvec = &mz->lruvec; -out: - /* - * Since a node can be onlined after the mem_cgroup was created, - * we have to be prepared to initialize lruvec->zone here; - * and if offlined then reonlined, we need to reinitialize it. - */ - if (unlikely(lruvec->zone != zone)) - lruvec->zone = zone; - return lruvec; -} - -/** * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page * @page: the page * @zone: zone of the page @@ -985,14 +929,14 @@ out: * and putback protocol: the LRU lock must be held, and the page must * either be PageLRU() or the caller must have isolated/allocated it. */ -struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) +struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) { - struct mem_cgroup_per_zone *mz; + struct mem_cgroup_per_node *mz; struct mem_cgroup *memcg; struct lruvec *lruvec; if (mem_cgroup_disabled()) { - lruvec = &zone->lruvec; + lruvec = &pgdat->lruvec; goto out; } @@ -1004,7 +948,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) if (!memcg) memcg = root_mem_cgroup; - mz = mem_cgroup_page_zoneinfo(memcg, page); + mz = mem_cgroup_page_nodeinfo(memcg, page); lruvec = &mz->lruvec; out: /* @@ -1012,8 +956,8 @@ out: * we have to be prepared to initialize lruvec->zone here; * and if offlined then reonlined, we need to reinitialize it. */ - if (unlikely(lruvec->zone != zone)) - lruvec->zone = zone; + if (unlikely(lruvec->pgdat != pgdat)) + lruvec->pgdat = pgdat; return lruvec; } @@ -1030,17 +974,15 @@ out: void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int nr_pages) { - struct mem_cgroup_per_zone *mz; + struct mem_cgroup_per_node *mz; unsigned long *lru_size; long size; bool empty; - __update_lru_size(lruvec, lru, nr_pages); - if (mem_cgroup_disabled()) return; - mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); lru_size = mz->lru_size + lru; empty = list_empty(lruvec->lists + lru); @@ -1259,6 +1201,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, struct oom_control oc = { .zonelist = NULL, .nodemask = NULL, + .memcg = memcg, .gfp_mask = gfp_mask, .order = order, }; @@ -1275,13 +1218,13 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, * select it. The goal is to allow it to allocate so that it may * quickly exit and free its memory. */ - if (fatal_signal_pending(current) || task_will_free_mem(current)) { + if (task_will_free_mem(current)) { mark_oom_victim(current); - try_oom_reaper(current); + wake_oom_reaper(current); goto unlock; } - check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg); + check_panic_on_oom(&oc, CONSTRAINT_MEMCG); totalpages = mem_cgroup_get_limit(memcg) ? : 1; for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; @@ -1289,7 +1232,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, css_task_iter_start(&iter->css, &it); while ((task = css_task_iter_next(&it))) { - switch (oom_scan_process_thread(&oc, task, totalpages)) { + switch (oom_scan_process_thread(&oc, task)) { case OOM_SCAN_SELECT: if (chosen) put_task_struct(chosen); @@ -1329,7 +1272,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, if (chosen) { points = chosen_points * 1000 / totalpages; - oom_kill_process(&oc, chosen, points, totalpages, memcg, + oom_kill_process(&oc, chosen, points, totalpages, "Memory cgroup out of memory"); } unlock: @@ -1432,7 +1375,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) #endif static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, - struct zone *zone, + pg_data_t *pgdat, gfp_t gfp_mask, unsigned long *total_scanned) { @@ -1442,7 +1385,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, unsigned long excess; unsigned long nr_scanned; struct mem_cgroup_reclaim_cookie reclaim = { - .zone = zone, + .pgdat = pgdat, .priority = 0, }; @@ -1472,8 +1415,8 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, } continue; } - total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, - zone, &nr_scanned); + total += mem_cgroup_shrink_node(victim, gfp_mask, false, + pgdat, &nr_scanned); *total_scanned += nr_scanned; if (!soft_limit_excess(root_memcg)) break; @@ -2119,11 +2062,11 @@ static void lock_page_lru(struct page *page, int *isolated) { struct zone *zone = page_zone(page); - spin_lock_irq(&zone->lru_lock); + spin_lock_irq(zone_lru_lock(zone)); if (PageLRU(page)) { struct lruvec *lruvec; - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_lru(page)); *isolated = 1; @@ -2138,12 +2081,12 @@ static void unlock_page_lru(struct page *page, int isolated) if (isolated) { struct lruvec *lruvec; - lruvec = mem_cgroup_page_lruvec(page, zone); + lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); VM_BUG_ON_PAGE(PageLRU(page), page); SetPageLRU(page); add_page_to_lru_list(page, lruvec, page_lru(page)); } - spin_unlock_irq(&zone->lru_lock); + spin_unlock_irq(zone_lru_lock(zone)); } static void commit_charge(struct page *page, struct mem_cgroup *memcg, @@ -2285,20 +2228,30 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, current->memcg_kmem_skip_account = 0; } -/* +static inline bool memcg_kmem_bypass(void) +{ + if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) + return true; + return false; +} + +/** + * memcg_kmem_get_cache: select the correct per-memcg cache for allocation + * @cachep: the original global kmem cache + * * Return the kmem_cache we're supposed to use for a slab allocation. * We try to use the current memcg's version of the cache. * - * If the cache does not exist yet, if we are the first user of it, - * we either create it immediately, if possible, or create it asynchronously - * in a workqueue. - * In the latter case, we will let the current allocation go through with - * the original cache. + * If the cache does not exist yet, if we are the first user of it, we + * create it asynchronously in a workqueue and let the current allocation + * go through with the original cache. * - * Can't be called in interrupt context or from kernel threads. - * This function needs to be called with rcu_read_lock() held. + * This function takes a reference to the cache it returns to assure it + * won't get destroyed while we are working with it. Once the caller is + * done with it, memcg_kmem_put_cache() must be called to release the + * reference. */ -struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) +struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) { struct mem_cgroup *memcg; struct kmem_cache *memcg_cachep; @@ -2306,10 +2259,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) VM_BUG_ON(!is_root_cache(cachep)); - if (cachep->flags & SLAB_ACCOUNT) - gfp |= __GFP_ACCOUNT; - - if (!(gfp & __GFP_ACCOUNT)) + if (memcg_kmem_bypass()) return cachep; if (current->memcg_kmem_skip_account) @@ -2342,14 +2292,27 @@ out: return cachep; } -void __memcg_kmem_put_cache(struct kmem_cache *cachep) +/** + * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache + * @cachep: the cache returned by memcg_kmem_get_cache + */ +void memcg_kmem_put_cache(struct kmem_cache *cachep) { if (!is_root_cache(cachep)) css_put(&cachep->memcg_params.memcg->css); } -int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, - struct mem_cgroup *memcg) +/** + * memcg_kmem_charge: charge a kmem page + * @page: page to charge + * @gfp: reclaim mode + * @order: allocation order + * @memcg: memory cgroup to charge + * + * Returns 0 on success, an error code on failure. + */ +int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, + struct mem_cgroup *memcg) { unsigned int nr_pages = 1 << order; struct page_counter *counter; @@ -2370,19 +2333,37 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, return 0; } -int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) +/** + * memcg_kmem_charge: charge a kmem page to the current memory cgroup + * @page: page to charge + * @gfp: reclaim mode + * @order: allocation order + * + * Returns 0 on success, an error code on failure. + */ +int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { struct mem_cgroup *memcg; int ret = 0; + if (memcg_kmem_bypass()) + return 0; + memcg = get_mem_cgroup_from_mm(current->mm); - if (!mem_cgroup_is_root(memcg)) - ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); + if (!mem_cgroup_is_root(memcg)) { + ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); + if (!ret) + __SetPageKmemcg(page); + } css_put(&memcg->css); return ret; } - -void __memcg_kmem_uncharge(struct page *page, int order) +/** + * memcg_kmem_uncharge: uncharge a kmem page + * @page: page to uncharge + * @order: allocation order + */ +void memcg_kmem_uncharge(struct page *page, int order) { struct mem_cgroup *memcg = page->mem_cgroup; unsigned int nr_pages = 1 << order; @@ -2400,6 +2381,11 @@ void __memcg_kmem_uncharge(struct page *page, int order) page_counter_uncharge(&memcg->memsw, nr_pages); page->mem_cgroup = NULL; + + /* slab pages do not have PageKmemcg flag set */ + if (PageKmemcg(page)) + __ClearPageKmemcg(page); + css_put_many(&memcg->css, nr_pages); } #endif /* !CONFIG_SLOB */ @@ -2408,7 +2394,7 @@ void __memcg_kmem_uncharge(struct page *page, int order) /* * Because tail pages are not marked as "used", set it. We're under - * zone->lru_lock and migration entries setup in all page mappings. + * zone_lru_lock and migration entries setup in all page mappings. */ void mem_cgroup_split_huge_fixup(struct page *head) { @@ -2578,22 +2564,31 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, return ret; } -unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, +unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned) { unsigned long nr_reclaimed = 0; - struct mem_cgroup_per_zone *mz, *next_mz = NULL; + struct mem_cgroup_per_node *mz, *next_mz = NULL; unsigned long reclaimed; int loop = 0; - struct mem_cgroup_tree_per_zone *mctz; + struct mem_cgroup_tree_per_node *mctz; unsigned long excess; unsigned long nr_scanned; if (order > 0) return 0; - mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); + mctz = soft_limit_tree_node(pgdat->node_id); + + /* + * Do not even bother to check the largest node if the root + * is empty. Do it lockless to prevent lock bouncing. Races + * are acceptable as soft limit is best effort anyway. + */ + if (RB_EMPTY_ROOT(&mctz->rb_root)) + return 0; + /* * This loop can run a while, specially if mem_cgroup's continuously * keep exceeding their soft limit and putting the system under @@ -2608,7 +2603,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, break; nr_scanned = 0; - reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, + reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, gfp_mask, &nr_scanned); nr_reclaimed += reclaimed; *total_scanned += nr_scanned; @@ -3229,22 +3224,21 @@ static int memcg_stat_show(struct seq_file *m, void *v) #ifdef CONFIG_DEBUG_VM { - int nid, zid; - struct mem_cgroup_per_zone *mz; + pg_data_t *pgdat; + struct mem_cgroup_per_node *mz; struct zone_reclaim_stat *rstat; unsigned long recent_rotated[2] = {0, 0}; unsigned long recent_scanned[2] = {0, 0}; - for_each_online_node(nid) - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; - rstat = &mz->lruvec.reclaim_stat; + for_each_online_pgdat(pgdat) { + mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); + rstat = &mz->lruvec.reclaim_stat; - recent_rotated[0] += rstat->recent_rotated[0]; - recent_rotated[1] += rstat->recent_rotated[1]; - recent_scanned[0] += rstat->recent_scanned[0]; - recent_scanned[1] += rstat->recent_scanned[1]; - } + recent_rotated[0] += rstat->recent_rotated[0]; + recent_rotated[1] += rstat->recent_rotated[1]; + recent_scanned[0] += rstat->recent_scanned[0]; + recent_scanned[1] += rstat->recent_scanned[1]; + } seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); @@ -4101,24 +4095,6 @@ static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) atomic_add(n, &memcg->id.ref); } -static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) -{ - while (!atomic_inc_not_zero(&memcg->id.ref)) { - /* - * The root cgroup cannot be destroyed, so it's refcount must - * always be >= 1. - */ - if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { - VM_BUG_ON(1); - break; - } - memcg = parent_mem_cgroup(memcg); - if (!memcg) - memcg = root_mem_cgroup; - } - return memcg; -} - static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) { if (atomic_sub_and_test(n, &memcg->id.ref)) { @@ -4152,11 +4128,10 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return idr_find(&mem_cgroup_idr, id); } -static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) +static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; - struct mem_cgroup_per_zone *mz; - int zone, tmp = node; + int tmp = node; /* * This routine is called against possible nodes. * But it's BUG to call kmalloc() against offline node. @@ -4171,18 +4146,16 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) if (!pn) return 1; - for (zone = 0; zone < MAX_NR_ZONES; zone++) { - mz = &pn->zoneinfo[zone]; - lruvec_init(&mz->lruvec); - mz->usage_in_excess = 0; - mz->on_tree = false; - mz->memcg = memcg; - } + lruvec_init(&pn->lruvec); + pn->usage_in_excess = 0; + pn->on_tree = false; + pn->memcg = memcg; + memcg->nodeinfo[node] = pn; return 0; } -static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) +static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { kfree(memcg->nodeinfo[node]); } @@ -4193,7 +4166,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) memcg_wb_domain_exit(memcg); for_each_node(node) - free_mem_cgroup_per_zone_info(memcg, node); + free_mem_cgroup_per_node_info(memcg, node); free_percpu(memcg->stat); kfree(memcg); } @@ -4222,7 +4195,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) goto fail; for_each_node(node) - if (alloc_mem_cgroup_per_zone_info(memcg, node)) + if (alloc_mem_cgroup_per_node_info(memcg, node)) goto fail; if (memcg_wb_domain_init(memcg, GFP_KERNEL)) @@ -4450,7 +4423,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, #ifdef CONFIG_SWAP static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, - unsigned long addr, pte_t ptent, swp_entry_t *entry) + pte_t ptent, swp_entry_t *entry) { struct page *page = NULL; swp_entry_t ent = pte_to_swp_entry(ptent); @@ -4469,7 +4442,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, } #else static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, - unsigned long addr, pte_t ptent, swp_entry_t *entry) + pte_t ptent, swp_entry_t *entry) { return NULL; } @@ -4512,7 +4485,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, /** * mem_cgroup_move_account - move account of the page * @page: the page - * @nr_pages: number of regular pages (>1 for huge pages) + * @compound: charge the page as compound or small page * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. * @@ -4634,7 +4607,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, if (pte_present(ptent)) page = mc_handle_present_pte(vma, addr, ptent); else if (is_swap_pte(ptent)) - page = mc_handle_swap_pte(vma, addr, ptent, &ent); + page = mc_handle_swap_pte(vma, ptent, &ent); else if (pte_none(ptent)) page = mc_handle_file_pte(vma, addr, ptent, &ent); @@ -5240,7 +5213,7 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "file %llu\n", (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); seq_printf(m, "kernel_stack %llu\n", - (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE); + (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024); seq_printf(m, "slab %llu\n", (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); @@ -5376,6 +5349,7 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) * @mm: mm context of the victim * @gfp_mask: reclaim mode * @memcgp: charged memcg return + * @compound: charge the page as compound or small page * * Try to charge @page to the memcg that @mm belongs to, reclaiming * pages according to @gfp_mask if necessary. @@ -5438,6 +5412,7 @@ out: * @page: page to charge * @memcg: memcg to charge the page to * @lrucare: page might be on LRU already + * @compound: charge the page as compound or small page * * Finalize a charge transaction started by mem_cgroup_try_charge(), * after page->mapping has been set up. This must happen atomically @@ -5489,6 +5464,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, * mem_cgroup_cancel_charge - cancel a page charge * @page: page to charge * @memcg: memcg to charge the page to + * @compound: charge the page as compound or small page * * Cancel a charge transaction started by mem_cgroup_try_charge(). */ @@ -5512,15 +5488,18 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, unsigned long nr_anon, unsigned long nr_file, - unsigned long nr_huge, struct page *dummy_page) + unsigned long nr_huge, unsigned long nr_kmem, + struct page *dummy_page) { - unsigned long nr_pages = nr_anon + nr_file; + unsigned long nr_pages = nr_anon + nr_file + nr_kmem; unsigned long flags; if (!mem_cgroup_is_root(memcg)) { page_counter_uncharge(&memcg->memory, nr_pages); if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem) + page_counter_uncharge(&memcg->kmem, nr_kmem); memcg_oom_recover(memcg); } @@ -5543,6 +5522,7 @@ static void uncharge_list(struct list_head *page_list) unsigned long nr_anon = 0; unsigned long nr_file = 0; unsigned long nr_huge = 0; + unsigned long nr_kmem = 0; unsigned long pgpgout = 0; struct list_head *next; struct page *page; @@ -5553,8 +5533,6 @@ static void uncharge_list(struct list_head *page_list) */ next = page_list->next; do { - unsigned int nr_pages = 1; - page = list_entry(next, struct page, lru); next = page->lru.next; @@ -5573,31 +5551,36 @@ static void uncharge_list(struct list_head *page_list) if (memcg != page->mem_cgroup) { if (memcg) { uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_huge, page); - pgpgout = nr_anon = nr_file = nr_huge = 0; + nr_huge, nr_kmem, page); + pgpgout = nr_anon = nr_file = + nr_huge = nr_kmem = 0; } memcg = page->mem_cgroup; } - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - nr_huge += nr_pages; - } + if (!PageKmemcg(page)) { + unsigned int nr_pages = 1; - if (PageAnon(page)) - nr_anon += nr_pages; - else - nr_file += nr_pages; + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + nr_huge += nr_pages; + } + if (PageAnon(page)) + nr_anon += nr_pages; + else + nr_file += nr_pages; + pgpgout++; + } else { + nr_kmem += 1 << compound_order(page); + __ClearPageKmemcg(page); + } page->mem_cgroup = NULL; - - pgpgout++; } while (next != page_list); if (memcg) uncharge_batch(memcg, pgpgout, nr_anon, nr_file, - nr_huge, page); + nr_huge, nr_kmem, page); } /** @@ -5819,18 +5802,12 @@ static int __init mem_cgroup_init(void) for_each_node(node) { struct mem_cgroup_tree_per_node *rtpn; - int zone; rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node_online(node) ? node : NUMA_NO_NODE); - for (zone = 0; zone < MAX_NR_ZONES; zone++) { - struct mem_cgroup_tree_per_zone *rtpz; - - rtpz = &rtpn->rb_tree_per_zone[zone]; - rtpz->rb_root = RB_ROOT; - spin_lock_init(&rtpz->lock); - } + rtpn->rb_root = RB_ROOT; + spin_lock_init(&rtpn->lock); soft_limit_tree.rb_tree_per_node[node] = rtpn; } @@ -5839,6 +5816,24 @@ static int __init mem_cgroup_init(void) subsys_initcall(mem_cgroup_init); #ifdef CONFIG_MEMCG_SWAP +static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) +{ + while (!atomic_inc_not_zero(&memcg->id.ref)) { + /* + * The root cgroup cannot be destroyed, so it's refcount must + * always be >= 1. + */ + if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { + VM_BUG_ON(1); + break; + } + memcg = parent_mem_cgroup(memcg); + if (!memcg) + memcg = root_mem_cgroup; + } + return memcg; +} + /** * mem_cgroup_swapout - transfer a memsw charge to swap * @page: page whose memsw charge to transfer |