diff options
Diffstat (limited to 'mm/slab.c')
-rw-r--r-- | mm/slab.c | 1162 |
1 files changed, 622 insertions, 540 deletions
@@ -169,12 +169,6 @@ typedef unsigned short freelist_idx_t; #define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) /* - * true if a page was allocated from pfmemalloc reserves for network-based - * swap - */ -static bool pfmemalloc_active __read_mostly; - -/* * struct array_cache * * Purpose: @@ -195,10 +189,6 @@ struct array_cache { * Must have this definition in here for the proper * alignment of array_cache. Also simplifies accessing * the entries. - * - * Entries should not be directly dereferenced as - * entries belonging to slabs marked pfmemalloc will - * have the lower bits set SLAB_OBJ_PFMEMALLOC */ }; @@ -207,33 +197,6 @@ struct alien_cache { struct array_cache ac; }; -#define SLAB_OBJ_PFMEMALLOC 1 -static inline bool is_obj_pfmemalloc(void *objp) -{ - return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC; -} - -static inline void set_obj_pfmemalloc(void **objp) -{ - *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC); - return; -} - -static inline void clear_obj_pfmemalloc(void **objp) -{ - *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC); -} - -/* - * bootstrap: The caches do not work without cpuarrays anymore, but the - * cpuarrays are allocated from the generic caches... - */ -#define BOOT_CPUCACHE_ENTRIES 1 -struct arraycache_init { - struct array_cache cache; - void *entries[BOOT_CPUCACHE_ENTRIES]; -}; - /* * Need this for bootstrapping a per node allocator. */ @@ -280,9 +243,10 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ } while (0) +#define CFLGS_OBJFREELIST_SLAB (0x40000000UL) #define CFLGS_OFF_SLAB (0x80000000UL) +#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB) #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) -#define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1)) #define BATCHREFILL_LIMIT 16 /* @@ -390,36 +354,26 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif -#define OBJECT_FREE (0) -#define OBJECT_ACTIVE (1) - #ifdef CONFIG_DEBUG_SLAB_LEAK -static void set_obj_status(struct page *page, int idx, int val) +static inline bool is_store_user_clean(struct kmem_cache *cachep) { - int freelist_size; - char *status; - struct kmem_cache *cachep = page->slab_cache; - - freelist_size = cachep->num * sizeof(freelist_idx_t); - status = (char *)page->freelist + freelist_size; - status[idx] = val; + return atomic_read(&cachep->store_user_clean) == 1; } -static inline unsigned int get_obj_status(struct page *page, int idx) +static inline void set_store_user_clean(struct kmem_cache *cachep) { - int freelist_size; - char *status; - struct kmem_cache *cachep = page->slab_cache; - - freelist_size = cachep->num * sizeof(freelist_idx_t); - status = (char *)page->freelist + freelist_size; + atomic_set(&cachep->store_user_clean, 1); +} - return status[idx]; +static inline void set_store_user_dirty(struct kmem_cache *cachep) +{ + if (is_store_user_clean(cachep)) + atomic_set(&cachep->store_user_clean, 0); } #else -static inline void set_obj_status(struct page *page, int idx, int val) {} +static inline void set_store_user_dirty(struct kmem_cache *cachep) {} #endif @@ -457,6 +411,7 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache, return reciprocal_divide(offset, cache->reciprocal_buffer_size); } +#define BOOT_CPUCACHE_ENTRIES 1 /* internal cache of cache description objs */ static struct kmem_cache kmem_cache_boot = { .batchcount = 1, @@ -475,61 +430,13 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) return this_cpu_ptr(cachep->cpu_cache); } -static size_t calculate_freelist_size(int nr_objs, size_t align) -{ - size_t freelist_size; - - freelist_size = nr_objs * sizeof(freelist_idx_t); - if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) - freelist_size += nr_objs * sizeof(char); - - if (align) - freelist_size = ALIGN(freelist_size, align); - - return freelist_size; -} - -static int calculate_nr_objs(size_t slab_size, size_t buffer_size, - size_t idx_size, size_t align) -{ - int nr_objs; - size_t remained_size; - size_t freelist_size; - int extra_space = 0; - - if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) - extra_space = sizeof(char); - /* - * Ignore padding for the initial guess. The padding - * is at most @align-1 bytes, and @buffer_size is at - * least @align. In the worst case, this result will - * be one greater than the number of objects that fit - * into the memory allocation when taking the padding - * into account. - */ - nr_objs = slab_size / (buffer_size + idx_size + extra_space); - - /* - * This calculated number will be either the right - * amount, or one greater than what we want. - */ - remained_size = slab_size - nr_objs * buffer_size; - freelist_size = calculate_freelist_size(nr_objs, align); - if (remained_size < freelist_size) - nr_objs--; - - return nr_objs; -} - /* * Calculate the number of objects and left-over bytes for a given buffer size. */ -static void cache_estimate(unsigned long gfporder, size_t buffer_size, - size_t align, int flags, size_t *left_over, - unsigned int *num) +static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size, + unsigned long flags, size_t *left_over) { - int nr_objs; - size_t mgmt_size; + unsigned int num; size_t slab_size = PAGE_SIZE << gfporder; /* @@ -537,26 +444,28 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * on it. For the latter case, the memory allocated for a * slab is used for: * - * - One unsigned int for each object - * - Padding to respect alignment of @align * - @buffer_size bytes for each object + * - One freelist_idx_t for each object + * + * We don't need to consider alignment of freelist because + * freelist will be at the end of slab page. The objects will be + * at the correct alignment. * * If the slab management structure is off the slab, then the * alignment will already be calculated into the size. Because * the slabs are all pages aligned, the objects will be at the * correct alignment when allocated. */ - if (flags & CFLGS_OFF_SLAB) { - mgmt_size = 0; - nr_objs = slab_size / buffer_size; - + if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) { + num = slab_size / buffer_size; + *left_over = slab_size % buffer_size; } else { - nr_objs = calculate_nr_objs(slab_size, buffer_size, - sizeof(freelist_idx_t), align); - mgmt_size = calculate_freelist_size(nr_objs, align); + num = slab_size / (buffer_size + sizeof(freelist_idx_t)); + *left_over = slab_size % + (buffer_size + sizeof(freelist_idx_t)); } - *num = nr_objs; - *left_over = slab_size - nr_objs*buffer_size - mgmt_size; + + return num; } #if DEBUG @@ -565,7 +474,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg) { - printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", + pr_err("slab error in %s(): cache `%s': %s\n", function, cachep->name, msg); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); @@ -687,120 +596,21 @@ static struct array_cache *alloc_arraycache(int node, int entries, return ac; } -static inline bool is_slab_pfmemalloc(struct page *page) +static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep, + struct page *page, void *objp) { - return PageSlabPfmemalloc(page); -} - -/* Clears pfmemalloc_active if no slabs have pfmalloc set */ -static void recheck_pfmemalloc_active(struct kmem_cache *cachep, - struct array_cache *ac) -{ - struct kmem_cache_node *n = get_node(cachep, numa_mem_id()); - struct page *page; - unsigned long flags; - - if (!pfmemalloc_active) - return; - - spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->slabs_full, lru) - if (is_slab_pfmemalloc(page)) - goto out; - - list_for_each_entry(page, &n->slabs_partial, lru) - if (is_slab_pfmemalloc(page)) - goto out; - - list_for_each_entry(page, &n->slabs_free, lru) - if (is_slab_pfmemalloc(page)) - goto out; - - pfmemalloc_active = false; -out: - spin_unlock_irqrestore(&n->list_lock, flags); -} - -static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, - gfp_t flags, bool force_refill) -{ - int i; - void *objp = ac->entry[--ac->avail]; - - /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */ - if (unlikely(is_obj_pfmemalloc(objp))) { - struct kmem_cache_node *n; - - if (gfp_pfmemalloc_allowed(flags)) { - clear_obj_pfmemalloc(&objp); - return objp; - } - - /* The caller cannot use PFMEMALLOC objects, find another one */ - for (i = 0; i < ac->avail; i++) { - /* If a !PFMEMALLOC object is found, swap them */ - if (!is_obj_pfmemalloc(ac->entry[i])) { - objp = ac->entry[i]; - ac->entry[i] = ac->entry[ac->avail]; - ac->entry[ac->avail] = objp; - return objp; - } - } - - /* - * If there are empty slabs on the slabs_free list and we are - * being forced to refill the cache, mark this one !pfmemalloc. - */ - n = get_node(cachep, numa_mem_id()); - if (!list_empty(&n->slabs_free) && force_refill) { - struct page *page = virt_to_head_page(objp); - ClearPageSlabPfmemalloc(page); - clear_obj_pfmemalloc(&objp); - recheck_pfmemalloc_active(cachep, ac); - return objp; - } - - /* No !PFMEMALLOC objects available */ - ac->avail++; - objp = NULL; - } - - return objp; -} - -static inline void *ac_get_obj(struct kmem_cache *cachep, - struct array_cache *ac, gfp_t flags, bool force_refill) -{ - void *objp; - - if (unlikely(sk_memalloc_socks())) - objp = __ac_get_obj(cachep, ac, flags, force_refill); - else - objp = ac->entry[--ac->avail]; - - return objp; -} - -static noinline void *__ac_put_obj(struct kmem_cache *cachep, - struct array_cache *ac, void *objp) -{ - if (unlikely(pfmemalloc_active)) { - /* Some pfmemalloc slabs exist, check if this is one */ - struct page *page = virt_to_head_page(objp); - if (PageSlabPfmemalloc(page)) - set_obj_pfmemalloc(&objp); - } + struct kmem_cache_node *n; + int page_node; + LIST_HEAD(list); - return objp; -} + page_node = page_to_nid(page); + n = get_node(cachep, page_node); -static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, - void *objp) -{ - if (unlikely(sk_memalloc_socks())) - objp = __ac_put_obj(cachep, ac, objp); + spin_lock(&n->list_lock); + free_block(cachep, &objp, 1, page_node, &list); + spin_unlock(&n->list_lock); - ac->entry[ac->avail++] = objp; + slabs_destroy(cachep, &list); } /* @@ -860,7 +670,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, static inline gfp_t gfp_exact_node(gfp_t flags) { - return flags; + return flags & ~__GFP_NOFAIL; } #else /* CONFIG_NUMA */ @@ -1003,7 +813,7 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp, STATS_INC_ACOVERFLOW(cachep); __drain_alien_cache(cachep, ac, page_node, &list); } - ac_put_obj(cachep, ac, objp); + ac->entry[ac->avail++] = objp; spin_unlock(&alien->lock); slabs_destroy(cachep, &list); } else { @@ -1031,12 +841,12 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) } /* - * Construct gfp mask to allocate from a specific node but do not direct reclaim - * or warn about failures. kswapd may still wake to reclaim in the background. + * Construct gfp mask to allocate from a specific node but do not reclaim or + * warn about failures. */ static inline gfp_t gfp_exact_node(gfp_t flags) { - return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM; + return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~(__GFP_RECLAIM|__GFP_NOFAIL); } #endif @@ -1540,10 +1350,9 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) return; - printk(KERN_WARNING - "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", - nodeid, gfpflags); - printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", + pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", + nodeid, gfpflags, &gfpflags); + pr_warn(" cache: %s, object size: %d, order: %d\n", cachep->name, cachep->size, cachep->gfporder); for_each_kmem_cache_node(cachep, node, n) { @@ -1567,8 +1376,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) num_slabs += active_slabs; num_objs = num_slabs * cachep->num; - printk(KERN_WARNING - " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", + pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", node, active_slabs, num_slabs, active_objs, num_objs, free_objects); } @@ -1604,10 +1412,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, return NULL; } - /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ - if (page_is_pfmemalloc(page)) - pfmemalloc_active = true; - nr_pages = (1 << cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) add_zone_page_state(page_zone(page), @@ -1615,8 +1419,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, else add_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_pages); + __SetPageSlab(page); - if (page_is_pfmemalloc(page)) + /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ + if (sk_memalloc_socks() && page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { @@ -1636,9 +1442,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, */ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) { - const unsigned long nr_freed = (1 << cachep->gfporder); + int order = cachep->gfporder; + unsigned long nr_freed = (1 << order); - kmemcheck_free_shadow(page, cachep->gfporder); + kmemcheck_free_shadow(page, order); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) sub_zone_page_state(page_zone(page), @@ -1655,7 +1462,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - __free_kmem_pages(page, cachep->gfporder); + memcg_uncharge_slab(page, order, cachep); + __free_pages(page, order); } static void kmem_rcu_free(struct rcu_head *head) @@ -1670,6 +1478,14 @@ static void kmem_rcu_free(struct rcu_head *head) } #if DEBUG +static bool is_debug_pagealloc_cache(struct kmem_cache *cachep) +{ + if (debug_pagealloc_enabled() && OFF_SLAB(cachep) && + (cachep->size % PAGE_SIZE) == 0) + return true; + + return false; +} #ifdef CONFIG_DEBUG_PAGEALLOC static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, @@ -1703,6 +1519,23 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, } *addr++ = 0x87654321; } + +static void slab_kernel_map(struct kmem_cache *cachep, void *objp, + int map, unsigned long caller) +{ + if (!is_debug_pagealloc_cache(cachep)) + return; + + if (caller) + store_stackinfo(cachep, objp, caller); + + kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); +} + +#else +static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp, + int map, unsigned long caller) {} + #endif static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) @@ -1720,7 +1553,7 @@ static void dump_line(char *data, int offset, int limit) unsigned char error = 0; int bad_count = 0; - printk(KERN_ERR "%03x: ", offset); + pr_err("%03x: ", offset); for (i = 0; i < limit; i++) { if (data[offset + i] != POISON_FREE) { error = data[offset + i]; @@ -1733,13 +1566,11 @@ static void dump_line(char *data, int offset, int limit) if (bad_count == 1) { error ^= POISON_FREE; if (!(error & (error - 1))) { - printk(KERN_ERR "Single bit error detected. Probably " - "bad RAM.\n"); + pr_err("Single bit error detected. Probably bad RAM.\n"); #ifdef CONFIG_X86 - printk(KERN_ERR "Run memtest86+ or a similar memory " - "test tool.\n"); + pr_err("Run memtest86+ or a similar memory test tool.\n"); #else - printk(KERN_ERR "Run a memory test tool.\n"); + pr_err("Run a memory test tool.\n"); #endif } } @@ -1754,13 +1585,13 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) char *realobj; if (cachep->flags & SLAB_RED_ZONE) { - printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n", - *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); + pr_err("Redzone: 0x%llx/0x%llx\n", + *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } if (cachep->flags & SLAB_STORE_USER) { - printk(KERN_ERR "Last user: [<%p>](%pSR)\n", + pr_err("Last user: [<%p>](%pSR)\n", *dbg_userword(cachep, objp), *dbg_userword(cachep, objp)); } @@ -1781,6 +1612,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) int size, i; int lines = 0; + if (is_debug_pagealloc_cache(cachep)) + return; + realobj = (char *)objp + obj_offset(cachep); size = cachep->object_size; @@ -1793,9 +1627,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Mismatch ! */ /* Print header */ if (lines == 0) { - printk(KERN_ERR - "Slab corruption (%s): %s start=%p, len=%d\n", - print_tainted(), cachep->name, realobj, size); + pr_err("Slab corruption (%s): %s start=%p, len=%d\n", + print_tainted(), cachep->name, + realobj, size); print_objinfo(cachep, objp, 0); } /* Hexdump the affected line */ @@ -1822,15 +1656,13 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) if (objnr) { objp = index_to_obj(cachep, page, objnr - 1); realobj = (char *)objp + obj_offset(cachep); - printk(KERN_ERR "Prev obj: start=%p, len=%d\n", - realobj, size); + pr_err("Prev obj: start=%p, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } if (objnr + 1 < cachep->num) { objp = index_to_obj(cachep, page, objnr + 1); realobj = (char *)objp + obj_offset(cachep); - printk(KERN_ERR "Next obj: start=%p, len=%d\n", - realobj, size); + pr_err("Next obj: start=%p, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } } @@ -1842,28 +1674,24 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct page *page) { int i; + + if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) { + poison_obj(cachep, page->freelist - obj_offset(cachep), + POISON_FREE); + } + for (i = 0; i < cachep->num; i++) { void *objp = index_to_obj(cachep, page, i); if (cachep->flags & SLAB_POISON) { -#ifdef CONFIG_DEBUG_PAGEALLOC - if (cachep->size % PAGE_SIZE == 0 && - OFF_SLAB(cachep)) - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 1); - else - check_poison_obj(cachep, objp); -#else check_poison_obj(cachep, objp); -#endif + slab_kernel_map(cachep, objp, 1, 0); } if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "start of a freed object " - "was overwritten"); + slab_error(cachep, "start of a freed object was overwritten"); if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "end of a freed object " - "was overwritten"); + slab_error(cachep, "end of a freed object was overwritten"); } } } @@ -1916,7 +1744,6 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) * calculate_slab_order - calculate size (page order) of slabs * @cachep: pointer to the cache that is being created * @size: size of objects to be created in this cache. - * @align: required alignment for the objects. * @flags: slab allocation flags * * Also calculates the number of objects per slab. @@ -1926,9 +1753,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) * towards high-order requests, this should be changed. */ static size_t calculate_slab_order(struct kmem_cache *cachep, - size_t size, size_t align, unsigned long flags) + size_t size, unsigned long flags) { - unsigned long offslab_limit; size_t left_over = 0; int gfporder; @@ -1936,7 +1762,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, unsigned int num; size_t remainder; - cache_estimate(gfporder, size, align, flags, &remainder, &num); + num = cache_estimate(gfporder, size, flags, &remainder); if (!num) continue; @@ -1945,19 +1771,24 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, break; if (flags & CFLGS_OFF_SLAB) { - size_t freelist_size_per_obj = sizeof(freelist_idx_t); + struct kmem_cache *freelist_cache; + size_t freelist_size; + + freelist_size = num * sizeof(freelist_idx_t); + freelist_cache = kmalloc_slab(freelist_size, 0u); + if (!freelist_cache) + continue; + /* - * Max number of objs-per-slab for caches which - * use off-slab slabs. Needed to avoid a possible - * looping condition in cache_grow(). + * Needed to avoid possible looping condition + * in cache_grow() */ - if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) - freelist_size_per_obj += sizeof(char); - offslab_limit = size; - offslab_limit /= freelist_size_per_obj; + if (OFF_SLAB(freelist_cache)) + continue; - if (num > offslab_limit) - break; + /* check if off slab has enough benefit */ + if (freelist_cache->size > cachep->size / 2) + continue; } /* Found something acceptable - save it away */ @@ -2075,6 +1906,79 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, return cachep; } +static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, + size_t size, unsigned long flags) +{ + size_t left; + + cachep->num = 0; + + if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU) + return false; + + left = calculate_slab_order(cachep, size, + flags | CFLGS_OBJFREELIST_SLAB); + if (!cachep->num) + return false; + + if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size) + return false; + + cachep->colour = left / cachep->colour_off; + + return true; +} + +static bool set_off_slab_cache(struct kmem_cache *cachep, + size_t size, unsigned long flags) +{ + size_t left; + + cachep->num = 0; + + /* + * Always use on-slab management when SLAB_NOLEAKTRACE + * to avoid recursive calls into kmemleak. + */ + if (flags & SLAB_NOLEAKTRACE) + return false; + + /* + * Size is large, assume best to place the slab management obj + * off-slab (should allow better packing of objs). + */ + left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB); + if (!cachep->num) + return false; + + /* + * If the slab has been placed off-slab, and we have enough space then + * move it on-slab. This is at the expense of any extra colouring. + */ + if (left >= cachep->num * sizeof(freelist_idx_t)) + return false; + + cachep->colour = left / cachep->colour_off; + + return true; +} + +static bool set_on_slab_cache(struct kmem_cache *cachep, + size_t size, unsigned long flags) +{ + size_t left; + + cachep->num = 0; + + left = calculate_slab_order(cachep, size, flags); + if (!cachep->num) + return false; + + cachep->colour = left / cachep->colour_off; + + return true; +} + /** * __kmem_cache_create - Create a cache. * @cachep: cache management descriptor @@ -2099,7 +2003,6 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, int __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) { - size_t left_over, freelist_size; size_t ralign = BYTES_PER_WORD; gfp_t gfp; int err; @@ -2119,8 +2022,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (!(flags & SLAB_DESTROY_BY_RCU)) flags |= SLAB_POISON; #endif - if (flags & SLAB_DESTROY_BY_RCU) - BUG_ON(flags & SLAB_POISON); #endif /* @@ -2152,6 +2053,10 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * 4) Store it. */ cachep->align = ralign; + cachep->colour_off = cache_line_size(); + /* Offset must be a multiple of the alignment. */ + if (cachep->colour_off < cachep->align) + cachep->colour_off = cachep->align; if (slab_is_available()) gfp = GFP_KERNEL; @@ -2179,36 +2084,9 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) else size += BYTES_PER_WORD; } -#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) - /* - * To activate debug pagealloc, off-slab management is necessary - * requirement. In early phase of initialization, small sized slab - * doesn't get initialized so it would not be possible. So, we need - * to check size >= 256. It guarantees that all necessary small - * sized slab is initialized in current slab initialization sequence. - */ - if (!slab_early_init && size >= kmalloc_size(INDEX_NODE) && - size >= 256 && cachep->object_size > cache_line_size() && - ALIGN(size, cachep->align) < PAGE_SIZE) { - cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); - size = PAGE_SIZE; - } -#endif #endif - /* - * Determine if the slab management is 'on' or 'off' slab. - * (bootstrapping cannot cope with offslab caches so don't do - * it too early on. Always use on-slab management when - * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) - */ - if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init && - !(flags & SLAB_NOLEAKTRACE)) - /* - * Size is large, assume best to place the slab management obj - * off-slab (should allow better packing of objs). - */ - flags |= CFLGS_OFF_SLAB; + kasan_cache_create(cachep, &size, &flags); size = ALIGN(size, cachep->align); /* @@ -2218,42 +2096,46 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); - left_over = calculate_slab_order(cachep, size, cachep->align, flags); - - if (!cachep->num) - return -E2BIG; - - freelist_size = calculate_freelist_size(cachep->num, cachep->align); - +#if DEBUG /* - * If the slab has been placed off-slab, and we have enough space then - * move it on-slab. This is at the expense of any extra colouring. + * To activate debug pagealloc, off-slab management is necessary + * requirement. In early phase of initialization, small sized slab + * doesn't get initialized so it would not be possible. So, we need + * to check size >= 256. It guarantees that all necessary small + * sized slab is initialized in current slab initialization sequence. */ - if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) { - flags &= ~CFLGS_OFF_SLAB; - left_over -= freelist_size; + if (debug_pagealloc_enabled() && (flags & SLAB_POISON) && + size >= 256 && cachep->object_size > cache_line_size()) { + if (size < PAGE_SIZE || size % PAGE_SIZE == 0) { + size_t tmp_size = ALIGN(size, PAGE_SIZE); + + if (set_off_slab_cache(cachep, tmp_size, flags)) { + flags |= CFLGS_OFF_SLAB; + cachep->obj_offset += tmp_size - size; + size = tmp_size; + goto done; + } + } } +#endif - if (flags & CFLGS_OFF_SLAB) { - /* really off slab. No need for manual alignment */ - freelist_size = calculate_freelist_size(cachep->num, 0); + if (set_objfreelist_slab_cache(cachep, size, flags)) { + flags |= CFLGS_OBJFREELIST_SLAB; + goto done; + } -#ifdef CONFIG_PAGE_POISONING - /* If we're going to use the generic kernel_map_pages() - * poisoning, then it's going to smash the contents of - * the redzone and userword anyhow, so switch them off. - */ - if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) - flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); -#endif + if (set_off_slab_cache(cachep, size, flags)) { + flags |= CFLGS_OFF_SLAB; + goto done; } - cachep->colour_off = cache_line_size(); - /* Offset must be a multiple of the alignment. */ - if (cachep->colour_off < cachep->align) - cachep->colour_off = cachep->align; - cachep->colour = left_over / cachep->colour_off; - cachep->freelist_size = freelist_size; + if (set_on_slab_cache(cachep, size, flags)) + goto done; + + return -E2BIG; + +done: + cachep->freelist_size = cachep->num * sizeof(freelist_idx_t); cachep->flags = flags; cachep->allocflags = __GFP_COMP; if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) @@ -2261,16 +2143,21 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) cachep->size = size; cachep->reciprocal_buffer_size = reciprocal_value(size); - if (flags & CFLGS_OFF_SLAB) { - cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); - /* - * This is a possibility for one of the kmalloc_{dma,}_caches. - * But since we go off slab only for object size greater than - * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created - * in ascending order,this should not happen at all. - * But leave a BUG_ON for some lucky dude. - */ - BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); +#if DEBUG + /* + * If we're going to use the generic kernel_map_pages() + * poisoning, then it's going to smash the contents of + * the redzone and userword anyhow, so switch them off. + */ + if (IS_ENABLED(CONFIG_PAGE_POISONING) && + (cachep->flags & SLAB_POISON) && + is_debug_pagealloc_cache(cachep)) + cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); +#endif + + if (OFF_SLAB(cachep)) { + cachep->freelist_cache = + kmalloc_slab(cachep->freelist_size, 0u); } err = setup_cpu_cache(cachep, gfp); @@ -2377,9 +2264,6 @@ static int drain_freelist(struct kmem_cache *cache, } page = list_entry(p, struct page, lru); -#if DEBUG - BUG_ON(page->active); -#endif list_del(&page->lru); /* * Safe to drop the lock. The slab is no longer linked @@ -2454,18 +2338,23 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, void *freelist; void *addr = page_address(page); - if (OFF_SLAB(cachep)) { + page->s_mem = addr + colour_off; + page->active = 0; + + if (OBJFREELIST_SLAB(cachep)) + freelist = NULL; + else if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ freelist = kmem_cache_alloc_node(cachep->freelist_cache, local_flags, nodeid); if (!freelist) return NULL; } else { - freelist = addr + colour_off; - colour_off += cachep->freelist_size; + /* We will use last bytes at the slab for freelist */ + freelist = addr + (PAGE_SIZE << cachep->gfporder) - + cachep->freelist_size; } - page->active = 0; - page->s_mem = addr + colour_off; + return freelist; } @@ -2480,17 +2369,14 @@ static inline void set_free_obj(struct page *page, ((freelist_idx_t *)(page->freelist))[idx] = val; } -static void cache_init_objs(struct kmem_cache *cachep, - struct page *page) +static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) { +#if DEBUG int i; for (i = 0; i < cachep->num; i++) { void *objp = index_to_obj(cachep, page, i); -#if DEBUG - /* need to poison the objs? */ - if (cachep->flags & SLAB_POISON) - poison_obj(cachep, objp, POISON_FREE); + if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = NULL; @@ -2503,26 +2389,51 @@ static void cache_init_objs(struct kmem_cache *cachep, * cache which they are a constructor for. Otherwise, deadlock. * They must also be threaded. */ - if (cachep->ctor && !(cachep->flags & SLAB_POISON)) + if (cachep->ctor && !(cachep->flags & SLAB_POISON)) { + kasan_unpoison_object_data(cachep, + objp + obj_offset(cachep)); cachep->ctor(objp + obj_offset(cachep)); + kasan_poison_object_data( + cachep, objp + obj_offset(cachep)); + } if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "constructor overwrote the" - " end of an object"); + slab_error(cachep, "constructor overwrote the end of an object"); if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "constructor overwrote the" - " start of an object"); + slab_error(cachep, "constructor overwrote the start of an object"); } - if ((cachep->size % PAGE_SIZE) == 0 && - OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 0); -#else - if (cachep->ctor) - cachep->ctor(objp); + /* need to poison the objs? */ + if (cachep->flags & SLAB_POISON) { + poison_obj(cachep, objp, POISON_FREE); + slab_kernel_map(cachep, objp, 0, 0); + } + } #endif - set_obj_status(page, i, OBJECT_FREE); +} + +static void cache_init_objs(struct kmem_cache *cachep, + struct page *page) +{ + int i; + void *objp; + + cache_init_objs_debug(cachep, page); + + if (OBJFREELIST_SLAB(cachep)) { + page->freelist = index_to_obj(cachep, page, cachep->num - 1) + + obj_offset(cachep); + } + + for (i = 0; i < cachep->num; i++) { + /* constructor could break poison info */ + if (DEBUG == 0 && cachep->ctor) { + objp = index_to_obj(cachep, page, i); + kasan_unpoison_object_data(cachep, objp); + cachep->ctor(objp); + kasan_poison_object_data(cachep, objp); + } + set_free_obj(page, i, i); } } @@ -2537,40 +2448,41 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) } } -static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, - int nodeid) +static void *slab_get_obj(struct kmem_cache *cachep, struct page *page) { void *objp; objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); page->active++; + #if DEBUG - WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); + if (cachep->flags & SLAB_STORE_USER) + set_store_user_dirty(cachep); #endif return objp; } -static void slab_put_obj(struct kmem_cache *cachep, struct page *page, - void *objp, int nodeid) +static void slab_put_obj(struct kmem_cache *cachep, + struct page *page, void *objp) { unsigned int objnr = obj_to_index(cachep, page, objp); #if DEBUG unsigned int i; - /* Verify that the slab belongs to the intended node */ - WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); - /* Verify double free bug */ for (i = page->active; i < cachep->num; i++) { if (get_free_obj(page, i) == objnr) { - printk(KERN_ERR "slab: double free detected in cache " - "'%s', objp %p\n", cachep->name, objp); + pr_err("slab: double free detected in cache '%s', objp %p\n", + cachep->name, objp); BUG(); } } #endif page->active--; + if (!page->freelist) + page->freelist = objp + obj_offset(cachep); + set_free_obj(page, page->active, objnr); } @@ -2645,11 +2557,12 @@ static int cache_grow(struct kmem_cache *cachep, /* Get slab management. */ freelist = alloc_slabmgmt(cachep, page, offset, local_flags & ~GFP_CONSTRAINT_MASK, nodeid); - if (!freelist) + if (OFF_SLAB(cachep) && !freelist) goto opps1; slab_map_pages(cachep, page, freelist); + kasan_poison_slab(page); cache_init_objs(cachep, page); if (gfpflags_allow_blocking(local_flags)) @@ -2681,7 +2594,7 @@ failed: static void kfree_debugcheck(const void *objp) { if (!virt_addr_valid(objp)) { - printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", + pr_err("kfree_debugcheck: out of range ptr %lxh\n", (unsigned long)objp); BUG(); } @@ -2705,8 +2618,8 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) else slab_error(cache, "memory outside object was overwritten"); - printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n", - obj, redzone1, redzone2); + pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n", + obj, redzone1, redzone2); } static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, @@ -2726,27 +2639,19 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, *dbg_redzone1(cachep, objp) = RED_INACTIVE; *dbg_redzone2(cachep, objp) = RED_INACTIVE; } - if (cachep->flags & SLAB_STORE_USER) + if (cachep->flags & SLAB_STORE_USER) { + set_store_user_dirty(cachep); *dbg_userword(cachep, objp) = (void *)caller; + } objnr = obj_to_index(cachep, page, objp); BUG_ON(objnr >= cachep->num); BUG_ON(objp != index_to_obj(cachep, page, objnr)); - set_obj_status(page, objnr, OBJECT_FREE); if (cachep->flags & SLAB_POISON) { -#ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { - store_stackinfo(cachep, objp, caller); - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 0); - } else { - poison_obj(cachep, objp, POISON_FREE); - } -#else poison_obj(cachep, objp, POISON_FREE); -#endif + slab_kernel_map(cachep, objp, 0, caller); } return objp; } @@ -2756,7 +2661,85 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, #define cache_free_debugcheck(x,objp,z) (objp) #endif -static struct page *get_first_slab(struct kmem_cache_node *n) +static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, + void **list) +{ +#if DEBUG + void *next = *list; + void *objp; + + while (next) { + objp = next - obj_offset(cachep); + next = *(void **)next; + poison_obj(cachep, objp, POISON_FREE); + } +#endif +} + +static inline void fixup_slab_list(struct kmem_cache *cachep, + struct kmem_cache_node *n, struct page *page, + void **list) +{ + /* move slabp to correct slabp list: */ + list_del(&page->lru); + if (page->active == cachep->num) { + list_add(&page->lru, &n->slabs_full); + if (OBJFREELIST_SLAB(cachep)) { +#if DEBUG + /* Poisoning will be done without holding the lock */ + if (cachep->flags & SLAB_POISON) { + void **objp = page->freelist; + + *objp = *list; + *list = objp; + } +#endif + page->freelist = NULL; + } + } else + list_add(&page->lru, &n->slabs_partial); +} + +/* Try to find non-pfmemalloc slab if needed */ +static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, + struct page *page, bool pfmemalloc) +{ + if (!page) + return NULL; + + if (pfmemalloc) + return page; + + if (!PageSlabPfmemalloc(page)) + return page; + + /* No need to keep pfmemalloc slab if we have enough free objects */ + if (n->free_objects > n->free_limit) { + ClearPageSlabPfmemalloc(page); + return page; + } + + /* Move pfmemalloc slab to the end of list to speed up next search */ + list_del(&page->lru); + if (!page->active) + list_add_tail(&page->lru, &n->slabs_free); + else + list_add_tail(&page->lru, &n->slabs_partial); + + list_for_each_entry(page, &n->slabs_partial, lru) { + if (!PageSlabPfmemalloc(page)) + return page; + } + + list_for_each_entry(page, &n->slabs_free, lru) { + if (!PageSlabPfmemalloc(page)) + return page; + } + + return NULL; +} + +static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) { struct page *page; @@ -2768,21 +2751,51 @@ static struct page *get_first_slab(struct kmem_cache_node *n) struct page, lru); } + if (sk_memalloc_socks()) + return get_valid_first_slab(n, page, pfmemalloc); + return page; } -static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, - bool force_refill) +static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, + struct kmem_cache_node *n, gfp_t flags) +{ + struct page *page; + void *obj; + void *list = NULL; + + if (!gfp_pfmemalloc_allowed(flags)) + return NULL; + + spin_lock(&n->list_lock); + page = get_first_slab(n, true); + if (!page) { + spin_unlock(&n->list_lock); + return NULL; + } + + obj = slab_get_obj(cachep, page); + n->free_objects--; + + fixup_slab_list(cachep, n, page, &list); + + spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); + + return obj; +} + +static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) { int batchcount; struct kmem_cache_node *n; struct array_cache *ac; int node; + void *list = NULL; check_irq_off(); node = numa_mem_id(); - if (unlikely(force_refill)) - goto force_grow; + retry: ac = cpu_cache_get(cachep); batchcount = ac->batchcount; @@ -2808,7 +2821,7 @@ retry: while (batchcount > 0) { struct page *page; /* Get slab alloc is to come from. */ - page = get_first_slab(n); + page = get_first_slab(n, false); if (!page) goto must_grow; @@ -2826,26 +2839,29 @@ retry: STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - ac_put_obj(cachep, ac, slab_get_obj(cachep, page, - node)); + ac->entry[ac->avail++] = slab_get_obj(cachep, page); } - /* move slabp to correct slabp list: */ - list_del(&page->lru); - if (page->active == cachep->num) - list_add(&page->lru, &n->slabs_full); - else - list_add(&page->lru, &n->slabs_partial); + fixup_slab_list(cachep, n, page, &list); } must_grow: n->free_objects -= ac->avail; alloc_done: spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); if (unlikely(!ac->avail)) { int x; -force_grow: + + /* Check if we can use obj in pfmemalloc slab */ + if (sk_memalloc_socks()) { + void *obj = cache_alloc_pfmemalloc(cachep, n, flags); + + if (obj) + return obj; + } + x = cache_grow(cachep, gfp_exact_node(flags), node, NULL); /* cache_grow can reenable interrupts, then ac could change. */ @@ -2853,7 +2869,7 @@ force_grow: node = numa_mem_id(); /* no objects in sight? abort */ - if (!x && (ac->avail == 0 || force_refill)) + if (!x && ac->avail == 0) return NULL; if (!ac->avail) /* objects refilled by interrupt? */ @@ -2861,7 +2877,7 @@ force_grow: } ac->touched = 1; - return ac_get_obj(cachep, ac, flags, force_refill); + return ac->entry[--ac->avail]; } static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, @@ -2877,20 +2893,11 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, unsigned long caller) { - struct page *page; - if (!objp) return objp; if (cachep->flags & SLAB_POISON) { -#ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 1); - else - check_poison_obj(cachep, objp); -#else check_poison_obj(cachep, objp); -#endif + slab_kernel_map(cachep, objp, 1, 0); poison_obj(cachep, objp, POISON_INUSE); } if (cachep->flags & SLAB_STORE_USER) @@ -2899,25 +2906,21 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { - slab_error(cachep, "double free, or memory outside" - " object was overwritten"); - printk(KERN_ERR - "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", - objp, *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); + slab_error(cachep, "double free, or memory outside object was overwritten"); + pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n", + objp, *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; } - page = virt_to_head_page(objp); - set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE); objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) cachep->ctor(objp); if (ARCH_SLAB_MINALIGN && ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { - printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", + pr_err("0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", objp, (int)ARCH_SLAB_MINALIGN); } return objp; @@ -2926,40 +2929,24 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) #endif -static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) -{ - if (unlikely(cachep == kmem_cache)) - return false; - - return should_failslab(cachep->object_size, flags, cachep->flags); -} - static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *objp; struct array_cache *ac; - bool force_refill = false; check_irq_off(); ac = cpu_cache_get(cachep); if (likely(ac->avail)) { ac->touched = 1; - objp = ac_get_obj(cachep, ac, flags, false); + objp = ac->entry[--ac->avail]; - /* - * Allow for the possibility all avail objects are not allowed - * by the current flags - */ - if (objp) { - STATS_INC_ALLOCHIT(cachep); - goto out; - } - force_refill = true; + STATS_INC_ALLOCHIT(cachep); + goto out; } STATS_INC_ALLOCMISS(cachep); - objp = cache_alloc_refill(cachep, flags, force_refill); + objp = cache_alloc_refill(cachep, flags); /* * the 'ac' may be updated by cache_alloc_refill(), * and kmemleak_erase() requires its correct value. @@ -3097,6 +3084,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, struct page *page; struct kmem_cache_node *n; void *obj; + void *list = NULL; int x; VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); @@ -3106,7 +3094,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, retry: check_irq_off(); spin_lock(&n->list_lock); - page = get_first_slab(n); + page = get_first_slab(n, false); if (!page) goto must_grow; @@ -3118,17 +3106,13 @@ retry: BUG_ON(page->active == cachep->num); - obj = slab_get_obj(cachep, page, nodeid); + obj = slab_get_obj(cachep, page); n->free_objects--; - /* move slabp to correct slabp list: */ - list_del(&page->lru); - if (page->active == cachep->num) - list_add(&page->lru, &n->slabs_full); - else - list_add(&page->lru, &n->slabs_partial); + fixup_slab_list(cachep, n, page, &list); spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); goto done; must_grow: @@ -3152,14 +3136,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, int slab_node = numa_mem_id(); flags &= gfp_allowed_mask; - - lockdep_trace_alloc(flags); - - if (slab_should_failslab(cachep, flags)) + cachep = slab_pre_alloc_hook(cachep, flags); + if (unlikely(!cachep)) return NULL; - cachep = memcg_kmem_get_cache(cachep, flags); - cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); @@ -3188,16 +3168,11 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, out: local_irq_restore(save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); - kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, - flags); - if (likely(ptr)) { - kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); - if (unlikely(flags & __GFP_ZERO)) - memset(ptr, 0, cachep->object_size); - } + if (unlikely(flags & __GFP_ZERO) && ptr) + memset(ptr, 0, cachep->object_size); - memcg_kmem_put_cache(cachep); + slab_post_alloc_hook(cachep, flags, 1, &ptr); return ptr; } @@ -3240,30 +3215,21 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) void *objp; flags &= gfp_allowed_mask; - - lockdep_trace_alloc(flags); - - if (slab_should_failslab(cachep, flags)) + cachep = slab_pre_alloc_hook(cachep, flags); + if (unlikely(!cachep)) return NULL; - cachep = memcg_kmem_get_cache(cachep, flags); - cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); objp = __do_cache_alloc(cachep, flags); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); - kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, - flags); prefetchw(objp); - if (likely(objp)) { - kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); - if (unlikely(flags & __GFP_ZERO)) - memset(objp, 0, cachep->object_size); - } + if (unlikely(flags & __GFP_ZERO) && objp) + memset(objp, 0, cachep->object_size); - memcg_kmem_put_cache(cachep); + slab_post_alloc_hook(cachep, flags, 1, &objp); return objp; } @@ -3281,13 +3247,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, void *objp; struct page *page; - clear_obj_pfmemalloc(&objpp[i]); objp = objpp[i]; page = virt_to_head_page(objp); list_del(&page->lru); check_spinlock_acquired_node(cachep, node); - slab_put_obj(cachep, page, objp, node); + slab_put_obj(cachep, page, objp); STATS_DEC_ACTIVE(cachep); n->free_objects++; @@ -3317,9 +3282,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) LIST_HEAD(list); batchcount = ac->batchcount; -#if DEBUG - BUG_ON(!batchcount || batchcount > ac->avail); -#endif + check_irq_off(); n = get_node(cachep, node); spin_lock(&n->list_lock); @@ -3366,6 +3329,8 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, { struct array_cache *ac = cpu_cache_get(cachep); + kasan_slab_free(cachep, objp); + check_irq_off(); kmemleak_free_recursive(objp, cachep->flags); objp = cache_free_debugcheck(cachep, objp, caller); @@ -3389,7 +3354,16 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, cache_flusharray(cachep, ac); } - ac_put_obj(cachep, ac, objp); + if (sk_memalloc_socks()) { + struct page *page = virt_to_head_page(objp); + + if (unlikely(PageSlabPfmemalloc(page))) { + cache_free_pfmemalloc(cachep, page, objp); + return; + } + } + + ac->entry[ac->avail++] = objp; } /** @@ -3404,6 +3378,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *ret = slab_alloc(cachep, flags, _RET_IP_); + kasan_slab_alloc(cachep, ret, flags); trace_kmem_cache_alloc(_RET_IP_, ret, cachep->object_size, cachep->size, flags); @@ -3411,16 +3386,53 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) } EXPORT_SYMBOL(kmem_cache_alloc); -void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +static __always_inline void +cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p, unsigned long caller) { - __kmem_cache_free_bulk(s, size, p); + size_t i; + + for (i = 0; i < size; i++) + p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller); } -EXPORT_SYMBOL(kmem_cache_free_bulk); int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) + void **p) { - return __kmem_cache_alloc_bulk(s, flags, size, p); + size_t i; + + s = slab_pre_alloc_hook(s, flags); + if (!s) + return 0; + + cache_alloc_debugcheck_before(s, flags); + + local_irq_disable(); + for (i = 0; i < size; i++) { + void *objp = __do_cache_alloc(s, flags); + + if (unlikely(!objp)) + goto error; + p[i] = objp; + } + local_irq_enable(); + + cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_); + + /* Clear memory outside IRQ disabled section */ + if (unlikely(flags & __GFP_ZERO)) + for (i = 0; i < size; i++) + memset(p[i], 0, s->object_size); + + slab_post_alloc_hook(s, flags, size, p); + /* FIXME: Trace call missing. Christoph would like a bulk variant */ + return size; +error: + local_irq_enable(); + cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_); + slab_post_alloc_hook(s, flags, i, p); + __kmem_cache_free_bulk(s, i, p); + return 0; } EXPORT_SYMBOL(kmem_cache_alloc_bulk); @@ -3432,6 +3444,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) ret = slab_alloc(cachep, flags, _RET_IP_); + kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(_RET_IP_, ret, size, cachep->size, flags); return ret; @@ -3455,6 +3468,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); + kasan_slab_alloc(cachep, ret, flags); trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep->object_size, cachep->size, flags, nodeid); @@ -3473,6 +3487,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); + kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc_node(_RET_IP_, ret, size, cachep->size, flags, nodeid); @@ -3485,11 +3500,15 @@ static __always_inline void * __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) { struct kmem_cache *cachep; + void *ret; cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return kmem_cache_alloc_node_trace(cachep, flags, node, size); + ret = kmem_cache_alloc_node_trace(cachep, flags, node, size); + kasan_kmalloc(cachep, ret, size, flags); + + return ret; } void *__kmalloc_node(size_t size, gfp_t flags, int node) @@ -3523,6 +3542,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, return cachep; ret = slab_alloc(cachep, flags, caller); + kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(caller, ret, size, cachep->size, flags); @@ -3567,6 +3587,32 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) } EXPORT_SYMBOL(kmem_cache_free); +void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) +{ + struct kmem_cache *s; + size_t i; + + local_irq_disable(); + for (i = 0; i < size; i++) { + void *objp = p[i]; + + if (!orig_s) /* called via kfree_bulk */ + s = virt_to_cache(objp); + else + s = cache_from_obj(orig_s, objp); + + debug_check_no_locks_freed(objp, s->object_size); + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(objp, s->object_size); + + __cache_free(s, objp, _RET_IP_); + } + local_irq_enable(); + + /* FIXME: add tracing */ +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + /** * kfree - free previously allocated memory * @objp: pointer returned by kmalloc. @@ -3812,7 +3858,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) skip_setup: err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); if (err) - printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", + pr_err("enable_cpucache failed for %s, error %d\n", cachep->name, -err); return err; } @@ -3968,7 +4014,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) name = cachep->name; if (error) - printk(KERN_ERR "slab: cache %s error: %s\n", name, error); + pr_err("slab: cache %s error: %s\n", name, error); sinfo->active_objs = active_objs; sinfo->num_objs = num_objs; @@ -3996,8 +4042,7 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) unsigned long node_frees = cachep->node_frees; unsigned long overflows = cachep->node_overflow; - seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu " - "%4lu %4lu %4lu %4lu %4lu", + seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees, overflows); @@ -4102,15 +4147,34 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct page *page) { void *p; - int i; + int i, j; + unsigned long v; if (n[0] == n[1]) return; for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { - if (get_obj_status(page, i) != OBJECT_ACTIVE) + bool active = true; + + for (j = page->active; j < c->num; j++) { + if (get_free_obj(page, j) == i) { + active = false; + break; + } + } + + if (!active) continue; - if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) + /* + * probe_kernel_read() is used for DEBUG_PAGEALLOC. page table + * mapping is established when actual object allocation and + * we could mistakenly access the unmapped object in the cpu + * cache. + */ + if (probe_kernel_read(&v, dbg_userword(c, p), sizeof(v))) + continue; + + if (!add_caller(n, v)) return; } } @@ -4146,21 +4210,31 @@ static int leaks_show(struct seq_file *m, void *p) if (!(cachep->flags & SLAB_RED_ZONE)) return 0; - /* OK, we can do it */ + /* + * Set store_user_clean and start to grab stored user information + * for all objects on this cache. If some alloc/free requests comes + * during the processing, information would be wrong so restart + * whole processing. + */ + do { + set_store_user_clean(cachep); + drain_cpu_caches(cachep); - x[1] = 0; + x[1] = 0; - for_each_kmem_cache_node(cachep, node, n) { + for_each_kmem_cache_node(cachep, node, n) { - check_irq_on(); - spin_lock_irq(&n->list_lock); + check_irq_on(); + spin_lock_irq(&n->list_lock); + + list_for_each_entry(page, &n->slabs_full, lru) + handle_slab(x, cachep, page); + list_for_each_entry(page, &n->slabs_partial, lru) + handle_slab(x, cachep, page); + spin_unlock_irq(&n->list_lock); + } + } while (!is_store_user_clean(cachep)); - list_for_each_entry(page, &n->slabs_full, lru) - handle_slab(x, cachep, page); - list_for_each_entry(page, &n->slabs_partial, lru) - handle_slab(x, cachep, page); - spin_unlock_irq(&n->list_lock); - } name = cachep->name; if (x[0] == x[1]) { /* Increase the buffer size */ @@ -4240,10 +4314,18 @@ module_init(slab_proc_init); */ size_t ksize(const void *objp) { + size_t size; + BUG_ON(!objp); if (unlikely(objp == ZERO_SIZE_PTR)) return 0; - return virt_to_cache(objp)->object_size; + size = virt_to_cache(objp)->object_size; + /* We assume that ksize callers could use the whole allocated area, + * so we need to unpoison this area. + */ + kasan_krealloc(objp, size, GFP_NOWAIT); + + return size; } EXPORT_SYMBOL(ksize); |