-rw-r--r--  Documentation/vm/00-INDEX      |    2
-rw-r--r--  Documentation/vm/uksm.txt      |   59
-rw-r--r--  fs/exec.c                      |    4
-rw-r--r--  fs/proc/meminfo.c              |    6
-rw-r--r--  include/asm-generic/pgtable.h  |   17
-rw-r--r--  include/linux/ksm.h            |   44
-rw-r--r--  include/linux/mm_types.h       |    3
-rw-r--r--  include/linux/mmzone.h         |    5
-rw-r--r--  include/linux/sradix-tree.h    |   77
-rw-r--r--  include/linux/uksm.h           |  146
-rw-r--r--  kernel/fork.c                  |    4
-rw-r--r--  lib/Makefile                   |    2
-rw-r--r--  lib/sradix-tree.c              |  476
-rw-r--r--  mm/Kconfig                     |   26
-rw-r--r--  mm/Makefile                    |    3
-rw-r--r--  mm/memory.c                    |   36
-rw-r--r--  mm/mmap.c                      |   44
-rw-r--r--  mm/rmap.c                      |    4
-rw-r--r--  mm/uksm.c                      | 5525
-rw-r--r--  mm/vmstat.c                    |    3
20 files changed, 29 insertions(+), 6457 deletions(-)
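With UKSM removed, page merging follows the stock KSM opt-in model described in
Documentation/vm/ksm.txt: a program marks anonymous regions with
madvise(MADV_MERGEABLE), and scanning only runs while root has written 1 to
/sys/kernel/mm/ksm/run. A minimal userspace sketch of that opt-in (illustrative
only, assuming a kernel built with CONFIG_KSM=y):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64UL << 20;	/* 64 MiB of anonymous memory */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memset(buf, 0x5a, len);		/* identical page content, easy to merge */

	/*
	 * Opt this range in for same-page merging; it stays a no-op
	 * until root writes 1 to /sys/kernel/mm/ksm/run.
	 */
	if (madvise(buf, len, MADV_MERGEABLE)) {
		perror("madvise(MADV_MERGEABLE)");
		return 1;
	}

	pause();			/* keep the mapping alive while ksmd scans */
	return 0;
}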
diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX index 09eaa9a12..6a5e2a102 100644 --- a/Documentation/vm/00-INDEX +++ b/Documentation/vm/00-INDEX @@ -18,8 +18,6 @@ idle_page_tracking.txt - description of the idle page tracking feature. ksm.txt - how to use the Kernel Samepage Merging feature. -uksm.txt - - Introduction to Ultra KSM numa - information about NUMA specific code in the Linux vm. numa_memory_policy.txt diff --git a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt deleted file mode 100644 index 387d96ae2..000000000 --- a/Documentation/vm/uksm.txt +++ /dev/null @@ -1,59 +0,0 @@ -The Ultra Kernel Samepage Merging feature ----------------------------------------------- -/* - * Ultra KSM. Copyright (C) 2011-2012 Nai Xia - * - * This is an improvement upon KSM. Some basic data structures and routines - * are borrowed from ksm.c . - * - * Its new features: - * 1. Full system scan: - * It automatically scans all user processes' anonymous VMAs. Kernel-user - * interaction to submit a memory area to KSM is no longer needed. - * - * 2. Rich area detection: - * It automatically detects rich areas containing abundant duplicated - * pages based. Rich areas are given a full scan speed. Poor areas are - * sampled at a reasonable speed with very low CPU consumption. - * - * 3. Ultra Per-page scan speed improvement: - * A new hash algorithm is proposed. As a result, on a machine with - * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it - * can scan memory areas that does not contain duplicated pages at speed of - * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of - * 477MB/sec ~ 923MB/sec. - * - * 4. Thrashing area avoidance: - * Thrashing area(an VMA that has frequent Ksm page break-out) can be - * filtered out. My benchmark shows it's more efficient than KSM's per-page - * hash value based volatile page detection. - * - * - * 5. Misc changes upon KSM: - * * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page - * comparison. It's much faster than default C version on x86. - * * rmap_item now has an struct *page member to loosely cache a - * address-->page mapping, which reduces too much time-costly - * follow_page(). - * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. - * * try_to_merge_two_pages() now can revert a pte if it fails. No break_ - * ksm is needed for this case. - * - * 6. Full Zero Page consideration(contributed by Figo Zhang) - * Now uksmd consider full zero pages as special pages and merge them to an - * special unswappable uksm zero page. - */ - -ChangeLog: - -2012-05-05 The creation of this Doc -2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up. -2012-05-28 UKSM 0.1.1.2 bug fix release -2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2 -2012-07-2 UKSM 0.1.2-beta2 -2012-07-10 UKSM 0.1.2-beta3 -2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization. -2012-10-13 UKSM 0.1.2.1 Bug fixes. -2012-12-31 UKSM 0.1.2.2 Minor bug fixes. -2014-07-02 UKSM 0.1.2.3 Fix a " __this_cpu_read() in preemptible bug". -2015-04-22 UKSM 0.1.2.4 Fix a race condition that can sometimes trigger anonying warnings. @@ -19,7 +19,7 @@ * current->executable is only used by the procfs. This allows a dispatch * table to check for several different types of binary formats. We keep * trying until we recognize the file or we run out of supported binary - * formats. + * formats. 
*/ #include <linux/slab.h> @@ -56,7 +56,6 @@ #include <linux/pipe_fs_i.h> #include <linux/oom.h> #include <linux/compat.h> -#include <linux/ksm.h> #include <trace/events/fs.h> @@ -1167,7 +1166,6 @@ void setup_new_exec(struct linux_binprm * bprm) /* An exec changes our domain. We are no longer part of the thread group */ current->self_exec_id++; - flush_signal_handlers(current, 0); do_close_on_exec(current->files); } diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 70b9f953b..df4661aba 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -118,9 +118,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v) "SUnreclaim: %8lu kB\n" "KernelStack: %8lu kB\n" "PageTables: %8lu kB\n" -#ifdef CONFIG_UKSM - "KsmZeroPages: %8lu kB\n" -#endif #ifdef CONFIG_QUICKLIST "Quicklists: %8lu kB\n" #endif @@ -179,9 +176,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(global_page_state(NR_SLAB_UNRECLAIMABLE)), global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024, K(global_page_state(NR_PAGETABLE)), -#ifdef CONFIG_UKSM - K(global_page_state(NR_UKSM_ZERO_PAGES)), -#endif #ifdef CONFIG_QUICKLIST K(quicklist_total_size()), #endif diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index f387ed12c..c370b261c 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -601,25 +601,12 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, extern void untrack_pfn_moved(struct vm_area_struct *vma); #endif -#ifdef CONFIG_UKSM -static inline int is_uksm_zero_pfn(unsigned long pfn) -{ - extern unsigned long uksm_zero_pfn; - return pfn == uksm_zero_pfn; -} -#else -static inline int is_uksm_zero_pfn(unsigned long pfn) -{ - return 0; -} -#endif - #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_pfn; unsigned long offset_from_zero_pfn = pfn - zero_pfn; - return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn); + return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); } #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) @@ -628,7 +615,7 @@ static inline int is_zero_pfn(unsigned long pfn) static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_pfn; - return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn)); + return pfn == zero_pfn; } static inline unsigned long my_zero_pfn(unsigned long addr) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 06861d8ef..7ae216a39 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -19,6 +19,21 @@ struct mem_cgroup; #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags); +int __ksm_enter(struct mm_struct *mm); +void __ksm_exit(struct mm_struct *mm); + +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) +{ + if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) + return __ksm_enter(mm); + return 0; +} + +static inline void ksm_exit(struct mm_struct *mm) +{ + if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) + __ksm_exit(mm); +} static inline struct stable_node *page_stable_node(struct page *page) { @@ -49,33 +64,6 @@ struct page *ksm_might_need_to_copy(struct page *page, int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); void ksm_migrate_page(struct page *newpage, struct page *oldpage); -#ifdef CONFIG_KSM_LEGACY -int __ksm_enter(struct mm_struct *mm); -void __ksm_exit(struct mm_struct *mm); -static inline int ksm_fork(struct mm_struct *mm, struct 
mm_struct *oldmm) -{ - if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) - return __ksm_enter(mm); - return 0; -} - -static inline void ksm_exit(struct mm_struct *mm) -{ - if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) - __ksm_exit(mm); -} - -#elif defined(CONFIG_UKSM) -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) -{ - return 0; -} - -static inline void ksm_exit(struct mm_struct *mm) -{ -} -#endif /* !CONFIG_UKSM */ - #else /* !CONFIG_KSM */ static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) @@ -118,6 +106,4 @@ static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) #endif /* CONFIG_MMU */ #endif /* !CONFIG_KSM */ -#include <linux/uksm.h> - #endif /* __LINUX_KSM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b1f9fe171..624b78b84 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -352,9 +352,6 @@ struct vm_area_struct { struct mempolicy *vm_policy; /* NUMA policy for the VMA */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; -#ifdef CONFIG_UKSM - struct vma_slot *uksm_vma_slot; -#endif }; struct core_thread { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 038651383..7b6c2cfee 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -157,9 +157,6 @@ enum zone_stat_item { WORKINGSET_NODERECLAIM, NR_ANON_TRANSPARENT_HUGEPAGES, NR_FREE_CMA_PAGES, -#ifdef CONFIG_UKSM - NR_UKSM_ZERO_PAGES, -#endif NR_VM_ZONE_STAT_ITEMS }; /* @@ -811,7 +808,7 @@ static inline int is_highmem_idx(enum zone_type idx) } /** - * is_highmem - helper function to quickly check if a struct zone is a + * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. * @zone - pointer to struct zone variable diff --git a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h deleted file mode 100644 index 6780fdb0a..000000000 --- a/include/linux/sradix-tree.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef _LINUX_SRADIX_TREE_H -#define _LINUX_SRADIX_TREE_H - - -#define INIT_SRADIX_TREE(root, mask) \ -do { \ - (root)->height = 0; \ - (root)->gfp_mask = (mask); \ - (root)->rnode = NULL; \ -} while (0) - -#define ULONG_BITS (sizeof(unsigned long) * 8) -#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) -//#define SRADIX_TREE_MAP_SHIFT 6 -//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT) -//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1) - -struct sradix_tree_node { - unsigned int height; /* Height from the bottom */ - unsigned int count; - unsigned int fulls; /* Number of full sublevel trees */ - struct sradix_tree_node *parent; - void *stores[0]; -}; - -/* A simple radix tree implementation */ -struct sradix_tree_root { - unsigned int height; - struct sradix_tree_node *rnode; - - /* Where found to have available empty stores in its sublevels */ - struct sradix_tree_node *enter_node; - unsigned int shift; - unsigned int stores_size; - unsigned int mask; - unsigned long min; /* The first hole index */ - unsigned long num; - //unsigned long *height_to_maxindex; - - /* How the node is allocated and freed. 
*/ - struct sradix_tree_node *(*alloc)(void); - void (*free)(struct sradix_tree_node *node); - - /* When a new node is added and removed */ - void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child); - void (*assign)(struct sradix_tree_node *node, unsigned index, void *item); - void (*rm)(struct sradix_tree_node *node, unsigned offset); -}; - -struct sradix_tree_path { - struct sradix_tree_node *node; - int offset; -}; - -static inline -void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift) -{ - root->height = 0; - root->rnode = NULL; - root->shift = shift; - root->stores_size = 1UL << shift; - root->mask = root->stores_size - 1; -} - - -extern void *sradix_tree_next(struct sradix_tree_root *root, - struct sradix_tree_node *node, unsigned long index, - int (*iter)(void *, unsigned long)); - -extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num); - -extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, - struct sradix_tree_node *node, unsigned long index); - -extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index); - -#endif /* _LINUX_SRADIX_TREE_H */ diff --git a/include/linux/uksm.h b/include/linux/uksm.h deleted file mode 100644 index 206f10958..000000000 --- a/include/linux/uksm.h +++ /dev/null @@ -1,146 +0,0 @@ -#ifndef __LINUX_UKSM_H -#define __LINUX_UKSM_H -/* - * Memory merging support. - * - * This code enables dynamic sharing of identical pages found in different - * memory areas, even if they are not shared by fork(). - */ - -/* if !CONFIG_UKSM this file should not be compiled at all. */ -#ifdef CONFIG_UKSM - -#include <linux/bitops.h> -#include <linux/mm.h> -#include <linux/pagemap.h> -#include <linux/rmap.h> -#include <linux/sched.h> - -extern unsigned long zero_pfn __read_mostly; -extern unsigned long uksm_zero_pfn __read_mostly; -extern struct page *empty_uksm_zero_page; - -/* must be done before linked to mm */ -extern void uksm_vma_add_new(struct vm_area_struct *vma); -extern void uksm_remove_vma(struct vm_area_struct *vma); - -#define UKSM_SLOT_NEED_SORT (1 << 0) -#define UKSM_SLOT_NEED_RERAND (1 << 1) -#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */ -#define UKSM_SLOT_FUL_SCANNED (1 << 3) -#define UKSM_SLOT_IN_UKSM (1 << 4) - -struct vma_slot { - struct sradix_tree_node *snode; - unsigned long sindex; - - struct list_head slot_list; - unsigned long fully_scanned_round; - unsigned long dedup_num; - unsigned long pages_scanned; - unsigned long last_scanned; - unsigned long pages_to_scan; - struct scan_rung *rung; - struct page **rmap_list_pool; - unsigned int *pool_counts; - unsigned long pool_size; - struct vm_area_struct *vma; - struct mm_struct *mm; - unsigned long ctime_j; - unsigned long pages; - unsigned long flags; - unsigned long pages_cowed; /* pages cowed this round */ - unsigned long pages_merged; /* pages merged this round */ - unsigned long pages_bemerged; - - /* when it has page merged in this eval round */ - struct list_head dedup_list; -}; - -static inline void uksm_unmap_zero_page(pte_t pte) -{ - if (pte_pfn(pte) == uksm_zero_pfn) - __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); -} - -static inline void uksm_map_zero_page(pte_t pte) -{ - if (pte_pfn(pte) == uksm_zero_pfn) - __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); -} - -static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) -{ - if (vma->uksm_vma_slot && PageKsm(page)) - 
vma->uksm_vma_slot->pages_cowed++; -} - -static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) -{ - if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn) - vma->uksm_vma_slot->pages_cowed++; -} - -static inline int uksm_flags_can_scan(unsigned long vm_flags) -{ -#ifndef VM_SAO -#define VM_SAO 0 -#endif - return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_HUGETLB | VM_MIXEDMAP | VM_SHARED - | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN | VM_SAO)); -} - -static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) -{ - if (uksm_flags_can_scan(*vm_flags_p)) - *vm_flags_p |= VM_MERGEABLE; -} - -/* - * Just a wrapper for BUG_ON for where ksm_zeropage must not be. TODO: it will - * be removed when uksm zero page patch is stable enough. - */ -static inline void uksm_bugon_zeropage(pte_t pte) -{ - BUG_ON(pte_pfn(pte) == uksm_zero_pfn); -} -#else -static inline void uksm_vma_add_new(struct vm_area_struct *vma) -{ -} - -static inline void uksm_remove_vma(struct vm_area_struct *vma) -{ -} - -static inline void uksm_unmap_zero_page(pte_t pte) -{ -} - -static inline void uksm_map_zero_page(pte_t pte) -{ -} - -static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) -{ -} - -static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) -{ -} - -static inline int uksm_flags_can_scan(unsigned long vm_flags) -{ - return 0; -} - -static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) -{ -} - -static inline void uksm_bugon_zeropage(pte_t pte) -{ -} -#endif /* !CONFIG_UKSM */ -#endif /* __LINUX_UKSM_H */ diff --git a/kernel/fork.c b/kernel/fork.c index f91740137..2e391c754 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -444,7 +444,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) goto fail_nomem; charge = len; } - tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!tmp) goto fail_nomem; *tmp = *mpnt; @@ -497,7 +497,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) __vma_link_rb(mm, tmp, rb_link, rb_parent); rb_link = &tmp->vm_rb.rb_right; rb_parent = &tmp->vm_rb; - uksm_vma_add_new(tmp); + mm->map_count++; retval = copy_page_range(mm, oldmm, mpnt); diff --git a/lib/Makefile b/lib/Makefile index 1bf34ead2..a7c26a41a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -8,7 +8,7 @@ KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS)) endif lib-y := ctype.o string.o vsprintf.o cmdline.o \ - rbtree.o radix-tree.o sradix-tree.o dump_stack.o timerqueue.o\ + rbtree.o radix-tree.o dump_stack.o timerqueue.o\ idr.o int_sqrt.o extable.o \ sha1.o md5.o irq_regs.o argv_split.o \ proportions.o flex_proportions.o ratelimit.o show_mem.o \ diff --git a/lib/sradix-tree.c b/lib/sradix-tree.c deleted file mode 100644 index 8d0632917..000000000 --- a/lib/sradix-tree.c +++ /dev/null @@ -1,476 +0,0 @@ -#include <linux/errno.h> -#include <linux/mm.h> -#include <linux/mman.h> -#include <linux/spinlock.h> -#include <linux/slab.h> -#include <linux/gcd.h> -#include <linux/sradix-tree.h> - -static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node) -{ - return node->fulls == root->stores_size || - (node->height == 1 && node->count == root->stores_size); -} - -/* - * Extend a sradix tree so it can store key @index. 
- */ -static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index) -{ - struct sradix_tree_node *node; - unsigned int height; - - if (unlikely(root->rnode == NULL)) { - if (!(node = root->alloc())) - return -ENOMEM; - - node->height = 1; - root->rnode = node; - root->height = 1; - } - - /* Figure out what the height should be. */ - height = root->height; - index >>= root->shift * height; - - while (index) { - index >>= root->shift; - height++; - } - - while (height > root->height) { - unsigned int newheight; - if (!(node = root->alloc())) - return -ENOMEM; - - /* Increase the height. */ - node->stores[0] = root->rnode; - root->rnode->parent = node; - if (root->extend) - root->extend(node, root->rnode); - - newheight = root->height + 1; - node->height = newheight; - node->count = 1; - if (sradix_node_full(root, root->rnode)) - node->fulls = 1; - - root->rnode = node; - root->height = newheight; - } - - return 0; -} - -/* - * Search the next item from the current node, that is not NULL - * and can satify root->iter(). - */ -void *sradix_tree_next(struct sradix_tree_root *root, - struct sradix_tree_node *node, unsigned long index, - int (*iter)(void *item, unsigned long height)) -{ - unsigned long offset; - void *item; - - if (unlikely(node == NULL)) { - node = root->rnode; - for (offset = 0; offset < root->stores_size; offset++) { - item = node->stores[offset]; - if (item && (!iter || iter(item, node->height))) - break; - } - - if (unlikely(offset >= root->stores_size)) - return NULL; - - if (node->height == 1) - return item; - else - goto go_down; - } - - while (node) { - offset = (index & root->mask) + 1; - for (;offset < root->stores_size; offset++) { - item = node->stores[offset]; - if (item && (!iter || iter(item, node->height))) - break; - } - - if (offset < root->stores_size) - break; - - node = node->parent; - index >>= root->shift; - } - - if (!node) - return NULL; - - while (node->height > 1) { -go_down: - node = item; - for (offset = 0; offset < root->stores_size; offset++) { - item = node->stores[offset]; - if (item && (!iter || iter(item, node->height))) - break; - } - - if (unlikely(offset >= root->stores_size)) - return NULL; - } - - BUG_ON(offset > root->stores_size); - - return item; -} - -/* - * Blindly insert the item to the tree. Typically, we reuse the - * first empty store item. 
- */ -int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num) -{ - unsigned long index; - unsigned int height; - struct sradix_tree_node *node, *tmp = NULL; - int offset, offset_saved; - void **store = NULL; - int error, i, j, shift; - -go_on: - index = root->min; - - if (root->enter_node && !sradix_node_full(root, root->enter_node)) { - node = root->enter_node; - BUG_ON((index >> (root->shift * root->height))); - } else { - node = root->rnode; - if (node == NULL || (index >> (root->shift * root->height)) - || sradix_node_full(root, node)) { - error = sradix_tree_extend(root, index); - if (error) - return error; - - node = root->rnode; - } - } - - - height = node->height; - shift = (height - 1) * root->shift; - offset = (index >> shift) & root->mask; - while (shift > 0) { - offset_saved = offset; - for (; offset < root->stores_size; offset++) { - store = &node->stores[offset]; - tmp = *store; - - if (!tmp || !sradix_node_full(root, tmp)) - break; - } - BUG_ON(offset >= root->stores_size); - - if (offset != offset_saved) { - index += (offset - offset_saved) << shift; - index &= ~((1UL << shift) - 1); - } - - if (!tmp) { - if (!(tmp = root->alloc())) - return -ENOMEM; - - tmp->height = shift / root->shift; - *store = tmp; - tmp->parent = node; - node->count++; -// if (root->extend) -// root->extend(node, tmp); - } - - node = tmp; - shift -= root->shift; - offset = (index >> shift) & root->mask; - } - - BUG_ON(node->height != 1); - - - store = &node->stores[offset]; - for (i = 0, j = 0; - j < root->stores_size - node->count && - i < root->stores_size - offset && j < num; i++) { - if (!store[i]) { - store[i] = item[j]; - if (root->assign) - root->assign(node, index + i, item[j]); - j++; - } - } - - node->count += j; - root->num += j; - num -= j; - - while (sradix_node_full(root, node)) { - node = node->parent; - if (!node) - break; - - node->fulls++; - } - - if (unlikely(!node)) { - /* All nodes are full */ - root->min = 1 << (root->height * root->shift); - root->enter_node = NULL; - } else { - root->min = index + i - 1; - root->min |= (1UL << (node->height - 1)) - 1; - root->min++; - root->enter_node = node; - } - - if (num) { - item += j; - goto go_on; - } - - return 0; -} - - -/** - * sradix_tree_shrink - shrink height of a sradix tree to minimal - * @root sradix tree root - * - */ -static inline void sradix_tree_shrink(struct sradix_tree_root *root) -{ - /* try to shrink tree height */ - while (root->height > 1) { - struct sradix_tree_node *to_free = root->rnode; - - /* - * The candidate node has more than one child, or its child - * is not at the leftmost store, we cannot shrink. 
- */ - if (to_free->count != 1 || !to_free->stores[0]) - break; - - root->rnode = to_free->stores[0]; - root->rnode->parent = NULL; - root->height--; - if (unlikely(root->enter_node == to_free)) { - root->enter_node = NULL; - } - root->free(to_free); - } -} - -/* - * Del the item on the known leaf node and index - */ -void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, - struct sradix_tree_node *node, unsigned long index) -{ - unsigned int offset; - struct sradix_tree_node *start, *end; - - BUG_ON(node->height != 1); - - start = node; - while (node && !(--node->count)) - node = node->parent; - - end = node; - if (!node) { - root->rnode = NULL; - root->height = 0; - root->min = 0; - root->num = 0; - root->enter_node = NULL; - } else { - offset = (index >> (root->shift * (node->height - 1))) & root->mask; - if (root->rm) - root->rm(node, offset); - node->stores[offset] = NULL; - root->num--; - if (root->min > index) { - root->min = index; - root->enter_node = node; - } - } - - if (start != end) { - do { - node = start; - start = start->parent; - if (unlikely(root->enter_node == node)) - root->enter_node = end; - root->free(node); - } while (start != end); - - /* - * Note that shrink may free "end", so enter_node still need to - * be checked inside. - */ - sradix_tree_shrink(root); - } else if (node->count == root->stores_size - 1) { - /* It WAS a full leaf node. Update the ancestors */ - node = node->parent; - while (node) { - node->fulls--; - if (node->fulls != root->stores_size - 1) - break; - - node = node->parent; - } - } -} - -void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index) -{ - unsigned int height, offset; - struct sradix_tree_node *node; - int shift; - - node = root->rnode; - if (node == NULL || (index >> (root->shift * root->height))) - return NULL; - - height = root->height; - shift = (height - 1) * root->shift; - - do { - offset = (index >> shift) & root->mask; - node = node->stores[offset]; - if (!node) - return NULL; - - shift -= root->shift; - } while (shift >= 0); - - return node; -} - -/* - * Return the item if it exists, otherwise create it in place - * and return the created item. - */ -void *sradix_tree_lookup_create(struct sradix_tree_root *root, - unsigned long index, void *(*item_alloc)(void)) -{ - unsigned int height, offset; - struct sradix_tree_node *node, *tmp; - void *item; - int shift, error; - - if (root->rnode == NULL || (index >> (root->shift * root->height))) { - if (item_alloc) { - error = sradix_tree_extend(root, index); - if (error) - return NULL; - } else { - return NULL; - } - } - - node = root->rnode; - height = root->height; - shift = (height - 1) * root->shift; - - do { - offset = (index >> shift) & root->mask; - if (!node->stores[offset]) { - if (!(tmp = root->alloc())) - return NULL; - - tmp->height = shift / root->shift; - node->stores[offset] = tmp; - tmp->parent = node; - node->count++; - node = tmp; - } else { - node = node->stores[offset]; - } - - shift -= root->shift; - } while (shift > 0); - - BUG_ON(node->height != 1); - offset = index & root->mask; - if (node->stores[offset]) { - return node->stores[offset]; - } else if (item_alloc) { - if (!(item = item_alloc())) - return NULL; - - node->stores[offset] = item; - - /* - * NOTE: we do NOT call root->assign here, since this item is - * newly created by us having no meaning. Caller can call this - * if it's necessary to do so. 
- */ - - node->count++; - root->num++; - - while (sradix_node_full(root, node)) { - node = node->parent; - if (!node) - break; - - node->fulls++; - } - - if (unlikely(!node)) { - /* All nodes are full */ - root->min = 1 << (root->height * root->shift); - } else { - if (root->min == index) { - root->min |= (1UL << (node->height - 1)) - 1; - root->min++; - root->enter_node = node; - } - } - - return item; - } else { - return NULL; - } - -} - -int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index) -{ - unsigned int height, offset; - struct sradix_tree_node *node; - int shift; - - node = root->rnode; - if (node == NULL || (index >> (root->shift * root->height))) - return -ENOENT; - - height = root->height; - shift = (height - 1) * root->shift; - - do { - offset = (index >> shift) & root->mask; - node = node->stores[offset]; - if (!node) - return -ENOENT; - - shift -= root->shift; - } while (shift > 0); - - offset = index & root->mask; - if (!node->stores[offset]) - return -ENOENT; - - sradix_tree_delete_from_leaf(root, node, index); - - return 0; -} diff --git a/mm/Kconfig b/mm/Kconfig index 1b1b681c9..03cbfa072 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -322,32 +322,6 @@ config KSM See Documentation/vm/ksm.txt for more information: KSM is inactive until a program has madvised that an area is MADV_MERGEABLE, and root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). -choice - prompt "Choose UKSM/KSM strategy" - default UKSM - depends on KSM - help - This option allows to select a UKSM/KSM stragety. - -config UKSM - bool "Ultra-KSM for page merging" - depends on KSM - help - UKSM is inspired by the Linux kernel project \u2014 KSM(Kernel Same - page Merging), but with a fundamentally rewritten core algorithm. With - an advanced algorithm, UKSM now can transparently scans all anonymously - mapped user space applications with an significantly improved scan speed - and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit from - UKSM. Now UKSM has its first stable release and first real world enterprise user. - For more information, please goto its project page. - (www.kerneldedup.org) - -config KSM_LEGACY - bool "Legacy KSM implementation" - depends on KSM - help - The legacy KSM implementation from Redhat. 
-endchoice config DEFAULT_MMAP_MIN_ADDR int "Low address space to protect from user allocation" diff --git a/mm/Makefile b/mm/Makefile index 90fd58a58..2ed43191f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -47,8 +47,7 @@ obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o -obj-$(CONFIG_KSM_LEGACY) += ksm.o -obj-$(CONFIG_UKSM) += uksm.o +obj-$(CONFIG_KSM) += ksm.o obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o diff --git a/mm/memory.c b/mm/memory.c index 09e967a3c..8132787ae 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -122,28 +122,6 @@ unsigned long highest_memmap_pfn __read_mostly; EXPORT_SYMBOL(zero_pfn); -#ifdef CONFIG_UKSM -unsigned long uksm_zero_pfn __read_mostly; -EXPORT_SYMBOL_GPL(uksm_zero_pfn); -struct page *empty_uksm_zero_page; - -static int __init setup_uksm_zero_page(void) -{ - unsigned long addr; - addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 0); - if (!addr) - panic("Oh boy, that early out of memory?"); - - empty_uksm_zero_page = virt_to_page((void *) addr); - SetPageReserved(empty_uksm_zero_page); - - uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page); - - return 0; -} -core_initcall(setup_uksm_zero_page); -#endif - /* * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() */ @@ -155,7 +133,6 @@ static int __init init_zero_pfn(void) core_initcall(init_zero_pfn); - #if defined(SPLIT_RSS_COUNTING) void sync_mm_rss(struct mm_struct *mm) @@ -890,11 +867,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(page); page_dup_rmap(page, false); rss[mm_counter(page)]++; - - /* Should return NULL in vm_normal_page() */ - uksm_bugon_zeropage(pte); - } else { - uksm_map_zero_page(pte); } out_set_pte: @@ -1129,7 +1101,6 @@ again: tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) - uksm_unmap_zero_page(ptent); continue; if (!PageAnon(page)) { @@ -1966,10 +1937,8 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo clear_page(kaddr); kunmap_atomic(kaddr); flush_dcache_page(dst); - } else { + } else copy_user_highpage(dst, src, va, vma); - uksm_cow_page(vma, src); - } } static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) @@ -2114,7 +2083,6 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, new_page = alloc_zeroed_user_highpage_movable(vma, address); if (!new_page) goto oom; - uksm_cow_pte(vma, orig_pte); } else { new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (!new_page) @@ -2140,9 +2108,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, mm_counter_file(old_page)); inc_mm_counter_fast(mm, MM_ANONPAGES); } - uksm_bugon_zeropage(orig_pte); } else { - uksm_unmap_zero_page(orig_pte); inc_mm_counter_fast(mm, MM_ANONPAGES); } flush_cache_page(vma, address, pte_pfn(orig_pte)); @@ -43,7 +43,6 @@ #include <linux/printk.h> #include <linux/userfaultfd_k.h> #include <linux/moduleparam.h> -#include <linux/ksm.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -293,7 +292,6 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); - uksm_remove_vma(vma); kmem_cache_free(vm_area_cachep, vma); return next; } @@ -758,16 +756,9 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, long adjust_next = 0; int remove_next = 0; -/* - * to avoid deadlock, ksm_remove_vma 
must be done before any spin_lock is - * acquired - */ - uksm_remove_vma(vma); - if (next && !insert) { struct vm_area_struct *exporter = NULL; - uksm_remove_vma(next); if (end >= next->vm_end) { /* * vma expands, overlapping all the next, and @@ -861,7 +852,6 @@ again: remove_next = 1 + (end > next->vm_end); end_changed = true; } vma->vm_pgoff = pgoff; - if (adjust_next) { next->vm_start += adjust_next << PAGE_SHIFT; next->vm_pgoff += adjust_next; @@ -932,22 +922,16 @@ again: remove_next = 1 + (end > next->vm_end); * up the code too much to do both in one go. */ next = vma->vm_next; - if (remove_next == 2) { - uksm_remove_vma(next); + if (remove_next == 2) goto again; - } else if (next) { + else if (next) vma_gap_update(next); - } else { + else mm->highest_vm_end = end; - } - } else { - if (next && !insert) - uksm_vma_add_new(next); } if (insert && file) uprobe_mmap(insert); - uksm_vma_add_new(vma); validate_mm(mm); return 0; @@ -1332,9 +1316,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; - /* If uksm is enabled, we add VM_MERGABLE to new VMAs. */ - uksm_vm_flags_mod(&vm_flags); - if (flags & MAP_LOCKED) if (!can_do_mlock()) return -EPERM; @@ -1673,7 +1654,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr, allow_write_access(file); } file = vma->vm_file; - uksm_vma_add_new(vma); out: perf_event_mmap(vma); @@ -1715,7 +1695,6 @@ allow_write_and_free_vma: if (vm_flags & VM_DENYWRITE) allow_write_access(file); free_vma: - uksm_remove_vma(vma); kmem_cache_free(vm_area_cachep, vma); unacct_error: if (charged) @@ -2511,8 +2490,6 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, else err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); - uksm_vma_add_new(new); - /* Success. */ if (!err) return 0; @@ -2773,7 +2750,6 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) return addr; flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - uksm_vm_flags_mod(&flags); error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); if (offset_in_page(error)) @@ -2831,7 +2807,6 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) vma->vm_flags = flags; vma->vm_page_prot = vm_get_page_prot(flags); vma_link(mm, vma, prev, rb_link, rb_parent); - uksm_vma_add_new(vma); out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; @@ -2868,12 +2843,6 @@ void exit_mmap(struct mm_struct *mm) /* mm's last user has gone, and its about to be pulled down */ mmu_notifier_release(mm); - /* - * Taking write lock on mmap_sem does not harm others, - * but it's crucial for uksm to avoid races. 
- */ - down_write(&mm->mmap_sem); - if (mm->locked_vm) { vma = mm->mmap; while (vma) { @@ -2909,11 +2878,6 @@ void exit_mmap(struct mm_struct *mm) vma = remove_vma(vma); } vm_unacct_memory(nr_accounted); - - mm->mmap = NULL; - mm->mm_rb = RB_ROOT; - vmacache_invalidate(mm); - up_write(&mm->mmap_sem); } /* Insert vm structure into process list sorted by address @@ -3023,7 +2987,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma->vm_ops->open(new_vma); vma_link(mm, new_vma, prev, rb_link, rb_parent); *need_rmap_locks = false; - uksm_vma_add_new(new_vma); } return new_vma; @@ -3153,7 +3116,6 @@ static struct vm_area_struct *__install_special_mapping( vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); perf_event_mmap(vma); - uksm_vma_add_new(vma); return vma; @@ -1125,9 +1125,9 @@ void page_move_anon_rmap(struct page *page, /** * __page_set_anon_rmap - set up new anonymous rmap - * @page: Page to add to rmap + * @page: Page to add to rmap * @vma: VM area to add page to. - * @address: User virtual address of the mapping + * @address: User virtual address of the mapping * @exclusive: the page is exclusively owned by the current process */ static void __page_set_anon_rmap(struct page *page, diff --git a/mm/uksm.c b/mm/uksm.c deleted file mode 100644 index 2ce796323..000000000 --- a/mm/uksm.c +++ /dev/null @@ -1,5525 +0,0 @@ -/* - * Ultra KSM. Copyright (C) 2011-2012 Nai Xia - * - * This is an improvement upon KSM. Some basic data structures and routines - * are borrowed from ksm.c . - * - * Its new features: - * 1. Full system scan: - * It automatically scans all user processes' anonymous VMAs. Kernel-user - * interaction to submit a memory area to KSM is no longer needed. - * - * 2. Rich area detection: - * It automatically detects rich areas containing abundant duplicated - * pages based. Rich areas are given a full scan speed. Poor areas are - * sampled at a reasonable speed with very low CPU consumption. - * - * 3. Ultra Per-page scan speed improvement: - * A new hash algorithm is proposed. As a result, on a machine with - * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it - * can scan memory areas that does not contain duplicated pages at speed of - * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of - * 477MB/sec ~ 923MB/sec. - * - * 4. Thrashing area avoidance: - * Thrashing area(an VMA that has frequent Ksm page break-out) can be - * filtered out. My benchmark shows it's more efficient than KSM's per-page - * hash value based volatile page detection. - * - * - * 5. Misc changes upon KSM: - * * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page - * comparison. It's much faster than default C version on x86. - * * rmap_item now has an struct *page member to loosely cache a - * address-->page mapping, which reduces too much time-costly - * follow_page(). - * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. - * * try_to_merge_two_pages() now can revert a pte if it fails. No break_ - * ksm is needed for this case. - * - * 6. Full Zero Page consideration(contributed by Figo Zhang) - * Now uksmd consider full zero pages as special pages and merge them to an - * special unswappable uksm zero page. 
- */ - -#include <linux/errno.h> -#include <linux/mm.h> -#include <linux/fs.h> -#include <linux/mman.h> -#include <linux/sched.h> -#include <linux/rwsem.h> -#include <linux/pagemap.h> -#include <linux/rmap.h> -#include <linux/spinlock.h> -#include <linux/jhash.h> -#include <linux/delay.h> -#include <linux/kthread.h> -#include <linux/wait.h> -#include <linux/slab.h> -#include <linux/rbtree.h> -#include <linux/memory.h> -#include <linux/mmu_notifier.h> -#include <linux/swap.h> -#include <linux/ksm.h> -#include <linux/crypto.h> -#include <linux/scatterlist.h> -#include <crypto/hash.h> -#include <linux/random.h> -#include <linux/math64.h> -#include <linux/gcd.h> -#include <linux/freezer.h> -#include <linux/sradix-tree.h> - -#include <asm/tlbflush.h> -#include "internal.h" - -#ifdef CONFIG_X86 -#undef memcmp - -#ifdef CONFIG_X86_32 -#define memcmp memcmpx86_32 -/* - * Compare 4-byte-aligned address s1 and s2, with length n - */ -int memcmpx86_32(void *s1, void *s2, size_t n) -{ - size_t num = n / 4; - register int res; - - __asm__ __volatile__ - ( - "testl %3,%3\n\t" - "repe; cmpsd\n\t" - "je 1f\n\t" - "sbbl %0,%0\n\t" - "orl $1,%0\n" - "1:" - : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) - : "0" (0) - : "cc"); - - return res; -} - -/* - * Check the page is all zero ? - */ -static int is_full_zero(const void *s1, size_t len) -{ - unsigned char same; - - len /= 4; - - __asm__ __volatile__ - ("repe; scasl;" - "sete %0" - : "=qm" (same), "+D" (s1), "+c" (len) - : "a" (0) - : "cc"); - - return same; -} - - -#elif defined(CONFIG_X86_64) -#define memcmp memcmpx86_64 -/* - * Compare 8-byte-aligned address s1 and s2, with length n - */ -int memcmpx86_64(void *s1, void *s2, size_t n) -{ - size_t num = n / 8; - register int res; - - __asm__ __volatile__ - ( - "testq %q3,%q3\n\t" - "repe; cmpsq\n\t" - "je 1f\n\t" - "sbbq %q0,%q0\n\t" - "orq $1,%q0\n" - "1:" - : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) - : "0" (0) - : "cc"); - - return res; -} - -static int is_full_zero(const void *s1, size_t len) -{ - unsigned char same; - - len /= 8; - - __asm__ __volatile__ - ("repe; scasq;" - "sete %0" - : "=qm" (same), "+D" (s1), "+c" (len) - : "a" (0) - : "cc"); - - return same; -} - -#endif -#else -static int is_full_zero(const void *s1, size_t len) -{ - unsigned long *src = s1; - int i; - - len /= sizeof(*src); - - for (i = 0; i < len; i++) { - if (src[i]) - return 0; - } - - return 1; -} -#endif - -#define UKSM_RUNG_ROUND_FINISHED (1 << 0) -#define TIME_RATIO_SCALE 10000 - -#define SLOT_TREE_NODE_SHIFT 8 -#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT) -struct slot_tree_node { - unsigned long size; - struct sradix_tree_node snode; - void *stores[SLOT_TREE_NODE_STORE_SIZE]; -}; - -static struct kmem_cache *slot_tree_node_cachep; - -static struct sradix_tree_node *slot_tree_node_alloc(void) -{ - struct slot_tree_node *p; - p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL); - if (!p) - return NULL; - - return &p->snode; -} - -static void slot_tree_node_free(struct sradix_tree_node *node) -{ - struct slot_tree_node *p; - - p = container_of(node, struct slot_tree_node, snode); - kmem_cache_free(slot_tree_node_cachep, p); -} - -static void slot_tree_node_extend(struct sradix_tree_node *parent, - struct sradix_tree_node *child) -{ - struct slot_tree_node *p, *c; - - p = container_of(parent, struct slot_tree_node, snode); - c = container_of(child, struct slot_tree_node, snode); - - p->size += c->size; -} - -void slot_tree_node_assign(struct sradix_tree_node *node, - unsigned index, void 
*item) -{ - struct vma_slot *slot = item; - struct slot_tree_node *cur; - - slot->snode = node; - slot->sindex = index; - - while (node) { - cur = container_of(node, struct slot_tree_node, snode); - cur->size += slot->pages; - node = node->parent; - } -} - -void slot_tree_node_rm(struct sradix_tree_node *node, unsigned offset) -{ - struct vma_slot *slot; - struct slot_tree_node *cur; - unsigned long pages; - - if (node->height == 1) { - slot = node->stores[offset]; - pages = slot->pages; - } else { - cur = container_of(node->stores[offset], - struct slot_tree_node, snode); - pages = cur->size; - } - - while (node) { - cur = container_of(node, struct slot_tree_node, snode); - cur->size -= pages; - node = node->parent; - } -} - -unsigned long slot_iter_index; -int slot_iter(void *item, unsigned long height) -{ - struct slot_tree_node *node; - struct vma_slot *slot; - - if (height == 1) { - slot = item; - if (slot_iter_index < slot->pages) { - /*in this one*/ - return 1; - } else { - slot_iter_index -= slot->pages; - return 0; - } - - } else { - node = container_of(item, struct slot_tree_node, snode); - if (slot_iter_index < node->size) { - /*in this one*/ - return 1; - } else { - slot_iter_index -= node->size; - return 0; - } - } -} - - -static inline void slot_tree_init_root(struct sradix_tree_root *root) -{ - init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT); - root->alloc = slot_tree_node_alloc; - root->free = slot_tree_node_free; - root->extend = slot_tree_node_extend; - root->assign = slot_tree_node_assign; - root->rm = slot_tree_node_rm; -} - -void slot_tree_init(void) -{ - slot_tree_node_cachep = kmem_cache_create("slot_tree_node", - sizeof(struct slot_tree_node), 0, - SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, - NULL); -} - - -/* Each rung of this ladder is a list of VMAs having a same scan ratio */ -struct scan_rung { - //struct list_head scanned_list; - struct sradix_tree_root vma_root; - struct sradix_tree_root vma_root2; - - struct vma_slot *current_scan; - unsigned long current_offset; - - /* - * The initial value for current_offset, it should loop over - * [0~ step - 1] to let all slot have its chance to be scanned. - */ - unsigned long offset_init; - unsigned long step; /* dynamic step for current_offset */ - unsigned int flags; - unsigned long pages_to_scan; - //unsigned long fully_scanned_slots; - /* - * a little bit tricky - if cpu_time_ratio > 0, then the value is the - * the cpu time ratio it can spend in rung_i for every scan - * period. if < 0, then it is the cpu time ratio relative to the - * max cpu percentage user specified. Both in unit of - * 1/TIME_RATIO_SCALE - */ - int cpu_ratio; - - /* - * How long it will take for all slots in this rung to be fully - * scanned? If it's zero, we don't care about the cover time: - * it's fully scanned. 
- */ - unsigned int cover_msecs; - //unsigned long vma_num; - //unsigned long pages; /* Sum of all slot's pages in rung */ -}; - -/** - * node of either the stable or unstale rbtree - * - */ -struct tree_node { - struct rb_node node; /* link in the main (un)stable rbtree */ - struct rb_root sub_root; /* rb_root for sublevel collision rbtree */ - u32 hash; - unsigned long count; /* TODO: merged with sub_root */ - struct list_head all_list; /* all tree nodes in stable/unstable tree */ -}; - -/** - * struct stable_node - node of the stable rbtree - * @node: rb node of this ksm page in the stable tree - * @hlist: hlist head of rmap_items using this ksm page - * @kpfn: page frame number of this ksm page - */ -struct stable_node { - struct rb_node node; /* link in sub-rbtree */ - struct tree_node *tree_node; /* it's tree node root in stable tree, NULL if it's in hell list */ - struct hlist_head hlist; - unsigned long kpfn; - u32 hash_max; /* if ==0 then it's not been calculated yet */ - struct list_head all_list; /* in a list for all stable nodes */ -}; - -/** - * struct node_vma - group rmap_items linked in a same stable - * node together. - */ -struct node_vma { - union { - struct vma_slot *slot; - unsigned long key; /* slot is used as key sorted on hlist */ - }; - struct hlist_node hlist; - struct hlist_head rmap_hlist; - struct stable_node *head; -}; - -/** - * struct rmap_item - reverse mapping item for virtual addresses - * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list - * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree - * @mm: the memory structure this rmap_item is pointing into - * @address: the virtual address this rmap_item tracks (+ flags in low bits) - * @node: rb node of this rmap_item in the unstable tree - * @head: pointer to stable_node heading this list in the stable tree - * @hlist: link into hlist of rmap_items hanging off that stable_node - */ -struct rmap_item { - struct vma_slot *slot; - struct page *page; - unsigned long address; /* + low bits used for flags below */ - unsigned long hash_round; - unsigned long entry_index; - union { - struct {/* when in unstable tree */ - struct rb_node node; - struct tree_node *tree_node; - u32 hash_max; - }; - struct { /* when in stable tree */ - struct node_vma *head; - struct hlist_node hlist; - struct anon_vma *anon_vma; - }; - }; -} __attribute__((aligned(4))); - -struct rmap_list_entry { - union { - struct rmap_item *item; - unsigned long addr; - }; - /* lowest bit is used for is_addr tag */ -} __attribute__((aligned(4))); /* 4 aligned to fit in to pages*/ - - -/* Basic data structure definition ends */ - - -/* - * Flags for rmap_item to judge if it's listed in the stable/unstable tree. 
- * The flags use the low bits of rmap_item.address - */ -#define UNSTABLE_FLAG 0x1 -#define STABLE_FLAG 0x2 -#define get_rmap_addr(x) ((x)->address & PAGE_MASK) - -/* - * rmap_list_entry helpers - */ -#define IS_ADDR_FLAG 1 -#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG) -#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG) -#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG)) - - -/* - * High speed caches for frequently allocated and freed structs - */ -static struct kmem_cache *rmap_item_cache; -static struct kmem_cache *stable_node_cache; -static struct kmem_cache *node_vma_cache; -static struct kmem_cache *vma_slot_cache; -static struct kmem_cache *tree_node_cache; -#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\ - sizeof(struct __struct), __alignof__(struct __struct),\ - (__flags), NULL) - -/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */ -#define SCAN_LADDER_SIZE 4 -static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE]; - -/* The evaluation rounds uksmd has finished */ -static unsigned long long uksm_eval_round = 1; - -/* - * we add 1 to this var when we consider we should rebuild the whole - * unstable tree. - */ -static unsigned long uksm_hash_round = 1; - -/* - * How many times the whole memory is scanned. - */ -static unsigned long long fully_scanned_round = 1; - -/* The total number of virtual pages of all vma slots */ -static u64 uksm_pages_total; - -/* The number of pages has been scanned since the start up */ -static u64 uksm_pages_scanned; - -static u64 scanned_virtual_pages; - -/* The number of pages has been scanned since last encode_benefit call */ -static u64 uksm_pages_scanned_last; - -/* If the scanned number is tooo large, we encode it here */ -static u64 pages_scanned_stored; - -static unsigned long pages_scanned_base; - -/* The number of nodes in the stable tree */ -static unsigned long uksm_pages_shared; - -/* The number of page slots additionally sharing those nodes */ -static unsigned long uksm_pages_sharing; - -/* The number of nodes in the unstable tree */ -static unsigned long uksm_pages_unshared; - -/* - * Milliseconds ksmd should sleep between scans, - * >= 100ms to be consistent with - * scan_time_to_sleep_msec() - */ -static unsigned int uksm_sleep_jiffies; - -/* The real value for the uksmd next sleep */ -static unsigned int uksm_sleep_real; - -/* Saved value for user input uksm_sleep_jiffies when it's enlarged */ -static unsigned int uksm_sleep_saved; - -/* Max percentage of cpu utilization ksmd can take to scan in one batch */ -static unsigned int uksm_max_cpu_percentage; - -static int uksm_cpu_governor; - -static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" }; - -struct uksm_cpu_preset_s { - int cpu_ratio[SCAN_LADDER_SIZE]; - unsigned int cover_msecs[SCAN_LADDER_SIZE]; - unsigned int max_cpu; /* percentage */ -}; - -struct uksm_cpu_preset_s uksm_cpu_preset[4] = { - { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95}, - { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50}, - { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20}, - { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1}, -}; - -/* The default value for uksm_ema_page_time if it's not initialized */ -#define UKSM_PAGE_TIME_DEFAULT 500 - -/*cost to scan one page by expotional moving average in nsecs */ -static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; - -/* The expotional moving average alpha weight, in percentage. 
*/ -#define EMA_ALPHA 20 - -/* - * The threshold used to filter out thrashing areas, - * If it == 0, filtering is disabled, otherwise it's the percentage up-bound - * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio - * will be considered as having a zero duplication ratio. - */ -static unsigned int uksm_thrash_threshold = 50; - -/* How much dedup ratio is considered to be abundant*/ -static unsigned int uksm_abundant_threshold = 10; - -/* All slots having merged pages in this eval round. */ -struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup); - -/* How many times the ksmd has slept since startup */ -static unsigned long long uksm_sleep_times; - -#define UKSM_RUN_STOP 0 -#define UKSM_RUN_MERGE 1 -static unsigned int uksm_run = 1; - -static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait); -static DEFINE_MUTEX(uksm_thread_mutex); - -/* - * List vma_slot_new is for newly created vma_slot waiting to be added by - * ksmd. If one cannot be added(e.g. due to it's too small), it's moved to - * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding - * VMA has been removed/freed. - */ -struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new); -struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd); -struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del); -static DEFINE_SPINLOCK(vma_slot_list_lock); - -/* The unstable tree heads */ -static struct rb_root root_unstable_tree = RB_ROOT; - -/* - * All tree_nodes are in a list to be freed at once when unstable tree is - * freed after each scan round. - */ -static struct list_head unstable_tree_node_list = - LIST_HEAD_INIT(unstable_tree_node_list); - -/* List contains all stable nodes */ -static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list); - -/* - * When the hash strength is changed, the stable tree must be delta_hashed and - * re-structured. We use two set of below structs to speed up the - * re-structuring of stable tree. 
- */ -static struct list_head -stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]), - LIST_HEAD_INIT(stable_tree_node_list[1])}; - -static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0]; -static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT}; -static struct rb_root *root_stable_treep = &root_stable_tree[0]; -static unsigned long stable_tree_index; - -/* The hash strength needed to hash a full page */ -#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32)) - -/* The hash strength needed for loop-back hashing */ -#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10) - -/* The random offsets in a page */ -static u32 *random_nums; - -/* The hash strength */ -static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4; - -/* The delta value each time the hash strength increases or decreases */ -static unsigned long hash_strength_delta; -#define HASH_STRENGTH_DELTA_MAX 5 - -/* The time we have saved due to random_sample_hash */ -static u64 rshash_pos; - -/* The time we have wasted due to hash collision */ -static u64 rshash_neg; - -struct uksm_benefit { - u64 pos; - u64 neg; - u64 scanned; - unsigned long base; -} benefit; - -/* - * The relative cost of memcmp, compared to 1 time unit of random sample - * hash, this value is tested when ksm module is initialized - */ -static unsigned long memcmp_cost; - -static unsigned long rshash_neg_cont_zero; -static unsigned long rshash_cont_obscure; - -/* The possible states of hash strength adjustment heuristic */ -enum rshash_states { - RSHASH_STILL, - RSHASH_TRYUP, - RSHASH_TRYDOWN, - RSHASH_NEW, - RSHASH_PRE_STILL, -}; - -/* The possible direction we are about to adjust hash strength */ -enum rshash_direct { - GO_UP, - GO_DOWN, - OBSCURE, - STILL, -}; - -/* random sampling hash state machine */ -static struct { - enum rshash_states state; - enum rshash_direct pre_direct; - u8 below_count; - /* Keep a lookup window of size 5, iff above_count/below_count > 3 - * in this window we stop trying. - */ - u8 lookup_window_index; - u64 stable_benefit; - unsigned long turn_point_down; - unsigned long turn_benefit_down; - unsigned long turn_point_up; - unsigned long turn_benefit_up; - unsigned long stable_point; -} rshash_state; - -/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/ -static u32 *zero_hash_table; - -static inline struct node_vma *alloc_node_vma(void) -{ - struct node_vma *node_vma; - node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL); - if (node_vma) { - INIT_HLIST_HEAD(&node_vma->rmap_hlist); - INIT_HLIST_NODE(&node_vma->hlist); - } - return node_vma; -} - -static inline void free_node_vma(struct node_vma *node_vma) -{ - kmem_cache_free(node_vma_cache, node_vma); -} - - -static inline struct vma_slot *alloc_vma_slot(void) -{ - struct vma_slot *slot; - - /* - * In case ksm is not initialized by now. - * Oops, we need to consider the call site of uksm_init() in the future. 
- */ - if (!vma_slot_cache) - return NULL; - - slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL); - if (slot) { - INIT_LIST_HEAD(&slot->slot_list); - INIT_LIST_HEAD(&slot->dedup_list); - slot->flags |= UKSM_SLOT_NEED_RERAND; - } - return slot; -} - -static inline void free_vma_slot(struct vma_slot *vma_slot) -{ - kmem_cache_free(vma_slot_cache, vma_slot); -} - - - -static inline struct rmap_item *alloc_rmap_item(void) -{ - struct rmap_item *rmap_item; - - rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL); - if (rmap_item) { - /* bug on lowest bit is not clear for flag use */ - BUG_ON(is_addr(rmap_item)); - } - return rmap_item; -} - -static inline void free_rmap_item(struct rmap_item *rmap_item) -{ - rmap_item->slot = NULL; /* debug safety */ - kmem_cache_free(rmap_item_cache, rmap_item); -} - -static inline struct stable_node *alloc_stable_node(void) -{ - struct stable_node *node; - node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC); - if (!node) - return NULL; - - INIT_HLIST_HEAD(&node->hlist); - list_add(&node->all_list, &stable_node_list); - return node; -} - -static inline void free_stable_node(struct stable_node *stable_node) -{ - list_del(&stable_node->all_list); - kmem_cache_free(stable_node_cache, stable_node); -} - -static inline struct tree_node *alloc_tree_node(struct list_head *list) -{ - struct tree_node *node; - node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC); - if (!node) - return NULL; - - list_add(&node->all_list, list); - return node; -} - -static inline void free_tree_node(struct tree_node *node) -{ - list_del(&node->all_list); - kmem_cache_free(tree_node_cache, node); -} - -static void uksm_drop_anon_vma(struct rmap_item *rmap_item) -{ - struct anon_vma *anon_vma = rmap_item->anon_vma; - - put_anon_vma(anon_vma); -} - - -/** - * Remove a stable node from stable_tree, may unlink from its tree_node and - * may remove its parent tree_node if no other stable node is pending. - * - * @stable_node The node need to be removed - * @unlink_rb Will this node be unlinked from the rbtree? - * @remove_tree_ node Will its tree_node be removed if empty? - */ -static void remove_node_from_stable_tree(struct stable_node *stable_node, - int unlink_rb, int remove_tree_node) -{ - struct node_vma *node_vma; - struct rmap_item *rmap_item; - struct hlist_node *n; - - if (!hlist_empty(&stable_node->hlist)) { - hlist_for_each_entry_safe(node_vma, n, - &stable_node->hlist, hlist) { - hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { - uksm_pages_sharing--; - - uksm_drop_anon_vma(rmap_item); - rmap_item->address &= PAGE_MASK; - } - free_node_vma(node_vma); - cond_resched(); - } - - /* the last one is counted as shared */ - uksm_pages_shared--; - uksm_pages_sharing++; - } - - if (stable_node->tree_node && unlink_rb) { - rb_erase(&stable_node->node, - &stable_node->tree_node->sub_root); - - if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) && - remove_tree_node) { - rb_erase(&stable_node->tree_node->node, - root_stable_treep); - free_tree_node(stable_node->tree_node); - } else { - stable_node->tree_node->count--; - } - } - - free_stable_node(stable_node); -} - - -/* - * get_uksm_page: checks if the page indicated by the stable node - * is still its ksm page, despite having held no reference to it. - * In which case we can trust the content of the page, and it - * returns the gotten page; but if the page has now been zapped, - * remove the stale node from the stable tree and return NULL. 
- * - * You would expect the stable_node to hold a reference to the ksm page. - * But if it increments the page's count, swapping out has to wait for - * ksmd to come around again before it can free the page, which may take - * seconds or even minutes: much too unresponsive. So instead we use a - * "keyhole reference": access to the ksm page from the stable node peeps - * out through its keyhole to see if that page still holds the right key, - * pointing back to this stable node. This relies on freeing a PageAnon - * page to reset its page->mapping to NULL, and relies on no other use of - * a page to put something that might look like our key in page->mapping. - * - * include/linux/pagemap.h page_cache_get_speculative() is a good reference, - * but this is different - made simpler by uksm_thread_mutex being held, but - * interesting for assuming that no other use of the struct page could ever - * put our expected_mapping into page->mapping (or a field of the union which - * coincides with page->mapping). The RCU calls are not for KSM at all, but - * to keep the page_count protocol described with page_cache_get_speculative. - * - * Note: it is possible that get_uksm_page() will return NULL one moment, - * then page the next, if the page is in between page_freeze_refs() and - * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page - * is on its way to being freed; but it is an anomaly to bear in mind. - * - * @unlink_rb: if the removal of this node will firstly unlink from - * its rbtree. stable_node_reinsert will prevent this when restructuring the - * node from its old tree. - * - * @remove_tree_node: if this is the last one of its tree_node, will the - * tree_node be freed ? If we are inserting stable node, this tree_node may - * be reused, so don't free it. - */ -static struct page *get_uksm_page(struct stable_node *stable_node, - int unlink_rb, int remove_tree_node) -{ - struct page *page; - void *expected_mapping; - - page = pfn_to_page(stable_node->kpfn); - expected_mapping = (void *)stable_node + - (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); - rcu_read_lock(); - if (page->mapping != expected_mapping) - goto stale; - if (!get_page_unless_zero(page)) - goto stale; - if (page->mapping != expected_mapping) { - put_page(page); - goto stale; - } - rcu_read_unlock(); - return page; -stale: - rcu_read_unlock(); - remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node); - - return NULL; -} - -/* - * Removing rmap_item from stable or unstable tree. - * This function will clean the information from the stable/unstable tree. - */ -static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item) -{ - if (rmap_item->address & STABLE_FLAG) { - struct stable_node *stable_node; - struct node_vma *node_vma; - struct page *page; - - node_vma = rmap_item->head; - stable_node = node_vma->head; - page = get_uksm_page(stable_node, 1, 1); - if (!page) - goto out; - - /* - * page lock is needed because it's racing with - * try_to_unmap_ksm(), etc. - */ - lock_page(page); - hlist_del(&rmap_item->hlist); - - if (hlist_empty(&node_vma->rmap_hlist)) { - hlist_del(&node_vma->hlist); - free_node_vma(node_vma); - } - unlock_page(page); - - put_page(page); - if (hlist_empty(&stable_node->hlist)) { - /* do NOT call remove_node_from_stable_tree() here, - * it's possible for a forked rmap_item not in - * stable tree while the in-tree rmap_items were - * deleted. 
- */ - uksm_pages_shared--; - } else - uksm_pages_sharing--; - - - uksm_drop_anon_vma(rmap_item); - } else if (rmap_item->address & UNSTABLE_FLAG) { - if (rmap_item->hash_round == uksm_hash_round) { - - rb_erase(&rmap_item->node, - &rmap_item->tree_node->sub_root); - if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) { - rb_erase(&rmap_item->tree_node->node, - &root_unstable_tree); - - free_tree_node(rmap_item->tree_node); - } else - rmap_item->tree_node->count--; - } - uksm_pages_unshared--; - } - - rmap_item->address &= PAGE_MASK; - rmap_item->hash_max = 0; - -out: - cond_resched(); /* we're called from many long loops */ -} - -static inline int slot_in_uksm(struct vma_slot *slot) -{ - return list_empty(&slot->slot_list); -} - -/* - * Test if the mm is exiting - */ -static inline bool uksm_test_exit(struct mm_struct *mm) -{ - return atomic_read(&mm->mm_users) == 0; -} - -static inline unsigned long vma_pool_size(struct vma_slot *slot) -{ - return round_up(sizeof(struct rmap_list_entry) * slot->pages, - PAGE_SIZE) >> PAGE_SHIFT; -} - -#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta)) - -/* must be done with sem locked */ -static int slot_pool_alloc(struct vma_slot *slot) -{ - unsigned long pool_size; - - if (slot->rmap_list_pool) - return 0; - - pool_size = vma_pool_size(slot); - slot->rmap_list_pool = kzalloc(sizeof(struct page *) * - pool_size, GFP_KERNEL); - if (!slot->rmap_list_pool) - return -ENOMEM; - - slot->pool_counts = kzalloc(sizeof(unsigned int) * pool_size, - GFP_KERNEL); - if (!slot->pool_counts) { - kfree(slot->rmap_list_pool); - return -ENOMEM; - } - - slot->pool_size = pool_size; - BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages)); - slot->flags |= UKSM_SLOT_IN_UKSM; - uksm_pages_total += slot->pages; - - return 0; -} - -/* - * Called after vma is unlinked from its mm - */ -void uksm_remove_vma(struct vm_area_struct *vma) -{ - struct vma_slot *slot; - - if (!vma->uksm_vma_slot) - return; - - spin_lock(&vma_slot_list_lock); - slot = vma->uksm_vma_slot; - if (!slot) - goto out; - - if (slot_in_uksm(slot)) { - /** - * This slot has been added by ksmd, so move to the del list - * waiting ksmd to free it. - */ - list_add_tail(&slot->slot_list, &vma_slot_del); - } else { - /** - * It's still on new list. It's ok to free slot directly. - */ - list_del(&slot->slot_list); - free_vma_slot(slot); - } -out: - vma->uksm_vma_slot = NULL; - spin_unlock(&vma_slot_list_lock); -} - -/** - * Need to do two things: - * 1. check if slot was moved to del list - * 2. make sure the mmap_sem is manipulated under valid vma. - * - * My concern here is that in some cases, this may make - * vma_slot_list_lock() waiters to serialized further by some - * sem->wait_lock, can this really be expensive? - * - * - * @return - * 0: if successfully locked mmap_sem - * -ENOENT: this slot was moved to del list - * -EBUSY: vma lock failed - */ -static int try_down_read_slot_mmap_sem(struct vma_slot *slot) -{ - struct vm_area_struct *vma; - struct mm_struct *mm; - struct rw_semaphore *sem; - - spin_lock(&vma_slot_list_lock); - - /* the slot_list was removed and inited from new list, when it enters - * uksm_list. 
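For scale, a stand-alone sketch of the pool sizing performed by vma_pool_size()/slot_pool_alloc() above; the 4 KiB page size and the pointer-sized rmap_list_entry are assumptions of the example only.

#include <stdio.h>

int main(void)
{
	unsigned long page_size  = 4096;                         /* assumed */
	unsigned long entry_size = sizeof(void *);               /* assumed entry size */
	unsigned long vma_pages  = (1UL << 30) / page_size;      /* a 1 GiB anonymous VMA */
	unsigned long bytes      = vma_pages * entry_size;
	unsigned long pool_pages = (bytes + page_size - 1) / page_size;  /* round_up() >> PAGE_SHIFT */

	/* 262144 entries * 8 bytes = 2 MiB of bookkeeping, i.e. 512 pool pages */
	printf("%lu pages tracked with %lu pool pages (%lu KiB)\n",
	       vma_pages, pool_pages, pool_pages * page_size / 1024);
	return 0;
}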
If now it's not empty, then it must be moved to del list - */ - if (!slot_in_uksm(slot)) { - spin_unlock(&vma_slot_list_lock); - return -ENOENT; - } - - BUG_ON(slot->pages != vma_pages(slot->vma)); - /* Ok, vma still valid */ - vma = slot->vma; - mm = vma->vm_mm; - sem = &mm->mmap_sem; - - if (uksm_test_exit(mm)) { - spin_unlock(&vma_slot_list_lock); - return -ENOENT; - } - - if (down_read_trylock(sem)) { - spin_unlock(&vma_slot_list_lock); - if (slot_pool_alloc(slot)) { - uksm_remove_vma(vma); - up_read(sem); - return -ENOENT; - } - return 0; - } - - spin_unlock(&vma_slot_list_lock); - return -EBUSY; -} - -static inline unsigned long -vma_page_address(struct page *page, struct vm_area_struct *vma) -{ - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - unsigned long address; - - address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { - /* page should be within @vma mapping range */ - return -EFAULT; - } - return address; -} - - -/* return 0 on success with the item's mmap_sem locked */ -static inline int get_mergeable_page_lock_mmap(struct rmap_item *item) -{ - struct mm_struct *mm; - struct vma_slot *slot = item->slot; - int err = -EINVAL; - - struct page *page; - - /* - * try_down_read_slot_mmap_sem() returns non-zero if the slot - * has been removed by uksm_remove_vma(). - */ - if (try_down_read_slot_mmap_sem(slot)) - return -EBUSY; - - mm = slot->vma->vm_mm; - - if (uksm_test_exit(mm)) - goto failout_up; - - page = item->page; - rcu_read_lock(); - if (!get_page_unless_zero(page)) { - rcu_read_unlock(); - goto failout_up; - } - - /* No need to consider huge page here. */ - if (item->slot->vma->anon_vma != page_anon_vma(page) || - vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) { - /* - * TODO: - * should we release this item becase of its stale page - * mapping? - */ - put_page(page); - rcu_read_unlock(); - goto failout_up; - } - rcu_read_unlock(); - return 0; - -failout_up: - up_read(&mm->mmap_sem); - return err; -} - -/* - * What kind of VMA is considered ? - */ -static inline int vma_can_enter(struct vm_area_struct *vma) -{ - return uksm_flags_can_scan(vma->vm_flags); -} - -/* - * Called whenever a fresh new vma is created A new vma_slot. - * is created and inserted into a global list Must be called. - * after vma is inserted to its mm . - */ -void uksm_vma_add_new(struct vm_area_struct *vma) -{ - struct vma_slot *slot; - - if (!vma_can_enter(vma)) { - vma->uksm_vma_slot = NULL; - return; - } - - slot = alloc_vma_slot(); - if (!slot) { - vma->uksm_vma_slot = NULL; - return; - } - - vma->uksm_vma_slot = slot; - vma->vm_flags |= VM_MERGEABLE; - slot->vma = vma; - slot->mm = vma->vm_mm; - slot->ctime_j = jiffies; - slot->pages = vma_pages(vma); - spin_lock(&vma_slot_list_lock); - list_add_tail(&slot->slot_list, &vma_slot_new); - spin_unlock(&vma_slot_list_lock); -} - -/* 32/3 < they < 32/2 */ -#define shiftl 8 -#define shiftr 12 - -#define HASH_FROM_TO(from, to) \ -for (index = from; index < to; index++) { \ - pos = random_nums[index]; \ - hash += key[pos]; \ - hash += (hash << shiftl); \ - hash ^= (hash >> shiftr); \ -} - - -#define HASH_FROM_DOWN_TO(from, to) \ -for (index = from - 1; index >= to; index--) { \ - hash ^= (hash >> shiftr); \ - hash ^= (hash >> (shiftr*2)); \ - hash -= (hash << shiftl); \ - hash += (hash << (shiftl*2)); \ - pos = random_nums[index]; \ - hash -= key[pos]; \ -} - -/* - * The main random sample hash function. 
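The pair of macros above are exact inverses of each other modulo 2^32, which is what lets the delta_hash() helper further down adjust the strength in either direction without rehashing the whole page. A small user-space round-trip check of one step, added for illustration only:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t key = 0x12345678, hash = 0xdeadbeef, orig = hash;

	/* one HASH_FROM_TO step (shiftl = 8, shiftr = 12) */
	hash += key;
	hash += hash << 8;
	hash ^= hash >> 12;

	/* one HASH_FROM_DOWN_TO step: undo the mixing, then remove the key */
	hash ^= hash >> 12;
	hash ^= hash >> 24;
	hash -= hash << 8;
	hash += hash << 16;
	hash -= key;

	assert(hash == orig);
	printf("round-trip ok: 0x%08x\n", hash);
	return 0;
}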
- */ -static u32 random_sample_hash(void *addr, u32 hash_strength) -{ - u32 hash = 0xdeadbeef; - int index, pos, loop = hash_strength; - u32 *key = (u32 *)addr; - - if (loop > HASH_STRENGTH_FULL) - loop = HASH_STRENGTH_FULL; - - HASH_FROM_TO(0, loop); - - if (hash_strength > HASH_STRENGTH_FULL) { - loop = hash_strength - HASH_STRENGTH_FULL; - HASH_FROM_TO(0, loop); - } - - return hash; -} - - -/** - * It's used when hash strength is adjusted - * - * @addr The page's virtual address - * @from The original hash strength - * @to The hash strength changed to - * @hash The hash value generated with "from" hash value - * - * return the hash value - */ -static u32 delta_hash(void *addr, int from, int to, u32 hash) -{ - u32 *key = (u32 *)addr; - int index, pos; /* make sure they are int type */ - - if (to > from) { - if (from >= HASH_STRENGTH_FULL) { - from -= HASH_STRENGTH_FULL; - to -= HASH_STRENGTH_FULL; - HASH_FROM_TO(from, to); - } else if (to <= HASH_STRENGTH_FULL) { - HASH_FROM_TO(from, to); - } else { - HASH_FROM_TO(from, HASH_STRENGTH_FULL); - HASH_FROM_TO(0, to - HASH_STRENGTH_FULL); - } - } else { - if (from <= HASH_STRENGTH_FULL) { - HASH_FROM_DOWN_TO(from, to); - } else if (to >= HASH_STRENGTH_FULL) { - from -= HASH_STRENGTH_FULL; - to -= HASH_STRENGTH_FULL; - HASH_FROM_DOWN_TO(from, to); - } else { - HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0); - HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to); - } - } - - return hash; -} - -/** - * - * Called when: rshash_pos or rshash_neg is about to overflow or a scan round - * has finished. - * - * return 0 if no page has been scanned since last call, 1 otherwise. - */ -static inline int encode_benefit(void) -{ - u64 scanned_delta, pos_delta, neg_delta; - unsigned long base = benefit.base; - - scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last; - - if (!scanned_delta) - return 0; - - scanned_delta >>= base; - pos_delta = rshash_pos >> base; - neg_delta = rshash_neg >> base; - - if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) || - CAN_OVERFLOW_U64(benefit.neg, neg_delta) || - CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) { - benefit.scanned >>= 1; - benefit.neg >>= 1; - benefit.pos >>= 1; - benefit.base++; - scanned_delta >>= 1; - pos_delta >>= 1; - neg_delta >>= 1; - } - - benefit.pos += pos_delta; - benefit.neg += neg_delta; - benefit.scanned += scanned_delta; - - BUG_ON(!benefit.scanned); - - rshash_pos = rshash_neg = 0; - uksm_pages_scanned_last = uksm_pages_scanned; - - return 1; -} - -static inline void reset_benefit(void) -{ - benefit.pos = 0; - benefit.neg = 0; - benefit.base = 0; - benefit.scanned = 0; -} - -static inline void inc_rshash_pos(unsigned long delta) -{ - if (CAN_OVERFLOW_U64(rshash_pos, delta)) - encode_benefit(); - - rshash_pos += delta; -} - -static inline void inc_rshash_neg(unsigned long delta) -{ - if (CAN_OVERFLOW_U64(rshash_neg, delta)) - encode_benefit(); - - rshash_neg += delta; -} - - -static inline u32 page_hash(struct page *page, unsigned long hash_strength, - int cost_accounting) -{ - u32 val; - unsigned long delta; - - void *addr = kmap_atomic(page); - - val = random_sample_hash(addr, hash_strength); - kunmap_atomic(addr); - - if (cost_accounting) { - if (HASH_STRENGTH_FULL > hash_strength) - delta = HASH_STRENGTH_FULL - hash_strength; - else - delta = 0; - - inc_rshash_pos(delta); - } - - return val; -} - -static int memcmp_pages(struct page *page1, struct page *page2, - int cost_accounting) -{ - char *addr1, *addr2; - int ret; - - addr1 = kmap_atomic(page1); - addr2 = kmap_atomic(page2); - ret = 
memcmp(addr1, addr2, PAGE_SIZE); - kunmap_atomic(addr2); - kunmap_atomic(addr1); - - if (cost_accounting) - inc_rshash_neg(memcmp_cost); - - return ret; -} - -static inline int pages_identical(struct page *page1, struct page *page2) -{ - return !memcmp_pages(page1, page2, 0); -} - -static inline int is_page_full_zero(struct page *page) -{ - char *addr; - int ret; - - addr = kmap_atomic(page); - ret = is_full_zero(addr, PAGE_SIZE); - kunmap_atomic(addr); - - return ret; -} - -static int write_protect_page(struct vm_area_struct *vma, struct page *page, - pte_t *orig_pte, pte_t *old_pte) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long addr; - pte_t *ptep; - spinlock_t *ptl; - int swapped; - int err = -EFAULT; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ - - addr = page_address_in_vma(page, vma); - if (addr == -EFAULT) - goto out; - - BUG_ON(PageTransCompound(page)); - - mmun_start = addr; - mmun_end = addr + PAGE_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - - ptep = page_check_address(page, mm, addr, &ptl, 0); - if (!ptep) - goto out_mn; - - if (old_pte) - *old_pte = *ptep; - - if (pte_write(*ptep) || pte_dirty(*ptep)) { - pte_t entry; - - swapped = PageSwapCache(page); - flush_cache_page(vma, addr, page_to_pfn(page)); - /* - * Ok this is tricky, when get_user_pages_fast() run it doesnt - * take any lock, therefore the check that we are going to make - * with the pagecount against the mapcount is racey and - * O_DIRECT can happen right after the check. - * So we clear the pte and flush the tlb before the check - * this assure us that no O_DIRECT can happen after the check - * or in the middle of the check. - */ - entry = ptep_clear_flush_notify(vma, addr, ptep); - /* - * Check that no O_DIRECT or similar I/O is in progress on the - * page - */ - if (page_mapcount(page) + 1 + swapped != page_count(page)) { - set_pte_at(mm, addr, ptep, entry); - goto out_unlock; - } - if (pte_dirty(entry)) - set_page_dirty(page); - entry = pte_mkclean(pte_wrprotect(entry)); - set_pte_at_notify(mm, addr, ptep, entry); - } - *orig_pte = *ptep; - err = 0; - -out_unlock: - pte_unmap_unlock(ptep, ptl); -out_mn: - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); -out: - return err; -} - -#define MERGE_ERR_PGERR 1 /* the page is invalid cannot continue */ -#define MERGE_ERR_COLLI 2 /* there is a collision */ -#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */ -#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */ - - -/** - * replace_page - replace page in vma by new ksm page - * @vma: vma that holds the pte pointing to page - * @page: the page we are replacing by kpage - * @kpage: the ksm page we replace page by - * @orig_pte: the original value of the pte - * - * Returns 0 on success, MERGE_ERR_PGERR on failure. 
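The reference-count test in write_protect_page() above is easy to misread; the hypothetical helper below (invented name, added for clarity only) restates the same accounting.

/*
 * Every pte mapping holds one page reference, the scanner itself holds one
 * (it pinned the page before calling in), and the swap cache holds one more
 * when PageSwapCache.  Any reference beyond that can be an in-flight
 * O_DIRECT / get_user_pages user that may still write to the page, so the
 * merge must be aborted.
 */
static inline int no_unexpected_references(int mapcount, int in_swapcache, int refcount)
{
	return mapcount + 1 + in_swapcache == refcount;
}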
- */ -static int replace_page(struct vm_area_struct *vma, struct page *page, - struct page *kpage, pte_t orig_pte) -{ - struct mm_struct *mm = vma->vm_mm; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep; - spinlock_t *ptl; - pte_t entry; - - unsigned long addr; - int err = MERGE_ERR_PGERR; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ - - addr = page_address_in_vma(page, vma); - if (addr == -EFAULT) - goto out; - - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) - goto out; - - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) - goto out; - - pmd = pmd_offset(pud, addr); - BUG_ON(pmd_trans_huge(*pmd)); - if (!pmd_present(*pmd)) - goto out; - - mmun_start = addr; - mmun_end = addr + PAGE_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - - ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); - if (!pte_same(*ptep, orig_pte)) { - pte_unmap_unlock(ptep, ptl); - goto out_mn; - } - - flush_cache_page(vma, addr, pte_pfn(*ptep)); - ptep_clear_flush_notify(vma, addr, ptep); - entry = mk_pte(kpage, vma->vm_page_prot); - - /* special treatment is needed for zero_page */ - if ((page_to_pfn(kpage) == uksm_zero_pfn) || - (page_to_pfn(kpage) == zero_pfn)) - entry = pte_mkspecial(entry); - else { - get_page(kpage); - page_add_anon_rmap(kpage, vma, addr); - } - - set_pte_at_notify(mm, addr, ptep, entry); - - page_remove_rmap(page); - if (!page_mapped(page)) - try_to_free_swap(page); - put_page(page); - - pte_unmap_unlock(ptep, ptl); - err = 0; -out_mn: - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); -out: - return err; -} - - -/** - * Fully hash a page with HASH_STRENGTH_MAX return a non-zero hash value. The - * zero hash value at HASH_STRENGTH_MAX is used to indicated that its - * hash_max member has not been calculated. - * - * @page The page needs to be hashed - * @hash_old The hash value calculated with current hash strength - * - * return the new hash value calculated at HASH_STRENGTH_MAX - */ -static inline u32 page_hash_max(struct page *page, u32 hash_old) -{ - u32 hash_max = 0; - void *addr; - - addr = kmap_atomic(page); - hash_max = delta_hash(addr, hash_strength, - HASH_STRENGTH_MAX, hash_old); - - kunmap_atomic(addr); - - if (!hash_max) - hash_max = 1; - - inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); - return hash_max; -} - -/* - * We compare the hash again, to ensure that it is really a hash collision - * instead of being caused by page write. - */ -static inline int check_collision(struct rmap_item *rmap_item, - u32 hash) -{ - int err; - struct page *page = rmap_item->page; - - /* if this rmap_item has already been hash_maxed, then the collision - * must appears in the second-level rbtree search. In this case we check - * if its hash_max value has been changed. Otherwise, the collision - * happens in the first-level rbtree search, so we check against it's - * current hash value. 
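The decision made by check_collision(), whose body follows, can be summarised as a small table; this is derived from the code, not from the original documentation:

/*
 *   state of the rmap_item            how the page is re-hashed     re-hash matches?  result
 *   --------------------------------  ---------------------------   ----------------  -----------------
 *   hash_max already cached           at HASH_STRENGTH_MAX          yes               MERGE_ERR_COLLI
 *                                                                   no                MERGE_ERR_CHANGED
 *   hash_max not yet computed         at the current hash_strength  yes               MERGE_ERR_COLLI
 *                                                                   no                MERGE_ERR_CHANGED
 */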
- */ - if (rmap_item->hash_max) { - inc_rshash_neg(memcmp_cost); - inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); - - if (rmap_item->hash_max == page_hash_max(page, hash)) - err = MERGE_ERR_COLLI; - else - err = MERGE_ERR_CHANGED; - } else { - inc_rshash_neg(memcmp_cost + hash_strength); - - if (page_hash(page, hash_strength, 0) == hash) - err = MERGE_ERR_COLLI; - else - err = MERGE_ERR_CHANGED; - } - - return err; -} - -static struct page *page_trans_compound_anon(struct page *page) -{ - if (PageTransCompound(page)) { - struct page *head = compound_head(page); - /* - * head may actually be splitted and freed from under - * us but it's ok here. - */ - if (PageAnon(head)) - return head; - } - return NULL; -} - -static int page_trans_compound_anon_split(struct page *page) -{ - int ret = 0; - struct page *transhuge_head = page_trans_compound_anon(page); - if (transhuge_head) { - /* Get the reference on the head to split it. */ - if (get_page_unless_zero(transhuge_head)) { - /* - * Recheck we got the reference while the head - * was still anonymous. - */ - if (PageAnon(transhuge_head)) - ret = split_huge_page(transhuge_head); - else - /* - * Retry later if split_huge_page run - * from under us. - */ - ret = 1; - put_page(transhuge_head); - } else - /* Retry later if split_huge_page run from under us. */ - ret = 1; - } - return ret; -} - -/** - * Try to merge a rmap_item.page with a kpage in stable node. kpage must - * already be a ksm page. - * - * @return 0 if the pages were merged, -EFAULT otherwise. - */ -static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item, - struct page *kpage, u32 hash) -{ - struct vm_area_struct *vma = rmap_item->slot->vma; - struct mm_struct *mm = vma->vm_mm; - pte_t orig_pte = __pte(0); - int err = MERGE_ERR_PGERR; - struct page *page; - - if (uksm_test_exit(mm)) - goto out; - - page = rmap_item->page; - - if (page == kpage) { /* ksm page forked */ - err = 0; - goto out; - } - - if (PageTransCompound(page) && page_trans_compound_anon_split(page)) - goto out; - BUG_ON(PageTransCompound(page)); - - if (!PageAnon(page) || !PageKsm(kpage)) - goto out; - - /* - * We need the page lock to read a stable PageSwapCache in - * write_protect_page(). We use trylock_page() instead of - * lock_page() because we don't want to wait here - we - * prefer to continue scanning and merging different pages, - * then come back to this page when it is unlocked. - */ - if (!trylock_page(page)) - goto out; - /* - * If this anonymous page is mapped only here, its pte may need - * to be write-protected. If it's mapped elsewhere, all of its - * ptes are necessarily already write-protected. But in either - * case, we need to lock and check page_count is not raised. - */ - if (write_protect_page(vma, page, &orig_pte, NULL) == 0) { - if (pages_identical(page, kpage)) - err = replace_page(vma, page, kpage, orig_pte); - else - err = check_collision(rmap_item, hash); - } - - if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { - munlock_vma_page(page); - if (!PageMlocked(kpage)) { - unlock_page(page); - lock_page(kpage); - mlock_vma_page(kpage); - page = kpage; /* for final unlock */ - } - } - - unlock_page(page); -out: - return err; -} - - - -/** - * If two pages fail to merge in try_to_merge_two_pages, then we have a chance - * to restore a page mapping that has been changed in try_to_merge_two_pages. - * - * @return 0 on success. 
- */ -static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr, - pte_t orig_pte, pte_t wprt_pte) -{ - struct mm_struct *mm = vma->vm_mm; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep; - spinlock_t *ptl; - - int err = -EFAULT; - - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) - goto out; - - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) - goto out; - - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) - goto out; - - ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); - if (!pte_same(*ptep, wprt_pte)) { - /* already copied, let it be */ - pte_unmap_unlock(ptep, ptl); - goto out; - } - - /* - * Good boy, still here. When we still get the ksm page, it does not - * return to the free page pool, there is no way that a pte was changed - * to other page and gets back to this page. And remind that ksm page - * do not reuse in do_wp_page(). So it's safe to restore the original - * pte. - */ - flush_cache_page(vma, addr, pte_pfn(*ptep)); - ptep_clear_flush_notify(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, orig_pte); - - pte_unmap_unlock(ptep, ptl); - err = 0; -out: - return err; -} - -/** - * try_to_merge_two_pages() - take two identical pages and prepare - * them to be merged into one page(rmap_item->page) - * - * @return 0 if we successfully merged two identical pages into - * one ksm page. MERGE_ERR_COLLI if it's only a hash collision - * search in rbtree. MERGE_ERR_CHANGED if rmap_item has been - * changed since it's hashed. MERGE_ERR_PGERR otherwise. - * - */ -static int try_to_merge_two_pages(struct rmap_item *rmap_item, - struct rmap_item *tree_rmap_item, - u32 hash) -{ - pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0); - pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0); - struct vm_area_struct *vma1 = rmap_item->slot->vma; - struct vm_area_struct *vma2 = tree_rmap_item->slot->vma; - struct page *page = rmap_item->page; - struct page *tree_page = tree_rmap_item->page; - int err = MERGE_ERR_PGERR; - struct address_space *saved_mapping; - - - if (rmap_item->page == tree_rmap_item->page) - goto out; - - if (PageTransCompound(page) && page_trans_compound_anon_split(page)) - goto out; - BUG_ON(PageTransCompound(page)); - - if (PageTransCompound(tree_page) && page_trans_compound_anon_split(tree_page)) - goto out; - BUG_ON(PageTransCompound(tree_page)); - - if (!PageAnon(page) || !PageAnon(tree_page)) - goto out; - - if (!trylock_page(page)) - goto out; - - - if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) { - unlock_page(page); - goto out; - } - - /* - * While we hold page lock, upgrade page from - * PageAnon+anon_vma to PageKsm+NULL stable_node: - * stable_tree_insert() will update stable_node. 
- */ - saved_mapping = page->mapping; - set_page_stable_node(page, NULL); - mark_page_accessed(page); - unlock_page(page); - - if (!trylock_page(tree_page)) - goto restore_out; - - if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) { - unlock_page(tree_page); - goto restore_out; - } - - if (pages_identical(page, tree_page)) { - err = replace_page(vma2, tree_page, page, wprt_pte2); - if (err) { - unlock_page(tree_page); - goto restore_out; - } - - if ((vma2->vm_flags & VM_LOCKED)) { - munlock_vma_page(tree_page); - if (!PageMlocked(page)) { - unlock_page(tree_page); - lock_page(page); - mlock_vma_page(page); - tree_page = page; /* for final unlock */ - } - } - - unlock_page(tree_page); - - goto out; /* success */ - - } else { - if (tree_rmap_item->hash_max && - tree_rmap_item->hash_max == rmap_item->hash_max) { - err = MERGE_ERR_COLLI_MAX; - } else if (page_hash(page, hash_strength, 0) == - page_hash(tree_page, hash_strength, 0)) { - inc_rshash_neg(memcmp_cost + hash_strength * 2); - err = MERGE_ERR_COLLI; - } else { - err = MERGE_ERR_CHANGED; - } - - unlock_page(tree_page); - } - -restore_out: - lock_page(page); - if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item), - orig_pte1, wprt_pte1)) - page->mapping = saved_mapping; - - unlock_page(page); -out: - return err; -} - -static inline int hash_cmp(u32 new_val, u32 node_val) -{ - if (new_val > node_val) - return 1; - else if (new_val < node_val) - return -1; - else - return 0; -} - -static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash) -{ - u32 hash_max = item->hash_max; - - if (!hash_max) { - hash_max = page_hash_max(item->page, hash); - - item->hash_max = hash_max; - } - - return hash_max; -} - - - -/** - * stable_tree_search() - search the stable tree for a page - * - * @item: the rmap_item we are comparing with - * @hash: the hash value of this item->page already calculated - * - * @return the page we have found, NULL otherwise. The page returned has - * been gotten. - */ -static struct page *stable_tree_search(struct rmap_item *item, u32 hash) -{ - struct rb_node *node = root_stable_treep->rb_node; - struct tree_node *tree_node; - unsigned long hash_max; - struct page *page = item->page; - struct stable_node *stable_node; - - stable_node = page_stable_node(page); - if (stable_node) { - /* ksm page forked, that is - * if (PageKsm(page) && !in_stable_tree(rmap_item)) - * it's actually gotten once outside. - */ - get_page(page); - return page; - } - - while (node) { - int cmp; - - tree_node = rb_entry(node, struct tree_node, node); - - cmp = hash_cmp(hash, tree_node->hash); - - if (cmp < 0) - node = node->rb_left; - else if (cmp > 0) - node = node->rb_right; - else - break; - } - - if (!node) - return NULL; - - if (tree_node->count == 1) { - stable_node = rb_entry(tree_node->sub_root.rb_node, - struct stable_node, node); - BUG_ON(!stable_node); - - goto get_page_out; - } - - /* - * ok, we have to search the second - * level subtree, hash the page to a - * full strength. 
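Before the walk drops into the second level, a sketch of the stable tree's overall shape, reconstructed from the data structures used in this file:

/*
 *   root_stable_tree                          first level, keyed by the
 *     +-- struct tree_node                    sampled hash of the page
 *           { hash, count, sub_root }
 *             +-- struct stable_node          second level (sub_root), keyed
 *                   { hash_max, kpfn, hlist }  by the full-strength hash_max
 *                     +-- struct node_vma     one per sharing VMA, sorted by slot
 *                           +-- rmap_items    the individual mappings
 *
 * When a tree_node holds exactly one stable_node (count == 1), the lookup
 * above returns it without ever computing the more expensive hash_max.
 */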
- */ - node = tree_node->sub_root.rb_node; - BUG_ON(!node); - hash_max = rmap_item_hash_max(item, hash); - - while (node) { - int cmp; - - stable_node = rb_entry(node, struct stable_node, node); - - cmp = hash_cmp(hash_max, stable_node->hash_max); - - if (cmp < 0) - node = node->rb_left; - else if (cmp > 0) - node = node->rb_right; - else - goto get_page_out; - } - - return NULL; - -get_page_out: - page = get_uksm_page(stable_node, 1, 1); - return page; -} - -static int try_merge_rmap_item(struct rmap_item *item, - struct page *kpage, - struct page *tree_page) -{ - spinlock_t *ptl; - pte_t *ptep; - unsigned long addr; - struct vm_area_struct *vma = item->slot->vma; - - addr = get_rmap_addr(item); - ptep = page_check_address(kpage, vma->vm_mm, addr, &ptl, 0); - if (!ptep) - return 0; - - if (pte_write(*ptep)) { - /* has changed, abort! */ - pte_unmap_unlock(ptep, ptl); - return 0; - } - - get_page(tree_page); - page_add_anon_rmap(tree_page, vma, addr); - - flush_cache_page(vma, addr, pte_pfn(*ptep)); - ptep_clear_flush_notify(vma, addr, ptep); - set_pte_at_notify(vma->vm_mm, addr, ptep, - mk_pte(tree_page, vma->vm_page_prot)); - - page_remove_rmap(kpage); - put_page(kpage); - - pte_unmap_unlock(ptep, ptl); - - return 1; -} - -/** - * try_to_merge_with_stable_page() - when two rmap_items need to be inserted - * into stable tree, the page was found to be identical to a stable ksm page, - * this is the last chance we can merge them into one. - * - * @item1: the rmap_item holding the page which we wanted to insert - * into stable tree. - * @item2: the other rmap_item we found when unstable tree search - * @oldpage: the page currently mapped by the two rmap_items - * @tree_page: the page we found identical in stable tree node - * @success1: return if item1 is successfully merged - * @success2: return if item2 is successfully merged - */ -static void try_merge_with_stable(struct rmap_item *item1, - struct rmap_item *item2, - struct page **kpage, - struct page *tree_page, - int *success1, int *success2) -{ - struct vm_area_struct *vma1 = item1->slot->vma; - struct vm_area_struct *vma2 = item2->slot->vma; - *success1 = 0; - *success2 = 0; - - if (unlikely(*kpage == tree_page)) { - /* I don't think this can really happen */ - printk(KERN_WARNING "UKSM: unexpected condition detected in " - "try_merge_with_stable() -- *kpage == tree_page !\n"); - *success1 = 1; - *success2 = 1; - return; - } - - if (!PageAnon(*kpage) || !PageKsm(*kpage)) - goto failed; - - if (!trylock_page(tree_page)) - goto failed; - - /* If the oldpage is still ksm and still pointed - * to in the right place, and still write protected, - * we are confident it's not changed, no need to - * memcmp anymore. - * be ware, we cannot take nested pte locks, - * deadlock risk. - */ - if (!try_merge_rmap_item(item1, *kpage, tree_page)) - goto unlock_failed; - - /* ok, then vma2, remind that pte1 already set */ - if (!try_merge_rmap_item(item2, *kpage, tree_page)) - goto success_1; - - *success2 = 1; -success_1: - *success1 = 1; - - - if ((*success1 && vma1->vm_flags & VM_LOCKED) || - (*success2 && vma2->vm_flags & VM_LOCKED)) { - munlock_vma_page(*kpage); - if (!PageMlocked(tree_page)) - mlock_vma_page(tree_page); - } - - /* - * We do not need oldpage any more in the caller, so can break the lock - * now. - */ - unlock_page(*kpage); - *kpage = tree_page; /* Get unlocked outside. 
*/ - return; - -unlock_failed: - unlock_page(tree_page); -failed: - return; -} - -static inline void stable_node_hash_max(struct stable_node *node, - struct page *page, u32 hash) -{ - u32 hash_max = node->hash_max; - - if (!hash_max) { - hash_max = page_hash_max(page, hash); - node->hash_max = hash_max; - } -} - -static inline -struct stable_node *new_stable_node(struct tree_node *tree_node, - struct page *kpage, u32 hash_max) -{ - struct stable_node *new_stable_node; - - new_stable_node = alloc_stable_node(); - if (!new_stable_node) - return NULL; - - new_stable_node->kpfn = page_to_pfn(kpage); - new_stable_node->hash_max = hash_max; - new_stable_node->tree_node = tree_node; - set_page_stable_node(kpage, new_stable_node); - - return new_stable_node; -} - -static inline -struct stable_node *first_level_insert(struct tree_node *tree_node, - struct rmap_item *rmap_item, - struct rmap_item *tree_rmap_item, - struct page **kpage, u32 hash, - int *success1, int *success2) -{ - int cmp; - struct page *tree_page; - u32 hash_max = 0; - struct stable_node *stable_node, *new_snode; - struct rb_node *parent = NULL, **new; - - /* this tree node contains no sub-tree yet */ - stable_node = rb_entry(tree_node->sub_root.rb_node, - struct stable_node, node); - - tree_page = get_uksm_page(stable_node, 1, 0); - if (tree_page) { - cmp = memcmp_pages(*kpage, tree_page, 1); - if (!cmp) { - try_merge_with_stable(rmap_item, tree_rmap_item, kpage, - tree_page, success1, success2); - put_page(tree_page); - if (!*success1 && !*success2) - goto failed; - - return stable_node; - - } else { - /* - * collision in first level try to create a subtree. - * A new node need to be created. - */ - put_page(tree_page); - - stable_node_hash_max(stable_node, tree_page, - tree_node->hash); - hash_max = rmap_item_hash_max(rmap_item, hash); - cmp = hash_cmp(hash_max, stable_node->hash_max); - - parent = &stable_node->node; - if (cmp < 0) { - new = &parent->rb_left; - } else if (cmp > 0) { - new = &parent->rb_right; - } else { - goto failed; - } - } - - } else { - /* the only stable_node deleted, we reuse its tree_node. 
- */ - parent = NULL; - new = &tree_node->sub_root.rb_node; - } - - new_snode = new_stable_node(tree_node, *kpage, hash_max); - if (!new_snode) - goto failed; - - rb_link_node(&new_snode->node, parent, new); - rb_insert_color(&new_snode->node, &tree_node->sub_root); - tree_node->count++; - *success1 = *success2 = 1; - - return new_snode; - -failed: - return NULL; -} - -static inline -struct stable_node *stable_subtree_insert(struct tree_node *tree_node, - struct rmap_item *rmap_item, - struct rmap_item *tree_rmap_item, - struct page **kpage, u32 hash, - int *success1, int *success2) -{ - struct page *tree_page; - u32 hash_max; - struct stable_node *stable_node, *new_snode; - struct rb_node *parent, **new; - -research: - parent = NULL; - new = &tree_node->sub_root.rb_node; - BUG_ON(!*new); - hash_max = rmap_item_hash_max(rmap_item, hash); - while (*new) { - int cmp; - - stable_node = rb_entry(*new, struct stable_node, node); - - cmp = hash_cmp(hash_max, stable_node->hash_max); - - if (cmp < 0) { - parent = *new; - new = &parent->rb_left; - } else if (cmp > 0) { - parent = *new; - new = &parent->rb_right; - } else { - tree_page = get_uksm_page(stable_node, 1, 0); - if (tree_page) { - cmp = memcmp_pages(*kpage, tree_page, 1); - if (!cmp) { - try_merge_with_stable(rmap_item, - tree_rmap_item, kpage, - tree_page, success1, success2); - - put_page(tree_page); - if (!*success1 && !*success2) - goto failed; - /* - * successfully merged with a stable - * node - */ - return stable_node; - } else { - put_page(tree_page); - goto failed; - } - } else { - /* - * stable node may be deleted, - * and subtree maybe - * restructed, cannot - * continue, research it. - */ - if (tree_node->count) { - goto research; - } else { - /* reuse the tree node*/ - parent = NULL; - new = &tree_node->sub_root.rb_node; - } - } - } - } - - new_snode = new_stable_node(tree_node, *kpage, hash_max); - if (!new_snode) - goto failed; - - rb_link_node(&new_snode->node, parent, new); - rb_insert_color(&new_snode->node, &tree_node->sub_root); - tree_node->count++; - *success1 = *success2 = 1; - - return new_snode; - -failed: - return NULL; -} - - -/** - * stable_tree_insert() - try to insert a merged page in unstable tree to - * the stable tree - * - * @kpage: the page need to be inserted - * @hash: the current hash of this page - * @rmap_item: the rmap_item being scanned - * @tree_rmap_item: the rmap_item found on unstable tree - * @success1: return if rmap_item is merged - * @success2: return if tree_rmap_item is merged - * - * @return the stable_node on stable tree if at least one - * rmap_item is inserted into stable tree, NULL - * otherwise. 
- */ -static struct stable_node * -stable_tree_insert(struct page **kpage, u32 hash, - struct rmap_item *rmap_item, - struct rmap_item *tree_rmap_item, - int *success1, int *success2) -{ - struct rb_node **new = &root_stable_treep->rb_node; - struct rb_node *parent = NULL; - struct stable_node *stable_node; - struct tree_node *tree_node; - u32 hash_max = 0; - - *success1 = *success2 = 0; - - while (*new) { - int cmp; - - tree_node = rb_entry(*new, struct tree_node, node); - - cmp = hash_cmp(hash, tree_node->hash); - - if (cmp < 0) { - parent = *new; - new = &parent->rb_left; - } else if (cmp > 0) { - parent = *new; - new = &parent->rb_right; - } else - break; - } - - if (*new) { - if (tree_node->count == 1) { - stable_node = first_level_insert(tree_node, rmap_item, - tree_rmap_item, kpage, - hash, success1, success2); - } else { - stable_node = stable_subtree_insert(tree_node, - rmap_item, tree_rmap_item, kpage, - hash, success1, success2); - } - } else { - - /* no tree node found */ - tree_node = alloc_tree_node(stable_tree_node_listp); - if (!tree_node) { - stable_node = NULL; - goto out; - } - - stable_node = new_stable_node(tree_node, *kpage, hash_max); - if (!stable_node) { - free_tree_node(tree_node); - goto out; - } - - tree_node->hash = hash; - rb_link_node(&tree_node->node, parent, new); - rb_insert_color(&tree_node->node, root_stable_treep); - parent = NULL; - new = &tree_node->sub_root.rb_node; - - rb_link_node(&stable_node->node, parent, new); - rb_insert_color(&stable_node->node, &tree_node->sub_root); - tree_node->count++; - *success1 = *success2 = 1; - } - -out: - return stable_node; -} - - -/** - * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem - * - * @return 0 on success, -EBUSY if unable to lock the mmap_sem, - * -EINVAL if the page mapping has been changed. - */ -static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item) -{ - int err; - - err = get_mergeable_page_lock_mmap(tree_rmap_item); - - if (err == -EINVAL) { - /* its page map has been changed, remove it */ - remove_rmap_item_from_tree(tree_rmap_item); - } - - /* The page is gotten and mmap_sem is locked now. */ - return err; -} - - -/** - * unstable_tree_search_insert() - search an unstable tree rmap_item with the - * same hash value. 
Get its page and trylock the mmap_sem - */ -static inline -struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, - u32 hash) - -{ - struct rb_node **new = &root_unstable_tree.rb_node; - struct rb_node *parent = NULL; - struct tree_node *tree_node; - u32 hash_max; - struct rmap_item *tree_rmap_item; - - while (*new) { - int cmp; - - tree_node = rb_entry(*new, struct tree_node, node); - - cmp = hash_cmp(hash, tree_node->hash); - - if (cmp < 0) { - parent = *new; - new = &parent->rb_left; - } else if (cmp > 0) { - parent = *new; - new = &parent->rb_right; - } else - break; - } - - if (*new) { - /* got the tree_node */ - if (tree_node->count == 1) { - tree_rmap_item = rb_entry(tree_node->sub_root.rb_node, - struct rmap_item, node); - BUG_ON(!tree_rmap_item); - - goto get_page_out; - } - - /* well, search the collision subtree */ - new = &tree_node->sub_root.rb_node; - BUG_ON(!*new); - hash_max = rmap_item_hash_max(rmap_item, hash); - - while (*new) { - int cmp; - - tree_rmap_item = rb_entry(*new, struct rmap_item, - node); - - cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); - parent = *new; - if (cmp < 0) - new = &parent->rb_left; - else if (cmp > 0) - new = &parent->rb_right; - else - goto get_page_out; - } - } else { - /* alloc a new tree_node */ - tree_node = alloc_tree_node(&unstable_tree_node_list); - if (!tree_node) - return NULL; - - tree_node->hash = hash; - rb_link_node(&tree_node->node, parent, new); - rb_insert_color(&tree_node->node, &root_unstable_tree); - parent = NULL; - new = &tree_node->sub_root.rb_node; - } - - /* did not found even in sub-tree */ - rmap_item->tree_node = tree_node; - rmap_item->address |= UNSTABLE_FLAG; - rmap_item->hash_round = uksm_hash_round; - rb_link_node(&rmap_item->node, parent, new); - rb_insert_color(&rmap_item->node, &tree_node->sub_root); - - uksm_pages_unshared++; - return NULL; - -get_page_out: - if (tree_rmap_item->page == rmap_item->page) - return NULL; - - if (get_tree_rmap_item_page(tree_rmap_item)) - return NULL; - - return tree_rmap_item; -} - -static void hold_anon_vma(struct rmap_item *rmap_item, - struct anon_vma *anon_vma) -{ - rmap_item->anon_vma = anon_vma; - get_anon_vma(anon_vma); -} - - -/** - * stable_tree_append() - append a rmap_item to a stable node. Deduplication - * ratio statistics is done in this function. 
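A short summary of the bookkeeping done by the function that follows, reconstructed from its code:

/*
 * - node_vma entries on a stable node's hlist are kept ordered by the
 *   vma_slot pointer value ("key"), so each VMA sharing the page has at most
 *   one node_vma and all of its rmap_items hang off it.
 * - When logdedup is set, the walk also credits pages_bemerged (weighted by
 *   the scanning rung's step) to every other slot already sharing the page
 *   and queues those slots on vma_slot_dedup; this feeds the deduplication
 *   ratio statistics mentioned above.
 */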
- * - */ -static void stable_tree_append(struct rmap_item *rmap_item, - struct stable_node *stable_node, int logdedup) -{ - struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL; - unsigned long key = (unsigned long)rmap_item->slot; - unsigned long factor = rmap_item->slot->rung->step; - - BUG_ON(!stable_node); - rmap_item->address |= STABLE_FLAG; - - if (hlist_empty(&stable_node->hlist)) { - uksm_pages_shared++; - goto node_vma_new; - } else { - uksm_pages_sharing++; - } - - hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { - if (node_vma->key >= key) - break; - - if (logdedup) { - node_vma->slot->pages_bemerged += factor; - if (list_empty(&node_vma->slot->dedup_list)) - list_add(&node_vma->slot->dedup_list, - &vma_slot_dedup); - } - } - - if (node_vma) { - if (node_vma->key == key) { - node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist); - goto node_vma_ok; - } else if (node_vma->key > key) { - node_vma_cont = node_vma; - } - } - -node_vma_new: - /* no same vma already in node, alloc a new node_vma */ - new_node_vma = alloc_node_vma(); - BUG_ON(!new_node_vma); - new_node_vma->head = stable_node; - new_node_vma->slot = rmap_item->slot; - - if (!node_vma) { - hlist_add_head(&new_node_vma->hlist, &stable_node->hlist); - } else if (node_vma->key != key) { - if (node_vma->key < key) - hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist); - else { - hlist_add_before(&new_node_vma->hlist, - &node_vma->hlist); - } - - } - node_vma = new_node_vma; - -node_vma_ok: /* ok, ready to add to the list */ - rmap_item->head = node_vma; - hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist); - hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma); - if (logdedup) { - rmap_item->slot->pages_merged++; - if (node_vma_cont) { - node_vma = node_vma_cont; - hlist_for_each_entry_continue(node_vma, hlist) { - node_vma->slot->pages_bemerged += factor; - if (list_empty(&node_vma->slot->dedup_list)) - list_add(&node_vma->slot->dedup_list, - &vma_slot_dedup); - } - } - } -} - -/* - * We use break_ksm to break COW on a ksm page: it's a stripped down - * - * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1) - * put_page(page); - * - * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, - * in case the application has unmapped and remapped mm,addr meanwhile. - * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP - * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. - */ -static int break_ksm(struct vm_area_struct *vma, unsigned long addr) -{ - struct page *page; - int ret = 0; - - do { - cond_resched(); - page = follow_page(vma, addr, FOLL_GET); - if (IS_ERR_OR_NULL(page)) - break; - if (PageKsm(page)) { - ret = handle_mm_fault(vma->vm_mm, vma, addr, - FAULT_FLAG_WRITE); - } else - ret = VM_FAULT_WRITE; - put_page(page); - } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); - /* - * We must loop because handle_mm_fault() may back out if there's - * any difficulty e.g. if pte accessed bit gets updated concurrently. - * - * VM_FAULT_WRITE is what we have been hoping for: it indicates that - * COW has been broken, even if the vma does not permit VM_WRITE; - * but note that a concurrent fault might break PageKsm for us. - * - * VM_FAULT_SIGBUS could occur if we race with truncation of the - * backing file, which also invalidates anonymous pages: that's - * okay, that truncation will have unmapped the PageKsm for us. 
- * - * VM_FAULT_OOM: at the time of writing (late July 2009), setting - * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the - * current task has TIF_MEMDIE set, and will be OOM killed on return - * to user; and ksmd, having no mm, would never be chosen for that. - * - * But if the mm is in a limited mem_cgroup, then the fault may fail - * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and - * even ksmd can fail in this way - though it's usually breaking ksm - * just to undo a merge it made a moment before, so unlikely to oom. - * - * That's a pity: we might therefore have more kernel pages allocated - * than we're counting as nodes in the stable tree; but uksm_do_scan - * will retry to break_cow on each pass, so should recover the page - * in due course. The important thing is to not let VM_MERGEABLE - * be cleared while any such pages might remain in the area. - */ - return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; -} - -static void break_cow(struct rmap_item *rmap_item) -{ - struct vm_area_struct *vma = rmap_item->slot->vma; - struct mm_struct *mm = vma->vm_mm; - unsigned long addr = get_rmap_addr(rmap_item); - - if (uksm_test_exit(mm)) - goto out; - - break_ksm(vma, addr); -out: - return; -} - -/* - * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather - * than check every pte of a given vma, the locking doesn't quite work for - * that - an rmap_item is assigned to the stable tree after inserting ksm - * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing - * rmap_items from parent to child at fork time (so as not to waste time - * if exit comes before the next scan reaches it). - * - * Similarly, although we'd like to remove rmap_items (so updating counts - * and freeing memory) when unmerging an area, it's easier to leave that - * to the next pass of ksmd - consider, for example, how ksmd might be - * in cmp_and_merge_page on one of the rmap_items we would be removing. 
- */ -inline int unmerge_uksm_pages(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - unsigned long addr; - int err = 0; - - for (addr = start; addr < end && !err; addr += PAGE_SIZE) { - if (uksm_test_exit(vma->vm_mm)) - break; - if (signal_pending(current)) - err = -ERESTARTSYS; - else - err = break_ksm(vma, addr); - } - return err; -} - -static inline void inc_uksm_pages_scanned(void) -{ - u64 delta; - - - if (uksm_pages_scanned == U64_MAX) { - encode_benefit(); - - delta = uksm_pages_scanned >> pages_scanned_base; - - if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) { - pages_scanned_stored >>= 1; - delta >>= 1; - pages_scanned_base++; - } - - pages_scanned_stored += delta; - - uksm_pages_scanned = uksm_pages_scanned_last = 0; - } - - uksm_pages_scanned++; -} - -static inline int find_zero_page_hash(int strength, u32 hash) -{ - return (zero_hash_table[strength] == hash); -} - -static -int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page) -{ - struct page *zero_page = empty_uksm_zero_page; - struct mm_struct *mm = vma->vm_mm; - pte_t orig_pte = __pte(0); - int err = -EFAULT; - - if (uksm_test_exit(mm)) - goto out; - - if (PageTransCompound(page) && page_trans_compound_anon_split(page)) - goto out; - BUG_ON(PageTransCompound(page)); - - if (!PageAnon(page)) - goto out; - - if (!trylock_page(page)) - goto out; - - if (write_protect_page(vma, page, &orig_pte, 0) == 0) { - if (is_page_full_zero(page)) - err = replace_page(vma, page, zero_page, orig_pte); - } - - unlock_page(page); -out: - return err; -} - -/* - * cmp_and_merge_page() - first see if page can be merged into the stable - * tree; if not, compare hash to previous and if it's the same, see if page - * can be inserted into the unstable tree, or merged with a page already there - * and both transferred to the stable tree. - * - * @page: the page that we are searching identical page to. - * @rmap_item: the reverse mapping into the virtual address of this page - */ -static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash) -{ - struct rmap_item *tree_rmap_item; - struct page *page; - struct page *kpage = NULL; - u32 hash_max; - int err; - unsigned int success1, success2; - struct stable_node *snode; - int cmp; - struct rb_node *parent = NULL, **new; - - remove_rmap_item_from_tree(rmap_item); - page = rmap_item->page; - - /* We first start with searching the page inside the stable tree */ - kpage = stable_tree_search(rmap_item, hash); - if (kpage) { - err = try_to_merge_with_uksm_page(rmap_item, kpage, - hash); - if (!err) { - /* - * The page was successfully merged, add - * its rmap_item to the stable tree. - * page lock is needed because it's - * racing with try_to_unmap_ksm(), etc. - */ - lock_page(kpage); - snode = page_stable_node(kpage); - stable_tree_append(rmap_item, snode, 1); - unlock_page(kpage); - put_page(kpage); - return; /* success */ - } - put_page(kpage); - - /* - * if it's a collision and it has been search in sub-rbtree - * (hash_max != 0), we want to abort, because if it is - * successfully merged in unstable tree, the collision trends to - * happen again. 
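On the zero-page handling a little earlier in this hunk (find_zero_page_hash() and cmp_and_merge_zero_page()), a brief clarification derived from the code:

/*
 * zero_hash_table[] holds the sample hash of an all-zero page for every
 * strength from 0 to HASH_STRENGTH_MAX, so find_zero_page_hash() can flag a
 * "probably all zero" candidate with a single array lookup.  The hash match
 * is only a hint: cmp_and_merge_zero_page() still write-protects the page
 * and re-checks it with is_page_full_zero() before remapping it to the
 * dedicated UKSM zero page (empty_uksm_zero_page).
 */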
- */ - if (err == MERGE_ERR_COLLI && rmap_item->hash_max) - return; - } - - tree_rmap_item = - unstable_tree_search_insert(rmap_item, hash); - if (tree_rmap_item) { - err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash); - /* - * As soon as we merge this page, we want to remove the - * rmap_item of the page we have merged with from the unstable - * tree, and insert it instead as new node in the stable tree. - */ - if (!err) { - kpage = page; - remove_rmap_item_from_tree(tree_rmap_item); - lock_page(kpage); - snode = stable_tree_insert(&kpage, hash, - rmap_item, tree_rmap_item, - &success1, &success2); - - /* - * Do not log dedup for tree item, it's not counted as - * scanned in this round. - */ - if (success2) - stable_tree_append(tree_rmap_item, snode, 0); - - /* - * The order of these two stable append is important: - * we are scanning rmap_item. - */ - if (success1) - stable_tree_append(rmap_item, snode, 1); - - /* - * The original kpage may be unlocked inside - * stable_tree_insert() already. This page - * should be unlocked before doing - * break_cow(). - */ - unlock_page(kpage); - - if (!success1) - break_cow(rmap_item); - - if (!success2) - break_cow(tree_rmap_item); - - } else if (err == MERGE_ERR_COLLI) { - BUG_ON(tree_rmap_item->tree_node->count > 1); - - rmap_item_hash_max(tree_rmap_item, - tree_rmap_item->tree_node->hash); - - hash_max = rmap_item_hash_max(rmap_item, hash); - cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); - parent = &tree_rmap_item->node; - if (cmp < 0) - new = &parent->rb_left; - else if (cmp > 0) - new = &parent->rb_right; - else - goto put_up_out; - - rmap_item->tree_node = tree_rmap_item->tree_node; - rmap_item->address |= UNSTABLE_FLAG; - rmap_item->hash_round = uksm_hash_round; - rb_link_node(&rmap_item->node, parent, new); - rb_insert_color(&rmap_item->node, - &tree_rmap_item->tree_node->sub_root); - rmap_item->tree_node->count++; - } else { - /* - * either one of the page has changed or they collide - * at the max hash, we consider them as ill items. 
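A recap of the unstable-tree branch above and the cleanup that follows, derived from the code:

/*
 * Outcomes of the unstable-tree path:
 *   merged          - the pair is promoted into the stable tree; whichever
 *                     side failed to join the stable node gets its COW
 *                     broken again so it is left as an ordinary page.
 *   MERGE_ERR_COLLI - a genuine hash collision: the current rmap_item is
 *                     linked into the same unstable sub-tree under its
 *                     hash_max, so the two pages are told apart next round.
 *   anything else   - the page changed or collided at the maximum strength;
 *                     the stale tree item is dropped from the unstable tree.
 */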
- */ - remove_rmap_item_from_tree(tree_rmap_item); - } -put_up_out: - put_page(tree_rmap_item->page); - up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem); - } -} - - - - -static inline unsigned long get_pool_index(struct vma_slot *slot, - unsigned long index) -{ - unsigned long pool_index; - - pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT; - if (pool_index >= slot->pool_size) - BUG(); - return pool_index; -} - -static inline unsigned long index_page_offset(unsigned long index) -{ - return offset_in_page(sizeof(struct rmap_list_entry *) * index); -} - -static inline -struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot, - unsigned long index, int need_alloc) -{ - unsigned long pool_index; - struct page *page; - void *addr; - - - pool_index = get_pool_index(slot, index); - if (!slot->rmap_list_pool[pool_index]) { - if (!need_alloc) - return NULL; - - page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); - if (!page) - return NULL; - - slot->rmap_list_pool[pool_index] = page; - } - - addr = kmap(slot->rmap_list_pool[pool_index]); - addr += index_page_offset(index); - - return addr; -} - -static inline void put_rmap_list_entry(struct vma_slot *slot, - unsigned long index) -{ - unsigned long pool_index; - - pool_index = get_pool_index(slot, index); - BUG_ON(!slot->rmap_list_pool[pool_index]); - kunmap(slot->rmap_list_pool[pool_index]); -} - -static inline int entry_is_new(struct rmap_list_entry *entry) -{ - return !entry->item; -} - -static inline unsigned long get_index_orig_addr(struct vma_slot *slot, - unsigned long index) -{ - return slot->vma->vm_start + (index << PAGE_SHIFT); -} - -static inline unsigned long get_entry_address(struct rmap_list_entry *entry) -{ - unsigned long addr; - - if (is_addr(entry->addr)) - addr = get_clean_addr(entry->addr); - else if (entry->item) - addr = get_rmap_addr(entry->item); - else - BUG(); - - return addr; -} - -static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry) -{ - if (is_addr(entry->addr)) - return NULL; - - return entry->item; -} - -static inline void inc_rmap_list_pool_count(struct vma_slot *slot, - unsigned long index) -{ - unsigned long pool_index; - - pool_index = get_pool_index(slot, index); - BUG_ON(!slot->rmap_list_pool[pool_index]); - slot->pool_counts[pool_index]++; -} - -static inline void dec_rmap_list_pool_count(struct vma_slot *slot, - unsigned long index) -{ - unsigned long pool_index; - - pool_index = get_pool_index(slot, index); - BUG_ON(!slot->rmap_list_pool[pool_index]); - BUG_ON(!slot->pool_counts[pool_index]); - slot->pool_counts[pool_index]--; -} - -static inline int entry_has_rmap(struct rmap_list_entry *entry) -{ - return !is_addr(entry->addr) && entry->item; -} - -static inline void swap_entries(struct rmap_list_entry *entry1, - unsigned long index1, - struct rmap_list_entry *entry2, - unsigned long index2) -{ - struct rmap_list_entry tmp; - - /* swapping two new entries is meaningless */ - BUG_ON(entry_is_new(entry1) && entry_is_new(entry2)); - - tmp = *entry1; - *entry1 = *entry2; - *entry2 = tmp; - - if (entry_has_rmap(entry1)) - entry1->item->entry_index = index1; - - if (entry_has_rmap(entry2)) - entry2->item->entry_index = index2; - - if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) { - inc_rmap_list_pool_count(entry1->item->slot, index1); - dec_rmap_list_pool_count(entry1->item->slot, index2); - } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) { - inc_rmap_list_pool_count(entry2->item->slot, index2); - 
dec_rmap_list_pool_count(entry2->item->slot, index1); - } -} - -static inline void free_entry_item(struct rmap_list_entry *entry) -{ - unsigned long index; - struct rmap_item *item; - - if (!is_addr(entry->addr)) { - BUG_ON(!entry->item); - item = entry->item; - entry->addr = get_rmap_addr(item); - set_is_addr(entry->addr); - index = item->entry_index; - remove_rmap_item_from_tree(item); - dec_rmap_list_pool_count(item->slot, index); - free_rmap_item(item); - } -} - -static inline int pool_entry_boundary(unsigned long index) -{ - unsigned long linear_addr; - - linear_addr = sizeof(struct rmap_list_entry *) * index; - return index && !offset_in_page(linear_addr); -} - -static inline void try_free_last_pool(struct vma_slot *slot, - unsigned long index) -{ - unsigned long pool_index; - - pool_index = get_pool_index(slot, index); - if (slot->rmap_list_pool[pool_index] && - !slot->pool_counts[pool_index]) { - __free_page(slot->rmap_list_pool[pool_index]); - slot->rmap_list_pool[pool_index] = NULL; - slot->flags |= UKSM_SLOT_NEED_SORT; - } - -} - -static inline unsigned long vma_item_index(struct vm_area_struct *vma, - struct rmap_item *item) -{ - return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT; -} - -static int within_same_pool(struct vma_slot *slot, - unsigned long i, unsigned long j) -{ - unsigned long pool_i, pool_j; - - pool_i = get_pool_index(slot, i); - pool_j = get_pool_index(slot, j); - - return (pool_i == pool_j); -} - -static void sort_rmap_entry_list(struct vma_slot *slot) -{ - unsigned long i, j; - struct rmap_list_entry *entry, *swap_entry; - - entry = get_rmap_list_entry(slot, 0, 0); - for (i = 0; i < slot->pages; ) { - - if (!entry) - goto skip_whole_pool; - - if (entry_is_new(entry)) - goto next_entry; - - if (is_addr(entry->addr)) { - entry->addr = 0; - goto next_entry; - } - - j = vma_item_index(slot->vma, entry->item); - if (j == i) - goto next_entry; - - if (within_same_pool(slot, i, j)) - swap_entry = entry + j - i; - else - swap_entry = get_rmap_list_entry(slot, j, 1); - - swap_entries(entry, i, swap_entry, j); - if (!within_same_pool(slot, i, j)) - put_rmap_list_entry(slot, j); - continue; - -skip_whole_pool: - i += PAGE_SIZE / sizeof(*entry); - if (i < slot->pages) - entry = get_rmap_list_entry(slot, i, 0); - continue; - -next_entry: - if (i >= slot->pages - 1 || - !within_same_pool(slot, i, i + 1)) { - put_rmap_list_entry(slot, i); - if (i + 1 < slot->pages) - entry = get_rmap_list_entry(slot, i + 1, 0); - } else - entry++; - i++; - continue; - } - - /* free empty pool entries which contain no rmap_item */ - /* CAN be simplied to based on only pool_counts when bug freed !!!!! */ - for (i = 0; i < slot->pool_size; i++) { - unsigned char has_rmap; - void *addr; - - if (!slot->rmap_list_pool[i]) - continue; - - has_rmap = 0; - addr = kmap(slot->rmap_list_pool[i]); - BUG_ON(!addr); - for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { - entry = (struct rmap_list_entry *)addr + j; - if (is_addr(entry->addr)) - continue; - if (!entry->item) - continue; - has_rmap = 1; - } - kunmap(slot->rmap_list_pool[i]); - if (!has_rmap) { - BUG_ON(slot->pool_counts[i]); - __free_page(slot->rmap_list_pool[i]); - slot->rmap_list_pool[i] = NULL; - } - } - - slot->flags &= ~UKSM_SLOT_NEED_SORT; -} - -/* - * vma_fully_scanned() - if all the pages in this slot have been scanned. 
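The rmap_list entries manipulated above overload one word as either a struct rmap_item pointer or a tagged virtual address (see the is_addr()/set_is_addr() uses). A minimal user-space sketch of that encoding, with invented names; that the low bit is the tag is inferred from the BUG_ON in alloc_rmap_item() earlier, not stated explicitly in this hunk.

#include <assert.h>
#include <stdint.h>

#define DEMO_IS_ADDR_BIT 1UL     /* assumed tag bit */

static uintptr_t demo_set_is_addr(uintptr_t addr) { return addr | DEMO_IS_ADDR_BIT; }
static int       demo_is_addr(uintptr_t word)     { return word & DEMO_IS_ADDR_BIT; }
static uintptr_t demo_clean_addr(uintptr_t word)  { return word & ~DEMO_IS_ADDR_BIT; }

int main(void)
{
	uintptr_t addr = 0x7f1234560000UL;      /* any page-aligned user address */
	uintptr_t word = demo_set_is_addr(addr);

	assert(demo_is_addr(word));
	assert(demo_clean_addr(word) == addr);
	/* a pointer to a (suitably aligned) rmap_item would leave the bit clear */
	return 0;
}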
- */ -static inline int vma_fully_scanned(struct vma_slot *slot) -{ - return slot->pages_scanned == slot->pages; -} - -/** - * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to - * its random permutation. This function is embedded with the random - * permutation index management code. - */ -static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash) -{ - unsigned long rand_range, addr, swap_index, scan_index; - struct rmap_item *item = NULL; - struct rmap_list_entry *scan_entry, *swap_entry = NULL; - struct page *page; - - scan_index = swap_index = slot->pages_scanned % slot->pages; - - if (pool_entry_boundary(scan_index)) - try_free_last_pool(slot, scan_index - 1); - - if (vma_fully_scanned(slot)) { - if (slot->flags & UKSM_SLOT_NEED_SORT) - slot->flags |= UKSM_SLOT_NEED_RERAND; - else - slot->flags &= ~UKSM_SLOT_NEED_RERAND; - if (slot->flags & UKSM_SLOT_NEED_SORT) - sort_rmap_entry_list(slot); - } - - scan_entry = get_rmap_list_entry(slot, scan_index, 1); - if (!scan_entry) - return NULL; - - if (entry_is_new(scan_entry)) { - scan_entry->addr = get_index_orig_addr(slot, scan_index); - set_is_addr(scan_entry->addr); - } - - if (slot->flags & UKSM_SLOT_NEED_RERAND) { - rand_range = slot->pages - scan_index; - BUG_ON(!rand_range); - swap_index = scan_index + (prandom_u32() % rand_range); - } - - if (swap_index != scan_index) { - swap_entry = get_rmap_list_entry(slot, swap_index, 1); - if (entry_is_new(swap_entry)) { - swap_entry->addr = get_index_orig_addr(slot, - swap_index); - set_is_addr(swap_entry->addr); - } - swap_entries(scan_entry, scan_index, swap_entry, swap_index); - } - - addr = get_entry_address(scan_entry); - item = get_entry_item(scan_entry); - BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start); - - page = follow_page(slot->vma, addr, FOLL_GET); - if (IS_ERR_OR_NULL(page)) - goto nopage; - - if (!PageAnon(page) && !page_trans_compound_anon(page)) - goto putpage; - - /*check is zero_page pfn or uksm_zero_page*/ - if ((page_to_pfn(page) == zero_pfn) - || (page_to_pfn(page) == uksm_zero_pfn)) - goto putpage; - - flush_anon_page(slot->vma, page, addr); - flush_dcache_page(page); - - - *hash = page_hash(page, hash_strength, 1); - inc_uksm_pages_scanned(); - /*if the page content all zero, re-map to zero-page*/ - if (find_zero_page_hash(hash_strength, *hash)) { - if (!cmp_and_merge_zero_page(slot->vma, page)) { - slot->pages_merged++; - inc_zone_page_state(page, NR_UKSM_ZERO_PAGES); - dec_mm_counter(slot->mm, MM_ANONPAGES); - - /* For full-zero pages, no need to create rmap item */ - goto putpage; - } else { - inc_rshash_neg(memcmp_cost / 2); - } - } - - if (!item) { - item = alloc_rmap_item(); - if (item) { - /* It has already been zeroed */ - item->slot = slot; - item->address = addr; - item->entry_index = scan_index; - scan_entry->item = item; - inc_rmap_list_pool_count(slot, scan_index); - } else - goto putpage; - } - - BUG_ON(item->slot != slot); - /* the page may have changed */ - item->page = page; - put_rmap_list_entry(slot, scan_index); - if (swap_entry) - put_rmap_list_entry(slot, swap_index); - return item; - -putpage: - put_page(page); - page = NULL; -nopage: - /* no page, store addr back and free rmap_item if possible */ - free_entry_item(scan_entry); - put_rmap_list_entry(slot, scan_index); - if (swap_entry) - put_rmap_list_entry(slot, swap_index); - return NULL; -} - -static inline int in_stable_tree(struct rmap_item *rmap_item) -{ - return rmap_item->address & STABLE_FLAG; -} - -/** - * scan_vma_one_page() - scan 
the next page in a vma_slot. Called with - * mmap_sem locked. - */ -static noinline void scan_vma_one_page(struct vma_slot *slot) -{ - u32 hash; - struct mm_struct *mm; - struct rmap_item *rmap_item = NULL; - struct vm_area_struct *vma = slot->vma; - - mm = vma->vm_mm; - BUG_ON(!mm); - BUG_ON(!slot); - - rmap_item = get_next_rmap_item(slot, &hash); - if (!rmap_item) - goto out1; - - if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item)) - goto out2; - - cmp_and_merge_page(rmap_item, hash); -out2: - put_page(rmap_item->page); -out1: - slot->pages_scanned++; - if (slot->fully_scanned_round != fully_scanned_round) - scanned_virtual_pages++; - - if (vma_fully_scanned(slot)) - slot->fully_scanned_round = fully_scanned_round; -} - -static inline unsigned long rung_get_pages(struct scan_rung *rung) -{ - struct slot_tree_node *node; - - if (!rung->vma_root.rnode) - return 0; - - node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode); - - return node->size; -} - -#define RUNG_SAMPLED_MIN 3 - -static inline -void uksm_calc_rung_step(struct scan_rung *rung, - unsigned long page_time, unsigned long ratio) -{ - unsigned long sampled, pages; - - /* will be fully scanned ? */ - if (!rung->cover_msecs) { - rung->step = 1; - return; - } - - sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE) - * ratio / page_time; - - /* - * Before we finsish a scan round and expensive per-round jobs, - * we need to have a chance to estimate the per page time. So - * the sampled number can not be too small. - */ - if (sampled < RUNG_SAMPLED_MIN) - sampled = RUNG_SAMPLED_MIN; - - pages = rung_get_pages(rung); - if (likely(pages > sampled)) - rung->step = pages / sampled; - else - rung->step = 1; -} - -static inline int step_need_recalc(struct scan_rung *rung) -{ - unsigned long pages, stepmax; - - pages = rung_get_pages(rung); - stepmax = pages / RUNG_SAMPLED_MIN; - - return pages && (rung->step > pages || - (stepmax && rung->step > stepmax)); -} - -static inline -void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc) -{ - struct vma_slot *slot; - - if (finished) - rung->flags |= UKSM_RUNG_ROUND_FINISHED; - - if (step_recalc || step_need_recalc(rung)) { - uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); - BUG_ON(step_need_recalc(rung)); - } - - slot_iter_index = prandom_u32() % rung->step; - BUG_ON(!rung->vma_root.rnode); - slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter); - BUG_ON(!slot); - - rung->current_scan = slot; - rung->current_offset = slot_iter_index; -} - -static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot) -{ - return &slot->rung->vma_root; -} - -/* - * return if resetted. 
- */ -static int advance_current_scan(struct scan_rung *rung) -{ - unsigned short n; - struct vma_slot *slot, *next = NULL; - - BUG_ON(!rung->vma_root.num); - - slot = rung->current_scan; - n = (slot->pages - rung->current_offset) % rung->step; - slot_iter_index = rung->step - n; - next = sradix_tree_next(&rung->vma_root, slot->snode, - slot->sindex, slot_iter); - - if (next) { - rung->current_offset = slot_iter_index; - rung->current_scan = next; - return 0; - } else { - reset_current_scan(rung, 1, 0); - return 1; - } -} - -static inline void rung_rm_slot(struct vma_slot *slot) -{ - struct scan_rung *rung = slot->rung; - struct sradix_tree_root *root; - - if (rung->current_scan == slot) - advance_current_scan(rung); - - root = slot_get_root(slot); - sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex); - slot->snode = NULL; - if (step_need_recalc(rung)) { - uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); - BUG_ON(step_need_recalc(rung)); - } - - /* In case advance_current_scan loop back to this slot again */ - if (rung->vma_root.num && rung->current_scan == slot) - reset_current_scan(slot->rung, 1, 0); -} - -static inline void rung_add_new_slots(struct scan_rung *rung, - struct vma_slot **slots, unsigned long num) -{ - int err; - struct vma_slot *slot; - unsigned long i; - struct sradix_tree_root *root = &rung->vma_root; - - err = sradix_tree_enter(root, (void **)slots, num); - BUG_ON(err); - - for (i = 0; i < num; i++) { - slot = slots[i]; - slot->rung = rung; - BUG_ON(vma_fully_scanned(slot)); - } - - if (rung->vma_root.num == num) - reset_current_scan(rung, 0, 1); -} - -static inline int rung_add_one_slot(struct scan_rung *rung, - struct vma_slot *slot) -{ - int err; - - err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1); - if (err) - return err; - - slot->rung = rung; - if (rung->vma_root.num == 1) - reset_current_scan(rung, 0, 1); - - return 0; -} - -/* - * Return true if the slot is deleted from its rung. - */ -static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung) -{ - struct scan_rung *old_rung = slot->rung; - int err; - - if (old_rung == rung) - return 0; - - rung_rm_slot(slot); - err = rung_add_one_slot(rung, slot); - if (err) { - err = rung_add_one_slot(old_rung, slot); - WARN_ON(err); /* OOPS, badly OOM, we lost this slot */ - } - - return 1; -} - -static inline int vma_rung_up(struct vma_slot *slot) -{ - struct scan_rung *rung; - - rung = slot->rung; - if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1]) - rung++; - - return vma_rung_enter(slot, rung); -} - -static inline int vma_rung_down(struct vma_slot *slot) -{ - struct scan_rung *rung; - - rung = slot->rung; - if (slot->rung != &uksm_scan_ladder[0]) - rung--; - - return vma_rung_enter(slot, rung); -} - -/** - * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. - */ -static unsigned long cal_dedup_ratio(struct vma_slot *slot) -{ - unsigned long ret; - - BUG_ON(slot->pages_scanned == slot->last_scanned); - - ret = slot->pages_merged; - - /* Thrashing area filtering */ - if (ret && uksm_thrash_threshold) { - if (slot->pages_cowed * 100 / slot->pages_merged - > uksm_thrash_threshold) { - ret = 0; - } else { - ret = slot->pages_merged - slot->pages_cowed; - } - } - - return ret; -} - -/** - * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. 
- */ -static unsigned long cal_dedup_ratio_old(struct vma_slot *slot) -{ - unsigned long ret; - unsigned long pages_scanned; - - pages_scanned = slot->pages_scanned; - if (!pages_scanned) { - if (uksm_thrash_threshold) - return 0; - else - pages_scanned = slot->pages_scanned; - } - - ret = slot->pages_bemerged * 100 / pages_scanned; - - /* Thrashing area filtering */ - if (ret && uksm_thrash_threshold) { - if (slot->pages_cowed * 100 / slot->pages_bemerged - > uksm_thrash_threshold) { - ret = 0; - } else { - ret = slot->pages_bemerged - slot->pages_cowed; - } - } - - return ret; -} - -/** - * stable_node_reinsert() - When the hash_strength has been adjusted, the - * stable tree need to be restructured, this is the function re-inserting the - * stable node. - */ -static inline void stable_node_reinsert(struct stable_node *new_node, - struct page *page, - struct rb_root *root_treep, - struct list_head *tree_node_listp, - u32 hash) -{ - struct rb_node **new = &root_treep->rb_node; - struct rb_node *parent = NULL; - struct stable_node *stable_node; - struct tree_node *tree_node; - struct page *tree_page; - int cmp; - - while (*new) { - int cmp; - - tree_node = rb_entry(*new, struct tree_node, node); - - cmp = hash_cmp(hash, tree_node->hash); - - if (cmp < 0) { - parent = *new; - new = &parent->rb_left; - } else if (cmp > 0) { - parent = *new; - new = &parent->rb_right; - } else - break; - } - - if (*new) { - /* find a stable tree node with same first level hash value */ - stable_node_hash_max(new_node, page, hash); - if (tree_node->count == 1) { - stable_node = rb_entry(tree_node->sub_root.rb_node, - struct stable_node, node); - tree_page = get_uksm_page(stable_node, 1, 0); - if (tree_page) { - stable_node_hash_max(stable_node, - tree_page, hash); - put_page(tree_page); - - /* prepare for stable node insertion */ - - cmp = hash_cmp(new_node->hash_max, - stable_node->hash_max); - parent = &stable_node->node; - if (cmp < 0) - new = &parent->rb_left; - else if (cmp > 0) - new = &parent->rb_right; - else - goto failed; - - goto add_node; - } else { - /* the only stable_node deleted, the tree node - * was not deleted. - */ - goto tree_node_reuse; - } - } - - /* well, search the collision subtree */ - new = &tree_node->sub_root.rb_node; - parent = NULL; - BUG_ON(!*new); - while (*new) { - int cmp; - - stable_node = rb_entry(*new, struct stable_node, node); - - cmp = hash_cmp(new_node->hash_max, - stable_node->hash_max); - - if (cmp < 0) { - parent = *new; - new = &parent->rb_left; - } else if (cmp > 0) { - parent = *new; - new = &parent->rb_right; - } else { - /* oh, no, still a collision */ - goto failed; - } - } - - goto add_node; - } - - /* no tree node found */ - tree_node = alloc_tree_node(tree_node_listp); - if (!tree_node) { - printk(KERN_ERR "UKSM: memory allocation error!\n"); - goto failed; - } else { - tree_node->hash = hash; - rb_link_node(&tree_node->node, parent, new); - rb_insert_color(&tree_node->node, root_treep); - -tree_node_reuse: - /* prepare for stable node insertion */ - parent = NULL; - new = &tree_node->sub_root.rb_node; - } - -add_node: - rb_link_node(&new_node->node, parent, new); - rb_insert_color(&new_node->node, &tree_node->sub_root); - new_node->tree_node = tree_node; - tree_node->count++; - return; - -failed: - /* This can only happen when two nodes have collided - * in two levels. 
- */ - new_node->tree_node = NULL; - return; -} - -static inline void free_all_tree_nodes(struct list_head *list) -{ - struct tree_node *node, *tmp; - - list_for_each_entry_safe(node, tmp, list, all_list) { - free_tree_node(node); - } -} - -/** - * stable_tree_delta_hash() - Delta hash the stable tree from previous hash - * strength to the current hash_strength. It re-structures the hole tree. - */ -static inline void stable_tree_delta_hash(u32 prev_hash_strength) -{ - struct stable_node *node, *tmp; - struct rb_root *root_new_treep; - struct list_head *new_tree_node_listp; - - stable_tree_index = (stable_tree_index + 1) % 2; - root_new_treep = &root_stable_tree[stable_tree_index]; - new_tree_node_listp = &stable_tree_node_list[stable_tree_index]; - *root_new_treep = RB_ROOT; - BUG_ON(!list_empty(new_tree_node_listp)); - - /* - * we need to be safe, the node could be removed by get_uksm_page() - */ - list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) { - void *addr; - struct page *node_page; - u32 hash; - - /* - * We are completely re-structuring the stable nodes to a new - * stable tree. We don't want to touch the old tree unlinks and - * old tree_nodes. The old tree_nodes will be freed at once. - */ - node_page = get_uksm_page(node, 0, 0); - if (!node_page) - continue; - - if (node->tree_node) { - hash = node->tree_node->hash; - - addr = kmap_atomic(node_page); - - hash = delta_hash(addr, prev_hash_strength, - hash_strength, hash); - kunmap_atomic(addr); - } else { - /* - *it was not inserted to rbtree due to collision in last - *round scan. - */ - hash = page_hash(node_page, hash_strength, 0); - } - - stable_node_reinsert(node, node_page, root_new_treep, - new_tree_node_listp, hash); - put_page(node_page); - } - - root_stable_treep = root_new_treep; - free_all_tree_nodes(stable_tree_node_listp); - BUG_ON(!list_empty(stable_tree_node_listp)); - stable_tree_node_listp = new_tree_node_listp; -} - -static inline void inc_hash_strength(unsigned long delta) -{ - hash_strength += 1 << delta; - if (hash_strength > HASH_STRENGTH_MAX) - hash_strength = HASH_STRENGTH_MAX; -} - -static inline void dec_hash_strength(unsigned long delta) -{ - unsigned long change = 1 << delta; - - if (hash_strength <= change + 1) - hash_strength = 1; - else - hash_strength -= change; -} - -static inline void inc_hash_strength_delta(void) -{ - hash_strength_delta++; - if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX) - hash_strength_delta = HASH_STRENGTH_DELTA_MAX; -} - -/* -static inline unsigned long get_current_neg_ratio(void) -{ - if (!rshash_pos || rshash_neg > rshash_pos) - return 100; - - return div64_u64(100 * rshash_neg , rshash_pos); -} -*/ - -static inline unsigned long get_current_neg_ratio(void) -{ - u64 pos = benefit.pos; - u64 neg = benefit.neg; - - if (!neg) - return 0; - - if (!pos || neg > pos) - return 100; - - if (neg > div64_u64(U64_MAX, 100)) - pos = div64_u64(pos, 100); - else - neg *= 100; - - return div64_u64(neg, pos); -} - -static inline unsigned long get_current_benefit(void) -{ - u64 pos = benefit.pos; - u64 neg = benefit.neg; - u64 scanned = benefit.scanned; - - if (neg > pos) - return 0; - - return div64_u64((pos - neg), scanned); -} - -static inline int judge_rshash_direction(void) -{ - u64 current_neg_ratio, stable_benefit; - u64 current_benefit, delta = 0; - int ret = STILL; - - /* Try to probe a value after the boot, and in case the system - are still for a long time. 
*/ - if ((fully_scanned_round & 0xFFULL) == 10) { - ret = OBSCURE; - goto out; - } - - current_neg_ratio = get_current_neg_ratio(); - - if (current_neg_ratio == 0) { - rshash_neg_cont_zero++; - if (rshash_neg_cont_zero > 2) - return GO_DOWN; - else - return STILL; - } - rshash_neg_cont_zero = 0; - - if (current_neg_ratio > 90) { - ret = GO_UP; - goto out; - } - - current_benefit = get_current_benefit(); - stable_benefit = rshash_state.stable_benefit; - - if (!stable_benefit) { - ret = OBSCURE; - goto out; - } - - if (current_benefit > stable_benefit) - delta = current_benefit - stable_benefit; - else if (current_benefit < stable_benefit) - delta = stable_benefit - current_benefit; - - delta = div64_u64(100 * delta , stable_benefit); - - if (delta > 50) { - rshash_cont_obscure++; - if (rshash_cont_obscure > 2) - return OBSCURE; - else - return STILL; - } - -out: - rshash_cont_obscure = 0; - return ret; -} - -/** - * rshash_adjust() - The main function to control the random sampling state - * machine for hash strength adapting. - * - * return true if hash_strength has changed. - */ -static inline int rshash_adjust(void) -{ - unsigned long prev_hash_strength = hash_strength; - - if (!encode_benefit()) - return 0; - - switch (rshash_state.state) { - case RSHASH_STILL: - switch (judge_rshash_direction()) { - case GO_UP: - if (rshash_state.pre_direct == GO_DOWN) - hash_strength_delta = 0; - - inc_hash_strength(hash_strength_delta); - inc_hash_strength_delta(); - rshash_state.stable_benefit = get_current_benefit(); - rshash_state.pre_direct = GO_UP; - break; - - case GO_DOWN: - if (rshash_state.pre_direct == GO_UP) - hash_strength_delta = 0; - - dec_hash_strength(hash_strength_delta); - inc_hash_strength_delta(); - rshash_state.stable_benefit = get_current_benefit(); - rshash_state.pre_direct = GO_DOWN; - break; - - case OBSCURE: - rshash_state.stable_point = hash_strength; - rshash_state.turn_point_down = hash_strength; - rshash_state.turn_point_up = hash_strength; - rshash_state.turn_benefit_down = get_current_benefit(); - rshash_state.turn_benefit_up = get_current_benefit(); - rshash_state.lookup_window_index = 0; - rshash_state.state = RSHASH_TRYDOWN; - dec_hash_strength(hash_strength_delta); - inc_hash_strength_delta(); - break; - - case STILL: - break; - default: - BUG(); - } - break; - - case RSHASH_TRYDOWN: - if (rshash_state.lookup_window_index++ % 5 == 0) - rshash_state.below_count = 0; - - if (get_current_benefit() < rshash_state.stable_benefit) - rshash_state.below_count++; - else if (get_current_benefit() > - rshash_state.turn_benefit_down) { - rshash_state.turn_point_down = hash_strength; - rshash_state.turn_benefit_down = get_current_benefit(); - } - - if (rshash_state.below_count >= 3 || - judge_rshash_direction() == GO_UP || - hash_strength == 1) { - hash_strength = rshash_state.stable_point; - hash_strength_delta = 0; - inc_hash_strength(hash_strength_delta); - inc_hash_strength_delta(); - rshash_state.lookup_window_index = 0; - rshash_state.state = RSHASH_TRYUP; - hash_strength_delta = 0; - } else { - dec_hash_strength(hash_strength_delta); - inc_hash_strength_delta(); - } - break; - - case RSHASH_TRYUP: - if (rshash_state.lookup_window_index++ % 5 == 0) - rshash_state.below_count = 0; - - if (get_current_benefit() < rshash_state.turn_benefit_down) - rshash_state.below_count++; - else if (get_current_benefit() > rshash_state.turn_benefit_up) { - rshash_state.turn_point_up = hash_strength; - rshash_state.turn_benefit_up = get_current_benefit(); - } - - if 
(rshash_state.below_count >= 3 || - judge_rshash_direction() == GO_DOWN || - hash_strength == HASH_STRENGTH_MAX) { - hash_strength = rshash_state.turn_benefit_up > - rshash_state.turn_benefit_down ? - rshash_state.turn_point_up : - rshash_state.turn_point_down; - - rshash_state.state = RSHASH_PRE_STILL; - } else { - inc_hash_strength(hash_strength_delta); - inc_hash_strength_delta(); - } - - break; - - case RSHASH_NEW: - case RSHASH_PRE_STILL: - rshash_state.stable_benefit = get_current_benefit(); - rshash_state.state = RSHASH_STILL; - hash_strength_delta = 0; - break; - default: - BUG(); - } - - /* rshash_neg = rshash_pos = 0; */ - reset_benefit(); - - if (prev_hash_strength != hash_strength) - stable_tree_delta_hash(prev_hash_strength); - - return prev_hash_strength != hash_strength; -} - -/** - * round_update_ladder() - The main function to do update of all the - * adjustments whenever a scan round is finished. - */ -static noinline void round_update_ladder(void) -{ - int i; - unsigned long dedup; - struct vma_slot *slot, *tmp_slot; - - for (i = 0; i < SCAN_LADDER_SIZE; i++) { - uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED; - } - - list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) { - - /* slot may be rung_rm_slot() when mm exits */ - if (slot->snode) { - dedup = cal_dedup_ratio_old(slot); - if (dedup && dedup >= uksm_abundant_threshold) - vma_rung_up(slot); - } - - slot->pages_bemerged = 0; - slot->pages_cowed = 0; - - list_del_init(&slot->dedup_list); - } -} - -static void uksm_del_vma_slot(struct vma_slot *slot) -{ - int i, j; - struct rmap_list_entry *entry; - - if (slot->snode) { - /* - * In case it just failed when entering the rung, it's not - * necessary. - */ - rung_rm_slot(slot); - } - - if (!list_empty(&slot->dedup_list)) - list_del(&slot->dedup_list); - - if (!slot->rmap_list_pool || !slot->pool_counts) { - /* In case it OOMed in uksm_vma_enter() */ - goto out; - } - - for (i = 0; i < slot->pool_size; i++) { - void *addr; - - if (!slot->rmap_list_pool[i]) - continue; - - addr = kmap(slot->rmap_list_pool[i]); - for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { - entry = (struct rmap_list_entry *)addr + j; - if (is_addr(entry->addr)) - continue; - if (!entry->item) - continue; - - remove_rmap_item_from_tree(entry->item); - free_rmap_item(entry->item); - slot->pool_counts[i]--; - } - BUG_ON(slot->pool_counts[i]); - kunmap(slot->rmap_list_pool[i]); - __free_page(slot->rmap_list_pool[i]); - } - kfree(slot->rmap_list_pool); - kfree(slot->pool_counts); - -out: - slot->rung = NULL; - if (slot->flags & UKSM_SLOT_IN_UKSM) { - BUG_ON(uksm_pages_total < slot->pages); - uksm_pages_total -= slot->pages; - } - - if (slot->fully_scanned_round == fully_scanned_round) - scanned_virtual_pages -= slot->pages; - else - scanned_virtual_pages -= slot->pages_scanned; - free_vma_slot(slot); -} - - -#define SPIN_LOCK_PERIOD 32 -static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD]; -static inline void cleanup_vma_slots(void) -{ - struct vma_slot *slot; - int i; - - i = 0; - spin_lock(&vma_slot_list_lock); - while (!list_empty(&vma_slot_del)) { - slot = list_entry(vma_slot_del.next, - struct vma_slot, slot_list); - list_del(&slot->slot_list); - cleanup_slots[i++] = slot; - if (i == SPIN_LOCK_PERIOD) { - spin_unlock(&vma_slot_list_lock); - while (--i >= 0) - uksm_del_vma_slot(cleanup_slots[i]); - i = 0; - spin_lock(&vma_slot_list_lock); - } - } - spin_unlock(&vma_slot_list_lock); - - while (--i >= 0) - uksm_del_vma_slot(cleanup_slots[i]); -} - -/* -*expotional moving 
average formula -*/ -static inline unsigned long ema(unsigned long curr, unsigned long last_ema) -{ - /* - * For a very high burst, even the ema cannot work well, a false very - * high per-page time estimation can result in feedback in very high - * overhead of context swith and rung update -- this will then lead - * to higher per-paper time, this may not converge. - * - * Instead, we try to approach this value in a binary manner. - */ - if (curr > last_ema * 10) - return last_ema * 2; - - return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100; -} - -/* - * convert cpu ratio in 1/TIME_RATIO_SCALE configured by user to - * nanoseconds based on current uksm_sleep_jiffies. - */ -static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio) -{ - return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) / - (TIME_RATIO_SCALE - ratio) * ratio; -} - - -static inline unsigned long rung_real_ratio(int cpu_time_ratio) -{ - unsigned long ret; - - BUG_ON(!cpu_time_ratio); - - if (cpu_time_ratio > 0) - ret = cpu_time_ratio; - else - ret = (unsigned long)(-cpu_time_ratio) * - uksm_max_cpu_percentage / 100UL; - - return ret ? ret : 1; -} - -static noinline void uksm_calc_scan_pages(void) -{ - struct scan_rung *ladder = uksm_scan_ladder; - unsigned long sleep_usecs, nsecs; - unsigned long ratio; - int i; - unsigned long per_page; - - if (uksm_ema_page_time > 100000 || - (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL)) - uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; - - per_page = uksm_ema_page_time; - BUG_ON(!per_page); - - /* - * For every 8 eval round, we try to probe a uksm_sleep_jiffies value - * based on saved user input. - */ - if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL) - uksm_sleep_jiffies = uksm_sleep_saved; - - /* We require a rung scan at least 1 page in a period. */ - nsecs = per_page; - ratio = rung_real_ratio(ladder[0].cpu_ratio); - if (cpu_ratio_to_nsec(ratio) < nsecs) { - sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio - / NSEC_PER_USEC; - uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1; - } - - for (i = 0; i < SCAN_LADDER_SIZE; i++) { - ratio = rung_real_ratio(ladder[i].cpu_ratio); - ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) / - per_page; - BUG_ON(!ladder[i].pages_to_scan); - uksm_calc_rung_step(&ladder[i], per_page, ratio); - } -} - -/* - * From the scan time of this round (ns) to next expected min sleep time - * (ms), be careful of the possible overflows. 
ratio is taken from - * rung_real_ratio() - */ -static inline -unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio) -{ - scan_time >>= 20; /* to msec level now */ - BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE)); - - return (unsigned int) ((unsigned long) scan_time * - (TIME_RATIO_SCALE - ratio) / ratio); -} - -#define __round_mask(x, y) ((__typeof__(x))((y)-1)) -#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) - -static void uksm_vma_enter(struct vma_slot **slots, unsigned long num) -{ - struct scan_rung *rung; - - rung = &uksm_scan_ladder[0]; - rung_add_new_slots(rung, slots, num); -} - -static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE]; - -static void uksm_enter_all_slots(void) -{ - struct vma_slot *slot; - unsigned long index; - struct list_head empty_vma_list; - int i; - - i = 0; - index = 0; - INIT_LIST_HEAD(&empty_vma_list); - - spin_lock(&vma_slot_list_lock); - while (!list_empty(&vma_slot_new)) { - slot = list_entry(vma_slot_new.next, - struct vma_slot, slot_list); - - if (!slot->vma->anon_vma) { - list_move(&slot->slot_list, &empty_vma_list); - } else if (vma_can_enter(slot->vma)) { - batch_slots[index++] = slot; - list_del_init(&slot->slot_list); - } else { - list_move(&slot->slot_list, &vma_slot_noadd); - } - - if (++i == SPIN_LOCK_PERIOD || - (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) { - spin_unlock(&vma_slot_list_lock); - - if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) { - uksm_vma_enter(batch_slots, index); - index = 0; - } - i = 0; - cond_resched(); - spin_lock(&vma_slot_list_lock); - } - } - - list_splice(&empty_vma_list, &vma_slot_new); - - spin_unlock(&vma_slot_list_lock); - - if (index) - uksm_vma_enter(batch_slots, index); - -} - -static inline int rung_round_finished(struct scan_rung *rung) -{ - return rung->flags & UKSM_RUNG_ROUND_FINISHED; -} - -static inline void judge_slot(struct vma_slot *slot) -{ - struct scan_rung *rung = slot->rung; - unsigned long dedup; - int deleted; - - dedup = cal_dedup_ratio(slot); - if (vma_fully_scanned(slot) && uksm_thrash_threshold) - deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]); - else if (dedup && dedup >= uksm_abundant_threshold) - deleted = vma_rung_up(slot); - else - deleted = vma_rung_down(slot); - - slot->pages_merged = 0; - slot->pages_cowed = 0; - - if (vma_fully_scanned(slot)) - slot->pages_scanned = 0; - - slot->last_scanned = slot->pages_scanned; - - /* If its deleted in above, then rung was already advanced. */ - if (!deleted) - advance_current_scan(rung); -} - - -static inline int hash_round_finished(void) -{ - if (scanned_virtual_pages > (uksm_pages_total >> 2)) { - scanned_virtual_pages = 0; - if (uksm_pages_scanned) - fully_scanned_round++; - - return 1; - } else { - return 0; - } -} - -#define UKSM_MMSEM_BATCH 5 -#define BUSY_RETRY 100 - -/** - * uksm_do_scan() - the main worker function. 
- */ -static noinline void uksm_do_scan(void) -{ - struct vma_slot *slot, *iter; - struct mm_struct *busy_mm; - unsigned char round_finished, all_rungs_emtpy; - int i, err, mmsem_batch; - unsigned long pcost; - long long delta_exec; - unsigned long vpages, max_cpu_ratio; - unsigned long long start_time, end_time, scan_time; - unsigned int expected_jiffies; - - might_sleep(); - - vpages = 0; - - start_time = task_sched_runtime(current); - max_cpu_ratio = 0; - mmsem_batch = 0; - - for (i = 0; i < SCAN_LADDER_SIZE;) { - struct scan_rung *rung = &uksm_scan_ladder[i]; - unsigned long ratio; - int busy_retry; - - if (!rung->pages_to_scan) { - i++; - continue; - } - - if (!rung->vma_root.num) { - rung->pages_to_scan = 0; - i++; - continue; - } - - ratio = rung_real_ratio(rung->cpu_ratio); - if (ratio > max_cpu_ratio) - max_cpu_ratio = ratio; - - busy_retry = BUSY_RETRY; - /* - * Do not consider rung_round_finished() here, just used up the - * rung->pages_to_scan quota. - */ - while (rung->pages_to_scan && rung->vma_root.num && - likely(!freezing(current))) { - int reset = 0; - - slot = rung->current_scan; - - BUG_ON(vma_fully_scanned(slot)); - - if (mmsem_batch) { - err = 0; - } else { - err = try_down_read_slot_mmap_sem(slot); - } - - if (err == -ENOENT) { -rm_slot: - rung_rm_slot(slot); - continue; - } - - busy_mm = slot->mm; - - if (err == -EBUSY) { - /* skip other vmas on the same mm */ - do { - reset = advance_current_scan(rung); - iter = rung->current_scan; - busy_retry--; - if (iter->vma->vm_mm != busy_mm || - !busy_retry || reset) - break; - } while (1); - - if (iter->vma->vm_mm != busy_mm) { - continue; - } else { - /* scan round finsished */ - break; - } - } - - BUG_ON(!vma_can_enter(slot->vma)); - if (uksm_test_exit(slot->vma->vm_mm)) { - mmsem_batch = 0; - up_read(&slot->vma->vm_mm->mmap_sem); - goto rm_slot; - } - - if (mmsem_batch) - mmsem_batch--; - else - mmsem_batch = UKSM_MMSEM_BATCH; - - /* Ok, we have take the mmap_sem, ready to scan */ - scan_vma_one_page(slot); - rung->pages_to_scan--; - vpages++; - - if (rung->current_offset + rung->step > slot->pages - 1 - || vma_fully_scanned(slot)) { - up_read(&slot->vma->vm_mm->mmap_sem); - judge_slot(slot); - mmsem_batch = 0; - } else { - rung->current_offset += rung->step; - if (!mmsem_batch) - up_read(&slot->vma->vm_mm->mmap_sem); - } - - busy_retry = BUSY_RETRY; - cond_resched(); - } - - if (mmsem_batch) { - up_read(&slot->vma->vm_mm->mmap_sem); - mmsem_batch = 0; - } - - if (freezing(current)) - break; - - cond_resched(); - } - end_time = task_sched_runtime(current); - delta_exec = end_time - start_time; - - if (freezing(current)) - return; - - cleanup_vma_slots(); - uksm_enter_all_slots(); - - round_finished = 1; - all_rungs_emtpy = 1; - for (i = 0; i < SCAN_LADDER_SIZE; i++) { - struct scan_rung *rung = &uksm_scan_ladder[i]; - - if (rung->vma_root.num) { - all_rungs_emtpy = 0; - if (!rung_round_finished(rung)) - round_finished = 0; - } - } - - if (all_rungs_emtpy) - round_finished = 0; - - if (round_finished) { - round_update_ladder(); - uksm_eval_round++; - - if (hash_round_finished() && rshash_adjust()) { - /* Reset the unstable root iff hash strength changed */ - uksm_hash_round++; - root_unstable_tree = RB_ROOT; - free_all_tree_nodes(&unstable_tree_node_list); - } - - /* - * A number of pages can hang around indefinitely on per-cpu - * pagevecs, raised page count preventing write_protect_page - * from merging them. 
Though it doesn't really matter much, - * it is puzzling to see some stuck in pages_volatile until - * other activity jostles them out, and they also prevented - * LTP's KSM test from succeeding deterministically; so drain - * them here (here rather than on entry to uksm_do_scan(), - * so we don't IPI too often when pages_to_scan is set low). - */ - lru_add_drain_all(); - } - - - if (vpages && delta_exec > 0) { - pcost = (unsigned long) delta_exec / vpages; - if (likely(uksm_ema_page_time)) - uksm_ema_page_time = ema(pcost, uksm_ema_page_time); - else - uksm_ema_page_time = pcost; - } - - uksm_calc_scan_pages(); - uksm_sleep_real = uksm_sleep_jiffies; - /* in case of radical cpu bursts, apply the upper bound */ - end_time = task_sched_runtime(current); - if (max_cpu_ratio && end_time > start_time) { - scan_time = end_time - start_time; - expected_jiffies = msecs_to_jiffies( - scan_time_to_sleep(scan_time, max_cpu_ratio)); - - if (expected_jiffies > uksm_sleep_real) - uksm_sleep_real = expected_jiffies; - - /* We have a 1 second up bound for responsiveness. */ - if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC) - uksm_sleep_real = msecs_to_jiffies(1000); - } - - return; -} - -static int ksmd_should_run(void) -{ - return uksm_run & UKSM_RUN_MERGE; -} - -static int uksm_scan_thread(void *nothing) -{ - set_freezable(); - set_user_nice(current, 5); - - while (!kthread_should_stop()) { - mutex_lock(&uksm_thread_mutex); - if (ksmd_should_run()) { - uksm_do_scan(); - } - mutex_unlock(&uksm_thread_mutex); - - try_to_freeze(); - - if (ksmd_should_run()) { - schedule_timeout_interruptible(uksm_sleep_real); - uksm_sleep_times++; - } else { - wait_event_freezable(uksm_thread_wait, - ksmd_should_run() || kthread_should_stop()); - } - } - return 0; -} - -int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) -{ - struct stable_node *stable_node; - struct node_vma *node_vma; - struct rmap_item *rmap_item; - int ret = SWAP_AGAIN; - int search_new_forks = 0; - unsigned long address; - - VM_BUG_ON_PAGE(!PageKsm(page), page); - VM_BUG_ON_PAGE(!PageLocked(page), page); - - stable_node = page_stable_node(page); - if (!stable_node) - return ret; -again: - hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { - hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { - struct anon_vma *anon_vma = rmap_item->anon_vma; - struct anon_vma_chain *vmac; - struct vm_area_struct *vma; - - anon_vma_lock_read(anon_vma); - anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, - 0, ULONG_MAX) { - vma = vmac->vma; - address = get_rmap_addr(rmap_item); - - if (address < vma->vm_start || - address >= vma->vm_end) - continue; - - if ((rmap_item->slot->vma == vma) == - search_new_forks) - continue; - - if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) - continue; - - ret = rwc->rmap_one(page, vma, address, rwc->arg); - if (ret != SWAP_AGAIN) { - anon_vma_unlock_read(anon_vma); - goto out; - } - - if (rwc->done && rwc->done(page)) { - anon_vma_unlock_read(anon_vma); - goto out; - } - } - anon_vma_unlock_read(anon_vma); - } - } - if (!search_new_forks++) - goto again; -out: - return ret; -} - -#ifdef CONFIG_MIGRATION -/* Common ksm interface but may be specific to uksm */ -void ksm_migrate_page(struct page *newpage, struct page *oldpage) -{ - struct stable_node *stable_node; - - VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); - VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); - VM_BUG_ON(newpage->mapping != oldpage->mapping); - - stable_node = page_stable_node(newpage); - if (stable_node) { - 
VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); - stable_node->kpfn = page_to_pfn(newpage); - } -} -#endif /* CONFIG_MIGRATION */ - -#ifdef CONFIG_MEMORY_HOTREMOVE -static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn, - unsigned long end_pfn) -{ - struct rb_node *node; - - for (node = rb_first(root_stable_treep); node; node = rb_next(node)) { - struct stable_node *stable_node; - - stable_node = rb_entry(node, struct stable_node, node); - if (stable_node->kpfn >= start_pfn && - stable_node->kpfn < end_pfn) - return stable_node; - } - return NULL; -} - -static int uksm_memory_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - struct memory_notify *mn = arg; - struct stable_node *stable_node; - - switch (action) { - case MEM_GOING_OFFLINE: - /* - * Keep it very simple for now: just lock out ksmd and - * MADV_UNMERGEABLE while any memory is going offline. - * mutex_lock_nested() is necessary because lockdep was alarmed - * that here we take uksm_thread_mutex inside notifier chain - * mutex, and later take notifier chain mutex inside - * uksm_thread_mutex to unlock it. But that's safe because both - * are inside mem_hotplug_mutex. - */ - mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING); - break; - - case MEM_OFFLINE: - /* - * Most of the work is done by page migration; but there might - * be a few stable_nodes left over, still pointing to struct - * pages which have been offlined: prune those from the tree. - */ - while ((stable_node = uksm_check_stable_tree(mn->start_pfn, - mn->start_pfn + mn->nr_pages)) != NULL) - remove_node_from_stable_tree(stable_node, 1, 1); - /* fallthrough */ - - case MEM_CANCEL_OFFLINE: - mutex_unlock(&uksm_thread_mutex); - break; - } - return NOTIFY_OK; -} -#endif /* CONFIG_MEMORY_HOTREMOVE */ - -#ifdef CONFIG_SYSFS -/* - * This all compiles without CONFIG_SYSFS, but is a waste of space. 
- */ - -#define UKSM_ATTR_RO(_name) \ - static struct kobj_attribute _name##_attr = __ATTR_RO(_name) -#define UKSM_ATTR(_name) \ - static struct kobj_attribute _name##_attr = \ - __ATTR(_name, 0644, _name##_show, _name##_store) - -static ssize_t max_cpu_percentage_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%u\n", uksm_max_cpu_percentage); -} - -static ssize_t max_cpu_percentage_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long max_cpu_percentage; - int err; - - err = kstrtoul(buf, 10, &max_cpu_percentage); - if (err || max_cpu_percentage > 100) - return -EINVAL; - - if (max_cpu_percentage == 100) - max_cpu_percentage = 99; - else if (max_cpu_percentage < 10) - max_cpu_percentage = 10; - - uksm_max_cpu_percentage = max_cpu_percentage; - - return count; -} -UKSM_ATTR(max_cpu_percentage); - -static ssize_t sleep_millisecs_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies)); -} - -static ssize_t sleep_millisecs_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long msecs; - int err; - - err = kstrtoul(buf, 10, &msecs); - if (err || msecs > MSEC_PER_SEC) - return -EINVAL; - - uksm_sleep_jiffies = msecs_to_jiffies(msecs); - uksm_sleep_saved = uksm_sleep_jiffies; - - return count; -} -UKSM_ATTR(sleep_millisecs); - - -static ssize_t cpu_governor_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); - int i; - - buf[0] = '\0'; - for (i = 0; i < n ; i++) { - if (uksm_cpu_governor == i) - strcat(buf, "["); - - strcat(buf, uksm_cpu_governor_str[i]); - - if (uksm_cpu_governor == i) - strcat(buf, "]"); - - strcat(buf, " "); - } - strcat(buf, "\n"); - - return strlen(buf); -} - -static inline void init_performance_values(void) -{ - int i; - struct scan_rung *rung; - struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor; - - - for (i = 0; i < SCAN_LADDER_SIZE; i++) { - rung = uksm_scan_ladder + i; - rung->cpu_ratio = preset->cpu_ratio[i]; - rung->cover_msecs = preset->cover_msecs[i]; - } - - uksm_max_cpu_percentage = preset->max_cpu; -} - -static ssize_t cpu_governor_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); - - for (n--; n >=0 ; n--) { - if (!strncmp(buf, uksm_cpu_governor_str[n], - strlen(uksm_cpu_governor_str[n]))) - break; - } - - if (n < 0) - return -EINVAL; - else - uksm_cpu_governor = n; - - init_performance_values(); - - return count; -} -UKSM_ATTR(cpu_governor); - -static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - return sprintf(buf, "%u\n", uksm_run); -} - -static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - int err; - unsigned long flags; - - err = kstrtoul(buf, 10, &flags); - if (err || flags > UINT_MAX) - return -EINVAL; - if (flags > UKSM_RUN_MERGE) - return -EINVAL; - - mutex_lock(&uksm_thread_mutex); - if (uksm_run != flags) { - uksm_run = flags; - } - mutex_unlock(&uksm_thread_mutex); - - if (flags & UKSM_RUN_MERGE) - wake_up_interruptible(&uksm_thread_wait); - - return count; -} -UKSM_ATTR(run); - -static ssize_t abundant_threshold_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%u\n", 
uksm_abundant_threshold); -} - -static ssize_t abundant_threshold_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - int err; - unsigned long flags; - - err = kstrtoul(buf, 10, &flags); - if (err || flags > 99) - return -EINVAL; - - uksm_abundant_threshold = flags; - - return count; -} -UKSM_ATTR(abundant_threshold); - -static ssize_t thrash_threshold_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%u\n", uksm_thrash_threshold); -} - -static ssize_t thrash_threshold_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - int err; - unsigned long flags; - - err = kstrtoul(buf, 10, &flags); - if (err || flags > 99) - return -EINVAL; - - uksm_thrash_threshold = flags; - - return count; -} -UKSM_ATTR(thrash_threshold); - -static ssize_t cpu_ratios_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - int i, size; - struct scan_rung *rung; - char *p = buf; - - for (i = 0; i < SCAN_LADDER_SIZE; i++) { - rung = &uksm_scan_ladder[i]; - - if (rung->cpu_ratio > 0) - size = sprintf(p, "%d ", rung->cpu_ratio); - else - size = sprintf(p, "MAX/%d ", - TIME_RATIO_SCALE / -rung->cpu_ratio); - - p += size; - } - - *p++ = '\n'; - *p = '\0'; - - return p - buf; -} - -static ssize_t cpu_ratios_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - int i, cpuratios[SCAN_LADDER_SIZE], err; - unsigned long value; - struct scan_rung *rung; - char *p, *end = NULL; - - p = kzalloc(count, GFP_KERNEL); - if (!p) - return -ENOMEM; - - memcpy(p, buf, count); - - for (i = 0; i < SCAN_LADDER_SIZE; i++) { - if (i != SCAN_LADDER_SIZE -1) { - end = strchr(p, ' '); - if (!end) - return -EINVAL; - - *end = '\0'; - } - - if (strstr(p, "MAX/")) { - p = strchr(p, '/') + 1; - err = kstrtoul(p, 10, &value); - if (err || value > TIME_RATIO_SCALE || !value) - return -EINVAL; - - cpuratios[i] = - (int) (TIME_RATIO_SCALE / value); - } else { - err = kstrtoul(p, 10, &value); - if (err || value > TIME_RATIO_SCALE || !value) - return -EINVAL; - - cpuratios[i] = value; - } - - p = end + 1; - } - - for (i = 0; i < SCAN_LADDER_SIZE; i++) { - rung = &uksm_scan_ladder[i]; - - rung->cpu_ratio = cpuratios[i]; - } - - return count; -} -UKSM_ATTR(cpu_ratios); - -static ssize_t eval_intervals_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - int i, size; - struct scan_rung *rung; - char *p = buf; - - for (i = 0; i < SCAN_LADDER_SIZE; i++) { - rung = &uksm_scan_ladder[i]; - size = sprintf(p, "%u ", rung->cover_msecs); - p += size; - } - - *p++ = '\n'; - *p = '\0'; - - return p - buf; -} - -static ssize_t eval_intervals_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - int i, err; - unsigned long values[SCAN_LADDER_SIZE]; - struct scan_rung *rung; - char *p, *end = NULL; - - p = kzalloc(count, GFP_KERNEL); - if (!p) - return -ENOMEM; - - memcpy(p, buf, count); - - for (i = 0; i < SCAN_LADDER_SIZE; i++) { - if (i != SCAN_LADDER_SIZE -1) { - end = strchr(p, ' '); - if (!end) - return -EINVAL; - - *end = '\0'; - } - - err = kstrtoul(p, 10, &values[i]); - if (err) - return -EINVAL; - - p = end + 1; - } - - for (i = 0; i < SCAN_LADDER_SIZE; i++) { - rung = &uksm_scan_ladder[i]; - - rung->cover_msecs = values[i]; - } - - return count; -} -UKSM_ATTR(eval_intervals); - -static ssize_t ema_per_page_time_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, 
"%lu\n", uksm_ema_page_time); -} -UKSM_ATTR_RO(ema_per_page_time); - -static ssize_t pages_shared_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%lu\n", uksm_pages_shared); -} -UKSM_ATTR_RO(pages_shared); - -static ssize_t pages_sharing_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%lu\n", uksm_pages_sharing); -} -UKSM_ATTR_RO(pages_sharing); - -static ssize_t pages_unshared_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%lu\n", uksm_pages_unshared); -} -UKSM_ATTR_RO(pages_unshared); - -static ssize_t full_scans_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%llu\n", fully_scanned_round); -} -UKSM_ATTR_RO(full_scans); - -static ssize_t pages_scanned_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - unsigned long base = 0; - u64 delta, ret; - - if (pages_scanned_stored) { - base = pages_scanned_base; - ret = pages_scanned_stored; - delta = uksm_pages_scanned >> base; - if (CAN_OVERFLOW_U64(ret, delta)) { - ret >>= 1; - delta >>= 1; - base++; - ret += delta; - } - } else { - ret = uksm_pages_scanned; - } - - while (ret > ULONG_MAX) { - ret >>= 1; - base++; - } - - if (base) - return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base); - else - return sprintf(buf, "%lu\n", (unsigned long)ret); -} -UKSM_ATTR_RO(pages_scanned); - -static ssize_t hash_strength_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%lu\n", hash_strength); -} -UKSM_ATTR_RO(hash_strength); - -static ssize_t sleep_times_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%llu\n", uksm_sleep_times); -} -UKSM_ATTR_RO(sleep_times); - - -static struct attribute *uksm_attrs[] = { - &max_cpu_percentage_attr.attr, - &sleep_millisecs_attr.attr, - &cpu_governor_attr.attr, - &run_attr.attr, - &ema_per_page_time_attr.attr, - &pages_shared_attr.attr, - &pages_sharing_attr.attr, - &pages_unshared_attr.attr, - &full_scans_attr.attr, - &pages_scanned_attr.attr, - &hash_strength_attr.attr, - &sleep_times_attr.attr, - &thrash_threshold_attr.attr, - &abundant_threshold_attr.attr, - &cpu_ratios_attr.attr, - &eval_intervals_attr.attr, - NULL, -}; - -static struct attribute_group uksm_attr_group = { - .attrs = uksm_attrs, - .name = "uksm", -}; -#endif /* CONFIG_SYSFS */ - -static inline void init_scan_ladder(void) -{ - int i; - struct scan_rung *rung; - - for (i = 0; i < SCAN_LADDER_SIZE; i++) { - rung = uksm_scan_ladder + i; - slot_tree_init_root(&rung->vma_root); - } - - init_performance_values(); - uksm_calc_scan_pages(); -} - -static inline int cal_positive_negative_costs(void) -{ - struct page *p1, *p2; - unsigned char *addr1, *addr2; - unsigned long i, time_start, hash_cost; - unsigned long loopnum = 0; - - /*IMPORTANT: volatile is needed to prevent over-optimization by gcc. 
*/ - volatile u32 hash; - volatile int ret; - - p1 = alloc_page(GFP_KERNEL); - if (!p1) - return -ENOMEM; - - p2 = alloc_page(GFP_KERNEL); - if (!p2) - return -ENOMEM; - - addr1 = kmap_atomic(p1); - addr2 = kmap_atomic(p2); - memset(addr1, prandom_u32(), PAGE_SIZE); - memcpy(addr2, addr1, PAGE_SIZE); - - /* make sure that the two pages differ in last byte */ - addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1]; - kunmap_atomic(addr2); - kunmap_atomic(addr1); - - time_start = jiffies; - while (jiffies - time_start < 100) { - for (i = 0; i < 100; i++) - hash = page_hash(p1, HASH_STRENGTH_FULL, 0); - loopnum += 100; - } - hash_cost = (jiffies - time_start); - - time_start = jiffies; - for (i = 0; i < loopnum; i++) - ret = pages_identical(p1, p2); - memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start); - memcmp_cost /= hash_cost; - printk(KERN_INFO "UKSM: relative memcmp_cost = %lu " - "hash=%u cmp_ret=%d.\n", - memcmp_cost, hash, ret); - - __free_page(p1); - __free_page(p2); - return 0; -} - -static int init_zeropage_hash_table(void) -{ - struct page *page; - char *addr; - int i; - - page = alloc_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - - addr = kmap_atomic(page); - memset(addr, 0, PAGE_SIZE); - kunmap_atomic(addr); - - zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32), - GFP_KERNEL); - if (!zero_hash_table) - return -ENOMEM; - - for (i = 0; i < HASH_STRENGTH_MAX; i++) - zero_hash_table[i] = page_hash(page, i, 0); - - __free_page(page); - - return 0; -} - -static inline int init_random_sampling(void) -{ - unsigned long i; - random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!random_nums) - return -ENOMEM; - - for (i = 0; i < HASH_STRENGTH_FULL; i++) - random_nums[i] = i; - - for (i = 0; i < HASH_STRENGTH_FULL; i++) { - unsigned long rand_range, swap_index, tmp; - - rand_range = HASH_STRENGTH_FULL - i; - swap_index = i + prandom_u32() % rand_range; - tmp = random_nums[i]; - random_nums[i] = random_nums[swap_index]; - random_nums[swap_index] = tmp; - } - - rshash_state.state = RSHASH_NEW; - rshash_state.below_count = 0; - rshash_state.lookup_window_index = 0; - - return cal_positive_negative_costs(); -} - -static int __init uksm_slab_init(void) -{ - rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0); - if (!rmap_item_cache) - goto out; - - stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0); - if (!stable_node_cache) - goto out_free1; - - node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0); - if (!node_vma_cache) - goto out_free2; - - vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0); - if (!vma_slot_cache) - goto out_free3; - - tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0); - if (!tree_node_cache) - goto out_free4; - - return 0; - -out_free4: - kmem_cache_destroy(vma_slot_cache); -out_free3: - kmem_cache_destroy(node_vma_cache); -out_free2: - kmem_cache_destroy(stable_node_cache); -out_free1: - kmem_cache_destroy(rmap_item_cache); -out: - return -ENOMEM; -} - -static void __init uksm_slab_free(void) -{ - kmem_cache_destroy(stable_node_cache); - kmem_cache_destroy(rmap_item_cache); - kmem_cache_destroy(node_vma_cache); - kmem_cache_destroy(vma_slot_cache); - kmem_cache_destroy(tree_node_cache); -} - -/* Common interface to ksm, different to it. 
*/ -int ksm_madvise(struct vm_area_struct *vma, unsigned long start, - unsigned long end, int advice, unsigned long *vm_flags) -{ - int err; - - switch (advice) { - case MADV_MERGEABLE: - return 0; /* just ignore the advice */ - - case MADV_UNMERGEABLE: - if (!(*vm_flags & VM_MERGEABLE)) - return 0; /* just ignore the advice */ - - if (vma->anon_vma) { - err = unmerge_uksm_pages(vma, start, end); - if (err) - return err; - } - - uksm_remove_vma(vma); - *vm_flags &= ~VM_MERGEABLE; - break; - } - - return 0; -} - -/* Common interface to ksm, actually the same. */ -struct page *ksm_might_need_to_copy(struct page *page, - struct vm_area_struct *vma, unsigned long address) -{ - struct anon_vma *anon_vma = page_anon_vma(page); - struct page *new_page; - - if (PageKsm(page)) { - if (page_stable_node(page)) - return page; /* no need to copy it */ - } else if (!anon_vma) { - return page; /* no need to copy it */ - } else if (anon_vma->root == vma->anon_vma->root && - page->index == linear_page_index(vma, address)) { - return page; /* still no need to copy it */ - } - if (!PageUptodate(page)) - return page; /* let do_swap_page report the error */ - - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (new_page) { - copy_user_highpage(new_page, page, address, vma); - - SetPageDirty(new_page); - __SetPageUptodate(new_page); - __set_page_locked(new_page); - } - - return new_page; -} - -static int __init uksm_init(void) -{ - struct task_struct *uksm_thread; - int err; - - uksm_sleep_jiffies = msecs_to_jiffies(100); - uksm_sleep_saved = uksm_sleep_jiffies; - - slot_tree_init(); - init_scan_ladder(); - - - err = init_random_sampling(); - if (err) - goto out_free2; - - err = uksm_slab_init(); - if (err) - goto out_free1; - - err = init_zeropage_hash_table(); - if (err) - goto out_free0; - - uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd"); - if (IS_ERR(uksm_thread)) { - printk(KERN_ERR "uksm: creating kthread failed\n"); - err = PTR_ERR(uksm_thread); - goto out_free; - } - -#ifdef CONFIG_SYSFS - err = sysfs_create_group(mm_kobj, &uksm_attr_group); - if (err) { - printk(KERN_ERR "uksm: register sysfs failed\n"); - kthread_stop(uksm_thread); - goto out_free; - } -#else - uksm_run = UKSM_RUN_MERGE; /* no way for user to start it */ - -#endif /* CONFIG_SYSFS */ - -#ifdef CONFIG_MEMORY_HOTREMOVE - /* - * Choose a high priority since the callback takes uksm_thread_mutex: - * later callbacks could only be taking locks which nest within that. - */ - hotplug_memory_notifier(uksm_memory_callback, 100); -#endif - return 0; - -out_free: - kfree(zero_hash_table); -out_free0: - uksm_slab_free(); -out_free1: - kfree(random_nums); -out_free2: - kfree(uksm_scan_ladder); - return err; -} - -#ifdef MODULE -subsys_initcall(ksm_init); -#else -late_initcall(uksm_init); -#endif - diff --git a/mm/vmstat.c b/mm/vmstat.c index c301bd6e0..084c6725b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -764,9 +764,6 @@ const char * const vmstat_text[] = { "nr_anon_transparent_hugepages", "nr_free_cma", -#ifdef CONFIG_UKSM - "nr_uksm_zero_pages", -#endif /* enum writeback_stat_item counters */ "nr_dirty_threshold", "nr_dirty_background_threshold", |
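For reference when reading the removal above: the scan-pacing heuristic that the deleted ema()/uksm_do_scan() code implemented can be sketched in a few lines of plain user-space C. This is only an illustrative reconstruction, not kernel API; EMA_ALPHA below is an assumed weight, not the value the deleted file defined.

/*
 * Sketch of the per-page scan-cost smoothing used by the removed code:
 * an exponential moving average, with >10x bursts approached only by
 * doubling so a single outlier cannot blow up the estimate.
 */
#include <stdio.h>

#define EMA_ALPHA 20	/* assumed smoothing weight, in percent */

static unsigned long ema(unsigned long curr, unsigned long last_ema)
{
	if (curr > last_ema * 10)	/* radical burst: approach geometrically */
		return last_ema * 2;

	return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100;
}

int main(void)
{
	unsigned long samples[] = { 900, 1100, 950, 40000, 1000 };
	unsigned long est = 1000;	/* starting estimate, ns per page */
	unsigned long i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		est = ema(samples[i], est);
		printf("sample %lu -> smoothed estimate %lu ns/page\n",
		       samples[i], est);
	}
	return 0;
}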