/*
 * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
 *
 * This is an improvement upon KSM. Some basic data structures and routines
 * are borrowed from ksm.c.
 *
 * Its new features:
 * 1. Full system scan:
 *      It automatically scans all user processes' anonymous VMAs. Kernel-user
 *      interaction to submit a memory area to KSM is no longer needed.
 *
 * 2. Rich area detection:
 *      It automatically detects rich areas containing abundant duplicated
 *      pages. Rich areas are given a full scan speed. Poor areas are
 *      sampled at a reasonable speed with very low CPU consumption.
 *
 * 3. Ultra per-page scan speed improvement:
 *      A new hash algorithm is proposed. As a result, on a machine with a
 *      Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory,
 *      it can scan memory areas that do not contain duplicated pages at a
 *      speed of 627MB/sec ~ 2445MB/sec and can merge duplicated areas at a
 *      speed of 477MB/sec ~ 923MB/sec.
 *
 * 4. Thrashing area avoidance:
 *      Thrashing areas (VMAs that have frequent KSM page break-outs) can be
 *      filtered out. My benchmark shows it's more efficient than KSM's
 *      per-page hash-value-based volatile page detection.
 *
 *
 * 5. Misc changes upon KSM:
 *      * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned
 *        page comparison. It's much faster than the default C version on x86.
 *      * rmap_item now has a struct page * member to loosely cache an
 *        address-->page mapping, which avoids many time-costly
 *        follow_page() calls.
 *      * The VMA creation/exit procedures are hooked to let Ultra KSM know.
 *      * try_to_merge_two_pages() now can revert a pte if it fails. No
 *        break_ksm is needed for this case.
 *
 * 6. Full zero page consideration (contributed by Figo Zhang)
 *      Now uksmd considers full zero pages as special pages and merges them
 *      into a special unswappable uksm zero page.
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "internal.h"

#ifdef CONFIG_X86
#undef memcmp

#ifdef CONFIG_X86_32
#define memcmp memcmpx86_32
/*
 * Compare 4-byte-aligned addresses s1 and s2, with length n
 */
int memcmpx86_32(void *s1, void *s2, size_t n)
{
	size_t num = n / 4;
	register int res;

	__asm__ __volatile__
	(
	 "testl %3,%3\n\t"
	 "repe; cmpsd\n\t"
	 "je 1f\n\t"
	 "sbbl %0,%0\n\t"
	 "orl $1,%0\n"
	 "1:"
	 : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
	 : "0" (0)
	 : "cc");

	return res;
}

/*
 * Check if the page is all zero?
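 *
 * This uses "repe; scasl" to compare the page against %eax == 0 four bytes
 * at a time; ZF is still set when the loop ends iff every word matched, and
 * "sete" copies that flag into the return value. The same idea is used with
 * "scasq" in the 64-bit variant below.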
 */
static int is_full_zero(const void *s1, size_t len)
{
	unsigned char same;

	len /= 4;

	__asm__ __volatile__
	("repe; scasl;"
	 "sete %0"
	 : "=qm" (same), "+D" (s1), "+c" (len)
	 : "a" (0)
	 : "cc");

	return same;
}

#elif defined(CONFIG_X86_64)
#define memcmp memcmpx86_64
/*
 * Compare 8-byte-aligned addresses s1 and s2, with length n
 */
int memcmpx86_64(void *s1, void *s2, size_t n)
{
	size_t num = n / 8;
	register int res;

	__asm__ __volatile__
	(
	 "testq %q3,%q3\n\t"
	 "repe; cmpsq\n\t"
	 "je 1f\n\t"
	 "sbbq %q0,%q0\n\t"
	 "orq $1,%q0\n"
	 "1:"
	 : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
	 : "0" (0)
	 : "cc");

	return res;
}

static int is_full_zero(const void *s1, size_t len)
{
	unsigned char same;

	len /= 8;

	__asm__ __volatile__
	("repe; scasq;"
	 "sete %0"
	 : "=qm" (same), "+D" (s1), "+c" (len)
	 : "a" (0)
	 : "cc");

	return same;
}

#endif
#else
static int is_full_zero(const void *s1, size_t len)
{
	/* Generic C fallback; keep const on the scan pointer. */
	const unsigned long *src = s1;
	int i;

	len /= sizeof(*src);

	for (i = 0; i < len; i++) {
		if (src[i])
			return 0;
	}

	return 1;
}
#endif

#define UKSM_RUNG_ROUND_FINISHED	(1 << 0)
#define TIME_RATIO_SCALE		10000

#define SLOT_TREE_NODE_SHIFT		8
#define SLOT_TREE_NODE_STORE_SIZE	(1UL << SLOT_TREE_NODE_SHIFT)
struct slot_tree_node {
	unsigned long size;
	struct sradix_tree_node snode;
	void *stores[SLOT_TREE_NODE_STORE_SIZE];
};

static struct kmem_cache *slot_tree_node_cachep;

static struct sradix_tree_node *slot_tree_node_alloc(void)
{
	struct slot_tree_node *p;

	p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL |
			      __GFP_NORETRY | __GFP_NOWARN);
	if (!p)
		return NULL;

	return &p->snode;
}

static void slot_tree_node_free(struct sradix_tree_node *node)
{
	struct slot_tree_node *p;

	p = container_of(node, struct slot_tree_node, snode);
	kmem_cache_free(slot_tree_node_cachep, p);
}

static void slot_tree_node_extend(struct sradix_tree_node *parent,
				  struct sradix_tree_node *child)
{
	struct slot_tree_node *p, *c;

	p = container_of(parent, struct slot_tree_node, snode);
	c = container_of(child, struct slot_tree_node, snode);

	p->size += c->size;
}

void slot_tree_node_assign(struct sradix_tree_node *node,
			   unsigned index, void *item)
{
	struct vma_slot *slot = item;
	struct slot_tree_node *cur;

	slot->snode = node;
	slot->sindex = index;

	while (node) {
		cur = container_of(node, struct slot_tree_node, snode);
		cur->size += slot->pages;
		node = node->parent;
	}
}

void slot_tree_node_rm(struct sradix_tree_node *node, unsigned offset)
{
	struct vma_slot *slot;
	struct slot_tree_node *cur;
	unsigned long pages;

	if (node->height == 1) {
		slot = node->stores[offset];
		pages = slot->pages;
	} else {
		cur = container_of(node->stores[offset],
				   struct slot_tree_node, snode);
		pages = cur->size;
	}

	while (node) {
		cur = container_of(node, struct slot_tree_node, snode);
		cur->size -= pages;
		node = node->parent;
	}
}

unsigned long slot_iter_index;
int slot_iter(void *item, unsigned long height)
{
	struct slot_tree_node *node;
	struct vma_slot *slot;

	if (height == 1) {
		slot = item;
		if (slot_iter_index < slot->pages) {
			/* in this one */
			return 1;
		} else {
			slot_iter_index -= slot->pages;
			return 0;
		}
	} else {
		node = container_of(item, struct slot_tree_node, snode);
		if (slot_iter_index < node->size) {
			/* in this one */
			return 1;
		} else {
			slot_iter_index -= node->size;
			return 0;
		}
	}
}


static inline void slot_tree_init_root(struct sradix_tree_root *root)
{
	init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT);
	root->alloc = slot_tree_node_alloc;
	root->free = slot_tree_node_free;
	root->extend = slot_tree_node_extend;
	root->assign = slot_tree_node_assign;
	root->rm = slot_tree_node_rm;
}

void slot_tree_init(void)
{
	slot_tree_node_cachep = kmem_cache_create("slot_tree_node",
			sizeof(struct slot_tree_node), 0,
			SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, NULL);
}

/* Each rung of this ladder is a list of VMAs having the same scan ratio */
struct scan_rung {
	//struct list_head scanned_list;
	struct sradix_tree_root vma_root;
	struct sradix_tree_root vma_root2;

	struct vma_slot *current_scan;
	unsigned long current_offset;

	/*
	 * The initial value for current_offset, it should loop over
	 * [0 ~ step - 1] to let every slot have a chance to be scanned.
	 */
	unsigned long offset_init;
	unsigned long step; /* dynamic step for current_offset */
	unsigned int flags;
	unsigned long pages_to_scan;
	//unsigned long fully_scanned_slots;
	/*
	 * a little bit tricky - if cpu_time_ratio > 0, then the value is
	 * the cpu time ratio it can spend in rung_i for every scan
	 * period. if < 0, then it is the cpu time ratio relative to the
	 * max cpu percentage user specified. Both in unit of
	 * 1/TIME_RATIO_SCALE
	 */
	int cpu_ratio;

	/*
	 * How long will it take for all slots in this rung to be fully
	 * scanned? If it's zero, we don't care about the cover time:
	 * it's fully scanned.
	 */
	unsigned int cover_msecs;
	//unsigned long vma_num;
	//unsigned long pages; /* Sum of all slot's pages in rung */
};

/**
 * node of either the stable or unstable rbtree
 *
 */
struct tree_node {
	struct rb_node node; /* link in the main (un)stable rbtree */
	struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
	u32 hash;
	unsigned long count; /* TODO: merged with sub_root */
	struct list_head all_list; /* all tree nodes in stable/unstable tree */
};

/**
 * struct stable_node - node of the stable rbtree
 * @node: rb node of this ksm page in the stable tree
 * @hlist: hlist head of rmap_items using this ksm page
 * @kpfn: page frame number of this ksm page
 */
struct stable_node {
	struct rb_node node; /* link in sub-rbtree */
	struct tree_node *tree_node; /* its tree_node root in the stable
				      * tree, NULL if it's in the hell list */
	struct hlist_head hlist;
	unsigned long kpfn;
	u32 hash_max; /* if == 0 then it has not been calculated yet */
	struct list_head all_list; /* in a list for all stable nodes */
};

/**
 * struct node_vma - group rmap_items linked in the same stable
 * node together.
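 *
 * The slot pointer doubles as the sort key on the stable_node's hlist, so
 * a lookup can stop early once the keys grow past the slot being searched
 * for, as stable_tree_append() essentially does:
 *
 *	hlist_for_each_entry(node_vma, &stable_node->hlist, hlist)
 *		if (node_vma->key >= (unsigned long)rmap_item->slot)
 *			break;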
 */
struct node_vma {
	union {
		struct vma_slot *slot;
		unsigned long key;  /* slot is used as key sorted on hlist */
	};
	struct hlist_node hlist;
	struct hlist_head rmap_hlist;
	struct stable_node *head;
};

/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @slot: the vma_slot of the vma this rmap_item belongs to
 * @page: loosely cached page for @address, to save follow_page() calls
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @hash_round: the hash round in which this item entered the unstable tree
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to the node_vma heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that node_vma
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 */
struct rmap_item {
	struct vma_slot *slot;
	struct page *page;
	unsigned long address;	/* + low bits used for flags below */
	unsigned long hash_round;
	unsigned long entry_index;
	union {
		struct {/* when in unstable tree */
			struct rb_node node;
			struct tree_node *tree_node;
			u32 hash_max;
		};
		struct { /* when in stable tree */
			struct node_vma *head;
			struct hlist_node hlist;
			struct anon_vma *anon_vma;
		};
	};
} __attribute__((aligned(4)));

struct rmap_list_entry {
	union {
		struct rmap_item *item;
		unsigned long addr;
	};
	/* lowest bit is used for is_addr tag */
} __attribute__((aligned(4))); /* 4-byte aligned to fit into pages */


/* Basic data structure definition ends */


/*
 * Flags for rmap_item to indicate whether it's listed in the stable/unstable
 * tree. The flags use the low bits of rmap_item.address
 */
#define UNSTABLE_FLAG	0x1
#define STABLE_FLAG	0x2

#define get_rmap_addr(x)	((x)->address & PAGE_MASK)

/*
 * rmap_list_entry helpers
 */
#define IS_ADDR_FLAG	1
#define is_addr(ptr)		((unsigned long)(ptr) & IS_ADDR_FLAG)
#define set_is_addr(ptr)	((ptr) |= IS_ADDR_FLAG)
#define get_clean_addr(ptr)	(((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))


/*
 * High speed caches for frequently allocated and freed structs
 */
static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *node_vma_cache;
static struct kmem_cache *vma_slot_cache;
static struct kmem_cache *tree_node_cache;
#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
		(__flags), NULL)

/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */
#define SCAN_LADDER_SIZE 4
static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE];

/* The evaluation rounds uksmd has finished */
static unsigned long long uksm_eval_round = 1;

/*
 * we add 1 to this var when we consider we should rebuild the whole
 * unstable tree.
 */
static unsigned long uksm_hash_round = 1;

/*
 * How many times the whole memory is scanned.
 */
static unsigned long long fully_scanned_round = 1;

/* The total number of virtual pages of all vma slots */
static u64 uksm_pages_total;

/* The number of pages that have been scanned since startup */
static u64 uksm_pages_scanned;

static u64 scanned_virtual_pages;

/* The number of pages that have been scanned since the last
 * encode_benefit call */
static u64 uksm_pages_scanned_last;

/* If the scanned number gets too large, we encode it here */
static u64 pages_scanned_stored;

static unsigned long pages_scanned_base;

/* The number of nodes in the stable tree */
static unsigned long uksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long uksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long uksm_pages_unshared;

/*
 * Milliseconds ksmd should sleep between scans,
 * >= 100ms to be consistent with
 * scan_time_to_sleep_msec()
 */
static unsigned int uksm_sleep_jiffies;

/* The real value for the uksmd next sleep */
static unsigned int uksm_sleep_real;

/* Saved value for user input uksm_sleep_jiffies when it's enlarged */
static unsigned int uksm_sleep_saved;

/* Max percentage of cpu utilization ksmd can take to scan in one batch */
static unsigned int uksm_max_cpu_percentage;

static int uksm_cpu_governor;

static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" };

struct uksm_cpu_preset_s {
	int cpu_ratio[SCAN_LADDER_SIZE];
	unsigned int cover_msecs[SCAN_LADDER_SIZE];
	unsigned int max_cpu; /* percentage */
};

struct uksm_cpu_preset_s uksm_cpu_preset[4] = {
	{ {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95},
	{ {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50},
	{ {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20},
	{ {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1},
};

/* The default value for uksm_ema_page_time if it's not initialized */
#define UKSM_PAGE_TIME_DEFAULT	500

/* cost to scan one page by exponential moving average in nsecs */
static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;

/* The exponential moving average alpha weight, in percentage. */
#define EMA_ALPHA	20

/*
 * The threshold used to filter out thrashing areas.
 * If it == 0, filtering is disabled, otherwise it's the percentage upper
 * bound of the thrashing ratio of all areas. Any area with a bigger
 * thrashing ratio will be considered as having a zero duplication ratio.
 */
static unsigned int uksm_thrash_threshold = 50;

/* The dedup ratio at which an area is considered abundant */
static unsigned int uksm_abundant_threshold = 10;

/* All slots having merged pages in this eval round. */
struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup);

/* How many times the ksmd has slept since startup */
static unsigned long long uksm_sleep_times;

#define UKSM_RUN_STOP	0
#define UKSM_RUN_MERGE	1
static unsigned int uksm_run = 1;

static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait);
static DEFINE_MUTEX(uksm_thread_mutex);

/*
 * List vma_slot_new is for newly created vma_slot waiting to be added by
 * ksmd. If one cannot be added (e.g. because it's too small), it's moved
 * to vma_slot_noadd. vma_slot_del is the list for vma_slot whose
 * corresponding VMA has been removed/freed.
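 *
 * The slot lifecycle, roughly:
 *
 *	uksm_vma_add_new()		-> vma_slot_new
 *	picked up by ksmd		-> scan ladder (slot_in_uksm() true)
 *	rejected by ksmd		-> vma_slot_noadd
 *	uksm_remove_vma() when in
 *	the ladder			-> vma_slot_del, freed later by ksmd
 *	uksm_remove_vma() while still
 *	on vma_slot_new			-> freed directly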
 */
struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
static DEFINE_SPINLOCK(vma_slot_list_lock);

/* The unstable tree heads */
static struct rb_root root_unstable_tree = RB_ROOT;

/*
 * All tree_nodes are in a list to be freed at once when unstable tree is
 * freed after each scan round.
 */
static struct list_head unstable_tree_node_list =
				LIST_HEAD_INIT(unstable_tree_node_list);

/* List contains all stable nodes */
static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);

/*
 * When the hash strength is changed, the stable tree must be delta_hashed
 * and re-structured. We use two sets of the structs below to speed up the
 * re-structuring of the stable tree.
 */
static struct list_head
stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
			    LIST_HEAD_INIT(stable_tree_node_list[1])};

static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
static struct rb_root *root_stable_treep = &root_stable_tree[0];
static unsigned long stable_tree_index;

/* The hash strength needed to hash a full page */
#define HASH_STRENGTH_FULL		(PAGE_SIZE / sizeof(u32))

/* The hash strength needed for loop-back hashing */
#define HASH_STRENGTH_MAX		(HASH_STRENGTH_FULL + 10)

/* The random offsets in a page */
static u32 *random_nums;

/* The hash strength */
static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;

/* The delta value each time the hash strength increases or decreases */
static unsigned long hash_strength_delta;
#define HASH_STRENGTH_DELTA_MAX	5

/* The time we have saved due to random_sample_hash */
static u64 rshash_pos;

/* The time we have wasted due to hash collision */
static u64 rshash_neg;

struct uksm_benefit {
	u64 pos;
	u64 neg;
	u64 scanned;
	unsigned long base;
} benefit;

/*
 * The relative cost of memcmp, compared to 1 time unit of random sample
 * hash. This value is tested when the ksm module is initialized.
 */
static unsigned long memcmp_cost;

static unsigned long rshash_neg_cont_zero;
static unsigned long rshash_cont_obscure;

/* The possible states of hash strength adjustment heuristic */
enum rshash_states {
	RSHASH_STILL,
	RSHASH_TRYUP,
	RSHASH_TRYDOWN,
	RSHASH_NEW,
	RSHASH_PRE_STILL,
};

/* The possible direction we are about to adjust hash strength */
enum rshash_direct {
	GO_UP,
	GO_DOWN,
	OBSCURE,
	STILL,
};

/* random sampling hash state machine */
static struct {
	enum rshash_states state;
	enum rshash_direct pre_direct;
	u8 below_count;
	/* Keep a lookup window of size 5; if above_count/below_count > 3
	 * in this window, we stop trying.
	 */
	u8 lookup_window_index;
	u64 stable_benefit;
	unsigned long turn_point_down;
	unsigned long turn_benefit_down;
	unsigned long turn_point_up;
	unsigned long turn_benefit_up;
	unsigned long stable_point;
} rshash_state;

/* zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX] */
static u32 *zero_hash_table;

static inline struct node_vma *alloc_node_vma(void)
{
	struct node_vma *node_vma;

	node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL |
				     __GFP_NORETRY | __GFP_NOWARN);
	if (node_vma) {
		INIT_HLIST_HEAD(&node_vma->rmap_hlist);
		INIT_HLIST_NODE(&node_vma->hlist);
	}
	return node_vma;
}

static inline void free_node_vma(struct node_vma *node_vma)
{
	kmem_cache_free(node_vma_cache, node_vma);
}

static inline struct vma_slot *alloc_vma_slot(void)
{
	struct vma_slot *slot;

	/*
	 * In case uksm is not initialized by now.
	 * Oops, we need to consider the call site of uksm_init() in the
	 * future.
	 */
	if (!vma_slot_cache)
		return NULL;

	slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL |
				 __GFP_NORETRY | __GFP_NOWARN);
	if (slot) {
		INIT_LIST_HEAD(&slot->slot_list);
		INIT_LIST_HEAD(&slot->dedup_list);
		slot->flags |= UKSM_SLOT_NEED_RERAND;
	}
	return slot;
}

static inline void free_vma_slot(struct vma_slot *vma_slot)
{
	kmem_cache_free(vma_slot_cache, vma_slot);
}

static inline struct rmap_item *alloc_rmap_item(void)
{
	struct rmap_item *rmap_item;

	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
				      __GFP_NORETRY | __GFP_NOWARN);
	if (rmap_item) {
		/* bug if the lowest bit is set; it's reserved for flag use */
		BUG_ON(is_addr(rmap_item));
	}
	return rmap_item;
}

static inline void free_rmap_item(struct rmap_item *rmap_item)
{
	rmap_item->slot = NULL;	/* debug safety */
	kmem_cache_free(rmap_item_cache, rmap_item);
}

static inline struct stable_node *alloc_stable_node(void)
{
	struct stable_node *node;

	/* ksmd context: a sleeping allocation is fine here */
	node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL);
	if (!node)
		return NULL;

	INIT_HLIST_HEAD(&node->hlist);
	list_add(&node->all_list, &stable_node_list);
	return node;
}

static inline void free_stable_node(struct stable_node *stable_node)
{
	list_del(&stable_node->all_list);
	kmem_cache_free(stable_node_cache, stable_node);
}

static inline struct tree_node *alloc_tree_node(struct list_head *list)
{
	struct tree_node *node;

	node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL);
	if (!node)
		return NULL;

	list_add(&node->all_list, list);
	return node;
}

static inline void free_tree_node(struct tree_node *node)
{
	list_del(&node->all_list);
	kmem_cache_free(tree_node_cache, node);
}

static void uksm_drop_anon_vma(struct rmap_item *rmap_item)
{
	struct anon_vma *anon_vma = rmap_item->anon_vma;

	put_anon_vma(anon_vma);
}


/**
 * Remove a stable node from stable_tree, may unlink from its tree_node and
 * may remove its parent tree_node if no other stable node is pending.
 *
 * @stable_node	     The node to be removed
 * @unlink_rb	     Will this node be unlinked from the rbtree?
 * @remove_tree_node Will its tree_node be removed if empty?
 */
static void remove_node_from_stable_tree(struct stable_node *stable_node,
					 int unlink_rb, int remove_tree_node)
{
	struct node_vma *node_vma;
	struct rmap_item *rmap_item;
	struct hlist_node *n;

	if (!hlist_empty(&stable_node->hlist)) {
		hlist_for_each_entry_safe(node_vma, n,
					  &stable_node->hlist, hlist) {
			hlist_for_each_entry(rmap_item,
					     &node_vma->rmap_hlist, hlist) {
				uksm_pages_sharing--;

				uksm_drop_anon_vma(rmap_item);
				rmap_item->address &= PAGE_MASK;
			}
			free_node_vma(node_vma);
			cond_resched();
		}

		/* the last one is counted as shared */
		uksm_pages_shared--;
		uksm_pages_sharing++;
	}

	if (stable_node->tree_node && unlink_rb) {
		rb_erase(&stable_node->node,
			 &stable_node->tree_node->sub_root);

		if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
		    remove_tree_node) {
			rb_erase(&stable_node->tree_node->node,
				 root_stable_treep);
			free_tree_node(stable_node->tree_node);
		} else {
			stable_node->tree_node->count--;
		}
	}

	free_stable_node(stable_node);
}


/*
 * get_uksm_page: checks if the page indicated by the stable node
 * is still its ksm page, despite having held no reference to it.
 * If so, we can trust the content of the page, and it
 * returns the gotten page; but if the page has now been zapped,
 * remove the stale node from the stable tree and return NULL.
 *
 * You would expect the stable_node to hold a reference to the ksm page.
 * But if it increments the page's count, swapping out has to wait for
 * ksmd to come around again before it can free the page, which may take
 * seconds or even minutes: much too unresponsive. So instead we use a
 * "keyhole reference": access to the ksm page from the stable node peeps
 * out through its keyhole to see if that page still holds the right key,
 * pointing back to this stable node. This relies on freeing a PageAnon
 * page to reset its page->mapping to NULL, and relies on no other use of
 * a page to put something that might look like our key in page->mapping.
 *
 * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
 * but this is different - made simpler by uksm_thread_mutex being held, but
 * interesting for assuming that no other use of the struct page could ever
 * put our expected_mapping into page->mapping (or a field of the union which
 * coincides with page->mapping). The RCU calls are not for KSM at all, but
 * to keep the page_count protocol described with page_cache_get_speculative.
 *
 * Note: it is possible that get_uksm_page() will return NULL one moment,
 * then page the next, if the page is in between page_freeze_refs() and
 * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
 * is on its way to being freed; but it is an anomaly to bear in mind.
 *
 * @unlink_rb: whether the removal of this node should first unlink it from
 * its rbtree. stable_node_reinsert will prevent this when restructuring the
 * node from its old tree.
 *
 * @remove_tree_node: if this is the last one of its tree_node, will the
 * tree_node be freed? If we are inserting a stable node, this tree_node may
 * be reused, so don't free it.
 */
static struct page *get_uksm_page(struct stable_node *stable_node,
				  int unlink_rb, int remove_tree_node)
{
	struct page *page;
	void *expected_mapping;

	page = pfn_to_page(stable_node->kpfn);
	expected_mapping = (void *)((unsigned long)stable_node |
				    PAGE_MAPPING_KSM);
	rcu_read_lock();
	if (page->mapping != expected_mapping)
		goto stale;
	if (!get_page_unless_zero(page))
		goto stale;
	if (page->mapping != expected_mapping) {
		put_page(page);
		goto stale;
	}
	rcu_read_unlock();
	return page;
stale:
	rcu_read_unlock();
	remove_node_from_stable_tree(stable_node, unlink_rb,
				     remove_tree_node);

	return NULL;
}

/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree.
 */
static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
	if (rmap_item->address & STABLE_FLAG) {
		struct stable_node *stable_node;
		struct node_vma *node_vma;
		struct page *page;

		node_vma = rmap_item->head;
		stable_node = node_vma->head;
		page = get_uksm_page(stable_node, 1, 1);
		if (!page)
			goto out;

		/*
		 * page lock is needed because it's racing with
		 * try_to_unmap_ksm(), etc.
		 */
		lock_page(page);
		hlist_del(&rmap_item->hlist);

		if (hlist_empty(&node_vma->rmap_hlist)) {
			hlist_del(&node_vma->hlist);
			free_node_vma(node_vma);
		}
		unlock_page(page);

		put_page(page);
		if (hlist_empty(&stable_node->hlist)) {
			/* do NOT call remove_node_from_stable_tree() here:
			 * it's possible for a forked rmap_item not to be in
			 * the stable tree while the in-tree rmap_items were
			 * deleted.
			 */
			uksm_pages_shared--;
		} else
			uksm_pages_sharing--;


		uksm_drop_anon_vma(rmap_item);
	} else if (rmap_item->address & UNSTABLE_FLAG) {
		if (rmap_item->hash_round == uksm_hash_round) {

			rb_erase(&rmap_item->node,
				 &rmap_item->tree_node->sub_root);
			if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
				rb_erase(&rmap_item->tree_node->node,
					 &root_unstable_tree);

				free_tree_node(rmap_item->tree_node);
			} else
				rmap_item->tree_node->count--;
		}
		uksm_pages_unshared--;
	}

	rmap_item->address &= PAGE_MASK;
	rmap_item->hash_max = 0;

out:
	cond_resched();		/* we're called from many long loops */
}

static inline int slot_in_uksm(struct vma_slot *slot)
{
	return list_empty(&slot->slot_list);
}

/*
 * Test if the mm is exiting
 */
static inline bool uksm_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}

static inline unsigned long vma_pool_size(struct vma_slot *slot)
{
	return round_up(sizeof(struct rmap_list_entry) * slot->pages,
			PAGE_SIZE) >> PAGE_SHIFT;
}

#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta))

/* must be done with sem locked */
static int slot_pool_alloc(struct vma_slot *slot)
{
	unsigned long pool_size;

	if (slot->rmap_list_pool)
		return 0;

	pool_size = vma_pool_size(slot);
	slot->rmap_list_pool = kzalloc(sizeof(struct page *) *
				       pool_size, GFP_KERNEL);
	if (!slot->rmap_list_pool)
		return -ENOMEM;

	slot->pool_counts = kzalloc(sizeof(unsigned int) * pool_size,
				    GFP_KERNEL);
	if (!slot->pool_counts) {
		kfree(slot->rmap_list_pool);
		return -ENOMEM;
	}

	slot->pool_size = pool_size;
	BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages));
	slot->flags |= UKSM_SLOT_IN_UKSM;
	uksm_pages_total += slot->pages;

	return 0;
}


/*
 * Called after vma is unlinked from its mm
 */
void uksm_remove_vma(struct vm_area_struct *vma)
{
	struct vma_slot *slot;

	if (!vma->uksm_vma_slot)
		return;

	spin_lock(&vma_slot_list_lock);
	slot = vma->uksm_vma_slot;
	if (!slot)
		goto out;

	if (slot_in_uksm(slot)) {
		/**
		 * This slot has been added by ksmd, so move to the del list
		 * waiting for ksmd to free it.
		 */
		list_add_tail(&slot->slot_list, &vma_slot_del);
	} else {
		/**
		 * It's still on the new list; it's OK to free the slot
		 * directly.
		 */
		list_del(&slot->slot_list);
		free_vma_slot(slot);
	}
out:
	vma->uksm_vma_slot = NULL;
	spin_unlock(&vma_slot_list_lock);
}

/**
 * Need to do two things:
 * 1. check if slot was moved to del list
 * 2. make sure the mmap_sem is manipulated under valid vma.
 *
 * My concern here is that in some cases, this may make
 * vma_slot_list_lock() waiters serialize further on some
 * sem->wait_lock; can this really be expensive?
 *
 *
 * @return
 * 0: if successfully locked mmap_sem
 * -ENOENT: this slot was moved to the del list
 * -EBUSY: vma lock failed
 */
static int try_down_read_slot_mmap_sem(struct vma_slot *slot)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm;
	struct rw_semaphore *sem;

	spin_lock(&vma_slot_list_lock);

	/* the slot_list was removed and reinitialized from the new list
	 * when it entered uksm_list.
	 * If it's not empty now, it must have been moved to the del list.
	 */
	if (!slot_in_uksm(slot)) {
		spin_unlock(&vma_slot_list_lock);
		return -ENOENT;
	}

	BUG_ON(slot->pages != vma_pages(slot->vma));
	/* Ok, vma still valid */
	vma = slot->vma;
	mm = vma->vm_mm;
	sem = &mm->mmap_sem;

	if (uksm_test_exit(mm)) {
		spin_unlock(&vma_slot_list_lock);
		return -ENOENT;
	}

	if (down_read_trylock(sem)) {
		spin_unlock(&vma_slot_list_lock);
		if (slot_pool_alloc(slot)) {
			uksm_remove_vma(vma);
			up_read(sem);
			return -ENOENT;
		}
		return 0;
	}

	spin_unlock(&vma_slot_list_lock);
	return -EBUSY;
}

static inline unsigned long
vma_page_address(struct page *page, struct vm_area_struct *vma)
{
	pgoff_t pgoff = page->index;
	unsigned long address;

	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
		/* page should be within @vma mapping range */
		return -EFAULT;
	}
	return address;
}


/* return 0 on success with the item's mmap_sem locked */
static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
{
	struct mm_struct *mm;
	struct vma_slot *slot = item->slot;
	int err = -EINVAL;

	struct page *page;

	/*
	 * try_down_read_slot_mmap_sem() returns non-zero if the slot
	 * has been removed by uksm_remove_vma().
	 */
	if (try_down_read_slot_mmap_sem(slot))
		return -EBUSY;

	mm = slot->vma->vm_mm;

	if (uksm_test_exit(mm))
		goto failout_up;

	page = item->page;
	rcu_read_lock();
	if (!get_page_unless_zero(page)) {
		rcu_read_unlock();
		goto failout_up;
	}

	/* No need to consider huge page here. */
	if (item->slot->vma->anon_vma != page_anon_vma(page) ||
	    vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
		/*
		 * TODO:
		 * should we release this item because of its stale page
		 * mapping?
		 */
		put_page(page);
		rcu_read_unlock();
		goto failout_up;
	}
	rcu_read_unlock();
	return 0;

failout_up:
	up_read(&mm->mmap_sem);
	return err;
}

/*
 * What kind of VMA is considered?
 */
static inline int vma_can_enter(struct vm_area_struct *vma)
{
	return uksm_flags_can_scan(vma->vm_flags);
}

/*
 * Called whenever a fresh new vma is created. A new vma_slot is created
 * and inserted into a global list. Must be called after the vma is
 * inserted into its mm.
 */
void uksm_vma_add_new(struct vm_area_struct *vma)
{
	struct vma_slot *slot;

	if (!vma_can_enter(vma)) {
		vma->uksm_vma_slot = NULL;
		return;
	}

	slot = alloc_vma_slot();
	if (!slot) {
		vma->uksm_vma_slot = NULL;
		return;
	}

	vma->uksm_vma_slot = slot;
	vma->vm_flags |= VM_MERGEABLE;
	slot->vma = vma;
	slot->mm = vma->vm_mm;
	slot->ctime_j = jiffies;
	slot->pages = vma_pages(vma);
	spin_lock(&vma_slot_list_lock);
	list_add_tail(&slot->slot_list, &vma_slot_new);
	spin_unlock(&vma_slot_list_lock);
}

/* 32/3 < they < 32/2 */
#define shiftl	8
#define shiftr	12

#define HASH_FROM_TO(from, to)			\
for (index = from; index < to; index++) {	\
	pos = random_nums[index];		\
	hash += key[pos];			\
	hash += (hash << shiftl);		\
	hash ^= (hash >> shiftr);		\
}


#define HASH_FROM_DOWN_TO(from, to)		\
for (index = from - 1; index >= to; index--) {	\
	hash ^= (hash >> shiftr);		\
	hash ^= (hash >> (shiftr*2));		\
	hash -= (hash << shiftl);		\
	hash += (hash << (shiftl*2));		\
	pos = random_nums[index];		\
	hash -= key[pos];			\
}

/*
 * The main random sample hash function.
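 *
 * HASH_FROM_DOWN_TO is the exact inverse of HASH_FROM_TO, undoing one step
 * at a time in reverse index order. A single forward step is
 *
 *	hash += key[pos];
 *	hash += hash << 8;	(i.e. hash *= 257 mod 2^32)
 *	hash ^= hash >> 12;
 *
 * and the down step inverts it: "h ^= h >> 12; h ^= h >> 24;" recovers the
 * pre-xorshift value (the would-be third term h >> 36 vanishes on 32 bits),
 * then "h -= h << 8; h += h << 16;" multiplies by -255 * 65537, and since
 * 257 * (-255 * 65537) = -(2^32 - 1) == 1 (mod 2^32), that is exactly the
 * modular inverse of 257; subtracting key[pos] finally restores the old
 * hash. This reversibility is what lets delta_hash() move between hash
 * strengths without rehashing the whole page.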
 */
static u32 random_sample_hash(void *addr, u32 hash_strength)
{
	u32 hash = 0xdeadbeef;
	int index, pos, loop = hash_strength;
	u32 *key = (u32 *)addr;

	if (loop > HASH_STRENGTH_FULL)
		loop = HASH_STRENGTH_FULL;

	HASH_FROM_TO(0, loop);

	if (hash_strength > HASH_STRENGTH_FULL) {
		loop = hash_strength - HASH_STRENGTH_FULL;
		HASH_FROM_TO(0, loop);
	}

	return hash;
}


/**
 * Used when the hash strength is adjusted.
 *
 * @addr The page's virtual address
 * @from The original hash strength
 * @to   The hash strength changed to
 * @hash The hash value generated with the "from" hash strength
 *
 * return the hash value recalculated for the "to" hash strength
 */
static u32 delta_hash(void *addr, int from, int to, u32 hash)
{
	u32 *key = (u32 *)addr;
	int index, pos; /* make sure they are int type */

	if (to > from) {
		if (from >= HASH_STRENGTH_FULL) {
			from -= HASH_STRENGTH_FULL;
			to -= HASH_STRENGTH_FULL;
			HASH_FROM_TO(from, to);
		} else if (to <= HASH_STRENGTH_FULL) {
			HASH_FROM_TO(from, to);
		} else {
			HASH_FROM_TO(from, HASH_STRENGTH_FULL);
			HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
		}
	} else {
		if (from <= HASH_STRENGTH_FULL) {
			HASH_FROM_DOWN_TO(from, to);
		} else if (to >= HASH_STRENGTH_FULL) {
			from -= HASH_STRENGTH_FULL;
			to -= HASH_STRENGTH_FULL;
			HASH_FROM_DOWN_TO(from, to);
		} else {
			HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
			HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
		}
	}

	return hash;
}

/**
 *
 * Called when: rshash_pos or rshash_neg is about to overflow or a scan round
 * has finished.
 *
 * return 0 if no page has been scanned since last call, 1 otherwise.
 */
static inline int encode_benefit(void)
{
	u64 scanned_delta, pos_delta, neg_delta;
	unsigned long base = benefit.base;

	scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last;

	if (!scanned_delta)
		return 0;

	scanned_delta >>= base;
	pos_delta = rshash_pos >> base;
	neg_delta = rshash_neg >> base;

	if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
	    CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
	    CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
		benefit.scanned >>= 1;
		benefit.neg >>= 1;
		benefit.pos >>= 1;
		benefit.base++;
		scanned_delta >>= 1;
		pos_delta >>= 1;
		neg_delta >>= 1;
	}

	benefit.pos += pos_delta;
	benefit.neg += neg_delta;
	benefit.scanned += scanned_delta;

	BUG_ON(!benefit.scanned);

	rshash_pos = rshash_neg = 0;
	uksm_pages_scanned_last = uksm_pages_scanned;

	return 1;
}

static inline void reset_benefit(void)
{
	benefit.pos = 0;
	benefit.neg = 0;
	benefit.base = 0;
	benefit.scanned = 0;
}

static inline void inc_rshash_pos(unsigned long delta)
{
	if (CAN_OVERFLOW_U64(rshash_pos, delta))
		encode_benefit();

	rshash_pos += delta;
}

static inline void inc_rshash_neg(unsigned long delta)
{
	if (CAN_OVERFLOW_U64(rshash_neg, delta))
		encode_benefit();

	rshash_neg += delta;
}


static inline u32 page_hash(struct page *page, unsigned long hash_strength,
			    int cost_accounting)
{
	u32 val;
	unsigned long delta;

	void *addr = kmap_atomic(page);

	val = random_sample_hash(addr, hash_strength);
	kunmap_atomic(addr);

	if (cost_accounting) {
		if (HASH_STRENGTH_FULL > hash_strength)
			delta = HASH_STRENGTH_FULL - hash_strength;
		else
			delta = 0;

		inc_rshash_pos(delta);
	}

	return val;
}

static int memcmp_pages(struct page *page1, struct page *page2,
			int cost_accounting)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_atomic(page1);
	addr2 = kmap_atomic(page2);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_atomic(addr2);
	kunmap_atomic(addr1);

	if (cost_accounting)
		inc_rshash_neg(memcmp_cost);

	return ret;
}

static inline int pages_identical(struct page *page1, struct page *page2)
{
	return !memcmp_pages(page1, page2, 0);
}

static inline int is_page_full_zero(struct page
						    *page)
{
	char *addr;
	int ret;

	addr = kmap_atomic(page);
	ret = is_full_zero(addr, PAGE_SIZE);
	kunmap_atomic(addr);

	return ret;
}

static int write_protect_page(struct vm_area_struct *vma, struct page *page,
			      pte_t *orig_pte, pte_t *old_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr;
	pte_t *ptep;
	spinlock_t *ptl;
	int swapped;
	int err = -EFAULT;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	BUG_ON(PageTransCompound(page));

	mmun_start = addr;
	mmun_end = addr + PAGE_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	ptep = page_check_address(page, mm, addr, &ptl, 0);
	if (!ptep)
		goto out_mn;

	if (old_pte)
		*old_pte = *ptep;

	if (pte_write(*ptep) || pte_dirty(*ptep)) {
		pte_t entry;
		swapped = PageSwapCache(page);
		flush_cache_page(vma, addr, page_to_pfn(page));
		/*
		 * Ok this is tricky, when get_user_pages_fast() runs it
		 * doesn't take any lock, therefore the check that we are
		 * going to make with the page count against the map count
		 * is racy and O_DIRECT can happen right after the check.
		 * So we clear the pte and flush the tlb before the check;
		 * this assures us that no O_DIRECT can happen after the
		 * check or in the middle of the check.
		 */
		entry = ptep_clear_flush_notify(vma, addr, ptep);
		/*
		 * Check that no O_DIRECT or similar I/O is in progress on
		 * the page
		 */
		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
			set_pte_at(mm, addr, ptep, entry);
			goto out_unlock;
		}
		if (pte_dirty(entry))
			set_page_dirty(page);
		entry = pte_mkclean(pte_wrprotect(entry));
		set_pte_at_notify(mm, addr, ptep, entry);
	}
	*orig_pte = *ptep;
	err = 0;

out_unlock:
	pte_unmap_unlock(ptep, ptl);
out_mn:
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
	return err;
}

#define MERGE_ERR_PGERR		1 /* the page is invalid, cannot continue */
#define MERGE_ERR_COLLI		2 /* there is a collision */
#define MERGE_ERR_COLLI_MAX	3 /* collision at the max hash strength */
#define MERGE_ERR_CHANGED	4 /* the page has changed since last hash */


/**
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to page
 * @page:     the page we are replacing by kpage
 * @kpage:    the ksm page we replace page by
 * @orig_pte: the original value of the pte
 *
 * Returns 0 on success, MERGE_ERR_PGERR on failure.
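 *
 * Note the zero-page special case below: when @kpage is the uksm (or the
 * system) zero page, the new pte is made special with pte_mkspecial() and
 * no anon rmap or extra reference is taken on it, so the merged-to zero
 * page is never reverse-mapped and, per the header comment, never swapped.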
 */
static int replace_page(struct vm_area_struct *vma, struct page *page,
			struct page *kpage, pte_t orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep;
	spinlock_t *ptl;
	pte_t entry;

	unsigned long addr;
	int err = MERGE_ERR_PGERR;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, addr);
	BUG_ON(pmd_trans_huge(*pmd));
	if (!pmd_present(*pmd))
		goto out;

	mmun_start = addr;
	mmun_end = addr + PAGE_SIZE;
	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte_same(*ptep, orig_pte)) {
		pte_unmap_unlock(ptep, ptl);
		goto out_mn;
	}

	flush_cache_page(vma, addr, pte_pfn(*ptep));
	ptep_clear_flush_notify(vma, addr, ptep);
	entry = mk_pte(kpage, vma->vm_page_prot);

	/* special treatment is needed for zero_page */
	if ((page_to_pfn(kpage) == uksm_zero_pfn) ||
	    (page_to_pfn(kpage) == zero_pfn)) {
		entry = pte_mkspecial(entry);
		dec_mm_counter(mm, MM_ANONPAGES);
		inc_zone_page_state(page, NR_UKSM_ZERO_PAGES);
	} else {
		get_page(kpage);
		page_add_anon_rmap(kpage, vma, addr, false);
	}

	set_pte_at_notify(mm, addr, ptep, entry);

	page_remove_rmap(page, false);
	if (!page_mapped(page))
		try_to_free_swap(page);
	put_page(page);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out_mn:
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
	return err;
}


/**
 * Fully hash a page with HASH_STRENGTH_MAX and return a non-zero hash
 * value. The zero hash value at HASH_STRENGTH_MAX is used to indicate that
 * its hash_max member has not been calculated yet.
 *
 * @page     The page to be hashed
 * @hash_old The hash value calculated with the current hash strength
 *
 * return the new hash value calculated at HASH_STRENGTH_MAX
 */
static inline u32 page_hash_max(struct page *page, u32 hash_old)
{
	u32 hash_max = 0;
	void *addr;

	addr = kmap_atomic(page);
	hash_max = delta_hash(addr, hash_strength,
			      HASH_STRENGTH_MAX, hash_old);

	kunmap_atomic(addr);

	if (!hash_max)
		hash_max = 1;

	inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
	return hash_max;
}

/*
 * We compare the hash again, to ensure that it is really a hash collision
 * instead of being caused by page write.
 */
static inline int check_collision(struct rmap_item *rmap_item,
				  u32 hash)
{
	int err;
	struct page *page = rmap_item->page;

	/* if this rmap_item has already been hash_maxed, then the collision
	 * must appear in the second-level rbtree search. In this case we
	 * check if its hash_max value has been changed. Otherwise, the
	 * collision happened in the first-level rbtree search, so we check
	 * against its current hash value.
	 */
	if (rmap_item->hash_max) {
		inc_rshash_neg(memcmp_cost);
		inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);

		if (rmap_item->hash_max == page_hash_max(page, hash))
			err = MERGE_ERR_COLLI;
		else
			err = MERGE_ERR_CHANGED;
	} else {
		inc_rshash_neg(memcmp_cost + hash_strength);

		if (page_hash(page, hash_strength, 0) == hash)
			err = MERGE_ERR_COLLI;
		else
			err = MERGE_ERR_CHANGED;
	}

	return err;
}

/**
 * Try to merge a rmap_item.page with a kpage in the stable node. kpage must
 * already be a ksm page.
 *
 * @return 0 if the pages were merged, a MERGE_ERR_* code otherwise.
 */
static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item,
				       struct page *kpage, u32 hash)
{
	struct vm_area_struct *vma = rmap_item->slot->vma;
	struct mm_struct *mm = vma->vm_mm;
	pte_t orig_pte = __pte(0);
	int err = MERGE_ERR_PGERR;
	struct page *page;

	if (uksm_test_exit(mm))
		goto out;

	page = rmap_item->page;

	if (page == kpage) { /* ksm page forked */
		err = 0;
		goto out;
	}

	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page(). We use trylock_page() instead of
	 * lock_page() because we don't want to wait here - we
	 * prefer to continue scanning and merging different pages,
	 * then come back to this page when it is unlocked.
	 */
	if (!trylock_page(page))
		goto out;

	if (!PageAnon(page) || !PageKsm(kpage))
		goto out_unlock;

	if (PageTransCompound(page)) {
		err = split_huge_page(page);
		if (err)
			goto out_unlock;
	}

	/*
	 * If this anonymous page is mapped only here, its pte may need
	 * to be write-protected. If it's mapped elsewhere, all of its
	 * ptes are necessarily already write-protected. But in either
	 * case, we need to lock and check page_count is not raised.
	 */
	if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
		if (pages_identical(page, kpage))
			err = replace_page(vma, page, kpage, orig_pte);
		else
			err = check_collision(rmap_item, hash);
	}

	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
		munlock_vma_page(page);
		if (!PageMlocked(kpage)) {
			unlock_page(page);
			lock_page(kpage);
			mlock_vma_page(kpage);
			page = kpage;	/* for final unlock */
		}
	}

out_unlock:
	unlock_page(page);
out:
	return err;
}



/**
 * If two pages fail to merge in try_to_merge_two_pages, then we have a
 * chance to restore a page mapping that has been changed in
 * try_to_merge_two_pages.
 *
 * @return 0 on success.
 */
static int restore_uksm_page_pte(struct vm_area_struct *vma,
				 unsigned long addr,
				 pte_t orig_pte, pte_t wprt_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep;
	spinlock_t *ptl;

	int err = -EFAULT;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd))
		goto out;

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte_same(*ptep, wprt_pte)) {
		/* already copied, let it be */
		pte_unmap_unlock(ptep, ptl);
		goto out;
	}

	/*
	 * Good boy, still here. While we still hold the ksm page, it does
	 * not return to the free page pool, so there is no way that a pte
	 * could have been changed to another page and then changed back to
	 * this one. And remember that ksm pages are not reused in
	 * do_wp_page(). So it's safe to restore the original pte.
	 */
	flush_cache_page(vma, addr, pte_pfn(*ptep));
	ptep_clear_flush_notify(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, orig_pte);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out:
	return err;
}

/**
 * try_to_merge_two_pages() - take two identical pages and prepare
 * them to be merged into one page (rmap_item->page)
 *
 * @return 0 if we successfully merged two identical pages into
 *         one ksm page. MERGE_ERR_COLLI if it's only a hash collision
 *         in the rbtree search. MERGE_ERR_CHANGED if the rmap_item has
 *         been changed since it was hashed. MERGE_ERR_PGERR otherwise.
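 *
 * Unlike mainline KSM, a failed merge here does not leave the first page
 * write-protected as a would-be ksm page: the restore_out path puts the
 * saved pte and page->mapping back via restore_uksm_page_pte() (see item 5
 * of the header comment), so no break_cow()/break_ksm() pass is needed for
 * this failure case.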
* */ static int try_to_merge_two_pages(struct rmap_item *rmap_item, struct rmap_item *tree_rmap_item, u32 hash) { pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0); pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0); struct vm_area_struct *vma1 = rmap_item->slot->vma; struct vm_area_struct *vma2 = tree_rmap_item->slot->vma; struct page *page = rmap_item->page; struct page *tree_page = tree_rmap_item->page; int err = MERGE_ERR_PGERR; struct address_space *saved_mapping; if (rmap_item->page == tree_rmap_item->page) goto out; if (!trylock_page(page)) goto out; if (!PageAnon(page)) goto out_unlock; if (PageTransCompound(page)) { err = split_huge_page(page); if (err) goto out_unlock; } if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) { unlock_page(page); goto out; } /* * While we hold page lock, upgrade page from * PageAnon+anon_vma to PageKsm+NULL stable_node: * stable_tree_insert() will update stable_node. */ saved_mapping = page->mapping; set_page_stable_node(page, NULL); mark_page_accessed(page); if (!PageDirty(page)) SetPageDirty(page); unlock_page(page); if (!trylock_page(tree_page)) goto restore_out; if (!PageAnon(tree_page)) { unlock_page(tree_page); goto restore_out; } if (PageTransCompound(tree_page)) { err = split_huge_page(tree_page); if (err) { unlock_page(tree_page); goto restore_out; } } if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) { unlock_page(tree_page); goto restore_out; } if (pages_identical(page, tree_page)) { err = replace_page(vma2, tree_page, page, wprt_pte2); if (err) { unlock_page(tree_page); goto restore_out; } if ((vma2->vm_flags & VM_LOCKED)) { munlock_vma_page(tree_page); if (!PageMlocked(page)) { unlock_page(tree_page); lock_page(page); mlock_vma_page(page); tree_page = page; /* for final unlock */ } } unlock_page(tree_page); goto out; /* success */ } else { if (tree_rmap_item->hash_max && tree_rmap_item->hash_max == rmap_item->hash_max) { err = MERGE_ERR_COLLI_MAX; } else if (page_hash(page, hash_strength, 0) == page_hash(tree_page, hash_strength, 0)) { inc_rshash_neg(memcmp_cost + hash_strength * 2); err = MERGE_ERR_COLLI; } else { err = MERGE_ERR_CHANGED; } unlock_page(tree_page); } restore_out: lock_page(page); if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item), orig_pte1, wprt_pte1)) page->mapping = saved_mapping; out_unlock: unlock_page(page); out: return err; } static inline int hash_cmp(u32 new_val, u32 node_val) { if (new_val > node_val) return 1; else if (new_val < node_val) return -1; else return 0; } static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash) { u32 hash_max = item->hash_max; if (!hash_max) { hash_max = page_hash_max(item->page, hash); item->hash_max = hash_max; } return hash_max; } /** * stable_tree_search() - search the stable tree for a page * * @item: the rmap_item we are comparing with * @hash: the hash value of this item->page already calculated * * @return the page we have found, NULL otherwise. The page returned has * been gotten. */ static struct page *stable_tree_search(struct rmap_item *item, u32 hash) { struct rb_node *node = root_stable_treep->rb_node; struct tree_node *tree_node; unsigned long hash_max; struct page *page = item->page; struct stable_node *stable_node; stable_node = page_stable_node(page); if (stable_node) { /* ksm page forked, that is * if (PageKsm(page) && !in_stable_tree(rmap_item)) * it's actually gotten once outside. 
 */
		get_page(page);
		return page;
	}

	while (node) {
		int cmp;

		tree_node = rb_entry(node, struct tree_node, node);

		cmp = hash_cmp(hash, tree_node->hash);

		if (cmp < 0)
			node = node->rb_left;
		else if (cmp > 0)
			node = node->rb_right;
		else
			break;
	}

	if (!node)
		return NULL;

	if (tree_node->count == 1) {
		stable_node = rb_entry(tree_node->sub_root.rb_node,
				       struct stable_node, node);
		BUG_ON(!stable_node);

		goto get_page_out;
	}

	/*
	 * ok, we have to search the second
	 * level subtree; hash the page to
	 * full strength.
	 */
	node = tree_node->sub_root.rb_node;
	BUG_ON(!node);
	hash_max = rmap_item_hash_max(item, hash);

	while (node) {
		int cmp;

		stable_node = rb_entry(node, struct stable_node, node);

		cmp = hash_cmp(hash_max, stable_node->hash_max);

		if (cmp < 0)
			node = node->rb_left;
		else if (cmp > 0)
			node = node->rb_right;
		else
			goto get_page_out;
	}

	return NULL;

get_page_out:
	page = get_uksm_page(stable_node, 1, 1);
	return page;
}

static int try_merge_rmap_item(struct rmap_item *item,
			       struct page *kpage,
			       struct page *tree_page)
{
	spinlock_t *ptl;
	pte_t *ptep;
	unsigned long addr;
	struct vm_area_struct *vma = item->slot->vma;

	addr = get_rmap_addr(item);
	ptep = page_check_address(kpage, vma->vm_mm, addr, &ptl, 0);
	if (!ptep)
		return 0;

	if (pte_write(*ptep)) {
		/* has changed, abort! */
		pte_unmap_unlock(ptep, ptl);
		return 0;
	}

	get_page(tree_page);
	page_add_anon_rmap(tree_page, vma, addr, false);

	flush_cache_page(vma, addr, pte_pfn(*ptep));
	ptep_clear_flush_notify(vma, addr, ptep);
	set_pte_at_notify(vma->vm_mm, addr, ptep,
			  mk_pte(tree_page, vma->vm_page_prot));

	page_remove_rmap(kpage, false);
	put_page(kpage);

	pte_unmap_unlock(ptep, ptl);

	return 1;
}

/**
 * try_merge_with_stable() - when two rmap_items need to be inserted
 * into the stable tree and the page was found to be identical to a stable
 * ksm page, this is the last chance we can merge them into one.
 *
 * @item1:	the rmap_item holding the page which we wanted to insert
 *		into the stable tree.
 * @item2:	the other rmap_item we found when searching the unstable tree
 * @kpage:	the page currently mapped by the two rmap_items
 * @tree_page:	the page we found identical in the stable tree node
 * @success1:	return if item1 is successfully merged
 * @success2:	return if item2 is successfully merged
 */
static void try_merge_with_stable(struct rmap_item *item1,
				  struct rmap_item *item2,
				  struct page **kpage,
				  struct page *tree_page,
				  int *success1, int *success2)
{
	struct vm_area_struct *vma1 = item1->slot->vma;
	struct vm_area_struct *vma2 = item2->slot->vma;
	*success1 = 0;
	*success2 = 0;

	if (unlikely(*kpage == tree_page)) {
		/* I don't think this can really happen */
		printk(KERN_WARNING "UKSM: unexpected condition detected in "
			"try_merge_with_stable() -- *kpage == tree_page !\n");
		*success1 = 1;
		*success2 = 1;
		return;
	}

	if (!PageAnon(*kpage) || !PageKsm(*kpage))
		goto failed;

	if (!trylock_page(tree_page))
		goto failed;

	/* If the old page is still ksm and still pointed
	 * to in the right place, and still write protected,
	 * we are confident it's not changed, no need to
	 * memcmp anymore.
	 * Beware: we cannot take nested pte locks;
	 * deadlock risk.
	 */
	if (!try_merge_rmap_item(item1, *kpage, tree_page))
		goto unlock_failed;

	/* ok, then vma2; note that pte1 is already set */
	if (!try_merge_rmap_item(item2, *kpage, tree_page))
		goto success_1;

	*success2 = 1;
success_1:
	*success1 = 1;


	if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
	    (*success2 && vma2->vm_flags & VM_LOCKED)) {
		munlock_vma_page(*kpage);
		if (!PageMlocked(tree_page))
			mlock_vma_page(tree_page);
	}

	/*
	 * We do not need the old page any more in the caller, so we can
	 * break the lock now.
	 */
	unlock_page(*kpage);
	*kpage = tree_page; /* Get unlocked outside. */
	return;

unlock_failed:
	unlock_page(tree_page);
failed:
	return;
}

static inline void stable_node_hash_max(struct stable_node *node,
					struct page *page, u32 hash)
{
	u32 hash_max = node->hash_max;

	if (!hash_max) {
		hash_max = page_hash_max(page, hash);
		node->hash_max = hash_max;
	}
}

static inline
struct stable_node *new_stable_node(struct tree_node *tree_node,
				    struct page *kpage, u32 hash_max)
{
	struct stable_node *new_stable_node;

	new_stable_node = alloc_stable_node();
	if (!new_stable_node)
		return NULL;

	new_stable_node->kpfn = page_to_pfn(kpage);
	new_stable_node->hash_max = hash_max;
	new_stable_node->tree_node = tree_node;
	set_page_stable_node(kpage, new_stable_node);

	return new_stable_node;
}

static inline
struct stable_node *first_level_insert(struct tree_node *tree_node,
				       struct rmap_item *rmap_item,
				       struct rmap_item *tree_rmap_item,
				       struct page **kpage, u32 hash,
				       int *success1, int *success2)
{
	int cmp;
	struct page *tree_page;
	u32 hash_max = 0;
	struct stable_node *stable_node, *new_snode;
	struct rb_node *parent = NULL, **new;

	/* this tree node contains no sub-tree yet */
	stable_node = rb_entry(tree_node->sub_root.rb_node,
			       struct stable_node, node);

	tree_page = get_uksm_page(stable_node, 1, 0);
	if (tree_page) {
		cmp = memcmp_pages(*kpage, tree_page, 1);
		if (!cmp) {
			try_merge_with_stable(rmap_item, tree_rmap_item,
					      kpage, tree_page,
					      success1, success2);
			put_page(tree_page);
			if (!*success1 && !*success2)
				goto failed;

			return stable_node;
		} else {
			/*
			 * collision at the first level; try to create a
			 * subtree. A new node needs to be created.
			 * (hash the page while we still hold a reference,
			 * before putting it)
			 */
			stable_node_hash_max(stable_node, tree_page,
					     tree_node->hash);
			put_page(tree_page);
			hash_max = rmap_item_hash_max(rmap_item, hash);

			cmp = hash_cmp(hash_max, stable_node->hash_max);
			parent = &stable_node->node;
			if (cmp < 0) {
				new = &parent->rb_left;
			} else if (cmp > 0) {
				new = &parent->rb_right;
			} else {
				goto failed;
			}
		}

	} else {
		/* the only stable_node was deleted; we reuse its tree_node.
		 */
		parent = NULL;
		new = &tree_node->sub_root.rb_node;
	}

	new_snode = new_stable_node(tree_node, *kpage, hash_max);
	if (!new_snode)
		goto failed;

	rb_link_node(&new_snode->node, parent, new);
	rb_insert_color(&new_snode->node, &tree_node->sub_root);
	tree_node->count++;
	*success1 = *success2 = 1;

	return new_snode;

failed:
	return NULL;
}

static inline
struct stable_node *stable_subtree_insert(struct tree_node *tree_node,
					  struct rmap_item *rmap_item,
					  struct rmap_item *tree_rmap_item,
					  struct page **kpage, u32 hash,
					  int *success1, int *success2)
{
	struct page *tree_page;
	u32 hash_max;
	struct stable_node *stable_node, *new_snode;
	struct rb_node *parent, **new;

research:
	parent = NULL;
	new = &tree_node->sub_root.rb_node;
	BUG_ON(!*new);
	hash_max = rmap_item_hash_max(rmap_item, hash);
	while (*new) {
		int cmp;

		stable_node = rb_entry(*new, struct stable_node, node);

		cmp = hash_cmp(hash_max, stable_node->hash_max);

		if (cmp < 0) {
			parent = *new;
			new = &parent->rb_left;
		} else if (cmp > 0) {
			parent = *new;
			new = &parent->rb_right;
		} else {
			tree_page = get_uksm_page(stable_node, 1, 0);
			if (tree_page) {
				cmp = memcmp_pages(*kpage, tree_page, 1);
				if (!cmp) {
					try_merge_with_stable(rmap_item,
						tree_rmap_item, kpage,
						tree_page, success1,
						success2);

					put_page(tree_page);
					if (!*success1 && !*success2)
						goto failed;
					/*
					 * successfully merged with a stable
					 * node
					 */
					return stable_node;
				} else {
					put_page(tree_page);
					goto failed;
				}
			} else {
				/*
				 * the stable node may have been deleted and
				 * the subtree restructured; we cannot
				 * continue, so redo the search.
				 */
				if (tree_node->count) {
					goto research;
				} else {
					/* reuse the tree_node */
					parent = NULL;
					new = &tree_node->sub_root.rb_node;
				}
			}
		}
	}

	new_snode = new_stable_node(tree_node, *kpage, hash_max);
	if (!new_snode)
		goto failed;

	rb_link_node(&new_snode->node, parent, new);
	rb_insert_color(&new_snode->node, &tree_node->sub_root);
	tree_node->count++;
	*success1 = *success2 = 1;

	return new_snode;

failed:
	return NULL;
}

/**
 * stable_tree_insert() - try to insert a page merged from the unstable tree
 * into the stable tree
 *
 * @kpage:		the page to be inserted
 * @hash:		the current hash of this page
 * @rmap_item:		the rmap_item being scanned
 * @tree_rmap_item:	the rmap_item found on the unstable tree
 * @success1:		return if rmap_item is merged
 * @success2:		return if tree_rmap_item is merged
 *
 * @return the stable_node on the stable tree if at least one
 *         rmap_item is inserted into the stable tree, NULL
 *         otherwise.
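 *
 * The stable tree is two-level: the top-level rbtree is keyed by the
 * sampled hash, and each tree_node holds a sub-rbtree keyed by the
 * full-strength hash_max, e.g.:
 *
 *	root_stable_treep
 *	`- tree_node (hash == H)
 *	   `- sub_root
 *	      |- stable_node (hash_max == M1) - hlist of node_vmas/rmap_items
 *	      `- stable_node (hash_max == M2) - ...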
*/ static struct stable_node * stable_tree_insert(struct page **kpage, u32 hash, struct rmap_item *rmap_item, struct rmap_item *tree_rmap_item, int *success1, int *success2) { struct rb_node **new = &root_stable_treep->rb_node; struct rb_node *parent = NULL; struct stable_node *stable_node; struct tree_node *tree_node; u32 hash_max = 0; *success1 = *success2 = 0; while (*new) { int cmp; tree_node = rb_entry(*new, struct tree_node, node); cmp = hash_cmp(hash, tree_node->hash); if (cmp < 0) { parent = *new; new = &parent->rb_left; } else if (cmp > 0) { parent = *new; new = &parent->rb_right; } else break; } if (*new) { if (tree_node->count == 1) { stable_node = first_level_insert(tree_node, rmap_item, tree_rmap_item, kpage, hash, success1, success2); } else { stable_node = stable_subtree_insert(tree_node, rmap_item, tree_rmap_item, kpage, hash, success1, success2); } } else { /* no tree node found */ tree_node = alloc_tree_node(stable_tree_node_listp); if (!tree_node) { stable_node = NULL; goto out; } stable_node = new_stable_node(tree_node, *kpage, hash_max); if (!stable_node) { free_tree_node(tree_node); goto out; } tree_node->hash = hash; rb_link_node(&tree_node->node, parent, new); rb_insert_color(&tree_node->node, root_stable_treep); parent = NULL; new = &tree_node->sub_root.rb_node; rb_link_node(&stable_node->node, parent, new); rb_insert_color(&stable_node->node, &tree_node->sub_root); tree_node->count++; *success1 = *success2 = 1; } out: return stable_node; } /** * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem * * @return 0 on success, -EBUSY if unable to lock the mmap_sem, * -EINVAL if the page mapping has been changed. */ static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item) { int err; err = get_mergeable_page_lock_mmap(tree_rmap_item); if (err == -EINVAL) { /* its page map has been changed, remove it */ remove_rmap_item_from_tree(tree_rmap_item); } /* The page is gotten and mmap_sem is locked now. */ return err; } /** * unstable_tree_search_insert() - search an unstable tree rmap_item with the * same hash value. 
 * Get its page and trylock the mmap_sem
 */
static inline
struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
					      u32 hash)
{
	struct rb_node **new = &root_unstable_tree.rb_node;
	struct rb_node *parent = NULL;
	struct tree_node *tree_node;
	u32 hash_max;
	struct rmap_item *tree_rmap_item;

	while (*new) {
		int cmp;

		tree_node = rb_entry(*new, struct tree_node, node);

		cmp = hash_cmp(hash, tree_node->hash);

		if (cmp < 0) {
			parent = *new;
			new = &parent->rb_left;
		} else if (cmp > 0) {
			parent = *new;
			new = &parent->rb_right;
		} else
			break;
	}

	if (*new) {
		/* got the tree_node */
		if (tree_node->count == 1) {
			tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
						  struct rmap_item, node);
			BUG_ON(!tree_rmap_item);

			goto get_page_out;
		}

		/* well, search the collision subtree */
		new = &tree_node->sub_root.rb_node;
		BUG_ON(!*new);
		hash_max = rmap_item_hash_max(rmap_item, hash);

		while (*new) {
			int cmp;

			tree_rmap_item = rb_entry(*new, struct rmap_item,
						  node);

			cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
			parent = *new;
			if (cmp < 0)
				new = &parent->rb_left;
			else if (cmp > 0)
				new = &parent->rb_right;
			else
				goto get_page_out;
		}
	} else {
		/* alloc a new tree_node */
		tree_node = alloc_tree_node(&unstable_tree_node_list);
		if (!tree_node)
			return NULL;

		tree_node->hash = hash;
		rb_link_node(&tree_node->node, parent, new);
		rb_insert_color(&tree_node->node, &root_unstable_tree);
		parent = NULL;
		new = &tree_node->sub_root.rb_node;
	}

	/* not found even in the subtree */
	rmap_item->tree_node = tree_node;
	rmap_item->address |= UNSTABLE_FLAG;
	rmap_item->hash_round = uksm_hash_round;
	rb_link_node(&rmap_item->node, parent, new);
	rb_insert_color(&rmap_item->node, &tree_node->sub_root);

	uksm_pages_unshared++;
	return NULL;

get_page_out:
	if (tree_rmap_item->page == rmap_item->page)
		return NULL;

	if (get_tree_rmap_item_page(tree_rmap_item))
		return NULL;

	return tree_rmap_item;
}

static void hold_anon_vma(struct rmap_item *rmap_item,
			  struct anon_vma *anon_vma)
{
	rmap_item->anon_vma = anon_vma;
	get_anon_vma(anon_vma);
}

/**
 * stable_tree_append() - append a rmap_item to a stable node. Deduplication
 * ratio statistics are collected in this function.
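 *
 * For example, with @logdedup set, every other slot already sharing this
 * stable node is credited with
 *
 *	node_vma->slot->pages_bemerged += rmap_item->slot->rung->step;
 *
 * so a slot sampled with step N is credited N pages per observed merge,
 * and the slot is queued on vma_slot_dedup; these counts presumably feed
 * the per-slot deduplication ratio used to move slots between rungs.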
* */ static void stable_tree_append(struct rmap_item *rmap_item, struct stable_node *stable_node, int logdedup) { struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL; unsigned long key = (unsigned long)rmap_item->slot; unsigned long factor = rmap_item->slot->rung->step; BUG_ON(!stable_node); rmap_item->address |= STABLE_FLAG; if (hlist_empty(&stable_node->hlist)) { uksm_pages_shared++; goto node_vma_new; } else { uksm_pages_sharing++; } hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { if (node_vma->key >= key) break; if (logdedup) { node_vma->slot->pages_bemerged += factor; if (list_empty(&node_vma->slot->dedup_list)) list_add(&node_vma->slot->dedup_list, &vma_slot_dedup); } } if (node_vma) { if (node_vma->key == key) { node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist); goto node_vma_ok; } else if (node_vma->key > key) { node_vma_cont = node_vma; } } node_vma_new: /* no same vma already in node, alloc a new node_vma */ new_node_vma = alloc_node_vma(); BUG_ON(!new_node_vma); new_node_vma->head = stable_node; new_node_vma->slot = rmap_item->slot; if (!node_vma) { hlist_add_head(&new_node_vma->hlist, &stable_node->hlist); } else if (node_vma->key != key) { if (node_vma->key < key) hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist); else { hlist_add_before(&new_node_vma->hlist, &node_vma->hlist); } } node_vma = new_node_vma; node_vma_ok: /* ok, ready to add to the list */ rmap_item->head = node_vma; hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist); hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma); if (logdedup) { rmap_item->slot->pages_merged++; if (node_vma_cont) { node_vma = node_vma_cont; hlist_for_each_entry_continue(node_vma, hlist) { node_vma->slot->pages_bemerged += factor; if (list_empty(&node_vma->slot->dedup_list)) list_add(&node_vma->slot->dedup_list, &vma_slot_dedup); } } } } /* * We use break_ksm to break COW on a ksm page: it's a stripped down * * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1) * put_page(page); * * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, * in case the application has unmapped and remapped mm,addr meanwhile. * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. */ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) { struct page *page; int ret = 0; do { cond_resched(); page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); if (IS_ERR_OR_NULL(page)) break; if (PageKsm(page)) { ret = handle_mm_fault(vma, addr, FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE); } else ret = VM_FAULT_WRITE; put_page(page); } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); /* * We must loop because handle_mm_fault() may back out if there's * any difficulty e.g. if pte accessed bit gets updated concurrently. * * VM_FAULT_WRITE is what we have been hoping for: it indicates that * COW has been broken, even if the vma does not permit VM_WRITE; * but note that a concurrent fault might break PageKsm for us. * * VM_FAULT_SIGBUS could occur if we race with truncation of the * backing file, which also invalidates anonymous pages: that's * okay, that truncation will have unmapped the PageKsm for us. 
* * VM_FAULT_OOM: at the time of writing (late July 2009), setting * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the * current task has TIF_MEMDIE set, and will be OOM killed on return * to user; and ksmd, having no mm, would never be chosen for that. * * But if the mm is in a limited mem_cgroup, then the fault may fail * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and * even ksmd can fail in this way - though it's usually breaking ksm * just to undo a merge it made a moment before, so unlikely to oom. * * That's a pity: we might therefore have more kernel pages allocated * than we're counting as nodes in the stable tree; but uksm_do_scan * will retry to break_cow on each pass, so should recover the page * in due course. The important thing is to not let VM_MERGEABLE * be cleared while any such pages might remain in the area. */ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; } static void break_cow(struct rmap_item *rmap_item) { struct vm_area_struct *vma = rmap_item->slot->vma; struct mm_struct *mm = vma->vm_mm; unsigned long addr = get_rmap_addr(rmap_item); if (uksm_test_exit(mm)) goto out; break_ksm(vma, addr); out: return; } /* * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather * than check every pte of a given vma, the locking doesn't quite work for * that - an rmap_item is assigned to the stable tree after inserting ksm * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing * rmap_items from parent to child at fork time (so as not to waste time * if exit comes before the next scan reaches it). * * Similarly, although we'd like to remove rmap_items (so updating counts * and freeing memory) when unmerging an area, it's easier to leave that * to the next pass of ksmd - consider, for example, how ksmd might be * in cmp_and_merge_page on one of the rmap_items we would be removing. 
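 *
 * A hedged usage sketch (this mirrors what ksm_madvise() does further
 * below for MADV_UNMERGEABLE; error handling elided and a whole-VMA
 * range assumed purely for illustration):
 *
 *	if (vma->anon_vma) {
 *		err = unmerge_uksm_pages(vma, vma->vm_start, vma->vm_end);
 *		if (err)
 *			return err;	// -ENOMEM or -ERESTARTSYS
 *	}
 *
 * Every pte in the range gets a break_ksm() COW fault, one page at a time.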
*/ inline int unmerge_uksm_pages(struct vm_area_struct *vma, unsigned long start, unsigned long end) { unsigned long addr; int err = 0; for (addr = start; addr < end && !err; addr += PAGE_SIZE) { if (uksm_test_exit(vma->vm_mm)) break; if (signal_pending(current)) err = -ERESTARTSYS; else err = break_ksm(vma, addr); } return err; } static inline void inc_uksm_pages_scanned(void) { u64 delta; if (uksm_pages_scanned == U64_MAX) { encode_benefit(); delta = uksm_pages_scanned >> pages_scanned_base; if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) { pages_scanned_stored >>= 1; delta >>= 1; pages_scanned_base++; } pages_scanned_stored += delta; uksm_pages_scanned = uksm_pages_scanned_last = 0; } uksm_pages_scanned++; } static inline int find_zero_page_hash(int strength, u32 hash) { return (zero_hash_table[strength] == hash); } static int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page) { struct page *zero_page = empty_uksm_zero_page; struct mm_struct *mm = vma->vm_mm; pte_t orig_pte = __pte(0); int err = -EFAULT; if (uksm_test_exit(mm)) goto out; if (!trylock_page(page)) goto out; if (!PageAnon(page)) goto out_unlock; if (PageTransCompound(page)) { err = split_huge_page(page); if (err) goto out_unlock; } if (write_protect_page(vma, page, &orig_pte, 0) == 0) { if (is_page_full_zero(page)) err = replace_page(vma, page, zero_page, orig_pte); } out_unlock: unlock_page(page); out: return err; } /* * cmp_and_merge_page() - first see if page can be merged into the stable * tree; if not, compare hash to previous and if it's the same, see if page * can be inserted into the unstable tree, or merged with a page already there * and both transferred to the stable tree. * * @rmap_item: the reverse mapping into the virtual address of the page * that we are searching an identical page for. * @hash: the hash value of the page, computed at the current hash strength */ static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash) { struct rmap_item *tree_rmap_item; struct page *page; struct page *kpage = NULL; u32 hash_max; int err; unsigned int success1, success2; struct stable_node *snode; int cmp; struct rb_node *parent = NULL, **new; remove_rmap_item_from_tree(rmap_item); page = rmap_item->page; /* We first start with searching the page inside the stable tree */ kpage = stable_tree_search(rmap_item, hash); if (kpage) { err = try_to_merge_with_uksm_page(rmap_item, kpage, hash); if (!err) { /* * The page was successfully merged, add * its rmap_item to the stable tree. * page lock is needed because it's * racing with try_to_unmap_ksm(), etc. */ lock_page(kpage); snode = page_stable_node(kpage); stable_tree_append(rmap_item, snode, 1); unlock_page(kpage); put_page(kpage); return; /* success */ } put_page(kpage); /* * if it's a collision and it has been searched in the sub-rbtree * (hash_max != 0), we want to abort, because if it is * successfully merged in the unstable tree, the collision tends * to happen again. */ if (err == MERGE_ERR_COLLI && rmap_item->hash_max) return; } tree_rmap_item = unstable_tree_search_insert(rmap_item, hash); if (tree_rmap_item) { err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash); /* * As soon as we merge this page, we want to remove the * rmap_item of the page we have merged with from the unstable * tree, and insert it instead as a new node in the stable tree.
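 *
 * An editorial outline of the flow implemented here:
 *
 *	kpage = stable_tree_search(rmap_item, hash);
 *	if (kpage)	// 1) try to merge with an existing stable page
 *		try_to_merge_with_uksm_page(rmap_item, kpage, hash);
 *	else if ((tree_rmap_item = unstable_tree_search_insert(...)))
 *		try_to_merge_two_pages(...);	// 2) merge two unstable pages
 *	// 3) on success both rmap_items are promoted to the stable tree
 *	//    via stable_tree_insert() + stable_tree_append()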
*/ if (!err) { kpage = page; remove_rmap_item_from_tree(tree_rmap_item); lock_page(kpage); snode = stable_tree_insert(&kpage, hash, rmap_item, tree_rmap_item, &success1, &success2); /* * Do not log dedup for tree item, it's not counted as * scanned in this round. */ if (success2) stable_tree_append(tree_rmap_item, snode, 0); /* * The order of these two stable_tree_append() calls is * important: we are scanning rmap_item. */ if (success1) stable_tree_append(rmap_item, snode, 1); /* * The original kpage may be unlocked inside * stable_tree_insert() already. This page * should be unlocked before doing * break_cow(). */ unlock_page(kpage); if (!success1) break_cow(rmap_item); if (!success2) break_cow(tree_rmap_item); } else if (err == MERGE_ERR_COLLI) { BUG_ON(tree_rmap_item->tree_node->count > 1); rmap_item_hash_max(tree_rmap_item, tree_rmap_item->tree_node->hash); hash_max = rmap_item_hash_max(rmap_item, hash); cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); parent = &tree_rmap_item->node; if (cmp < 0) new = &parent->rb_left; else if (cmp > 0) new = &parent->rb_right; else goto put_up_out; rmap_item->tree_node = tree_rmap_item->tree_node; rmap_item->address |= UNSTABLE_FLAG; rmap_item->hash_round = uksm_hash_round; rb_link_node(&rmap_item->node, parent, new); rb_insert_color(&rmap_item->node, &tree_rmap_item->tree_node->sub_root); rmap_item->tree_node->count++; } else { /* * either one of the pages has changed, or they collide * at the max hash; we consider them ill items. */ remove_rmap_item_from_tree(tree_rmap_item); } put_up_out: put_page(tree_rmap_item->page); up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem); } } static inline unsigned long get_pool_index(struct vma_slot *slot, unsigned long index) { unsigned long pool_index; pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT; if (pool_index >= slot->pool_size) BUG(); return pool_index; } static inline unsigned long index_page_offset(unsigned long index) { return offset_in_page(sizeof(struct rmap_list_entry *) * index); } static inline struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot, unsigned long index, int need_alloc) { unsigned long pool_index; struct page *page; void *addr; pool_index = get_pool_index(slot, index); if (!slot->rmap_list_pool[pool_index]) { if (!need_alloc) return NULL; page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); if (!page) return NULL; slot->rmap_list_pool[pool_index] = page; } addr = kmap(slot->rmap_list_pool[pool_index]); addr += index_page_offset(index); return addr; } static inline void put_rmap_list_entry(struct vma_slot *slot, unsigned long index) { unsigned long pool_index; pool_index = get_pool_index(slot, index); BUG_ON(!slot->rmap_list_pool[pool_index]); kunmap(slot->rmap_list_pool[pool_index]); } static inline int entry_is_new(struct rmap_list_entry *entry) { return !entry->item; } static inline unsigned long get_index_orig_addr(struct vma_slot *slot, unsigned long index) { return slot->vma->vm_start + (index << PAGE_SHIFT); } static inline unsigned long get_entry_address(struct rmap_list_entry *entry) { unsigned long addr; if (is_addr(entry->addr)) addr = get_clean_addr(entry->addr); else if (entry->item) addr = get_rmap_addr(entry->item); else BUG(); return addr; } static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry) { if (is_addr(entry->addr)) return NULL; return entry->item; } static inline void inc_rmap_list_pool_count(struct vma_slot *slot, unsigned long index) { unsigned long pool_index; pool_index = get_pool_index(slot, index);
BUG_ON(!slot->rmap_list_pool[pool_index]); slot->pool_counts[pool_index]++; } static inline void dec_rmap_list_pool_count(struct vma_slot *slot, unsigned long index) { unsigned long pool_index; pool_index = get_pool_index(slot, index); BUG_ON(!slot->rmap_list_pool[pool_index]); BUG_ON(!slot->pool_counts[pool_index]); slot->pool_counts[pool_index]--; } static inline int entry_has_rmap(struct rmap_list_entry *entry) { return !is_addr(entry->addr) && entry->item; } static inline void swap_entries(struct rmap_list_entry *entry1, unsigned long index1, struct rmap_list_entry *entry2, unsigned long index2) { struct rmap_list_entry tmp; /* swapping two new entries is meaningless */ BUG_ON(entry_is_new(entry1) && entry_is_new(entry2)); tmp = *entry1; *entry1 = *entry2; *entry2 = tmp; if (entry_has_rmap(entry1)) entry1->item->entry_index = index1; if (entry_has_rmap(entry2)) entry2->item->entry_index = index2; if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) { inc_rmap_list_pool_count(entry1->item->slot, index1); dec_rmap_list_pool_count(entry1->item->slot, index2); } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) { inc_rmap_list_pool_count(entry2->item->slot, index2); dec_rmap_list_pool_count(entry2->item->slot, index1); } } static inline void free_entry_item(struct rmap_list_entry *entry) { unsigned long index; struct rmap_item *item; if (!is_addr(entry->addr)) { BUG_ON(!entry->item); item = entry->item; entry->addr = get_rmap_addr(item); set_is_addr(entry->addr); index = item->entry_index; remove_rmap_item_from_tree(item); dec_rmap_list_pool_count(item->slot, index); free_rmap_item(item); } } static inline int pool_entry_boundary(unsigned long index) { unsigned long linear_addr; linear_addr = sizeof(struct rmap_list_entry *) * index; return index && !offset_in_page(linear_addr); } static inline void try_free_last_pool(struct vma_slot *slot, unsigned long index) { unsigned long pool_index; pool_index = get_pool_index(slot, index); if (slot->rmap_list_pool[pool_index] && !slot->pool_counts[pool_index]) { __free_page(slot->rmap_list_pool[pool_index]); slot->rmap_list_pool[pool_index] = NULL; slot->flags |= UKSM_SLOT_NEED_SORT; } } static inline unsigned long vma_item_index(struct vm_area_struct *vma, struct rmap_item *item) { return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT; } static int within_same_pool(struct vma_slot *slot, unsigned long i, unsigned long j) { unsigned long pool_i, pool_j; pool_i = get_pool_index(slot, i); pool_j = get_pool_index(slot, j); return (pool_i == pool_j); } static void sort_rmap_entry_list(struct vma_slot *slot) { unsigned long i, j; struct rmap_list_entry *entry, *swap_entry; entry = get_rmap_list_entry(slot, 0, 0); for (i = 0; i < slot->pages; ) { if (!entry) goto skip_whole_pool; if (entry_is_new(entry)) goto next_entry; if (is_addr(entry->addr)) { entry->addr = 0; goto next_entry; } j = vma_item_index(slot->vma, entry->item); if (j == i) goto next_entry; if (within_same_pool(slot, i, j)) swap_entry = entry + j - i; else swap_entry = get_rmap_list_entry(slot, j, 1); swap_entries(entry, i, swap_entry, j); if (!within_same_pool(slot, i, j)) put_rmap_list_entry(slot, j); continue; skip_whole_pool: i += PAGE_SIZE / sizeof(*entry); if (i < slot->pages) entry = get_rmap_list_entry(slot, i, 0); continue; next_entry: if (i >= slot->pages - 1 || !within_same_pool(slot, i, i + 1)) { put_rmap_list_entry(slot, i); if (i + 1 < slot->pages) entry = get_rmap_list_entry(slot, i + 1, 0); } else entry++; i++; continue; } /* free empty pool entries which 
contain no rmap_item */ /* This can be simplified to rely on pool_counts alone once the related bug is fixed. */ for (i = 0; i < slot->pool_size; i++) { unsigned char has_rmap; void *addr; if (!slot->rmap_list_pool[i]) continue; has_rmap = 0; addr = kmap(slot->rmap_list_pool[i]); BUG_ON(!addr); for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { entry = (struct rmap_list_entry *)addr + j; if (is_addr(entry->addr)) continue; if (!entry->item) continue; has_rmap = 1; } kunmap(slot->rmap_list_pool[i]); if (!has_rmap) { BUG_ON(slot->pool_counts[i]); __free_page(slot->rmap_list_pool[i]); slot->rmap_list_pool[i] = NULL; } } slot->flags &= ~UKSM_SLOT_NEED_SORT; } /* * vma_fully_scanned() - whether all the pages in this slot have been scanned. */ static inline int vma_fully_scanned(struct vma_slot *slot) { return slot->pages_scanned == slot->pages; } /** * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to * its random permutation. This function is embedded with the random * permutation index management code. */ static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash) { unsigned long rand_range, addr, swap_index, scan_index; struct rmap_item *item = NULL; struct rmap_list_entry *scan_entry, *swap_entry = NULL; struct page *page; scan_index = swap_index = slot->pages_scanned % slot->pages; if (pool_entry_boundary(scan_index)) try_free_last_pool(slot, scan_index - 1); if (vma_fully_scanned(slot)) { if (slot->flags & UKSM_SLOT_NEED_SORT) slot->flags |= UKSM_SLOT_NEED_RERAND; else slot->flags &= ~UKSM_SLOT_NEED_RERAND; if (slot->flags & UKSM_SLOT_NEED_SORT) sort_rmap_entry_list(slot); } scan_entry = get_rmap_list_entry(slot, scan_index, 1); if (!scan_entry) return NULL; if (entry_is_new(scan_entry)) { scan_entry->addr = get_index_orig_addr(slot, scan_index); set_is_addr(scan_entry->addr); } if (slot->flags & UKSM_SLOT_NEED_RERAND) { rand_range = slot->pages - scan_index; BUG_ON(!rand_range); swap_index = scan_index + (prandom_u32() % rand_range); } if (swap_index != scan_index) { swap_entry = get_rmap_list_entry(slot, swap_index, 1); if (entry_is_new(swap_entry)) { swap_entry->addr = get_index_orig_addr(slot, swap_index); set_is_addr(swap_entry->addr); } swap_entries(scan_entry, scan_index, swap_entry, swap_index); } addr = get_entry_address(scan_entry); item = get_entry_item(scan_entry); BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start); page = follow_page(slot->vma, addr, FOLL_GET); if (IS_ERR_OR_NULL(page)) goto nopage; if (!PageAnon(page)) goto putpage; /* check whether this is the zero_page pfn or the uksm_zero_page */ if ((page_to_pfn(page) == zero_pfn) || (page_to_pfn(page) == uksm_zero_pfn)) goto putpage; flush_anon_page(slot->vma, page, addr); flush_dcache_page(page); *hash = page_hash(page, hash_strength, 1); inc_uksm_pages_scanned(); /* if the page content is all zero, re-map it to the zero page */ if (find_zero_page_hash(hash_strength, *hash)) { if (!cmp_and_merge_zero_page(slot->vma, page)) { slot->pages_merged++; /* For full-zero pages, no need to create rmap item */ goto putpage; } else { inc_rshash_neg(memcmp_cost / 2); } } if (!item) { item = alloc_rmap_item(); if (item) { /* It has already been zeroed */ item->slot = slot; item->address = addr; item->entry_index = scan_index; scan_entry->item = item; inc_rmap_list_pool_count(slot, scan_index); } else goto putpage; } BUG_ON(item->slot != slot); /* the page may have changed */ item->page = page; put_rmap_list_entry(slot, scan_index); if (swap_entry) put_rmap_list_entry(slot, swap_index); return item; putpage: put_page(page); page = NULL;
nopage: /* no page, store addr back and free rmap_item if possible */ free_entry_item(scan_entry); put_rmap_list_entry(slot, scan_index); if (swap_entry) put_rmap_list_entry(slot, swap_index); return NULL; } static inline int in_stable_tree(struct rmap_item *rmap_item) { return rmap_item->address & STABLE_FLAG; } /** * scan_vma_one_page() - scan the next page in a vma_slot. Called with * mmap_sem locked. */ static noinline void scan_vma_one_page(struct vma_slot *slot) { u32 hash; struct mm_struct *mm; struct rmap_item *rmap_item = NULL; struct vm_area_struct *vma = slot->vma; mm = vma->vm_mm; BUG_ON(!mm); BUG_ON(!slot); rmap_item = get_next_rmap_item(slot, &hash); if (!rmap_item) goto out1; if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item)) goto out2; cmp_and_merge_page(rmap_item, hash); out2: put_page(rmap_item->page); out1: slot->pages_scanned++; slot->this_sampled++; if (slot->fully_scanned_round != fully_scanned_round) scanned_virtual_pages++; if (vma_fully_scanned(slot)) slot->fully_scanned_round = fully_scanned_round; } static inline unsigned long rung_get_pages(struct scan_rung *rung) { struct slot_tree_node *node; if (!rung->vma_root.rnode) return 0; node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode); return node->size; } #define RUNG_SAMPLED_MIN 3 static inline void uksm_calc_rung_step(struct scan_rung *rung, unsigned long page_time, unsigned long ratio) { unsigned long sampled, pages; /* will be fully scanned ? */ if (!rung->cover_msecs) { rung->step = 1; return; } sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE) * ratio / page_time; /* * Before we finish a scan round and expensive per-round jobs, * we need to have a chance to estimate the per page time. So * the sampled number cannot be too small. */ if (sampled < RUNG_SAMPLED_MIN) sampled = RUNG_SAMPLED_MIN; pages = rung_get_pages(rung); if (likely(pages > sampled)) rung->step = pages / sampled; else rung->step = 1; } static inline int step_need_recalc(struct scan_rung *rung) { unsigned long pages, stepmax; pages = rung_get_pages(rung); stepmax = pages / RUNG_SAMPLED_MIN; return pages && (rung->step > pages || (stepmax && rung->step > stepmax)); } static inline void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc) { struct vma_slot *slot; if (finished) rung->flags |= UKSM_RUNG_ROUND_FINISHED; if (step_recalc || step_need_recalc(rung)) { uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); BUG_ON(step_need_recalc(rung)); } slot_iter_index = prandom_u32() % rung->step; BUG_ON(!rung->vma_root.rnode); slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter); BUG_ON(!slot); rung->current_scan = slot; rung->current_offset = slot_iter_index; } static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot) { return &slot->rung->vma_root; } /* * Return 1 if the scan position was reset.
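 *
 * A worked example of the stepping arithmetic used below (illustrative
 * numbers only): with slot->pages == 100, current_offset == 97 and
 * step == 10, n = (100 - 97) % 10 = 3 pages remain in this slot, so the
 * iterator is asked to skip slot_iter_index = 10 - 3 = 7 pages into the
 * following slots, keeping the sampling stride constant across slot
 * boundaries.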
*/ static int advance_current_scan(struct scan_rung *rung) { unsigned short n; struct vma_slot *slot, *next = NULL; BUG_ON(!rung->vma_root.num); slot = rung->current_scan; n = (slot->pages - rung->current_offset) % rung->step; slot_iter_index = rung->step - n; next = sradix_tree_next(&rung->vma_root, slot->snode, slot->sindex, slot_iter); if (next) { rung->current_offset = slot_iter_index; rung->current_scan = next; return 0; } else { reset_current_scan(rung, 1, 0); return 1; } } static inline void rung_rm_slot(struct vma_slot *slot) { struct scan_rung *rung = slot->rung; struct sradix_tree_root *root; if (rung->current_scan == slot) advance_current_scan(rung); root = slot_get_root(slot); sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex); slot->snode = NULL; if (step_need_recalc(rung)) { uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); BUG_ON(step_need_recalc(rung)); } /* In case advance_current_scan loops back to this slot again */ if (rung->vma_root.num && rung->current_scan == slot) reset_current_scan(slot->rung, 1, 0); } static inline void rung_add_new_slots(struct scan_rung *rung, struct vma_slot **slots, unsigned long num) { int err; struct vma_slot *slot; unsigned long i; struct sradix_tree_root *root = &rung->vma_root; err = sradix_tree_enter(root, (void **)slots, num); BUG_ON(err); for (i = 0; i < num; i++) { slot = slots[i]; slot->rung = rung; BUG_ON(vma_fully_scanned(slot)); } if (rung->vma_root.num == num) reset_current_scan(rung, 0, 1); } static inline int rung_add_one_slot(struct scan_rung *rung, struct vma_slot *slot) { int err; err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1); if (err) return err; slot->rung = rung; if (rung->vma_root.num == 1) reset_current_scan(rung, 0, 1); return 0; } /* * Return true if the slot is deleted from its rung. */ static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung) { struct scan_rung *old_rung = slot->rung; int err; if (old_rung == rung) return 0; rung_rm_slot(slot); err = rung_add_one_slot(rung, slot); if (err) { err = rung_add_one_slot(old_rung, slot); WARN_ON(err); /* OOPS, we are badly OOM; we lost this slot */ } return 1; } static inline int vma_rung_up(struct vma_slot *slot) { struct scan_rung *rung; rung = slot->rung; if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1]) rung++; return vma_rung_enter(slot, rung); } static inline int vma_rung_down(struct vma_slot *slot) { struct scan_rung *rung; rung = slot->rung; if (slot->rung != &uksm_scan_ladder[0]) rung--; return vma_rung_enter(slot, rung); } /** * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. */ static unsigned long cal_dedup_ratio(struct vma_slot *slot) { unsigned long ret; unsigned long pages; pages = slot->this_sampled; if (!pages) return 0; BUG_ON(slot->pages_scanned == slot->last_scanned); ret = slot->pages_merged; /* Thrashing area filtering */ if (ret && uksm_thrash_threshold) { if (slot->pages_cowed * 100 / slot->pages_merged > uksm_thrash_threshold) { ret = 0; } else { ret = slot->pages_merged - slot->pages_cowed; } } return ret * 100 / pages; } /** * cal_dedup_ratio_old() - Calculate the deduplication ratio for this slot, * based on the full-round be-merged statistics.
*/ static unsigned long cal_dedup_ratio_old(struct vma_slot *slot) { unsigned long ret; unsigned long pages; pages = slot->pages; if (!pages) return 0; ret = slot->pages_bemerged; /* Thrashing area filtering */ if (ret && uksm_thrash_threshold) { if (slot->pages_cowed * 100 / slot->pages_bemerged > uksm_thrash_threshold) { ret = 0; } else { ret = slot->pages_bemerged - slot->pages_cowed; } } return ret * 100 / pages; } /** * stable_node_reinsert() - When the hash_strength has been adjusted, the * stable tree needs to be restructured; this is the function that re-inserts * a stable node. */ static inline void stable_node_reinsert(struct stable_node *new_node, struct page *page, struct rb_root *root_treep, struct list_head *tree_node_listp, u32 hash) { struct rb_node **new = &root_treep->rb_node; struct rb_node *parent = NULL; struct stable_node *stable_node; struct tree_node *tree_node; struct page *tree_page; int cmp; while (*new) { int cmp; tree_node = rb_entry(*new, struct tree_node, node); cmp = hash_cmp(hash, tree_node->hash); if (cmp < 0) { parent = *new; new = &parent->rb_left; } else if (cmp > 0) { parent = *new; new = &parent->rb_right; } else break; } if (*new) { /* found a stable tree node with the same first-level hash value */ stable_node_hash_max(new_node, page, hash); if (tree_node->count == 1) { stable_node = rb_entry(tree_node->sub_root.rb_node, struct stable_node, node); tree_page = get_uksm_page(stable_node, 1, 0); if (tree_page) { stable_node_hash_max(stable_node, tree_page, hash); put_page(tree_page); /* prepare for stable node insertion */ cmp = hash_cmp(new_node->hash_max, stable_node->hash_max); parent = &stable_node->node; if (cmp < 0) new = &parent->rb_left; else if (cmp > 0) new = &parent->rb_right; else goto failed; goto add_node; } else { /* only the stable_node was deleted; the tree node * itself was not. */ goto tree_node_reuse; } } /* well, search the collision subtree */ new = &tree_node->sub_root.rb_node; parent = NULL; BUG_ON(!*new); while (*new) { int cmp; stable_node = rb_entry(*new, struct stable_node, node); cmp = hash_cmp(new_node->hash_max, stable_node->hash_max); if (cmp < 0) { parent = *new; new = &parent->rb_left; } else if (cmp > 0) { parent = *new; new = &parent->rb_right; } else { /* oh, no, still a collision */ goto failed; } } goto add_node; } /* no tree node found */ tree_node = alloc_tree_node(tree_node_listp); if (!tree_node) { printk(KERN_ERR "UKSM: memory allocation error!\n"); goto failed; } else { tree_node->hash = hash; rb_link_node(&tree_node->node, parent, new); rb_insert_color(&tree_node->node, root_treep); tree_node_reuse: /* prepare for stable node insertion */ parent = NULL; new = &tree_node->sub_root.rb_node; } add_node: rb_link_node(&new_node->node, parent, new); rb_insert_color(&new_node->node, &tree_node->sub_root); new_node->tree_node = tree_node; tree_node->count++; return; failed: /* This can only happen when two nodes have collided * in two levels. */ new_node->tree_node = NULL; return; } static inline void free_all_tree_nodes(struct list_head *list) { struct tree_node *node, *tmp; list_for_each_entry_safe(node, tmp, list, all_list) { free_tree_node(node); } } /** * stable_tree_delta_hash() - Delta hash the stable tree from previous hash * strength to the current hash_strength. It re-structures the whole tree.
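 *
 * A sketch (assuming UKSM's random-sampling hash, where strength s mixes
 * in s sampled 32-bit words of the page):
 *
 *	new_hash = delta_hash(addr, prev_strength, hash_strength, old_hash);
 *
 * i.e. only the samples between the two strengths have to be folded into
 * (or out of) old_hash instead of re-hashing the whole page; nodes that
 * never made it into the rbtree are re-hashed from scratch via page_hash().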
*/ static inline void stable_tree_delta_hash(u32 prev_hash_strength) { struct stable_node *node, *tmp; struct rb_root *root_new_treep; struct list_head *new_tree_node_listp; stable_tree_index = (stable_tree_index + 1) % 2; root_new_treep = &root_stable_tree[stable_tree_index]; new_tree_node_listp = &stable_tree_node_list[stable_tree_index]; *root_new_treep = RB_ROOT; BUG_ON(!list_empty(new_tree_node_listp)); /* * we need the safe iterator: the node could be removed by get_uksm_page() */ list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) { void *addr; struct page *node_page; u32 hash; /* * We are completely re-structuring the stable nodes to a new * stable tree. We don't bother unlinking from the old tree or * touching the old tree_nodes; they will all be freed at once. */ node_page = get_uksm_page(node, 0, 0); if (!node_page) continue; if (node->tree_node) { hash = node->tree_node->hash; addr = kmap_atomic(node_page); hash = delta_hash(addr, prev_hash_strength, hash_strength, hash); kunmap_atomic(addr); } else { /* * it was not inserted into the rbtree due to a collision * in the last scan round. */ hash = page_hash(node_page, hash_strength, 0); } stable_node_reinsert(node, node_page, root_new_treep, new_tree_node_listp, hash); put_page(node_page); } root_stable_treep = root_new_treep; free_all_tree_nodes(stable_tree_node_listp); BUG_ON(!list_empty(stable_tree_node_listp)); stable_tree_node_listp = new_tree_node_listp; } static inline void inc_hash_strength(unsigned long delta) { hash_strength += 1 << delta; if (hash_strength > HASH_STRENGTH_MAX) hash_strength = HASH_STRENGTH_MAX; } static inline void dec_hash_strength(unsigned long delta) { unsigned long change = 1 << delta; if (hash_strength <= change + 1) hash_strength = 1; else hash_strength -= change; } static inline void inc_hash_strength_delta(void) { hash_strength_delta++; if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX) hash_strength_delta = HASH_STRENGTH_DELTA_MAX; } /* static inline unsigned long get_current_neg_ratio(void) { if (!rshash_pos || rshash_neg > rshash_pos) return 100; return div64_u64(100 * rshash_neg , rshash_pos); } */ static inline unsigned long get_current_neg_ratio(void) { u64 pos = benefit.pos; u64 neg = benefit.neg; if (!neg) return 0; if (!pos || neg > pos) return 100; if (neg > div64_u64(U64_MAX, 100)) pos = div64_u64(pos, 100); else neg *= 100; return div64_u64(neg, pos); } static inline unsigned long get_current_benefit(void) { u64 pos = benefit.pos; u64 neg = benefit.neg; u64 scanned = benefit.scanned; if (neg > pos) return 0; return div64_u64((pos - neg), scanned); } static inline int judge_rshash_direction(void) { u64 current_neg_ratio, stable_benefit; u64 current_benefit, delta = 0; int ret = STILL; /* Try to probe a value soon after boot, and again in case the system stays still for a long time.
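 *
 * The thresholds hard-coded below: a negative ratio of zero for 3
 * consecutive rounds returns GO_DOWN; a negative ratio above 90% returns
 * GO_UP; and a benefit that deviates by more than 50% from the recorded
 * stable_benefit for 3 rounds in a row returns OBSCURE, restarting the
 * probing.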
*/ if ((fully_scanned_round & 0xFFULL) == 10) { ret = OBSCURE; goto out; } current_neg_ratio = get_current_neg_ratio(); if (current_neg_ratio == 0) { rshash_neg_cont_zero++; if (rshash_neg_cont_zero > 2) return GO_DOWN; else return STILL; } rshash_neg_cont_zero = 0; if (current_neg_ratio > 90) { ret = GO_UP; goto out; } current_benefit = get_current_benefit(); stable_benefit = rshash_state.stable_benefit; if (!stable_benefit) { ret = OBSCURE; goto out; } if (current_benefit > stable_benefit) delta = current_benefit - stable_benefit; else if (current_benefit < stable_benefit) delta = stable_benefit - current_benefit; delta = div64_u64(100 * delta , stable_benefit); if (delta > 50) { rshash_cont_obscure++; if (rshash_cont_obscure > 2) return OBSCURE; else return STILL; } out: rshash_cont_obscure = 0; return ret; } /** * rshash_adjust() - The main function to control the random sampling state * machine for hash strength adapting. * * return true if hash_strength has changed. */ static inline int rshash_adjust(void) { unsigned long prev_hash_strength = hash_strength; if (!encode_benefit()) return 0; switch (rshash_state.state) { case RSHASH_STILL: switch (judge_rshash_direction()) { case GO_UP: if (rshash_state.pre_direct == GO_DOWN) hash_strength_delta = 0; inc_hash_strength(hash_strength_delta); inc_hash_strength_delta(); rshash_state.stable_benefit = get_current_benefit(); rshash_state.pre_direct = GO_UP; break; case GO_DOWN: if (rshash_state.pre_direct == GO_UP) hash_strength_delta = 0; dec_hash_strength(hash_strength_delta); inc_hash_strength_delta(); rshash_state.stable_benefit = get_current_benefit(); rshash_state.pre_direct = GO_DOWN; break; case OBSCURE: rshash_state.stable_point = hash_strength; rshash_state.turn_point_down = hash_strength; rshash_state.turn_point_up = hash_strength; rshash_state.turn_benefit_down = get_current_benefit(); rshash_state.turn_benefit_up = get_current_benefit(); rshash_state.lookup_window_index = 0; rshash_state.state = RSHASH_TRYDOWN; dec_hash_strength(hash_strength_delta); inc_hash_strength_delta(); break; case STILL: break; default: BUG(); } break; case RSHASH_TRYDOWN: if (rshash_state.lookup_window_index++ % 5 == 0) rshash_state.below_count = 0; if (get_current_benefit() < rshash_state.stable_benefit) rshash_state.below_count++; else if (get_current_benefit() > rshash_state.turn_benefit_down) { rshash_state.turn_point_down = hash_strength; rshash_state.turn_benefit_down = get_current_benefit(); } if (rshash_state.below_count >= 3 || judge_rshash_direction() == GO_UP || hash_strength == 1) { hash_strength = rshash_state.stable_point; hash_strength_delta = 0; inc_hash_strength(hash_strength_delta); inc_hash_strength_delta(); rshash_state.lookup_window_index = 0; rshash_state.state = RSHASH_TRYUP; hash_strength_delta = 0; } else { dec_hash_strength(hash_strength_delta); inc_hash_strength_delta(); } break; case RSHASH_TRYUP: if (rshash_state.lookup_window_index++ % 5 == 0) rshash_state.below_count = 0; if (get_current_benefit() < rshash_state.turn_benefit_down) rshash_state.below_count++; else if (get_current_benefit() > rshash_state.turn_benefit_up) { rshash_state.turn_point_up = hash_strength; rshash_state.turn_benefit_up = get_current_benefit(); } if (rshash_state.below_count >= 3 || judge_rshash_direction() == GO_DOWN || hash_strength == HASH_STRENGTH_MAX) { hash_strength = rshash_state.turn_benefit_up > rshash_state.turn_benefit_down ? 
rshash_state.turn_point_up : rshash_state.turn_point_down; rshash_state.state = RSHASH_PRE_STILL; } else { inc_hash_strength(hash_strength_delta); inc_hash_strength_delta(); } break; case RSHASH_NEW: case RSHASH_PRE_STILL: rshash_state.stable_benefit = get_current_benefit(); rshash_state.state = RSHASH_STILL; hash_strength_delta = 0; break; default: BUG(); } /* rshash_neg = rshash_pos = 0; */ reset_benefit(); if (prev_hash_strength != hash_strength) stable_tree_delta_hash(prev_hash_strength); return prev_hash_strength != hash_strength; } /** * round_update_ladder() - The main function that updates all the * adjustments whenever a scan round is finished. */ static noinline void round_update_ladder(void) { int i; unsigned long dedup; struct vma_slot *slot, *tmp_slot; for (i = 0; i < SCAN_LADDER_SIZE; i++) { uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED; } list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) { /* the slot may have been removed by rung_rm_slot() when its mm exited */ if (slot->snode) { dedup = cal_dedup_ratio_old(slot); if (dedup && dedup >= uksm_abundant_threshold) vma_rung_up(slot); } slot->pages_bemerged = 0; slot->pages_cowed = 0; list_del_init(&slot->dedup_list); } } static void uksm_del_vma_slot(struct vma_slot *slot) { int i, j; struct rmap_list_entry *entry; if (slot->snode) { /* * Not necessary if the slot failed while entering * the rung. */ rung_rm_slot(slot); } if (!list_empty(&slot->dedup_list)) list_del(&slot->dedup_list); if (!slot->rmap_list_pool || !slot->pool_counts) { /* In case it OOMed in uksm_vma_enter() */ goto out; } for (i = 0; i < slot->pool_size; i++) { void *addr; if (!slot->rmap_list_pool[i]) continue; addr = kmap(slot->rmap_list_pool[i]); for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { entry = (struct rmap_list_entry *)addr + j; if (is_addr(entry->addr)) continue; if (!entry->item) continue; remove_rmap_item_from_tree(entry->item); free_rmap_item(entry->item); slot->pool_counts[i]--; } BUG_ON(slot->pool_counts[i]); kunmap(slot->rmap_list_pool[i]); __free_page(slot->rmap_list_pool[i]); } kfree(slot->rmap_list_pool); kfree(slot->pool_counts); out: slot->rung = NULL; if (slot->flags & UKSM_SLOT_IN_UKSM) { BUG_ON(uksm_pages_total < slot->pages); uksm_pages_total -= slot->pages; } if (slot->fully_scanned_round == fully_scanned_round) scanned_virtual_pages -= slot->pages; else scanned_virtual_pages -= slot->pages_scanned; free_vma_slot(slot); } #define SPIN_LOCK_PERIOD 32 static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD]; static inline void cleanup_vma_slots(void) { struct vma_slot *slot; int i; i = 0; spin_lock(&vma_slot_list_lock); while (!list_empty(&vma_slot_del)) { slot = list_entry(vma_slot_del.next, struct vma_slot, slot_list); list_del(&slot->slot_list); cleanup_slots[i++] = slot; if (i == SPIN_LOCK_PERIOD) { spin_unlock(&vma_slot_list_lock); while (--i >= 0) uksm_del_vma_slot(cleanup_slots[i]); i = 0; spin_lock(&vma_slot_list_lock); } } spin_unlock(&vma_slot_list_lock); while (--i >= 0) uksm_del_vma_slot(cleanup_slots[i]); } /* * exponential moving average formula */ static inline unsigned long ema(unsigned long curr, unsigned long last_ema) { /* * For a very high burst, even the ema cannot work well: a falsely very * high per-page time estimation feeds back as very high context-switch * and rung-update overhead -- this then leads to an even higher per-page * time, and may not converge. * * Instead, we try to approach this value in a binary manner.
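 *
 * Concretely, the formula below is
 *
 *	ema = (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100;
 *
 * with curr > 10 * last_ema clamped to 2 * last_ema. For example, assuming
 * EMA_ALPHA == 20 (an illustrative value), last_ema == 1000ns and
 * curr == 1500ns give ema = (20*1500 + 80*1000) / 100 = 1100ns.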
*/ if (curr > last_ema * 10) return last_ema * 2; return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100; } /* * convert a cpu ratio, configured by the user in units of * 1/TIME_RATIO_SCALE, to nanoseconds based on current uksm_sleep_jiffies. */ static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio) { return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) / (TIME_RATIO_SCALE - ratio) * ratio; } static inline unsigned long rung_real_ratio(int cpu_time_ratio) { unsigned long ret; BUG_ON(!cpu_time_ratio); if (cpu_time_ratio > 0) ret = cpu_time_ratio; else ret = (unsigned long)(-cpu_time_ratio) * uksm_max_cpu_percentage / 100UL; return ret ? ret : 1; } static noinline void uksm_calc_scan_pages(void) { struct scan_rung *ladder = uksm_scan_ladder; unsigned long sleep_usecs, nsecs; unsigned long ratio; int i; unsigned long per_page; if (uksm_ema_page_time > 100000 || (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL)) uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; per_page = uksm_ema_page_time; BUG_ON(!per_page); /* * Every 8 eval rounds, we try to probe a uksm_sleep_jiffies value * based on saved user input. */ if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL) uksm_sleep_jiffies = uksm_sleep_saved; /* We require each rung to scan at least one page per period. */ nsecs = per_page; ratio = rung_real_ratio(ladder[0].cpu_ratio); if (cpu_ratio_to_nsec(ratio) < nsecs) { sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio / NSEC_PER_USEC; uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1; } for (i = 0; i < SCAN_LADDER_SIZE; i++) { ratio = rung_real_ratio(ladder[i].cpu_ratio); ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) / per_page; BUG_ON(!ladder[i].pages_to_scan); uksm_calc_rung_step(&ladder[i], per_page, ratio); } } /* * Convert the scan time of this round (ns) to the next expected minimum * sleep time (ms); be careful of possible overflows.
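 * A worked example with illustrative numbers: with ratio == 2000 out of
 * TIME_RATIO_SCALE == 10000 (i.e. 20% cpu) and a 40ms scan round, the
 * result is 40 * (10000 - 2000) / 2000 = 160ms of sleep, which keeps cpu
 * consumption near the configured fraction of wall time.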
ratio is taken from * rung_real_ratio() */ static inline unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio) { scan_time >>= 20; /* to msec level now */ BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE)); return (unsigned int) ((unsigned long) scan_time * (TIME_RATIO_SCALE - ratio) / ratio); } #define __round_mask(x, y) ((__typeof__(x))((y)-1)) #define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) static void uksm_vma_enter(struct vma_slot **slots, unsigned long num) { struct scan_rung *rung; rung = &uksm_scan_ladder[0]; rung_add_new_slots(rung, slots, num); } static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE]; static void uksm_enter_all_slots(void) { struct vma_slot *slot; unsigned long index; struct list_head empty_vma_list; int i; i = 0; index = 0; INIT_LIST_HEAD(&empty_vma_list); spin_lock(&vma_slot_list_lock); while (!list_empty(&vma_slot_new)) { slot = list_entry(vma_slot_new.next, struct vma_slot, slot_list); if (!slot->vma->anon_vma) { list_move(&slot->slot_list, &empty_vma_list); } else if (vma_can_enter(slot->vma)) { batch_slots[index++] = slot; list_del_init(&slot->slot_list); } else { list_move(&slot->slot_list, &vma_slot_noadd); } if (++i == SPIN_LOCK_PERIOD || (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) { spin_unlock(&vma_slot_list_lock); if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) { uksm_vma_enter(batch_slots, index); index = 0; } i = 0; cond_resched(); spin_lock(&vma_slot_list_lock); } } list_splice(&empty_vma_list, &vma_slot_new); spin_unlock(&vma_slot_list_lock); if (index) uksm_vma_enter(batch_slots, index); } static inline int rung_round_finished(struct scan_rung *rung) { return rung->flags & UKSM_RUNG_ROUND_FINISHED; } static inline void judge_slot(struct vma_slot *slot) { struct scan_rung *rung = slot->rung; unsigned long dedup; int deleted; dedup = cal_dedup_ratio(slot); if (vma_fully_scanned(slot) && uksm_thrash_threshold) deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]); else if (dedup && dedup >= uksm_abundant_threshold) deleted = vma_rung_up(slot); else deleted = vma_rung_down(slot); slot->pages_merged = 0; slot->pages_cowed = 0; slot->this_sampled = 0; if (vma_fully_scanned(slot)) { slot->pages_scanned = 0; } slot->last_scanned = slot->pages_scanned; /* If its deleted in above, then rung was already advanced. */ if (!deleted) advance_current_scan(rung); } static inline int hash_round_finished(void) { if (scanned_virtual_pages > (uksm_pages_total >> 2)) { scanned_virtual_pages = 0; if (uksm_pages_scanned) fully_scanned_round++; return 1; } else { return 0; } } #define UKSM_MMSEM_BATCH 5 #define BUSY_RETRY 100 /** * uksm_do_scan() - the main worker function. 
*/ static noinline void uksm_do_scan(void) { struct vma_slot *slot, *iter; struct mm_struct *busy_mm; unsigned char round_finished, all_rungs_empty; int i, err, mmsem_batch; unsigned long pcost; long long delta_exec; unsigned long vpages, max_cpu_ratio; unsigned long long start_time, end_time, scan_time; unsigned int expected_jiffies; might_sleep(); vpages = 0; start_time = task_sched_runtime(current); max_cpu_ratio = 0; mmsem_batch = 0; for (i = 0; i < SCAN_LADDER_SIZE;) { struct scan_rung *rung = &uksm_scan_ladder[i]; unsigned long ratio; int busy_retry; if (!rung->pages_to_scan) { i++; continue; } if (!rung->vma_root.num) { rung->pages_to_scan = 0; i++; continue; } ratio = rung_real_ratio(rung->cpu_ratio); if (ratio > max_cpu_ratio) max_cpu_ratio = ratio; busy_retry = BUSY_RETRY; /* * Do not consider rung_round_finished() here, just use up the * rung->pages_to_scan quota. */ while (rung->pages_to_scan && rung->vma_root.num && likely(!freezing(current))) { int reset = 0; slot = rung->current_scan; BUG_ON(vma_fully_scanned(slot)); if (mmsem_batch) { err = 0; } else { err = try_down_read_slot_mmap_sem(slot); } if (err == -ENOENT) { rm_slot: rung_rm_slot(slot); continue; } busy_mm = slot->mm; if (err == -EBUSY) { /* skip other vmas on the same mm */ do { reset = advance_current_scan(rung); iter = rung->current_scan; busy_retry--; if (iter->vma->vm_mm != busy_mm || !busy_retry || reset) break; } while (1); if (iter->vma->vm_mm != busy_mm) { continue; } else { /* scan round finished */ break; } } BUG_ON(!vma_can_enter(slot->vma)); if (uksm_test_exit(slot->vma->vm_mm)) { mmsem_batch = 0; up_read(&slot->vma->vm_mm->mmap_sem); goto rm_slot; } if (mmsem_batch) mmsem_batch--; else mmsem_batch = UKSM_MMSEM_BATCH; /* Ok, we have taken the mmap_sem, ready to scan */ scan_vma_one_page(slot); rung->pages_to_scan--; vpages++; if (rung->current_offset + rung->step > slot->pages - 1 || vma_fully_scanned(slot)) { up_read(&slot->vma->vm_mm->mmap_sem); judge_slot(slot); mmsem_batch = 0; } else { rung->current_offset += rung->step; if (!mmsem_batch) up_read(&slot->vma->vm_mm->mmap_sem); } busy_retry = BUSY_RETRY; cond_resched(); } if (mmsem_batch) { up_read(&slot->vma->vm_mm->mmap_sem); mmsem_batch = 0; } if (freezing(current)) break; cond_resched(); } end_time = task_sched_runtime(current); delta_exec = end_time - start_time; if (freezing(current)) return; cleanup_vma_slots(); uksm_enter_all_slots(); round_finished = 1; all_rungs_empty = 1; for (i = 0; i < SCAN_LADDER_SIZE; i++) { struct scan_rung *rung = &uksm_scan_ladder[i]; if (rung->vma_root.num) { all_rungs_empty = 0; if (!rung_round_finished(rung)) round_finished = 0; } } if (all_rungs_empty) round_finished = 0; if (round_finished) { round_update_ladder(); uksm_eval_round++; if (hash_round_finished() && rshash_adjust()) { /* Reset the unstable root iff hash strength changed */ uksm_hash_round++; root_unstable_tree = RB_ROOT; free_all_tree_nodes(&unstable_tree_node_list); } /* * A number of pages can hang around indefinitely on per-cpu * pagevecs, raised page count preventing write_protect_page * from merging them. Though it doesn't really matter much, * it is puzzling to see some stuck in pages_volatile until * other activity jostles them out, and they also prevented * LTP's KSM test from succeeding deterministically; so drain * them here (here rather than on entry to uksm_do_scan(), * so we don't IPI too often when pages_to_scan is set low).
*/ lru_add_drain_all(); } if (vpages && delta_exec > 0) { pcost = (unsigned long) delta_exec / vpages; if (likely(uksm_ema_page_time)) uksm_ema_page_time = ema(pcost, uksm_ema_page_time); else uksm_ema_page_time = pcost; } uksm_calc_scan_pages(); uksm_sleep_real = uksm_sleep_jiffies; /* in case of radical cpu bursts, apply the upper bound */ end_time = task_sched_runtime(current); if (max_cpu_ratio && end_time > start_time) { scan_time = end_time - start_time; expected_jiffies = msecs_to_jiffies( scan_time_to_sleep(scan_time, max_cpu_ratio)); if (expected_jiffies > uksm_sleep_real) uksm_sleep_real = expected_jiffies; /* We have a one-second upper bound for responsiveness. */ if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC) uksm_sleep_real = msecs_to_jiffies(1000); } return; } static int ksmd_should_run(void) { return uksm_run & UKSM_RUN_MERGE; } static int uksm_scan_thread(void *nothing) { set_freezable(); set_user_nice(current, 5); while (!kthread_should_stop()) { mutex_lock(&uksm_thread_mutex); if (ksmd_should_run()) { uksm_do_scan(); } mutex_unlock(&uksm_thread_mutex); try_to_freeze(); if (ksmd_should_run()) { schedule_timeout_interruptible(uksm_sleep_real); uksm_sleep_times++; } else { wait_event_freezable(uksm_thread_wait, ksmd_should_run() || kthread_should_stop()); } } return 0; } int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { struct stable_node *stable_node; struct node_vma *node_vma; struct rmap_item *rmap_item; int ret = SWAP_AGAIN; int search_new_forks = 0; unsigned long address; VM_BUG_ON_PAGE(!PageKsm(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); stable_node = page_stable_node(page); if (!stable_node) return ret; again: hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { struct anon_vma *anon_vma = rmap_item->anon_vma; struct anon_vma_chain *vmac; struct vm_area_struct *vma; cond_resched(); anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { cond_resched(); vma = vmac->vma; address = get_rmap_addr(rmap_item); if (address < vma->vm_start || address >= vma->vm_end) continue; if ((rmap_item->slot->vma == vma) == search_new_forks) continue; if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; ret = rwc->rmap_one(page, vma, address, rwc->arg); if (ret != SWAP_AGAIN) { anon_vma_unlock_read(anon_vma); goto out; } if (rwc->done && rwc->done(page)) { anon_vma_unlock_read(anon_vma); goto out; } } anon_vma_unlock_read(anon_vma); } } if (!search_new_forks++) goto again; out: return ret; } #ifdef CONFIG_MIGRATION /* Common ksm interface but may be specific to uksm */ void ksm_migrate_page(struct page *newpage, struct page *oldpage) { struct stable_node *stable_node; VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); VM_BUG_ON(newpage->mapping != oldpage->mapping); stable_node = page_stable_node(newpage); if (stable_node) { VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); stable_node->kpfn = page_to_pfn(newpage); } } #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_MEMORY_HOTREMOVE static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn, unsigned long end_pfn) { struct rb_node *node; for (node = rb_first(root_stable_treep); node; node = rb_next(node)) { struct stable_node *stable_node; stable_node = rb_entry(node, struct stable_node, node); if (stable_node->kpfn >= start_pfn && stable_node->kpfn < end_pfn) return stable_node; } return NULL; } static int
uksm_memory_callback(struct notifier_block *self, unsigned long action, void *arg) { struct memory_notify *mn = arg; struct stable_node *stable_node; switch (action) { case MEM_GOING_OFFLINE: /* * Keep it very simple for now: just lock out ksmd and * MADV_UNMERGEABLE while any memory is going offline. * mutex_lock_nested() is necessary because lockdep was alarmed * that here we take uksm_thread_mutex inside notifier chain * mutex, and later take notifier chain mutex inside * uksm_thread_mutex to unlock it. But that's safe because both * are inside mem_hotplug_mutex. */ mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING); break; case MEM_OFFLINE: /* * Most of the work is done by page migration; but there might * be a few stable_nodes left over, still pointing to struct * pages which have been offlined: prune those from the tree. */ while ((stable_node = uksm_check_stable_tree(mn->start_pfn, mn->start_pfn + mn->nr_pages)) != NULL) remove_node_from_stable_tree(stable_node, 1, 1); /* fallthrough */ case MEM_CANCEL_OFFLINE: mutex_unlock(&uksm_thread_mutex); break; } return NOTIFY_OK; } #endif /* CONFIG_MEMORY_HOTREMOVE */ #ifdef CONFIG_SYSFS /* * This all compiles without CONFIG_SYSFS, but is a waste of space. */ #define UKSM_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) #define UKSM_ATTR(_name) \ static struct kobj_attribute _name##_attr = \ __ATTR(_name, 0644, _name##_show, _name##_store) static ssize_t max_cpu_percentage_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "%u\n", uksm_max_cpu_percentage); } static ssize_t max_cpu_percentage_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned long max_cpu_percentage; int err; err = kstrtoul(buf, 10, &max_cpu_percentage); if (err || max_cpu_percentage > 100) return -EINVAL; if (max_cpu_percentage == 100) max_cpu_percentage = 99; else if (max_cpu_percentage < 10) max_cpu_percentage = 10; uksm_max_cpu_percentage = max_cpu_percentage; return count; } UKSM_ATTR(max_cpu_percentage); static ssize_t sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies)); } static ssize_t sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { unsigned long msecs; int err; err = kstrtoul(buf, 10, &msecs); if (err || msecs > MSEC_PER_SEC) return -EINVAL; uksm_sleep_jiffies = msecs_to_jiffies(msecs); uksm_sleep_saved = uksm_sleep_jiffies; return count; } UKSM_ATTR(sleep_millisecs); static ssize_t cpu_governor_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); int i; buf[0] = '\0'; for (i = 0; i < n ; i++) { if (uksm_cpu_governor == i) strcat(buf, "["); strcat(buf, uksm_cpu_governor_str[i]); if (uksm_cpu_governor == i) strcat(buf, "]"); strcat(buf, " "); } strcat(buf, "\n"); return strlen(buf); } static inline void init_performance_values(void) { int i; struct scan_rung *rung; struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor; for (i = 0; i < SCAN_LADDER_SIZE; i++) { rung = uksm_scan_ladder + i; rung->cpu_ratio = preset->cpu_ratio[i]; rung->cover_msecs = preset->cover_msecs[i]; } uksm_max_cpu_percentage = preset->max_cpu; } static ssize_t cpu_governor_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); for (n--; n >=0 ; 
n--) { if (!strncmp(buf, uksm_cpu_governor_str[n], strlen(uksm_cpu_governor_str[n]))) break; } if (n < 0) return -EINVAL; else uksm_cpu_governor = n; init_performance_values(); return count; } UKSM_ATTR(cpu_governor); static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "%u\n", uksm_run); } static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long flags; err = kstrtoul(buf, 10, &flags); if (err || flags > UINT_MAX) return -EINVAL; if (flags > UKSM_RUN_MERGE) return -EINVAL; mutex_lock(&uksm_thread_mutex); if (uksm_run != flags) { uksm_run = flags; } mutex_unlock(&uksm_thread_mutex); if (flags & UKSM_RUN_MERGE) wake_up_interruptible(&uksm_thread_wait); return count; } UKSM_ATTR(run); static ssize_t abundant_threshold_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "%u\n", uksm_abundant_threshold); } static ssize_t abundant_threshold_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long flags; err = kstrtoul(buf, 10, &flags); if (err || flags > 99) return -EINVAL; uksm_abundant_threshold = flags; return count; } UKSM_ATTR(abundant_threshold); static ssize_t thrash_threshold_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "%u\n", uksm_thrash_threshold); } static ssize_t thrash_threshold_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int err; unsigned long flags; err = kstrtoul(buf, 10, &flags); if (err || flags > 99) return -EINVAL; uksm_thrash_threshold = flags; return count; } UKSM_ATTR(thrash_threshold); static ssize_t cpu_ratios_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int i, size; struct scan_rung *rung; char *p = buf; for (i = 0; i < SCAN_LADDER_SIZE; i++) { rung = &uksm_scan_ladder[i]; if (rung->cpu_ratio > 0) size = sprintf(p, "%d ", rung->cpu_ratio); else size = sprintf(p, "MAX/%d ", TIME_RATIO_SCALE / -rung->cpu_ratio); p += size; } *p++ = '\n'; *p = '\0'; return p - buf; } static ssize_t cpu_ratios_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int i, cpuratios[SCAN_LADDER_SIZE], err; unsigned long value; struct scan_rung *rung; char *p, *end = NULL; p = kzalloc(count, GFP_KERNEL); if (!p) return -ENOMEM; memcpy(p, buf, count); for (i = 0; i < SCAN_LADDER_SIZE; i++) { if (i != SCAN_LADDER_SIZE -1) { end = strchr(p, ' '); if (!end) return -EINVAL; *end = '\0'; } if (strstr(p, "MAX/")) { p = strchr(p, '/') + 1; err = kstrtoul(p, 10, &value); if (err || value > TIME_RATIO_SCALE || !value) return -EINVAL; cpuratios[i] = - (int) (TIME_RATIO_SCALE / value); } else { err = kstrtoul(p, 10, &value); if (err || value > TIME_RATIO_SCALE || !value) return -EINVAL; cpuratios[i] = value; } p = end + 1; } for (i = 0; i < SCAN_LADDER_SIZE; i++) { rung = &uksm_scan_ladder[i]; rung->cpu_ratio = cpuratios[i]; } return count; } UKSM_ATTR(cpu_ratios); static ssize_t eval_intervals_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int i, size; struct scan_rung *rung; char *p = buf; for (i = 0; i < SCAN_LADDER_SIZE; i++) { rung = &uksm_scan_ladder[i]; size = sprintf(p, "%u ", rung->cover_msecs); p += size; } *p++ = '\n'; *p = '\0'; return p - buf; } static ssize_t eval_intervals_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { int i, err; unsigned long 
static ssize_t eval_intervals_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	int i, size;
	struct scan_rung *rung;
	char *p = buf;

	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
		rung = &uksm_scan_ladder[i];
		size = sprintf(p, "%u ", rung->cover_msecs);
		p += size;
	}

	*p++ = '\n';
	*p = '\0';

	return p - buf;
}

static ssize_t eval_intervals_store(struct kobject *kobj,
				    struct kobj_attribute *attr,
				    const char *buf, size_t count)
{
	int i, err;
	unsigned long values[SCAN_LADDER_SIZE];
	struct scan_rung *rung;
	char *str, *p, *end = NULL;
	ssize_t ret = count;

	/* As in cpu_ratios_store(): parse a terminated copy, free str. */
	str = kzalloc(count + 2, GFP_KERNEL);
	if (!str)
		return -ENOMEM;

	memcpy(str, buf, count);
	p = str;

	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
		if (i != SCAN_LADDER_SIZE - 1) {
			end = strchr(p, ' ');
			if (!end) {
				ret = -EINVAL;
				goto out;
			}

			*end = '\0';
		}

		err = kstrtoul(p, 10, &values[i]);
		if (err) {
			ret = -EINVAL;
			goto out;
		}

		p = end + 1;
	}

	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
		rung = &uksm_scan_ladder[i];
		rung->cover_msecs = values[i];
	}

out:
	kfree(str);
	return ret;
}
UKSM_ATTR(eval_intervals);

static ssize_t ema_per_page_time_show(struct kobject *kobj,
				      struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", uksm_ema_page_time);
}
UKSM_ATTR_RO(ema_per_page_time);

static ssize_t pages_shared_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", uksm_pages_shared);
}
UKSM_ATTR_RO(pages_shared);

static ssize_t pages_sharing_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", uksm_pages_sharing);
}
UKSM_ATTR_RO(pages_sharing);

static ssize_t pages_unshared_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", uksm_pages_unshared);
}
UKSM_ATTR_RO(pages_unshared);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%llu\n", fully_scanned_round);
}
UKSM_ATTR_RO(full_scans);

static ssize_t pages_scanned_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	unsigned long base = 0;
	u64 delta, ret;

	if (pages_scanned_stored) {
		base = pages_scanned_base;
		ret = pages_scanned_stored;
		delta = uksm_pages_scanned >> base;
		if (CAN_OVERFLOW_U64(ret, delta)) {
			ret >>= 1;
			delta >>= 1;
			base++;
		}
		ret += delta;
	} else {
		ret = uksm_pages_scanned;
	}

	while (ret > ULONG_MAX) {
		ret >>= 1;
		base++;
	}

	/* Report either a plain count or "mantissa * 2^base". */
	if (base)
		return sprintf(buf, "%lu * 2^%lu\n",
			       (unsigned long) ret, base);
	else
		return sprintf(buf, "%lu\n", (unsigned long) ret);
}
UKSM_ATTR_RO(pages_scanned);

static ssize_t hash_strength_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", hash_strength);
}
UKSM_ATTR_RO(hash_strength);

static ssize_t sleep_times_show(struct kobject *kobj,
				struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%llu\n", uksm_sleep_times);
}
UKSM_ATTR_RO(sleep_times);

static struct attribute *uksm_attrs[] = {
	&max_cpu_percentage_attr.attr,
	&sleep_millisecs_attr.attr,
	&cpu_governor_attr.attr,
	&run_attr.attr,
	&ema_per_page_time_attr.attr,
	&pages_shared_attr.attr,
	&pages_sharing_attr.attr,
	&pages_unshared_attr.attr,
	&full_scans_attr.attr,
	&pages_scanned_attr.attr,
	&hash_strength_attr.attr,
	&sleep_times_attr.attr,
	&thrash_threshold_attr.attr,
	&abundant_threshold_attr.attr,
	&cpu_ratios_attr.attr,
	&eval_intervals_attr.attr,
	NULL,
};

static struct attribute_group uksm_attr_group = {
	.attrs = uksm_attrs,
	.name = "uksm",
};
#endif /* CONFIG_SYSFS */
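/*
 * The calibration below measures, at startup, how expensive a full page
 * memcmp is relative to one unit of hash strength:
 *
 *	memcmp_cost = HASH_STRENGTH_FULL * t_memcmp / t_hash
 *
 * where t_hash is the time spent hashing a page loopnum times at
 * HASH_STRENGTH_FULL and t_memcmp the time spent running
 * pages_identical() the same number of times. For example, if the
 * memcmp loop took twice as long as the hash loop, memcmp_cost would
 * come out as 2 * HASH_STRENGTH_FULL, i.e. one comparison costs about
 * as much as hashing two full pages.
 */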
static inline void init_scan_ladder(void)
{
	int i;
	struct scan_rung *rung;

	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
		rung = uksm_scan_ladder + i;
		slot_tree_init_root(&rung->vma_root);
	}

	init_performance_values();
	uksm_calc_scan_pages();
}

static inline int cal_positive_negative_costs(void)
{
	struct page *p1, *p2;
	unsigned char *addr1, *addr2;
	unsigned long i, time_start, hash_cost;
	unsigned long loopnum = 0;

	/*
	 * IMPORTANT: volatile is needed to prevent over-optimization by
	 * gcc.
	 */
	volatile u32 hash;
	volatile int ret;

	p1 = alloc_page(GFP_KERNEL);
	if (!p1)
		return -ENOMEM;

	p2 = alloc_page(GFP_KERNEL);
	if (!p2) {
		__free_page(p1);
		return -ENOMEM;
	}

	addr1 = kmap_atomic(p1);
	addr2 = kmap_atomic(p2);
	memset(addr1, prandom_u32(), PAGE_SIZE);
	memcpy(addr2, addr1, PAGE_SIZE);

	/* make sure that the two pages differ in last byte */
	addr2[PAGE_SIZE - 1] = ~addr2[PAGE_SIZE - 1];
	kunmap_atomic(addr2);
	kunmap_atomic(addr1);

	time_start = jiffies;
	while (jiffies - time_start < 100) {
		for (i = 0; i < 100; i++)
			hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
		loopnum += 100;
	}
	hash_cost = (jiffies - time_start);

	time_start = jiffies;
	for (i = 0; i < loopnum; i++)
		ret = pages_identical(p1, p2);

	memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
	memcmp_cost /= hash_cost;
	printk(KERN_INFO "UKSM: relative memcmp_cost = %lu hash=%u cmp_ret=%d.\n",
	       memcmp_cost, hash, ret);

	__free_page(p1);
	__free_page(p2);
	return 0;
}

static int init_zeropage_hash_table(void)
{
	struct page *page;
	char *addr;
	int i;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	addr = kmap_atomic(page);
	memset(addr, 0, PAGE_SIZE);
	kunmap_atomic(addr);

	zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32),
				  GFP_KERNEL);
	if (!zero_hash_table) {
		__free_page(page);
		return -ENOMEM;
	}

	for (i = 0; i < HASH_STRENGTH_MAX; i++)
		zero_hash_table[i] = page_hash(page, i, 0);

	__free_page(page);

	return 0;
}

static inline int init_random_sampling(void)
{
	unsigned long i;

	random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!random_nums)
		return -ENOMEM;

	for (i = 0; i < HASH_STRENGTH_FULL; i++)
		random_nums[i] = i;

	/* Fisher-Yates shuffle of the sampling offsets. */
	for (i = 0; i < HASH_STRENGTH_FULL; i++) {
		unsigned long rand_range, swap_index, tmp;

		rand_range = HASH_STRENGTH_FULL - i;
		swap_index = i + prandom_u32() % rand_range;
		tmp = random_nums[i];
		random_nums[i] = random_nums[swap_index];
		random_nums[swap_index] = tmp;
	}

	rshash_state.state = RSHASH_NEW;
	rshash_state.below_count = 0;
	rshash_state.lookup_window_index = 0;

	return cal_positive_negative_costs();
}

static int __init uksm_slab_init(void)
{
	rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0);
	if (!rmap_item_cache)
		goto out;

	stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0);
	if (!stable_node_cache)
		goto out_free1;

	node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0);
	if (!node_vma_cache)
		goto out_free2;

	vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0);
	if (!vma_slot_cache)
		goto out_free3;

	tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0);
	if (!tree_node_cache)
		goto out_free4;

	return 0;

out_free4:
	kmem_cache_destroy(vma_slot_cache);
out_free3:
	kmem_cache_destroy(node_vma_cache);
out_free2:
	kmem_cache_destroy(stable_node_cache);
out_free1:
	kmem_cache_destroy(rmap_item_cache);
out:
	return -ENOMEM;
}

static void __init uksm_slab_free(void)
{
	kmem_cache_destroy(stable_node_cache);
	kmem_cache_destroy(rmap_item_cache);
	kmem_cache_destroy(node_vma_cache);
	kmem_cache_destroy(vma_slot_cache);
	kmem_cache_destroy(tree_node_cache);
}

/* Common interface to ksm, different to it. */
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
		unsigned long end, int advice, unsigned long *vm_flags)
{
	int err;

	switch (advice) {
	case MADV_MERGEABLE:
		return 0;		/* just ignore the advice */

	case MADV_UNMERGEABLE:
		if (!(*vm_flags & VM_MERGEABLE))
			return 0;	/* just ignore the advice */

		if (vma->anon_vma) {
			err = unmerge_uksm_pages(vma, start, end);
			if (err)
				return err;
		}

		uksm_remove_vma(vma);
		*vm_flags &= ~VM_MERGEABLE;
		break;
	}

	return 0;
}
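/*
 * Userspace continues to use the plain madvise(2) interface it would
 * use with mainline KSM, e.g.:
 *
 *	madvise(addr, length, MADV_UNMERGEABLE);
 *
 * Under UKSM, MADV_MERGEABLE is accepted but ignored, since all
 * anonymous VMAs are scanned automatically; only MADV_UNMERGEABLE has
 * an effect (unmerging the range and clearing VM_MERGEABLE).
 */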
/* Common interface to ksm, actually the same. */
struct page *ksm_might_need_to_copy(struct page *page,
			struct vm_area_struct *vma, unsigned long address)
{
	struct anon_vma *anon_vma = page_anon_vma(page);
	struct page *new_page;

	if (PageKsm(page)) {
		if (page_stable_node(page))
			return page;	/* no need to copy it */
	} else if (!anon_vma) {
		return page;		/* no need to copy it */
	} else if (anon_vma->root == vma->anon_vma->root &&
		   page->index == linear_page_index(vma, address)) {
		return page;		/* still no need to copy it */
	}
	if (!PageUptodate(page))
		return page;		/* let do_swap_page report the error */

	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
	if (new_page) {
		copy_user_highpage(new_page, page, address, vma);

		SetPageDirty(new_page);
		__SetPageUptodate(new_page);
		__SetPageLocked(new_page);
	}

	return new_page;
}

static int __init uksm_init(void)
{
	struct task_struct *uksm_thread;
	int err;

	uksm_sleep_jiffies = msecs_to_jiffies(100);
	uksm_sleep_saved = uksm_sleep_jiffies;

	slot_tree_init();
	init_scan_ladder();

	err = init_random_sampling();
	if (err)
		goto out_free2;

	err = uksm_slab_init();
	if (err)
		goto out_free1;

	err = init_zeropage_hash_table();
	if (err)
		goto out_free0;

	uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd");
	if (IS_ERR(uksm_thread)) {
		printk(KERN_ERR "uksm: creating kthread failed\n");
		err = PTR_ERR(uksm_thread);
		goto out_free;
	}

#ifdef CONFIG_SYSFS
	err = sysfs_create_group(mm_kobj, &uksm_attr_group);
	if (err) {
		printk(KERN_ERR "uksm: register sysfs failed\n");
		kthread_stop(uksm_thread);
		goto out_free;
	}
#else
	uksm_run = UKSM_RUN_MERGE;	/* no way for user to start it */
#endif /* CONFIG_SYSFS */

#ifdef CONFIG_MEMORY_HOTREMOVE
	/*
	 * Choose a high priority since the callback takes uksm_thread_mutex:
	 * later callbacks could only be taking locks which nest within that.
	 */
	hotplug_memory_notifier(uksm_memory_callback, 100);
#endif
	return 0;

out_free:
	kfree(zero_hash_table);
out_free0:
	uksm_slab_free();
out_free1:
	kfree(random_nums);
out_free2:
	/* uksm_scan_ladder is never allocated here: nothing to free. */
	return err;
}

#ifdef MODULE
subsys_initcall(uksm_init);
#else
late_initcall(uksm_init);
#endif