├── README.md ├── v4.x ├── uksm-4.0.patch ├── uksm-4.1.patch ├── uksm-4.10.patch ├── uksm-4.11.patch ├── uksm-4.12.patch ├── uksm-4.13.patch ├── uksm-4.14.patch ├── uksm-4.15.patch ├── uksm-4.16.patch ├── uksm-4.17.patch ├── uksm-4.18.patch ├── uksm-4.19.patch ├── uksm-4.2.patch ├── uksm-4.20.patch ├── uksm-4.3.patch ├── uksm-4.4.patch ├── uksm-4.5.patch ├── uksm-4.6.patch ├── uksm-4.7.patch ├── uksm-4.8.patch └── uksm-4.9.patch └── v5.x ├── uksm-5.0.patch ├── uksm-5.1.patch ├── uksm-5.10.patch ├── uksm-5.11.patch ├── uksm-5.12.patch ├── uksm-5.13.patch ├── uksm-5.14.patch ├── uksm-5.15.patch ├── uksm-5.16.patch ├── uksm-5.17.patch ├── uksm-5.2.patch ├── uksm-5.3.patch ├── uksm-5.4.patch ├── uksm-5.5.patch ├── uksm-5.6.patch ├── uksm-5.7.patch ├── uksm-5.8.patch └── uksm-5.9.patch /README.md: -------------------------------------------------------------------------------- 1 | # UKSM 2 | ---------------------------------------------------- 3 | The patches in this repo are the latest UKSM patches. 4 | 5 | The current release number is 0.1.2.6. 6 | 7 | This release includes two bug fixes from Huawei; many thanks to their engineers and especially to @colo-ft, who submitted the patches. 8 | 9 | The changelog for all versions is in Documentation/vm/uksm.txt. 10 | 11 | # What is it? 12 | 13 | The Ultra Kernel Samepage Merging feature 14 | ---------------------------------------------- 15 | 16 | Ultra KSM. Copyright (C) 2011-2016 Nai Xia 17 | 18 | This is an improvement upon KSM. Some basic data structures and routines 19 | are borrowed from ksm.c. 20 | 21 | Its new features: 22 | 23 | 1. Full system scan: 24 | It automatically scans all user processes' anonymous VMAs. Kernel-user 25 | interaction to submit a memory area to KSM is no longer needed. 26 | 27 | 2. Rich area detection: 28 | It automatically detects rich areas containing abundant duplicated 29 | pages. Rich areas are given full scan speed. Poor areas are 30 | sampled at a reasonable speed with very low CPU consumption. 31 | 32 | 3. Ultra per-page scan speed improvement: 33 | A new hash algorithm is proposed. As a result, on a machine with a 34 | Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it 35 | can scan memory areas that do not contain duplicated pages at a speed of 36 | 627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of 37 | 477MB/sec ~ 923MB/sec. 38 | 39 | 4. Thrashing area avoidance: 40 | Thrashing areas (VMAs with frequent KSM page break-outs) can be 41 | filtered out. My benchmark shows this is more efficient than KSM's per-page 42 | hash-value-based volatile page detection. 43 | 44 | 45 | 5. Misc changes upon KSM: 46 | * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page 47 | comparison. It's much faster than the default C version on x86. 48 | * rmap_item now has a struct page * member to loosely cache an 49 | address-->page mapping, which avoids many time-costly calls to 50 | follow_page(). 51 | * The VMA creation/exit procedures are hooked to let Ultra KSM know about them. 52 | * try_to_merge_two_pages() can now revert a pte if it fails, so no 53 | break_ksm is needed in this case. 54 | 55 | 6. Full zero page consideration (contributed by Figo Zhang): 56 | uksmd now treats full zero pages as special pages and merges them into a 57 | special unswappable uksm zero page. 58 | 59 | # Credits 60 | 61 | Ultra KSM. 
Copyright (C) 2011-2016 Nai Xia 62 | 63 | 64 | # Reference 65 | 66 | [FAST '18] UKSM: Swift Memory Deduplication via Hierarchical and Adaptive Memory Region Distilling [[PDF](https://www.usenix.org/system/files/conference/fast18/fast18-xia.pdf)] [[Slides](https://www.usenix.org/sites/default/files/conference/protected-files/fast18_slides_xia.pdf)] 67 | -------------------------------------------------------------------------------- /v4.x/uksm-4.7.patch: -------------------------------------------------------------------------------- 1 | diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX 2 | index 6a5e2a1..09eaa9a1 100644 3 | --- a/Documentation/vm/00-INDEX 4 | +++ b/Documentation/vm/00-INDEX 5 | @@ -18,6 +18,8 @@ idle_page_tracking.txt 6 | - description of the idle page tracking feature. 7 | ksm.txt 8 | - how to use the Kernel Samepage Merging feature. 9 | +uksm.txt 10 | + - Introduction to Ultra KSM 11 | numa 12 | - information about NUMA specific code in the Linux vm. 13 | numa_memory_policy.txt 14 | diff --git a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt 15 | new file mode 100644 16 | index 0000000..b7a110f 17 | --- /dev/null 18 | +++ b/Documentation/vm/uksm.txt 19 | @@ -0,0 +1,61 @@ 20 | +The Ultra Kernel Samepage Merging feature 21 | +---------------------------------------------- 22 | +/* 23 | + * Ultra KSM. Copyright (C) 2011-2012 Nai Xia 24 | + * 25 | + * This is an improvement upon KSM. Some basic data structures and routines 26 | + * are borrowed from ksm.c . 27 | + * 28 | + * Its new features: 29 | + * 1. Full system scan: 30 | + * It automatically scans all user processes' anonymous VMAs. Kernel-user 31 | + * interaction to submit a memory area to KSM is no longer needed. 32 | + * 33 | + * 2. Rich area detection: 34 | + * It automatically detects rich areas containing abundant duplicated 35 | + * pages based. Rich areas are given a full scan speed. Poor areas are 36 | + * sampled at a reasonable speed with very low CPU consumption. 37 | + * 38 | + * 3. Ultra Per-page scan speed improvement: 39 | + * A new hash algorithm is proposed. As a result, on a machine with 40 | + * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it 41 | + * can scan memory areas that does not contain duplicated pages at speed of 42 | + * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of 43 | + * 477MB/sec ~ 923MB/sec. 44 | + * 45 | + * 4. Thrashing area avoidance: 46 | + * Thrashing area(an VMA that has frequent Ksm page break-out) can be 47 | + * filtered out. My benchmark shows it's more efficient than KSM's per-page 48 | + * hash value based volatile page detection. 49 | + * 50 | + * 51 | + * 5. Misc changes upon KSM: 52 | + * * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page 53 | + * comparison. It's much faster than default C version on x86. 54 | + * * rmap_item now has an struct *page member to loosely cache a 55 | + * address-->page mapping, which reduces too much time-costly 56 | + * follow_page(). 57 | + * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. 58 | + * * try_to_merge_two_pages() now can revert a pte if it fails. No break_ 59 | + * ksm is needed for this case. 60 | + * 61 | + * 6. Full Zero Page consideration(contributed by Figo Zhang) 62 | + * Now uksmd consider full zero pages as special pages and merge them to an 63 | + * special unswappable uksm zero page. 
64 | + */ 65 | + 66 | +ChangeLog: 67 | + 68 | +2012-05-05 The creation of this Doc 69 | +2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up. 70 | +2012-05-28 UKSM 0.1.1.2 bug fix release 71 | +2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2 72 | +2012-07-2 UKSM 0.1.2-beta2 73 | +2012-07-10 UKSM 0.1.2-beta3 74 | +2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization. 75 | +2012-10-13 UKSM 0.1.2.1 Bug fixes. 76 | +2012-12-31 UKSM 0.1.2.2 Minor bug fixes. 77 | +2014-07-02 UKSM 0.1.2.3 Fix a " __this_cpu_read() in preemptible bug". 78 | +2015-04-22 UKSM 0.1.2.4 Fix a race condition that can sometimes trigger anonying warnings. 79 | +2016-09-10 UKSM 0.1.2.5 Fix a bug in dedup ratio calculation. 80 | +2017-02-26 UKSM 0.1.2.6 Fix a bug in hugetlbpage handling and a race bug with page migration. 81 | diff --git a/fs/exec.c b/fs/exec.c 82 | index 887c1c9..2bee16e 100644 83 | --- a/fs/exec.c 84 | +++ b/fs/exec.c 85 | @@ -19,7 +19,7 @@ 86 | * current->executable is only used by the procfs. This allows a dispatch 87 | * table to check for several different types of binary formats. We keep 88 | * trying until we recognize the file or we run out of supported binary 89 | - * formats. 90 | + * formats. 91 | */ 92 | 93 | #include 94 | @@ -57,6 +57,7 @@ 95 | #include 96 | #include 97 | #include 98 | +#include 99 | 100 | #include 101 | #include 102 | diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c 103 | index 8372046..82aa2f4 100644 104 | --- a/fs/proc/meminfo.c 105 | +++ b/fs/proc/meminfo.c 106 | @@ -89,6 +89,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) 107 | "SUnreclaim: %8lu kB\n" 108 | "KernelStack: %8lu kB\n" 109 | "PageTables: %8lu kB\n" 110 | +#ifdef CONFIG_UKSM 111 | + "KsmZeroPages: %8lu kB\n" 112 | +#endif 113 | #ifdef CONFIG_QUICKLIST 114 | "Quicklists: %8lu kB\n" 115 | #endif 116 | @@ -147,6 +150,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) 117 | K(global_page_state(NR_SLAB_UNRECLAIMABLE)), 118 | global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024, 119 | K(global_page_state(NR_PAGETABLE)), 120 | +#ifdef CONFIG_UKSM 121 | + K(global_page_state(NR_UKSM_ZERO_PAGES)), 122 | +#endif 123 | #ifdef CONFIG_QUICKLIST 124 | K(quicklist_total_size()), 125 | #endif 126 | diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h 127 | index d4458b6..172ceb9 100644 128 | --- a/include/asm-generic/pgtable.h 129 | +++ b/include/asm-generic/pgtable.h 130 | @@ -601,12 +601,25 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, 131 | extern void untrack_pfn_moved(struct vm_area_struct *vma); 132 | #endif 133 | 134 | +#ifdef CONFIG_UKSM 135 | +static inline int is_uksm_zero_pfn(unsigned long pfn) 136 | +{ 137 | + extern unsigned long uksm_zero_pfn; 138 | + return pfn == uksm_zero_pfn; 139 | +} 140 | +#else 141 | +static inline int is_uksm_zero_pfn(unsigned long pfn) 142 | +{ 143 | + return 0; 144 | +} 145 | +#endif 146 | + 147 | #ifdef __HAVE_COLOR_ZERO_PAGE 148 | static inline int is_zero_pfn(unsigned long pfn) 149 | { 150 | extern unsigned long zero_pfn; 151 | unsigned long offset_from_zero_pfn = pfn - zero_pfn; 152 | - return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); 153 | + return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn); 154 | } 155 | 156 | #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) 157 | @@ -615,7 +628,7 @@ static inline int is_zero_pfn(unsigned long pfn) 158 | static inline int is_zero_pfn(unsigned long pfn) 159 | { 
160 | extern unsigned long zero_pfn; 161 | - return pfn == zero_pfn; 162 | + return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn)); 163 | } 164 | 165 | static inline unsigned long my_zero_pfn(unsigned long addr) 166 | diff --git a/include/linux/ksm.h b/include/linux/ksm.h 167 | index 7ae216a..06861d8 100644 168 | --- a/include/linux/ksm.h 169 | +++ b/include/linux/ksm.h 170 | @@ -19,21 +19,6 @@ struct mem_cgroup; 171 | #ifdef CONFIG_KSM 172 | int ksm_madvise(struct vm_area_struct *vma, unsigned long start, 173 | unsigned long end, int advice, unsigned long *vm_flags); 174 | -int __ksm_enter(struct mm_struct *mm); 175 | -void __ksm_exit(struct mm_struct *mm); 176 | - 177 | -static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) 178 | -{ 179 | - if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) 180 | - return __ksm_enter(mm); 181 | - return 0; 182 | -} 183 | - 184 | -static inline void ksm_exit(struct mm_struct *mm) 185 | -{ 186 | - if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) 187 | - __ksm_exit(mm); 188 | -} 189 | 190 | static inline struct stable_node *page_stable_node(struct page *page) 191 | { 192 | @@ -64,6 +49,33 @@ struct page *ksm_might_need_to_copy(struct page *page, 193 | int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); 194 | void ksm_migrate_page(struct page *newpage, struct page *oldpage); 195 | 196 | +#ifdef CONFIG_KSM_LEGACY 197 | +int __ksm_enter(struct mm_struct *mm); 198 | +void __ksm_exit(struct mm_struct *mm); 199 | +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) 200 | +{ 201 | + if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) 202 | + return __ksm_enter(mm); 203 | + return 0; 204 | +} 205 | + 206 | +static inline void ksm_exit(struct mm_struct *mm) 207 | +{ 208 | + if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) 209 | + __ksm_exit(mm); 210 | +} 211 | + 212 | +#elif defined(CONFIG_UKSM) 213 | +static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) 214 | +{ 215 | + return 0; 216 | +} 217 | + 218 | +static inline void ksm_exit(struct mm_struct *mm) 219 | +{ 220 | +} 221 | +#endif /* !CONFIG_UKSM */ 222 | + 223 | #else /* !CONFIG_KSM */ 224 | 225 | static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) 226 | @@ -106,4 +118,6 @@ static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) 227 | #endif /* CONFIG_MMU */ 228 | #endif /* !CONFIG_KSM */ 229 | 230 | +#include 231 | + 232 | #endif /* __LINUX_KSM_H */ 233 | diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h 234 | index ca3e517..ae62e7d1 100644 235 | --- a/include/linux/mm_types.h 236 | +++ b/include/linux/mm_types.h 237 | @@ -357,6 +357,9 @@ struct vm_area_struct { 238 | struct mempolicy *vm_policy; /* NUMA policy for the VMA */ 239 | #endif 240 | struct vm_userfaultfd_ctx vm_userfaultfd_ctx; 241 | +#ifdef CONFIG_UKSM 242 | + struct vma_slot *uksm_vma_slot; 243 | +#endif 244 | }; 245 | 246 | struct core_thread { 247 | diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h 248 | index 02069c2..f7cce50 100644 249 | --- a/include/linux/mmzone.h 250 | +++ b/include/linux/mmzone.h 251 | @@ -153,6 +153,9 @@ enum zone_stat_item { 252 | WORKINGSET_NODERECLAIM, 253 | NR_ANON_TRANSPARENT_HUGEPAGES, 254 | NR_FREE_CMA_PAGES, 255 | +#ifdef CONFIG_UKSM 256 | + NR_UKSM_ZERO_PAGES, 257 | +#endif 258 | NR_VM_ZONE_STAT_ITEMS }; 259 | 260 | /* 261 | @@ -817,7 +820,7 @@ static inline int is_highmem_idx(enum zone_type idx) 262 | } 263 | 264 | /** 265 | - * is_highmem - helper function to quickly check if a 
struct zone is a 266 | + * is_highmem - helper function to quickly check if a struct zone is a 267 | * highmem zone or not. This is an attempt to keep references 268 | * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. 269 | * @zone - pointer to struct zone variable 270 | diff --git a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h 271 | new file mode 100644 272 | index 0000000..6780fdb 273 | --- /dev/null 274 | +++ b/include/linux/sradix-tree.h 275 | @@ -0,0 +1,77 @@ 276 | +#ifndef _LINUX_SRADIX_TREE_H 277 | +#define _LINUX_SRADIX_TREE_H 278 | + 279 | + 280 | +#define INIT_SRADIX_TREE(root, mask) \ 281 | +do { \ 282 | + (root)->height = 0; \ 283 | + (root)->gfp_mask = (mask); \ 284 | + (root)->rnode = NULL; \ 285 | +} while (0) 286 | + 287 | +#define ULONG_BITS (sizeof(unsigned long) * 8) 288 | +#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) 289 | +//#define SRADIX_TREE_MAP_SHIFT 6 290 | +//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT) 291 | +//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1) 292 | + 293 | +struct sradix_tree_node { 294 | + unsigned int height; /* Height from the bottom */ 295 | + unsigned int count; 296 | + unsigned int fulls; /* Number of full sublevel trees */ 297 | + struct sradix_tree_node *parent; 298 | + void *stores[0]; 299 | +}; 300 | + 301 | +/* A simple radix tree implementation */ 302 | +struct sradix_tree_root { 303 | + unsigned int height; 304 | + struct sradix_tree_node *rnode; 305 | + 306 | + /* Where found to have available empty stores in its sublevels */ 307 | + struct sradix_tree_node *enter_node; 308 | + unsigned int shift; 309 | + unsigned int stores_size; 310 | + unsigned int mask; 311 | + unsigned long min; /* The first hole index */ 312 | + unsigned long num; 313 | + //unsigned long *height_to_maxindex; 314 | + 315 | + /* How the node is allocated and freed. 
*/ 316 | + struct sradix_tree_node *(*alloc)(void); 317 | + void (*free)(struct sradix_tree_node *node); 318 | + 319 | + /* When a new node is added and removed */ 320 | + void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child); 321 | + void (*assign)(struct sradix_tree_node *node, unsigned index, void *item); 322 | + void (*rm)(struct sradix_tree_node *node, unsigned offset); 323 | +}; 324 | + 325 | +struct sradix_tree_path { 326 | + struct sradix_tree_node *node; 327 | + int offset; 328 | +}; 329 | + 330 | +static inline 331 | +void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift) 332 | +{ 333 | + root->height = 0; 334 | + root->rnode = NULL; 335 | + root->shift = shift; 336 | + root->stores_size = 1UL << shift; 337 | + root->mask = root->stores_size - 1; 338 | +} 339 | + 340 | + 341 | +extern void *sradix_tree_next(struct sradix_tree_root *root, 342 | + struct sradix_tree_node *node, unsigned long index, 343 | + int (*iter)(void *, unsigned long)); 344 | + 345 | +extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num); 346 | + 347 | +extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, 348 | + struct sradix_tree_node *node, unsigned long index); 349 | + 350 | +extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index); 351 | + 352 | +#endif /* _LINUX_SRADIX_TREE_H */ 353 | diff --git a/include/linux/uksm.h b/include/linux/uksm.h 354 | new file mode 100644 355 | index 0000000..825f05e 356 | --- /dev/null 357 | +++ b/include/linux/uksm.h 358 | @@ -0,0 +1,149 @@ 359 | +#ifndef __LINUX_UKSM_H 360 | +#define __LINUX_UKSM_H 361 | +/* 362 | + * Memory merging support. 363 | + * 364 | + * This code enables dynamic sharing of identical pages found in different 365 | + * memory areas, even if they are not shared by fork(). 366 | + */ 367 | + 368 | +/* if !CONFIG_UKSM this file should not be compiled at all. 
*/ 369 | +#ifdef CONFIG_UKSM 370 | + 371 | +#include 372 | +#include 373 | +#include 374 | +#include 375 | +#include 376 | + 377 | +extern unsigned long zero_pfn __read_mostly; 378 | +extern unsigned long uksm_zero_pfn __read_mostly; 379 | +extern struct page *empty_uksm_zero_page; 380 | + 381 | +/* must be done before linked to mm */ 382 | +extern void uksm_vma_add_new(struct vm_area_struct *vma); 383 | +extern void uksm_remove_vma(struct vm_area_struct *vma); 384 | + 385 | +#define UKSM_SLOT_NEED_SORT (1 << 0) 386 | +#define UKSM_SLOT_NEED_RERAND (1 << 1) 387 | +#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */ 388 | +#define UKSM_SLOT_FUL_SCANNED (1 << 3) 389 | +#define UKSM_SLOT_IN_UKSM (1 << 4) 390 | + 391 | +struct vma_slot { 392 | + struct sradix_tree_node *snode; 393 | + unsigned long sindex; 394 | + 395 | + struct list_head slot_list; 396 | + unsigned long fully_scanned_round; 397 | + unsigned long dedup_num; 398 | + unsigned long pages_scanned; 399 | + unsigned long this_sampled; 400 | + unsigned long last_scanned; 401 | + unsigned long pages_to_scan; 402 | + struct scan_rung *rung; 403 | + struct page **rmap_list_pool; 404 | + unsigned int *pool_counts; 405 | + unsigned long pool_size; 406 | + struct vm_area_struct *vma; 407 | + struct mm_struct *mm; 408 | + unsigned long ctime_j; 409 | + unsigned long pages; 410 | + unsigned long flags; 411 | + unsigned long pages_cowed; /* pages cowed this round */ 412 | + unsigned long pages_merged; /* pages merged this round */ 413 | + unsigned long pages_bemerged; 414 | + 415 | + /* when it has page merged in this eval round */ 416 | + struct list_head dedup_list; 417 | +}; 418 | + 419 | +static inline void uksm_unmap_zero_page(pte_t pte) 420 | +{ 421 | + if (pte_pfn(pte) == uksm_zero_pfn) 422 | + __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); 423 | +} 424 | + 425 | +static inline void uksm_map_zero_page(pte_t pte) 426 | +{ 427 | + if (pte_pfn(pte) == uksm_zero_pfn) 428 | + __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); 429 | +} 430 | + 431 | +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) 432 | +{ 433 | + if (vma->uksm_vma_slot && PageKsm(page)) 434 | + vma->uksm_vma_slot->pages_cowed++; 435 | +} 436 | + 437 | +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) 438 | +{ 439 | + if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn) 440 | + vma->uksm_vma_slot->pages_cowed++; 441 | +} 442 | + 443 | +static inline int uksm_flags_can_scan(unsigned long vm_flags) 444 | +{ 445 | +#ifdef VM_SAO 446 | + if (vm_flags & VM_SAO) 447 | + return 0; 448 | +#endif 449 | + 450 | + return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND | 451 | + VM_HUGETLB | VM_MIXEDMAP | VM_SHARED 452 | + | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN)); 453 | +} 454 | + 455 | +static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) 456 | +{ 457 | + if (uksm_flags_can_scan(*vm_flags_p)) 458 | + *vm_flags_p |= VM_MERGEABLE; 459 | +} 460 | + 461 | +/* 462 | + * Just a wrapper for BUG_ON for where ksm_zeropage must not be. TODO: it will 463 | + * be removed when uksm zero page patch is stable enough. 
464 | + */ 465 | +static inline void uksm_bugon_zeropage(pte_t pte) 466 | +{ 467 | + BUG_ON(pte_pfn(pte) == uksm_zero_pfn); 468 | +} 469 | +#else 470 | +static inline void uksm_vma_add_new(struct vm_area_struct *vma) 471 | +{ 472 | +} 473 | + 474 | +static inline void uksm_remove_vma(struct vm_area_struct *vma) 475 | +{ 476 | +} 477 | + 478 | +static inline void uksm_unmap_zero_page(pte_t pte) 479 | +{ 480 | +} 481 | + 482 | +static inline void uksm_map_zero_page(pte_t pte) 483 | +{ 484 | +} 485 | + 486 | +static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) 487 | +{ 488 | +} 489 | + 490 | +static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) 491 | +{ 492 | +} 493 | + 494 | +static inline int uksm_flags_can_scan(unsigned long vm_flags) 495 | +{ 496 | + return 0; 497 | +} 498 | + 499 | +static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) 500 | +{ 501 | +} 502 | + 503 | +static inline void uksm_bugon_zeropage(pte_t pte) 504 | +{ 505 | +} 506 | +#endif /* !CONFIG_UKSM */ 507 | +#endif /* __LINUX_UKSM_H */ 508 | diff --git a/kernel/fork.c b/kernel/fork.c 509 | index d6404ed..4ce26c0 100644 510 | --- a/kernel/fork.c 511 | +++ b/kernel/fork.c 512 | @@ -459,7 +459,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) 513 | goto fail_nomem; 514 | charge = len; 515 | } 516 | - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 517 | + tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 518 | if (!tmp) 519 | goto fail_nomem; 520 | *tmp = *mpnt; 521 | @@ -512,7 +512,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) 522 | __vma_link_rb(mm, tmp, rb_link, rb_parent); 523 | rb_link = &tmp->vm_rb.rb_right; 524 | rb_parent = &tmp->vm_rb; 525 | - 526 | + uksm_vma_add_new(tmp); 527 | mm->map_count++; 528 | retval = copy_page_range(mm, oldmm, mpnt); 529 | 530 | diff --git a/lib/Makefile b/lib/Makefile 531 | index ff6a7a6..ac0bb55 100644 532 | --- a/lib/Makefile 533 | +++ b/lib/Makefile 534 | @@ -20,7 +20,7 @@ KCOV_INSTRUMENT_dynamic_debug.o := n 535 | KCOV_INSTRUMENT_hweight.o := n 536 | 537 | lib-y := ctype.o string.o vsprintf.o cmdline.o \ 538 | - rbtree.o radix-tree.o dump_stack.o timerqueue.o\ 539 | + rbtree.o radix-tree.o sradix-tree.o dump_stack.o timerqueue.o\ 540 | idr.o int_sqrt.o extable.o \ 541 | sha1.o md5.o irq_regs.o argv_split.o \ 542 | flex_proportions.o ratelimit.o show_mem.o \ 543 | diff --git a/lib/sradix-tree.c b/lib/sradix-tree.c 544 | new file mode 100644 545 | index 0000000..8d06329 546 | --- /dev/null 547 | +++ b/lib/sradix-tree.c 548 | @@ -0,0 +1,476 @@ 549 | +#include 550 | +#include 551 | +#include 552 | +#include 553 | +#include 554 | +#include 555 | +#include 556 | + 557 | +static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node) 558 | +{ 559 | + return node->fulls == root->stores_size || 560 | + (node->height == 1 && node->count == root->stores_size); 561 | +} 562 | + 563 | +/* 564 | + * Extend a sradix tree so it can store key @index. 565 | + */ 566 | +static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index) 567 | +{ 568 | + struct sradix_tree_node *node; 569 | + unsigned int height; 570 | + 571 | + if (unlikely(root->rnode == NULL)) { 572 | + if (!(node = root->alloc())) 573 | + return -ENOMEM; 574 | + 575 | + node->height = 1; 576 | + root->rnode = node; 577 | + root->height = 1; 578 | + } 579 | + 580 | + /* Figure out what the height should be. 
*/ 581 | + height = root->height; 582 | + index >>= root->shift * height; 583 | + 584 | + while (index) { 585 | + index >>= root->shift; 586 | + height++; 587 | + } 588 | + 589 | + while (height > root->height) { 590 | + unsigned int newheight; 591 | + if (!(node = root->alloc())) 592 | + return -ENOMEM; 593 | + 594 | + /* Increase the height. */ 595 | + node->stores[0] = root->rnode; 596 | + root->rnode->parent = node; 597 | + if (root->extend) 598 | + root->extend(node, root->rnode); 599 | + 600 | + newheight = root->height + 1; 601 | + node->height = newheight; 602 | + node->count = 1; 603 | + if (sradix_node_full(root, root->rnode)) 604 | + node->fulls = 1; 605 | + 606 | + root->rnode = node; 607 | + root->height = newheight; 608 | + } 609 | + 610 | + return 0; 611 | +} 612 | + 613 | +/* 614 | + * Search the next item from the current node, that is not NULL 615 | + * and can satify root->iter(). 616 | + */ 617 | +void *sradix_tree_next(struct sradix_tree_root *root, 618 | + struct sradix_tree_node *node, unsigned long index, 619 | + int (*iter)(void *item, unsigned long height)) 620 | +{ 621 | + unsigned long offset; 622 | + void *item; 623 | + 624 | + if (unlikely(node == NULL)) { 625 | + node = root->rnode; 626 | + for (offset = 0; offset < root->stores_size; offset++) { 627 | + item = node->stores[offset]; 628 | + if (item && (!iter || iter(item, node->height))) 629 | + break; 630 | + } 631 | + 632 | + if (unlikely(offset >= root->stores_size)) 633 | + return NULL; 634 | + 635 | + if (node->height == 1) 636 | + return item; 637 | + else 638 | + goto go_down; 639 | + } 640 | + 641 | + while (node) { 642 | + offset = (index & root->mask) + 1; 643 | + for (;offset < root->stores_size; offset++) { 644 | + item = node->stores[offset]; 645 | + if (item && (!iter || iter(item, node->height))) 646 | + break; 647 | + } 648 | + 649 | + if (offset < root->stores_size) 650 | + break; 651 | + 652 | + node = node->parent; 653 | + index >>= root->shift; 654 | + } 655 | + 656 | + if (!node) 657 | + return NULL; 658 | + 659 | + while (node->height > 1) { 660 | +go_down: 661 | + node = item; 662 | + for (offset = 0; offset < root->stores_size; offset++) { 663 | + item = node->stores[offset]; 664 | + if (item && (!iter || iter(item, node->height))) 665 | + break; 666 | + } 667 | + 668 | + if (unlikely(offset >= root->stores_size)) 669 | + return NULL; 670 | + } 671 | + 672 | + BUG_ON(offset > root->stores_size); 673 | + 674 | + return item; 675 | +} 676 | + 677 | +/* 678 | + * Blindly insert the item to the tree. Typically, we reuse the 679 | + * first empty store item. 
680 | + */ 681 | +int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num) 682 | +{ 683 | + unsigned long index; 684 | + unsigned int height; 685 | + struct sradix_tree_node *node, *tmp = NULL; 686 | + int offset, offset_saved; 687 | + void **store = NULL; 688 | + int error, i, j, shift; 689 | + 690 | +go_on: 691 | + index = root->min; 692 | + 693 | + if (root->enter_node && !sradix_node_full(root, root->enter_node)) { 694 | + node = root->enter_node; 695 | + BUG_ON((index >> (root->shift * root->height))); 696 | + } else { 697 | + node = root->rnode; 698 | + if (node == NULL || (index >> (root->shift * root->height)) 699 | + || sradix_node_full(root, node)) { 700 | + error = sradix_tree_extend(root, index); 701 | + if (error) 702 | + return error; 703 | + 704 | + node = root->rnode; 705 | + } 706 | + } 707 | + 708 | + 709 | + height = node->height; 710 | + shift = (height - 1) * root->shift; 711 | + offset = (index >> shift) & root->mask; 712 | + while (shift > 0) { 713 | + offset_saved = offset; 714 | + for (; offset < root->stores_size; offset++) { 715 | + store = &node->stores[offset]; 716 | + tmp = *store; 717 | + 718 | + if (!tmp || !sradix_node_full(root, tmp)) 719 | + break; 720 | + } 721 | + BUG_ON(offset >= root->stores_size); 722 | + 723 | + if (offset != offset_saved) { 724 | + index += (offset - offset_saved) << shift; 725 | + index &= ~((1UL << shift) - 1); 726 | + } 727 | + 728 | + if (!tmp) { 729 | + if (!(tmp = root->alloc())) 730 | + return -ENOMEM; 731 | + 732 | + tmp->height = shift / root->shift; 733 | + *store = tmp; 734 | + tmp->parent = node; 735 | + node->count++; 736 | +// if (root->extend) 737 | +// root->extend(node, tmp); 738 | + } 739 | + 740 | + node = tmp; 741 | + shift -= root->shift; 742 | + offset = (index >> shift) & root->mask; 743 | + } 744 | + 745 | + BUG_ON(node->height != 1); 746 | + 747 | + 748 | + store = &node->stores[offset]; 749 | + for (i = 0, j = 0; 750 | + j < root->stores_size - node->count && 751 | + i < root->stores_size - offset && j < num; i++) { 752 | + if (!store[i]) { 753 | + store[i] = item[j]; 754 | + if (root->assign) 755 | + root->assign(node, index + i, item[j]); 756 | + j++; 757 | + } 758 | + } 759 | + 760 | + node->count += j; 761 | + root->num += j; 762 | + num -= j; 763 | + 764 | + while (sradix_node_full(root, node)) { 765 | + node = node->parent; 766 | + if (!node) 767 | + break; 768 | + 769 | + node->fulls++; 770 | + } 771 | + 772 | + if (unlikely(!node)) { 773 | + /* All nodes are full */ 774 | + root->min = 1 << (root->height * root->shift); 775 | + root->enter_node = NULL; 776 | + } else { 777 | + root->min = index + i - 1; 778 | + root->min |= (1UL << (node->height - 1)) - 1; 779 | + root->min++; 780 | + root->enter_node = node; 781 | + } 782 | + 783 | + if (num) { 784 | + item += j; 785 | + goto go_on; 786 | + } 787 | + 788 | + return 0; 789 | +} 790 | + 791 | + 792 | +/** 793 | + * sradix_tree_shrink - shrink height of a sradix tree to minimal 794 | + * @root sradix tree root 795 | + * 796 | + */ 797 | +static inline void sradix_tree_shrink(struct sradix_tree_root *root) 798 | +{ 799 | + /* try to shrink tree height */ 800 | + while (root->height > 1) { 801 | + struct sradix_tree_node *to_free = root->rnode; 802 | + 803 | + /* 804 | + * The candidate node has more than one child, or its child 805 | + * is not at the leftmost store, we cannot shrink. 
806 | + */ 807 | + if (to_free->count != 1 || !to_free->stores[0]) 808 | + break; 809 | + 810 | + root->rnode = to_free->stores[0]; 811 | + root->rnode->parent = NULL; 812 | + root->height--; 813 | + if (unlikely(root->enter_node == to_free)) { 814 | + root->enter_node = NULL; 815 | + } 816 | + root->free(to_free); 817 | + } 818 | +} 819 | + 820 | +/* 821 | + * Del the item on the known leaf node and index 822 | + */ 823 | +void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, 824 | + struct sradix_tree_node *node, unsigned long index) 825 | +{ 826 | + unsigned int offset; 827 | + struct sradix_tree_node *start, *end; 828 | + 829 | + BUG_ON(node->height != 1); 830 | + 831 | + start = node; 832 | + while (node && !(--node->count)) 833 | + node = node->parent; 834 | + 835 | + end = node; 836 | + if (!node) { 837 | + root->rnode = NULL; 838 | + root->height = 0; 839 | + root->min = 0; 840 | + root->num = 0; 841 | + root->enter_node = NULL; 842 | + } else { 843 | + offset = (index >> (root->shift * (node->height - 1))) & root->mask; 844 | + if (root->rm) 845 | + root->rm(node, offset); 846 | + node->stores[offset] = NULL; 847 | + root->num--; 848 | + if (root->min > index) { 849 | + root->min = index; 850 | + root->enter_node = node; 851 | + } 852 | + } 853 | + 854 | + if (start != end) { 855 | + do { 856 | + node = start; 857 | + start = start->parent; 858 | + if (unlikely(root->enter_node == node)) 859 | + root->enter_node = end; 860 | + root->free(node); 861 | + } while (start != end); 862 | + 863 | + /* 864 | + * Note that shrink may free "end", so enter_node still need to 865 | + * be checked inside. 866 | + */ 867 | + sradix_tree_shrink(root); 868 | + } else if (node->count == root->stores_size - 1) { 869 | + /* It WAS a full leaf node. Update the ancestors */ 870 | + node = node->parent; 871 | + while (node) { 872 | + node->fulls--; 873 | + if (node->fulls != root->stores_size - 1) 874 | + break; 875 | + 876 | + node = node->parent; 877 | + } 878 | + } 879 | +} 880 | + 881 | +void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index) 882 | +{ 883 | + unsigned int height, offset; 884 | + struct sradix_tree_node *node; 885 | + int shift; 886 | + 887 | + node = root->rnode; 888 | + if (node == NULL || (index >> (root->shift * root->height))) 889 | + return NULL; 890 | + 891 | + height = root->height; 892 | + shift = (height - 1) * root->shift; 893 | + 894 | + do { 895 | + offset = (index >> shift) & root->mask; 896 | + node = node->stores[offset]; 897 | + if (!node) 898 | + return NULL; 899 | + 900 | + shift -= root->shift; 901 | + } while (shift >= 0); 902 | + 903 | + return node; 904 | +} 905 | + 906 | +/* 907 | + * Return the item if it exists, otherwise create it in place 908 | + * and return the created item. 
909 | + */ 910 | +void *sradix_tree_lookup_create(struct sradix_tree_root *root, 911 | + unsigned long index, void *(*item_alloc)(void)) 912 | +{ 913 | + unsigned int height, offset; 914 | + struct sradix_tree_node *node, *tmp; 915 | + void *item; 916 | + int shift, error; 917 | + 918 | + if (root->rnode == NULL || (index >> (root->shift * root->height))) { 919 | + if (item_alloc) { 920 | + error = sradix_tree_extend(root, index); 921 | + if (error) 922 | + return NULL; 923 | + } else { 924 | + return NULL; 925 | + } 926 | + } 927 | + 928 | + node = root->rnode; 929 | + height = root->height; 930 | + shift = (height - 1) * root->shift; 931 | + 932 | + do { 933 | + offset = (index >> shift) & root->mask; 934 | + if (!node->stores[offset]) { 935 | + if (!(tmp = root->alloc())) 936 | + return NULL; 937 | + 938 | + tmp->height = shift / root->shift; 939 | + node->stores[offset] = tmp; 940 | + tmp->parent = node; 941 | + node->count++; 942 | + node = tmp; 943 | + } else { 944 | + node = node->stores[offset]; 945 | + } 946 | + 947 | + shift -= root->shift; 948 | + } while (shift > 0); 949 | + 950 | + BUG_ON(node->height != 1); 951 | + offset = index & root->mask; 952 | + if (node->stores[offset]) { 953 | + return node->stores[offset]; 954 | + } else if (item_alloc) { 955 | + if (!(item = item_alloc())) 956 | + return NULL; 957 | + 958 | + node->stores[offset] = item; 959 | + 960 | + /* 961 | + * NOTE: we do NOT call root->assign here, since this item is 962 | + * newly created by us having no meaning. Caller can call this 963 | + * if it's necessary to do so. 964 | + */ 965 | + 966 | + node->count++; 967 | + root->num++; 968 | + 969 | + while (sradix_node_full(root, node)) { 970 | + node = node->parent; 971 | + if (!node) 972 | + break; 973 | + 974 | + node->fulls++; 975 | + } 976 | + 977 | + if (unlikely(!node)) { 978 | + /* All nodes are full */ 979 | + root->min = 1 << (root->height * root->shift); 980 | + } else { 981 | + if (root->min == index) { 982 | + root->min |= (1UL << (node->height - 1)) - 1; 983 | + root->min++; 984 | + root->enter_node = node; 985 | + } 986 | + } 987 | + 988 | + return item; 989 | + } else { 990 | + return NULL; 991 | + } 992 | + 993 | +} 994 | + 995 | +int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index) 996 | +{ 997 | + unsigned int height, offset; 998 | + struct sradix_tree_node *node; 999 | + int shift; 1000 | + 1001 | + node = root->rnode; 1002 | + if (node == NULL || (index >> (root->shift * root->height))) 1003 | + return -ENOENT; 1004 | + 1005 | + height = root->height; 1006 | + shift = (height - 1) * root->shift; 1007 | + 1008 | + do { 1009 | + offset = (index >> shift) & root->mask; 1010 | + node = node->stores[offset]; 1011 | + if (!node) 1012 | + return -ENOENT; 1013 | + 1014 | + shift -= root->shift; 1015 | + } while (shift > 0); 1016 | + 1017 | + offset = index & root->mask; 1018 | + if (!node->stores[offset]) 1019 | + return -ENOENT; 1020 | + 1021 | + sradix_tree_delete_from_leaf(root, node, index); 1022 | + 1023 | + return 0; 1024 | +} 1025 | diff --git a/mm/Kconfig b/mm/Kconfig 1026 | index 3e2daef..165b60e 100644 1027 | --- a/mm/Kconfig 1028 | +++ b/mm/Kconfig 1029 | @@ -332,6 +332,32 @@ config KSM 1030 | See Documentation/vm/ksm.txt for more information: KSM is inactive 1031 | until a program has madvised that an area is MADV_MERGEABLE, and 1032 | root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). 
1033 | +choice 1034 | + prompt "Choose UKSM/KSM strategy" 1035 | + default UKSM 1036 | + depends on KSM 1037 | + help 1038 | + This option allows to select a UKSM/KSM stragety. 1039 | + 1040 | +config UKSM 1041 | + bool "Ultra-KSM for page merging" 1042 | + depends on KSM 1043 | + help 1044 | + UKSM is inspired by the Linux kernel project \u2014 KSM(Kernel Same 1045 | + page Merging), but with a fundamentally rewritten core algorithm. With 1046 | + an advanced algorithm, UKSM now can transparently scans all anonymously 1047 | + mapped user space applications with an significantly improved scan speed 1048 | + and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit from 1049 | + UKSM. Now UKSM has its first stable release and first real world enterprise user. 1050 | + For more information, please goto its project page. 1051 | + (github.com/dolohow/uksm) 1052 | + 1053 | +config KSM_LEGACY 1054 | + bool "Legacy KSM implementation" 1055 | + depends on KSM 1056 | + help 1057 | + The legacy KSM implementation from Redhat. 1058 | +endchoice 1059 | 1060 | config DEFAULT_MMAP_MIN_ADDR 1061 | int "Low address space to protect from user allocation" 1062 | diff --git a/mm/Makefile b/mm/Makefile 1063 | index 78c6f7d..7e7cd8a 100644 1064 | --- a/mm/Makefile 1065 | +++ b/mm/Makefile 1066 | @@ -63,7 +63,8 @@ obj-$(CONFIG_SPARSEMEM) += sparse.o 1067 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o 1068 | obj-$(CONFIG_SLOB) += slob.o 1069 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o 1070 | -obj-$(CONFIG_KSM) += ksm.o 1071 | +obj-$(CONFIG_KSM_LEGACY) += ksm.o 1072 | +obj-$(CONFIG_UKSM) += uksm.o 1073 | obj-$(CONFIG_PAGE_POISONING) += page_poison.o 1074 | obj-$(CONFIG_SLAB) += slab.o 1075 | obj-$(CONFIG_SLUB) += slub.o 1076 | diff --git a/mm/memory.c b/mm/memory.c 1077 | index 9e04681..02200d3 100644 1078 | --- a/mm/memory.c 1079 | +++ b/mm/memory.c 1080 | @@ -124,6 +124,28 @@ unsigned long highest_memmap_pfn __read_mostly; 1081 | 1082 | EXPORT_SYMBOL(zero_pfn); 1083 | 1084 | +#ifdef CONFIG_UKSM 1085 | +unsigned long uksm_zero_pfn __read_mostly; 1086 | +EXPORT_SYMBOL_GPL(uksm_zero_pfn); 1087 | +struct page *empty_uksm_zero_page; 1088 | + 1089 | +static int __init setup_uksm_zero_page(void) 1090 | +{ 1091 | + unsigned long addr; 1092 | + addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 0); 1093 | + if (!addr) 1094 | + panic("Oh boy, that early out of memory?"); 1095 | + 1096 | + empty_uksm_zero_page = virt_to_page((void *) addr); 1097 | + SetPageReserved(empty_uksm_zero_page); 1098 | + 1099 | + uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page); 1100 | + 1101 | + return 0; 1102 | +} 1103 | +core_initcall(setup_uksm_zero_page); 1104 | +#endif 1105 | + 1106 | /* 1107 | * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() 1108 | */ 1109 | @@ -135,6 +157,7 @@ static int __init init_zero_pfn(void) 1110 | core_initcall(init_zero_pfn); 1111 | 1112 | 1113 | + 1114 | #if defined(SPLIT_RSS_COUNTING) 1115 | 1116 | void sync_mm_rss(struct mm_struct *mm) 1117 | @@ -905,6 +928,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, 1118 | get_page(page); 1119 | page_dup_rmap(page, false); 1120 | rss[mm_counter(page)]++; 1121 | + 1122 | + /* Should return NULL in vm_normal_page() */ 1123 | + uksm_bugon_zeropage(pte); 1124 | + } else { 1125 | + uksm_map_zero_page(pte); 1126 | } 1127 | 1128 | out_set_pte: 1129 | @@ -1138,8 +1166,10 @@ again: 1130 | ptent = ptep_get_and_clear_full(mm, addr, pte, 1131 | tlb->fullmm); 1132 | tlb_remove_tlb_entry(tlb, pte, addr); 1133 | - if 
(unlikely(!page)) 1134 | + if (unlikely(!page)) { 1135 | + uksm_unmap_zero_page(ptent); 1136 | continue; 1137 | + } 1138 | 1139 | if (!PageAnon(page)) { 1140 | if (pte_dirty(ptent)) { 1141 | @@ -1995,8 +2025,10 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo 1142 | clear_page(kaddr); 1143 | kunmap_atomic(kaddr); 1144 | flush_dcache_page(dst); 1145 | - } else 1146 | + } else { 1147 | copy_user_highpage(dst, src, va, vma); 1148 | + uksm_cow_page(vma, src); 1149 | + } 1150 | } 1151 | 1152 | static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) 1153 | @@ -2141,6 +2173,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, 1154 | new_page = alloc_zeroed_user_highpage_movable(vma, address); 1155 | if (!new_page) 1156 | goto oom; 1157 | + uksm_cow_pte(vma, orig_pte); 1158 | } else { 1159 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1160 | if (!new_page) 1161 | @@ -2166,7 +2199,9 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, 1162 | mm_counter_file(old_page)); 1163 | inc_mm_counter_fast(mm, MM_ANONPAGES); 1164 | } 1165 | + uksm_bugon_zeropage(orig_pte); 1166 | } else { 1167 | + uksm_unmap_zero_page(orig_pte); 1168 | inc_mm_counter_fast(mm, MM_ANONPAGES); 1169 | } 1170 | flush_cache_page(vma, address, pte_pfn(orig_pte)); 1171 | diff --git a/mm/mmap.c b/mm/mmap.c 1172 | index de2c176..ce60715 100644 1173 | --- a/mm/mmap.c 1174 | +++ b/mm/mmap.c 1175 | @@ -43,6 +43,7 @@ 1176 | #include 1177 | #include 1178 | #include 1179 | +#include 1180 | 1181 | #include 1182 | #include 1183 | @@ -164,6 +165,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) 1184 | if (vma->vm_file) 1185 | fput(vma->vm_file); 1186 | mpol_put(vma_policy(vma)); 1187 | + uksm_remove_vma(vma); 1188 | kmem_cache_free(vm_area_cachep, vma); 1189 | return next; 1190 | } 1191 | @@ -629,9 +631,16 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, 1192 | long adjust_next = 0; 1193 | int remove_next = 0; 1194 | 1195 | +/* 1196 | + * to avoid deadlock, ksm_remove_vma must be done before any spin_lock is 1197 | + * acquired 1198 | + */ 1199 | + uksm_remove_vma(vma); 1200 | + 1201 | if (next && !insert) { 1202 | struct vm_area_struct *exporter = NULL; 1203 | 1204 | + uksm_remove_vma(next); 1205 | if (end >= next->vm_end) { 1206 | /* 1207 | * vma expands, overlapping all the next, and 1208 | @@ -725,6 +734,7 @@ again: remove_next = 1 + (end > next->vm_end); 1209 | end_changed = true; 1210 | } 1211 | vma->vm_pgoff = pgoff; 1212 | + 1213 | if (adjust_next) { 1214 | next->vm_start += adjust_next << PAGE_SHIFT; 1215 | next->vm_pgoff += adjust_next; 1216 | @@ -795,16 +805,22 @@ again: remove_next = 1 + (end > next->vm_end); 1217 | * up the code too much to do both in one go. 
1218 | */ 1219 | next = vma->vm_next; 1220 | - if (remove_next == 2) 1221 | + if (remove_next == 2) { 1222 | + uksm_remove_vma(next); 1223 | goto again; 1224 | - else if (next) 1225 | + } else if (next) { 1226 | vma_gap_update(next); 1227 | - else 1228 | + } else { 1229 | mm->highest_vm_end = end; 1230 | + } 1231 | + } else { 1232 | + if (next && !insert) 1233 | + uksm_vma_add_new(next); 1234 | } 1235 | if (insert && file) 1236 | uprobe_mmap(insert); 1237 | 1238 | + uksm_vma_add_new(vma); 1239 | validate_mm(mm); 1240 | 1241 | return 0; 1242 | @@ -1196,6 +1212,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr, 1243 | vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | 1244 | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; 1245 | 1246 | + /* If uksm is enabled, we add VM_MERGABLE to new VMAs. */ 1247 | + uksm_vm_flags_mod(&vm_flags); 1248 | + 1249 | if (flags & MAP_LOCKED) 1250 | if (!can_do_mlock()) 1251 | return -EPERM; 1252 | @@ -1534,6 +1553,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, 1253 | allow_write_access(file); 1254 | } 1255 | file = vma->vm_file; 1256 | + uksm_vma_add_new(vma); 1257 | out: 1258 | perf_event_mmap(vma); 1259 | 1260 | @@ -1575,6 +1595,7 @@ allow_write_and_free_vma: 1261 | if (vm_flags & VM_DENYWRITE) 1262 | allow_write_access(file); 1263 | free_vma: 1264 | + uksm_remove_vma(vma); 1265 | kmem_cache_free(vm_area_cachep, vma); 1266 | unacct_error: 1267 | if (charged) 1268 | @@ -2369,6 +2390,8 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, 1269 | else 1270 | err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); 1271 | 1272 | + uksm_vma_add_new(new); 1273 | + 1274 | /* Success. */ 1275 | if (!err) 1276 | return 0; 1277 | @@ -2639,6 +2662,7 @@ static int do_brk(unsigned long addr, unsigned long len) 1278 | return 0; 1279 | 1280 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; 1281 | + uksm_vm_flags_mod(&flags); 1282 | 1283 | error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); 1284 | if (offset_in_page(error)) 1285 | @@ -2696,6 +2720,7 @@ static int do_brk(unsigned long addr, unsigned long len) 1286 | vma->vm_flags = flags; 1287 | vma->vm_page_prot = vm_get_page_prot(flags); 1288 | vma_link(mm, vma, prev, rb_link, rb_parent); 1289 | + uksm_vma_add_new(vma); 1290 | out: 1291 | perf_event_mmap(vma); 1292 | mm->total_vm += len >> PAGE_SHIFT; 1293 | @@ -2734,6 +2759,12 @@ void exit_mmap(struct mm_struct *mm) 1294 | /* mm's last user has gone, and its about to be pulled down */ 1295 | mmu_notifier_release(mm); 1296 | 1297 | + /* 1298 | + * Taking write lock on mmap_sem does not harm others, 1299 | + * but it's crucial for uksm to avoid races. 
1300 | + */ 1301 | + down_write(&mm->mmap_sem); 1302 | + 1303 | if (mm->locked_vm) { 1304 | vma = mm->mmap; 1305 | while (vma) { 1306 | @@ -2769,6 +2800,11 @@ void exit_mmap(struct mm_struct *mm) 1307 | vma = remove_vma(vma); 1308 | } 1309 | vm_unacct_memory(nr_accounted); 1310 | + 1311 | + mm->mmap = NULL; 1312 | + mm->mm_rb = RB_ROOT; 1313 | + vmacache_invalidate(mm); 1314 | + up_write(&mm->mmap_sem); 1315 | } 1316 | 1317 | /* Insert vm structure into process list sorted by address 1318 | @@ -2878,6 +2914,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, 1319 | new_vma->vm_ops->open(new_vma); 1320 | vma_link(mm, new_vma, prev, rb_link, rb_parent); 1321 | *need_rmap_locks = false; 1322 | + uksm_vma_add_new(new_vma); 1323 | } 1324 | return new_vma; 1325 | 1326 | @@ -3015,6 +3052,7 @@ static struct vm_area_struct *__install_special_mapping( 1327 | vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); 1328 | 1329 | perf_event_mmap(vma); 1330 | + uksm_vma_add_new(vma); 1331 | 1332 | return vma; 1333 | 1334 | diff --git a/mm/rmap.c b/mm/rmap.c 1335 | index 701b93f..64ba784 100644 1336 | --- a/mm/rmap.c 1337 | +++ b/mm/rmap.c 1338 | @@ -1110,9 +1110,9 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) 1339 | 1340 | /** 1341 | * __page_set_anon_rmap - set up new anonymous rmap 1342 | - * @page: Page to add to rmap 1343 | + * @page: Page to add to rmap 1344 | * @vma: VM area to add page to. 1345 | - * @address: User virtual address of the mapping 1346 | + * @address: User virtual address of the mapping 1347 | * @exclusive: the page is exclusively owned by the current process 1348 | */ 1349 | static void __page_set_anon_rmap(struct page *page, 1350 | diff --git a/mm/uksm.c b/mm/uksm.c 1351 | new file mode 100644 1352 | index 0000000..64b6dc6 1353 | --- /dev/null 1354 | +++ b/mm/uksm.c 1355 | @@ -0,0 +1,5545 @@ 1356 | +/* 1357 | + * Ultra KSM. Copyright (C) 2011-2012 Nai Xia 1358 | + * 1359 | + * This is an improvement upon KSM. Some basic data structures and routines 1360 | + * are borrowed from ksm.c . 1361 | + * 1362 | + * Its new features: 1363 | + * 1. Full system scan: 1364 | + * It automatically scans all user processes' anonymous VMAs. Kernel-user 1365 | + * interaction to submit a memory area to KSM is no longer needed. 1366 | + * 1367 | + * 2. Rich area detection: 1368 | + * It automatically detects rich areas containing abundant duplicated 1369 | + * pages based. Rich areas are given a full scan speed. Poor areas are 1370 | + * sampled at a reasonable speed with very low CPU consumption. 1371 | + * 1372 | + * 3. Ultra Per-page scan speed improvement: 1373 | + * A new hash algorithm is proposed. As a result, on a machine with 1374 | + * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it 1375 | + * can scan memory areas that does not contain duplicated pages at speed of 1376 | + * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of 1377 | + * 477MB/sec ~ 923MB/sec. 1378 | + * 1379 | + * 4. Thrashing area avoidance: 1380 | + * Thrashing area(an VMA that has frequent Ksm page break-out) can be 1381 | + * filtered out. My benchmark shows it's more efficient than KSM's per-page 1382 | + * hash value based volatile page detection. 1383 | + * 1384 | + * 1385 | + * 5. Misc changes upon KSM: 1386 | + * * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page 1387 | + * comparison. It's much faster than default C version on x86. 
1388 | + * * rmap_item now has an struct *page member to loosely cache a 1389 | + * address-->page mapping, which reduces too much time-costly 1390 | + * follow_page(). 1391 | + * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. 1392 | + * * try_to_merge_two_pages() now can revert a pte if it fails. No break_ 1393 | + * ksm is needed for this case. 1394 | + * 1395 | + * 6. Full Zero Page consideration(contributed by Figo Zhang) 1396 | + * Now uksmd consider full zero pages as special pages and merge them to an 1397 | + * special unswappable uksm zero page. 1398 | + */ 1399 | + 1400 | +#include 1401 | +#include 1402 | +#include 1403 | +#include 1404 | +#include 1405 | +#include 1406 | +#include 1407 | +#include 1408 | +#include 1409 | +#include 1410 | +#include 1411 | +#include 1412 | +#include 1413 | +#include 1414 | +#include 1415 | +#include 1416 | +#include 1417 | +#include 1418 | +#include 1419 | +#include 1420 | +#include 1421 | +#include 1422 | +#include 1423 | +#include 1424 | +#include 1425 | +#include 1426 | +#include 1427 | + 1428 | +#include 1429 | +#include "internal.h" 1430 | + 1431 | +#ifdef CONFIG_X86 1432 | +#undef memcmp 1433 | + 1434 | +#ifdef CONFIG_X86_32 1435 | +#define memcmp memcmpx86_32 1436 | +/* 1437 | + * Compare 4-byte-aligned address s1 and s2, with length n 1438 | + */ 1439 | +int memcmpx86_32(void *s1, void *s2, size_t n) 1440 | +{ 1441 | + size_t num = n / 4; 1442 | + register int res; 1443 | + 1444 | + __asm__ __volatile__ 1445 | + ( 1446 | + "testl %3,%3\n\t" 1447 | + "repe; cmpsd\n\t" 1448 | + "je 1f\n\t" 1449 | + "sbbl %0,%0\n\t" 1450 | + "orl $1,%0\n" 1451 | + "1:" 1452 | + : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) 1453 | + : "0" (0) 1454 | + : "cc"); 1455 | + 1456 | + return res; 1457 | +} 1458 | + 1459 | +/* 1460 | + * Check the page is all zero ? 
1461 | + */ 1462 | +static int is_full_zero(const void *s1, size_t len) 1463 | +{ 1464 | + unsigned char same; 1465 | + 1466 | + len /= 4; 1467 | + 1468 | + __asm__ __volatile__ 1469 | + ("repe; scasl;" 1470 | + "sete %0" 1471 | + : "=qm" (same), "+D" (s1), "+c" (len) 1472 | + : "a" (0) 1473 | + : "cc"); 1474 | + 1475 | + return same; 1476 | +} 1477 | + 1478 | + 1479 | +#elif defined(CONFIG_X86_64) 1480 | +#define memcmp memcmpx86_64 1481 | +/* 1482 | + * Compare 8-byte-aligned address s1 and s2, with length n 1483 | + */ 1484 | +int memcmpx86_64(void *s1, void *s2, size_t n) 1485 | +{ 1486 | + size_t num = n / 8; 1487 | + register int res; 1488 | + 1489 | + __asm__ __volatile__ 1490 | + ( 1491 | + "testq %q3,%q3\n\t" 1492 | + "repe; cmpsq\n\t" 1493 | + "je 1f\n\t" 1494 | + "sbbq %q0,%q0\n\t" 1495 | + "orq $1,%q0\n" 1496 | + "1:" 1497 | + : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) 1498 | + : "0" (0) 1499 | + : "cc"); 1500 | + 1501 | + return res; 1502 | +} 1503 | + 1504 | +static int is_full_zero(const void *s1, size_t len) 1505 | +{ 1506 | + unsigned char same; 1507 | + 1508 | + len /= 8; 1509 | + 1510 | + __asm__ __volatile__ 1511 | + ("repe; scasq;" 1512 | + "sete %0" 1513 | + : "=qm" (same), "+D" (s1), "+c" (len) 1514 | + : "a" (0) 1515 | + : "cc"); 1516 | + 1517 | + return same; 1518 | +} 1519 | + 1520 | +#endif 1521 | +#else 1522 | +static int is_full_zero(const void *s1, size_t len) 1523 | +{ 1524 | + unsigned long *src = s1; 1525 | + int i; 1526 | + 1527 | + len /= sizeof(*src); 1528 | + 1529 | + for (i = 0; i < len; i++) { 1530 | + if (src[i]) 1531 | + return 0; 1532 | + } 1533 | + 1534 | + return 1; 1535 | +} 1536 | +#endif 1537 | + 1538 | +#define UKSM_RUNG_ROUND_FINISHED (1 << 0) 1539 | +#define TIME_RATIO_SCALE 10000 1540 | + 1541 | +#define SLOT_TREE_NODE_SHIFT 8 1542 | +#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT) 1543 | +struct slot_tree_node { 1544 | + unsigned long size; 1545 | + struct sradix_tree_node snode; 1546 | + void *stores[SLOT_TREE_NODE_STORE_SIZE]; 1547 | +}; 1548 | + 1549 | +static struct kmem_cache *slot_tree_node_cachep; 1550 | + 1551 | +static struct sradix_tree_node *slot_tree_node_alloc(void) 1552 | +{ 1553 | + struct slot_tree_node *p; 1554 | + p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL); 1555 | + if (!p) 1556 | + return NULL; 1557 | + 1558 | + return &p->snode; 1559 | +} 1560 | + 1561 | +static void slot_tree_node_free(struct sradix_tree_node *node) 1562 | +{ 1563 | + struct slot_tree_node *p; 1564 | + 1565 | + p = container_of(node, struct slot_tree_node, snode); 1566 | + kmem_cache_free(slot_tree_node_cachep, p); 1567 | +} 1568 | + 1569 | +static void slot_tree_node_extend(struct sradix_tree_node *parent, 1570 | + struct sradix_tree_node *child) 1571 | +{ 1572 | + struct slot_tree_node *p, *c; 1573 | + 1574 | + p = container_of(parent, struct slot_tree_node, snode); 1575 | + c = container_of(child, struct slot_tree_node, snode); 1576 | + 1577 | + p->size += c->size; 1578 | +} 1579 | + 1580 | +void slot_tree_node_assign(struct sradix_tree_node *node, 1581 | + unsigned index, void *item) 1582 | +{ 1583 | + struct vma_slot *slot = item; 1584 | + struct slot_tree_node *cur; 1585 | + 1586 | + slot->snode = node; 1587 | + slot->sindex = index; 1588 | + 1589 | + while (node) { 1590 | + cur = container_of(node, struct slot_tree_node, snode); 1591 | + cur->size += slot->pages; 1592 | + node = node->parent; 1593 | + } 1594 | +} 1595 | + 1596 | +void slot_tree_node_rm(struct sradix_tree_node *node, unsigned offset) 1597 | 
+{ 1598 | + struct vma_slot *slot; 1599 | + struct slot_tree_node *cur; 1600 | + unsigned long pages; 1601 | + 1602 | + if (node->height == 1) { 1603 | + slot = node->stores[offset]; 1604 | + pages = slot->pages; 1605 | + } else { 1606 | + cur = container_of(node->stores[offset], 1607 | + struct slot_tree_node, snode); 1608 | + pages = cur->size; 1609 | + } 1610 | + 1611 | + while (node) { 1612 | + cur = container_of(node, struct slot_tree_node, snode); 1613 | + cur->size -= pages; 1614 | + node = node->parent; 1615 | + } 1616 | +} 1617 | + 1618 | +unsigned long slot_iter_index; 1619 | +int slot_iter(void *item, unsigned long height) 1620 | +{ 1621 | + struct slot_tree_node *node; 1622 | + struct vma_slot *slot; 1623 | + 1624 | + if (height == 1) { 1625 | + slot = item; 1626 | + if (slot_iter_index < slot->pages) { 1627 | + /*in this one*/ 1628 | + return 1; 1629 | + } else { 1630 | + slot_iter_index -= slot->pages; 1631 | + return 0; 1632 | + } 1633 | + 1634 | + } else { 1635 | + node = container_of(item, struct slot_tree_node, snode); 1636 | + if (slot_iter_index < node->size) { 1637 | + /*in this one*/ 1638 | + return 1; 1639 | + } else { 1640 | + slot_iter_index -= node->size; 1641 | + return 0; 1642 | + } 1643 | + } 1644 | +} 1645 | + 1646 | + 1647 | +static inline void slot_tree_init_root(struct sradix_tree_root *root) 1648 | +{ 1649 | + init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT); 1650 | + root->alloc = slot_tree_node_alloc; 1651 | + root->free = slot_tree_node_free; 1652 | + root->extend = slot_tree_node_extend; 1653 | + root->assign = slot_tree_node_assign; 1654 | + root->rm = slot_tree_node_rm; 1655 | +} 1656 | + 1657 | +void slot_tree_init(void) 1658 | +{ 1659 | + slot_tree_node_cachep = kmem_cache_create("slot_tree_node", 1660 | + sizeof(struct slot_tree_node), 0, 1661 | + SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, 1662 | + NULL); 1663 | +} 1664 | + 1665 | + 1666 | +/* Each rung of this ladder is a list of VMAs having a same scan ratio */ 1667 | +struct scan_rung { 1668 | + //struct list_head scanned_list; 1669 | + struct sradix_tree_root vma_root; 1670 | + struct sradix_tree_root vma_root2; 1671 | + 1672 | + struct vma_slot *current_scan; 1673 | + unsigned long current_offset; 1674 | + 1675 | + /* 1676 | + * The initial value for current_offset, it should loop over 1677 | + * [0~ step - 1] to let all slot have its chance to be scanned. 1678 | + */ 1679 | + unsigned long offset_init; 1680 | + unsigned long step; /* dynamic step for current_offset */ 1681 | + unsigned int flags; 1682 | + unsigned long pages_to_scan; 1683 | + //unsigned long fully_scanned_slots; 1684 | + /* 1685 | + * a little bit tricky - if cpu_time_ratio > 0, then the value is the 1686 | + * the cpu time ratio it can spend in rung_i for every scan 1687 | + * period. if < 0, then it is the cpu time ratio relative to the 1688 | + * max cpu percentage user specified. Both in unit of 1689 | + * 1/TIME_RATIO_SCALE 1690 | + */ 1691 | + int cpu_ratio; 1692 | + 1693 | + /* 1694 | + * How long it will take for all slots in this rung to be fully 1695 | + * scanned? If it's zero, we don't care about the cover time: 1696 | + * it's fully scanned. 
1697 | + */ 1698 | + unsigned int cover_msecs; 1699 | + //unsigned long vma_num; 1700 | + //unsigned long pages; /* Sum of all slot's pages in rung */ 1701 | +}; 1702 | + 1703 | +/** 1704 | + * node of either the stable or unstale rbtree 1705 | + * 1706 | + */ 1707 | +struct tree_node { 1708 | + struct rb_node node; /* link in the main (un)stable rbtree */ 1709 | + struct rb_root sub_root; /* rb_root for sublevel collision rbtree */ 1710 | + u32 hash; 1711 | + unsigned long count; /* TODO: merged with sub_root */ 1712 | + struct list_head all_list; /* all tree nodes in stable/unstable tree */ 1713 | +}; 1714 | + 1715 | +/** 1716 | + * struct stable_node - node of the stable rbtree 1717 | + * @node: rb node of this ksm page in the stable tree 1718 | + * @hlist: hlist head of rmap_items using this ksm page 1719 | + * @kpfn: page frame number of this ksm page 1720 | + */ 1721 | +struct stable_node { 1722 | + struct rb_node node; /* link in sub-rbtree */ 1723 | + struct tree_node *tree_node; /* it's tree node root in stable tree, NULL if it's in hell list */ 1724 | + struct hlist_head hlist; 1725 | + unsigned long kpfn; 1726 | + u32 hash_max; /* if ==0 then it's not been calculated yet */ 1727 | + struct list_head all_list; /* in a list for all stable nodes */ 1728 | +}; 1729 | + 1730 | +/** 1731 | + * struct node_vma - group rmap_items linked in a same stable 1732 | + * node together. 1733 | + */ 1734 | +struct node_vma { 1735 | + union { 1736 | + struct vma_slot *slot; 1737 | + unsigned long key; /* slot is used as key sorted on hlist */ 1738 | + }; 1739 | + struct hlist_node hlist; 1740 | + struct hlist_head rmap_hlist; 1741 | + struct stable_node *head; 1742 | +}; 1743 | + 1744 | +/** 1745 | + * struct rmap_item - reverse mapping item for virtual addresses 1746 | + * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list 1747 | + * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree 1748 | + * @mm: the memory structure this rmap_item is pointing into 1749 | + * @address: the virtual address this rmap_item tracks (+ flags in low bits) 1750 | + * @node: rb node of this rmap_item in the unstable tree 1751 | + * @head: pointer to stable_node heading this list in the stable tree 1752 | + * @hlist: link into hlist of rmap_items hanging off that stable_node 1753 | + */ 1754 | +struct rmap_item { 1755 | + struct vma_slot *slot; 1756 | + struct page *page; 1757 | + unsigned long address; /* + low bits used for flags below */ 1758 | + unsigned long hash_round; 1759 | + unsigned long entry_index; 1760 | + union { 1761 | + struct {/* when in unstable tree */ 1762 | + struct rb_node node; 1763 | + struct tree_node *tree_node; 1764 | + u32 hash_max; 1765 | + }; 1766 | + struct { /* when in stable tree */ 1767 | + struct node_vma *head; 1768 | + struct hlist_node hlist; 1769 | + struct anon_vma *anon_vma; 1770 | + }; 1771 | + }; 1772 | +} __attribute__((aligned(4))); 1773 | + 1774 | +struct rmap_list_entry { 1775 | + union { 1776 | + struct rmap_item *item; 1777 | + unsigned long addr; 1778 | + }; 1779 | + /* lowest bit is used for is_addr tag */ 1780 | +} __attribute__((aligned(4))); /* 4 aligned to fit in to pages*/ 1781 | + 1782 | + 1783 | +/* Basic data structure definition ends */ 1784 | + 1785 | + 1786 | +/* 1787 | + * Flags for rmap_item to judge if it's listed in the stable/unstable tree. 
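The slot tree built a little earlier (slot_tree_node, slot_iter) keeps a running page count in every node, and slot_iter() walks down the tree consuming slot_iter_index until it reaches the vma_slot that covers that index, so larger slots are proportionally more likely to be picked. A minimal userspace sketch of the same weighted descent over a flat two-level layout (the struct and helper names here are illustrative only, not the patch's sradix tree API):

#include <stdio.h>

/* hypothetical two-level index: groups of slots, each group caching the
 * sum of its slots' page counts, in the spirit of slot_tree_node->size */
struct group { unsigned long size; unsigned long pages[4]; };

/* descend like slot_iter(): skip whole groups while the index is past
 * their cumulative size, then skip slots inside the chosen group */
static int pick_slot(struct group *g, int ngroups, unsigned long idx,
                     int *slot_out)
{
    int i, j;

    for (i = 0; i < ngroups; i++) {
        if (idx < g[i].size)
            break;              /* the target page lives in this group */
        idx -= g[i].size;       /* otherwise consume the whole group   */
    }
    if (i == ngroups)
        return -1;              /* index past the total page count     */

    for (j = 0; j < 4; j++) {
        if (idx < g[i].pages[j])
            break;
        idx -= g[i].pages[j];
    }
    *slot_out = j;
    return i;
}

int main(void)
{
    struct group g[2] = {
        { .size = 10, .pages = { 1, 2, 3, 4 } },
        { .size = 26, .pages = { 20, 2, 2, 2 } },
    };
    int slot, grp = pick_slot(g, 2, 13, &slot);

    /* index 13 falls 3 pages into group 1's first (20-page) slot */
    printf("group %d, slot %d\n", grp, slot);
    return 0;
}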
1788 | + * The flags use the low bits of rmap_item.address 1789 | + */ 1790 | +#define UNSTABLE_FLAG 0x1 1791 | +#define STABLE_FLAG 0x2 1792 | +#define get_rmap_addr(x) ((x)->address & PAGE_MASK) 1793 | + 1794 | +/* 1795 | + * rmap_list_entry helpers 1796 | + */ 1797 | +#define IS_ADDR_FLAG 1 1798 | +#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG) 1799 | +#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG) 1800 | +#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG)) 1801 | + 1802 | + 1803 | +/* 1804 | + * High speed caches for frequently allocated and freed structs 1805 | + */ 1806 | +static struct kmem_cache *rmap_item_cache; 1807 | +static struct kmem_cache *stable_node_cache; 1808 | +static struct kmem_cache *node_vma_cache; 1809 | +static struct kmem_cache *vma_slot_cache; 1810 | +static struct kmem_cache *tree_node_cache; 1811 | +#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\ 1812 | + sizeof(struct __struct), __alignof__(struct __struct),\ 1813 | + (__flags), NULL) 1814 | + 1815 | +/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */ 1816 | +#define SCAN_LADDER_SIZE 4 1817 | +static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE]; 1818 | + 1819 | +/* The evaluation rounds uksmd has finished */ 1820 | +static unsigned long long uksm_eval_round = 1; 1821 | + 1822 | +/* 1823 | + * we add 1 to this var when we consider we should rebuild the whole 1824 | + * unstable tree. 1825 | + */ 1826 | +static unsigned long uksm_hash_round = 1; 1827 | + 1828 | +/* 1829 | + * How many times the whole memory is scanned. 1830 | + */ 1831 | +static unsigned long long fully_scanned_round = 1; 1832 | + 1833 | +/* The total number of virtual pages of all vma slots */ 1834 | +static u64 uksm_pages_total; 1835 | + 1836 | +/* The number of pages has been scanned since the start up */ 1837 | +static u64 uksm_pages_scanned; 1838 | + 1839 | +static u64 scanned_virtual_pages; 1840 | + 1841 | +/* The number of pages has been scanned since last encode_benefit call */ 1842 | +static u64 uksm_pages_scanned_last; 1843 | + 1844 | +/* If the scanned number is tooo large, we encode it here */ 1845 | +static u64 pages_scanned_stored; 1846 | + 1847 | +static unsigned long pages_scanned_base; 1848 | + 1849 | +/* The number of nodes in the stable tree */ 1850 | +static unsigned long uksm_pages_shared; 1851 | + 1852 | +/* The number of page slots additionally sharing those nodes */ 1853 | +static unsigned long uksm_pages_sharing; 1854 | + 1855 | +/* The number of nodes in the unstable tree */ 1856 | +static unsigned long uksm_pages_unshared; 1857 | + 1858 | +/* 1859 | + * Milliseconds ksmd should sleep between scans, 1860 | + * >= 100ms to be consistent with 1861 | + * scan_time_to_sleep_msec() 1862 | + */ 1863 | +static unsigned int uksm_sleep_jiffies; 1864 | + 1865 | +/* The real value for the uksmd next sleep */ 1866 | +static unsigned int uksm_sleep_real; 1867 | + 1868 | +/* Saved value for user input uksm_sleep_jiffies when it's enlarged */ 1869 | +static unsigned int uksm_sleep_saved; 1870 | + 1871 | +/* Max percentage of cpu utilization ksmd can take to scan in one batch */ 1872 | +static unsigned int uksm_max_cpu_percentage; 1873 | + 1874 | +static int uksm_cpu_governor; 1875 | + 1876 | +static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" }; 1877 | + 1878 | +struct uksm_cpu_preset_s { 1879 | + int cpu_ratio[SCAN_LADDER_SIZE]; 1880 | + unsigned int cover_msecs[SCAN_LADDER_SIZE]; 1881 | + unsigned int 
max_cpu; /* percentage */ 1882 | +}; 1883 | + 1884 | +struct uksm_cpu_preset_s uksm_cpu_preset[4] = { 1885 | + { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95}, 1886 | + { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50}, 1887 | + { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20}, 1888 | + { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1}, 1889 | +}; 1890 | + 1891 | +/* The default value for uksm_ema_page_time if it's not initialized */ 1892 | +#define UKSM_PAGE_TIME_DEFAULT 500 1893 | + 1894 | +/*cost to scan one page by expotional moving average in nsecs */ 1895 | +static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; 1896 | + 1897 | +/* The expotional moving average alpha weight, in percentage. */ 1898 | +#define EMA_ALPHA 20 1899 | + 1900 | +/* 1901 | + * The threshold used to filter out thrashing areas, 1902 | + * If it == 0, filtering is disabled, otherwise it's the percentage up-bound 1903 | + * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio 1904 | + * will be considered as having a zero duplication ratio. 1905 | + */ 1906 | +static unsigned int uksm_thrash_threshold = 50; 1907 | + 1908 | +/* How much dedup ratio is considered to be abundant*/ 1909 | +static unsigned int uksm_abundant_threshold = 10; 1910 | + 1911 | +/* All slots having merged pages in this eval round. */ 1912 | +struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup); 1913 | + 1914 | +/* How many times the ksmd has slept since startup */ 1915 | +static unsigned long long uksm_sleep_times; 1916 | + 1917 | +#define UKSM_RUN_STOP 0 1918 | +#define UKSM_RUN_MERGE 1 1919 | +static unsigned int uksm_run = 1; 1920 | + 1921 | +static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait); 1922 | +static DEFINE_MUTEX(uksm_thread_mutex); 1923 | + 1924 | +/* 1925 | + * List vma_slot_new is for newly created vma_slot waiting to be added by 1926 | + * ksmd. If one cannot be added(e.g. due to it's too small), it's moved to 1927 | + * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding 1928 | + * VMA has been removed/freed. 1929 | + */ 1930 | +struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new); 1931 | +struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd); 1932 | +struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del); 1933 | +static DEFINE_SPINLOCK(vma_slot_list_lock); 1934 | + 1935 | +/* The unstable tree heads */ 1936 | +static struct rb_root root_unstable_tree = RB_ROOT; 1937 | + 1938 | +/* 1939 | + * All tree_nodes are in a list to be freed at once when unstable tree is 1940 | + * freed after each scan round. 1941 | + */ 1942 | +static struct list_head unstable_tree_node_list = 1943 | + LIST_HEAD_INIT(unstable_tree_node_list); 1944 | + 1945 | +/* List contains all stable nodes */ 1946 | +static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list); 1947 | + 1948 | +/* 1949 | + * When the hash strength is changed, the stable tree must be delta_hashed and 1950 | + * re-structured. We use two set of below structs to speed up the 1951 | + * re-structuring of stable tree. 
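uksm_ema_page_time above tracks the cost of scanning one page as an exponential moving average with EMA_ALPHA = 20 percent; the update expression itself is not shown in this part of the patch, so the sketch below only illustrates the standard EMA form those constants imply, using a hypothetical helper name:

#include <stdio.h>

#define EMA_ALPHA 20                 /* weight of the newest sample, in % */
#define UKSM_PAGE_TIME_DEFAULT 500   /* ns, used until real samples exist */

static unsigned long ema_page_time = UKSM_PAGE_TIME_DEFAULT;

/* hypothetical helper: blend a fresh per-page cost sample (in ns) into
 * the running average: new = alpha*sample + (100-alpha)*old, in percent */
static unsigned long ema_update(unsigned long sample_ns)
{
    ema_page_time = (EMA_ALPHA * sample_ns +
                     (100 - EMA_ALPHA) * ema_page_time) / 100;
    return ema_page_time;
}

int main(void)
{
    /* a burst of slower scans pulls the estimate up only gradually */
    printf("%lu\n", ema_update(1000));   /* 600 */
    printf("%lu\n", ema_update(1000));   /* 680 */
    printf("%lu\n", ema_update(200));    /* 584 */
    return 0;
}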
1952 | + */ 1953 | +static struct list_head 1954 | +stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]), 1955 | + LIST_HEAD_INIT(stable_tree_node_list[1])}; 1956 | + 1957 | +static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0]; 1958 | +static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT}; 1959 | +static struct rb_root *root_stable_treep = &root_stable_tree[0]; 1960 | +static unsigned long stable_tree_index; 1961 | + 1962 | +/* The hash strength needed to hash a full page */ 1963 | +#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32)) 1964 | + 1965 | +/* The hash strength needed for loop-back hashing */ 1966 | +#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10) 1967 | + 1968 | +/* The random offsets in a page */ 1969 | +static u32 *random_nums; 1970 | + 1971 | +/* The hash strength */ 1972 | +static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4; 1973 | + 1974 | +/* The delta value each time the hash strength increases or decreases */ 1975 | +static unsigned long hash_strength_delta; 1976 | +#define HASH_STRENGTH_DELTA_MAX 5 1977 | + 1978 | +/* The time we have saved due to random_sample_hash */ 1979 | +static u64 rshash_pos; 1980 | + 1981 | +/* The time we have wasted due to hash collision */ 1982 | +static u64 rshash_neg; 1983 | + 1984 | +struct uksm_benefit { 1985 | + u64 pos; 1986 | + u64 neg; 1987 | + u64 scanned; 1988 | + unsigned long base; 1989 | +} benefit; 1990 | + 1991 | +/* 1992 | + * The relative cost of memcmp, compared to 1 time unit of random sample 1993 | + * hash, this value is tested when ksm module is initialized 1994 | + */ 1995 | +static unsigned long memcmp_cost; 1996 | + 1997 | +static unsigned long rshash_neg_cont_zero; 1998 | +static unsigned long rshash_cont_obscure; 1999 | + 2000 | +/* The possible states of hash strength adjustment heuristic */ 2001 | +enum rshash_states { 2002 | + RSHASH_STILL, 2003 | + RSHASH_TRYUP, 2004 | + RSHASH_TRYDOWN, 2005 | + RSHASH_NEW, 2006 | + RSHASH_PRE_STILL, 2007 | +}; 2008 | + 2009 | +/* The possible direction we are about to adjust hash strength */ 2010 | +enum rshash_direct { 2011 | + GO_UP, 2012 | + GO_DOWN, 2013 | + OBSCURE, 2014 | + STILL, 2015 | +}; 2016 | + 2017 | +/* random sampling hash state machine */ 2018 | +static struct { 2019 | + enum rshash_states state; 2020 | + enum rshash_direct pre_direct; 2021 | + u8 below_count; 2022 | + /* Keep a lookup window of size 5, iff above_count/below_count > 3 2023 | + * in this window we stop trying. 
2024 | + */ 2025 | + u8 lookup_window_index; 2026 | + u64 stable_benefit; 2027 | + unsigned long turn_point_down; 2028 | + unsigned long turn_benefit_down; 2029 | + unsigned long turn_point_up; 2030 | + unsigned long turn_benefit_up; 2031 | + unsigned long stable_point; 2032 | +} rshash_state; 2033 | + 2034 | +/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/ 2035 | +static u32 *zero_hash_table; 2036 | + 2037 | +static inline struct node_vma *alloc_node_vma(void) 2038 | +{ 2039 | + struct node_vma *node_vma; 2040 | + node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL); 2041 | + if (node_vma) { 2042 | + INIT_HLIST_HEAD(&node_vma->rmap_hlist); 2043 | + INIT_HLIST_NODE(&node_vma->hlist); 2044 | + } 2045 | + return node_vma; 2046 | +} 2047 | + 2048 | +static inline void free_node_vma(struct node_vma *node_vma) 2049 | +{ 2050 | + kmem_cache_free(node_vma_cache, node_vma); 2051 | +} 2052 | + 2053 | + 2054 | +static inline struct vma_slot *alloc_vma_slot(void) 2055 | +{ 2056 | + struct vma_slot *slot; 2057 | + 2058 | + /* 2059 | + * In case ksm is not initialized by now. 2060 | + * Oops, we need to consider the call site of uksm_init() in the future. 2061 | + */ 2062 | + if (!vma_slot_cache) 2063 | + return NULL; 2064 | + 2065 | + slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL); 2066 | + if (slot) { 2067 | + INIT_LIST_HEAD(&slot->slot_list); 2068 | + INIT_LIST_HEAD(&slot->dedup_list); 2069 | + slot->flags |= UKSM_SLOT_NEED_RERAND; 2070 | + } 2071 | + return slot; 2072 | +} 2073 | + 2074 | +static inline void free_vma_slot(struct vma_slot *vma_slot) 2075 | +{ 2076 | + kmem_cache_free(vma_slot_cache, vma_slot); 2077 | +} 2078 | + 2079 | + 2080 | + 2081 | +static inline struct rmap_item *alloc_rmap_item(void) 2082 | +{ 2083 | + struct rmap_item *rmap_item; 2084 | + 2085 | + rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL); 2086 | + if (rmap_item) { 2087 | + /* bug on lowest bit is not clear for flag use */ 2088 | + BUG_ON(is_addr(rmap_item)); 2089 | + } 2090 | + return rmap_item; 2091 | +} 2092 | + 2093 | +static inline void free_rmap_item(struct rmap_item *rmap_item) 2094 | +{ 2095 | + rmap_item->slot = NULL; /* debug safety */ 2096 | + kmem_cache_free(rmap_item_cache, rmap_item); 2097 | +} 2098 | + 2099 | +static inline struct stable_node *alloc_stable_node(void) 2100 | +{ 2101 | + struct stable_node *node; 2102 | + node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC); 2103 | + if (!node) 2104 | + return NULL; 2105 | + 2106 | + INIT_HLIST_HEAD(&node->hlist); 2107 | + list_add(&node->all_list, &stable_node_list); 2108 | + return node; 2109 | +} 2110 | + 2111 | +static inline void free_stable_node(struct stable_node *stable_node) 2112 | +{ 2113 | + list_del(&stable_node->all_list); 2114 | + kmem_cache_free(stable_node_cache, stable_node); 2115 | +} 2116 | + 2117 | +static inline struct tree_node *alloc_tree_node(struct list_head *list) 2118 | +{ 2119 | + struct tree_node *node; 2120 | + node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC); 2121 | + if (!node) 2122 | + return NULL; 2123 | + 2124 | + list_add(&node->all_list, list); 2125 | + return node; 2126 | +} 2127 | + 2128 | +static inline void free_tree_node(struct tree_node *node) 2129 | +{ 2130 | + list_del(&node->all_list); 2131 | + kmem_cache_free(tree_node_cache, node); 2132 | +} 2133 | + 2134 | +static void uksm_drop_anon_vma(struct rmap_item *rmap_item) 2135 | +{ 2136 | + struct anon_vma *anon_vma = rmap_item->anon_vma; 2137 | + 2138 | + put_anon_vma(anon_vma); 2139 | +} 2140 
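Several of the structures above squeeze state into low pointer bits: rmap_item.address keeps its STABLE/UNSTABLE flags below PAGE_MASK, and rmap_list_entry uses IS_ADDR_FLAG in bit 0 to record whether the union currently holds a raw address or an rmap_item pointer, which is why alloc_rmap_item() can BUG_ON(is_addr(...)) on a freshly allocated object. A small standalone sketch of that tagging trick (not the kernel macros themselves):

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#define IS_ADDR_FLAG 1UL

/* allocated structs are at least word aligned, so bit 0 is free for a tag */
struct item { int dummy; } __attribute__((aligned(4)));

static inline int is_addr(unsigned long v)             { return v & IS_ADDR_FLAG; }
static inline unsigned long tag_addr(unsigned long a)  { return a | IS_ADDR_FLAG; }
static inline unsigned long clean_addr(unsigned long v){ return v & ~IS_ADDR_FLAG; }

int main(void)
{
    static struct item it;
    unsigned long entry;

    entry = (unsigned long)&it;          /* stores a pointer: bit 0 clear */
    assert(!is_addr(entry));

    entry = tag_addr(0x7f0000401000UL);  /* stores a bare address: tagged */
    assert(is_addr(entry));
    printf("addr = %#lx\n", clean_addr(entry));
    return 0;
}

The trick only works because the tagged objects are at least 4-byte aligned, which is exactly what the aligned(4) attributes on rmap_item and rmap_list_entry guarantee.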
| + 2141 | + 2142 | +/** 2143 | + * Remove a stable node from stable_tree, may unlink from its tree_node and 2144 | + * may remove its parent tree_node if no other stable node is pending. 2145 | + * 2146 | + * @stable_node The node need to be removed 2147 | + * @unlink_rb Will this node be unlinked from the rbtree? 2148 | + * @remove_tree_ node Will its tree_node be removed if empty? 2149 | + */ 2150 | +static void remove_node_from_stable_tree(struct stable_node *stable_node, 2151 | + int unlink_rb, int remove_tree_node) 2152 | +{ 2153 | + struct node_vma *node_vma; 2154 | + struct rmap_item *rmap_item; 2155 | + struct hlist_node *n; 2156 | + 2157 | + if (!hlist_empty(&stable_node->hlist)) { 2158 | + hlist_for_each_entry_safe(node_vma, n, 2159 | + &stable_node->hlist, hlist) { 2160 | + hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { 2161 | + uksm_pages_sharing--; 2162 | + 2163 | + uksm_drop_anon_vma(rmap_item); 2164 | + rmap_item->address &= PAGE_MASK; 2165 | + } 2166 | + free_node_vma(node_vma); 2167 | + cond_resched(); 2168 | + } 2169 | + 2170 | + /* the last one is counted as shared */ 2171 | + uksm_pages_shared--; 2172 | + uksm_pages_sharing++; 2173 | + } 2174 | + 2175 | + if (stable_node->tree_node && unlink_rb) { 2176 | + rb_erase(&stable_node->node, 2177 | + &stable_node->tree_node->sub_root); 2178 | + 2179 | + if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) && 2180 | + remove_tree_node) { 2181 | + rb_erase(&stable_node->tree_node->node, 2182 | + root_stable_treep); 2183 | + free_tree_node(stable_node->tree_node); 2184 | + } else { 2185 | + stable_node->tree_node->count--; 2186 | + } 2187 | + } 2188 | + 2189 | + free_stable_node(stable_node); 2190 | +} 2191 | + 2192 | + 2193 | +/* 2194 | + * get_uksm_page: checks if the page indicated by the stable node 2195 | + * is still its ksm page, despite having held no reference to it. 2196 | + * In which case we can trust the content of the page, and it 2197 | + * returns the gotten page; but if the page has now been zapped, 2198 | + * remove the stale node from the stable tree and return NULL. 2199 | + * 2200 | + * You would expect the stable_node to hold a reference to the ksm page. 2201 | + * But if it increments the page's count, swapping out has to wait for 2202 | + * ksmd to come around again before it can free the page, which may take 2203 | + * seconds or even minutes: much too unresponsive. So instead we use a 2204 | + * "keyhole reference": access to the ksm page from the stable node peeps 2205 | + * out through its keyhole to see if that page still holds the right key, 2206 | + * pointing back to this stable node. This relies on freeing a PageAnon 2207 | + * page to reset its page->mapping to NULL, and relies on no other use of 2208 | + * a page to put something that might look like our key in page->mapping. 2209 | + * 2210 | + * include/linux/pagemap.h page_cache_get_speculative() is a good reference, 2211 | + * but this is different - made simpler by uksm_thread_mutex being held, but 2212 | + * interesting for assuming that no other use of the struct page could ever 2213 | + * put our expected_mapping into page->mapping (or a field of the union which 2214 | + * coincides with page->mapping). The RCU calls are not for KSM at all, but 2215 | + * to keep the page_count protocol described with page_cache_get_speculative. 
2216 | + * 2217 | + * Note: it is possible that get_uksm_page() will return NULL one moment, 2218 | + * then page the next, if the page is in between page_freeze_refs() and 2219 | + * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page 2220 | + * is on its way to being freed; but it is an anomaly to bear in mind. 2221 | + * 2222 | + * @unlink_rb: if the removal of this node will firstly unlink from 2223 | + * its rbtree. stable_node_reinsert will prevent this when restructuring the 2224 | + * node from its old tree. 2225 | + * 2226 | + * @remove_tree_node: if this is the last one of its tree_node, will the 2227 | + * tree_node be freed ? If we are inserting stable node, this tree_node may 2228 | + * be reused, so don't free it. 2229 | + */ 2230 | +static struct page *get_uksm_page(struct stable_node *stable_node, 2231 | + int unlink_rb, int remove_tree_node) 2232 | +{ 2233 | + struct page *page; 2234 | + void *expected_mapping; 2235 | + unsigned long kpfn; 2236 | + 2237 | +again: 2238 | + kpfn = stable_node->kpfn; 2239 | + page = pfn_to_page(kpfn); 2240 | + expected_mapping = (void *)stable_node + 2241 | + (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); 2242 | + 2243 | + if (page->mapping != expected_mapping) 2244 | + goto stale; 2245 | + if (!get_page_unless_zero(page)) 2246 | + goto stale; 2247 | + if (page->mapping != expected_mapping) { 2248 | + put_page(page); 2249 | + goto stale; 2250 | + } 2251 | + 2252 | + lock_page(page); 2253 | + if (page->mapping != expected_mapping) { 2254 | + unlock_page(page); 2255 | + put_page(page); 2256 | + goto stale; 2257 | + } 2258 | + unlock_page(page); 2259 | + return page; 2260 | +stale: 2261 | + /* 2262 | + * We come here from above when page->mapping or !PageSwapCache 2263 | + * suggests that the node is stale; but it might be under migration. 2264 | + * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(), 2265 | + * before checking whether node->kpfn has been changed. 2266 | + */ 2267 | + smp_rmb(); 2268 | + if (stable_node->kpfn != kpfn) 2269 | + goto again; 2270 | + 2271 | + remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node); 2272 | + 2273 | + return NULL; 2274 | +} 2275 | + 2276 | +/* 2277 | + * Removing rmap_item from stable or unstable tree. 2278 | + * This function will clean the information from the stable/unstable tree. 2279 | + */ 2280 | +static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item) 2281 | +{ 2282 | + if (rmap_item->address & STABLE_FLAG) { 2283 | + struct stable_node *stable_node; 2284 | + struct node_vma *node_vma; 2285 | + struct page *page; 2286 | + 2287 | + node_vma = rmap_item->head; 2288 | + stable_node = node_vma->head; 2289 | + page = get_uksm_page(stable_node, 1, 1); 2290 | + if (!page) 2291 | + goto out; 2292 | + 2293 | + /* 2294 | + * page lock is needed because it's racing with 2295 | + * try_to_unmap_ksm(), etc. 2296 | + */ 2297 | + lock_page(page); 2298 | + hlist_del(&rmap_item->hlist); 2299 | + 2300 | + if (hlist_empty(&node_vma->rmap_hlist)) { 2301 | + hlist_del(&node_vma->hlist); 2302 | + free_node_vma(node_vma); 2303 | + } 2304 | + unlock_page(page); 2305 | + 2306 | + put_page(page); 2307 | + if (hlist_empty(&stable_node->hlist)) { 2308 | + /* do NOT call remove_node_from_stable_tree() here, 2309 | + * it's possible for a forked rmap_item not in 2310 | + * stable tree while the in-tree rmap_items were 2311 | + * deleted. 
2312 | + */ 2313 | + uksm_pages_shared--; 2314 | + } else 2315 | + uksm_pages_sharing--; 2316 | + 2317 | + 2318 | + uksm_drop_anon_vma(rmap_item); 2319 | + } else if (rmap_item->address & UNSTABLE_FLAG) { 2320 | + if (rmap_item->hash_round == uksm_hash_round) { 2321 | + 2322 | + rb_erase(&rmap_item->node, 2323 | + &rmap_item->tree_node->sub_root); 2324 | + if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) { 2325 | + rb_erase(&rmap_item->tree_node->node, 2326 | + &root_unstable_tree); 2327 | + 2328 | + free_tree_node(rmap_item->tree_node); 2329 | + } else 2330 | + rmap_item->tree_node->count--; 2331 | + } 2332 | + uksm_pages_unshared--; 2333 | + } 2334 | + 2335 | + rmap_item->address &= PAGE_MASK; 2336 | + rmap_item->hash_max = 0; 2337 | + 2338 | +out: 2339 | + cond_resched(); /* we're called from many long loops */ 2340 | +} 2341 | + 2342 | +static inline int slot_in_uksm(struct vma_slot *slot) 2343 | +{ 2344 | + return list_empty(&slot->slot_list); 2345 | +} 2346 | + 2347 | +/* 2348 | + * Test if the mm is exiting 2349 | + */ 2350 | +static inline bool uksm_test_exit(struct mm_struct *mm) 2351 | +{ 2352 | + return atomic_read(&mm->mm_users) == 0; 2353 | +} 2354 | + 2355 | +static inline unsigned long vma_pool_size(struct vma_slot *slot) 2356 | +{ 2357 | + return round_up(sizeof(struct rmap_list_entry) * slot->pages, 2358 | + PAGE_SIZE) >> PAGE_SHIFT; 2359 | +} 2360 | + 2361 | +#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta)) 2362 | + 2363 | +/* must be done with sem locked */ 2364 | +static int slot_pool_alloc(struct vma_slot *slot) 2365 | +{ 2366 | + unsigned long pool_size; 2367 | + 2368 | + if (slot->rmap_list_pool) 2369 | + return 0; 2370 | + 2371 | + pool_size = vma_pool_size(slot); 2372 | + slot->rmap_list_pool = kzalloc(sizeof(struct page *) * 2373 | + pool_size, GFP_KERNEL); 2374 | + if (!slot->rmap_list_pool) 2375 | + return -ENOMEM; 2376 | + 2377 | + slot->pool_counts = kzalloc(sizeof(unsigned int) * pool_size, 2378 | + GFP_KERNEL); 2379 | + if (!slot->pool_counts) { 2380 | + kfree(slot->rmap_list_pool); 2381 | + return -ENOMEM; 2382 | + } 2383 | + 2384 | + slot->pool_size = pool_size; 2385 | + BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages)); 2386 | + slot->flags |= UKSM_SLOT_IN_UKSM; 2387 | + uksm_pages_total += slot->pages; 2388 | + 2389 | + return 0; 2390 | +} 2391 | + 2392 | +/* 2393 | + * Called after vma is unlinked from its mm 2394 | + */ 2395 | +void uksm_remove_vma(struct vm_area_struct *vma) 2396 | +{ 2397 | + struct vma_slot *slot; 2398 | + 2399 | + if (!vma->uksm_vma_slot) 2400 | + return; 2401 | + 2402 | + spin_lock(&vma_slot_list_lock); 2403 | + slot = vma->uksm_vma_slot; 2404 | + if (!slot) 2405 | + goto out; 2406 | + 2407 | + if (slot_in_uksm(slot)) { 2408 | + /** 2409 | + * This slot has been added by ksmd, so move to the del list 2410 | + * waiting ksmd to free it. 2411 | + */ 2412 | + list_add_tail(&slot->slot_list, &vma_slot_del); 2413 | + } else { 2414 | + /** 2415 | + * It's still on new list. It's ok to free slot directly. 2416 | + */ 2417 | + list_del(&slot->slot_list); 2418 | + free_vma_slot(slot); 2419 | + } 2420 | +out: 2421 | + vma->uksm_vma_slot = NULL; 2422 | + spin_unlock(&vma_slot_list_lock); 2423 | +} 2424 | + 2425 | +/** 2426 | + * Need to do two things: 2427 | + * 1. check if slot was moved to del list 2428 | + * 2. make sure the mmap_sem is manipulated under valid vma. 
2429 | + * 2430 | + * My concern here is that in some cases, this may make 2431 | + * vma_slot_list_lock() waiters to serialized further by some 2432 | + * sem->wait_lock, can this really be expensive? 2433 | + * 2434 | + * 2435 | + * @return 2436 | + * 0: if successfully locked mmap_sem 2437 | + * -ENOENT: this slot was moved to del list 2438 | + * -EBUSY: vma lock failed 2439 | + */ 2440 | +static int try_down_read_slot_mmap_sem(struct vma_slot *slot) 2441 | +{ 2442 | + struct vm_area_struct *vma; 2443 | + struct mm_struct *mm; 2444 | + struct rw_semaphore *sem; 2445 | + 2446 | + spin_lock(&vma_slot_list_lock); 2447 | + 2448 | + /* the slot_list was removed and inited from new list, when it enters 2449 | + * uksm_list. If now it's not empty, then it must be moved to del list 2450 | + */ 2451 | + if (!slot_in_uksm(slot)) { 2452 | + spin_unlock(&vma_slot_list_lock); 2453 | + return -ENOENT; 2454 | + } 2455 | + 2456 | + BUG_ON(slot->pages != vma_pages(slot->vma)); 2457 | + /* Ok, vma still valid */ 2458 | + vma = slot->vma; 2459 | + mm = vma->vm_mm; 2460 | + sem = &mm->mmap_sem; 2461 | + 2462 | + if (uksm_test_exit(mm)) { 2463 | + spin_unlock(&vma_slot_list_lock); 2464 | + return -ENOENT; 2465 | + } 2466 | + 2467 | + if (down_read_trylock(sem)) { 2468 | + spin_unlock(&vma_slot_list_lock); 2469 | + if (slot_pool_alloc(slot)) { 2470 | + uksm_remove_vma(vma); 2471 | + up_read(sem); 2472 | + return -ENOENT; 2473 | + } 2474 | + return 0; 2475 | + } 2476 | + 2477 | + spin_unlock(&vma_slot_list_lock); 2478 | + return -EBUSY; 2479 | +} 2480 | + 2481 | +static inline unsigned long 2482 | +vma_page_address(struct page *page, struct vm_area_struct *vma) 2483 | +{ 2484 | + pgoff_t pgoff = page->index; 2485 | + unsigned long address; 2486 | + 2487 | + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 2488 | + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { 2489 | + /* page should be within @vma mapping range */ 2490 | + return -EFAULT; 2491 | + } 2492 | + return address; 2493 | +} 2494 | + 2495 | + 2496 | +/* return 0 on success with the item's mmap_sem locked */ 2497 | +static inline int get_mergeable_page_lock_mmap(struct rmap_item *item) 2498 | +{ 2499 | + struct mm_struct *mm; 2500 | + struct vma_slot *slot = item->slot; 2501 | + int err = -EINVAL; 2502 | + 2503 | + struct page *page; 2504 | + 2505 | + /* 2506 | + * try_down_read_slot_mmap_sem() returns non-zero if the slot 2507 | + * has been removed by uksm_remove_vma(). 2508 | + */ 2509 | + if (try_down_read_slot_mmap_sem(slot)) 2510 | + return -EBUSY; 2511 | + 2512 | + mm = slot->vma->vm_mm; 2513 | + 2514 | + if (uksm_test_exit(mm)) 2515 | + goto failout_up; 2516 | + 2517 | + page = item->page; 2518 | + rcu_read_lock(); 2519 | + if (!get_page_unless_zero(page)) { 2520 | + rcu_read_unlock(); 2521 | + goto failout_up; 2522 | + } 2523 | + 2524 | + /* No need to consider huge page here. */ 2525 | + if (item->slot->vma->anon_vma != page_anon_vma(page) || 2526 | + vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) { 2527 | + /* 2528 | + * TODO: 2529 | + * should we release this item becase of its stale page 2530 | + * mapping? 2531 | + */ 2532 | + put_page(page); 2533 | + rcu_read_unlock(); 2534 | + goto failout_up; 2535 | + } 2536 | + rcu_read_unlock(); 2537 | + return 0; 2538 | + 2539 | +failout_up: 2540 | + up_read(&mm->mmap_sem); 2541 | + return err; 2542 | +} 2543 | + 2544 | +/* 2545 | + * What kind of VMA is considered ? 
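try_down_read_slot_mmap_sem() above never blocks on mmap_sem: a slot already moved to the del list yields -ENOENT, a semaphore that is merely contended yields -EBUSY, and uksmd simply skips the busy VMA and retries on a later pass. A minimal userspace sketch of that trylock-and-skip pattern using a pthread rwlock (struct and function names are illustrative, not the patch's API):

#include <stdio.h>
#include <errno.h>
#include <pthread.h>

struct slot {
    int removed;                  /* stands in for "moved to the del list" */
    pthread_rwlock_t *sem;        /* stands in for the owning mm's mmap_sem */
};

/* 0: read lock taken, -ENOENT: slot gone, -EBUSY: try again next pass */
static int try_lock_slot(struct slot *s)
{
    if (s->removed)
        return -ENOENT;
    if (pthread_rwlock_tryrdlock(s->sem))
        return -EBUSY;
    return 0;
}

int main(void)
{
    pthread_rwlock_t sem = PTHREAD_RWLOCK_INITIALIZER;
    struct slot s = { .removed = 0, .sem = &sem };

    if (try_lock_slot(&s) == 0) {        /* free: scan under the lock      */
        printf("locked for scanning\n");
        pthread_rwlock_unlock(&sem);
    }

    s.removed = 1;                       /* VMA went away behind our back  */
    printf("%d\n", try_lock_slot(&s));   /* -ENOENT: slot must be dropped  */
    return 0;
}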
2546 | + */ 2547 | +static inline int vma_can_enter(struct vm_area_struct *vma) 2548 | +{ 2549 | + return uksm_flags_can_scan(vma->vm_flags); 2550 | +} 2551 | + 2552 | +/* 2553 | + * Called whenever a fresh new vma is created A new vma_slot. 2554 | + * is created and inserted into a global list Must be called. 2555 | + * after vma is inserted to its mm . 2556 | + */ 2557 | +void uksm_vma_add_new(struct vm_area_struct *vma) 2558 | +{ 2559 | + struct vma_slot *slot; 2560 | + 2561 | + if (!vma_can_enter(vma)) { 2562 | + vma->uksm_vma_slot = NULL; 2563 | + return; 2564 | + } 2565 | + 2566 | + slot = alloc_vma_slot(); 2567 | + if (!slot) { 2568 | + vma->uksm_vma_slot = NULL; 2569 | + return; 2570 | + } 2571 | + 2572 | + vma->uksm_vma_slot = slot; 2573 | + vma->vm_flags |= VM_MERGEABLE; 2574 | + slot->vma = vma; 2575 | + slot->mm = vma->vm_mm; 2576 | + slot->ctime_j = jiffies; 2577 | + slot->pages = vma_pages(vma); 2578 | + spin_lock(&vma_slot_list_lock); 2579 | + list_add_tail(&slot->slot_list, &vma_slot_new); 2580 | + spin_unlock(&vma_slot_list_lock); 2581 | +} 2582 | + 2583 | +/* 32/3 < they < 32/2 */ 2584 | +#define shiftl 8 2585 | +#define shiftr 12 2586 | + 2587 | +#define HASH_FROM_TO(from, to) \ 2588 | +for (index = from; index < to; index++) { \ 2589 | + pos = random_nums[index]; \ 2590 | + hash += key[pos]; \ 2591 | + hash += (hash << shiftl); \ 2592 | + hash ^= (hash >> shiftr); \ 2593 | +} 2594 | + 2595 | + 2596 | +#define HASH_FROM_DOWN_TO(from, to) \ 2597 | +for (index = from - 1; index >= to; index--) { \ 2598 | + hash ^= (hash >> shiftr); \ 2599 | + hash ^= (hash >> (shiftr*2)); \ 2600 | + hash -= (hash << shiftl); \ 2601 | + hash += (hash << (shiftl*2)); \ 2602 | + pos = random_nums[index]; \ 2603 | + hash -= key[pos]; \ 2604 | +} 2605 | + 2606 | +/* 2607 | + * The main random sample hash function. 
2608 | + */ 2609 | +static u32 random_sample_hash(void *addr, u32 hash_strength) 2610 | +{ 2611 | + u32 hash = 0xdeadbeef; 2612 | + int index, pos, loop = hash_strength; 2613 | + u32 *key = (u32 *)addr; 2614 | + 2615 | + if (loop > HASH_STRENGTH_FULL) 2616 | + loop = HASH_STRENGTH_FULL; 2617 | + 2618 | + HASH_FROM_TO(0, loop); 2619 | + 2620 | + if (hash_strength > HASH_STRENGTH_FULL) { 2621 | + loop = hash_strength - HASH_STRENGTH_FULL; 2622 | + HASH_FROM_TO(0, loop); 2623 | + } 2624 | + 2625 | + return hash; 2626 | +} 2627 | + 2628 | + 2629 | +/** 2630 | + * It's used when hash strength is adjusted 2631 | + * 2632 | + * @addr The page's virtual address 2633 | + * @from The original hash strength 2634 | + * @to The hash strength changed to 2635 | + * @hash The hash value generated with "from" hash value 2636 | + * 2637 | + * return the hash value 2638 | + */ 2639 | +static u32 delta_hash(void *addr, int from, int to, u32 hash) 2640 | +{ 2641 | + u32 *key = (u32 *)addr; 2642 | + int index, pos; /* make sure they are int type */ 2643 | + 2644 | + if (to > from) { 2645 | + if (from >= HASH_STRENGTH_FULL) { 2646 | + from -= HASH_STRENGTH_FULL; 2647 | + to -= HASH_STRENGTH_FULL; 2648 | + HASH_FROM_TO(from, to); 2649 | + } else if (to <= HASH_STRENGTH_FULL) { 2650 | + HASH_FROM_TO(from, to); 2651 | + } else { 2652 | + HASH_FROM_TO(from, HASH_STRENGTH_FULL); 2653 | + HASH_FROM_TO(0, to - HASH_STRENGTH_FULL); 2654 | + } 2655 | + } else { 2656 | + if (from <= HASH_STRENGTH_FULL) { 2657 | + HASH_FROM_DOWN_TO(from, to); 2658 | + } else if (to >= HASH_STRENGTH_FULL) { 2659 | + from -= HASH_STRENGTH_FULL; 2660 | + to -= HASH_STRENGTH_FULL; 2661 | + HASH_FROM_DOWN_TO(from, to); 2662 | + } else { 2663 | + HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0); 2664 | + HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to); 2665 | + } 2666 | + } 2667 | + 2668 | + return hash; 2669 | +} 2670 | + 2671 | +/** 2672 | + * 2673 | + * Called when: rshash_pos or rshash_neg is about to overflow or a scan round 2674 | + * has finished. 2675 | + * 2676 | + * return 0 if no page has been scanned since last call, 1 otherwise. 
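random_sample_hash() above hashes only hash_strength randomly chosen 32-bit words of a page (the positions come from random_nums), which is what makes the per-page cost adjustable, and delta_hash() can add or remove sample positions so existing hash values are updated incrementally instead of being recomputed from scratch. A standalone userspace version of the same forward mixing loop, with example sample positions:

#include <stdio.h>
#include <stdint.h>

#define shiftl 8
#define shiftr 12

/* sample 'strength' u32 words of 'key' at the given positions and mix them,
 * the HASH_FROM_TO() step: add the word, then shift-add and shift-xor mix */
static uint32_t random_sample_hash(const uint32_t *key,
                                   const unsigned *positions,
                                   unsigned strength)
{
    uint32_t hash = 0xdeadbeef;
    unsigned i;

    for (i = 0; i < strength; i++) {
        hash += key[positions[i]];
        hash += (hash << shiftl);
        hash ^= (hash >> shiftr);
    }
    return hash;
}

int main(void)
{
    uint32_t page[1024] = { 0 };             /* a 4 KiB page as 1024 words */
    unsigned pos[4] = { 7, 130, 512, 1001 }; /* example sample positions   */

    page[130] = 0x12345678;
    printf("%#x\n", random_sample_hash(page, pos, 4));

    page[512] = 1;                           /* touching a sampled word... */
    printf("%#x\n", random_sample_hash(page, pos, 4)); /* ...changes hash  */

    page[3] = 99;                            /* an unsampled word does not */
    printf("%#x\n", random_sample_hash(page, pos, 4));
    return 0;
}

HASH_FROM_DOWN_TO() is the exact inverse of this loop (each add, multiply-by-257 and xor-shift step is invertible mod 2^32), which is what lets delta_hash() lower the strength without rehashing the whole page.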
2677 | + */ 2678 | +static inline int encode_benefit(void) 2679 | +{ 2680 | + u64 scanned_delta, pos_delta, neg_delta; 2681 | + unsigned long base = benefit.base; 2682 | + 2683 | + scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last; 2684 | + 2685 | + if (!scanned_delta) 2686 | + return 0; 2687 | + 2688 | + scanned_delta >>= base; 2689 | + pos_delta = rshash_pos >> base; 2690 | + neg_delta = rshash_neg >> base; 2691 | + 2692 | + if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) || 2693 | + CAN_OVERFLOW_U64(benefit.neg, neg_delta) || 2694 | + CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) { 2695 | + benefit.scanned >>= 1; 2696 | + benefit.neg >>= 1; 2697 | + benefit.pos >>= 1; 2698 | + benefit.base++; 2699 | + scanned_delta >>= 1; 2700 | + pos_delta >>= 1; 2701 | + neg_delta >>= 1; 2702 | + } 2703 | + 2704 | + benefit.pos += pos_delta; 2705 | + benefit.neg += neg_delta; 2706 | + benefit.scanned += scanned_delta; 2707 | + 2708 | + BUG_ON(!benefit.scanned); 2709 | + 2710 | + rshash_pos = rshash_neg = 0; 2711 | + uksm_pages_scanned_last = uksm_pages_scanned; 2712 | + 2713 | + return 1; 2714 | +} 2715 | + 2716 | +static inline void reset_benefit(void) 2717 | +{ 2718 | + benefit.pos = 0; 2719 | + benefit.neg = 0; 2720 | + benefit.base = 0; 2721 | + benefit.scanned = 0; 2722 | +} 2723 | + 2724 | +static inline void inc_rshash_pos(unsigned long delta) 2725 | +{ 2726 | + if (CAN_OVERFLOW_U64(rshash_pos, delta)) 2727 | + encode_benefit(); 2728 | + 2729 | + rshash_pos += delta; 2730 | +} 2731 | + 2732 | +static inline void inc_rshash_neg(unsigned long delta) 2733 | +{ 2734 | + if (CAN_OVERFLOW_U64(rshash_neg, delta)) 2735 | + encode_benefit(); 2736 | + 2737 | + rshash_neg += delta; 2738 | +} 2739 | + 2740 | + 2741 | +static inline u32 page_hash(struct page *page, unsigned long hash_strength, 2742 | + int cost_accounting) 2743 | +{ 2744 | + u32 val; 2745 | + unsigned long delta; 2746 | + 2747 | + void *addr = kmap_atomic(page); 2748 | + 2749 | + val = random_sample_hash(addr, hash_strength); 2750 | + kunmap_atomic(addr); 2751 | + 2752 | + if (cost_accounting) { 2753 | + if (HASH_STRENGTH_FULL > hash_strength) 2754 | + delta = HASH_STRENGTH_FULL - hash_strength; 2755 | + else 2756 | + delta = 0; 2757 | + 2758 | + inc_rshash_pos(delta); 2759 | + } 2760 | + 2761 | + return val; 2762 | +} 2763 | + 2764 | +static int memcmp_pages(struct page *page1, struct page *page2, 2765 | + int cost_accounting) 2766 | +{ 2767 | + char *addr1, *addr2; 2768 | + int ret; 2769 | + 2770 | + addr1 = kmap_atomic(page1); 2771 | + addr2 = kmap_atomic(page2); 2772 | + ret = memcmp(addr1, addr2, PAGE_SIZE); 2773 | + kunmap_atomic(addr2); 2774 | + kunmap_atomic(addr1); 2775 | + 2776 | + if (cost_accounting) 2777 | + inc_rshash_neg(memcmp_cost); 2778 | + 2779 | + return ret; 2780 | +} 2781 | + 2782 | +static inline int pages_identical(struct page *page1, struct page *page2) 2783 | +{ 2784 | + return !memcmp_pages(page1, page2, 0); 2785 | +} 2786 | + 2787 | +static inline int is_page_full_zero(struct page *page) 2788 | +{ 2789 | + char *addr; 2790 | + int ret; 2791 | + 2792 | + addr = kmap_atomic(page); 2793 | + ret = is_full_zero(addr, PAGE_SIZE); 2794 | + kunmap_atomic(addr); 2795 | + 2796 | + return ret; 2797 | +} 2798 | + 2799 | +static int write_protect_page(struct vm_area_struct *vma, struct page *page, 2800 | + pte_t *orig_pte, pte_t *old_pte) 2801 | +{ 2802 | + struct mm_struct *mm = vma->vm_mm; 2803 | + unsigned long addr; 2804 | + pte_t *ptep; 2805 | + spinlock_t *ptl; 2806 | + int swapped; 2807 | + int err = 
-EFAULT; 2808 | + unsigned long mmun_start; /* For mmu_notifiers */ 2809 | + unsigned long mmun_end; /* For mmu_notifiers */ 2810 | + 2811 | + addr = page_address_in_vma(page, vma); 2812 | + if (addr == -EFAULT) 2813 | + goto out; 2814 | + 2815 | + BUG_ON(PageTransCompound(page)); 2816 | + 2817 | + mmun_start = addr; 2818 | + mmun_end = addr + PAGE_SIZE; 2819 | + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2820 | + 2821 | + ptep = page_check_address(page, mm, addr, &ptl, 0); 2822 | + if (!ptep) 2823 | + goto out_mn; 2824 | + 2825 | + if (old_pte) 2826 | + *old_pte = *ptep; 2827 | + 2828 | + if (pte_write(*ptep) || pte_dirty(*ptep)) { 2829 | + pte_t entry; 2830 | + 2831 | + swapped = PageSwapCache(page); 2832 | + flush_cache_page(vma, addr, page_to_pfn(page)); 2833 | + /* 2834 | + * Ok this is tricky, when get_user_pages_fast() run it doesnt 2835 | + * take any lock, therefore the check that we are going to make 2836 | + * with the pagecount against the mapcount is racey and 2837 | + * O_DIRECT can happen right after the check. 2838 | + * So we clear the pte and flush the tlb before the check 2839 | + * this assure us that no O_DIRECT can happen after the check 2840 | + * or in the middle of the check. 2841 | + */ 2842 | + entry = ptep_clear_flush_notify(vma, addr, ptep); 2843 | + /* 2844 | + * Check that no O_DIRECT or similar I/O is in progress on the 2845 | + * page 2846 | + */ 2847 | + if (page_mapcount(page) + 1 + swapped != page_count(page)) { 2848 | + set_pte_at(mm, addr, ptep, entry); 2849 | + goto out_unlock; 2850 | + } 2851 | + if (pte_dirty(entry)) 2852 | + set_page_dirty(page); 2853 | + entry = pte_mkclean(pte_wrprotect(entry)); 2854 | + set_pte_at_notify(mm, addr, ptep, entry); 2855 | + } 2856 | + *orig_pte = *ptep; 2857 | + err = 0; 2858 | + 2859 | +out_unlock: 2860 | + pte_unmap_unlock(ptep, ptl); 2861 | +out_mn: 2862 | + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2863 | +out: 2864 | + return err; 2865 | +} 2866 | + 2867 | +#define MERGE_ERR_PGERR 1 /* the page is invalid cannot continue */ 2868 | +#define MERGE_ERR_COLLI 2 /* there is a collision */ 2869 | +#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */ 2870 | +#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */ 2871 | + 2872 | + 2873 | +/** 2874 | + * replace_page - replace page in vma by new ksm page 2875 | + * @vma: vma that holds the pte pointing to page 2876 | + * @page: the page we are replacing by kpage 2877 | + * @kpage: the ksm page we replace page by 2878 | + * @orig_pte: the original value of the pte 2879 | + * 2880 | + * Returns 0 on success, MERGE_ERR_PGERR on failure. 
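The rshash_pos/rshash_neg bookkeeping above never lets its u64 counters wrap: encode_benefit() pre-shifts every delta by benefit.base, and when an addition would still overflow it halves all three accumulators and bumps the base, so only the pos : neg : scanned proportions survive, which is all the hash-strength heuristic needs. A compact userspace sketch of that saturating ratio encoding:

#include <stdio.h>
#include <stdint.h>

#define CAN_OVERFLOW_U64(x, d) (UINT64_MAX - (x) < (d))

/* ratio accumulator in the style of struct uksm_benefit: absolute counts are
 * given up, but pos/neg/scanned keep their relative proportions */
struct benefit {
    uint64_t pos, neg, scanned;
    unsigned base;               /* how many times everything was halved */
};

static void benefit_add(struct benefit *b, uint64_t pos, uint64_t neg,
                        uint64_t scanned)
{
    pos >>= b->base;
    neg >>= b->base;
    scanned >>= b->base;

    if (CAN_OVERFLOW_U64(b->pos, pos) ||
        CAN_OVERFLOW_U64(b->neg, neg) ||
        CAN_OVERFLOW_U64(b->scanned, scanned)) {
        b->pos >>= 1; b->neg >>= 1; b->scanned >>= 1;
        pos >>= 1; neg >>= 1; scanned >>= 1;
        b->base++;
    }

    b->pos += pos;
    b->neg += neg;
    b->scanned += scanned;
}

int main(void)
{
    struct benefit b = { 0, 0, 0, 0 };

    benefit_add(&b, UINT64_MAX - 10, 5, 1000);
    printf("base=%u\n", b.base);              /* 0: still fits            */
    benefit_add(&b, 1000, 5, 1000);
    printf("base=%u\n", b.base);              /* 1: everything was halved */
    return 0;
}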
2881 | + */ 2882 | +static int replace_page(struct vm_area_struct *vma, struct page *page, 2883 | + struct page *kpage, pte_t orig_pte) 2884 | +{ 2885 | + struct mm_struct *mm = vma->vm_mm; 2886 | + pgd_t *pgd; 2887 | + pud_t *pud; 2888 | + pmd_t *pmd; 2889 | + pte_t *ptep; 2890 | + spinlock_t *ptl; 2891 | + pte_t entry; 2892 | + 2893 | + unsigned long addr; 2894 | + int err = MERGE_ERR_PGERR; 2895 | + unsigned long mmun_start; /* For mmu_notifiers */ 2896 | + unsigned long mmun_end; /* For mmu_notifiers */ 2897 | + 2898 | + addr = page_address_in_vma(page, vma); 2899 | + if (addr == -EFAULT) 2900 | + goto out; 2901 | + 2902 | + pgd = pgd_offset(mm, addr); 2903 | + if (!pgd_present(*pgd)) 2904 | + goto out; 2905 | + 2906 | + pud = pud_offset(pgd, addr); 2907 | + if (!pud_present(*pud)) 2908 | + goto out; 2909 | + 2910 | + pmd = pmd_offset(pud, addr); 2911 | + BUG_ON(pmd_trans_huge(*pmd)); 2912 | + if (!pmd_present(*pmd)) 2913 | + goto out; 2914 | + 2915 | + mmun_start = addr; 2916 | + mmun_end = addr + PAGE_SIZE; 2917 | + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2918 | + 2919 | + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); 2920 | + if (!pte_same(*ptep, orig_pte)) { 2921 | + pte_unmap_unlock(ptep, ptl); 2922 | + goto out_mn; 2923 | + } 2924 | + 2925 | + flush_cache_page(vma, addr, pte_pfn(*ptep)); 2926 | + ptep_clear_flush_notify(vma, addr, ptep); 2927 | + entry = mk_pte(kpage, vma->vm_page_prot); 2928 | + 2929 | + /* special treatment is needed for zero_page */ 2930 | + if ((page_to_pfn(kpage) == uksm_zero_pfn) || 2931 | + (page_to_pfn(kpage) == zero_pfn)) { 2932 | + entry = pte_mkspecial(entry); 2933 | + dec_mm_counter(mm, MM_ANONPAGES); 2934 | + } else { 2935 | + get_page(kpage); 2936 | + page_add_anon_rmap(kpage, vma, addr, false); 2937 | + } 2938 | + 2939 | + set_pte_at_notify(mm, addr, ptep, entry); 2940 | + 2941 | + page_remove_rmap(page, false); 2942 | + if (!page_mapped(page)) 2943 | + try_to_free_swap(page); 2944 | + put_page(page); 2945 | + 2946 | + pte_unmap_unlock(ptep, ptl); 2947 | + err = 0; 2948 | +out_mn: 2949 | + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2950 | +out: 2951 | + return err; 2952 | +} 2953 | + 2954 | + 2955 | +/** 2956 | + * Fully hash a page with HASH_STRENGTH_MAX return a non-zero hash value. The 2957 | + * zero hash value at HASH_STRENGTH_MAX is used to indicated that its 2958 | + * hash_max member has not been calculated. 2959 | + * 2960 | + * @page The page needs to be hashed 2961 | + * @hash_old The hash value calculated with current hash strength 2962 | + * 2963 | + * return the new hash value calculated at HASH_STRENGTH_MAX 2964 | + */ 2965 | +static inline u32 page_hash_max(struct page *page, u32 hash_old) 2966 | +{ 2967 | + u32 hash_max = 0; 2968 | + void *addr; 2969 | + 2970 | + addr = kmap_atomic(page); 2971 | + hash_max = delta_hash(addr, hash_strength, 2972 | + HASH_STRENGTH_MAX, hash_old); 2973 | + 2974 | + kunmap_atomic(addr); 2975 | + 2976 | + if (!hash_max) 2977 | + hash_max = 1; 2978 | + 2979 | + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); 2980 | + return hash_max; 2981 | +} 2982 | + 2983 | +/* 2984 | + * We compare the hash again, to ensure that it is really a hash collision 2985 | + * instead of being caused by page write. 
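check_collision() below resolves the ambiguity described in the comment just above: after a failed memcmp the page is hashed again, and an unchanged hash means a genuine sampling collision (the sampled words really are equal) while a changed hash means the page was written between hashing and comparison. A toy userspace illustration of that decision; the partial_hash() stand-in below is deliberately simplistic and is not the patch's random_sample_hash():

#include <stdio.h>
#include <stdint.h>

#define ERR_COLLI   2   /* same content at the sampled words: true collision */
#define ERR_CHANGED 4   /* page was modified after it was hashed             */

/* stand-in for a partial-strength hash: only looks at a few sampled words,
 * so different pages can legitimately hash alike */
static uint32_t partial_hash(const uint32_t *page)
{
    return page[1] + page[200] + page[900];
}

/* the check_collision() idea: rehash and compare with the hash recorded
 * when the page entered the tree */
static int classify_memcmp_failure(const uint32_t *page, uint32_t recorded)
{
    return partial_hash(page) == recorded ? ERR_COLLI : ERR_CHANGED;
}

int main(void)
{
    uint32_t page[1024] = { 0 };
    uint32_t hash = partial_hash(page);

    page[500] = 7;   /* differs from its tree partner, but not at a sample */
    printf("%d\n", classify_memcmp_failure(page, hash));  /* 2: collision  */

    page[200] = 9;   /* a sampled word changed after hashing */
    printf("%d\n", classify_memcmp_failure(page, hash));  /* 4: changed    */
    return 0;
}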
2986 | + */ 2987 | +static inline int check_collision(struct rmap_item *rmap_item, 2988 | + u32 hash) 2989 | +{ 2990 | + int err; 2991 | + struct page *page = rmap_item->page; 2992 | + 2993 | + /* if this rmap_item has already been hash_maxed, then the collision 2994 | + * must appears in the second-level rbtree search. In this case we check 2995 | + * if its hash_max value has been changed. Otherwise, the collision 2996 | + * happens in the first-level rbtree search, so we check against it's 2997 | + * current hash value. 2998 | + */ 2999 | + if (rmap_item->hash_max) { 3000 | + inc_rshash_neg(memcmp_cost); 3001 | + inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); 3002 | + 3003 | + if (rmap_item->hash_max == page_hash_max(page, hash)) 3004 | + err = MERGE_ERR_COLLI; 3005 | + else 3006 | + err = MERGE_ERR_CHANGED; 3007 | + } else { 3008 | + inc_rshash_neg(memcmp_cost + hash_strength); 3009 | + 3010 | + if (page_hash(page, hash_strength, 0) == hash) 3011 | + err = MERGE_ERR_COLLI; 3012 | + else 3013 | + err = MERGE_ERR_CHANGED; 3014 | + } 3015 | + 3016 | + return err; 3017 | +} 3018 | + 3019 | +/** 3020 | + * Try to merge a rmap_item.page with a kpage in stable node. kpage must 3021 | + * already be a ksm page. 3022 | + * 3023 | + * @return 0 if the pages were merged, -EFAULT otherwise. 3024 | + */ 3025 | +static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item, 3026 | + struct page *kpage, u32 hash) 3027 | +{ 3028 | + struct vm_area_struct *vma = rmap_item->slot->vma; 3029 | + struct mm_struct *mm = vma->vm_mm; 3030 | + pte_t orig_pte = __pte(0); 3031 | + int err = MERGE_ERR_PGERR; 3032 | + struct page *page; 3033 | + 3034 | + if (uksm_test_exit(mm)) 3035 | + goto out; 3036 | + 3037 | + page = rmap_item->page; 3038 | + 3039 | + if (page == kpage) { /* ksm page forked */ 3040 | + err = 0; 3041 | + goto out; 3042 | + } 3043 | + 3044 | + /* 3045 | + * We need the page lock to read a stable PageSwapCache in 3046 | + * write_protect_page(). We use trylock_page() instead of 3047 | + * lock_page() because we don't want to wait here - we 3048 | + * prefer to continue scanning and merging different pages, 3049 | + * then come back to this page when it is unlocked. 3050 | + */ 3051 | + if (!trylock_page(page)) 3052 | + goto out; 3053 | + 3054 | + if (!PageAnon(page) || !PageKsm(kpage)) 3055 | + goto out_unlock; 3056 | + 3057 | + if (PageTransCompound(page)) { 3058 | + err = split_huge_page(page); 3059 | + if (err) 3060 | + goto out_unlock; 3061 | + } 3062 | + 3063 | + /* 3064 | + * If this anonymous page is mapped only here, its pte may need 3065 | + * to be write-protected. If it's mapped elsewhere, all of its 3066 | + * ptes are necessarily already write-protected. But in either 3067 | + * case, we need to lock and check page_count is not raised. 
3068 | + */ 3069 | + if (write_protect_page(vma, page, &orig_pte, NULL) == 0) { 3070 | + if (pages_identical(page, kpage)) 3071 | + err = replace_page(vma, page, kpage, orig_pte); 3072 | + else 3073 | + err = check_collision(rmap_item, hash); 3074 | + } 3075 | + 3076 | + if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { 3077 | + munlock_vma_page(page); 3078 | + if (!PageMlocked(kpage)) { 3079 | + unlock_page(page); 3080 | + lock_page(kpage); 3081 | + mlock_vma_page(kpage); 3082 | + page = kpage; /* for final unlock */ 3083 | + } 3084 | + } 3085 | + 3086 | +out_unlock: 3087 | + unlock_page(page); 3088 | +out: 3089 | + return err; 3090 | +} 3091 | + 3092 | + 3093 | + 3094 | +/** 3095 | + * If two pages fail to merge in try_to_merge_two_pages, then we have a chance 3096 | + * to restore a page mapping that has been changed in try_to_merge_two_pages. 3097 | + * 3098 | + * @return 0 on success. 3099 | + */ 3100 | +static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr, 3101 | + pte_t orig_pte, pte_t wprt_pte) 3102 | +{ 3103 | + struct mm_struct *mm = vma->vm_mm; 3104 | + pgd_t *pgd; 3105 | + pud_t *pud; 3106 | + pmd_t *pmd; 3107 | + pte_t *ptep; 3108 | + spinlock_t *ptl; 3109 | + 3110 | + int err = -EFAULT; 3111 | + 3112 | + pgd = pgd_offset(mm, addr); 3113 | + if (!pgd_present(*pgd)) 3114 | + goto out; 3115 | + 3116 | + pud = pud_offset(pgd, addr); 3117 | + if (!pud_present(*pud)) 3118 | + goto out; 3119 | + 3120 | + pmd = pmd_offset(pud, addr); 3121 | + if (!pmd_present(*pmd)) 3122 | + goto out; 3123 | + 3124 | + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); 3125 | + if (!pte_same(*ptep, wprt_pte)) { 3126 | + /* already copied, let it be */ 3127 | + pte_unmap_unlock(ptep, ptl); 3128 | + goto out; 3129 | + } 3130 | + 3131 | + /* 3132 | + * Good boy, still here. When we still get the ksm page, it does not 3133 | + * return to the free page pool, there is no way that a pte was changed 3134 | + * to other page and gets back to this page. And remind that ksm page 3135 | + * do not reuse in do_wp_page(). So it's safe to restore the original 3136 | + * pte. 3137 | + */ 3138 | + flush_cache_page(vma, addr, pte_pfn(*ptep)); 3139 | + ptep_clear_flush_notify(vma, addr, ptep); 3140 | + set_pte_at_notify(mm, addr, ptep, orig_pte); 3141 | + 3142 | + pte_unmap_unlock(ptep, ptl); 3143 | + err = 0; 3144 | +out: 3145 | + return err; 3146 | +} 3147 | + 3148 | +/** 3149 | + * try_to_merge_two_pages() - take two identical pages and prepare 3150 | + * them to be merged into one page(rmap_item->page) 3151 | + * 3152 | + * @return 0 if we successfully merged two identical pages into 3153 | + * one ksm page. MERGE_ERR_COLLI if it's only a hash collision 3154 | + * search in rbtree. MERGE_ERR_CHANGED if rmap_item has been 3155 | + * changed since it's hashed. MERGE_ERR_PGERR otherwise. 
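try_to_merge_two_pages() below behaves like a small transaction: both pages are write-protected first and their old PTEs saved, the contents are compared only afterwards, and restore_uksm_page_pte() above rolls the first mapping back when anything fails, so the failure path needs no break_ksm-style cleanup. A heavily simplified userspace sketch of that protect / compare / commit-or-rollback shape, with plain flags and byte arrays standing in for PTEs and pages:

#include <stdio.h>
#include <string.h>

struct mapping { char page[8]; int writable; };

/* "write_protect_page": remember the old state, then drop write access */
static void protect(struct mapping *m, int *saved_writable)
{
    *saved_writable = m->writable;
    m->writable = 0;
}

/* "restore_uksm_page_pte": roll the mapping back to what it was */
static void restore(struct mapping *m, int saved_writable)
{
    m->writable = saved_writable;
}

/* the try_to_merge_two_pages() shape: protect both, compare, then either
 * commit (point b at a's content) or roll back */
static int try_merge(struct mapping *a, struct mapping *b)
{
    int wa, wb;

    protect(a, &wa);
    protect(b, &wb);

    if (memcmp(a->page, b->page, sizeof(a->page)) == 0) {
        memcpy(b->page, a->page, sizeof(a->page));  /* "replace_page" */
        return 0;
    }

    restore(b, wb);   /* failed: undo, no break_ksm-style cleanup needed */
    restore(a, wa);
    return -1;
}

int main(void)
{
    struct mapping a = { "same", 1 }, b = { "same", 1 }, c = { "diff", 1 };

    printf("merge a,b: %d (b writable=%d)\n", try_merge(&a, &b), b.writable);
    printf("merge a,c: %d (c writable=%d)\n", try_merge(&a, &c), c.writable);
    return 0;
}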
3156 | + * 3157 | + */ 3158 | +static int try_to_merge_two_pages(struct rmap_item *rmap_item, 3159 | + struct rmap_item *tree_rmap_item, 3160 | + u32 hash) 3161 | +{ 3162 | + pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0); 3163 | + pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0); 3164 | + struct vm_area_struct *vma1 = rmap_item->slot->vma; 3165 | + struct vm_area_struct *vma2 = tree_rmap_item->slot->vma; 3166 | + struct page *page = rmap_item->page; 3167 | + struct page *tree_page = tree_rmap_item->page; 3168 | + int err = MERGE_ERR_PGERR; 3169 | + struct address_space *saved_mapping; 3170 | + 3171 | + 3172 | + if (rmap_item->page == tree_rmap_item->page) 3173 | + goto out; 3174 | + 3175 | + if (!trylock_page(page)) 3176 | + goto out; 3177 | + 3178 | + if (!PageAnon(page)) 3179 | + goto out_unlock; 3180 | + 3181 | + if (PageTransCompound(page)) { 3182 | + err = split_huge_page(page); 3183 | + if (err) 3184 | + goto out_unlock; 3185 | + } 3186 | + 3187 | + if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) { 3188 | + unlock_page(page); 3189 | + goto out; 3190 | + } 3191 | + 3192 | + /* 3193 | + * While we hold page lock, upgrade page from 3194 | + * PageAnon+anon_vma to PageKsm+NULL stable_node: 3195 | + * stable_tree_insert() will update stable_node. 3196 | + */ 3197 | + saved_mapping = page->mapping; 3198 | + set_page_stable_node(page, NULL); 3199 | + mark_page_accessed(page); 3200 | + if (!PageDirty(page)) 3201 | + SetPageDirty(page); 3202 | + 3203 | + unlock_page(page); 3204 | + 3205 | + if (!trylock_page(tree_page)) 3206 | + goto restore_out; 3207 | + 3208 | + if (!PageAnon(tree_page)) { 3209 | + unlock_page(tree_page); 3210 | + goto restore_out; 3211 | + } 3212 | + 3213 | + if (PageTransCompound(tree_page)) { 3214 | + err = split_huge_page(tree_page); 3215 | + if (err) { 3216 | + unlock_page(tree_page); 3217 | + goto restore_out; 3218 | + } 3219 | + } 3220 | + 3221 | + if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) { 3222 | + unlock_page(tree_page); 3223 | + goto restore_out; 3224 | + } 3225 | + 3226 | + if (pages_identical(page, tree_page)) { 3227 | + err = replace_page(vma2, tree_page, page, wprt_pte2); 3228 | + if (err) { 3229 | + unlock_page(tree_page); 3230 | + goto restore_out; 3231 | + } 3232 | + 3233 | + if ((vma2->vm_flags & VM_LOCKED)) { 3234 | + munlock_vma_page(tree_page); 3235 | + if (!PageMlocked(page)) { 3236 | + unlock_page(tree_page); 3237 | + lock_page(page); 3238 | + mlock_vma_page(page); 3239 | + tree_page = page; /* for final unlock */ 3240 | + } 3241 | + } 3242 | + 3243 | + unlock_page(tree_page); 3244 | + 3245 | + goto out; /* success */ 3246 | + 3247 | + } else { 3248 | + if (tree_rmap_item->hash_max && 3249 | + tree_rmap_item->hash_max == rmap_item->hash_max) { 3250 | + err = MERGE_ERR_COLLI_MAX; 3251 | + } else if (page_hash(page, hash_strength, 0) == 3252 | + page_hash(tree_page, hash_strength, 0)) { 3253 | + inc_rshash_neg(memcmp_cost + hash_strength * 2); 3254 | + err = MERGE_ERR_COLLI; 3255 | + } else { 3256 | + err = MERGE_ERR_CHANGED; 3257 | + } 3258 | + 3259 | + unlock_page(tree_page); 3260 | + } 3261 | + 3262 | +restore_out: 3263 | + lock_page(page); 3264 | + if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item), 3265 | + orig_pte1, wprt_pte1)) 3266 | + page->mapping = saved_mapping; 3267 | + 3268 | +out_unlock: 3269 | + unlock_page(page); 3270 | +out: 3271 | + return err; 3272 | +} 3273 | + 3274 | +static inline int hash_cmp(u32 new_val, u32 node_val) 3275 | +{ 3276 | + if (new_val > node_val) 3277 | + 
return 1; 3278 | + else if (new_val < node_val) 3279 | + return -1; 3280 | + else 3281 | + return 0; 3282 | +} 3283 | + 3284 | +static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash) 3285 | +{ 3286 | + u32 hash_max = item->hash_max; 3287 | + 3288 | + if (!hash_max) { 3289 | + hash_max = page_hash_max(item->page, hash); 3290 | + 3291 | + item->hash_max = hash_max; 3292 | + } 3293 | + 3294 | + return hash_max; 3295 | +} 3296 | + 3297 | + 3298 | + 3299 | +/** 3300 | + * stable_tree_search() - search the stable tree for a page 3301 | + * 3302 | + * @item: the rmap_item we are comparing with 3303 | + * @hash: the hash value of this item->page already calculated 3304 | + * 3305 | + * @return the page we have found, NULL otherwise. The page returned has 3306 | + * been gotten. 3307 | + */ 3308 | +static struct page *stable_tree_search(struct rmap_item *item, u32 hash) 3309 | +{ 3310 | + struct rb_node *node = root_stable_treep->rb_node; 3311 | + struct tree_node *tree_node; 3312 | + unsigned long hash_max; 3313 | + struct page *page = item->page; 3314 | + struct stable_node *stable_node; 3315 | + 3316 | + stable_node = page_stable_node(page); 3317 | + if (stable_node) { 3318 | + /* ksm page forked, that is 3319 | + * if (PageKsm(page) && !in_stable_tree(rmap_item)) 3320 | + * it's actually gotten once outside. 3321 | + */ 3322 | + get_page(page); 3323 | + return page; 3324 | + } 3325 | + 3326 | + while (node) { 3327 | + int cmp; 3328 | + 3329 | + tree_node = rb_entry(node, struct tree_node, node); 3330 | + 3331 | + cmp = hash_cmp(hash, tree_node->hash); 3332 | + 3333 | + if (cmp < 0) 3334 | + node = node->rb_left; 3335 | + else if (cmp > 0) 3336 | + node = node->rb_right; 3337 | + else 3338 | + break; 3339 | + } 3340 | + 3341 | + if (!node) 3342 | + return NULL; 3343 | + 3344 | + if (tree_node->count == 1) { 3345 | + stable_node = rb_entry(tree_node->sub_root.rb_node, 3346 | + struct stable_node, node); 3347 | + BUG_ON(!stable_node); 3348 | + 3349 | + goto get_page_out; 3350 | + } 3351 | + 3352 | + /* 3353 | + * ok, we have to search the second 3354 | + * level subtree, hash the page to a 3355 | + * full strength. 3356 | + */ 3357 | + node = tree_node->sub_root.rb_node; 3358 | + BUG_ON(!node); 3359 | + hash_max = rmap_item_hash_max(item, hash); 3360 | + 3361 | + while (node) { 3362 | + int cmp; 3363 | + 3364 | + stable_node = rb_entry(node, struct stable_node, node); 3365 | + 3366 | + cmp = hash_cmp(hash_max, stable_node->hash_max); 3367 | + 3368 | + if (cmp < 0) 3369 | + node = node->rb_left; 3370 | + else if (cmp > 0) 3371 | + node = node->rb_right; 3372 | + else 3373 | + goto get_page_out; 3374 | + } 3375 | + 3376 | + return NULL; 3377 | + 3378 | +get_page_out: 3379 | + page = get_uksm_page(stable_node, 1, 1); 3380 | + return page; 3381 | +} 3382 | + 3383 | +static int try_merge_rmap_item(struct rmap_item *item, 3384 | + struct page *kpage, 3385 | + struct page *tree_page) 3386 | +{ 3387 | + spinlock_t *ptl; 3388 | + pte_t *ptep; 3389 | + unsigned long addr; 3390 | + struct vm_area_struct *vma = item->slot->vma; 3391 | + 3392 | + addr = get_rmap_addr(item); 3393 | + ptep = page_check_address(kpage, vma->vm_mm, addr, &ptl, 0); 3394 | + if (!ptep) 3395 | + return 0; 3396 | + 3397 | + if (pte_write(*ptep)) { 3398 | + /* has changed, abort! 
*/ 3399 | + pte_unmap_unlock(ptep, ptl); 3400 | + return 0; 3401 | + } 3402 | + 3403 | + get_page(tree_page); 3404 | + page_add_anon_rmap(tree_page, vma, addr, false); 3405 | + 3406 | + flush_cache_page(vma, addr, pte_pfn(*ptep)); 3407 | + ptep_clear_flush_notify(vma, addr, ptep); 3408 | + set_pte_at_notify(vma->vm_mm, addr, ptep, 3409 | + mk_pte(tree_page, vma->vm_page_prot)); 3410 | + 3411 | + page_remove_rmap(kpage, false); 3412 | + put_page(kpage); 3413 | + 3414 | + pte_unmap_unlock(ptep, ptl); 3415 | + 3416 | + return 1; 3417 | +} 3418 | + 3419 | +/** 3420 | + * try_to_merge_with_stable_page() - when two rmap_items need to be inserted 3421 | + * into stable tree, the page was found to be identical to a stable ksm page, 3422 | + * this is the last chance we can merge them into one. 3423 | + * 3424 | + * @item1: the rmap_item holding the page which we wanted to insert 3425 | + * into stable tree. 3426 | + * @item2: the other rmap_item we found when unstable tree search 3427 | + * @oldpage: the page currently mapped by the two rmap_items 3428 | + * @tree_page: the page we found identical in stable tree node 3429 | + * @success1: return if item1 is successfully merged 3430 | + * @success2: return if item2 is successfully merged 3431 | + */ 3432 | +static void try_merge_with_stable(struct rmap_item *item1, 3433 | + struct rmap_item *item2, 3434 | + struct page **kpage, 3435 | + struct page *tree_page, 3436 | + int *success1, int *success2) 3437 | +{ 3438 | + struct vm_area_struct *vma1 = item1->slot->vma; 3439 | + struct vm_area_struct *vma2 = item2->slot->vma; 3440 | + *success1 = 0; 3441 | + *success2 = 0; 3442 | + 3443 | + if (unlikely(*kpage == tree_page)) { 3444 | + /* I don't think this can really happen */ 3445 | + printk(KERN_WARNING "UKSM: unexpected condition detected in " 3446 | + "try_merge_with_stable() -- *kpage == tree_page !\n"); 3447 | + *success1 = 1; 3448 | + *success2 = 1; 3449 | + return; 3450 | + } 3451 | + 3452 | + if (!PageAnon(*kpage) || !PageKsm(*kpage)) 3453 | + goto failed; 3454 | + 3455 | + if (!trylock_page(tree_page)) 3456 | + goto failed; 3457 | + 3458 | + /* If the oldpage is still ksm and still pointed 3459 | + * to in the right place, and still write protected, 3460 | + * we are confident it's not changed, no need to 3461 | + * memcmp anymore. 3462 | + * be ware, we cannot take nested pte locks, 3463 | + * deadlock risk. 3464 | + */ 3465 | + if (!try_merge_rmap_item(item1, *kpage, tree_page)) 3466 | + goto unlock_failed; 3467 | + 3468 | + /* ok, then vma2, remind that pte1 already set */ 3469 | + if (!try_merge_rmap_item(item2, *kpage, tree_page)) 3470 | + goto success_1; 3471 | + 3472 | + *success2 = 1; 3473 | +success_1: 3474 | + *success1 = 1; 3475 | + 3476 | + 3477 | + if ((*success1 && vma1->vm_flags & VM_LOCKED) || 3478 | + (*success2 && vma2->vm_flags & VM_LOCKED)) { 3479 | + munlock_vma_page(*kpage); 3480 | + if (!PageMlocked(tree_page)) 3481 | + mlock_vma_page(tree_page); 3482 | + } 3483 | + 3484 | + /* 3485 | + * We do not need oldpage any more in the caller, so can break the lock 3486 | + * now. 3487 | + */ 3488 | + unlock_page(*kpage); 3489 | + *kpage = tree_page; /* Get unlocked outside. 
*/ 3490 | + return; 3491 | + 3492 | +unlock_failed: 3493 | + unlock_page(tree_page); 3494 | +failed: 3495 | + return; 3496 | +} 3497 | + 3498 | +static inline void stable_node_hash_max(struct stable_node *node, 3499 | + struct page *page, u32 hash) 3500 | +{ 3501 | + u32 hash_max = node->hash_max; 3502 | + 3503 | + if (!hash_max) { 3504 | + hash_max = page_hash_max(page, hash); 3505 | + node->hash_max = hash_max; 3506 | + } 3507 | +} 3508 | + 3509 | +static inline 3510 | +struct stable_node *new_stable_node(struct tree_node *tree_node, 3511 | + struct page *kpage, u32 hash_max) 3512 | +{ 3513 | + struct stable_node *new_stable_node; 3514 | + 3515 | + new_stable_node = alloc_stable_node(); 3516 | + if (!new_stable_node) 3517 | + return NULL; 3518 | + 3519 | + new_stable_node->kpfn = page_to_pfn(kpage); 3520 | + new_stable_node->hash_max = hash_max; 3521 | + new_stable_node->tree_node = tree_node; 3522 | + set_page_stable_node(kpage, new_stable_node); 3523 | + 3524 | + return new_stable_node; 3525 | +} 3526 | + 3527 | +static inline 3528 | +struct stable_node *first_level_insert(struct tree_node *tree_node, 3529 | + struct rmap_item *rmap_item, 3530 | + struct rmap_item *tree_rmap_item, 3531 | + struct page **kpage, u32 hash, 3532 | + int *success1, int *success2) 3533 | +{ 3534 | + int cmp; 3535 | + struct page *tree_page; 3536 | + u32 hash_max = 0; 3537 | + struct stable_node *stable_node, *new_snode; 3538 | + struct rb_node *parent = NULL, **new; 3539 | + 3540 | + /* this tree node contains no sub-tree yet */ 3541 | + stable_node = rb_entry(tree_node->sub_root.rb_node, 3542 | + struct stable_node, node); 3543 | + 3544 | + tree_page = get_uksm_page(stable_node, 1, 0); 3545 | + if (tree_page) { 3546 | + cmp = memcmp_pages(*kpage, tree_page, 1); 3547 | + if (!cmp) { 3548 | + try_merge_with_stable(rmap_item, tree_rmap_item, kpage, 3549 | + tree_page, success1, success2); 3550 | + put_page(tree_page); 3551 | + if (!*success1 && !*success2) 3552 | + goto failed; 3553 | + 3554 | + return stable_node; 3555 | + 3556 | + } else { 3557 | + /* 3558 | + * collision in first level try to create a subtree. 3559 | + * A new node need to be created. 3560 | + */ 3561 | + put_page(tree_page); 3562 | + 3563 | + stable_node_hash_max(stable_node, tree_page, 3564 | + tree_node->hash); 3565 | + hash_max = rmap_item_hash_max(rmap_item, hash); 3566 | + cmp = hash_cmp(hash_max, stable_node->hash_max); 3567 | + 3568 | + parent = &stable_node->node; 3569 | + if (cmp < 0) { 3570 | + new = &parent->rb_left; 3571 | + } else if (cmp > 0) { 3572 | + new = &parent->rb_right; 3573 | + } else { 3574 | + goto failed; 3575 | + } 3576 | + } 3577 | + 3578 | + } else { 3579 | + /* the only stable_node deleted, we reuse its tree_node. 
3580 | + */ 3581 | + parent = NULL; 3582 | + new = &tree_node->sub_root.rb_node; 3583 | + } 3584 | + 3585 | + new_snode = new_stable_node(tree_node, *kpage, hash_max); 3586 | + if (!new_snode) 3587 | + goto failed; 3588 | + 3589 | + rb_link_node(&new_snode->node, parent, new); 3590 | + rb_insert_color(&new_snode->node, &tree_node->sub_root); 3591 | + tree_node->count++; 3592 | + *success1 = *success2 = 1; 3593 | + 3594 | + return new_snode; 3595 | + 3596 | +failed: 3597 | + return NULL; 3598 | +} 3599 | + 3600 | +static inline 3601 | +struct stable_node *stable_subtree_insert(struct tree_node *tree_node, 3602 | + struct rmap_item *rmap_item, 3603 | + struct rmap_item *tree_rmap_item, 3604 | + struct page **kpage, u32 hash, 3605 | + int *success1, int *success2) 3606 | +{ 3607 | + struct page *tree_page; 3608 | + u32 hash_max; 3609 | + struct stable_node *stable_node, *new_snode; 3610 | + struct rb_node *parent, **new; 3611 | + 3612 | +research: 3613 | + parent = NULL; 3614 | + new = &tree_node->sub_root.rb_node; 3615 | + BUG_ON(!*new); 3616 | + hash_max = rmap_item_hash_max(rmap_item, hash); 3617 | + while (*new) { 3618 | + int cmp; 3619 | + 3620 | + stable_node = rb_entry(*new, struct stable_node, node); 3621 | + 3622 | + cmp = hash_cmp(hash_max, stable_node->hash_max); 3623 | + 3624 | + if (cmp < 0) { 3625 | + parent = *new; 3626 | + new = &parent->rb_left; 3627 | + } else if (cmp > 0) { 3628 | + parent = *new; 3629 | + new = &parent->rb_right; 3630 | + } else { 3631 | + tree_page = get_uksm_page(stable_node, 1, 0); 3632 | + if (tree_page) { 3633 | + cmp = memcmp_pages(*kpage, tree_page, 1); 3634 | + if (!cmp) { 3635 | + try_merge_with_stable(rmap_item, 3636 | + tree_rmap_item, kpage, 3637 | + tree_page, success1, success2); 3638 | + 3639 | + put_page(tree_page); 3640 | + if (!*success1 && !*success2) 3641 | + goto failed; 3642 | + /* 3643 | + * successfully merged with a stable 3644 | + * node 3645 | + */ 3646 | + return stable_node; 3647 | + } else { 3648 | + put_page(tree_page); 3649 | + goto failed; 3650 | + } 3651 | + } else { 3652 | + /* 3653 | + * stable node may be deleted, 3654 | + * and subtree maybe 3655 | + * restructed, cannot 3656 | + * continue, research it. 3657 | + */ 3658 | + if (tree_node->count) { 3659 | + goto research; 3660 | + } else { 3661 | + /* reuse the tree node*/ 3662 | + parent = NULL; 3663 | + new = &tree_node->sub_root.rb_node; 3664 | + } 3665 | + } 3666 | + } 3667 | + } 3668 | + 3669 | + new_snode = new_stable_node(tree_node, *kpage, hash_max); 3670 | + if (!new_snode) 3671 | + goto failed; 3672 | + 3673 | + rb_link_node(&new_snode->node, parent, new); 3674 | + rb_insert_color(&new_snode->node, &tree_node->sub_root); 3675 | + tree_node->count++; 3676 | + *success1 = *success2 = 1; 3677 | + 3678 | + return new_snode; 3679 | + 3680 | +failed: 3681 | + return NULL; 3682 | +} 3683 | + 3684 | + 3685 | +/** 3686 | + * stable_tree_insert() - try to insert a merged page in unstable tree to 3687 | + * the stable tree 3688 | + * 3689 | + * @kpage: the page need to be inserted 3690 | + * @hash: the current hash of this page 3691 | + * @rmap_item: the rmap_item being scanned 3692 | + * @tree_rmap_item: the rmap_item found on unstable tree 3693 | + * @success1: return if rmap_item is merged 3694 | + * @success2: return if tree_rmap_item is merged 3695 | + * 3696 | + * @return the stable_node on stable tree if at least one 3697 | + * rmap_item is inserted into stable tree, NULL 3698 | + * otherwise. 
3699 | + */ 3700 | +static struct stable_node * 3701 | +stable_tree_insert(struct page **kpage, u32 hash, 3702 | + struct rmap_item *rmap_item, 3703 | + struct rmap_item *tree_rmap_item, 3704 | + int *success1, int *success2) 3705 | +{ 3706 | + struct rb_node **new = &root_stable_treep->rb_node; 3707 | + struct rb_node *parent = NULL; 3708 | + struct stable_node *stable_node; 3709 | + struct tree_node *tree_node; 3710 | + u32 hash_max = 0; 3711 | + 3712 | + *success1 = *success2 = 0; 3713 | + 3714 | + while (*new) { 3715 | + int cmp; 3716 | + 3717 | + tree_node = rb_entry(*new, struct tree_node, node); 3718 | + 3719 | + cmp = hash_cmp(hash, tree_node->hash); 3720 | + 3721 | + if (cmp < 0) { 3722 | + parent = *new; 3723 | + new = &parent->rb_left; 3724 | + } else if (cmp > 0) { 3725 | + parent = *new; 3726 | + new = &parent->rb_right; 3727 | + } else 3728 | + break; 3729 | + } 3730 | + 3731 | + if (*new) { 3732 | + if (tree_node->count == 1) { 3733 | + stable_node = first_level_insert(tree_node, rmap_item, 3734 | + tree_rmap_item, kpage, 3735 | + hash, success1, success2); 3736 | + } else { 3737 | + stable_node = stable_subtree_insert(tree_node, 3738 | + rmap_item, tree_rmap_item, kpage, 3739 | + hash, success1, success2); 3740 | + } 3741 | + } else { 3742 | + 3743 | + /* no tree node found */ 3744 | + tree_node = alloc_tree_node(stable_tree_node_listp); 3745 | + if (!tree_node) { 3746 | + stable_node = NULL; 3747 | + goto out; 3748 | + } 3749 | + 3750 | + stable_node = new_stable_node(tree_node, *kpage, hash_max); 3751 | + if (!stable_node) { 3752 | + free_tree_node(tree_node); 3753 | + goto out; 3754 | + } 3755 | + 3756 | + tree_node->hash = hash; 3757 | + rb_link_node(&tree_node->node, parent, new); 3758 | + rb_insert_color(&tree_node->node, root_stable_treep); 3759 | + parent = NULL; 3760 | + new = &tree_node->sub_root.rb_node; 3761 | + 3762 | + rb_link_node(&stable_node->node, parent, new); 3763 | + rb_insert_color(&stable_node->node, &tree_node->sub_root); 3764 | + tree_node->count++; 3765 | + *success1 = *success2 = 1; 3766 | + } 3767 | + 3768 | +out: 3769 | + return stable_node; 3770 | +} 3771 | + 3772 | + 3773 | +/** 3774 | + * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem 3775 | + * 3776 | + * @return 0 on success, -EBUSY if unable to lock the mmap_sem, 3777 | + * -EINVAL if the page mapping has been changed. 3778 | + */ 3779 | +static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item) 3780 | +{ 3781 | + int err; 3782 | + 3783 | + err = get_mergeable_page_lock_mmap(tree_rmap_item); 3784 | + 3785 | + if (err == -EINVAL) { 3786 | + /* its page map has been changed, remove it */ 3787 | + remove_rmap_item_from_tree(tree_rmap_item); 3788 | + } 3789 | + 3790 | + /* The page is gotten and mmap_sem is locked now. */ 3791 | + return err; 3792 | +} 3793 | + 3794 | + 3795 | +/** 3796 | + * unstable_tree_search_insert() - search an unstable tree rmap_item with the 3797 | + * same hash value. 
Get its page and trylock the mmap_sem 3798 | + */ 3799 | +static inline 3800 | +struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, 3801 | + u32 hash) 3802 | + 3803 | +{ 3804 | + struct rb_node **new = &root_unstable_tree.rb_node; 3805 | + struct rb_node *parent = NULL; 3806 | + struct tree_node *tree_node; 3807 | + u32 hash_max; 3808 | + struct rmap_item *tree_rmap_item; 3809 | + 3810 | + while (*new) { 3811 | + int cmp; 3812 | + 3813 | + tree_node = rb_entry(*new, struct tree_node, node); 3814 | + 3815 | + cmp = hash_cmp(hash, tree_node->hash); 3816 | + 3817 | + if (cmp < 0) { 3818 | + parent = *new; 3819 | + new = &parent->rb_left; 3820 | + } else if (cmp > 0) { 3821 | + parent = *new; 3822 | + new = &parent->rb_right; 3823 | + } else 3824 | + break; 3825 | + } 3826 | + 3827 | + if (*new) { 3828 | + /* got the tree_node */ 3829 | + if (tree_node->count == 1) { 3830 | + tree_rmap_item = rb_entry(tree_node->sub_root.rb_node, 3831 | + struct rmap_item, node); 3832 | + BUG_ON(!tree_rmap_item); 3833 | + 3834 | + goto get_page_out; 3835 | + } 3836 | + 3837 | + /* well, search the collision subtree */ 3838 | + new = &tree_node->sub_root.rb_node; 3839 | + BUG_ON(!*new); 3840 | + hash_max = rmap_item_hash_max(rmap_item, hash); 3841 | + 3842 | + while (*new) { 3843 | + int cmp; 3844 | + 3845 | + tree_rmap_item = rb_entry(*new, struct rmap_item, 3846 | + node); 3847 | + 3848 | + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); 3849 | + parent = *new; 3850 | + if (cmp < 0) 3851 | + new = &parent->rb_left; 3852 | + else if (cmp > 0) 3853 | + new = &parent->rb_right; 3854 | + else 3855 | + goto get_page_out; 3856 | + } 3857 | + } else { 3858 | + /* alloc a new tree_node */ 3859 | + tree_node = alloc_tree_node(&unstable_tree_node_list); 3860 | + if (!tree_node) 3861 | + return NULL; 3862 | + 3863 | + tree_node->hash = hash; 3864 | + rb_link_node(&tree_node->node, parent, new); 3865 | + rb_insert_color(&tree_node->node, &root_unstable_tree); 3866 | + parent = NULL; 3867 | + new = &tree_node->sub_root.rb_node; 3868 | + } 3869 | + 3870 | + /* did not found even in sub-tree */ 3871 | + rmap_item->tree_node = tree_node; 3872 | + rmap_item->address |= UNSTABLE_FLAG; 3873 | + rmap_item->hash_round = uksm_hash_round; 3874 | + rb_link_node(&rmap_item->node, parent, new); 3875 | + rb_insert_color(&rmap_item->node, &tree_node->sub_root); 3876 | + 3877 | + uksm_pages_unshared++; 3878 | + return NULL; 3879 | + 3880 | +get_page_out: 3881 | + if (tree_rmap_item->page == rmap_item->page) 3882 | + return NULL; 3883 | + 3884 | + if (get_tree_rmap_item_page(tree_rmap_item)) 3885 | + return NULL; 3886 | + 3887 | + return tree_rmap_item; 3888 | +} 3889 | + 3890 | +static void hold_anon_vma(struct rmap_item *rmap_item, 3891 | + struct anon_vma *anon_vma) 3892 | +{ 3893 | + rmap_item->anon_vma = anon_vma; 3894 | + get_anon_vma(anon_vma); 3895 | +} 3896 | + 3897 | + 3898 | +/** 3899 | + * stable_tree_append() - append a rmap_item to a stable node. Deduplication 3900 | + * ratio statistics is done in this function. 
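Both the stable tree (stable_tree_search/stable_tree_insert above) and the unstable tree (unstable_tree_search_insert) use the same two-level scheme: a first-level rb-tree keyed by the cheap partial-strength 32-bit hash, and a per-node sub-tree keyed by the full-strength hash_max, which rmap_item_hash_max() only computes lazily when a first-level collision forces it. The following standalone sketch is illustration only, not patch code; it uses simplified types, a toy hash, and plain sorted arrays in place of the rb-trees.

#include <stdint.h>

#define STRENGTH_PARTIAL  4   /* assumed: hash only part of the page */
#define STRENGTH_FULL    16   /* assumed: hash the whole page        */

/* Stand-in for page_hash()/page_hash_max(): a trivial hash over the
 * first 'strength' bytes. The real patch uses its own adaptive hash. */
static uint32_t toy_hash(const uint8_t *page, int strength)
{
	uint32_t h = 0;
	for (int i = 0; i < strength; i++)
		h = h * 31 + page[i];
	return h;
}

struct toy_stable_node {
	uint32_t hash_max;              /* second-level key, computed lazily */
	const uint8_t *kpage;
};

struct toy_tree_node {
	uint32_t hash;                  /* first-level key (partial strength) */
	int count;                      /* stable nodes under this node       */
	struct toy_stable_node sub[8];  /* stand-in for the sub rb-tree       */
};

/* Mirrors the lookup order of stable_tree_search(): match on the cheap
 * hash first; only if the first-level node holds more than one entry do
 * we pay for the full-strength hash and descend into the sub-tree. */
const uint8_t *toy_lookup(struct toy_tree_node *nodes, int n,
			  const uint8_t *page)
{
	uint32_t hash = toy_hash(page, STRENGTH_PARTIAL);

	for (int i = 0; i < n; i++) {
		if (nodes[i].hash != hash)
			continue;
		if (nodes[i].count == 1)
			return nodes[i].sub[0].kpage;

		uint32_t hash_max = toy_hash(page, STRENGTH_FULL);
		for (int j = 0; j < nodes[i].count; j++)
			if (nodes[i].sub[j].hash_max == hash_max)
				return nodes[i].sub[j].kpage;
		return NULL;    /* collision at both levels */
	}
	return NULL;
}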
3901 | + * 3902 | + */ 3903 | +static void stable_tree_append(struct rmap_item *rmap_item, 3904 | + struct stable_node *stable_node, int logdedup) 3905 | +{ 3906 | + struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL; 3907 | + unsigned long key = (unsigned long)rmap_item->slot; 3908 | + unsigned long factor = rmap_item->slot->rung->step; 3909 | + 3910 | + BUG_ON(!stable_node); 3911 | + rmap_item->address |= STABLE_FLAG; 3912 | + 3913 | + if (hlist_empty(&stable_node->hlist)) { 3914 | + uksm_pages_shared++; 3915 | + goto node_vma_new; 3916 | + } else { 3917 | + uksm_pages_sharing++; 3918 | + } 3919 | + 3920 | + hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { 3921 | + if (node_vma->key >= key) 3922 | + break; 3923 | + 3924 | + if (logdedup) { 3925 | + node_vma->slot->pages_bemerged += factor; 3926 | + if (list_empty(&node_vma->slot->dedup_list)) 3927 | + list_add(&node_vma->slot->dedup_list, 3928 | + &vma_slot_dedup); 3929 | + } 3930 | + } 3931 | + 3932 | + if (node_vma) { 3933 | + if (node_vma->key == key) { 3934 | + node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist); 3935 | + goto node_vma_ok; 3936 | + } else if (node_vma->key > key) { 3937 | + node_vma_cont = node_vma; 3938 | + } 3939 | + } 3940 | + 3941 | +node_vma_new: 3942 | + /* no same vma already in node, alloc a new node_vma */ 3943 | + new_node_vma = alloc_node_vma(); 3944 | + BUG_ON(!new_node_vma); 3945 | + new_node_vma->head = stable_node; 3946 | + new_node_vma->slot = rmap_item->slot; 3947 | + 3948 | + if (!node_vma) { 3949 | + hlist_add_head(&new_node_vma->hlist, &stable_node->hlist); 3950 | + } else if (node_vma->key != key) { 3951 | + if (node_vma->key < key) 3952 | + hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist); 3953 | + else { 3954 | + hlist_add_before(&new_node_vma->hlist, 3955 | + &node_vma->hlist); 3956 | + } 3957 | + 3958 | + } 3959 | + node_vma = new_node_vma; 3960 | + 3961 | +node_vma_ok: /* ok, ready to add to the list */ 3962 | + rmap_item->head = node_vma; 3963 | + hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist); 3964 | + hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma); 3965 | + if (logdedup) { 3966 | + rmap_item->slot->pages_merged++; 3967 | + if (node_vma_cont) { 3968 | + node_vma = node_vma_cont; 3969 | + hlist_for_each_entry_continue(node_vma, hlist) { 3970 | + node_vma->slot->pages_bemerged += factor; 3971 | + if (list_empty(&node_vma->slot->dedup_list)) 3972 | + list_add(&node_vma->slot->dedup_list, 3973 | + &vma_slot_dedup); 3974 | + } 3975 | + } 3976 | + } 3977 | +} 3978 | + 3979 | +/* 3980 | + * We use break_ksm to break COW on a ksm page: it's a stripped down 3981 | + * 3982 | + * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1) 3983 | + * put_page(page); 3984 | + * 3985 | + * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, 3986 | + * in case the application has unmapped and remapped mm,addr meanwhile. 3987 | + * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP 3988 | + * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. 
3989 | + */ 3990 | +static int break_ksm(struct vm_area_struct *vma, unsigned long addr) 3991 | +{ 3992 | + struct page *page; 3993 | + int ret = 0; 3994 | + 3995 | + do { 3996 | + cond_resched(); 3997 | + page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); 3998 | + if (IS_ERR_OR_NULL(page)) 3999 | + break; 4000 | + if (PageKsm(page)) { 4001 | + ret = handle_mm_fault(vma->vm_mm, vma, addr, 4002 | + FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE); 4003 | + } else 4004 | + ret = VM_FAULT_WRITE; 4005 | + put_page(page); 4006 | + } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); 4007 | + /* 4008 | + * We must loop because handle_mm_fault() may back out if there's 4009 | + * any difficulty e.g. if pte accessed bit gets updated concurrently. 4010 | + * 4011 | + * VM_FAULT_WRITE is what we have been hoping for: it indicates that 4012 | + * COW has been broken, even if the vma does not permit VM_WRITE; 4013 | + * but note that a concurrent fault might break PageKsm for us. 4014 | + * 4015 | + * VM_FAULT_SIGBUS could occur if we race with truncation of the 4016 | + * backing file, which also invalidates anonymous pages: that's 4017 | + * okay, that truncation will have unmapped the PageKsm for us. 4018 | + * 4019 | + * VM_FAULT_OOM: at the time of writing (late July 2009), setting 4020 | + * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the 4021 | + * current task has TIF_MEMDIE set, and will be OOM killed on return 4022 | + * to user; and ksmd, having no mm, would never be chosen for that. 4023 | + * 4024 | + * But if the mm is in a limited mem_cgroup, then the fault may fail 4025 | + * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and 4026 | + * even ksmd can fail in this way - though it's usually breaking ksm 4027 | + * just to undo a merge it made a moment before, so unlikely to oom. 4028 | + * 4029 | + * That's a pity: we might therefore have more kernel pages allocated 4030 | + * than we're counting as nodes in the stable tree; but uksm_do_scan 4031 | + * will retry to break_cow on each pass, so should recover the page 4032 | + * in due course. The important thing is to not let VM_MERGEABLE 4033 | + * be cleared while any such pages might remain in the area. 4034 | + */ 4035 | + return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; 4036 | +} 4037 | + 4038 | +static void break_cow(struct rmap_item *rmap_item) 4039 | +{ 4040 | + struct vm_area_struct *vma = rmap_item->slot->vma; 4041 | + struct mm_struct *mm = vma->vm_mm; 4042 | + unsigned long addr = get_rmap_addr(rmap_item); 4043 | + 4044 | + if (uksm_test_exit(mm)) 4045 | + goto out; 4046 | + 4047 | + break_ksm(vma, addr); 4048 | +out: 4049 | + return; 4050 | +} 4051 | + 4052 | +/* 4053 | + * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather 4054 | + * than check every pte of a given vma, the locking doesn't quite work for 4055 | + * that - an rmap_item is assigned to the stable tree after inserting ksm 4056 | + * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing 4057 | + * rmap_items from parent to child at fork time (so as not to waste time 4058 | + * if exit comes before the next scan reaches it). 4059 | + * 4060 | + * Similarly, although we'd like to remove rmap_items (so updating counts 4061 | + * and freeing memory) when unmerging an area, it's easier to leave that 4062 | + * to the next pass of ksmd - consider, for example, how ksmd might be 4063 | + * in cmp_and_merge_page on one of the rmap_items we would be removing. 
4064 | + */ 4065 | +inline int unmerge_uksm_pages(struct vm_area_struct *vma, 4066 | + unsigned long start, unsigned long end) 4067 | +{ 4068 | + unsigned long addr; 4069 | + int err = 0; 4070 | + 4071 | + for (addr = start; addr < end && !err; addr += PAGE_SIZE) { 4072 | + if (uksm_test_exit(vma->vm_mm)) 4073 | + break; 4074 | + if (signal_pending(current)) 4075 | + err = -ERESTARTSYS; 4076 | + else 4077 | + err = break_ksm(vma, addr); 4078 | + } 4079 | + return err; 4080 | +} 4081 | + 4082 | +static inline void inc_uksm_pages_scanned(void) 4083 | +{ 4084 | + u64 delta; 4085 | + 4086 | + 4087 | + if (uksm_pages_scanned == U64_MAX) { 4088 | + encode_benefit(); 4089 | + 4090 | + delta = uksm_pages_scanned >> pages_scanned_base; 4091 | + 4092 | + if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) { 4093 | + pages_scanned_stored >>= 1; 4094 | + delta >>= 1; 4095 | + pages_scanned_base++; 4096 | + } 4097 | + 4098 | + pages_scanned_stored += delta; 4099 | + 4100 | + uksm_pages_scanned = uksm_pages_scanned_last = 0; 4101 | + } 4102 | + 4103 | + uksm_pages_scanned++; 4104 | +} 4105 | + 4106 | +static inline int find_zero_page_hash(int strength, u32 hash) 4107 | +{ 4108 | + return (zero_hash_table[strength] == hash); 4109 | +} 4110 | + 4111 | +static 4112 | +int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page) 4113 | +{ 4114 | + struct page *zero_page = empty_uksm_zero_page; 4115 | + struct mm_struct *mm = vma->vm_mm; 4116 | + pte_t orig_pte = __pte(0); 4117 | + int err = -EFAULT; 4118 | + 4119 | + if (uksm_test_exit(mm)) 4120 | + goto out; 4121 | + 4122 | + if (!trylock_page(page)) 4123 | + goto out; 4124 | + 4125 | + if (!PageAnon(page)) 4126 | + goto out_unlock; 4127 | + 4128 | + if (PageTransCompound(page)) { 4129 | + err = split_huge_page(page); 4130 | + if (err) 4131 | + goto out_unlock; 4132 | + } 4133 | + 4134 | + if (write_protect_page(vma, page, &orig_pte, 0) == 0) { 4135 | + if (is_page_full_zero(page)) 4136 | + err = replace_page(vma, page, zero_page, orig_pte); 4137 | + } 4138 | + 4139 | +out_unlock: 4140 | + unlock_page(page); 4141 | +out: 4142 | + return err; 4143 | +} 4144 | + 4145 | +/* 4146 | + * cmp_and_merge_page() - first see if page can be merged into the stable 4147 | + * tree; if not, compare hash to previous and if it's the same, see if page 4148 | + * can be inserted into the unstable tree, or merged with a page already there 4149 | + * and both transferred to the stable tree. 4150 | + * 4151 | + * @page: the page that we are searching identical page to. 4152 | + * @rmap_item: the reverse mapping into the virtual address of this page 4153 | + */ 4154 | +static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash) 4155 | +{ 4156 | + struct rmap_item *tree_rmap_item; 4157 | + struct page *page; 4158 | + struct page *kpage = NULL; 4159 | + u32 hash_max; 4160 | + int err; 4161 | + unsigned int success1, success2; 4162 | + struct stable_node *snode; 4163 | + int cmp; 4164 | + struct rb_node *parent = NULL, **new; 4165 | + 4166 | + remove_rmap_item_from_tree(rmap_item); 4167 | + page = rmap_item->page; 4168 | + 4169 | + /* We first start with searching the page inside the stable tree */ 4170 | + kpage = stable_tree_search(rmap_item, hash); 4171 | + if (kpage) { 4172 | + err = try_to_merge_with_uksm_page(rmap_item, kpage, 4173 | + hash); 4174 | + if (!err) { 4175 | + /* 4176 | + * The page was successfully merged, add 4177 | + * its rmap_item to the stable tree. 
4178 | + * page lock is needed because it's 4179 | + * racing with try_to_unmap_ksm(), etc. 4180 | + */ 4181 | + lock_page(kpage); 4182 | + snode = page_stable_node(kpage); 4183 | + stable_tree_append(rmap_item, snode, 1); 4184 | + unlock_page(kpage); 4185 | + put_page(kpage); 4186 | + return; /* success */ 4187 | + } 4188 | + put_page(kpage); 4189 | + 4190 | + /* 4191 | + * if it's a collision and it has been search in sub-rbtree 4192 | + * (hash_max != 0), we want to abort, because if it is 4193 | + * successfully merged in unstable tree, the collision trends to 4194 | + * happen again. 4195 | + */ 4196 | + if (err == MERGE_ERR_COLLI && rmap_item->hash_max) 4197 | + return; 4198 | + } 4199 | + 4200 | + tree_rmap_item = 4201 | + unstable_tree_search_insert(rmap_item, hash); 4202 | + if (tree_rmap_item) { 4203 | + err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash); 4204 | + /* 4205 | + * As soon as we merge this page, we want to remove the 4206 | + * rmap_item of the page we have merged with from the unstable 4207 | + * tree, and insert it instead as new node in the stable tree. 4208 | + */ 4209 | + if (!err) { 4210 | + kpage = page; 4211 | + remove_rmap_item_from_tree(tree_rmap_item); 4212 | + lock_page(kpage); 4213 | + snode = stable_tree_insert(&kpage, hash, 4214 | + rmap_item, tree_rmap_item, 4215 | + &success1, &success2); 4216 | + 4217 | + /* 4218 | + * Do not log dedup for tree item, it's not counted as 4219 | + * scanned in this round. 4220 | + */ 4221 | + if (success2) 4222 | + stable_tree_append(tree_rmap_item, snode, 0); 4223 | + 4224 | + /* 4225 | + * The order of these two stable append is important: 4226 | + * we are scanning rmap_item. 4227 | + */ 4228 | + if (success1) 4229 | + stable_tree_append(rmap_item, snode, 1); 4230 | + 4231 | + /* 4232 | + * The original kpage may be unlocked inside 4233 | + * stable_tree_insert() already. This page 4234 | + * should be unlocked before doing 4235 | + * break_cow(). 4236 | + */ 4237 | + unlock_page(kpage); 4238 | + 4239 | + if (!success1) 4240 | + break_cow(rmap_item); 4241 | + 4242 | + if (!success2) 4243 | + break_cow(tree_rmap_item); 4244 | + 4245 | + } else if (err == MERGE_ERR_COLLI) { 4246 | + BUG_ON(tree_rmap_item->tree_node->count > 1); 4247 | + 4248 | + rmap_item_hash_max(tree_rmap_item, 4249 | + tree_rmap_item->tree_node->hash); 4250 | + 4251 | + hash_max = rmap_item_hash_max(rmap_item, hash); 4252 | + cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); 4253 | + parent = &tree_rmap_item->node; 4254 | + if (cmp < 0) 4255 | + new = &parent->rb_left; 4256 | + else if (cmp > 0) 4257 | + new = &parent->rb_right; 4258 | + else 4259 | + goto put_up_out; 4260 | + 4261 | + rmap_item->tree_node = tree_rmap_item->tree_node; 4262 | + rmap_item->address |= UNSTABLE_FLAG; 4263 | + rmap_item->hash_round = uksm_hash_round; 4264 | + rb_link_node(&rmap_item->node, parent, new); 4265 | + rb_insert_color(&rmap_item->node, 4266 | + &tree_rmap_item->tree_node->sub_root); 4267 | + rmap_item->tree_node->count++; 4268 | + } else { 4269 | + /* 4270 | + * either one of the page has changed or they collide 4271 | + * at the max hash, we consider them as ill items. 
4272 | + */ 4273 | + remove_rmap_item_from_tree(tree_rmap_item); 4274 | + } 4275 | +put_up_out: 4276 | + put_page(tree_rmap_item->page); 4277 | + up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem); 4278 | + } 4279 | +} 4280 | + 4281 | + 4282 | + 4283 | + 4284 | +static inline unsigned long get_pool_index(struct vma_slot *slot, 4285 | + unsigned long index) 4286 | +{ 4287 | + unsigned long pool_index; 4288 | + 4289 | + pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT; 4290 | + if (pool_index >= slot->pool_size) 4291 | + BUG(); 4292 | + return pool_index; 4293 | +} 4294 | + 4295 | +static inline unsigned long index_page_offset(unsigned long index) 4296 | +{ 4297 | + return offset_in_page(sizeof(struct rmap_list_entry *) * index); 4298 | +} 4299 | + 4300 | +static inline 4301 | +struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot, 4302 | + unsigned long index, int need_alloc) 4303 | +{ 4304 | + unsigned long pool_index; 4305 | + struct page *page; 4306 | + void *addr; 4307 | + 4308 | + 4309 | + pool_index = get_pool_index(slot, index); 4310 | + if (!slot->rmap_list_pool[pool_index]) { 4311 | + if (!need_alloc) 4312 | + return NULL; 4313 | + 4314 | + page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); 4315 | + if (!page) 4316 | + return NULL; 4317 | + 4318 | + slot->rmap_list_pool[pool_index] = page; 4319 | + } 4320 | + 4321 | + addr = kmap(slot->rmap_list_pool[pool_index]); 4322 | + addr += index_page_offset(index); 4323 | + 4324 | + return addr; 4325 | +} 4326 | + 4327 | +static inline void put_rmap_list_entry(struct vma_slot *slot, 4328 | + unsigned long index) 4329 | +{ 4330 | + unsigned long pool_index; 4331 | + 4332 | + pool_index = get_pool_index(slot, index); 4333 | + BUG_ON(!slot->rmap_list_pool[pool_index]); 4334 | + kunmap(slot->rmap_list_pool[pool_index]); 4335 | +} 4336 | + 4337 | +static inline int entry_is_new(struct rmap_list_entry *entry) 4338 | +{ 4339 | + return !entry->item; 4340 | +} 4341 | + 4342 | +static inline unsigned long get_index_orig_addr(struct vma_slot *slot, 4343 | + unsigned long index) 4344 | +{ 4345 | + return slot->vma->vm_start + (index << PAGE_SHIFT); 4346 | +} 4347 | + 4348 | +static inline unsigned long get_entry_address(struct rmap_list_entry *entry) 4349 | +{ 4350 | + unsigned long addr; 4351 | + 4352 | + if (is_addr(entry->addr)) 4353 | + addr = get_clean_addr(entry->addr); 4354 | + else if (entry->item) 4355 | + addr = get_rmap_addr(entry->item); 4356 | + else 4357 | + BUG(); 4358 | + 4359 | + return addr; 4360 | +} 4361 | + 4362 | +static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry) 4363 | +{ 4364 | + if (is_addr(entry->addr)) 4365 | + return NULL; 4366 | + 4367 | + return entry->item; 4368 | +} 4369 | + 4370 | +static inline void inc_rmap_list_pool_count(struct vma_slot *slot, 4371 | + unsigned long index) 4372 | +{ 4373 | + unsigned long pool_index; 4374 | + 4375 | + pool_index = get_pool_index(slot, index); 4376 | + BUG_ON(!slot->rmap_list_pool[pool_index]); 4377 | + slot->pool_counts[pool_index]++; 4378 | +} 4379 | + 4380 | +static inline void dec_rmap_list_pool_count(struct vma_slot *slot, 4381 | + unsigned long index) 4382 | +{ 4383 | + unsigned long pool_index; 4384 | + 4385 | + pool_index = get_pool_index(slot, index); 4386 | + BUG_ON(!slot->rmap_list_pool[pool_index]); 4387 | + BUG_ON(!slot->pool_counts[pool_index]); 4388 | + slot->pool_counts[pool_index]--; 4389 | +} 4390 | + 4391 | +static inline int entry_has_rmap(struct rmap_list_entry *entry) 4392 | +{ 4393 | + 
return !is_addr(entry->addr) && entry->item; 4394 | +} 4395 | + 4396 | +static inline void swap_entries(struct rmap_list_entry *entry1, 4397 | + unsigned long index1, 4398 | + struct rmap_list_entry *entry2, 4399 | + unsigned long index2) 4400 | +{ 4401 | + struct rmap_list_entry tmp; 4402 | + 4403 | + /* swapping two new entries is meaningless */ 4404 | + BUG_ON(entry_is_new(entry1) && entry_is_new(entry2)); 4405 | + 4406 | + tmp = *entry1; 4407 | + *entry1 = *entry2; 4408 | + *entry2 = tmp; 4409 | + 4410 | + if (entry_has_rmap(entry1)) 4411 | + entry1->item->entry_index = index1; 4412 | + 4413 | + if (entry_has_rmap(entry2)) 4414 | + entry2->item->entry_index = index2; 4415 | + 4416 | + if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) { 4417 | + inc_rmap_list_pool_count(entry1->item->slot, index1); 4418 | + dec_rmap_list_pool_count(entry1->item->slot, index2); 4419 | + } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) { 4420 | + inc_rmap_list_pool_count(entry2->item->slot, index2); 4421 | + dec_rmap_list_pool_count(entry2->item->slot, index1); 4422 | + } 4423 | +} 4424 | + 4425 | +static inline void free_entry_item(struct rmap_list_entry *entry) 4426 | +{ 4427 | + unsigned long index; 4428 | + struct rmap_item *item; 4429 | + 4430 | + if (!is_addr(entry->addr)) { 4431 | + BUG_ON(!entry->item); 4432 | + item = entry->item; 4433 | + entry->addr = get_rmap_addr(item); 4434 | + set_is_addr(entry->addr); 4435 | + index = item->entry_index; 4436 | + remove_rmap_item_from_tree(item); 4437 | + dec_rmap_list_pool_count(item->slot, index); 4438 | + free_rmap_item(item); 4439 | + } 4440 | +} 4441 | + 4442 | +static inline int pool_entry_boundary(unsigned long index) 4443 | +{ 4444 | + unsigned long linear_addr; 4445 | + 4446 | + linear_addr = sizeof(struct rmap_list_entry *) * index; 4447 | + return index && !offset_in_page(linear_addr); 4448 | +} 4449 | + 4450 | +static inline void try_free_last_pool(struct vma_slot *slot, 4451 | + unsigned long index) 4452 | +{ 4453 | + unsigned long pool_index; 4454 | + 4455 | + pool_index = get_pool_index(slot, index); 4456 | + if (slot->rmap_list_pool[pool_index] && 4457 | + !slot->pool_counts[pool_index]) { 4458 | + __free_page(slot->rmap_list_pool[pool_index]); 4459 | + slot->rmap_list_pool[pool_index] = NULL; 4460 | + slot->flags |= UKSM_SLOT_NEED_SORT; 4461 | + } 4462 | + 4463 | +} 4464 | + 4465 | +static inline unsigned long vma_item_index(struct vm_area_struct *vma, 4466 | + struct rmap_item *item) 4467 | +{ 4468 | + return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT; 4469 | +} 4470 | + 4471 | +static int within_same_pool(struct vma_slot *slot, 4472 | + unsigned long i, unsigned long j) 4473 | +{ 4474 | + unsigned long pool_i, pool_j; 4475 | + 4476 | + pool_i = get_pool_index(slot, i); 4477 | + pool_j = get_pool_index(slot, j); 4478 | + 4479 | + return (pool_i == pool_j); 4480 | +} 4481 | + 4482 | +static void sort_rmap_entry_list(struct vma_slot *slot) 4483 | +{ 4484 | + unsigned long i, j; 4485 | + struct rmap_list_entry *entry, *swap_entry; 4486 | + 4487 | + entry = get_rmap_list_entry(slot, 0, 0); 4488 | + for (i = 0; i < slot->pages; ) { 4489 | + 4490 | + if (!entry) 4491 | + goto skip_whole_pool; 4492 | + 4493 | + if (entry_is_new(entry)) 4494 | + goto next_entry; 4495 | + 4496 | + if (is_addr(entry->addr)) { 4497 | + entry->addr = 0; 4498 | + goto next_entry; 4499 | + } 4500 | + 4501 | + j = vma_item_index(slot->vma, entry->item); 4502 | + if (j == i) 4503 | + goto next_entry; 4504 | + 4505 | + if (within_same_pool(slot, 
i, j)) 4506 | + swap_entry = entry + j - i; 4507 | + else 4508 | + swap_entry = get_rmap_list_entry(slot, j, 1); 4509 | + 4510 | + swap_entries(entry, i, swap_entry, j); 4511 | + if (!within_same_pool(slot, i, j)) 4512 | + put_rmap_list_entry(slot, j); 4513 | + continue; 4514 | + 4515 | +skip_whole_pool: 4516 | + i += PAGE_SIZE / sizeof(*entry); 4517 | + if (i < slot->pages) 4518 | + entry = get_rmap_list_entry(slot, i, 0); 4519 | + continue; 4520 | + 4521 | +next_entry: 4522 | + if (i >= slot->pages - 1 || 4523 | + !within_same_pool(slot, i, i + 1)) { 4524 | + put_rmap_list_entry(slot, i); 4525 | + if (i + 1 < slot->pages) 4526 | + entry = get_rmap_list_entry(slot, i + 1, 0); 4527 | + } else 4528 | + entry++; 4529 | + i++; 4530 | + continue; 4531 | + } 4532 | + 4533 | + /* free empty pool entries which contain no rmap_item */ 4534 | + /* CAN be simplied to based on only pool_counts when bug freed !!!!! */ 4535 | + for (i = 0; i < slot->pool_size; i++) { 4536 | + unsigned char has_rmap; 4537 | + void *addr; 4538 | + 4539 | + if (!slot->rmap_list_pool[i]) 4540 | + continue; 4541 | + 4542 | + has_rmap = 0; 4543 | + addr = kmap(slot->rmap_list_pool[i]); 4544 | + BUG_ON(!addr); 4545 | + for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { 4546 | + entry = (struct rmap_list_entry *)addr + j; 4547 | + if (is_addr(entry->addr)) 4548 | + continue; 4549 | + if (!entry->item) 4550 | + continue; 4551 | + has_rmap = 1; 4552 | + } 4553 | + kunmap(slot->rmap_list_pool[i]); 4554 | + if (!has_rmap) { 4555 | + BUG_ON(slot->pool_counts[i]); 4556 | + __free_page(slot->rmap_list_pool[i]); 4557 | + slot->rmap_list_pool[i] = NULL; 4558 | + } 4559 | + } 4560 | + 4561 | + slot->flags &= ~UKSM_SLOT_NEED_SORT; 4562 | +} 4563 | + 4564 | +/* 4565 | + * vma_fully_scanned() - if all the pages in this slot have been scanned. 4566 | + */ 4567 | +static inline int vma_fully_scanned(struct vma_slot *slot) 4568 | +{ 4569 | + return slot->pages_scanned == slot->pages; 4570 | +} 4571 | + 4572 | +/** 4573 | + * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to 4574 | + * its random permutation. This function is embedded with the random 4575 | + * permutation index management code. 
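get_next_rmap_item() below walks a slot's pages as an incremental random permutation: on each call it takes the next fixed scan index, swaps a uniformly chosen later entry into that position, and scans it, so a full round visits every page exactly once in a fresh random order. The sketch below is illustration only, not patch code; it shows the same partial Fisher-Yates step on a plain index array, with rand() standing in for prandom_u32().

#include <stdlib.h>

unsigned long next_scan_index(unsigned long *perm, unsigned long pages,
			      unsigned long pages_scanned)
{
	unsigned long scan = pages_scanned % pages;
	unsigned long range = pages - scan;            /* rand_range in the patch */
	unsigned long swap = scan + (rand() % range);  /* prandom_u32() stand-in  */

	unsigned long tmp = perm[scan];                /* swap_entries()          */
	perm[scan] = perm[swap];
	perm[swap] = tmp;

	return perm[scan];            /* the index to scan on this call */
}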
4576 | + */ 4577 | +static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash) 4578 | +{ 4579 | + unsigned long rand_range, addr, swap_index, scan_index; 4580 | + struct rmap_item *item = NULL; 4581 | + struct rmap_list_entry *scan_entry, *swap_entry = NULL; 4582 | + struct page *page; 4583 | + 4584 | + scan_index = swap_index = slot->pages_scanned % slot->pages; 4585 | + 4586 | + if (pool_entry_boundary(scan_index)) 4587 | + try_free_last_pool(slot, scan_index - 1); 4588 | + 4589 | + if (vma_fully_scanned(slot)) { 4590 | + if (slot->flags & UKSM_SLOT_NEED_SORT) 4591 | + slot->flags |= UKSM_SLOT_NEED_RERAND; 4592 | + else 4593 | + slot->flags &= ~UKSM_SLOT_NEED_RERAND; 4594 | + if (slot->flags & UKSM_SLOT_NEED_SORT) 4595 | + sort_rmap_entry_list(slot); 4596 | + } 4597 | + 4598 | + scan_entry = get_rmap_list_entry(slot, scan_index, 1); 4599 | + if (!scan_entry) 4600 | + return NULL; 4601 | + 4602 | + if (entry_is_new(scan_entry)) { 4603 | + scan_entry->addr = get_index_orig_addr(slot, scan_index); 4604 | + set_is_addr(scan_entry->addr); 4605 | + } 4606 | + 4607 | + if (slot->flags & UKSM_SLOT_NEED_RERAND) { 4608 | + rand_range = slot->pages - scan_index; 4609 | + BUG_ON(!rand_range); 4610 | + swap_index = scan_index + (prandom_u32() % rand_range); 4611 | + } 4612 | + 4613 | + if (swap_index != scan_index) { 4614 | + swap_entry = get_rmap_list_entry(slot, swap_index, 1); 4615 | + if (entry_is_new(swap_entry)) { 4616 | + swap_entry->addr = get_index_orig_addr(slot, 4617 | + swap_index); 4618 | + set_is_addr(swap_entry->addr); 4619 | + } 4620 | + swap_entries(scan_entry, scan_index, swap_entry, swap_index); 4621 | + } 4622 | + 4623 | + addr = get_entry_address(scan_entry); 4624 | + item = get_entry_item(scan_entry); 4625 | + BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start); 4626 | + 4627 | + page = follow_page(slot->vma, addr, FOLL_GET); 4628 | + if (IS_ERR_OR_NULL(page)) 4629 | + goto nopage; 4630 | + 4631 | + if (!PageAnon(page)) 4632 | + goto putpage; 4633 | + 4634 | + /*check is zero_page pfn or uksm_zero_page*/ 4635 | + if ((page_to_pfn(page) == zero_pfn) 4636 | + || (page_to_pfn(page) == uksm_zero_pfn)) 4637 | + goto putpage; 4638 | + 4639 | + flush_anon_page(slot->vma, page, addr); 4640 | + flush_dcache_page(page); 4641 | + 4642 | + 4643 | + *hash = page_hash(page, hash_strength, 1); 4644 | + inc_uksm_pages_scanned(); 4645 | + /*if the page content all zero, re-map to zero-page*/ 4646 | + if (find_zero_page_hash(hash_strength, *hash)) { 4647 | + if (!cmp_and_merge_zero_page(slot->vma, page)) { 4648 | + slot->pages_merged++; 4649 | + inc_zone_page_state(page, NR_UKSM_ZERO_PAGES); 4650 | + 4651 | + /* For full-zero pages, no need to create rmap item */ 4652 | + goto putpage; 4653 | + } else { 4654 | + inc_rshash_neg(memcmp_cost / 2); 4655 | + } 4656 | + } 4657 | + 4658 | + if (!item) { 4659 | + item = alloc_rmap_item(); 4660 | + if (item) { 4661 | + /* It has already been zeroed */ 4662 | + item->slot = slot; 4663 | + item->address = addr; 4664 | + item->entry_index = scan_index; 4665 | + scan_entry->item = item; 4666 | + inc_rmap_list_pool_count(slot, scan_index); 4667 | + } else 4668 | + goto putpage; 4669 | + } 4670 | + 4671 | + BUG_ON(item->slot != slot); 4672 | + /* the page may have changed */ 4673 | + item->page = page; 4674 | + put_rmap_list_entry(slot, scan_index); 4675 | + if (swap_entry) 4676 | + put_rmap_list_entry(slot, swap_index); 4677 | + return item; 4678 | + 4679 | +putpage: 4680 | + put_page(page); 4681 | + page = NULL; 4682 | +nopage: 
4683 | + /* no page, store addr back and free rmap_item if possible */ 4684 | + free_entry_item(scan_entry); 4685 | + put_rmap_list_entry(slot, scan_index); 4686 | + if (swap_entry) 4687 | + put_rmap_list_entry(slot, swap_index); 4688 | + return NULL; 4689 | +} 4690 | + 4691 | +static inline int in_stable_tree(struct rmap_item *rmap_item) 4692 | +{ 4693 | + return rmap_item->address & STABLE_FLAG; 4694 | +} 4695 | + 4696 | +/** 4697 | + * scan_vma_one_page() - scan the next page in a vma_slot. Called with 4698 | + * mmap_sem locked. 4699 | + */ 4700 | +static noinline void scan_vma_one_page(struct vma_slot *slot) 4701 | +{ 4702 | + u32 hash; 4703 | + struct mm_struct *mm; 4704 | + struct rmap_item *rmap_item = NULL; 4705 | + struct vm_area_struct *vma = slot->vma; 4706 | + 4707 | + mm = vma->vm_mm; 4708 | + BUG_ON(!mm); 4709 | + BUG_ON(!slot); 4710 | + 4711 | + rmap_item = get_next_rmap_item(slot, &hash); 4712 | + if (!rmap_item) 4713 | + goto out1; 4714 | + 4715 | + if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item)) 4716 | + goto out2; 4717 | + 4718 | + cmp_and_merge_page(rmap_item, hash); 4719 | +out2: 4720 | + put_page(rmap_item->page); 4721 | +out1: 4722 | + slot->pages_scanned++; 4723 | + slot->this_sampled++; 4724 | + if (slot->fully_scanned_round != fully_scanned_round) 4725 | + scanned_virtual_pages++; 4726 | + 4727 | + if (vma_fully_scanned(slot)) 4728 | + slot->fully_scanned_round = fully_scanned_round; 4729 | +} 4730 | + 4731 | +static inline unsigned long rung_get_pages(struct scan_rung *rung) 4732 | +{ 4733 | + struct slot_tree_node *node; 4734 | + 4735 | + if (!rung->vma_root.rnode) 4736 | + return 0; 4737 | + 4738 | + node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode); 4739 | + 4740 | + return node->size; 4741 | +} 4742 | + 4743 | +#define RUNG_SAMPLED_MIN 3 4744 | + 4745 | +static inline 4746 | +void uksm_calc_rung_step(struct scan_rung *rung, 4747 | + unsigned long page_time, unsigned long ratio) 4748 | +{ 4749 | + unsigned long sampled, pages; 4750 | + 4751 | + /* will be fully scanned ? */ 4752 | + if (!rung->cover_msecs) { 4753 | + rung->step = 1; 4754 | + return; 4755 | + } 4756 | + 4757 | + sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE) 4758 | + * ratio / page_time; 4759 | + 4760 | + /* 4761 | + * Before we finsish a scan round and expensive per-round jobs, 4762 | + * we need to have a chance to estimate the per page time. So 4763 | + * the sampled number can not be too small. 
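The arithmetic uksm_calc_rung_step() performs can be read in isolation: the rung's CPU budget (cover_msecs and cpu_ratio) and the measured per-page time give the number of pages that can be sampled this round, and the step is the slot-page count divided by that sample size, floored so the per-page time estimate stays meaningful. A minimal sketch follows; it is illustration only, and TIME_RATIO_SCALE here is an assumed example value rather than the patch's definition.

#define NSEC_PER_MSEC    1000000UL
#define TIME_RATIO_SCALE 100UL      /* assumed example scale for 'ratio' */
#define RUNG_SAMPLED_MIN 3UL        /* matches the patch                 */

unsigned long calc_step(unsigned long pages, unsigned long cover_msecs,
			unsigned long ratio, unsigned long page_time_ns)
{
	if (!cover_msecs)               /* rung wants a full scan */
		return 1;

	/* pages we can afford to sample within cover_msecs */
	unsigned long sampled = cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE)
				* ratio / page_time_ns;
	if (sampled < RUNG_SAMPLED_MIN)
		sampled = RUNG_SAMPLED_MIN; /* keep the per-page estimate usable */

	return pages > sampled ? pages / sampled : 1;
}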
4764 | + */ 4765 | + if (sampled < RUNG_SAMPLED_MIN) 4766 | + sampled = RUNG_SAMPLED_MIN; 4767 | + 4768 | + pages = rung_get_pages(rung); 4769 | + if (likely(pages > sampled)) 4770 | + rung->step = pages / sampled; 4771 | + else 4772 | + rung->step = 1; 4773 | +} 4774 | + 4775 | +static inline int step_need_recalc(struct scan_rung *rung) 4776 | +{ 4777 | + unsigned long pages, stepmax; 4778 | + 4779 | + pages = rung_get_pages(rung); 4780 | + stepmax = pages / RUNG_SAMPLED_MIN; 4781 | + 4782 | + return pages && (rung->step > pages || 4783 | + (stepmax && rung->step > stepmax)); 4784 | +} 4785 | + 4786 | +static inline 4787 | +void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc) 4788 | +{ 4789 | + struct vma_slot *slot; 4790 | + 4791 | + if (finished) 4792 | + rung->flags |= UKSM_RUNG_ROUND_FINISHED; 4793 | + 4794 | + if (step_recalc || step_need_recalc(rung)) { 4795 | + uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); 4796 | + BUG_ON(step_need_recalc(rung)); 4797 | + } 4798 | + 4799 | + slot_iter_index = prandom_u32() % rung->step; 4800 | + BUG_ON(!rung->vma_root.rnode); 4801 | + slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter); 4802 | + BUG_ON(!slot); 4803 | + 4804 | + rung->current_scan = slot; 4805 | + rung->current_offset = slot_iter_index; 4806 | +} 4807 | + 4808 | +static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot) 4809 | +{ 4810 | + return &slot->rung->vma_root; 4811 | +} 4812 | + 4813 | +/* 4814 | + * return if resetted. 4815 | + */ 4816 | +static int advance_current_scan(struct scan_rung *rung) 4817 | +{ 4818 | + unsigned short n; 4819 | + struct vma_slot *slot, *next = NULL; 4820 | + 4821 | + BUG_ON(!rung->vma_root.num); 4822 | + 4823 | + slot = rung->current_scan; 4824 | + n = (slot->pages - rung->current_offset) % rung->step; 4825 | + slot_iter_index = rung->step - n; 4826 | + next = sradix_tree_next(&rung->vma_root, slot->snode, 4827 | + slot->sindex, slot_iter); 4828 | + 4829 | + if (next) { 4830 | + rung->current_offset = slot_iter_index; 4831 | + rung->current_scan = next; 4832 | + return 0; 4833 | + } else { 4834 | + reset_current_scan(rung, 1, 0); 4835 | + return 1; 4836 | + } 4837 | +} 4838 | + 4839 | +static inline void rung_rm_slot(struct vma_slot *slot) 4840 | +{ 4841 | + struct scan_rung *rung = slot->rung; 4842 | + struct sradix_tree_root *root; 4843 | + 4844 | + if (rung->current_scan == slot) 4845 | + advance_current_scan(rung); 4846 | + 4847 | + root = slot_get_root(slot); 4848 | + sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex); 4849 | + slot->snode = NULL; 4850 | + if (step_need_recalc(rung)) { 4851 | + uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); 4852 | + BUG_ON(step_need_recalc(rung)); 4853 | + } 4854 | + 4855 | + /* In case advance_current_scan loop back to this slot again */ 4856 | + if (rung->vma_root.num && rung->current_scan == slot) 4857 | + reset_current_scan(slot->rung, 1, 0); 4858 | +} 4859 | + 4860 | +static inline void rung_add_new_slots(struct scan_rung *rung, 4861 | + struct vma_slot **slots, unsigned long num) 4862 | +{ 4863 | + int err; 4864 | + struct vma_slot *slot; 4865 | + unsigned long i; 4866 | + struct sradix_tree_root *root = &rung->vma_root; 4867 | + 4868 | + err = sradix_tree_enter(root, (void **)slots, num); 4869 | + BUG_ON(err); 4870 | + 4871 | + for (i = 0; i < num; i++) { 4872 | + slot = slots[i]; 4873 | + slot->rung = rung; 4874 | + BUG_ON(vma_fully_scanned(slot)); 4875 | + } 4876 | + 4877 | + if (rung->vma_root.num == 
num) 4878 | + reset_current_scan(rung, 0, 1); 4879 | +} 4880 | + 4881 | +static inline int rung_add_one_slot(struct scan_rung *rung, 4882 | + struct vma_slot *slot) 4883 | +{ 4884 | + int err; 4885 | + 4886 | + err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1); 4887 | + if (err) 4888 | + return err; 4889 | + 4890 | + slot->rung = rung; 4891 | + if (rung->vma_root.num == 1) 4892 | + reset_current_scan(rung, 0, 1); 4893 | + 4894 | + return 0; 4895 | +} 4896 | + 4897 | +/* 4898 | + * Return true if the slot is deleted from its rung. 4899 | + */ 4900 | +static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung) 4901 | +{ 4902 | + struct scan_rung *old_rung = slot->rung; 4903 | + int err; 4904 | + 4905 | + if (old_rung == rung) 4906 | + return 0; 4907 | + 4908 | + rung_rm_slot(slot); 4909 | + err = rung_add_one_slot(rung, slot); 4910 | + if (err) { 4911 | + err = rung_add_one_slot(old_rung, slot); 4912 | + WARN_ON(err); /* OOPS, badly OOM, we lost this slot */ 4913 | + } 4914 | + 4915 | + return 1; 4916 | +} 4917 | + 4918 | +static inline int vma_rung_up(struct vma_slot *slot) 4919 | +{ 4920 | + struct scan_rung *rung; 4921 | + 4922 | + rung = slot->rung; 4923 | + if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1]) 4924 | + rung++; 4925 | + 4926 | + return vma_rung_enter(slot, rung); 4927 | +} 4928 | + 4929 | +static inline int vma_rung_down(struct vma_slot *slot) 4930 | +{ 4931 | + struct scan_rung *rung; 4932 | + 4933 | + rung = slot->rung; 4934 | + if (slot->rung != &uksm_scan_ladder[0]) 4935 | + rung--; 4936 | + 4937 | + return vma_rung_enter(slot, rung); 4938 | +} 4939 | + 4940 | +/** 4941 | + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. 4942 | + */ 4943 | +static unsigned long cal_dedup_ratio(struct vma_slot *slot) 4944 | +{ 4945 | + unsigned long ret; 4946 | + unsigned long pages; 4947 | + 4948 | + pages = slot->this_sampled; 4949 | + if (!pages) 4950 | + return 0; 4951 | + 4952 | + BUG_ON(slot->pages_scanned == slot->last_scanned); 4953 | + 4954 | + ret = slot->pages_merged; 4955 | + 4956 | + /* Thrashing area filtering */ 4957 | + if (ret && uksm_thrash_threshold) { 4958 | + if (slot->pages_cowed * 100 / slot->pages_merged 4959 | + > uksm_thrash_threshold) { 4960 | + ret = 0; 4961 | + } else { 4962 | + ret = slot->pages_merged - slot->pages_cowed; 4963 | + } 4964 | + } 4965 | + 4966 | + return ret * 100 / pages; 4967 | +} 4968 | + 4969 | +/** 4970 | + * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. 4971 | + */ 4972 | +static unsigned long cal_dedup_ratio_old(struct vma_slot *slot) 4973 | +{ 4974 | + unsigned long ret; 4975 | + unsigned long pages; 4976 | + 4977 | + pages = slot->pages; 4978 | + if (!pages) 4979 | + return 0; 4980 | + 4981 | + ret = slot->pages_bemerged; 4982 | + 4983 | + /* Thrashing area filtering */ 4984 | + if (ret && uksm_thrash_threshold) { 4985 | + if (slot->pages_cowed * 100 / slot->pages_bemerged 4986 | + > uksm_thrash_threshold) { 4987 | + ret = 0; 4988 | + } else { 4989 | + ret = slot->pages_bemerged - slot->pages_cowed; 4990 | + } 4991 | + } 4992 | + 4993 | + return ret * 100 / pages; 4994 | +} 4995 | + 4996 | +/** 4997 | + * stable_node_reinsert() - When the hash_strength has been adjusted, the 4998 | + * stable tree need to be restructured, this is the function re-inserting the 4999 | + * stable node. 
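The thrashing filter used by cal_dedup_ratio() and cal_dedup_ratio_old() above is plain arithmetic: if COW break-outs are too large a fraction of the merges, the area is treated as thrashing and contributes nothing; otherwise only the merges that survived COW are counted, as a percentage of the pages considered. The sketch below is illustration only, not patch code.

unsigned long dedup_ratio(unsigned long sampled, unsigned long merged,
			  unsigned long cowed, unsigned long thrash_threshold)
{
	unsigned long ret = merged;

	if (!sampled)
		return 0;

	if (ret && thrash_threshold) {
		if (cowed * 100 / merged > thrash_threshold)
			ret = 0;                /* area thrashes, ignore it    */
		else
			ret = merged - cowed;   /* count only stable merges    */
	}

	return ret * 100 / sampled;             /* percentage of sampled pages */
}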
5000 | + */ 5001 | +static inline void stable_node_reinsert(struct stable_node *new_node, 5002 | + struct page *page, 5003 | + struct rb_root *root_treep, 5004 | + struct list_head *tree_node_listp, 5005 | + u32 hash) 5006 | +{ 5007 | + struct rb_node **new = &root_treep->rb_node; 5008 | + struct rb_node *parent = NULL; 5009 | + struct stable_node *stable_node; 5010 | + struct tree_node *tree_node; 5011 | + struct page *tree_page; 5012 | + int cmp; 5013 | + 5014 | + while (*new) { 5015 | + int cmp; 5016 | + 5017 | + tree_node = rb_entry(*new, struct tree_node, node); 5018 | + 5019 | + cmp = hash_cmp(hash, tree_node->hash); 5020 | + 5021 | + if (cmp < 0) { 5022 | + parent = *new; 5023 | + new = &parent->rb_left; 5024 | + } else if (cmp > 0) { 5025 | + parent = *new; 5026 | + new = &parent->rb_right; 5027 | + } else 5028 | + break; 5029 | + } 5030 | + 5031 | + if (*new) { 5032 | + /* find a stable tree node with same first level hash value */ 5033 | + stable_node_hash_max(new_node, page, hash); 5034 | + if (tree_node->count == 1) { 5035 | + stable_node = rb_entry(tree_node->sub_root.rb_node, 5036 | + struct stable_node, node); 5037 | + tree_page = get_uksm_page(stable_node, 1, 0); 5038 | + if (tree_page) { 5039 | + stable_node_hash_max(stable_node, 5040 | + tree_page, hash); 5041 | + put_page(tree_page); 5042 | + 5043 | + /* prepare for stable node insertion */ 5044 | + 5045 | + cmp = hash_cmp(new_node->hash_max, 5046 | + stable_node->hash_max); 5047 | + parent = &stable_node->node; 5048 | + if (cmp < 0) 5049 | + new = &parent->rb_left; 5050 | + else if (cmp > 0) 5051 | + new = &parent->rb_right; 5052 | + else 5053 | + goto failed; 5054 | + 5055 | + goto add_node; 5056 | + } else { 5057 | + /* the only stable_node deleted, the tree node 5058 | + * was not deleted. 5059 | + */ 5060 | + goto tree_node_reuse; 5061 | + } 5062 | + } 5063 | + 5064 | + /* well, search the collision subtree */ 5065 | + new = &tree_node->sub_root.rb_node; 5066 | + parent = NULL; 5067 | + BUG_ON(!*new); 5068 | + while (*new) { 5069 | + int cmp; 5070 | + 5071 | + stable_node = rb_entry(*new, struct stable_node, node); 5072 | + 5073 | + cmp = hash_cmp(new_node->hash_max, 5074 | + stable_node->hash_max); 5075 | + 5076 | + if (cmp < 0) { 5077 | + parent = *new; 5078 | + new = &parent->rb_left; 5079 | + } else if (cmp > 0) { 5080 | + parent = *new; 5081 | + new = &parent->rb_right; 5082 | + } else { 5083 | + /* oh, no, still a collision */ 5084 | + goto failed; 5085 | + } 5086 | + } 5087 | + 5088 | + goto add_node; 5089 | + } 5090 | + 5091 | + /* no tree node found */ 5092 | + tree_node = alloc_tree_node(tree_node_listp); 5093 | + if (!tree_node) { 5094 | + printk(KERN_ERR "UKSM: memory allocation error!\n"); 5095 | + goto failed; 5096 | + } else { 5097 | + tree_node->hash = hash; 5098 | + rb_link_node(&tree_node->node, parent, new); 5099 | + rb_insert_color(&tree_node->node, root_treep); 5100 | + 5101 | +tree_node_reuse: 5102 | + /* prepare for stable node insertion */ 5103 | + parent = NULL; 5104 | + new = &tree_node->sub_root.rb_node; 5105 | + } 5106 | + 5107 | +add_node: 5108 | + rb_link_node(&new_node->node, parent, new); 5109 | + rb_insert_color(&new_node->node, &tree_node->sub_root); 5110 | + new_node->tree_node = tree_node; 5111 | + tree_node->count++; 5112 | + return; 5113 | + 5114 | +failed: 5115 | + /* This can only happen when two nodes have collided 5116 | + * in two levels. 
5117 | + */ 5118 | + new_node->tree_node = NULL; 5119 | + return; 5120 | +} 5121 | + 5122 | +static inline void free_all_tree_nodes(struct list_head *list) 5123 | +{ 5124 | + struct tree_node *node, *tmp; 5125 | + 5126 | + list_for_each_entry_safe(node, tmp, list, all_list) { 5127 | + free_tree_node(node); 5128 | + } 5129 | +} 5130 | + 5131 | +/** 5132 | + * stable_tree_delta_hash() - Delta hash the stable tree from previous hash 5133 | + * strength to the current hash_strength. It re-structures the hole tree. 5134 | + */ 5135 | +static inline void stable_tree_delta_hash(u32 prev_hash_strength) 5136 | +{ 5137 | + struct stable_node *node, *tmp; 5138 | + struct rb_root *root_new_treep; 5139 | + struct list_head *new_tree_node_listp; 5140 | + 5141 | + stable_tree_index = (stable_tree_index + 1) % 2; 5142 | + root_new_treep = &root_stable_tree[stable_tree_index]; 5143 | + new_tree_node_listp = &stable_tree_node_list[stable_tree_index]; 5144 | + *root_new_treep = RB_ROOT; 5145 | + BUG_ON(!list_empty(new_tree_node_listp)); 5146 | + 5147 | + /* 5148 | + * we need to be safe, the node could be removed by get_uksm_page() 5149 | + */ 5150 | + list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) { 5151 | + void *addr; 5152 | + struct page *node_page; 5153 | + u32 hash; 5154 | + 5155 | + /* 5156 | + * We are completely re-structuring the stable nodes to a new 5157 | + * stable tree. We don't want to touch the old tree unlinks and 5158 | + * old tree_nodes. The old tree_nodes will be freed at once. 5159 | + */ 5160 | + node_page = get_uksm_page(node, 0, 0); 5161 | + if (!node_page) 5162 | + continue; 5163 | + 5164 | + if (node->tree_node) { 5165 | + hash = node->tree_node->hash; 5166 | + 5167 | + addr = kmap_atomic(node_page); 5168 | + 5169 | + hash = delta_hash(addr, prev_hash_strength, 5170 | + hash_strength, hash); 5171 | + kunmap_atomic(addr); 5172 | + } else { 5173 | + /* 5174 | + *it was not inserted to rbtree due to collision in last 5175 | + *round scan. 
5176 | + */ 5177 | + hash = page_hash(node_page, hash_strength, 0); 5178 | + } 5179 | + 5180 | + stable_node_reinsert(node, node_page, root_new_treep, 5181 | + new_tree_node_listp, hash); 5182 | + put_page(node_page); 5183 | + } 5184 | + 5185 | + root_stable_treep = root_new_treep; 5186 | + free_all_tree_nodes(stable_tree_node_listp); 5187 | + BUG_ON(!list_empty(stable_tree_node_listp)); 5188 | + stable_tree_node_listp = new_tree_node_listp; 5189 | +} 5190 | + 5191 | +static inline void inc_hash_strength(unsigned long delta) 5192 | +{ 5193 | + hash_strength += 1 << delta; 5194 | + if (hash_strength > HASH_STRENGTH_MAX) 5195 | + hash_strength = HASH_STRENGTH_MAX; 5196 | +} 5197 | + 5198 | +static inline void dec_hash_strength(unsigned long delta) 5199 | +{ 5200 | + unsigned long change = 1 << delta; 5201 | + 5202 | + if (hash_strength <= change + 1) 5203 | + hash_strength = 1; 5204 | + else 5205 | + hash_strength -= change; 5206 | +} 5207 | + 5208 | +static inline void inc_hash_strength_delta(void) 5209 | +{ 5210 | + hash_strength_delta++; 5211 | + if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX) 5212 | + hash_strength_delta = HASH_STRENGTH_DELTA_MAX; 5213 | +} 5214 | + 5215 | +/* 5216 | +static inline unsigned long get_current_neg_ratio(void) 5217 | +{ 5218 | + if (!rshash_pos || rshash_neg > rshash_pos) 5219 | + return 100; 5220 | + 5221 | + return div64_u64(100 * rshash_neg , rshash_pos); 5222 | +} 5223 | +*/ 5224 | + 5225 | +static inline unsigned long get_current_neg_ratio(void) 5226 | +{ 5227 | + u64 pos = benefit.pos; 5228 | + u64 neg = benefit.neg; 5229 | + 5230 | + if (!neg) 5231 | + return 0; 5232 | + 5233 | + if (!pos || neg > pos) 5234 | + return 100; 5235 | + 5236 | + if (neg > div64_u64(U64_MAX, 100)) 5237 | + pos = div64_u64(pos, 100); 5238 | + else 5239 | + neg *= 100; 5240 | + 5241 | + return div64_u64(neg, pos); 5242 | +} 5243 | + 5244 | +static inline unsigned long get_current_benefit(void) 5245 | +{ 5246 | + u64 pos = benefit.pos; 5247 | + u64 neg = benefit.neg; 5248 | + u64 scanned = benefit.scanned; 5249 | + 5250 | + if (neg > pos) 5251 | + return 0; 5252 | + 5253 | + return div64_u64((pos - neg), scanned); 5254 | +} 5255 | + 5256 | +static inline int judge_rshash_direction(void) 5257 | +{ 5258 | + u64 current_neg_ratio, stable_benefit; 5259 | + u64 current_benefit, delta = 0; 5260 | + int ret = STILL; 5261 | + 5262 | + /* Try to probe a value after the boot, and in case the system 5263 | + are still for a long time. 
*/ 5264 | + if ((fully_scanned_round & 0xFFULL) == 10) { 5265 | + ret = OBSCURE; 5266 | + goto out; 5267 | + } 5268 | + 5269 | + current_neg_ratio = get_current_neg_ratio(); 5270 | + 5271 | + if (current_neg_ratio == 0) { 5272 | + rshash_neg_cont_zero++; 5273 | + if (rshash_neg_cont_zero > 2) 5274 | + return GO_DOWN; 5275 | + else 5276 | + return STILL; 5277 | + } 5278 | + rshash_neg_cont_zero = 0; 5279 | + 5280 | + if (current_neg_ratio > 90) { 5281 | + ret = GO_UP; 5282 | + goto out; 5283 | + } 5284 | + 5285 | + current_benefit = get_current_benefit(); 5286 | + stable_benefit = rshash_state.stable_benefit; 5287 | + 5288 | + if (!stable_benefit) { 5289 | + ret = OBSCURE; 5290 | + goto out; 5291 | + } 5292 | + 5293 | + if (current_benefit > stable_benefit) 5294 | + delta = current_benefit - stable_benefit; 5295 | + else if (current_benefit < stable_benefit) 5296 | + delta = stable_benefit - current_benefit; 5297 | + 5298 | + delta = div64_u64(100 * delta , stable_benefit); 5299 | + 5300 | + if (delta > 50) { 5301 | + rshash_cont_obscure++; 5302 | + if (rshash_cont_obscure > 2) 5303 | + return OBSCURE; 5304 | + else 5305 | + return STILL; 5306 | + } 5307 | + 5308 | +out: 5309 | + rshash_cont_obscure = 0; 5310 | + return ret; 5311 | +} 5312 | + 5313 | +/** 5314 | + * rshash_adjust() - The main function to control the random sampling state 5315 | + * machine for hash strength adapting. 5316 | + * 5317 | + * return true if hash_strength has changed. 5318 | + */ 5319 | +static inline int rshash_adjust(void) 5320 | +{ 5321 | + unsigned long prev_hash_strength = hash_strength; 5322 | + 5323 | + if (!encode_benefit()) 5324 | + return 0; 5325 | + 5326 | + switch (rshash_state.state) { 5327 | + case RSHASH_STILL: 5328 | + switch (judge_rshash_direction()) { 5329 | + case GO_UP: 5330 | + if (rshash_state.pre_direct == GO_DOWN) 5331 | + hash_strength_delta = 0; 5332 | + 5333 | + inc_hash_strength(hash_strength_delta); 5334 | + inc_hash_strength_delta(); 5335 | + rshash_state.stable_benefit = get_current_benefit(); 5336 | + rshash_state.pre_direct = GO_UP; 5337 | + break; 5338 | + 5339 | + case GO_DOWN: 5340 | + if (rshash_state.pre_direct == GO_UP) 5341 | + hash_strength_delta = 0; 5342 | + 5343 | + dec_hash_strength(hash_strength_delta); 5344 | + inc_hash_strength_delta(); 5345 | + rshash_state.stable_benefit = get_current_benefit(); 5346 | + rshash_state.pre_direct = GO_DOWN; 5347 | + break; 5348 | + 5349 | + case OBSCURE: 5350 | + rshash_state.stable_point = hash_strength; 5351 | + rshash_state.turn_point_down = hash_strength; 5352 | + rshash_state.turn_point_up = hash_strength; 5353 | + rshash_state.turn_benefit_down = get_current_benefit(); 5354 | + rshash_state.turn_benefit_up = get_current_benefit(); 5355 | + rshash_state.lookup_window_index = 0; 5356 | + rshash_state.state = RSHASH_TRYDOWN; 5357 | + dec_hash_strength(hash_strength_delta); 5358 | + inc_hash_strength_delta(); 5359 | + break; 5360 | + 5361 | + case STILL: 5362 | + break; 5363 | + default: 5364 | + BUG(); 5365 | + } 5366 | + break; 5367 | + 5368 | + case RSHASH_TRYDOWN: 5369 | + if (rshash_state.lookup_window_index++ % 5 == 0) 5370 | + rshash_state.below_count = 0; 5371 | + 5372 | + if (get_current_benefit() < rshash_state.stable_benefit) 5373 | + rshash_state.below_count++; 5374 | + else if (get_current_benefit() > 5375 | + rshash_state.turn_benefit_down) { 5376 | + rshash_state.turn_point_down = hash_strength; 5377 | + rshash_state.turn_benefit_down = get_current_benefit(); 5378 | + } 5379 | + 5380 | + if 
(rshash_state.below_count >= 3 || 5381 | + judge_rshash_direction() == GO_UP || 5382 | + hash_strength == 1) { 5383 | + hash_strength = rshash_state.stable_point; 5384 | + hash_strength_delta = 0; 5385 | + inc_hash_strength(hash_strength_delta); 5386 | + inc_hash_strength_delta(); 5387 | + rshash_state.lookup_window_index = 0; 5388 | + rshash_state.state = RSHASH_TRYUP; 5389 | + hash_strength_delta = 0; 5390 | + } else { 5391 | + dec_hash_strength(hash_strength_delta); 5392 | + inc_hash_strength_delta(); 5393 | + } 5394 | + break; 5395 | + 5396 | + case RSHASH_TRYUP: 5397 | + if (rshash_state.lookup_window_index++ % 5 == 0) 5398 | + rshash_state.below_count = 0; 5399 | + 5400 | + if (get_current_benefit() < rshash_state.turn_benefit_down) 5401 | + rshash_state.below_count++; 5402 | + else if (get_current_benefit() > rshash_state.turn_benefit_up) { 5403 | + rshash_state.turn_point_up = hash_strength; 5404 | + rshash_state.turn_benefit_up = get_current_benefit(); 5405 | + } 5406 | + 5407 | + if (rshash_state.below_count >= 3 || 5408 | + judge_rshash_direction() == GO_DOWN || 5409 | + hash_strength == HASH_STRENGTH_MAX) { 5410 | + hash_strength = rshash_state.turn_benefit_up > 5411 | + rshash_state.turn_benefit_down ? 5412 | + rshash_state.turn_point_up : 5413 | + rshash_state.turn_point_down; 5414 | + 5415 | + rshash_state.state = RSHASH_PRE_STILL; 5416 | + } else { 5417 | + inc_hash_strength(hash_strength_delta); 5418 | + inc_hash_strength_delta(); 5419 | + } 5420 | + 5421 | + break; 5422 | + 5423 | + case RSHASH_NEW: 5424 | + case RSHASH_PRE_STILL: 5425 | + rshash_state.stable_benefit = get_current_benefit(); 5426 | + rshash_state.state = RSHASH_STILL; 5427 | + hash_strength_delta = 0; 5428 | + break; 5429 | + default: 5430 | + BUG(); 5431 | + } 5432 | + 5433 | + /* rshash_neg = rshash_pos = 0; */ 5434 | + reset_benefit(); 5435 | + 5436 | + if (prev_hash_strength != hash_strength) 5437 | + stable_tree_delta_hash(prev_hash_strength); 5438 | + 5439 | + return prev_hash_strength != hash_strength; 5440 | +} 5441 | + 5442 | +/** 5443 | + * round_update_ladder() - The main function to do update of all the 5444 | + * adjustments whenever a scan round is finished. 5445 | + */ 5446 | +static noinline void round_update_ladder(void) 5447 | +{ 5448 | + int i; 5449 | + unsigned long dedup; 5450 | + struct vma_slot *slot, *tmp_slot; 5451 | + 5452 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { 5453 | + uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED; 5454 | + } 5455 | + 5456 | + list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) { 5457 | + 5458 | + /* slot may be rung_rm_slot() when mm exits */ 5459 | + if (slot->snode) { 5460 | + dedup = cal_dedup_ratio_old(slot); 5461 | + if (dedup && dedup >= uksm_abundant_threshold) 5462 | + vma_rung_up(slot); 5463 | + } 5464 | + 5465 | + slot->pages_bemerged = 0; 5466 | + slot->pages_cowed = 0; 5467 | + 5468 | + list_del_init(&slot->dedup_list); 5469 | + } 5470 | +} 5471 | + 5472 | +static void uksm_del_vma_slot(struct vma_slot *slot) 5473 | +{ 5474 | + int i, j; 5475 | + struct rmap_list_entry *entry; 5476 | + 5477 | + if (slot->snode) { 5478 | + /* 5479 | + * In case it just failed when entering the rung, it's not 5480 | + * necessary. 
5481 | + */ 5482 | + rung_rm_slot(slot); 5483 | + } 5484 | + 5485 | + if (!list_empty(&slot->dedup_list)) 5486 | + list_del(&slot->dedup_list); 5487 | + 5488 | + if (!slot->rmap_list_pool || !slot->pool_counts) { 5489 | + /* In case it OOMed in uksm_vma_enter() */ 5490 | + goto out; 5491 | + } 5492 | + 5493 | + for (i = 0; i < slot->pool_size; i++) { 5494 | + void *addr; 5495 | + 5496 | + if (!slot->rmap_list_pool[i]) 5497 | + continue; 5498 | + 5499 | + addr = kmap(slot->rmap_list_pool[i]); 5500 | + for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { 5501 | + entry = (struct rmap_list_entry *)addr + j; 5502 | + if (is_addr(entry->addr)) 5503 | + continue; 5504 | + if (!entry->item) 5505 | + continue; 5506 | + 5507 | + remove_rmap_item_from_tree(entry->item); 5508 | + free_rmap_item(entry->item); 5509 | + slot->pool_counts[i]--; 5510 | + } 5511 | + BUG_ON(slot->pool_counts[i]); 5512 | + kunmap(slot->rmap_list_pool[i]); 5513 | + __free_page(slot->rmap_list_pool[i]); 5514 | + } 5515 | + kfree(slot->rmap_list_pool); 5516 | + kfree(slot->pool_counts); 5517 | + 5518 | +out: 5519 | + slot->rung = NULL; 5520 | + if (slot->flags & UKSM_SLOT_IN_UKSM) { 5521 | + BUG_ON(uksm_pages_total < slot->pages); 5522 | + uksm_pages_total -= slot->pages; 5523 | + } 5524 | + 5525 | + if (slot->fully_scanned_round == fully_scanned_round) 5526 | + scanned_virtual_pages -= slot->pages; 5527 | + else 5528 | + scanned_virtual_pages -= slot->pages_scanned; 5529 | + free_vma_slot(slot); 5530 | +} 5531 | + 5532 | + 5533 | +#define SPIN_LOCK_PERIOD 32 5534 | +static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD]; 5535 | +static inline void cleanup_vma_slots(void) 5536 | +{ 5537 | + struct vma_slot *slot; 5538 | + int i; 5539 | + 5540 | + i = 0; 5541 | + spin_lock(&vma_slot_list_lock); 5542 | + while (!list_empty(&vma_slot_del)) { 5543 | + slot = list_entry(vma_slot_del.next, 5544 | + struct vma_slot, slot_list); 5545 | + list_del(&slot->slot_list); 5546 | + cleanup_slots[i++] = slot; 5547 | + if (i == SPIN_LOCK_PERIOD) { 5548 | + spin_unlock(&vma_slot_list_lock); 5549 | + while (--i >= 0) 5550 | + uksm_del_vma_slot(cleanup_slots[i]); 5551 | + i = 0; 5552 | + spin_lock(&vma_slot_list_lock); 5553 | + } 5554 | + } 5555 | + spin_unlock(&vma_slot_list_lock); 5556 | + 5557 | + while (--i >= 0) 5558 | + uksm_del_vma_slot(cleanup_slots[i]); 5559 | +} 5560 | + 5561 | +/* 5562 | +*exponential moving average formula 5563 | +*/ 5564 | +static inline unsigned long ema(unsigned long curr, unsigned long last_ema) 5565 | +{ 5566 | + /* 5567 | + * For a very high burst, even the ema cannot work well, a false very 5568 | + * high per-page time estimation can result in feedback in very high 5569 | + * overhead of context switch and rung update -- this will then lead 5570 | + * to higher per-page time, this may not converge. 5571 | + * 5572 | + * Instead, we try to approach this value in a binary manner. 5573 | + */ 5574 | + if (curr > last_ema * 10) 5575 | + return last_ema * 2; 5576 | + 5577 | + return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100; 5578 | +} 5579 | + 5580 | +/* 5581 | + * convert cpu ratio in 1/TIME_RATIO_SCALE configured by user to 5582 | + * nanoseconds based on current uksm_sleep_jiffies. 
5583 | + */ 5584 | +static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio) 5585 | +{ 5586 | + return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) / 5587 | + (TIME_RATIO_SCALE - ratio) * ratio; 5588 | +} 5589 | + 5590 | + 5591 | +static inline unsigned long rung_real_ratio(int cpu_time_ratio) 5592 | +{ 5593 | + unsigned long ret; 5594 | + 5595 | + BUG_ON(!cpu_time_ratio); 5596 | + 5597 | + if (cpu_time_ratio > 0) 5598 | + ret = cpu_time_ratio; 5599 | + else 5600 | + ret = (unsigned long)(-cpu_time_ratio) * 5601 | + uksm_max_cpu_percentage / 100UL; 5602 | + 5603 | + return ret ? ret : 1; 5604 | +} 5605 | + 5606 | +static noinline void uksm_calc_scan_pages(void) 5607 | +{ 5608 | + struct scan_rung *ladder = uksm_scan_ladder; 5609 | + unsigned long sleep_usecs, nsecs; 5610 | + unsigned long ratio; 5611 | + int i; 5612 | + unsigned long per_page; 5613 | + 5614 | + if (uksm_ema_page_time > 100000 || 5615 | + (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL)) 5616 | + uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; 5617 | + 5618 | + per_page = uksm_ema_page_time; 5619 | + BUG_ON(!per_page); 5620 | + 5621 | + /* 5622 | + * For every 8 eval round, we try to probe a uksm_sleep_jiffies value 5623 | + * based on saved user input. 5624 | + */ 5625 | + if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL) 5626 | + uksm_sleep_jiffies = uksm_sleep_saved; 5627 | + 5628 | + /* We require a rung scan at least 1 page in a period. */ 5629 | + nsecs = per_page; 5630 | + ratio = rung_real_ratio(ladder[0].cpu_ratio); 5631 | + if (cpu_ratio_to_nsec(ratio) < nsecs) { 5632 | + sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio 5633 | + / NSEC_PER_USEC; 5634 | + uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1; 5635 | + } 5636 | + 5637 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { 5638 | + ratio = rung_real_ratio(ladder[i].cpu_ratio); 5639 | + ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) / 5640 | + per_page; 5641 | + BUG_ON(!ladder[i].pages_to_scan); 5642 | + uksm_calc_rung_step(&ladder[i], per_page, ratio); 5643 | + } 5644 | +} 5645 | + 5646 | +/* 5647 | + * From the scan time of this round (ns) to next expected min sleep time 5648 | + * (ms), be careful of the possible overflows. 
ratio is taken from 5649 | + * rung_real_ratio() 5650 | + */ 5651 | +static inline 5652 | +unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio) 5653 | +{ 5654 | + scan_time >>= 20; /* to msec level now */ 5655 | + BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE)); 5656 | + 5657 | + return (unsigned int) ((unsigned long) scan_time * 5658 | + (TIME_RATIO_SCALE - ratio) / ratio); 5659 | +} 5660 | + 5661 | +#define __round_mask(x, y) ((__typeof__(x))((y)-1)) 5662 | +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) 5663 | + 5664 | +static void uksm_vma_enter(struct vma_slot **slots, unsigned long num) 5665 | +{ 5666 | + struct scan_rung *rung; 5667 | + 5668 | + rung = &uksm_scan_ladder[0]; 5669 | + rung_add_new_slots(rung, slots, num); 5670 | +} 5671 | + 5672 | +static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE]; 5673 | + 5674 | +static void uksm_enter_all_slots(void) 5675 | +{ 5676 | + struct vma_slot *slot; 5677 | + unsigned long index; 5678 | + struct list_head empty_vma_list; 5679 | + int i; 5680 | + 5681 | + i = 0; 5682 | + index = 0; 5683 | + INIT_LIST_HEAD(&empty_vma_list); 5684 | + 5685 | + spin_lock(&vma_slot_list_lock); 5686 | + while (!list_empty(&vma_slot_new)) { 5687 | + slot = list_entry(vma_slot_new.next, 5688 | + struct vma_slot, slot_list); 5689 | + 5690 | + if (!slot->vma->anon_vma) { 5691 | + list_move(&slot->slot_list, &empty_vma_list); 5692 | + } else if (vma_can_enter(slot->vma)) { 5693 | + batch_slots[index++] = slot; 5694 | + list_del_init(&slot->slot_list); 5695 | + } else { 5696 | + list_move(&slot->slot_list, &vma_slot_noadd); 5697 | + } 5698 | + 5699 | + if (++i == SPIN_LOCK_PERIOD || 5700 | + (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) { 5701 | + spin_unlock(&vma_slot_list_lock); 5702 | + 5703 | + if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) { 5704 | + uksm_vma_enter(batch_slots, index); 5705 | + index = 0; 5706 | + } 5707 | + i = 0; 5708 | + cond_resched(); 5709 | + spin_lock(&vma_slot_list_lock); 5710 | + } 5711 | + } 5712 | + 5713 | + list_splice(&empty_vma_list, &vma_slot_new); 5714 | + 5715 | + spin_unlock(&vma_slot_list_lock); 5716 | + 5717 | + if (index) 5718 | + uksm_vma_enter(batch_slots, index); 5719 | + 5720 | +} 5721 | + 5722 | +static inline int rung_round_finished(struct scan_rung *rung) 5723 | +{ 5724 | + return rung->flags & UKSM_RUNG_ROUND_FINISHED; 5725 | +} 5726 | + 5727 | +static inline void judge_slot(struct vma_slot *slot) 5728 | +{ 5729 | + struct scan_rung *rung = slot->rung; 5730 | + unsigned long dedup; 5731 | + int deleted; 5732 | + 5733 | + dedup = cal_dedup_ratio(slot); 5734 | + if (vma_fully_scanned(slot) && uksm_thrash_threshold) 5735 | + deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]); 5736 | + else if (dedup && dedup >= uksm_abundant_threshold) 5737 | + deleted = vma_rung_up(slot); 5738 | + else 5739 | + deleted = vma_rung_down(slot); 5740 | + 5741 | + slot->pages_merged = 0; 5742 | + slot->pages_cowed = 0; 5743 | + slot->this_sampled = 0; 5744 | + 5745 | + if (vma_fully_scanned(slot)) { 5746 | + slot->pages_scanned = 0; 5747 | + } 5748 | + 5749 | + slot->last_scanned = slot->pages_scanned; 5750 | + 5751 | + /* If its deleted in above, then rung was already advanced. 
*/ 5752 | + if (!deleted) 5753 | + advance_current_scan(rung); 5754 | +} 5755 | + 5756 | + 5757 | +static inline int hash_round_finished(void) 5758 | +{ 5759 | + if (scanned_virtual_pages > (uksm_pages_total >> 2)) { 5760 | + scanned_virtual_pages = 0; 5761 | + if (uksm_pages_scanned) 5762 | + fully_scanned_round++; 5763 | + 5764 | + return 1; 5765 | + } else { 5766 | + return 0; 5767 | + } 5768 | +} 5769 | + 5770 | +#define UKSM_MMSEM_BATCH 5 5771 | +#define BUSY_RETRY 100 5772 | + 5773 | +/** 5774 | + * uksm_do_scan() - the main worker function. 5775 | + */ 5776 | +static noinline void uksm_do_scan(void) 5777 | +{ 5778 | + struct vma_slot *slot, *iter; 5779 | + struct mm_struct *busy_mm; 5780 | + unsigned char round_finished, all_rungs_emtpy; 5781 | + int i, err, mmsem_batch; 5782 | + unsigned long pcost; 5783 | + long long delta_exec; 5784 | + unsigned long vpages, max_cpu_ratio; 5785 | + unsigned long long start_time, end_time, scan_time; 5786 | + unsigned int expected_jiffies; 5787 | + 5788 | + might_sleep(); 5789 | + 5790 | + vpages = 0; 5791 | + 5792 | + start_time = task_sched_runtime(current); 5793 | + max_cpu_ratio = 0; 5794 | + mmsem_batch = 0; 5795 | + 5796 | + for (i = 0; i < SCAN_LADDER_SIZE;) { 5797 | + struct scan_rung *rung = &uksm_scan_ladder[i]; 5798 | + unsigned long ratio; 5799 | + int busy_retry; 5800 | + 5801 | + if (!rung->pages_to_scan) { 5802 | + i++; 5803 | + continue; 5804 | + } 5805 | + 5806 | + if (!rung->vma_root.num) { 5807 | + rung->pages_to_scan = 0; 5808 | + i++; 5809 | + continue; 5810 | + } 5811 | + 5812 | + ratio = rung_real_ratio(rung->cpu_ratio); 5813 | + if (ratio > max_cpu_ratio) 5814 | + max_cpu_ratio = ratio; 5815 | + 5816 | + busy_retry = BUSY_RETRY; 5817 | + /* 5818 | + * Do not consider rung_round_finished() here, just used up the 5819 | + * rung->pages_to_scan quota. 
5820 | + */ 5821 | + while (rung->pages_to_scan && rung->vma_root.num && 5822 | + likely(!freezing(current))) { 5823 | + int reset = 0; 5824 | + 5825 | + slot = rung->current_scan; 5826 | + 5827 | + BUG_ON(vma_fully_scanned(slot)); 5828 | + 5829 | + if (mmsem_batch) { 5830 | + err = 0; 5831 | + } else { 5832 | + err = try_down_read_slot_mmap_sem(slot); 5833 | + } 5834 | + 5835 | + if (err == -ENOENT) { 5836 | +rm_slot: 5837 | + rung_rm_slot(slot); 5838 | + continue; 5839 | + } 5840 | + 5841 | + busy_mm = slot->mm; 5842 | + 5843 | + if (err == -EBUSY) { 5844 | + /* skip other vmas on the same mm */ 5845 | + do { 5846 | + reset = advance_current_scan(rung); 5847 | + iter = rung->current_scan; 5848 | + busy_retry--; 5849 | + if (iter->vma->vm_mm != busy_mm || 5850 | + !busy_retry || reset) 5851 | + break; 5852 | + } while (1); 5853 | + 5854 | + if (iter->vma->vm_mm != busy_mm) { 5855 | + continue; 5856 | + } else { 5857 | + /* scan round finished */ 5858 | + break; 5859 | + } 5860 | + } 5861 | + 5862 | + BUG_ON(!vma_can_enter(slot->vma)); 5863 | + if (uksm_test_exit(slot->vma->vm_mm)) { 5864 | + mmsem_batch = 0; 5865 | + up_read(&slot->vma->vm_mm->mmap_sem); 5866 | + goto rm_slot; 5867 | + } 5868 | + 5869 | + if (mmsem_batch) 5870 | + mmsem_batch--; 5871 | + else 5872 | + mmsem_batch = UKSM_MMSEM_BATCH; 5873 | + 5874 | + /* Ok, we have taken the mmap_sem, ready to scan */ 5875 | + scan_vma_one_page(slot); 5876 | + rung->pages_to_scan--; 5877 | + vpages++; 5878 | + 5879 | + if (rung->current_offset + rung->step > slot->pages - 1 5880 | + || vma_fully_scanned(slot)) { 5881 | + up_read(&slot->vma->vm_mm->mmap_sem); 5882 | + judge_slot(slot); 5883 | + mmsem_batch = 0; 5884 | + } else { 5885 | + rung->current_offset += rung->step; 5886 | + if (!mmsem_batch) 5887 | + up_read(&slot->vma->vm_mm->mmap_sem); 5888 | + } 5889 | + 5890 | + busy_retry = BUSY_RETRY; 5891 | + cond_resched(); 5892 | + } 5893 | + 5894 | + if (mmsem_batch) { 5895 | + up_read(&slot->vma->vm_mm->mmap_sem); 5896 | + mmsem_batch = 0; 5897 | + } 5898 | + 5899 | + if (freezing(current)) 5900 | + break; 5901 | + 5902 | + cond_resched(); 5903 | + } 5904 | + end_time = task_sched_runtime(current); 5905 | + delta_exec = end_time - start_time; 5906 | + 5907 | + if (freezing(current)) 5908 | + return; 5909 | + 5910 | + cleanup_vma_slots(); 5911 | + uksm_enter_all_slots(); 5912 | + 5913 | + round_finished = 1; 5914 | + all_rungs_emtpy = 1; 5915 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { 5916 | + struct scan_rung *rung = &uksm_scan_ladder[i]; 5917 | + 5918 | + if (rung->vma_root.num) { 5919 | + all_rungs_emtpy = 0; 5920 | + if (!rung_round_finished(rung)) 5921 | + round_finished = 0; 5922 | + } 5923 | + } 5924 | + 5925 | + if (all_rungs_emtpy) 5926 | + round_finished = 0; 5927 | + 5928 | + if (round_finished) { 5929 | + round_update_ladder(); 5930 | + uksm_eval_round++; 5931 | + 5932 | + if (hash_round_finished() && rshash_adjust()) { 5933 | + /* Reset the unstable root iff hash strength changed */ 5934 | + uksm_hash_round++; 5935 | + root_unstable_tree = RB_ROOT; 5936 | + free_all_tree_nodes(&unstable_tree_node_list); 5937 | + } 5938 | + 5939 | + /* 5940 | + * A number of pages can hang around indefinitely on per-cpu 5941 | + * pagevecs, raised page count preventing write_protect_page 5942 | + * from merging them. 
Though it doesn't really matter much, 5943 | + * it is puzzling to see some stuck in pages_volatile until 5944 | + * other activity jostles them out, and they also prevented 5945 | + * LTP's KSM test from succeeding deterministically; so drain 5946 | + * them here (here rather than on entry to uksm_do_scan(), 5947 | + * so we don't IPI too often when pages_to_scan is set low). 5948 | + */ 5949 | + lru_add_drain_all(); 5950 | + } 5951 | + 5952 | + 5953 | + if (vpages && delta_exec > 0) { 5954 | + pcost = (unsigned long) delta_exec / vpages; 5955 | + if (likely(uksm_ema_page_time)) 5956 | + uksm_ema_page_time = ema(pcost, uksm_ema_page_time); 5957 | + else 5958 | + uksm_ema_page_time = pcost; 5959 | + } 5960 | + 5961 | + uksm_calc_scan_pages(); 5962 | + uksm_sleep_real = uksm_sleep_jiffies; 5963 | + /* in case of radical cpu bursts, apply the upper bound */ 5964 | + end_time = task_sched_runtime(current); 5965 | + if (max_cpu_ratio && end_time > start_time) { 5966 | + scan_time = end_time - start_time; 5967 | + expected_jiffies = msecs_to_jiffies( 5968 | + scan_time_to_sleep(scan_time, max_cpu_ratio)); 5969 | + 5970 | + if (expected_jiffies > uksm_sleep_real) 5971 | + uksm_sleep_real = expected_jiffies; 5972 | + 5973 | + /* We have a 1 second up bound for responsiveness. */ 5974 | + if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC) 5975 | + uksm_sleep_real = msecs_to_jiffies(1000); 5976 | + } 5977 | + 5978 | + return; 5979 | +} 5980 | + 5981 | +static int ksmd_should_run(void) 5982 | +{ 5983 | + return uksm_run & UKSM_RUN_MERGE; 5984 | +} 5985 | + 5986 | +static int uksm_scan_thread(void *nothing) 5987 | +{ 5988 | + set_freezable(); 5989 | + set_user_nice(current, 5); 5990 | + 5991 | + while (!kthread_should_stop()) { 5992 | + mutex_lock(&uksm_thread_mutex); 5993 | + if (ksmd_should_run()) { 5994 | + uksm_do_scan(); 5995 | + } 5996 | + mutex_unlock(&uksm_thread_mutex); 5997 | + 5998 | + try_to_freeze(); 5999 | + 6000 | + if (ksmd_should_run()) { 6001 | + schedule_timeout_interruptible(uksm_sleep_real); 6002 | + uksm_sleep_times++; 6003 | + } else { 6004 | + wait_event_freezable(uksm_thread_wait, 6005 | + ksmd_should_run() || kthread_should_stop()); 6006 | + } 6007 | + } 6008 | + return 0; 6009 | +} 6010 | + 6011 | +int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) 6012 | +{ 6013 | + struct stable_node *stable_node; 6014 | + struct node_vma *node_vma; 6015 | + struct rmap_item *rmap_item; 6016 | + int ret = SWAP_AGAIN; 6017 | + int search_new_forks = 0; 6018 | + unsigned long address; 6019 | + 6020 | + VM_BUG_ON_PAGE(!PageKsm(page), page); 6021 | + VM_BUG_ON_PAGE(!PageLocked(page), page); 6022 | + 6023 | + stable_node = page_stable_node(page); 6024 | + if (!stable_node) 6025 | + return ret; 6026 | +again: 6027 | + hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { 6028 | + hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { 6029 | + struct anon_vma *anon_vma = rmap_item->anon_vma; 6030 | + struct anon_vma_chain *vmac; 6031 | + struct vm_area_struct *vma; 6032 | + 6033 | + cond_resched(); 6034 | + anon_vma_lock_read(anon_vma); 6035 | + anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 6036 | + 0, ULONG_MAX) { 6037 | + cond_resched(); 6038 | + vma = vmac->vma; 6039 | + address = get_rmap_addr(rmap_item); 6040 | + 6041 | + if (address < vma->vm_start || 6042 | + address >= vma->vm_end) 6043 | + continue; 6044 | + 6045 | + if ((rmap_item->slot->vma == vma) == 6046 | + search_new_forks) 6047 | + continue; 6048 | + 6049 | + if (rwc->invalid_vma && 
rwc->invalid_vma(vma, rwc->arg)) 6050 | + continue; 6051 | + 6052 | + ret = rwc->rmap_one(page, vma, address, rwc->arg); 6053 | + if (ret != SWAP_AGAIN) { 6054 | + anon_vma_unlock_read(anon_vma); 6055 | + goto out; 6056 | + } 6057 | + 6058 | + if (rwc->done && rwc->done(page)) { 6059 | + anon_vma_unlock_read(anon_vma); 6060 | + goto out; 6061 | + } 6062 | + } 6063 | + anon_vma_unlock_read(anon_vma); 6064 | + } 6065 | + } 6066 | + if (!search_new_forks++) 6067 | + goto again; 6068 | +out: 6069 | + return ret; 6070 | +} 6071 | + 6072 | +#ifdef CONFIG_MIGRATION 6073 | +/* Common ksm interface but may be specific to uksm */ 6074 | +void ksm_migrate_page(struct page *newpage, struct page *oldpage) 6075 | +{ 6076 | + struct stable_node *stable_node; 6077 | + 6078 | + VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 6079 | + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 6080 | + VM_BUG_ON(newpage->mapping != oldpage->mapping); 6081 | + 6082 | + stable_node = page_stable_node(newpage); 6083 | + if (stable_node) { 6084 | + VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); 6085 | + stable_node->kpfn = page_to_pfn(newpage); 6086 | + /* 6087 | + * newpage->mapping was set in advance; now we need smp_wmb() 6088 | + * to make sure that the new stable_node->kpfn is visible 6089 | + * to get_ksm_page() before it can see that oldpage->mapping 6090 | + * has gone stale (or that PageSwapCache has been cleared). 6091 | + */ 6092 | + smp_wmb(); 6093 | + set_page_stable_node(oldpage, NULL); 6094 | + } 6095 | +} 6096 | +#endif /* CONFIG_MIGRATION */ 6097 | + 6098 | +#ifdef CONFIG_MEMORY_HOTREMOVE 6099 | +static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn, 6100 | + unsigned long end_pfn) 6101 | +{ 6102 | + struct rb_node *node; 6103 | + 6104 | + for (node = rb_first(root_stable_treep); node; node = rb_next(node)) { 6105 | + struct stable_node *stable_node; 6106 | + 6107 | + stable_node = rb_entry(node, struct stable_node, node); 6108 | + if (stable_node->kpfn >= start_pfn && 6109 | + stable_node->kpfn < end_pfn) 6110 | + return stable_node; 6111 | + } 6112 | + return NULL; 6113 | +} 6114 | + 6115 | +static int uksm_memory_callback(struct notifier_block *self, 6116 | + unsigned long action, void *arg) 6117 | +{ 6118 | + struct memory_notify *mn = arg; 6119 | + struct stable_node *stable_node; 6120 | + 6121 | + switch (action) { 6122 | + case MEM_GOING_OFFLINE: 6123 | + /* 6124 | + * Keep it very simple for now: just lock out ksmd and 6125 | + * MADV_UNMERGEABLE while any memory is going offline. 6126 | + * mutex_lock_nested() is necessary because lockdep was alarmed 6127 | + * that here we take uksm_thread_mutex inside notifier chain 6128 | + * mutex, and later take notifier chain mutex inside 6129 | + * uksm_thread_mutex to unlock it. But that's safe because both 6130 | + * are inside mem_hotplug_mutex. 6131 | + */ 6132 | + mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING); 6133 | + break; 6134 | + 6135 | + case MEM_OFFLINE: 6136 | + /* 6137 | + * Most of the work is done by page migration; but there might 6138 | + * be a few stable_nodes left over, still pointing to struct 6139 | + * pages which have been offlined: prune those from the tree. 
6140 | + */ 6141 | + while ((stable_node = uksm_check_stable_tree(mn->start_pfn, 6142 | + mn->start_pfn + mn->nr_pages)) != NULL) 6143 | + remove_node_from_stable_tree(stable_node, 1, 1); 6144 | + /* fallthrough */ 6145 | + 6146 | + case MEM_CANCEL_OFFLINE: 6147 | + mutex_unlock(&uksm_thread_mutex); 6148 | + break; 6149 | + } 6150 | + return NOTIFY_OK; 6151 | +} 6152 | +#endif /* CONFIG_MEMORY_HOTREMOVE */ 6153 | + 6154 | +#ifdef CONFIG_SYSFS 6155 | +/* 6156 | + * This all compiles without CONFIG_SYSFS, but is a waste of space. 6157 | + */ 6158 | + 6159 | +#define UKSM_ATTR_RO(_name) \ 6160 | + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) 6161 | +#define UKSM_ATTR(_name) \ 6162 | + static struct kobj_attribute _name##_attr = \ 6163 | + __ATTR(_name, 0644, _name##_show, _name##_store) 6164 | + 6165 | +static ssize_t max_cpu_percentage_show(struct kobject *kobj, 6166 | + struct kobj_attribute *attr, char *buf) 6167 | +{ 6168 | + return sprintf(buf, "%u\n", uksm_max_cpu_percentage); 6169 | +} 6170 | + 6171 | +static ssize_t max_cpu_percentage_store(struct kobject *kobj, 6172 | + struct kobj_attribute *attr, 6173 | + const char *buf, size_t count) 6174 | +{ 6175 | + unsigned long max_cpu_percentage; 6176 | + int err; 6177 | + 6178 | + err = kstrtoul(buf, 10, &max_cpu_percentage); 6179 | + if (err || max_cpu_percentage > 100) 6180 | + return -EINVAL; 6181 | + 6182 | + if (max_cpu_percentage == 100) 6183 | + max_cpu_percentage = 99; 6184 | + else if (max_cpu_percentage < 10) 6185 | + max_cpu_percentage = 10; 6186 | + 6187 | + uksm_max_cpu_percentage = max_cpu_percentage; 6188 | + 6189 | + return count; 6190 | +} 6191 | +UKSM_ATTR(max_cpu_percentage); 6192 | + 6193 | +static ssize_t sleep_millisecs_show(struct kobject *kobj, 6194 | + struct kobj_attribute *attr, char *buf) 6195 | +{ 6196 | + return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies)); 6197 | +} 6198 | + 6199 | +static ssize_t sleep_millisecs_store(struct kobject *kobj, 6200 | + struct kobj_attribute *attr, 6201 | + const char *buf, size_t count) 6202 | +{ 6203 | + unsigned long msecs; 6204 | + int err; 6205 | + 6206 | + err = kstrtoul(buf, 10, &msecs); 6207 | + if (err || msecs > MSEC_PER_SEC) 6208 | + return -EINVAL; 6209 | + 6210 | + uksm_sleep_jiffies = msecs_to_jiffies(msecs); 6211 | + uksm_sleep_saved = uksm_sleep_jiffies; 6212 | + 6213 | + return count; 6214 | +} 6215 | +UKSM_ATTR(sleep_millisecs); 6216 | + 6217 | + 6218 | +static ssize_t cpu_governor_show(struct kobject *kobj, 6219 | + struct kobj_attribute *attr, char *buf) 6220 | +{ 6221 | + int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); 6222 | + int i; 6223 | + 6224 | + buf[0] = '\0'; 6225 | + for (i = 0; i < n ; i++) { 6226 | + if (uksm_cpu_governor == i) 6227 | + strcat(buf, "["); 6228 | + 6229 | + strcat(buf, uksm_cpu_governor_str[i]); 6230 | + 6231 | + if (uksm_cpu_governor == i) 6232 | + strcat(buf, "]"); 6233 | + 6234 | + strcat(buf, " "); 6235 | + } 6236 | + strcat(buf, "\n"); 6237 | + 6238 | + return strlen(buf); 6239 | +} 6240 | + 6241 | +static inline void init_performance_values(void) 6242 | +{ 6243 | + int i; 6244 | + struct scan_rung *rung; 6245 | + struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor; 6246 | + 6247 | + 6248 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { 6249 | + rung = uksm_scan_ladder + i; 6250 | + rung->cpu_ratio = preset->cpu_ratio[i]; 6251 | + rung->cover_msecs = preset->cover_msecs[i]; 6252 | + } 6253 | + 6254 | + uksm_max_cpu_percentage = preset->max_cpu; 6255 | +} 6256 | + 6257 | 
+static ssize_t cpu_governor_store(struct kobject *kobj, 6258 | + struct kobj_attribute *attr, 6259 | + const char *buf, size_t count) 6260 | +{ 6261 | + int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); 6262 | + 6263 | + for (n--; n >=0 ; n--) { 6264 | + if (!strncmp(buf, uksm_cpu_governor_str[n], 6265 | + strlen(uksm_cpu_governor_str[n]))) 6266 | + break; 6267 | + } 6268 | + 6269 | + if (n < 0) 6270 | + return -EINVAL; 6271 | + else 6272 | + uksm_cpu_governor = n; 6273 | + 6274 | + init_performance_values(); 6275 | + 6276 | + return count; 6277 | +} 6278 | +UKSM_ATTR(cpu_governor); 6279 | + 6280 | +static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, 6281 | + char *buf) 6282 | +{ 6283 | + return sprintf(buf, "%u\n", uksm_run); 6284 | +} 6285 | + 6286 | +static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, 6287 | + const char *buf, size_t count) 6288 | +{ 6289 | + int err; 6290 | + unsigned long flags; 6291 | + 6292 | + err = kstrtoul(buf, 10, &flags); 6293 | + if (err || flags > UINT_MAX) 6294 | + return -EINVAL; 6295 | + if (flags > UKSM_RUN_MERGE) 6296 | + return -EINVAL; 6297 | + 6298 | + mutex_lock(&uksm_thread_mutex); 6299 | + if (uksm_run != flags) { 6300 | + uksm_run = flags; 6301 | + } 6302 | + mutex_unlock(&uksm_thread_mutex); 6303 | + 6304 | + if (flags & UKSM_RUN_MERGE) 6305 | + wake_up_interruptible(&uksm_thread_wait); 6306 | + 6307 | + return count; 6308 | +} 6309 | +UKSM_ATTR(run); 6310 | + 6311 | +static ssize_t abundant_threshold_show(struct kobject *kobj, 6312 | + struct kobj_attribute *attr, char *buf) 6313 | +{ 6314 | + return sprintf(buf, "%u\n", uksm_abundant_threshold); 6315 | +} 6316 | + 6317 | +static ssize_t abundant_threshold_store(struct kobject *kobj, 6318 | + struct kobj_attribute *attr, 6319 | + const char *buf, size_t count) 6320 | +{ 6321 | + int err; 6322 | + unsigned long flags; 6323 | + 6324 | + err = kstrtoul(buf, 10, &flags); 6325 | + if (err || flags > 99) 6326 | + return -EINVAL; 6327 | + 6328 | + uksm_abundant_threshold = flags; 6329 | + 6330 | + return count; 6331 | +} 6332 | +UKSM_ATTR(abundant_threshold); 6333 | + 6334 | +static ssize_t thrash_threshold_show(struct kobject *kobj, 6335 | + struct kobj_attribute *attr, char *buf) 6336 | +{ 6337 | + return sprintf(buf, "%u\n", uksm_thrash_threshold); 6338 | +} 6339 | + 6340 | +static ssize_t thrash_threshold_store(struct kobject *kobj, 6341 | + struct kobj_attribute *attr, 6342 | + const char *buf, size_t count) 6343 | +{ 6344 | + int err; 6345 | + unsigned long flags; 6346 | + 6347 | + err = kstrtoul(buf, 10, &flags); 6348 | + if (err || flags > 99) 6349 | + return -EINVAL; 6350 | + 6351 | + uksm_thrash_threshold = flags; 6352 | + 6353 | + return count; 6354 | +} 6355 | +UKSM_ATTR(thrash_threshold); 6356 | + 6357 | +static ssize_t cpu_ratios_show(struct kobject *kobj, 6358 | + struct kobj_attribute *attr, char *buf) 6359 | +{ 6360 | + int i, size; 6361 | + struct scan_rung *rung; 6362 | + char *p = buf; 6363 | + 6364 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { 6365 | + rung = &uksm_scan_ladder[i]; 6366 | + 6367 | + if (rung->cpu_ratio > 0) 6368 | + size = sprintf(p, "%d ", rung->cpu_ratio); 6369 | + else 6370 | + size = sprintf(p, "MAX/%d ", 6371 | + TIME_RATIO_SCALE / -rung->cpu_ratio); 6372 | + 6373 | + p += size; 6374 | + } 6375 | + 6376 | + *p++ = '\n'; 6377 | + *p = '\0'; 6378 | + 6379 | + return p - buf; 6380 | +} 6381 | + 6382 | +static ssize_t cpu_ratios_store(struct kobject *kobj, 6383 | + struct kobj_attribute *attr, 6384 | + const char 
*buf, size_t count) 6385 | +{ 6386 | + int i, cpuratios[SCAN_LADDER_SIZE], err; 6387 | + unsigned long value; 6388 | + struct scan_rung *rung; 6389 | + char *p, *end = NULL; 6390 | + 6391 | + p = kzalloc(count, GFP_KERNEL); 6392 | + if (!p) 6393 | + return -ENOMEM; 6394 | + 6395 | + memcpy(p, buf, count); 6396 | + 6397 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { 6398 | + if (i != SCAN_LADDER_SIZE -1) { 6399 | + end = strchr(p, ' '); 6400 | + if (!end) 6401 | + return -EINVAL; 6402 | + 6403 | + *end = '\0'; 6404 | + } 6405 | + 6406 | + if (strstr(p, "MAX/")) { 6407 | + p = strchr(p, '/') + 1; 6408 | + err = kstrtoul(p, 10, &value); 6409 | + if (err || value > TIME_RATIO_SCALE || !value) 6410 | + return -EINVAL; 6411 | + 6412 | + cpuratios[i] = - (int) (TIME_RATIO_SCALE / value); 6413 | + } else { 6414 | + err = kstrtoul(p, 10, &value); 6415 | + if (err || value > TIME_RATIO_SCALE || !value) 6416 | + return -EINVAL; 6417 | + 6418 | + cpuratios[i] = value; 6419 | + } 6420 | + 6421 | + p = end + 1; 6422 | + } 6423 | + 6424 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { 6425 | + rung = &uksm_scan_ladder[i]; 6426 | + 6427 | + rung->cpu_ratio = cpuratios[i]; 6428 | + } 6429 | + 6430 | + return count; 6431 | +} 6432 | +UKSM_ATTR(cpu_ratios); 6433 | + 6434 | +static ssize_t eval_intervals_show(struct kobject *kobj, 6435 | + struct kobj_attribute *attr, char *buf) 6436 | +{ 6437 | + int i, size; 6438 | + struct scan_rung *rung; 6439 | + char *p = buf; 6440 | + 6441 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { 6442 | + rung = &uksm_scan_ladder[i]; 6443 | + size = sprintf(p, "%u ", rung->cover_msecs); 6444 | + p += size; 6445 | + } 6446 | + 6447 | + *p++ = '\n'; 6448 | + *p = '\0'; 6449 | + 6450 | + return p - buf; 6451 | +} 6452 | + 6453 | +static ssize_t eval_intervals_store(struct kobject *kobj, 6454 | + struct kobj_attribute *attr, 6455 | + const char *buf, size_t count) 6456 | +{ 6457 | + int i, err; 6458 | + unsigned long values[SCAN_LADDER_SIZE]; 6459 | + struct scan_rung *rung; 6460 | + char *p, *end = NULL; 6461 | + ssize_t ret = count; 6462 | + 6463 | + p = kzalloc(count + 2, GFP_KERNEL); 6464 | + if (!p) 6465 | + return -ENOMEM; 6466 | + 6467 | + memcpy(p, buf, count); 6468 | + 6469 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { 6470 | + if (i != SCAN_LADDER_SIZE -1) { 6471 | + end = strchr(p, ' '); 6472 | + if (!end) { 6473 | + ret = -EINVAL; 6474 | + goto out; 6475 | + } 6476 | + 6477 | + *end = '\0'; 6478 | + } 6479 | + 6480 | + err = kstrtoul(p, 10, &values[i]); 6481 | + if (err) { 6482 | + ret = -EINVAL; 6483 | + goto out; 6484 | + } 6485 | + 6486 | + p = end + 1; 6487 | + } 6488 | + 6489 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { 6490 | + rung = &uksm_scan_ladder[i]; 6491 | + 6492 | + rung->cover_msecs = values[i]; 6493 | + } 6494 | + 6495 | +out: 6496 | + kfree(p); 6497 | + return ret; 6498 | +} 6499 | +UKSM_ATTR(eval_intervals); 6500 | + 6501 | +static ssize_t ema_per_page_time_show(struct kobject *kobj, 6502 | + struct kobj_attribute *attr, char *buf) 6503 | +{ 6504 | + return sprintf(buf, "%lu\n", uksm_ema_page_time); 6505 | +} 6506 | +UKSM_ATTR_RO(ema_per_page_time); 6507 | + 6508 | +static ssize_t pages_shared_show(struct kobject *kobj, 6509 | + struct kobj_attribute *attr, char *buf) 6510 | +{ 6511 | + return sprintf(buf, "%lu\n", uksm_pages_shared); 6512 | +} 6513 | +UKSM_ATTR_RO(pages_shared); 6514 | + 6515 | +static ssize_t pages_sharing_show(struct kobject *kobj, 6516 | + struct kobj_attribute *attr, char *buf) 6517 | +{ 6518 | + return sprintf(buf, "%lu\n", 
uksm_pages_sharing); 6519 | +} 6520 | +UKSM_ATTR_RO(pages_sharing); 6521 | + 6522 | +static ssize_t pages_unshared_show(struct kobject *kobj, 6523 | + struct kobj_attribute *attr, char *buf) 6524 | +{ 6525 | + return sprintf(buf, "%lu\n", uksm_pages_unshared); 6526 | +} 6527 | +UKSM_ATTR_RO(pages_unshared); 6528 | + 6529 | +static ssize_t full_scans_show(struct kobject *kobj, 6530 | + struct kobj_attribute *attr, char *buf) 6531 | +{ 6532 | + return sprintf(buf, "%llu\n", fully_scanned_round); 6533 | +} 6534 | +UKSM_ATTR_RO(full_scans); 6535 | + 6536 | +static ssize_t pages_scanned_show(struct kobject *kobj, 6537 | + struct kobj_attribute *attr, char *buf) 6538 | +{ 6539 | + unsigned long base = 0; 6540 | + u64 delta, ret; 6541 | + 6542 | + if (pages_scanned_stored) { 6543 | + base = pages_scanned_base; 6544 | + ret = pages_scanned_stored; 6545 | + delta = uksm_pages_scanned >> base; 6546 | + if (CAN_OVERFLOW_U64(ret, delta)) { 6547 | + ret >>= 1; 6548 | + delta >>= 1; 6549 | + base++; 6550 | + ret += delta; 6551 | + } 6552 | + } else { 6553 | + ret = uksm_pages_scanned; 6554 | + } 6555 | + 6556 | + while (ret > ULONG_MAX) { 6557 | + ret >>= 1; 6558 | + base++; 6559 | + } 6560 | + 6561 | + if (base) 6562 | + return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base); 6563 | + else 6564 | + return sprintf(buf, "%lu\n", (unsigned long)ret); 6565 | +} 6566 | +UKSM_ATTR_RO(pages_scanned); 6567 | + 6568 | +static ssize_t hash_strength_show(struct kobject *kobj, 6569 | + struct kobj_attribute *attr, char *buf) 6570 | +{ 6571 | + return sprintf(buf, "%lu\n", hash_strength); 6572 | +} 6573 | +UKSM_ATTR_RO(hash_strength); 6574 | + 6575 | +static ssize_t sleep_times_show(struct kobject *kobj, 6576 | + struct kobj_attribute *attr, char *buf) 6577 | +{ 6578 | + return sprintf(buf, "%llu\n", uksm_sleep_times); 6579 | +} 6580 | +UKSM_ATTR_RO(sleep_times); 6581 | + 6582 | + 6583 | +static struct attribute *uksm_attrs[] = { 6584 | + &max_cpu_percentage_attr.attr, 6585 | + &sleep_millisecs_attr.attr, 6586 | + &cpu_governor_attr.attr, 6587 | + &run_attr.attr, 6588 | + &ema_per_page_time_attr.attr, 6589 | + &pages_shared_attr.attr, 6590 | + &pages_sharing_attr.attr, 6591 | + &pages_unshared_attr.attr, 6592 | + &full_scans_attr.attr, 6593 | + &pages_scanned_attr.attr, 6594 | + &hash_strength_attr.attr, 6595 | + &sleep_times_attr.attr, 6596 | + &thrash_threshold_attr.attr, 6597 | + &abundant_threshold_attr.attr, 6598 | + &cpu_ratios_attr.attr, 6599 | + &eval_intervals_attr.attr, 6600 | + NULL, 6601 | +}; 6602 | + 6603 | +static struct attribute_group uksm_attr_group = { 6604 | + .attrs = uksm_attrs, 6605 | + .name = "uksm", 6606 | +}; 6607 | +#endif /* CONFIG_SYSFS */ 6608 | + 6609 | +static inline void init_scan_ladder(void) 6610 | +{ 6611 | + int i; 6612 | + struct scan_rung *rung; 6613 | + 6614 | + for (i = 0; i < SCAN_LADDER_SIZE; i++) { 6615 | + rung = uksm_scan_ladder + i; 6616 | + slot_tree_init_root(&rung->vma_root); 6617 | + } 6618 | + 6619 | + init_performance_values(); 6620 | + uksm_calc_scan_pages(); 6621 | +} 6622 | + 6623 | +static inline int cal_positive_negative_costs(void) 6624 | +{ 6625 | + struct page *p1, *p2; 6626 | + unsigned char *addr1, *addr2; 6627 | + unsigned long i, time_start, hash_cost; 6628 | + unsigned long loopnum = 0; 6629 | + 6630 | + /*IMPORTANT: volatile is needed to prevent over-optimization by gcc. 
*/ 6631 | + volatile u32 hash; 6632 | + volatile int ret; 6633 | + 6634 | + p1 = alloc_page(GFP_KERNEL); 6635 | + if (!p1) 6636 | + return -ENOMEM; 6637 | + 6638 | + p2 = alloc_page(GFP_KERNEL); 6639 | + if (!p2) 6640 | + return -ENOMEM; 6641 | + 6642 | + addr1 = kmap_atomic(p1); 6643 | + addr2 = kmap_atomic(p2); 6644 | + memset(addr1, prandom_u32(), PAGE_SIZE); 6645 | + memcpy(addr2, addr1, PAGE_SIZE); 6646 | + 6647 | + /* make sure that the two pages differ in last byte */ 6648 | + addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1]; 6649 | + kunmap_atomic(addr2); 6650 | + kunmap_atomic(addr1); 6651 | + 6652 | + time_start = jiffies; 6653 | + while (jiffies - time_start < 100) { 6654 | + for (i = 0; i < 100; i++) 6655 | + hash = page_hash(p1, HASH_STRENGTH_FULL, 0); 6656 | + loopnum += 100; 6657 | + } 6658 | + hash_cost = (jiffies - time_start); 6659 | + 6660 | + time_start = jiffies; 6661 | + for (i = 0; i < loopnum; i++) 6662 | + ret = pages_identical(p1, p2); 6663 | + memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start); 6664 | + memcmp_cost /= hash_cost; 6665 | + printk(KERN_INFO "UKSM: relative memcmp_cost = %lu " 6666 | + "hash=%u cmp_ret=%d.\n", 6667 | + memcmp_cost, hash, ret); 6668 | + 6669 | + __free_page(p1); 6670 | + __free_page(p2); 6671 | + return 0; 6672 | +} 6673 | + 6674 | +static int init_zeropage_hash_table(void) 6675 | +{ 6676 | + struct page *page; 6677 | + char *addr; 6678 | + int i; 6679 | + 6680 | + page = alloc_page(GFP_KERNEL); 6681 | + if (!page) 6682 | + return -ENOMEM; 6683 | + 6684 | + addr = kmap_atomic(page); 6685 | + memset(addr, 0, PAGE_SIZE); 6686 | + kunmap_atomic(addr); 6687 | + 6688 | + zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32), 6689 | + GFP_KERNEL); 6690 | + if (!zero_hash_table) 6691 | + return -ENOMEM; 6692 | + 6693 | + for (i = 0; i < HASH_STRENGTH_MAX; i++) 6694 | + zero_hash_table[i] = page_hash(page, i, 0); 6695 | + 6696 | + __free_page(page); 6697 | + 6698 | + return 0; 6699 | +} 6700 | + 6701 | +static inline int init_random_sampling(void) 6702 | +{ 6703 | + unsigned long i; 6704 | + random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL); 6705 | + if (!random_nums) 6706 | + return -ENOMEM; 6707 | + 6708 | + for (i = 0; i < HASH_STRENGTH_FULL; i++) 6709 | + random_nums[i] = i; 6710 | + 6711 | + for (i = 0; i < HASH_STRENGTH_FULL; i++) { 6712 | + unsigned long rand_range, swap_index, tmp; 6713 | + 6714 | + rand_range = HASH_STRENGTH_FULL - i; 6715 | + swap_index = i + prandom_u32() % rand_range; 6716 | + tmp = random_nums[i]; 6717 | + random_nums[i] = random_nums[swap_index]; 6718 | + random_nums[swap_index] = tmp; 6719 | + } 6720 | + 6721 | + rshash_state.state = RSHASH_NEW; 6722 | + rshash_state.below_count = 0; 6723 | + rshash_state.lookup_window_index = 0; 6724 | + 6725 | + return cal_positive_negative_costs(); 6726 | +} 6727 | + 6728 | +static int __init uksm_slab_init(void) 6729 | +{ 6730 | + rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0); 6731 | + if (!rmap_item_cache) 6732 | + goto out; 6733 | + 6734 | + stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0); 6735 | + if (!stable_node_cache) 6736 | + goto out_free1; 6737 | + 6738 | + node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0); 6739 | + if (!node_vma_cache) 6740 | + goto out_free2; 6741 | + 6742 | + vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0); 6743 | + if (!vma_slot_cache) 6744 | + goto out_free3; 6745 | + 6746 | + tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0); 6747 | + if (!tree_node_cache) 6748 | + goto out_free4; 6749 | + 6750 | + return 0; 6751 | + 6752 | +out_free4: 6753 | + 
kmem_cache_destroy(vma_slot_cache); 6754 | +out_free3: 6755 | + kmem_cache_destroy(node_vma_cache); 6756 | +out_free2: 6757 | + kmem_cache_destroy(stable_node_cache); 6758 | +out_free1: 6759 | + kmem_cache_destroy(rmap_item_cache); 6760 | +out: 6761 | + return -ENOMEM; 6762 | +} 6763 | + 6764 | +static void __init uksm_slab_free(void) 6765 | +{ 6766 | + kmem_cache_destroy(stable_node_cache); 6767 | + kmem_cache_destroy(rmap_item_cache); 6768 | + kmem_cache_destroy(node_vma_cache); 6769 | + kmem_cache_destroy(vma_slot_cache); 6770 | + kmem_cache_destroy(tree_node_cache); 6771 | +} 6772 | + 6773 | +/* Common interface to ksm, different to it. */ 6774 | +int ksm_madvise(struct vm_area_struct *vma, unsigned long start, 6775 | + unsigned long end, int advice, unsigned long *vm_flags) 6776 | +{ 6777 | + int err; 6778 | + 6779 | + switch (advice) { 6780 | + case MADV_MERGEABLE: 6781 | + return 0; /* just ignore the advice */ 6782 | + 6783 | + case MADV_UNMERGEABLE: 6784 | + if (!(*vm_flags & VM_MERGEABLE)) 6785 | + return 0; /* just ignore the advice */ 6786 | + 6787 | + if (vma->anon_vma) { 6788 | + err = unmerge_uksm_pages(vma, start, end); 6789 | + if (err) 6790 | + return err; 6791 | + } 6792 | + 6793 | + uksm_remove_vma(vma); 6794 | + *vm_flags &= ~VM_MERGEABLE; 6795 | + break; 6796 | + } 6797 | + 6798 | + return 0; 6799 | +} 6800 | + 6801 | +/* Common interface to ksm, actually the same. */ 6802 | +struct page *ksm_might_need_to_copy(struct page *page, 6803 | + struct vm_area_struct *vma, unsigned long address) 6804 | +{ 6805 | + struct anon_vma *anon_vma = page_anon_vma(page); 6806 | + struct page *new_page; 6807 | + 6808 | + if (PageKsm(page)) { 6809 | + if (page_stable_node(page)) 6810 | + return page; /* no need to copy it */ 6811 | + } else if (!anon_vma) { 6812 | + return page; /* no need to copy it */ 6813 | + } else if (anon_vma->root == vma->anon_vma->root && 6814 | + page->index == linear_page_index(vma, address)) { 6815 | + return page; /* still no need to copy it */ 6816 | + } 6817 | + if (!PageUptodate(page)) 6818 | + return page; /* let do_swap_page report the error */ 6819 | + 6820 | + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 6821 | + if (new_page) { 6822 | + copy_user_highpage(new_page, page, address, vma); 6823 | + 6824 | + SetPageDirty(new_page); 6825 | + __SetPageUptodate(new_page); 6826 | + __SetPageLocked(new_page); 6827 | + } 6828 | + 6829 | + return new_page; 6830 | +} 6831 | + 6832 | +static int __init uksm_init(void) 6833 | +{ 6834 | + struct task_struct *uksm_thread; 6835 | + int err; 6836 | + 6837 | + uksm_sleep_jiffies = msecs_to_jiffies(100); 6838 | + uksm_sleep_saved = uksm_sleep_jiffies; 6839 | + 6840 | + slot_tree_init(); 6841 | + init_scan_ladder(); 6842 | + 6843 | + 6844 | + err = init_random_sampling(); 6845 | + if (err) 6846 | + goto out_free2; 6847 | + 6848 | + err = uksm_slab_init(); 6849 | + if (err) 6850 | + goto out_free1; 6851 | + 6852 | + err = init_zeropage_hash_table(); 6853 | + if (err) 6854 | + goto out_free0; 6855 | + 6856 | + uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd"); 6857 | + if (IS_ERR(uksm_thread)) { 6858 | + printk(KERN_ERR "uksm: creating kthread failed\n"); 6859 | + err = PTR_ERR(uksm_thread); 6860 | + goto out_free; 6861 | + } 6862 | + 6863 | +#ifdef CONFIG_SYSFS 6864 | + err = sysfs_create_group(mm_kobj, &uksm_attr_group); 6865 | + if (err) { 6866 | + printk(KERN_ERR "uksm: register sysfs failed\n"); 6867 | + kthread_stop(uksm_thread); 6868 | + goto out_free; 6869 | + } 6870 | +#else 6871 | + 
uksm_run = UKSM_RUN_MERGE; /* no way for user to start it */ 6872 | + 6873 | +#endif /* CONFIG_SYSFS */ 6874 | + 6875 | +#ifdef CONFIG_MEMORY_HOTREMOVE 6876 | + /* 6877 | + * Choose a high priority since the callback takes uksm_thread_mutex: 6878 | + * later callbacks could only be taking locks which nest within that. 6879 | + */ 6880 | + hotplug_memory_notifier(uksm_memory_callback, 100); 6881 | +#endif 6882 | + return 0; 6883 | + 6884 | +out_free: 6885 | + kfree(zero_hash_table); 6886 | +out_free0: 6887 | + uksm_slab_free(); 6888 | +out_free1: 6889 | + kfree(random_nums); 6890 | +out_free2: 6891 | + kfree(uksm_scan_ladder); 6892 | + return err; 6893 | +} 6894 | + 6895 | +#ifdef MODULE 6896 | +subsys_initcall(ksm_init); 6897 | +#else 6898 | +late_initcall(uksm_init); 6899 | +#endif 6900 | + 6901 | diff --git a/mm/vmstat.c b/mm/vmstat.c 6902 | index cb2a67b..912b86f 100644 6903 | --- a/mm/vmstat.c 6904 | +++ b/mm/vmstat.c 6905 | @@ -733,6 +733,9 @@ const char * const vmstat_text[] = { 6906 | "nr_anon_transparent_hugepages", 6907 | "nr_free_cma", 6908 | 6909 | +#ifdef CONFIG_UKSM 6910 | + "nr_uksm_zero_pages", 6911 | +#endif 6912 | /* enum writeback_stat_item counters */ 6913 | "nr_dirty_threshold", 6914 | "nr_dirty_background_threshold", 6915 | --------------------------------------------------------------------------------